vore 0.2.0-x86_64-linux → 0.2.2-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/vore-spider +0 -0
- data/lib/vore/crawler.rb +19 -3
- data/lib/vore/handlers/content_extractor.rb +7 -1
- data/lib/vore/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c27c900fcf6aa58b097af96ed54fa00f9ec6457f96039328dd5745dd0be8e5b2
|
4
|
+
data.tar.gz: 97e9e59e081538976e39dad93d5a70638a487bfa5ed81f321a0eb73a505879de
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b95950fef4cf650e97430ec7514b09aa9e2ef6009e8cf2589e870fe150e703cd6b5477278cea4ab99c3ffd429d126592a0c8c9f283f6369fa58f04fab8a7ced
|
7
|
+
data.tar.gz: 742ca3d1ad27ede5a8a2a4bdc469e438fa2113c65b7502f3a33dc542e272b3eddac9a91b79c5e130757fbe20ee6bb9809afe83b43a008ecf6fa7e0379536ef30
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Vore
|
|
17
17
|
@selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
|
18
18
|
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
19
19
|
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
20
|
-
@
|
20
|
+
@parent_output_dir = "tmp/vore"
|
21
21
|
|
22
22
|
return if File.exist?(@executable)
|
23
23
|
|
@@ -26,11 +26,12 @@ module Vore
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def scrape_each_page(website, &block)
|
29
|
-
output_dir = "#{@
|
29
|
+
output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
30
30
|
Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
|
31
31
|
|
32
32
|
output = %x(#{@executable} \
|
33
33
|
--user-agent #{user_agent} \
|
34
|
+
--delay 3000
|
34
35
|
--url #{website} \
|
35
36
|
download \
|
36
37
|
-t \
|
@@ -38,12 +39,25 @@ module Vore
|
|
38
39
|
|
39
40
|
Vore.logger.info("Vore finished crawling #{website}: #{output}")
|
40
41
|
|
41
|
-
|
42
|
+
results = {
|
43
|
+
pages_visited: 0,
|
44
|
+
pages_unprocessed: 0,
|
45
|
+
unprocessed_pages: [],
|
46
|
+
}
|
47
|
+
|
48
|
+
Dir.glob(File.join(output_dir, "**", "*")).each do |path|
|
42
49
|
next unless File.file?(path)
|
43
50
|
|
44
51
|
html_file = File.read(path).force_encoding("UTF-8")
|
45
52
|
rewritten_html_file = @selma.rewrite(html_file)
|
46
53
|
|
54
|
+
results[:pages_visited] += 1
|
55
|
+
if rewritten_html_file.empty?
|
56
|
+
results[:pages_unprocessed] += 1
|
57
|
+
results[:unprocessed_pages] << path
|
58
|
+
next
|
59
|
+
end
|
60
|
+
|
47
61
|
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
48
62
|
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
49
63
|
|
@@ -58,6 +72,8 @@ module Vore
|
|
58
72
|
ensure
|
59
73
|
File.delete(path) if File.file?(path)
|
60
74
|
end
|
75
|
+
|
76
|
+
results
|
61
77
|
end
|
62
78
|
|
63
79
|
# def crawl(site, block)
|
@@ -19,7 +19,13 @@ module Vore
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def handle_element(element)
|
22
|
-
if element.tag_name == "pre" ||
|
22
|
+
if element.tag_name == "pre" ||
|
23
|
+
element.tag_name == "code" ||
|
24
|
+
element.tag_name == "form" ||
|
25
|
+
element.tag_name == "style" ||
|
26
|
+
element.tag_name == "noscript" ||
|
27
|
+
element.tag_name == "script" ||
|
28
|
+
element.tag_name == "svg"
|
23
29
|
element.remove
|
24
30
|
elsif element.tag_name == "title"
|
25
31
|
@within_title = true
|
data/lib/vore/version.rb
CHANGED