vore 0.2.1-x86_64-windows → 0.2.2-x86_64-windows
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/vore-spider.exe +0 -0
- data/lib/vore/crawler.rb +16 -0
- data/lib/vore/handlers/content_extractor.rb +7 -1
- data/lib/vore/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f43312e386621e74f76efadca6c5e74f19f62fc5d259f75963d2d39f642f504
|
4
|
+
data.tar.gz: 6e189804c155718fe5b739282f2130f344da01325acab55fa1900019d97b935b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9c6fb7dd1cd7da0730ca5caac2ac0d4beea60c88374b721c7181fab02f56a34dae880b3c4ee4c11da287eca26ae766030ff471f8ffaf29c9211bc5b55e32c155
|
7
|
+
data.tar.gz: 76ebda58d854ff7f286f0307a32136dbc6b25e9eabc095ed993ac233428365438d741458aa0adf1786841bc391240691bed46576a441c5f030291fb723628669
|
data/exe/vore-spider.exe
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -31,6 +31,7 @@ module Vore
|
|
31
31
|
|
32
32
|
output = %x(#{@executable} \
|
33
33
|
--user-agent #{user_agent} \
|
34
|
+
--delay 3000
|
34
35
|
--url #{website} \
|
35
36
|
download \
|
36
37
|
-t \
|
@@ -38,12 +39,25 @@ module Vore
|
|
38
39
|
|
39
40
|
Vore.logger.info("Vore finished crawling #{website}: #{output}")
|
40
41
|
|
42
|
+
results = {
|
43
|
+
pages_visited: 0,
|
44
|
+
pages_unprocessed: 0,
|
45
|
+
unprocessed_pages: [],
|
46
|
+
}
|
47
|
+
|
41
48
|
Dir.glob(File.join(output_dir, "**", "*")).each do |path|
|
42
49
|
next unless File.file?(path)
|
43
50
|
|
44
51
|
html_file = File.read(path).force_encoding("UTF-8")
|
45
52
|
rewritten_html_file = @selma.rewrite(html_file)
|
46
53
|
|
54
|
+
results[:pages_visited] += 1
|
55
|
+
if rewritten_html_file.empty?
|
56
|
+
results[:pages_unprocessed] += 1
|
57
|
+
results[:unprocessed_pages] << path
|
58
|
+
next
|
59
|
+
end
|
60
|
+
|
47
61
|
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
48
62
|
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
49
63
|
|
@@ -58,6 +72,8 @@ module Vore
|
|
58
72
|
ensure
|
59
73
|
File.delete(path) if File.file?(path)
|
60
74
|
end
|
75
|
+
|
76
|
+
results
|
61
77
|
end
|
62
78
|
|
63
79
|
# def crawl(site, block)
|
data/lib/vore/handlers/content_extractor.rb
CHANGED
@@ -19,7 +19,13 @@ module Vore
|
|
19
19
|
end
|
20
20
|
|
21
21
|
def handle_element(element)
|
22
|
-
if element.tag_name == "pre" ||
|
22
|
+
if element.tag_name == "pre" ||
|
23
|
+
element.tag_name == "code" ||
|
24
|
+
element.tag_name == "form" ||
|
25
|
+
element.tag_name == "style" ||
|
26
|
+
element.tag_name == "noscript" ||
|
27
|
+
element.tag_name == "script" ||
|
28
|
+
element.tag_name == "svg"
|
23
29
|
element.remove
|
24
30
|
elsif element.tag_name == "title"
|
25
31
|
@within_title = true
|
data/lib/vore/version.rb
CHANGED