vore 0.2.4-x86_64-windows → 0.2.8-x86_64-windows
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/vore-spider.exe +0 -0
- data/lib/vore/crawler.rb +26 -11
- data/lib/vore/handlers/content_extractor.rb +0 -1
- data/lib/vore/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 85a50437d0557e28ff3eeb155a8e527163aeb5f90fd98f0e5cb8e09e3d81bb6e
|
4
|
+
data.tar.gz: 61d1255a042db43b6e50bc749fccceb452080afd07e1eac04461ef4894fb8027
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5c8a45adf9b4402d3600ae26728afb89b5e4d24953ad905462ad6b7c2d682a2806ed452673c486000621e5bf0373de6dd85a28ba592cb2e5eb1e86d83eb973f
|
7
|
+
data.tar.gz: 0077fcc1c2173c46be332da7727f6560a1cf8b2351491bc147b0d1111cefc82dc8bf8081e0f2370e2f7432d1905b2419e8bc168294dfef8a4387d609d512dae8
|
data/exe/vore-spider.exe
CHANGED
Binary file
|
data/lib/vore/crawler.rb
CHANGED
@@ -8,6 +8,8 @@ module Vore
|
|
8
8
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
9
9
|
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
10
10
|
|
11
|
+
attr_reader :output_dir
|
12
|
+
|
11
13
|
# Creates a crawler
|
12
14
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
13
15
|
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
|
@@ -26,16 +28,10 @@ module Vore
|
|
26
28
|
end
|
27
29
|
|
28
30
|
def scrape_each_page(website, &block)
|
29
|
-
output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
31
|
+
@output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
30
32
|
Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
|
31
33
|
|
32
|
-
output =
|
33
|
-
--user-agent #{user_agent} \
|
34
|
-
--delay 3000 \
|
35
|
-
--url #{website} \
|
36
|
-
download \
|
37
|
-
-t \
|
38
|
-
#{output_dir})
|
34
|
+
output = run_command(website, @output_dir)
|
39
35
|
|
40
36
|
Vore.logger.info("Vore finished crawling #{website}: #{output}")
|
41
37
|
|
@@ -48,16 +44,25 @@ module Vore
|
|
48
44
|
Dir.glob(File.join(output_dir, "**", "*")).each do |path|
|
49
45
|
next unless File.file?(path)
|
50
46
|
|
47
|
+
results[:pages_visited] += 1
|
48
|
+
|
51
49
|
html_file = File.read(path).force_encoding("UTF-8")
|
52
|
-
rewritten_html_file =
|
50
|
+
rewritten_html_file = ""
|
53
51
|
|
54
|
-
|
55
|
-
if rewritten_html_file.empty?
|
52
|
+
if html_file.empty?
|
56
53
|
results[:pages_unprocessed] += 1
|
57
54
|
results[:unprocessed_pages] << path
|
58
55
|
next
|
59
56
|
end
|
60
57
|
|
58
|
+
begin
|
59
|
+
rewritten_html_file = @selma.rewrite(html_file)
|
60
|
+
rescue StandardError => e
|
61
|
+
Vore.logger.warn("Error rewriting #{path}: #{e}")
|
62
|
+
results[:pages_unprocessed] += 1
|
63
|
+
next
|
64
|
+
end
|
65
|
+
|
61
66
|
# drops the first 3 parts of the path, which are "tmp", "vore", and the site name
|
62
67
|
url_path = path.split(FILE_SEPERATOR)[3..].join("/")
|
63
68
|
|
@@ -81,6 +86,16 @@ module Vore
|
|
81
86
|
# crawl_site(site)
|
82
87
|
# end
|
83
88
|
|
89
|
+
def run_command(website, output_dir)
|
90
|
+
%x(#{@executable} \
|
91
|
+
--user-agent #{user_agent} \
|
92
|
+
--delay 3500 \
|
93
|
+
--url #{website} \
|
94
|
+
download \
|
95
|
+
-t \
|
96
|
+
#{output_dir})
|
97
|
+
end
|
98
|
+
|
84
99
|
def user_agent
|
85
100
|
"'Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)'"
|
86
101
|
end
|
data/lib/vore/version.rb
CHANGED