vore 0.2.6-x86_64-linux → 0.3.0-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/vore-spider +0 -0
- data/lib/vore/configuration.rb +4 -0
- data/lib/vore/crawler.rb +16 -10
- data/lib/vore/handlers/content_extractor.rb +0 -1
- data/lib/vore/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5fa0065651385809a53579488f9985cc3332197d4fa9818508859bb24274a16b
|
4
|
+
data.tar.gz: a0754878a08d651215dd5df40220e3002255ef65fedcb365e0605ff228db27fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48298f9bf6de3e76b443a4ea46fa0adc687e4a39b90fc94ae6a97e0ce0153c6e983e292af0c8d6a53dcb44e94b8ecba94f8adc75fe5ce8cd7857e947ab113795
|
7
|
+
data.tar.gz: 404712a662f24fff36346998e791cfac7c9ac44661005209f0a2a65b38146f0233dfcdc9cfc115c5772340d310d085dbadbca8766201fad810843359ccff1c98
|
data/exe/vore-spider
CHANGED
Binary file
|
data/lib/vore/configuration.rb
CHANGED
data/lib/vore/crawler.rb
CHANGED
@@ -8,9 +8,11 @@ module Vore
|
|
8
8
|
PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
|
9
9
|
FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
|
10
10
|
|
11
|
+
attr_reader :output_dir
|
12
|
+
|
11
13
|
# Creates a crawler
|
12
14
|
# denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
|
13
|
-
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
|
15
|
+
def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG, options: Vole::Configuration::DEFAULT_OPTIONS)
|
14
16
|
@denylist_regexp = Regexp.union(denylist)
|
15
17
|
|
16
18
|
@content_extractor = Vole::Handlers::ContentExtractor.new
|
@@ -18,6 +20,7 @@ module Vore
|
|
18
20
|
ext = PLATFORM.include?("windows") ? ".exe" : ""
|
19
21
|
@executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
|
20
22
|
@parent_output_dir = "tmp/vore"
|
23
|
+
@options = options
|
21
24
|
|
22
25
|
return if File.exist?(@executable)
|
23
26
|
|
@@ -26,16 +29,10 @@ module Vore
|
|
26
29
|
end
|
27
30
|
|
28
31
|
def scrape_each_page(website, &block)
|
29
|
-
output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
32
|
+
@output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
|
30
33
|
Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
|
31
34
|
|
32
|
-
output = %x(#{@executable} \
|
33
|
-
--user-agent #{user_agent} \
|
34
|
-
--delay 3500 \
|
35
|
-
--url #{website} \
|
36
|
-
download \
|
37
|
-
-t \
|
38
|
-
#{output_dir})
|
35
|
+
output = run_command(website, delay: @options[:delay])
|
39
36
|
|
40
37
|
Vore.logger.info("Vore finished crawling #{website}: #{output}")
|
41
38
|
|
@@ -54,7 +51,6 @@ module Vore
|
|
54
51
|
rewritten_html_file = ""
|
55
52
|
|
56
53
|
if html_file.empty?
|
57
|
-
Vore.logger.warn("HTML file empty: #{path}")
|
58
54
|
results[:pages_unprocessed] += 1
|
59
55
|
results[:unprocessed_pages] << path
|
60
56
|
next
|
@@ -91,6 +87,16 @@ module Vore
|
|
91
87
|
# crawl_site(site)
|
92
88
|
# end
|
93
89
|
|
90
|
+
def run_command(website, delay: 3500)
|
91
|
+
%x(#{@executable} \
|
92
|
+
--user-agent #{user_agent} \
|
93
|
+
--delay #{delay} \
|
94
|
+
--url #{website} \
|
95
|
+
download \
|
96
|
+
-t \
|
97
|
+
#{@output_dir})
|
98
|
+
end
|
99
|
+
|
94
100
|
def user_agent
|
95
101
|
"'Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)'"
|
96
102
|
end
|
data/lib/vore/version.rb
CHANGED