vore 0.2.5-x86_64-windows → 0.3.0-x86_64-windows

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b244d29c525e65f76e7cffc59c453f26883d3c7c5d7a7ed2248ecaf319f22515
4
- data.tar.gz: 9bd4a0446694812585e68cf8c0b750f808979f0732a136d6d15273a84e7afd4b
3
+ metadata.gz: d9c0a3b54d7618f058010a7b420d8085bb26614a2a25a2d13574694d73639faf
4
+ data.tar.gz: 3b13cf4124744f822c0d3cb969c960d811cb76d39b542531dde19cfe4bc53c25
5
5
  SHA512:
6
- metadata.gz: 2783cd4297442c2f4aee698adf301b130873d8e38d01bbdea0f32ec59fc70cdd332cd08f4b7bf3fbfd5e796cb003bc332a9e55d199af36bbacc40cac56e746ca
7
- data.tar.gz: caea7c9488653fa4ad09eafc82dc6b1584379016a6e45256ba4bf7f83f8ddfe9ba8218144591ad62593f38b8dd6abc32e6d599932ec0a53c41f09efb3c594a7f
6
+ metadata.gz: 870cc3ea8b2ba8ae56ac0a4435f08baa0504d7f4cb809b06bc052598f0837ea46ab10f4b93b73c859f58be7378a88942d4861ebe5f9ab126de5273ac919c160e
7
+ data.tar.gz: a6c10f0f3233aec66f9f68c25aba29a87f69fab17070c5aad040b2e08a04e652a289ff6a4143e86dd160f2b411da4d4c9dda9458789144602424ee565a83cc50
data/exe/vore-spider.exe CHANGED
Binary file
@@ -5,5 +5,9 @@ module Vore
5
5
  DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
6
6
  allow_doctype: false,
7
7
  })
8
+
9
+ DEFAULT_OPTIONS = {
10
+ delay: 3500,
11
+ }
8
12
  end
9
13
  end
data/lib/vore/crawler.rb CHANGED
@@ -8,9 +8,11 @@ module Vore
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
9
  FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
10
10
 
11
+ attr_reader :output_dir
12
+
11
13
  # Creates a crawler
12
14
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
13
- def initialize(denylist: /a^/, sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG)
15
+ def initialize(denylist: /a^/, sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: Vore::Configuration::DEFAULT_OPTIONS)
14
16
  @denylist_regexp = Regexp.union(denylist)
15
17
 
16
18
  @content_extractor = Vore::Handlers::ContentExtractor.new
@@ -18,6 +20,7 @@ module Vore
18
20
  ext = PLATFORM.include?("windows") ? ".exe" : ""
19
21
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
20
22
  @parent_output_dir = "tmp/vore"
23
+ @options = options
21
24
 
22
25
  return if File.exist?(@executable)
23
26
 
@@ -26,16 +29,10 @@ module Vore
26
29
  end
27
30
 
28
31
  def scrape_each_page(website, &block)
29
- output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
32
+ @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
30
33
  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
31
34
 
32
- output = %x(#{@executable} \
33
- --user-agent #{user_agent} \
34
- --delay 3500 \
35
- --url #{website} \
36
- download \
37
- -t \
38
- #{output_dir})
35
+ output = run_command(website, delay: @options[:delay])
39
36
 
40
37
  Vore.logger.info("Vore finished crawling #{website}: #{output}")
41
38
 
@@ -51,6 +48,7 @@ module Vore
51
48
  results[:pages_visited] += 1
52
49
 
53
50
  html_file = File.read(path).force_encoding("UTF-8")
51
+ rewritten_html_file = ""
54
52
 
55
53
  if html_file.empty?
56
54
  results[:pages_unprocessed] += 1
@@ -58,7 +56,13 @@ module Vore
58
56
  next
59
57
  end
60
58
 
61
- rewritten_html_file = @selma.rewrite(html_file)
59
+ begin
60
+ rewritten_html_file = @selma.rewrite(html_file)
61
+ rescue StandardError => e
62
+ Vore.logger.warn("Error rewriting #{path}: #{e}")
63
+ results[:pages_unprocessed] += 1
64
+ next
65
+ end
62
66
 
63
67
  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
64
68
  url_path = path.split(FILE_SEPERATOR)[3..].join("/")
@@ -83,6 +87,16 @@ module Vore
83
87
  # crawl_site(site)
84
88
  # end
85
89
 
90
+ def run_command(website, delay: 3500)
91
+ %x(#{@executable} \
92
+ --user-agent #{user_agent} \
93
+ --delay #{delay} \
94
+ --url #{website} \
95
+ download \
96
+ -t \
97
+ #{@output_dir})
98
+ end
99
+
86
100
  def user_agent
87
101
  "'Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)'"
88
102
  end
@@ -20,7 +20,6 @@ module Vore
20
20
 
21
21
  def handle_element(element)
22
22
  if element.tag_name == "pre" ||
23
- element.tag_name == "code" ||
24
23
  element.tag_name == "form" ||
25
24
  element.tag_name == "style" ||
26
25
  element.tag_name == "noscript" ||
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.2.5"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.3.0
5
5
  platform: x86_64-windows
6
6
  authors:
7
7
  - Garen J. Torikian