vore 0.2.6-arm64-darwin → 0.3.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 402d817b4979ae3cb7caee99653fc2ddfccf726b53f49c76dc078d54a9868af3
4
- data.tar.gz: 1768208e84f98fbbcb5e8bb683528f6f476c928d79c1e006cb1b6a8bf521cae7
3
+ metadata.gz: 54a3f9525133d20d29eadd67c83ef0c996c3e85ae6843b3ea239e3cff0d9bea3
4
+ data.tar.gz: 065af90bf1234459fe430a2c49882d988e4b1367786dc506a7cf571804812339
5
5
  SHA512:
6
- metadata.gz: '0091d5f55c923e91abfa57b01e7857072196c85e1053568b5799c715c6d188a27075a82c9393e9fdadade34ff02832c98780dcf9e1f70970f45db29a77fbc733'
7
- data.tar.gz: 5915151dbdd4d055f5cf6305dada2a0a82d817a8563ddffab52582f6c580903a2f29269b81c3591c407775ab522d66e55c5a901d3afb4640aa0e959c56f88e2c
6
+ metadata.gz: 540ceeac482a7b9274161a5b64c7d5d0fd570dd3756934ae8a6dcaa760869b81c3f56d89b7c959f7a18c66fffe94c9b215efff8e648d5580aaeaa823e10405e3
7
+ data.tar.gz: f1d56bc655ac4e42e720837d00df756d497f55ceca844edc4fd7ceb887736458afb02532b90a1fe0f95a7e8b1fe3e551edd3ea196fa8d995c3bd3a5f42db7f32
data/exe/vore-spider CHANGED
Binary file
@@ -5,5 +5,9 @@ module Vole
5
5
  DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
6
6
  allow_doctype: false,
7
7
  })
8
+
9
+ DEFAULT_OPTIONS = {
10
+ delay: 3500,
11
+ }
8
12
  end
9
13
  end
data/lib/vore/crawler.rb CHANGED
@@ -8,9 +8,11 @@ module Vore
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
9
  FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
10
10
 
11
+ attr_reader :output_dir
12
+
11
13
  # Creates a crawler
12
14
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
13
- def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
15
+ def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG, options: Vole::Configuration::DEFAULT_OPTIONS)
14
16
  @denylist_regexp = Regexp.union(denylist)
15
17
 
16
18
  @content_extractor = Vole::Handlers::ContentExtractor.new
@@ -18,6 +20,7 @@ module Vore
18
20
  ext = PLATFORM.include?("windows") ? ".exe" : ""
19
21
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
20
22
  @parent_output_dir = "tmp/vore"
23
+ @options = options
21
24
 
22
25
  return if File.exist?(@executable)
23
26
 
@@ -26,16 +29,10 @@ module Vore
26
29
  end
27
30
 
28
31
  def scrape_each_page(website, &block)
29
- output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
32
+ @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
30
33
  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
31
34
 
32
- output = %x(#{@executable} \
33
- --user-agent #{user_agent} \
34
- --delay 3500 \
35
- --url #{website} \
36
- download \
37
- -t \
38
- #{output_dir})
35
+ output = run_command(website, delay: @options[:delay])
39
36
 
40
37
  Vore.logger.info("Vore finished crawling #{website}: #{output}")
41
38
 
@@ -54,7 +51,6 @@ module Vore
54
51
  rewritten_html_file = ""
55
52
 
56
53
  if html_file.empty?
57
- Vore.logger.warn("HTML file empty: #{path}")
58
54
  results[:pages_unprocessed] += 1
59
55
  results[:unprocessed_pages] << path
60
56
  next
@@ -91,6 +87,16 @@ module Vore
91
87
  # crawl_site(site)
92
88
  # end
93
89
 
90
+ def run_command(website, delay: 3500)
91
+ %x(#{@executable} \
92
+ --user-agent #{user_agent} \
93
+ --delay #{delay} \
94
+ --url #{website} \
95
+ download \
96
+ -t \
97
+ #{@output_dir})
98
+ end
99
+
94
100
  def user_agent
95
101
  "'Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)'"
96
102
  end
@@ -20,7 +20,6 @@ module Vole
20
20
 
21
21
  def handle_element(element)
22
22
  if element.tag_name == "pre" ||
23
- element.tag_name == "code" ||
24
23
  element.tag_name == "form" ||
25
24
  element.tag_name == "style" ||
26
25
  element.tag_name == "noscript" ||
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.2.6"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.3.0
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian