vore 0.2.4-x86_64-windows → 0.2.8-x86_64-windows

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 571c472b2c7e94a786883d6005c5a913e87581762f7ee90bdd915ec366cda335
4
- data.tar.gz: 55cf52342b9b3b335469b7ac527f0c9bd9c13f86858591b015920106b5a7b70c
3
+ metadata.gz: 85a50437d0557e28ff3eeb155a8e527163aeb5f90fd98f0e5cb8e09e3d81bb6e
4
+ data.tar.gz: 61d1255a042db43b6e50bc749fccceb452080afd07e1eac04461ef4894fb8027
5
5
  SHA512:
6
- metadata.gz: 3d8adc4b1cad88301ca10d57b654781ff4c7a66a75fb5e8cec08aa21469de51568162c22615001f60881a3a6b0efe72606ba1ca8d4846e77b2fd1527a2906eb2
7
- data.tar.gz: 6db4625ac4e0d7c586c1e12b5012494eab0fb74a75a441cd38f49873e09b8d2d6b9b532d227fdd2807b7cf9f6edb153c3a4798e737b4913cd51aa3d855db4b0d
6
+ metadata.gz: d5c8a45adf9b4402d3600ae26728afb89b5e4d24953ad905462ad6b7c2d682a2806ed452673c486000621e5bf0373de6dd85a28ba592cb2e5eb1e86d83eb973f
7
+ data.tar.gz: 0077fcc1c2173c46be332da7727f6560a1cf8b2351491bc147b0d1111cefc82dc8bf8081e0f2370e2f7432d1905b2419e8bc168294dfef8a4387d609d512dae8
data/exe/vore-spider.exe CHANGED
Binary file
data/lib/vore/crawler.rb CHANGED
@@ -8,6 +8,8 @@ module Vore
8
8
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
9
  FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
10
10
 
11
+ attr_reader :output_dir
12
+
11
13
  # Creates a crawler
12
14
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
13
15
  def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG)
@@ -26,16 +28,10 @@ module Vore
26
28
  end
27
29
 
28
30
  def scrape_each_page(website, &block)
29
- output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
31
+ @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
30
32
  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
31
33
 
32
- output = %x(#{@executable} \
33
- --user-agent #{user_agent} \
34
- --delay 3000 \
35
- --url #{website} \
36
- download \
37
- -t \
38
- #{output_dir})
34
+ output = run_command(website, @output_dir)
39
35
 
40
36
  Vore.logger.info("Vore finished crawling #{website}: #{output}")
41
37
 
@@ -48,16 +44,25 @@ module Vore
48
44
  Dir.glob(File.join(output_dir, "**", "*")).each do |path|
49
45
  next unless File.file?(path)
50
46
 
47
+ results[:pages_visited] += 1
48
+
51
49
  html_file = File.read(path).force_encoding("UTF-8")
52
- rewritten_html_file = @selma.rewrite(html_file)
50
+ rewritten_html_file = ""
53
51
 
54
- results[:pages_visited] += 1
55
- if rewritten_html_file.empty?
52
+ if html_file.empty?
56
53
  results[:pages_unprocessed] += 1
57
54
  results[:unprocessed_pages] << path
58
55
  next
59
56
  end
60
57
 
58
+ begin
59
+ rewritten_html_file = @selma.rewrite(html_file)
60
+ rescue StandardError => e
61
+ Vore.logger.warn("Error rewriting #{path}: #{e}")
62
+ results[:pages_unprocessed] += 1
63
+ next
64
+ end
65
+
61
66
  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
62
67
  url_path = path.split(FILE_SEPERATOR)[3..].join("/")
63
68
 
@@ -81,6 +86,16 @@ module Vore
81
86
  # crawl_site(site)
82
87
  # end
83
88
 
89
+ def run_command(website, output_dir)
90
+ %x(#{@executable} \
91
+ --user-agent #{user_agent} \
92
+ --delay 3500 \
93
+ --url #{website} \
94
+ download \
95
+ -t \
96
+ #{output_dir})
97
+ end
98
+
84
99
  def user_agent
85
100
  "'Mozilla/5.0 (compatible; Vore/#{Vore::VERSION}; +https://github.com/gjtorikian/vore)'"
86
101
  end
@@ -20,7 +20,6 @@ module Vole
20
20
 
21
21
  def handle_element(element)
22
22
  if element.tag_name == "pre" ||
23
- element.tag_name == "code" ||
24
23
  element.tag_name == "form" ||
25
24
  element.tag_name == "style" ||
26
25
  element.tag_name == "noscript" ||
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.2.4"
4
+ VERSION = "0.2.8"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.8
5
5
  platform: x86_64-windows
6
6
  authors:
7
7
  - Garen J. Torikian