vore 0.2.0-x86_64-darwin → 0.2.2-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5c54c9d9a3d685f0545ebcec67d0aa30a3a6751d5ae31093197a021d24178040
4
- data.tar.gz: d961d2d7c2bbcf3fad014e4bf1eea9f5d7a692901dcea50f66c3bdccf64066f9
3
+ metadata.gz: 5e3cf9efca051fe9cb002a0fbae78e520880a13d90192416875174e208a42a53
4
+ data.tar.gz: 3d9c3a6ca47d3d7c0b1f15bb6311d54ab7e172390e55c7fad83b76480be03a31
5
5
  SHA512:
6
- metadata.gz: 1a9ae3ef5f6b227c86cb21ecef0377ab7582e96ca562505e83b127333c7cad7579c4e5b1525793cd54a4d97a42c626646fcfae4146b547e00373db339f4b9615
7
- data.tar.gz: ec1173ad2572e8bf07760360941493045d9e1d39912651cab8403be4418da6d39df42e9bdcd3d2f8d61ddf576324fbb73de7d252ca301a75f83920e77a89de79
6
+ metadata.gz: 016c40c00eb7914370c2bfaeea8fffcd99c5449a6b786d6c5e97802e7ee504c4ebef4c08758dc13fb5b27cb7ecf57d71b91fe30f9bdc150929973a30135cb45a
7
+ data.tar.gz: edefee484a4de8340d2e9171c5b5e7a8d1e433daf4920bca860df892c3471f45a91e9e2b94ca000200a32ad98d0ef5e7ce683d19b6f77a9f27ea8e335f9f541e
data/exe/vore-spider CHANGED
Binary file
data/lib/vore/crawler.rb CHANGED
@@ -17,7 +17,7 @@ module Vore
17
17
  @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
18
18
  ext = PLATFORM.include?("windows") ? ".exe" : ""
19
19
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
20
- @output_dir = "tmp/vore"
20
+ @parent_output_dir = "tmp/vore"
21
21
 
22
22
  return if File.exist?(@executable)
23
23
 
@@ -26,11 +26,12 @@ module Vore
26
26
  end
27
27
 
28
28
  def scrape_each_page(website, &block)
29
- output_dir = "#{@output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
29
+ output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
30
30
  Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
31
31
 
32
32
  output = %x(#{@executable} \
33
33
  --user-agent #{user_agent} \
34
+ --delay 3000
34
35
  --url #{website} \
35
36
  download \
36
37
  -t \
@@ -38,12 +39,25 @@ module Vore
38
39
 
39
40
  Vore.logger.info("Vore finished crawling #{website}: #{output}")
40
41
 
41
- Dir.glob("tmp/**/*").each do |path|
42
+ results = {
43
+ pages_visited: 0,
44
+ pages_unprocessed: 0,
45
+ unprocessed_pages: [],
46
+ }
47
+
48
+ Dir.glob(File.join(output_dir, "**", "*")).each do |path|
42
49
  next unless File.file?(path)
43
50
 
44
51
  html_file = File.read(path).force_encoding("UTF-8")
45
52
  rewritten_html_file = @selma.rewrite(html_file)
46
53
 
54
+ results[:pages_visited] += 1
55
+ if rewritten_html_file.empty?
56
+ results[:pages_unprocessed] += 1
57
+ results[:unprocessed_pages] << path
58
+ next
59
+ end
60
+
47
61
  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
48
62
  url_path = path.split(FILE_SEPERATOR)[3..].join("/")
49
63
 
@@ -58,6 +72,8 @@ module Vore
58
72
  ensure
59
73
  File.delete(path) if File.file?(path)
60
74
  end
75
+
76
+ results
61
77
  end
62
78
 
63
79
  # def crawl(site, block)
@@ -19,7 +19,13 @@ module Vole
19
19
  end
20
20
 
21
21
  def handle_element(element)
22
- if element.tag_name == "pre" || element.tag_name == "code" || element.tag_name == "script" || element.tag_name == "form"
22
+ if element.tag_name == "pre" ||
23
+ element.tag_name == "code" ||
24
+ element.tag_name == "form" ||
25
+ element.tag_name == "style" ||
26
+ element.tag_name == "noscript" ||
27
+ element.tag_name == "script" ||
28
+ element.tag_name == "svg"
23
29
  element.remove
24
30
  elsif element.tag_name == "title"
25
31
  @within_title = true
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.2.0"
4
+ VERSION = "0.2.2"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.2
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian