vore 0.3.0-arm64-darwin → 0.4.0-arm64-darwin

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 54a3f9525133d20d29eadd67c83ef0c996c3e85ae6843b3ea239e3cff0d9bea3
4
- data.tar.gz: 065af90bf1234459fe430a2c49882d988e4b1367786dc506a7cf571804812339
3
+ metadata.gz: 214aba124a131e6567e5d6eeba171d3efb63d9b74f73efe19e2e63730da5f510
4
+ data.tar.gz: 971b1d9b5e5adc36ba7de5b3e6451443b4b3124ef3e73b63f44c3dc96d9effa1
5
5
  SHA512:
6
- metadata.gz: 540ceeac482a7b9274161a5b64c7d5d0fd570dd3756934ae8a6dcaa760869b81c3f56d89b7c959f7a18c66fffe94c9b215efff8e648d5580aaeaa823e10405e3
7
- data.tar.gz: f1d56bc655ac4e42e720837d00df756d497f55ceca844edc4fd7ceb887736458afb02532b90a1fe0f95a7e8b1fe3e551edd3ea196fa8d995c3bd3a5f42db7f32
6
+ metadata.gz: c123316da7dfeba1c4f3b5f8c4a62411493770a3d726097ddefa23e06868a4ec7a38d853b8b973ec75ee7773383f302b671d83057a5ac0a343f801a4c90a3420
7
+ data.tar.gz: 6de3f0622a91ad15f63331075ed3b79746f4d894f9521c25dd8815779212d93492c10014e47cb36aa0c6bdaa1ae8c2feb5d6bafdc19cdc792f17eccc97db0e74
data/Cargo.lock CHANGED
@@ -1786,9 +1786,9 @@ dependencies = [
1786
1786
 
1787
1787
  [[package]]
1788
1788
  name = "spider"
1789
- version = "1.99.5"
1789
+ version = "1.99.8"
1790
1790
  source = "registry+https://github.com/rust-lang/crates.io-index"
1791
- checksum = "2f62dc0e4f32d36a931471a1694d5b5b29e916c537121ca8028742c9acbe510d"
1791
+ checksum = "525670cdc6aec8f4cb91da17ce0255050e89eb7c889272216d8a4fb644d67530"
1792
1792
  dependencies = [
1793
1793
  "ahash",
1794
1794
  "bytes",
@@ -1817,9 +1817,9 @@ dependencies = [
1817
1817
 
1818
1818
  [[package]]
1819
1819
  name = "spider_cli"
1820
- version = "1.99.5"
1820
+ version = "1.99.8"
1821
1821
  source = "registry+https://github.com/rust-lang/crates.io-index"
1822
- checksum = "6031d46576f5fdba52d5c054a8c69ba7ee17cc824563b764fce4c8471c15c3a1"
1822
+ checksum = "9bd9d95178dc0715608d5f28501c8321de3e14b40046c6584a12ffce96f0a676"
1823
1823
  dependencies = [
1824
1824
  "clap",
1825
1825
  "env_logger",
data/README.md CHANGED
@@ -18,13 +18,75 @@ If bundler is not being used to manage dependencies, install the gem by executin
18
18
  ## Usage
19
19
 
20
20
  ```ruby
21
- crawler = Vore::Crawler.new
22
- crawler.scrape_each_page("https://choosealicense.com") do |page|
23
- puts page
24
- end
21
+ crawler = Vore::Crawler.new
22
+ crawler.scrape_each_page("https://choosealicense.com") do |page|
23
+ puts page
24
+ end
25
25
  ```
26
26
 
27
- Each `page` is simply every text node. The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
27
+ Each `page` is a simple class consisting of the following values:
28
+
29
+ * `content`: the text of the HTML document, sans tags
30
+ * `title`: the title of the HTML document (if any)
31
+ * `meta`: the document's meta tags (if any)
32
+ * `path`: the document's path
33
+
34
+ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
35
+
36
+ ### Configuration
37
+
38
+ | Name | Description | Default |
39
+ | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
40
+ | `delay` | A value (in milliseconds) which introduces an artificial delay when crawling. Useful for situations where there's rate limiting involved. | `0` |
41
+ | `output_dir` | Where the resulting HTML files are stored. | `"tmp/vore"` |
42
+ | `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
43
+ | `log_level` | The logging level. | `:warn` |
44
+
45
+ ### In tests
46
+
47
+ Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
48
+
49
+ You can override any of the existing methods to suit your application's needs. For example, if you prefer HTML to be generated by Faker, you can create and require a file that looks like the following:
50
+
51
+ ```ruby
52
+
53
+ require "vore/minitest_helper"
54
+
55
+ module Vore
56
+ module TestHelperExtension
57
+ DOCUMENT_TITLES = [
58
+ "Hello, I need help",
59
+ "I need to update my payment information",
60
+ ]
61
+ DOCUMENT_CONTENT = [
62
+ "Hey, I'm having trouble with my computer. Can you help me?",
63
+ # v--- always creates three page chunks
64
+ "I need to update my payment information. Like, now. Right now. Now. Can you help me? Please? Now?" + "Can you help me? Please? Now?" * 100,
65
+ ]
66
+
67
+ def content
68
+ @counter = -1 unless defined?(@counter)
69
+ @counter += 1
70
+
71
+ html = "<!DOCTYPE html><html><head><title>#{DOCUMENT_TITLES[@counter]}</title>"
72
+
73
+ meta_tag_count.times do # arbitrarily set to 5
74
+ html += "<meta name=\"#{Faker::Lorem.word}\" content=\"#{Faker::Lorem.word}\" />"
75
+ end
76
+
77
+ html += "</head><body>"
78
+
79
+ html += "<p>#{DOCUMENT_CONTENT[@counter]}</p>"
80
+
81
+ html += "</body></html>"
82
+
83
+ html
84
+ end
85
+ end
86
+
87
+ Vore::TestHelper.prepend(Vore::TestHelperExtension)
88
+ end
89
+ ```
28
90
 
29
91
  ## Development
30
92
 
data/exe/vore-spider CHANGED
Binary file
@@ -1,13 +1,16 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module Vole
3
+ module Vore
4
4
  class Configuration
5
5
  DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
6
6
  allow_doctype: false,
7
7
  })
8
8
 
9
9
  DEFAULT_OPTIONS = {
10
- delay: 3500,
10
+ delay: 0,
11
+ output_dir: "tmp/vore",
12
+ delete_after_yield: true,
13
+ log_level: :warn,
11
14
  }
12
15
  end
13
16
  end
data/lib/vore/crawler.rb CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  require_relative "handlers/content_extractor"
4
4
 
5
+ require "listen"
6
+
5
7
  module Vore
6
8
  # This is the class that starts and controls the crawling
7
9
  class Crawler
@@ -12,15 +14,22 @@ module Vore
12
14
 
13
15
  # Creates a crawler
14
16
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
15
- def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG, options: Vole::Configuration::DEFAULT_OPTIONS)
16
- @denylist_regexp = Regexp.union(denylist)
17
-
18
- @content_extractor = Vole::Handlers::ContentExtractor.new
17
+ def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
18
+ @content_extractor = Vore::Handlers::ContentExtractor.new
19
19
  @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
20
20
  ext = PLATFORM.include?("windows") ? ".exe" : ""
21
21
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
22
- @parent_output_dir = "tmp/vore"
23
- @options = options
22
+ @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
23
+ @parent_output_dir = @options[:output_dir]
24
+ @parent_output_dir_len = @parent_output_dir.to_s.split(FILE_SEPERATOR).size
25
+
26
+ Vore.logger.level = @options[:log_level]
27
+ Listen.logger = Vore.logger
28
+
29
+ @results = {
30
+ pages_visited: 0,
31
+ unprocessed_pages: [],
32
+ }
24
33
 
25
34
  return if File.exist?(@executable)
26
35
 
@@ -30,71 +39,82 @@ module Vore
30
39
 
31
40
  def scrape_each_page(website, &block)
32
41
  @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
33
- Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
34
-
35
- output = run_command(website, delay: @options[:delay])
42
+ FileUtils.rm_rf(@output_dir)
43
+ FileUtils.mkdir_p(@output_dir)
44
+
45
+ listener = Listen.to(@output_dir) do |_modified, added, _removed|
46
+ if added.any?
47
+ added.each do |path|
48
+ process_file(path, &block)
49
+ File.delete(path) if @options[:delete_after_yield]
50
+ end
51
+ end
52
+ end
53
+ listener.start
36
54
 
37
- Vore.logger.info("Vore finished crawling #{website}: #{output}")
55
+ Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
38
56
 
39
- results = {
40
- pages_visited: 0,
41
- pages_unprocessed: 0,
42
- unprocessed_pages: [],
43
- }
57
+ begin
58
+ run_command(website, delay: @options[:delay])
59
+ ensure
60
+ sleep(0.5) # give listener time to clean up
61
+ listener.stop
62
+ end
44
63
 
45
- Dir.glob(File.join(output_dir, "**", "*")).each do |path|
46
- next unless File.file?(path)
64
+ Vore.logger.info("Vore finished crawling #{website}")
47
65
 
48
- results[:pages_visited] += 1
66
+ @results
67
+ end
49
68
 
50
- html_file = File.read(path).force_encoding("UTF-8")
51
- rewritten_html_file = ""
69
+ def process_file(path, &block)
70
+ @results[:pages_visited] += 1
52
71
 
53
- if html_file.empty?
54
- results[:pages_unprocessed] += 1
55
- results[:unprocessed_pages] << path
56
- next
57
- end
72
+ html_file = File.read(path).force_encoding("UTF-8")
73
+ rewritten_html_file = ""
58
74
 
59
- begin
60
- rewritten_html_file = @selma.rewrite(html_file)
61
- rescue StandardError => e
62
- Vore.logger.warn("Error rewriting #{path}: #{e}")
63
- results[:pages_unprocessed] += 1
64
- next
65
- end
75
+ if html_file.empty?
76
+ @results[:unprocessed_pages] << path
77
+ return
78
+ end
66
79
 
67
- # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
68
- url_path = path.split(FILE_SEPERATOR)[3..].join("/")
80
+ begin
81
+ rewritten_html_file = @selma.rewrite(html_file)
82
+ rescue StandardError => e
83
+ Vore.logger.warn("Error rewriting #{path}: #{e}")
84
+ @results[:unprocessed_pages] << path
85
+ return
86
+ end
69
87
 
70
- page = Vore::PageData.new(
71
- content: rewritten_html_file,
72
- title: @content_extractor.title,
73
- meta: @content_extractor.meta,
74
- path: url_path,
75
- )
88
+ # drops the leading parts of the path (the configured output directory and the site name), leaving only the page's own path
89
+ url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
76
90
 
77
- yield page
78
- ensure
79
- File.delete(path) if File.file?(path)
80
- end
91
+ page = Vore::PageData.new(
92
+ content: rewritten_html_file,
93
+ title: @content_extractor.title,
94
+ meta: @content_extractor.meta,
95
+ path: url_path,
96
+ )
81
97
 
82
- results
98
+ yield page
83
99
  end
84
100
 
85
- # def crawl(site, block)
86
- # Vore.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
87
- # crawl_site(site)
88
- # end
89
-
90
- def run_command(website, delay: 3500)
91
- %x(#{@executable} \
92
- --user-agent #{user_agent} \
93
- --delay #{delay} \
94
- --url #{website} \
95
- download \
96
- -t \
97
- #{@output_dir})
101
+ def run_command(website, delay: 0)
102
+ pid = Process.spawn(
103
+ @executable,
104
+ "--user-agent",
105
+ user_agent,
106
+ "--delay",
107
+ delay.to_s,
108
+ "--url",
109
+ website,
110
+ "download",
111
+ "-t",
112
+ @output_dir,
113
+ )
114
+
115
+ _, _status = Process.waitpid2(pid)
116
+ rescue StandardError => e
117
+ Vore.logger.error(e)
98
118
  end
99
119
 
100
120
  def user_agent
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module Vole
3
+ module Vore
4
4
  module Handlers
5
5
  class ContentExtractor
6
6
  SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
data/lib/vore/logger.rb CHANGED
@@ -23,6 +23,10 @@ module Vore
23
23
  instance
24
24
  end
25
25
  end
26
+
27
+ def level=(level)
28
+ instance.level = level
29
+ end
26
30
  end
27
31
  end
28
32
  end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "minitest/mock"
4
+ require "net/http"
5
+
6
+ module Vore
7
+ module TestHelper
8
+ def run_command(website, **options)
9
+ loop_times.times do |time|
10
+ net_http = ::Minitest::Mock.new
11
+ response = ::Minitest::Mock.new
12
+ response.expect(:is_a?, true, [::Net::HTTPSuccess])
13
+
14
+ # we need to trigger an HTTP call to pretend that we're making
15
+ # an external request. this way, the gem hooks into VCR/Webmock
16
+ net_http.expect(:get, response)
17
+ html = content
18
+ response.expect(:body, html)
19
+ time_s = time.to_s
20
+ uri = URI("#{website}/#{time_s}")
21
+ Net::HTTP.get(uri)
22
+
23
+ file = File.join(@output_dir, time_s)
24
+ File.write("#{file}.html", html)
25
+ end
26
+ end
27
+
28
+ def loop_times=(times)
29
+ @loop_times = times
30
+ end
31
+
32
+ def loop_times
33
+ @loop_times ||= 5
34
+ end
35
+
36
+ def meta_tag_count=(count)
37
+ @meta_tag_count = count
38
+ end
39
+
40
+ def meta_tag_count
41
+ @meta_tag_count ||= 5
42
+ end
43
+
44
+ def generate_word
45
+ ("a".."z").to_a.sample(8).join
46
+ end
47
+
48
+ def generate_sentence
49
+ Array.new((5..15).to_a.sample) { generate_word }.join(" ")
50
+ end
51
+
52
+ def generate_path
53
+ Array.new((1..3).to_a.sample) { generate_word }.join("/")
54
+ end
55
+
56
+ def content
57
+ html = "<!DOCTYPE html><html><head><title>#{generate_word}</title>"
58
+ meta_tag_count.times do
59
+ html += "<meta name=\"#{generate_word}\" content=\"#{generate_word}\" />"
60
+ end
61
+
62
+ html += "</head><body>"
63
+
64
+ 50.times do
65
+ tagname = ["p", "h1", "h2", "h3", "h4", "h5", "h6"].sample
66
+ html += "<#{tagname}>#{generate_sentence}</#{tagname}>"
67
+ end
68
+
69
+ html += "</body></html>"
70
+ html
71
+ end
72
+ end
73
+
74
+ Vore::Crawler.prepend(Vore::TestHelper)
75
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.3.0"
4
+ VERSION = "0.4.0"
5
5
  end
metadata CHANGED
@@ -1,15 +1,30 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-18 00:00:00.000000000 Z
11
+ date: 2024-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: listen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.9'
27
+ force_ruby_platform: false
13
28
  - !ruby/object:Gem::Dependency
14
29
  name: selma
15
30
  requirement: !ruby/object:Gem::Requirement
@@ -47,6 +62,7 @@ files:
47
62
  - lib/vore/crawler.rb
48
63
  - lib/vore/handlers/content_extractor.rb
49
64
  - lib/vore/logger.rb
65
+ - lib/vore/minitest_helper.rb
50
66
  - lib/vore/page.rb
51
67
  - lib/vore/page_data.rb
52
68
  - lib/vore/version.rb