vore 0.3.0-x86_64-linux → 0.5.0-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5fa0065651385809a53579488f9985cc3332197d4fa9818508859bb24274a16b
4
- data.tar.gz: a0754878a08d651215dd5df40220e3002255ef65fedcb365e0605ff228db27fd
3
+ metadata.gz: 8909a229b2e8ef2864c3009a3f652dd5e770ba476689c41af61ea0fc87429ca4
4
+ data.tar.gz: 02e47c30a1495eef5ddc3d63ac98114179830682bd08a31659013b4642b00d19
5
5
  SHA512:
6
- metadata.gz: 48298f9bf6de3e76b443a4ea46fa0adc687e4a39b90fc94ae6a97e0ce0153c6e983e292af0c8d6a53dcb44e94b8ecba94f8adc75fe5ce8cd7857e947ab113795
7
- data.tar.gz: 404712a662f24fff36346998e791cfac7c9ac44661005209f0a2a65b38146f0233dfcdc9cfc115c5772340d310d085dbadbca8766201fad810843359ccff1c98
6
+ metadata.gz: f67d01c4c685e5f0a2d475d4d412b408dfc90d1c4429744301c2b308eda51f2df8ebafebece24769cf3de8742072cad55c77d27d6586a3807deee59b0f2ef93d
7
+ data.tar.gz: 0b91cc2ab15d3e619f2a23ccb6af243bc8a103a3741d7f127845666fb14a957e88f7ff0a424b6a113fafbb3fdaf4d0baec6d48807630843252a3c6d0a03ef626
data/Cargo.lock CHANGED
@@ -238,7 +238,7 @@ version = "0.2.3"
238
238
  source = "registry+https://github.com/rust-lang/crates.io-index"
239
239
  checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
240
240
  dependencies = [
241
- "compact_str",
241
+ "compact_str 0.7.1",
242
242
  "serde",
243
243
  ]
244
244
 
@@ -346,6 +346,20 @@ dependencies = [
346
346
  "static_assertions",
347
347
  ]
348
348
 
349
+ [[package]]
350
+ name = "compact_str"
351
+ version = "0.8.0"
352
+ source = "registry+https://github.com/rust-lang/crates.io-index"
353
+ checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
354
+ dependencies = [
355
+ "castaway",
356
+ "cfg-if",
357
+ "itoa",
358
+ "rustversion",
359
+ "ryu",
360
+ "static_assertions",
361
+ ]
362
+
349
363
  [[package]]
350
364
  name = "cookie"
351
365
  version = "0.18.1"
@@ -1418,18 +1432,18 @@ dependencies = [
1418
1432
 
1419
1433
  [[package]]
1420
1434
  name = "rb-sys"
1421
- version = "0.9.98"
1435
+ version = "0.9.99"
1422
1436
  source = "registry+https://github.com/rust-lang/crates.io-index"
1423
- checksum = "8914b2e6af10bd50dd7aaac8c5146872d3924d6012929b4ff504e988f6badd24"
1437
+ checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
1424
1438
  dependencies = [
1425
1439
  "rb-sys-build",
1426
1440
  ]
1427
1441
 
1428
1442
  [[package]]
1429
1443
  name = "rb-sys-build"
1430
- version = "0.9.98"
1444
+ version = "0.9.99"
1431
1445
  source = "registry+https://github.com/rust-lang/crates.io-index"
1432
- checksum = "12af68c9757d419b82d65a12b5db538990dfe9416049fea3f0ba4b9a8ca108cd"
1446
+ checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
1433
1447
  dependencies = [
1434
1448
  "bindgen",
1435
1449
  "lazy_static",
@@ -1786,14 +1800,14 @@ dependencies = [
1786
1800
 
1787
1801
  [[package]]
1788
1802
  name = "spider"
1789
- version = "1.99.5"
1803
+ version = "1.99.11"
1790
1804
  source = "registry+https://github.com/rust-lang/crates.io-index"
1791
- checksum = "2f62dc0e4f32d36a931471a1694d5b5b29e916c537121ca8028742c9acbe510d"
1805
+ checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
1792
1806
  dependencies = [
1793
1807
  "ahash",
1794
1808
  "bytes",
1795
1809
  "case_insensitive_string",
1796
- "compact_str",
1810
+ "compact_str 0.8.0",
1797
1811
  "cssparser",
1798
1812
  "ego-tree",
1799
1813
  "fast_html5ever",
@@ -1817,9 +1831,9 @@ dependencies = [
1817
1831
 
1818
1832
  [[package]]
1819
1833
  name = "spider_cli"
1820
- version = "1.99.5"
1834
+ version = "1.99.11"
1821
1835
  source = "registry+https://github.com/rust-lang/crates.io-index"
1822
- checksum = "6031d46576f5fdba52d5c054a8c69ba7ee17cc824563b764fce4c8471c15c3a1"
1836
+ checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
1823
1837
  dependencies = [
1824
1838
  "clap",
1825
1839
  "env_logger",
data/README.md CHANGED
@@ -18,13 +18,98 @@ If bundler is not being used to manage dependencies, install the gem by executin
18
18
  ## Usage
19
19
 
20
20
  ```ruby
21
- crawler = Vore::Crawler.new
22
- crawler.scrape_each_page("https://choosealicense.com") do |page|
23
- puts page
24
- end
21
+ crawler = Vore::Crawler.new
22
+ crawler.scrape_each_page("https://choosealicense.com") do |page|
23
+ puts page
24
+ end
25
+ ```
26
+
27
+ Each `page` is a simple class consisting of the following values:
28
+
29
+ * `content`: the text of the HTML document, sans tags
30
+ * `title`: the title of the HTML document (if any)
31
+ * `meta`: the document's meta tags (if any)
32
+ * `path`: the document's path
33
+
34
+ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
35
+
36
+ ### Configuration
37
+
38
+ | Name | Description | Default |
39
+ | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
40
+ | `delay`                       | A value (in milliseconds) which introduces an artificial delay when crawling. Useful for situations where there's rate limiting involved.     | `0` |
41
+ | `output_dir` | Where the resulting HTML files are stored. | `"tmp/vore"` |
42
+ | `delete_after_yield` | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
43
+ | `log_level` | The logging level. | `:warn` |
44
+
45
+ ### Processing pages
46
+
47
+ Vore processes HTML using handlers. By default, there are two:
48
+
49
+ * The `MetaExtractor`, which extracts information from your `title` and `meta` tags
50
+ * The `TagRemover`, which removes unnecessary elements like `header`, `footer`, `script`
51
+
52
+ If you wish to process the HTML further, you can provide your own handler:
53
+
54
+
55
+ ```ruby
56
+ Vore::Crawler.new(handlers: [MySpecialHandler.new])
25
57
  ```
26
58
 
27
- Each `page` is simply every text node. The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
59
+ Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and defined first, but if you pass in anything to the `handlers` array, it'll overwrite Vore's other default handlers. You can of course choose to include them manually:
60
+
61
+
62
+ ```ruby
63
+ # preserve Vore's default content handler while adding your own;
64
+ # `MetaExtractor` is prefixed to the front
65
+ Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
66
+ ```
67
+
68
+ ### In tests
69
+
70
+ Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
71
+
72
+ You can overwrite any of the existing methods to suit your application's needs. For example, if you prefer HTML to be generated by Faker, you can create and require a file that looks like the following:
73
+
74
+ ```ruby
75
+
76
+ require "vore/minitest_helper"
77
+
78
+ module Vore
79
+ module TestHelperExtension
80
+ DOCUMENT_TITLES = [
81
+ "Hello, I need help",
82
+ "I need to update my payment information",
83
+ ]
84
+ DOCUMENT_CONTENT = [
85
+ "Hey, I'm having trouble with my computer. Can you help me?",
86
+ # v--- always creates three page chunks
87
+ "I need to update my payment information. Like, now. Right now. Now. Can you help me? Please? Now?" + "Can you help me? Please? Now?" * 100,
88
+ ]
89
+
90
+ def content
91
+ @counter = -1 unless defined?(@counter)
92
+ @counter += 1
93
+
94
+ html = "<!DOCTYPE html><html><head><title>#{DOCUMENT_TITLES[@counter]}</title>"
95
+
96
+ meta_tag_count.times do # arbitrarily set to 5
97
+ html += "<meta name=\"#{Faker::Lorem.word}\" content=\"#{Faker::Lorem.word}\" />"
98
+ end
99
+
100
+ html += "</head><body>"
101
+
102
+ html += "<p>#{DOCUMENT_CONTENT[@counter]}</p>"
103
+
104
+ html += "</body></html>"
105
+
106
+ html
107
+ end
108
+ end
109
+
110
+ Vore::TestHelper.prepend(Vore::TestHelperExtension)
111
+ end
112
+ ```
28
113
 
29
114
  ## Development
30
115
 
data/exe/vore-spider CHANGED
Binary file
@@ -1,13 +1,17 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module Vole
3
+ module Vore
4
4
  class Configuration
5
5
  DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
6
+ allow_comments: false,
6
7
  allow_doctype: false,
7
8
  })
8
9
 
9
10
  DEFAULT_OPTIONS = {
10
- delay: 3500,
11
+ delay: 0,
12
+ output_dir: "tmp/vore",
13
+ delete_after_yield: true,
14
+ log_level: :warn,
11
15
  }
12
16
  end
13
17
  end
data/lib/vore/crawler.rb CHANGED
@@ -1,6 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "handlers/content_extractor"
3
+ require_relative "handlers/meta_extractor"
4
+ require_relative "handlers/tag_remover"
5
+
6
+ require "listen"
4
7
 
5
8
  module Vore
6
9
  # This is the class that starts and controls the crawling
@@ -8,19 +11,33 @@ module Vore
8
11
  PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
9
12
  FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
10
13
 
11
- attr_reader :output_dir
14
+ attr_reader :handlers, :output_dir
12
15
 
13
16
  # Creates a crawler
14
17
  # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
15
- def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG, options: Vole::Configuration::DEFAULT_OPTIONS)
16
- @denylist_regexp = Regexp.union(denylist)
18
+ def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
19
+ @meta_extractor = Vore::Handlers::MetaExtractor.new
20
+
21
+ @handlers = if handlers.nil?
22
+ [@meta_extractor, Vore::Handlers::TagRemover.new]
23
+ else
24
+ handlers.unshift(@meta_extractor)
25
+ end
17
26
 
18
- @content_extractor = Vole::Handlers::ContentExtractor.new
19
- @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
27
+ @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
20
28
  ext = PLATFORM.include?("windows") ? ".exe" : ""
21
29
  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
22
- @parent_output_dir = "tmp/vore"
23
- @options = options
30
+ @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
31
+ @parent_output_dir = @options[:output_dir]
32
+ @parent_output_dir_len = @parent_output_dir.to_s.split(FILE_SEPERATOR).size
33
+
34
+ Vore.logger.level = @options[:log_level]
35
+ Listen.logger = Vore.logger
36
+
37
+ @results = {
38
+ pages_visited: 0,
39
+ unprocessed_pages: [],
40
+ }
24
41
 
25
42
  return if File.exist?(@executable)
26
43
 
@@ -30,71 +47,84 @@ module Vore
30
47
 
31
48
  def scrape_each_page(website, &block)
32
49
  @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
33
- Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
50
+ FileUtils.rm_rf(@output_dir)
51
+ FileUtils.mkdir_p(@output_dir)
52
+
53
+ listener = Listen.to(@output_dir) do |_modified, added, _removed|
54
+ if added.any?
55
+ added.each do |path|
56
+ process_file(path, &block)
57
+ File.delete(path) if @options[:delete_after_yield]
58
+ end
59
+ end
60
+ end
61
+ listener.start
34
62
 
35
- output = run_command(website, delay: @options[:delay])
63
+ Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
36
64
 
37
- Vore.logger.info("Vore finished crawling #{website}: #{output}")
65
+ begin
66
+ run_command(website, delay: @options[:delay])
67
+ ensure
68
+ sleep(0.5) # give listener time to clean up
69
+ listener.stop
70
+ end
38
71
 
39
- results = {
40
- pages_visited: 0,
41
- pages_unprocessed: 0,
42
- unprocessed_pages: [],
43
- }
72
+ Vore.logger.info("Vore finished crawling #{website}")
44
73
 
45
- Dir.glob(File.join(output_dir, "**", "*")).each do |path|
46
- next unless File.file?(path)
74
+ @results
75
+ end
47
76
 
48
- results[:pages_visited] += 1
77
+ def process_file(path, &block)
78
+ @results[:pages_visited] += 1
49
79
 
50
- html_file = File.read(path).force_encoding("UTF-8")
51
- rewritten_html_file = ""
80
+ html_file = File.read(path).force_encoding("UTF-8")
52
81
 
53
- if html_file.empty?
54
- results[:pages_unprocessed] += 1
55
- results[:unprocessed_pages] << path
56
- next
57
- end
82
+ if html_file.empty?
83
+ @results[:unprocessed_pages] << path
84
+ return
85
+ end
58
86
 
59
- begin
60
- rewritten_html_file = @selma.rewrite(html_file)
61
- rescue StandardError => e
62
- Vore.logger.warn("Error rewriting #{path}: #{e}")
63
- results[:pages_unprocessed] += 1
64
- next
65
- end
87
+ rewritten_html_file = @selma.rewrite(html_file)
88
+ return if rewritten_html_file.empty?
66
89
 
67
- # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
68
- url_path = path.split(FILE_SEPERATOR)[3..].join("/")
90
+ # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
91
+ url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
69
92
 
70
- page = Vore::PageData.new(
71
- content: rewritten_html_file,
72
- title: @content_extractor.title,
73
- meta: @content_extractor.meta,
74
- path: url_path,
75
- )
93
+ page = Vore::PageData.new(
94
+ content: rewritten_html_file,
95
+ title: @meta_extractor.title,
96
+ meta: @meta_extractor.meta,
97
+ path: url_path,
98
+ )
76
99
 
77
- yield page
78
- ensure
79
- File.delete(path) if File.file?(path)
80
- end
100
+ yield page
101
+ end
81
102
 
82
- results
103
+ def rewrite(html_file)
104
+ @selma.rewrite(html_file)
105
+ rescue StandardError => e
106
+ Vore.logger.warn("Error rewriting #{path}: #{e}")
107
+ @results[:unprocessed_pages] << path
108
+ ""
83
109
  end
84
110
 
85
- # def crawl(site, block)
86
- # Vore.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
87
- # crawl_site(site)
88
- # end
89
-
90
- def run_command(website, delay: 3500)
91
- %x(#{@executable} \
92
- --user-agent #{user_agent} \
93
- --delay #{delay} \
94
- --url #{website} \
95
- download \
96
- -t \
97
- #{@output_dir})
111
+ def run_command(website, delay: 0)
112
+ pid = Process.spawn(
113
+ @executable,
114
+ "--user-agent",
115
+ user_agent,
116
+ "--delay",
117
+ delay.to_s,
118
+ "--url",
119
+ website,
120
+ "download",
121
+ "-t",
122
+ @output_dir,
123
+ )
124
+
125
+ _, _status = Process.waitpid2(pid)
126
+ rescue StandardError => e
127
+ Vore.logger.error(e)
98
128
  end
99
129
 
100
130
  def user_agent
@@ -1,8 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- module Vole
3
+ module Vore
4
4
  module Handlers
5
- class ContentExtractor
5
+ class MetaExtractor
6
6
  SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
7
7
 
8
8
  attr_reader :title, :meta
@@ -19,22 +19,14 @@ module Vole
19
19
  end
20
20
 
21
21
  def handle_element(element)
22
- if element.tag_name == "pre" ||
23
- element.tag_name == "form" ||
24
- element.tag_name == "style" ||
25
- element.tag_name == "noscript" ||
26
- element.tag_name == "script" ||
27
- element.tag_name == "svg"
28
- element.remove
29
- elsif element.tag_name == "title"
22
+ if element.tag_name == "title"
30
23
  @within_title = true
24
+
31
25
  element.remove
32
26
  elsif element.tag_name == "meta"
33
27
  return if element.attributes["name"].nil?
34
28
 
35
29
  @meta[element.attributes["name"]] = element.attributes["content"]
36
- else
37
- element.remove_and_keep_content
38
30
  end
39
31
  end
40
32
 
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Vore
4
+ module Handlers
5
+ class TagRemover
6
+ SELECTOR = Selma::Selector.new(match_element: "*")
7
+
8
+ def selector
9
+ SELECTOR
10
+ end
11
+
12
+ UNNECESSARY_TAGS = [
13
+ # Remove code elements
14
+ "pre",
15
+
16
+ # Remove unnecessary elements
17
+ "head",
18
+
19
+ "form",
20
+ "style",
21
+ "noscript",
22
+ "script",
23
+ "svg",
24
+
25
+ # Remove unnecessary nav elements
26
+ "header",
27
+ "footer",
28
+ "nav",
29
+ "aside",
30
+ ]
31
+
32
+ CONTENT_TO_KEEP = [
33
+ "html",
34
+ "body",
35
+ ]
36
+
37
+ def handle_element(element)
38
+ if UNNECESSARY_TAGS.include?(element.tag_name)
39
+ element.remove
40
+ elsif CONTENT_TO_KEEP.include?(element.tag_name)
41
+ element.remove_and_keep_content
42
+ end
43
+ end
44
+ end
45
+ end
46
+ end
data/lib/vore/logger.rb CHANGED
@@ -23,6 +23,10 @@ module Vore
23
23
  instance
24
24
  end
25
25
  end
26
+
27
+ def level=(level)
28
+ instance.level = level
29
+ end
26
30
  end
27
31
  end
28
32
  end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "minitest/mock"
4
+ require "net/http"
5
+
6
+ module Vore
7
+ module TestHelper
8
+ def run_command(website, **options)
9
+ loop_times.times do |time|
10
+ net_http = ::Minitest::Mock.new
11
+ response = ::Minitest::Mock.new
12
+ response.expect(:is_a?, true, [::Net::HTTPSuccess])
13
+
14
+ # we need to trigger an HTTP call to pretend that we're making
15
+ # an external request. this way, the gem hooks into VCR/Webmock
16
+ net_http.expect(:get, response)
17
+ html = content
18
+ response.expect(:body, html)
19
+ time_s = time.to_s
20
+ uri = URI("#{website}/#{time_s}")
21
+ Net::HTTP.get(uri)
22
+
23
+ file = File.join(@output_dir, time_s)
24
+ File.write("#{file}.html", html)
25
+ end
26
+ end
27
+
28
+ def loop_times=(times)
29
+ @loop_times = times
30
+ end
31
+
32
+ def loop_times
33
+ @loop_times ||= 5
34
+ end
35
+
36
+ def meta_tag_count=(count)
37
+ @meta_tag_count = count
38
+ end
39
+
40
+ def meta_tag_count
41
+ @meta_tag_count ||= 5
42
+ end
43
+
44
+ def generate_word
45
+ ("a".."z").to_a.sample(8).join
46
+ end
47
+
48
+ def generate_sentence
49
+ Array.new((5..15).to_a.sample) { generate_word }.join(" ")
50
+ end
51
+
52
+ def generate_path
53
+ Array.new((1..3).to_a.sample) { generate_word }.join("/")
54
+ end
55
+
56
+ def content
57
+ html = "<!DOCTYPE html><html><head><title>#{generate_word}</title>"
58
+ meta_tag_count.times do
59
+ html += "<meta name=\"#{generate_word}\" content=\"#{generate_word}\" />"
60
+ end
61
+
62
+ html += "</head><body>"
63
+
64
+ 50.times do
65
+ tagname = ["p", "h1", "h2", "h3", "h4", "h5", "h6"].sample
66
+ html += "<#{tagname}>#{generate_sentence}</#{tagname}>"
67
+ end
68
+
69
+ html += "</body></html>"
70
+ html
71
+ end
72
+ end
73
+
74
+ Vore::Crawler.prepend(Vore::TestHelper)
75
+ end
data/lib/vore/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Vore
4
- VERSION = "0.3.0"
4
+ VERSION = "0.5.0"
5
5
  end
metadata CHANGED
@@ -1,15 +1,30 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: vore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.5.0
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Garen J. Torikian
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-18 00:00:00.000000000 Z
11
+ date: 2024-07-29 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: listen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.9'
27
+ force_ruby_platform: false
13
28
  - !ruby/object:Gem::Dependency
14
29
  name: selma
15
30
  requirement: !ruby/object:Gem::Requirement
@@ -45,8 +60,10 @@ files:
45
60
  - lib/vore.rb
46
61
  - lib/vore/configuration.rb
47
62
  - lib/vore/crawler.rb
48
- - lib/vore/handlers/content_extractor.rb
63
+ - lib/vore/handlers/meta_extractor.rb
64
+ - lib/vore/handlers/tag_remover.rb
49
65
  - lib/vore/logger.rb
66
+ - lib/vore/minitest_helper.rb
50
67
  - lib/vore/page.rb
51
68
  - lib/vore/page_data.rb
52
69
  - lib/vore/version.rb