RubyGems - vore - Versions diffs - 0.3.0-x86_64-linux → 0.5.0-x86_64-linux - Mend

vore 0.3.0-x86_64-linux → 0.5.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/Cargo.lock +24 -10
data/README.md +90 -5
data/exe/vore-spider +0 -0
data/lib/vore/configuration.rb +6 -2
data/lib/vore/crawler.rb +89 -59
data/lib/vore/handlers/{content_extractor.rb → meta_extractor.rb} +4 -12
data/lib/vore/handlers/tag_remover.rb +46 -0
data/lib/vore/logger.rb +4 -0
data/lib/vore/minitest_helper.rb +75 -0
data/lib/vore/version.rb +1 -1
metadata +20 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5fa0065651385809a53579488f9985cc3332197d4fa9818508859bb24274a16b
-  data.tar.gz: a0754878a08d651215dd5df40220e3002255ef65fedcb365e0605ff228db27fd
+  metadata.gz: 8909a229b2e8ef2864c3009a3f652dd5e770ba476689c41af61ea0fc87429ca4
+  data.tar.gz: 02e47c30a1495eef5ddc3d63ac98114179830682bd08a31659013b4642b00d19
 SHA512:
-  metadata.gz: 48298f9bf6de3e76b443a4ea46fa0adc687e4a39b90fc94ae6a97e0ce0153c6e983e292af0c8d6a53dcb44e94b8ecba94f8adc75fe5ce8cd7857e947ab113795
-  data.tar.gz: 404712a662f24fff36346998e791cfac7c9ac44661005209f0a2a65b38146f0233dfcdc9cfc115c5772340d310d085dbadbca8766201fad810843359ccff1c98
+  metadata.gz: f67d01c4c685e5f0a2d475d4d412b408dfc90d1c4429744301c2b308eda51f2df8ebafebece24769cf3de8742072cad55c77d27d6586a3807deee59b0f2ef93d
+  data.tar.gz: 0b91cc2ab15d3e619f2a23ccb6af243bc8a103a3741d7f127845666fb14a957e88f7ff0a424b6a113fafbb3fdaf4d0baec6d48807630843252a3c6d0a03ef626

data/Cargo.lock CHANGED Viewed

@@ -238,7 +238,7 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
 dependencies = [
- "compact_str",
+ "compact_str 0.7.1",
  "serde",
 ]
@@ -346,6 +346,20 @@ dependencies = [
  "static_assertions",
 ]
+[[package]]
+name = "compact_str"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
+dependencies = [
+ "castaway",
+ "cfg-if",
+ "itoa",
+ "rustversion",
+ "ryu",
+ "static_assertions",
+]
 [[package]]
 name = "cookie"
 version = "0.18.1"
@@ -1418,18 +1432,18 @@ dependencies = [
 [[package]]
 name = "rb-sys"
-version = "0.9.98"
+version = "0.9.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8914b2e6af10bd50dd7aaac8c5146872d3924d6012929b4ff504e988f6badd24"
+checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
 dependencies = [
  "rb-sys-build",
 ]
 [[package]]
 name = "rb-sys-build"
-version = "0.9.98"
+version = "0.9.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12af68c9757d419b82d65a12b5db538990dfe9416049fea3f0ba4b9a8ca108cd"
+checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
 dependencies = [
  "bindgen",
  "lazy_static",
@@ -1786,14 +1800,14 @@ dependencies = [
 [[package]]
 name = "spider"
-version = "1.99.5"
+version = "1.99.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2f62dc0e4f32d36a931471a1694d5b5b29e916c537121ca8028742c9acbe510d"
+checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
 dependencies = [
  "ahash",
  "bytes",
  "case_insensitive_string",
- "compact_str",
+ "compact_str 0.8.0",
  "cssparser",
  "ego-tree",
  "fast_html5ever",
@@ -1817,9 +1831,9 @@ dependencies = [
 [[package]]
 name = "spider_cli"
-version = "1.99.5"
+version = "1.99.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6031d46576f5fdba52d5c054a8c69ba7ee17cc824563b764fce4c8471c15c3a1"
+checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
 dependencies = [
  "clap",
  "env_logger",

data/README.md CHANGED Viewed

@@ -18,13 +18,98 @@ If bundler is not being used to manage dependencies, install the gem by executin
 ## Usage
 ```ruby
-    crawler = Vore::Crawler.new
-    crawler.scrape_each_page("https://choosealicense.com") do |page|
-      puts page
-    end
+crawler = Vore::Crawler.new
+crawler.scrape_each_page("https://choosealicense.com") do |page|
+  puts page
+end
+```
+Each `page` is a simple class consisting of the following values:
+* `content`: the text of the HTML document, sans tags
+* `title`: the title of the HTML document (if any)
+* `meta`: the document's meta tags (if any)
+* `path`: the document's path
+The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
+### Configuration
+| Name                          | Description                                                                                                                                 | Default |
+| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
+| `delay`                       | A value (in milliseconds) which introduces an artificial delay when crawling. Useful for situations where there's rate limiting involved. | `0` |
+| `output_dir`                  | Where the resulting HTML files are stored. | `"tmp/vore"`    |
+| `delete_after_yield`          | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
+| `log_level`                   | The logging level. | `:warn` |
+### Processing pages
+Vore processes HTML using handlers. By default, there are two:
+* The `MetaExtractor`, which extracts information from your `title` and `meta` tags
+* The `TagRemover`, which removes unnecessary elements like `header`, `footer`, `script`
+If you wish to process the HTML further, you can provide your own handler:
+```ruby
+Vore::Crawler.new(handlers: [MySpecialHandler.new])
 ```
-Each `page` is simply every text node. The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), so you know it's fast.
+Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and defined first, but if you pass anything in the `handlers` array, it will replace Vore's other default handlers. You can of course choose to include them manually:
+```ruby
+# preserve Vore's default content handler while adding your own;
+# `MetaExtractor` is prefixed to the front
+Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
+```
+### In tests
+Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
+You can override any of the existing methods to suit your application's needs. For example, if you prefer HTML to be generated by Faker, you can create and require a file that looks like the following:
+```ruby
+require "vore/minitest_helper"
+module Vore
+  module TestHelperExtension
+    DOCUMENT_TITLES = [
+      "Hello, I need help",
+      "I need to update my payment information",
+    ]
+    DOCUMENT_CONTENT = [
+      "Hey, I'm having trouble with my computer. Can you help me?",
+      # v--- always creates three page chunks
+      "I need to update my payment information. Like, now. Right now. Now. Can you help me? Please? Now?" + "Can you help me? Please? Now?" * 100,
+    ]
+    def content
+      @counter = -1 unless defined?(@counter)
+      @counter += 1
+      html = "<!DOCTYPE html><html><head><title>#{DOCUMENT_TITLES[@counter]}</title>"
+      meta_tag_count.times do # arbitrarily set to 5
+        html += "<meta name=\"#{Faker::Lorem.word}\" content=\"#{Faker::Lorem.word}\" />"
+      end
+      html += "</head><body>"
+      html += "<p>#{DOCUMENT_CONTENT[@counter]}</p>"
+      html += "</body></html>"
+      html
+    end
+  end
+  Vore::TestHelper.prepend(Vore::TestHelperExtension)
+end
+```
 ## Development

data/exe/vore-spider CHANGED Viewed

Binary file

data/lib/vore/configuration.rb CHANGED Viewed

@@ -1,13 +1,17 @@
 # frozen_string_literal: true
-module Vole
+module Vore
   class Configuration
     DEFAULT_SANITIZATION_CONFIG = Selma::Sanitizer::Config::RELAXED.dup.merge({
+      allow_comments: false,
       allow_doctype: false,
     })
     DEFAULT_OPTIONS = {
-      delay: 3500,
+      delay: 0,
+      output_dir: "tmp/vore",
+      delete_after_yield: true,
+      log_level: :warn,
     }
   end
 end

data/lib/vore/crawler.rb CHANGED Viewed

@@ -1,6 +1,9 @@
 # frozen_string_literal: true
-require_relative "handlers/content_extractor"
+require_relative "handlers/meta_extractor"
+require_relative "handlers/tag_remover"
+require "listen"
 module Vore
   # This is the class that starts and controls the crawling
@@ -8,19 +11,33 @@ module Vore
     PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
     FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
-    attr_reader :output_dir
+    attr_reader :handlers, :output_dir
     # Creates a crawler
     # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
-    def initialize(denylist: /a^/, sanitization_config: Vole::Configuration::DEFAULT_SANITIZATION_CONFIG, options: Vole::Configuration::DEFAULT_OPTIONS)
-      @denylist_regexp = Regexp.union(denylist)
+    def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
+      @meta_extractor = Vore::Handlers::MetaExtractor.new
+      @handlers = if handlers.nil?
+        [@meta_extractor, Vore::Handlers::TagRemover.new]
+      else
+        handlers.unshift(@meta_extractor)
+      end
-      @content_extractor = Vole::Handlers::ContentExtractor.new
-      @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: [@content_extractor])
+      @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
       ext = PLATFORM.include?("windows") ? ".exe" : ""
       @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
-      @parent_output_dir = "tmp/vore"
-      @options = options
+      @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
+      @parent_output_dir = @options[:output_dir]
+      @parent_output_dir_len = @parent_output_dir.to_s.split(FILE_SEPERATOR).size
+      Vore.logger.level = @options[:log_level]
+      Listen.logger = Vore.logger
+      @results = {
+        pages_visited: 0,
+        unprocessed_pages: [],
+      }
       return if File.exist?(@executable)
@@ -30,71 +47,84 @@ module Vore
     def scrape_each_page(website, &block)
       @output_dir = "#{@parent_output_dir}/#{website.gsub(/[^a-zA-Z0-9]/, "_").squeeze("_")}"
-      Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
+      FileUtils.rm_rf(@output_dir)
+      FileUtils.mkdir_p(@output_dir)
+      listener = Listen.to(@output_dir) do |_modified, added, _removed|
+        if added.any?
+          added.each do |path|
+            process_file(path, &block)
+            File.delete(path) if @options[:delete_after_yield]
+          end
+        end
+      end
+      listener.start
-      output = run_command(website, delay: @options[:delay])
+      Vore.logger.info("Vore started crawling #{website}, outputting to #{output_dir}")
-      Vore.logger.info("Vore finished crawling #{website}: #{output}")
+      begin
+        run_command(website, delay: @options[:delay])
+      ensure
+        sleep(0.5) # give listener time to clean up
+        listener.stop
+      end
-      results = {
-        pages_visited: 0,
-        pages_unprocessed: 0,
-        unprocessed_pages: [],
-      }
+      Vore.logger.info("Vore finished crawling #{website}")
-      Dir.glob(File.join(output_dir, "**", "*")).each do |path|
-        next unless File.file?(path)
+      @results
+    end
-        results[:pages_visited] += 1
+    def process_file(path, &block)
+      @results[:pages_visited] += 1
-        html_file = File.read(path).force_encoding("UTF-8")
-        rewritten_html_file = ""
+      html_file = File.read(path).force_encoding("UTF-8")
-        if html_file.empty?
-          results[:pages_unprocessed] += 1
-          results[:unprocessed_pages] << path
-          next
-        end
+      if html_file.empty?
+        @results[:unprocessed_pages] << path
+        return
+      end
-        begin
-          rewritten_html_file = @selma.rewrite(html_file)
-        rescue StandardError => e
-          Vore.logger.warn("Error rewriting #{path}: #{e}")
-          results[:pages_unprocessed] += 1
-          next
-        end
+      rewritten_html_file = @selma.rewrite(html_file)
+      return if rewritten_html_file.empty?
-        # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
-        url_path = path.split(FILE_SEPERATOR)[3..].join("/")
+      # drops the leading parts of the path: the segments of the configured output dir, plus the site name
+      url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
-        page = Vore::PageData.new(
-          content: rewritten_html_file,
-          title: @content_extractor.title,
-          meta: @content_extractor.meta,
-          path: url_path,
-        )
+      page = Vore::PageData.new(
+        content: rewritten_html_file,
+        title: @meta_extractor.title,
+        meta: @meta_extractor.meta,
+        path: url_path,
+      )
-        yield page
-      ensure
-        File.delete(path) if File.file?(path)
-      end
+      yield page
+    end
-      results
+    def rewrite(html_file)
+      @selma.rewrite(html_file)
+    rescue StandardError => e
+      Vore.logger.warn("Error rewriting #{path}: #{e}")
+      @results[:unprocessed_pages] << path
+      ""
     end
-    # def crawl(site, block)
-    #   Vore.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
-    #   crawl_site(site)
-    # end
-    def run_command(website, delay: 3500)
-      %x(#{@executable} \
-        --user-agent #{user_agent} \
-        --delay #{delay} \
-        --url #{website} \
-        download \
-        -t \
-        #{@output_dir})
+    def run_command(website, delay: 0)
+      pid = Process.spawn(
+        @executable,
+        "--user-agent",
+        user_agent,
+        "--delay",
+        delay.to_s,
+        "--url",
+        website,
+        "download",
+        "-t",
+        @output_dir,
+      )
+      _, _status = Process.waitpid2(pid)
+    rescue StandardError => e
+      Vore.logger.error(e)
     end
     def user_agent

data/lib/vore/handlers/{content_extractor.rb → meta_extractor.rb} RENAMED Viewed

@@ -1,8 +1,8 @@
 # frozen_string_literal: true
-module Vole
+module Vore
   module Handlers
-    class ContentExtractor
+    class MetaExtractor
       SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
       attr_reader :title, :meta
@@ -19,22 +19,14 @@ module Vole
       end
       def handle_element(element)
-        if element.tag_name == "pre" ||
-            element.tag_name == "form" ||
-            element.tag_name == "style" ||
-            element.tag_name == "noscript" ||
-            element.tag_name == "script" ||
-            element.tag_name == "svg"
-          element.remove
-        elsif element.tag_name == "title"
+        if element.tag_name == "title"
           @within_title = true
           element.remove
         elsif element.tag_name == "meta"
           return if element.attributes["name"].nil?
           @meta[element.attributes["name"]] = element.attributes["content"]
-        else
-          element.remove_and_keep_content
         end
       end

data/lib/vore/handlers/tag_remover.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+module Vore
+  module Handlers
+    class TagRemover
+      SELECTOR = Selma::Selector.new(match_element: "*")
+      def selector
+        SELECTOR
+      end
+      UNNECESSARY_TAGS = [
+        # Remove code elements
+        "pre",
+        # Remove unnecessary elements
+        "head",
+        "form",
+        "style",
+        "noscript",
+        "script",
+        "svg",
+        # Remove unnecessary nav elements
+        "header",
+        "footer",
+        "nav",
+        "aside",
+      ]
+      CONTENT_TO_KEEP = [
+        "html",
+        "body",
+      ]
+      def handle_element(element)
+        if UNNECESSARY_TAGS.include?(element.tag_name)
+          element.remove
+        elsif CONTENT_TO_KEEP.include?(element.tag_name)
+          element.remove_and_keep_content
+        end
+      end
+    end
+  end
+end

data/lib/vore/logger.rb CHANGED Viewed

@@ -23,6 +23,10 @@ module Vore
           instance
         end
       end
+      def level=(level)
+        instance.level = level
+      end
     end
   end
 end

data/lib/vore/minitest_helper.rb ADDED Viewed

@@ -0,0 +1,75 @@
+# frozen_string_literal: true
+require "minitest/mock"
+require "net/http"
+module Vore
+  module TestHelper
+    def run_command(website, **options)
+      loop_times.times do |time|
+        net_http = ::Minitest::Mock.new
+        response = ::Minitest::Mock.new
+        response.expect(:is_a?, true, [::Net::HTTPSuccess])
+        # we need to trigger an HTTP call to pretend that we're making
+        # an external request. this way, the gem hooks into VCR/Webmock
+        net_http.expect(:get, response)
+        html = content
+        response.expect(:body, html)
+        time_s = time.to_s
+        uri = URI("#{website}/#{time_s}")
+        Net::HTTP.get(uri)
+        file = File.join(@output_dir, time_s)
+        File.write("#{file}.html", html)
+      end
+    end
+    def loop_times=(times)
+      @loop_times = times
+    end
+    def loop_times
+      @loop_times ||= 5
+    end
+    def meta_tag_count=(count)
+      @meta_tag_count = count
+    end
+    def meta_tag_count
+      @meta_tag_count ||= 5
+    end
+    def generate_word
+      ("a".."z").to_a.sample(8).join
+    end
+    def generate_sentence
+      Array.new((5..15).to_a.sample) { generate_word }.join(" ")
+    end
+    def generate_path
+      Array.new((1..3).to_a.sample) { generate_word }.join("/")
+    end
+    def content
+      html = "<!DOCTYPE html><html><head><title>#{generate_word}</title>"
+      meta_tag_count.times do
+        html += "<meta name=\"#{generate_word}\" content=\"#{generate_word}\" />"
+      end
+      html += "</head><body>"
+      50.times do
+        tagname = ["p", "h1", "h2", "h3", "h4", "h5", "h6"].sample
+        html += "<#{tagname}>#{generate_sentence}</#{tagname}>"
+      end
+      html += "</body></html>"
+      html
+    end
+  end
+  Vore::Crawler.prepend(Vore::TestHelper)
+end

data/lib/vore/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Vore
-  VERSION = "0.3.0"
+  VERSION = "0.5.0"
 end

metadata CHANGED Viewed

@@ -1,15 +1,30 @@
 --- !ruby/object:Gem::Specification
 name: vore
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.5.0
 platform: x86_64-linux
 authors:
 - Garen J. Torikian
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-07-18 00:00:00.000000000 Z
+date: 2024-07-29 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: listen
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.9'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.9'
+  force_ruby_platform: false
 - !ruby/object:Gem::Dependency
   name: selma
   requirement: !ruby/object:Gem::Requirement
@@ -45,8 +60,10 @@ files:
 - lib/vore.rb
 - lib/vore/configuration.rb
 - lib/vore/crawler.rb
-- lib/vore/handlers/content_extractor.rb
+- lib/vore/handlers/meta_extractor.rb
+- lib/vore/handlers/tag_remover.rb
 - lib/vore/logger.rb
+- lib/vore/minitest_helper.rb
 - lib/vore/page.rb
 - lib/vore/page_data.rb
 - lib/vore/version.rb