vore 0.4.0-x86_64-windows → 0.5.0-x86_64-windows
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +24 -10
- data/README.md +23 -0
- data/exe/vore-spider.exe +0 -0
- data/lib/vore/configuration.rb +1 -0
- data/lib/vore/crawler.rb +25 -15
- data/lib/vore/handlers/{content_extractor.rb → meta_extractor.rb} +3 -11
- data/lib/vore/handlers/tag_remover.rb +46 -0
- data/lib/vore/version.rb +1 -1
- metadata +4 -3
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: a1fdd47818712d05d40947298e12b0c1e1ee8b3ab4c5e5cfcb5e7b2d16782f24
         | 
| 4 | 
            +
              data.tar.gz: 18a2c3e58ccaf8ce9f9ddc5fe4f243f78bf24dd3feb0603b41d2c5ab25bbba63
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 18e98ac29ca02eb7e6c56146d634c8e730f82186e54c4e5162bd0b5f2a0b475c7230eaed5a7d08e7413672366b35a1b4b993626235ec7a21a34926a5671ce4bc
         | 
| 7 | 
            +
              data.tar.gz: a3c16f5d9a7fa5d641faea6ebc6f74781762a0f7a8e3a3c90e527beb44f8763e114a086c4de80eb0da278bc7ea1f3ffd203ebf6123d650c7328f1602c482fbc8
         | 
    
        data/Cargo.lock
    CHANGED
    
    | @@ -238,7 +238,7 @@ version = "0.2.3" | |
| 238 238 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 239 239 | 
             
            checksum = "fc229be27b394115abdc89e09500d5030407734d21a143a833eae5f136821bcd"
         | 
| 240 240 | 
             
            dependencies = [
         | 
| 241 | 
            -
             "compact_str",
         | 
| 241 | 
            +
             "compact_str 0.7.1",
         | 
| 242 242 | 
             
             "serde",
         | 
| 243 243 | 
             
            ]
         | 
| 244 244 |  | 
| @@ -346,6 +346,20 @@ dependencies = [ | |
| 346 346 | 
             
             "static_assertions",
         | 
| 347 347 | 
             
            ]
         | 
| 348 348 |  | 
| 349 | 
            +
            [[package]]
         | 
| 350 | 
            +
            name = "compact_str"
         | 
| 351 | 
            +
            version = "0.8.0"
         | 
| 352 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 353 | 
            +
            checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644"
         | 
| 354 | 
            +
            dependencies = [
         | 
| 355 | 
            +
             "castaway",
         | 
| 356 | 
            +
             "cfg-if",
         | 
| 357 | 
            +
             "itoa",
         | 
| 358 | 
            +
             "rustversion",
         | 
| 359 | 
            +
             "ryu",
         | 
| 360 | 
            +
             "static_assertions",
         | 
| 361 | 
            +
            ]
         | 
| 362 | 
            +
             | 
| 349 363 | 
             
            [[package]]
         | 
| 350 364 | 
             
            name = "cookie"
         | 
| 351 365 | 
             
            version = "0.18.1"
         | 
| @@ -1418,18 +1432,18 @@ dependencies = [ | |
| 1418 1432 |  | 
| 1419 1433 | 
             
            [[package]]
         | 
| 1420 1434 | 
             
            name = "rb-sys"
         | 
| 1421 | 
            -
            version = "0.9. | 
| 1435 | 
            +
            version = "0.9.99"
         | 
| 1422 1436 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 1423 | 
            -
            checksum = " | 
| 1437 | 
            +
            checksum = "d83151cfea2b67db2444f68c53b119ff77cff235ad711c765072e4daf8f3185b"
         | 
| 1424 1438 | 
             
            dependencies = [
         | 
| 1425 1439 | 
             
             "rb-sys-build",
         | 
| 1426 1440 | 
             
            ]
         | 
| 1427 1441 |  | 
| 1428 1442 | 
             
            [[package]]
         | 
| 1429 1443 | 
             
            name = "rb-sys-build"
         | 
| 1430 | 
            -
            version = "0.9. | 
| 1444 | 
            +
            version = "0.9.99"
         | 
| 1431 1445 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 1432 | 
            -
            checksum = " | 
| 1446 | 
            +
            checksum = "32d038214c118ad4a75db555ccb78672e17e1c5c10f344456cd129008dbaa7de"
         | 
| 1433 1447 | 
             
            dependencies = [
         | 
| 1434 1448 | 
             
             "bindgen",
         | 
| 1435 1449 | 
             
             "lazy_static",
         | 
| @@ -1786,14 +1800,14 @@ dependencies = [ | |
| 1786 1800 |  | 
| 1787 1801 | 
             
            [[package]]
         | 
| 1788 1802 | 
             
            name = "spider"
         | 
| 1789 | 
            -
            version = "1.99. | 
| 1803 | 
            +
            version = "1.99.11"
         | 
| 1790 1804 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 1791 | 
            -
            checksum = " | 
| 1805 | 
            +
            checksum = "e23ad22d5e55b09f480f849b37dd2fe315e3cf1df0f5261209aa5482483c617f"
         | 
| 1792 1806 | 
             
            dependencies = [
         | 
| 1793 1807 | 
             
             "ahash",
         | 
| 1794 1808 | 
             
             "bytes",
         | 
| 1795 1809 | 
             
             "case_insensitive_string",
         | 
| 1796 | 
            -
             "compact_str",
         | 
| 1810 | 
            +
             "compact_str 0.8.0",
         | 
| 1797 1811 | 
             
             "cssparser",
         | 
| 1798 1812 | 
             
             "ego-tree",
         | 
| 1799 1813 | 
             
             "fast_html5ever",
         | 
| @@ -1817,9 +1831,9 @@ dependencies = [ | |
| 1817 1831 |  | 
| 1818 1832 | 
             
            [[package]]
         | 
| 1819 1833 | 
             
            name = "spider_cli"
         | 
| 1820 | 
            -
            version = "1.99. | 
| 1834 | 
            +
            version = "1.99.11"
         | 
| 1821 1835 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 1822 | 
            -
            checksum = " | 
| 1836 | 
            +
            checksum = "be5da7d570871156c08025bdc13de670807d36a90be94f8aa8342a04e5268662"
         | 
| 1823 1837 | 
             
            dependencies = [
         | 
| 1824 1838 | 
             
             "clap",
         | 
| 1825 1839 | 
             
             "env_logger",
         | 
    
        data/README.md
    CHANGED
    
    | @@ -42,6 +42,29 @@ The scraping is managed by [`spider-rs`](https://github.com/spider-rs/spider), s | |
| 42 42 | 
             
            | `delete_after_yield`          | Whether the downloaded HTML files are deleted after the yield block finishes. | `true` |
         | 
| 43 43 | 
             
            | `log_level`                   | The logging level. | `:warn` |
         | 
| 44 44 |  | 
| 45 | 
            +
            ### Processing pages
         | 
| 46 | 
            +
             | 
| 47 | 
            +
            Vore processes HTML using handlers. By default, there are two:
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            * The `MetaExtractor`, which extracts information from your `title` and `meta` tags
         | 
| 50 | 
            +
            * The `TagRemover`, which removes unnecessary elements like `header`, `footer`, `script`
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            If you wish to process the HTML further, you can provide your own handler:
         | 
| 53 | 
            +
             | 
| 54 | 
            +
             | 
| 55 | 
            +
            ```ruby
         | 
| 56 | 
            +
            Vore::Crawler.new(handlers: [MySpecialHandler.new])
         | 
| 57 | 
            +
            ```
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            Handlers are defined using [Selma](https://github.com/gjtorikian/selma?tab=readme-ov-file#defining-handlers). Note that the `MetaExtractor` is always included and defined first, but if you pass anything in the `handlers` array, it will replace Vore's other default handlers. You can of course choose to include them manually:
         | 
| 60 | 
            +
             | 
| 61 | 
            +
             | 
| 62 | 
            +
            ```ruby
         | 
| 63 | 
            +
            # preserve Vore's default content handler while adding your own;
         | 
| 64 | 
            +
            # `MetaExtractor` is always prepended to the list
         | 
| 65 | 
            +
            Vore::Crawler.new(handlers: [Vore::Handlers::TagRemover.new, MySpecialHandler.new])
         | 
| 66 | 
            +
            ```
         | 
| 67 | 
            +
             | 
| 45 68 | 
             
            ### In tests
         | 
| 46 69 |  | 
| 47 70 | 
             
            Since the actual HTTP calls occur in a separate process, Vore will not integrate with libraries like VCR or Webmock by default. You'll need to `require "vore/minitest_helper"` to get a function that emulates the HTTP `GET` requests in a way Ruby can interpret.
         | 
    
        data/exe/vore-spider.exe
    CHANGED
    
    | Binary file | 
    
        data/lib/vore/configuration.rb
    CHANGED
    
    
    
        data/lib/vore/crawler.rb
    CHANGED
    
    | @@ -1,6 +1,7 @@ | |
| 1 1 | 
             
            # frozen_string_literal: true
         | 
| 2 2 |  | 
| 3 | 
            -
            require_relative "handlers/ | 
| 3 | 
            +
            require_relative "handlers/meta_extractor"
         | 
| 4 | 
            +
            require_relative "handlers/tag_remover"
         | 
| 4 5 |  | 
| 5 6 | 
             
            require "listen"
         | 
| 6 7 |  | 
| @@ -10,13 +11,20 @@ module Vore | |
| 10 11 | 
             
                PLATFORM = [:cpu, :os].map { |m| Gem::Platform.local.send(m) }.join("-")
         | 
| 11 12 | 
             
                FILE_SEPERATOR = PLATFORM.include?("windows") ? File::ALT_SEPARATOR : File::SEPARATOR
         | 
| 12 13 |  | 
| 13 | 
            -
                attr_reader :output_dir
         | 
| 14 | 
            +
                attr_reader :handlers, :output_dir
         | 
| 14 15 |  | 
| 15 16 | 
             
                # Creates a crawler
         | 
| 16 17 | 
             
                # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
         | 
| 17 | 
            -
                def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, options: {})
         | 
| 18 | 
            -
                  @ | 
| 19 | 
            -
             | 
| 18 | 
            +
                def initialize(sanitization_config: Vore::Configuration::DEFAULT_SANITIZATION_CONFIG, handlers: nil, options: {})
         | 
| 19 | 
            +
                  @meta_extractor = Vore::Handlers::MetaExtractor.new
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  @handlers = if handlers.nil?
         | 
| 22 | 
            +
                    [@meta_extractor, Vore::Handlers::TagRemover.new]
         | 
| 23 | 
            +
                  else
         | 
| 24 | 
            +
                    handlers.unshift(@meta_extractor)
         | 
| 25 | 
            +
                  end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                  @selma = Selma::Rewriter.new(sanitizer: Selma::Sanitizer.new(sanitization_config), handlers: @handlers)
         | 
| 20 28 | 
             
                  ext = PLATFORM.include?("windows") ? ".exe" : ""
         | 
| 21 29 | 
             
                  @executable = File.expand_path([__FILE__, "..", "..", "..", "exe", "vore-spider#{ext}"].join(FILE_SEPERATOR))
         | 
| 22 30 | 
             
                  @options = Vore::Configuration::DEFAULT_OPTIONS.merge(options)
         | 
| @@ -70,34 +78,36 @@ module Vore | |
| 70 78 | 
             
                  @results[:pages_visited] += 1
         | 
| 71 79 |  | 
| 72 80 | 
             
                  html_file = File.read(path).force_encoding("UTF-8")
         | 
| 73 | 
            -
                  rewritten_html_file = ""
         | 
| 74 81 |  | 
| 75 82 | 
             
                  if html_file.empty?
         | 
| 76 83 | 
             
                    @results[:unprocessed_pages] << path
         | 
| 77 84 | 
             
                    return
         | 
| 78 85 | 
             
                  end
         | 
| 79 86 |  | 
| 80 | 
            -
                   | 
| 81 | 
            -
             | 
| 82 | 
            -
                  rescue StandardError => e
         | 
| 83 | 
            -
                    Vore.logger.warn("Error rewriting #{path}: #{e}")
         | 
| 84 | 
            -
                    @results[:unprocessed_pages] << path
         | 
| 85 | 
            -
                    return
         | 
| 86 | 
            -
                  end
         | 
| 87 | 
            +
                  rewritten_html_file = @selma.rewrite(html_file)
         | 
| 88 | 
            +
                  return if rewritten_html_file.empty?
         | 
| 87 89 |  | 
| 88 90 | 
             
                  # drops the first 3 parts of the path, which are "tmp", "vore", and the site name
         | 
| 89 91 | 
             
                  url_path = path.split(FILE_SEPERATOR)[(@parent_output_dir_len + 1)..].join("/")
         | 
| 90 92 |  | 
| 91 93 | 
             
                  page = Vore::PageData.new(
         | 
| 92 94 | 
             
                    content: rewritten_html_file,
         | 
| 93 | 
            -
                    title: @ | 
| 94 | 
            -
                    meta: @ | 
| 95 | 
            +
                    title: @meta_extractor.title,
         | 
| 96 | 
            +
                    meta: @meta_extractor.meta,
         | 
| 95 97 | 
             
                    path: url_path,
         | 
| 96 98 | 
             
                  )
         | 
| 97 99 |  | 
| 98 100 | 
             
                  yield page
         | 
| 99 101 | 
             
                end
         | 
| 100 102 |  | 
| 103 | 
            +
                def rewrite(html_file)
         | 
| 104 | 
            +
                  @selma.rewrite(html_file)
         | 
| 105 | 
            +
                rescue StandardError => e
         | 
| 106 | 
            +
                  Vore.logger.warn("Error rewriting #{path}: #{e}")
         | 
| 107 | 
            +
                  @results[:unprocessed_pages] << path
         | 
| 108 | 
            +
                  ""
         | 
| 109 | 
            +
                end
         | 
| 110 | 
            +
             | 
| 101 111 | 
             
                def run_command(website, delay: 0)
         | 
| 102 112 | 
             
                  pid = Process.spawn(
         | 
| 103 113 | 
             
                    @executable,
         | 
| @@ -2,7 +2,7 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            module Vore
         | 
| 4 4 | 
             
              module Handlers
         | 
| 5 | 
            -
                class  | 
| 5 | 
            +
                class MetaExtractor
         | 
| 6 6 | 
             
                  SELECTOR = Selma::Selector.new(match_element: "*", match_text_within: "title")
         | 
| 7 7 |  | 
| 8 8 | 
             
                  attr_reader :title, :meta
         | 
| @@ -19,22 +19,14 @@ module Vore | |
| 19 19 | 
             
                  end
         | 
| 20 20 |  | 
| 21 21 | 
             
                  def handle_element(element)
         | 
| 22 | 
            -
                    if element.tag_name == " | 
| 23 | 
            -
                        element.tag_name == "form" ||
         | 
| 24 | 
            -
                        element.tag_name == "style" ||
         | 
| 25 | 
            -
                        element.tag_name == "noscript" ||
         | 
| 26 | 
            -
                        element.tag_name == "script" ||
         | 
| 27 | 
            -
                        element.tag_name == "svg"
         | 
| 28 | 
            -
                      element.remove
         | 
| 29 | 
            -
                    elsif element.tag_name == "title"
         | 
| 22 | 
            +
                    if element.tag_name == "title"
         | 
| 30 23 | 
             
                      @within_title = true
         | 
| 24 | 
            +
             | 
| 31 25 | 
             
                      element.remove
         | 
| 32 26 | 
             
                    elsif element.tag_name == "meta"
         | 
| 33 27 | 
             
                      return if element.attributes["name"].nil?
         | 
| 34 28 |  | 
| 35 29 | 
             
                      @meta[element.attributes["name"]] = element.attributes["content"]
         | 
| 36 | 
            -
                    else
         | 
| 37 | 
            -
                      element.remove_and_keep_content
         | 
| 38 30 | 
             
                    end
         | 
| 39 31 | 
             
                  end
         | 
| 40 32 |  | 
| @@ -0,0 +1,46 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Vore
         | 
| 4 | 
            +
              module Handlers
         | 
| 5 | 
            +
                class TagRemover
         | 
| 6 | 
            +
                  SELECTOR = Selma::Selector.new(match_element: "*")
         | 
| 7 | 
            +
             | 
| 8 | 
            +
                  def selector
         | 
| 9 | 
            +
                    SELECTOR
         | 
| 10 | 
            +
                  end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                  UNNECESSARY_TAGS = [
         | 
| 13 | 
            +
                    # Remove code elements
         | 
| 14 | 
            +
                    "pre",
         | 
| 15 | 
            +
             | 
| 16 | 
            +
                    # Remove unnecessary elements
         | 
| 17 | 
            +
                    "head",
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                    "form",
         | 
| 20 | 
            +
                    "style",
         | 
| 21 | 
            +
                    "noscript",
         | 
| 22 | 
            +
                    "script",
         | 
| 23 | 
            +
                    "svg",
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                    # Remove unnecessary nav elements
         | 
| 26 | 
            +
                    "header",
         | 
| 27 | 
            +
                    "footer",
         | 
| 28 | 
            +
                    "nav",
         | 
| 29 | 
            +
                    "aside",
         | 
| 30 | 
            +
                  ]
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  CONTENT_TO_KEEP = [
         | 
| 33 | 
            +
                    "html",
         | 
| 34 | 
            +
                    "body",
         | 
| 35 | 
            +
                  ]
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                  def handle_element(element)
         | 
| 38 | 
            +
                    if UNNECESSARY_TAGS.include?(element.tag_name)
         | 
| 39 | 
            +
                      element.remove
         | 
| 40 | 
            +
                    elsif CONTENT_TO_KEEP.include?(element.tag_name)
         | 
| 41 | 
            +
                      element.remove_and_keep_content
         | 
| 42 | 
            +
                    end
         | 
| 43 | 
            +
                  end
         | 
| 44 | 
            +
                end
         | 
| 45 | 
            +
              end
         | 
| 46 | 
            +
            end
         | 
    
        data/lib/vore/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: vore
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.5.0
         | 
| 5 5 | 
             
            platform: x86_64-windows
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Garen J. Torikian
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2024-07- | 
| 11 | 
            +
            date: 2024-07-29 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: listen
         | 
| @@ -60,7 +60,8 @@ files: | |
| 60 60 | 
             
            - lib/vore.rb
         | 
| 61 61 | 
             
            - lib/vore/configuration.rb
         | 
| 62 62 | 
             
            - lib/vore/crawler.rb
         | 
| 63 | 
            -
            - lib/vore/handlers/ | 
| 63 | 
            +
            - lib/vore/handlers/meta_extractor.rb
         | 
| 64 | 
            +
            - lib/vore/handlers/tag_remover.rb
         | 
| 64 65 | 
             
            - lib/vore/logger.rb
         | 
| 65 66 | 
             
            - lib/vore/minitest_helper.rb
         | 
| 66 67 | 
             
            - lib/vore/page.rb
         |