RubyGems - scrapetor - Versions diffs - 0.2.0 - Mend

scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +242 -0
data/LICENSE +21 -0
data/README.md +440 -0
data/bin/scrapetor +190 -0
data/bin/scrapetor-bench +5 -0
data/ext/scrapetor/README.md +53 -0
data/ext/scrapetor/native/extconf.rb +67 -0
data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
data/ext/scrapetor/native/scrapetor_http.c +2591 -0
data/ext/scrapetor/native/scrapetor_native.c +1156 -0
data/lib/scrapetor/builder.rb +158 -0
data/lib/scrapetor/cleaner.rb +10 -0
data/lib/scrapetor/comment_node.rb +67 -0
data/lib/scrapetor/document.rb +457 -0
data/lib/scrapetor/dom/parser.rb +69 -0
data/lib/scrapetor/dom/selectors.rb +208 -0
data/lib/scrapetor/dom.rb +563 -0
data/lib/scrapetor/encoding.rb +85 -0
data/lib/scrapetor/entities.rb +90 -0
data/lib/scrapetor/errors.rb +12 -0
data/lib/scrapetor/extractor.rb +147 -0
data/lib/scrapetor/fetcher.rb +390 -0
data/lib/scrapetor/fingerprint.rb +29 -0
data/lib/scrapetor/form.rb +141 -0
data/lib/scrapetor/http.rb +114 -0
data/lib/scrapetor/microdata.rb +132 -0
data/lib/scrapetor/money.rb +30 -0
data/lib/scrapetor/native.rb +291 -0
data/lib/scrapetor/native_dom.rb +2258 -0
data/lib/scrapetor/node.rb +539 -0
data/lib/scrapetor/node_set.rb +301 -0
data/lib/scrapetor/page_type.rb +95 -0
data/lib/scrapetor/pagination.rb +109 -0
data/lib/scrapetor/persistent_cache.rb +130 -0
data/lib/scrapetor/robots.rb +159 -0
data/lib/scrapetor/sax.rb +285 -0
data/lib/scrapetor/schema.rb +144 -0
data/lib/scrapetor/selector.rb +576 -0
data/lib/scrapetor/session.rb +141 -0
data/lib/scrapetor/sitemap.rb +52 -0
data/lib/scrapetor/stream.rb +111 -0
data/lib/scrapetor/structured_data.rb +74 -0
data/lib/scrapetor/template_registry.rb +24 -0
data/lib/scrapetor/text_node.rb +101 -0
data/lib/scrapetor/url.rb +21 -0
data/lib/scrapetor/version.rb +5 -0
data/lib/scrapetor/xpath.rb +1603 -0
data/lib/scrapetor.rb +167 -0
data/scrapetor.gemspec +77 -0
metadata +200 -0

data/lib/scrapetor/node_set.rb ADDED Viewed

@@ -0,0 +1,301 @@
+# frozen_string_literal: true
+module Scrapetor
+  class NodeSet
+    include Enumerable
+    def initialize(doc, backing_nodes)
+      @doc = doc
+      # `defined?` guard so this works when the native extension isn't
+      # loaded (e.g. install-time build failure, or the gem is required
+      # before its C extension is in place). Without the guard a plain
+      # NodeSet construction raises NameError on missing constant —
+      # which is the v0.1.x crash a production audit run surfaced.
+      if defined?(Scrapetor::Native::DocumentWrapper::LazyIds) &&
+         backing_nodes.is_a?(Scrapetor::Native::DocumentWrapper::LazyIds)
+        @lazy_ids = backing_nodes
+        @nodes    = nil
+      else
+        @nodes = backing_nodes
+      end
+    end
+    def each
+      return enum_for(:each) unless block_given?
+      if @lazy_ids
+        wrap = @lazy_ids.wrapper
+        native = @lazy_ids.native
+        @lazy_ids.ids.each do |id|
+          yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrap))
+        end
+      else
+        @nodes.each { |n| yield Node.new(@doc, n) }
+      end
+    end
+    def first
+      if @lazy_ids
+        id = @lazy_ids.ids.first
+        return nil unless id
+        Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
+      else
+        n = @nodes.first
+        n && Node.new(@doc, n)
+      end
+    end
+    def last
+      if @lazy_ids
+        id = @lazy_ids.ids.last
+        return nil unless id
+        Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
+      else
+        n = @nodes.last
+        n && Node.new(@doc, n)
+      end
+    end
+    def [](index, length = nil)
+      if length
+        slice = backing_nodes[index, length]
+        return self.class.new(@doc, slice || [])
+      end
+      if index.is_a?(Range)
+        slice = backing_nodes[index]
+        return self.class.new(@doc, slice || [])
+      end
+      if @lazy_ids
+        id = @lazy_ids.ids[index]
+        return nil unless id
+        Node.new(@doc, Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper))
+      else
+        n = @nodes[index]
+        n && Node.new(@doc, n)
+      end
+    end
+    alias slice []
+    def size
+      @lazy_ids ? @lazy_ids.ids.size : @nodes.size
+    end
+    alias length size
+    alias count size
+    def empty?
+      @lazy_ids ? @lazy_ids.ids.empty? : @nodes.empty?
+    end
+    def map
+      return enum_for(:map) unless block_given?
+      if @lazy_ids
+        wrap = @lazy_ids.wrapper
+        native = @lazy_ids.native
+        @lazy_ids.ids.map { |id| yield Node.new(@doc, Scrapetor::Native::Element.new(native, id, wrap)) }
+      else
+        @nodes.map { |n| yield Node.new(@doc, n) }
+      end
+    end
+    def text
+      backing_nodes.map(&:text).join
+    end
+    alias inner_text text
+    alias content    text
+    def at(selector)
+      first&.at(selector)
+    end
+    alias at_css at
+    def css(selector)
+      # Determine up front whether the selector ends in a `::text` /
+      # `::attr` pseudo-element. Inferring from the result shape (was
+      # the previous approach) misclassifies zero-match queries as
+      # string-shaped and breaks `.at_css` chained off an empty NodeSet.
+      pe = selector.to_s
+      string_result = pe.include?("::") &&
+                      pe =~ /::(?:text|attr\([^)]+\)|first-letter|first-line|before|after)\s*\z/i
+      collected = []
+      backing_nodes.each do |n|
+        next unless n.respond_to?(:css)
+        result = n.css(selector)
+        result = result.to_a if result.respond_to?(:to_a)
+        result.each { |hit| collected << hit }
+      end
+      return collected if string_result
+      NodeSet.new(@doc, collected)
+    end
+    alias search css
+    # Aggregate of children across all nodes in the set. Mirrors
+    # Nokogiri's NodeSet#children — every child of every node, including
+    # text and comment nodes, flattened into a single NodeSet. Pulls
+    # children straight from the backing element (rather than going
+    # through Node#children, which filters to elements only) so callers
+    # that iterate mixed-content can still see the text segments.
+    def children
+      collected = []
+      backing_nodes.each do |bk|
+        next unless bk.respond_to?(:children)
+        kids = bk.children
+        kids = kids.to_a if kids.respond_to?(:to_a)
+        kids.each { |c| collected << c }
+      end
+      NodeSet.new(@doc, collected)
+    end
+    def to_html
+      backing_nodes.map { |n| n.respond_to?(:to_html) ? n.to_html : n.to_s }.join
+    end
+    alias inner_html to_html
+    alias to_s to_html
+    def attr(name)
+      first&.attr(name)
+    end
+    alias attribute attr
+    def reverse
+      self.class.new(@doc, backing_nodes.reverse)
+    end
+    def +(other)
+      other_nodes = other.respond_to?(:backing_nodes) ? other.backing_nodes : Array(other)
+      self.class.new(@doc, backing_nodes + other_nodes)
+    end
+    def to_a
+      map { |n| n }
+    end
+    # Implicit conversion target — without this, `Array#+` /
+    # `Array#concat` / splat (`*nodeset`) all raise
+    # `TypeError: no implicit conversion of Scrapetor::NodeSet into Array`
+    # because Ruby's coercion path looks for to_ary, not to_a.
+    alias to_ary to_a
+    def backing_nodes
+      return materialize if @lazy_ids
+      @nodes
+    end
+    # Force the lazy-ids path to allocate its Element wrappers. Used by
+    # operations that need the original backing nodes (set algebra,
+    # +/-/&, removal).
+    def materialize
+      return @nodes unless @lazy_ids
+      @nodes = @lazy_ids.ids.map { |id| Scrapetor::Native::Element.new(@lazy_ids.native, id, @lazy_ids.wrapper) }
+      @lazy_ids = nil
+      @nodes
+    end
+    # ----- Bulk mutation passthroughs -----
+    #
+    # Nokogiri NodeSet exposes a handful of bulk operations that map onto
+    # iterating the underlying nodes. We keep parity so callers can do
+    # `doc.css('br').remove` etc. without crashing.
+    def remove
+      # Two-phase. First promote every backing node to its Dom
+      # equivalent (so path-based lookup happens against the still-
+      # intact tree); then remove. A naive "iterate + remove" works on
+      # a mutable Dom but invalidates the position-index paths the
+      # Native::Element fallback relies on after the first deletion.
+      resolved = backing_nodes.map do |n|
+        if n.respond_to?(:promote_to_dom!)
+          n.promote_to_dom!
+        else
+          n
+        end
+      end
+      resolved.each do |target|
+        if target.respond_to?(:remove)
+          target.remove
+        else
+          Node.new(@doc, target).remove
+        end
+      end
+      self
+    end
+    alias unlink remove
+    def each_with_index
+      return enum_for(:each_with_index) unless block_given?
+      backing_nodes.each_with_index { |n, i| yield Node.new(@doc, n), i }
+    end
+    def select
+      return enum_for(:select) unless block_given?
+      kept = []
+      backing_nodes.each do |n|
+        wrapped = Node.new(@doc, n)
+        kept << n if yield(wrapped)
+      end
+      self.class.new(@doc, kept)
+    end
+    alias filter select
+    def reject
+      return enum_for(:reject) unless block_given?
+      kept = []
+      backing_nodes.each do |n|
+        wrapped = Node.new(@doc, n)
+        kept << n unless yield(wrapped)
+      end
+      self.class.new(@doc, kept)
+    end
+    def find_all
+      return enum_for(:find_all) unless block_given?
+      select { |n| yield(n) }
+    end
+    def push(node)
+      materialize
+      @nodes << (node.is_a?(Node) ? node.backing_node : node)
+      self
+    end
+    alias << push
+    def pop
+      materialize
+      n = @nodes.pop
+      n && Node.new(@doc, n)
+    end
+    def shift
+      materialize
+      n = @nodes.shift
+      n && Node.new(@doc, n)
+    end
+    def index(node)
+      target = node.is_a?(Node) ? node.backing_node : node
+      backing_nodes.index(target)
+    end
+    def include?(node)
+      target = node.is_a?(Node) ? node.backing_node : node
+      backing_nodes.include?(target)
+    end
+    def -(other)
+      drop = other.respond_to?(:backing_nodes) ? other.backing_nodes : Array(other)
+      self.class.new(@doc, backing_nodes - drop)
+    end
+    def &(other)
+      keep = other.respond_to?(:backing_nodes) ? other.backing_nodes : Array(other)
+      self.class.new(@doc, backing_nodes & keep)
+    end
+    # Map every node through the `extract(fields)` extraction. Lets
+    # the standard SERP-result pattern collapse to:
+    #
+    #   doc.css(".result").extract(title: ".t", price: ".p")
+    #   # => [{title: ..., price: ...}, ...]
+    def extract(fields)
+      map { |n| n.extract(fields) }
+    end
+  end
+end

data/lib/scrapetor/page_type.rb ADDED Viewed

@@ -0,0 +1,95 @@
+# frozen_string_literal: true
+module Scrapetor
+  # Heuristic page-type detection.
+  #
+  # Returns one of:
+  #   :product_page, :product_listing, :article, :search_results,
+  #   :forum_thread, :profile, :documentation, :unknown
+  #
+  # The heuristic prefers strong signals (JSON-LD @type, OpenGraph
+  # og:type) and falls back to structural heuristics (repeated card
+  # patterns, byline + body, search bar + result list).
+  module PageType
+    PRODUCT_OG_TYPES = %w[product product.item og:product].freeze
+    ARTICLE_OG_TYPES = %w[article news.article].freeze
+    PROFILE_OG_TYPES = %w[profile person og:profile].freeze
+    def self.detect(doc)
+      from_structured_data(doc) ||
+        from_opengraph(doc) ||
+        from_structure(doc) ||
+        :unknown
+    end
+    # ----- strong signals: JSON-LD -----
+    def self.from_structured_data(doc)
+      types = doc.json_ld.flat_map { |item| Array(item.is_a?(Hash) ? item["@type"] : nil) }.compact.map(&:to_s)
+      return nil if types.empty?
+      return :product_listing if types.include?("ItemList") &&
+                                 (types.include?("Product") || types.include?("Offer"))
+      return :product_page    if types.include?("Product")
+      return :article         if (types & %w[NewsArticle Article BlogPosting]).any?
+      return :search_results  if types.include?("SearchResultsPage")
+      return :profile         if (types & %w[Person ProfilePage]).any?
+      return :forum_thread    if types.include?("DiscussionForumPosting")
+      return :documentation   if types.include?("TechArticle")
+      nil
+    end
+    # ----- OpenGraph signals -----
+    def self.from_opengraph(doc)
+      og = doc.opengraph
+      t = (og["type"] || "").to_s.downcase
+      return :product_page if PRODUCT_OG_TYPES.any? { |x| t.include?(x) }
+      return :article      if ARTICLE_OG_TYPES.any? { |x| t.include?(x) }
+      return :profile      if PROFILE_OG_TYPES.any? { |x| t.include?(x) }
+      nil
+    end
+    # ----- structural fallback -----
+    def self.from_structure(doc)
+      # Search results: a search bar + a list of result items
+      if doc.css('input[type="search"], form[role="search"], [class*="search-result"]').any?
+        return :search_results
+      end
+      # Repeated cards = listing
+      grid_candidates = %w[
+        .product-card .product-tile .product-item .listing-item
+        [class*="product-grid"] [class*="card"] [class*="tile"]
+      ].flat_map { |sel| doc.css(sel).to_a }.uniq
+      return :product_listing if grid_candidates.size >= 6
+      # Article: <article> with a byline AND a long body
+      articles = doc.css("article")
+      if articles.any?
+        text = articles.first.text.to_s
+        word_count = text.scan(/\S+/).size
+        has_byline = doc.css(".byline, .author, [rel='author'], [itemprop='author']").any?
+        return :article if word_count >= 200 || has_byline
+      end
+      # Profile: avatar + name + bio
+      if doc.css('[class*="avatar"], [class*="profile-header"]').any? &&
+         doc.css('[class*="bio"], [class*="about"]').any?
+        return :profile
+      end
+      # Forum thread
+      if doc.css('.thread, .topic, [class*="post-message"]').size >= 2
+        return :forum_thread
+      end
+      # Documentation: code blocks + heading hierarchy
+      if doc.css("pre code").size >= 3 && doc.css("h1, h2, h3").size >= 3
+        return :documentation
+      end
+      nil
+    end
+  end
+end

data/lib/scrapetor/pagination.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+require "uri"
+module Scrapetor
+  # Pagination helper. Walks a page sequence by detecting the "next
+  # page" URL from the document — in priority order:
+  #
+  #   1. <link rel="next" href="..."> in <head>
+  #   2. a[rel~="next"] (most common pattern; HTML spec compliant)
+  #   3. The configured CSS selector via :next_link
+  #
+  # Stops when no next link is found, when max_pages is reached, or
+  # when the next URL hasn't changed (defensive against malformed
+  # next links pointing at self).
+  #
+  #   Scrapetor::Pagination.each_page("https://example.com/listings") do |doc, url|
+  #     doc.css(".product").each { |p| ... }
+  #   end
+  #
+  # Yields (doc, url) for each page in order. When :http is set to a
+  # Scrapetor::Fetcher / Session-like object, it's used for fetches;
+  # otherwise Scrapetor::Fetcher (HTTP/2 via libcurl) is used by
+  # default, with a Net::HTTP fallback if libcurl isn't available.
+  module Pagination
+    DEFAULT_MAX_PAGES = 50
+    DEFAULT_DELAY     = 0.0
+    def self.each_page(start_url, max_pages: DEFAULT_MAX_PAGES,
+                       delay: DEFAULT_DELAY, http: nil,
+                       next_link: nil)
+      return enum_for(:each_page, start_url,
+                       max_pages: max_pages, delay: delay,
+                       http: http, next_link: next_link) unless block_given?
+      url = start_url.to_s
+      visited = {}
+      page_no = 0
+      while url && page_no < max_pages
+        break if visited[url]
+        visited[url] = true
+        page_no += 1
+        doc = fetch_page(url, http)
+        yield doc, url
+        nxt = next_page_url(doc, url, next_link)
+        sleep delay if delay > 0 && nxt
+        url = nxt
+      end
+      nil
+    end
+    # Inspect a document and return the next page URL, or nil if
+    # this is the last page. Honours <link rel=next> > a[rel=next] >
+    # a custom selector via :next_link.
+    def self.next_page_url(doc, current_url, custom_selector = nil)
+      # 1. <link rel="next">
+      if (link = doc.at_css('link[rel~="next"]'))
+        href = link["href"] || link[:href]
+        return absolutize(href, current_url) if href && !href.empty?
+      end
+      # 2. a[rel~="next"]
+      doc.css('a[rel~="next"]').each do |a|
+        href = a["href"] || a[:href]
+        next unless href && !href.empty?
+        abs = absolutize(href, current_url)
+        return abs if abs && abs != current_url
+      end
+      # 3. Custom selector — first link element under the match.
+      if custom_selector
+        node = doc.at_css(custom_selector)
+        if node
+          # Walk up if user gave us a link target like ".next-link"
+          # already pointing at an <a>, or treat as the wrapper and
+          # grab the first <a> within.
+          link_node = node.respond_to?(:name) && node.name.casecmp?("a") ? node : node.at_css("a")
+          if link_node
+            href = link_node["href"] || link_node[:href]
+            return absolutize(href, current_url) if href && !href.empty?
+          end
+        end
+      end
+      nil
+    end
+    def self.fetch_page(url, http)
+      if http && http.respond_to?(:fetch)
+        http.fetch(url)
+      elsif defined?(Scrapetor::Fetcher) && Scrapetor::Fetcher.available?
+        Scrapetor::Fetcher.fetch(url)
+      else
+        Scrapetor.fetch(url)
+      end
+    end
+    def self.absolutize(href, base)
+      return nil if href.nil? || href.empty?
+      URI.join(base, href).to_s
+    rescue URI::InvalidURIError
+      nil
+    end
+  end
+  # Top-level shorthand: forwards start_url, all keyword options and the block to Pagination.each_page.
+  def self.each_page(start_url, **opts, &block)
+    Pagination.each_page(start_url, **opts, &block)
+  end
+end

data/lib/scrapetor/persistent_cache.rb ADDED Viewed

@@ -0,0 +1,130 @@
+# frozen_string_literal: true
+require "digest"
+require "fileutils"
+module Scrapetor
+  # Disk-backed parse cache. Persists the parsed arena (nodes blob,
+  # attrs blob, html bytes) to disk so subsequent process invocations
+  # restore the document via memcpy + index rebuild — the SAX
+  # tokeniser doesn't run on hit. Implementation is fully native:
+  # `Scrapetor::Native::Document#serialize_to_file` writes the binary
+  # arena; `Scrapetor::Native::Document.load_from_file` reads it back.
+  #
+  # Designed for:
+  #   - CI / test suites looping the same fixture HTML across boots
+  #   - Batch jobs that restart (cron, sidekiq workers)
+  #   - A/B parser comparisons over a corpus
+  #
+  # Storage layout: SCRAP_CACHE_DIR/<first-2-hex-chars-of-sha256>/<sha256>.arena
+  # Files are content-addressed so identical HTML inputs share one
+  # cache entry regardless of caller.
+  #
+  # Opt-in via SCRAP_PERSISTENT_CACHE=1 or Scrapetor::PersistentCache.enable!
+  # Override the cache root via SCRAP_CACHE_DIR (default
+  # ~/.cache/scrapetor/parse).
+  module PersistentCache
+    DEFAULT_DIR = File.expand_path("~/.cache/scrapetor/parse")
+    class << self
+      attr_accessor :dir
+      def enabled?
+        e = defined?(@enabled) ? @enabled : nil
+        return e unless e.nil?
+        ENV["SCRAP_PERSISTENT_CACHE"] == "1"
+      end
+      def enable!
+        @enabled = true
+        @dir   ||= ENV.fetch("SCRAP_CACHE_DIR", DEFAULT_DIR)
+        FileUtils.mkdir_p(@dir)
+        true
+      end
+      def disable!
+        @enabled = false
+      end
+      def directory
+        @dir ||= ENV.fetch("SCRAP_CACHE_DIR", DEFAULT_DIR)
+      end
+      # Load a cached parsed arena for the given HTML, or nil on miss.
+      # The return value is a Scrapetor::Native::Document ready to be
+      # wrapped by Scrapetor::Document.
+      def load(html)
+        return nil unless enabled?
+        return nil if html.nil? || html.empty?
+        key = key_for(html)
+        path = path_for(key)
+        return nil unless File.exist?(path)
+        native = Scrapetor::Native::Document.load_from_file(path)
+        native
+      rescue StandardError
+        File.delete(path) rescue nil
+        nil
+      end
+      # Persist a parsed arena to disk under its content fingerprint.
+      # Takes the Scrapetor::Native::Document handle (i.e.
+      # `doc.backing.native` for an unmutated document). Returns the
+      # cache key on success, nil on miss / disabled.
+      def store(html, native_doc)
+        return nil unless enabled?
+        return nil if html.nil? || html.empty?
+        return nil if native_doc.nil?
+        key = key_for(html)
+        path = path_for(key)
+        return key if File.exist?(path)
+        FileUtils.mkdir_p(File.dirname(path))
+        tmp = "#{path}.tmp.#{Process.pid}"
+        ok = native_doc.serialize_to_file(tmp)
+        unless ok
+          File.delete(tmp) rescue nil
+          return nil
+        end
+        File.rename(tmp, path)
+        key
+      end
+      # SHA-256 of the HTML — collisions effectively zero.
+      def key_for(html)
+        Digest::SHA256.hexdigest(html)
+      end
+      # Pre-warm the cache for a directory of fixtures.
+      def warm(paths_or_globs)
+        return 0 unless enabled?
+        n = 0
+        Array(paths_or_globs).each do |entry|
+          Dir.glob(entry).each do |path|
+            html = File.read(path)
+            doc = Scrapetor.parse(html)
+            store(html, doc.backing.native)
+            n += 1
+          end
+        end
+        n
+      end
+      def disk_usage
+        return 0 unless File.directory?(directory)
+        Dir.glob(File.join(directory, "*", "*.arena")).sum { |p| File.size(p) }
+      end
+      def clear!
+        return 0 unless File.directory?(directory)
+        Dir.glob(File.join(directory, "*", "*.arena")).each(&File.method(:delete)).size
+      end
+      private
+      def path_for(key)
+        File.join(directory, key[0, 2], "#{key}.arena")
+      end
+    end
+    enable! if enabled?
+  end
+end