RubyGems - scrapetor - Versions diffs - 0.2.0 - Mend

scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +242 -0
data/LICENSE +21 -0
data/README.md +440 -0
data/bin/scrapetor +190 -0
data/bin/scrapetor-bench +5 -0
data/ext/scrapetor/README.md +53 -0
data/ext/scrapetor/native/extconf.rb +67 -0
data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
data/ext/scrapetor/native/scrapetor_http.c +2591 -0
data/ext/scrapetor/native/scrapetor_native.c +1156 -0
data/lib/scrapetor/builder.rb +158 -0
data/lib/scrapetor/cleaner.rb +10 -0
data/lib/scrapetor/comment_node.rb +67 -0
data/lib/scrapetor/document.rb +457 -0
data/lib/scrapetor/dom/parser.rb +69 -0
data/lib/scrapetor/dom/selectors.rb +208 -0
data/lib/scrapetor/dom.rb +563 -0
data/lib/scrapetor/encoding.rb +85 -0
data/lib/scrapetor/entities.rb +90 -0
data/lib/scrapetor/errors.rb +12 -0
data/lib/scrapetor/extractor.rb +147 -0
data/lib/scrapetor/fetcher.rb +390 -0
data/lib/scrapetor/fingerprint.rb +29 -0
data/lib/scrapetor/form.rb +141 -0
data/lib/scrapetor/http.rb +114 -0
data/lib/scrapetor/microdata.rb +132 -0
data/lib/scrapetor/money.rb +30 -0
data/lib/scrapetor/native.rb +291 -0
data/lib/scrapetor/native_dom.rb +2258 -0
data/lib/scrapetor/node.rb +539 -0
data/lib/scrapetor/node_set.rb +301 -0
data/lib/scrapetor/page_type.rb +95 -0
data/lib/scrapetor/pagination.rb +109 -0
data/lib/scrapetor/persistent_cache.rb +130 -0
data/lib/scrapetor/robots.rb +159 -0
data/lib/scrapetor/sax.rb +285 -0
data/lib/scrapetor/schema.rb +144 -0
data/lib/scrapetor/selector.rb +576 -0
data/lib/scrapetor/session.rb +141 -0
data/lib/scrapetor/sitemap.rb +52 -0
data/lib/scrapetor/stream.rb +111 -0
data/lib/scrapetor/structured_data.rb +74 -0
data/lib/scrapetor/template_registry.rb +24 -0
data/lib/scrapetor/text_node.rb +101 -0
data/lib/scrapetor/url.rb +21 -0
data/lib/scrapetor/version.rb +5 -0
data/lib/scrapetor/xpath.rb +1603 -0
data/lib/scrapetor.rb +167 -0
data/scrapetor.gemspec +77 -0
metadata +200 -0

data/lib/scrapetor/session.rb ADDED Viewed

@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require "tempfile"
+require "uri"
+require "thread"
+module Scrapetor
+  # Stateful HTTP session. Wraps Scrapetor::Fetcher with:
+  #   - persistent cookie jar (libcurl COOKIEJAR/COOKIEFILE)
+  #   - default headers merged into every request
+  #   - basic / bearer auth applied automatically
+  #   - per-host rate limiting (polite throttle)
+  #   - default retry/backoff
+  #   - auto charset transcoding of HTML bodies to UTF-8
+  #
+  #   session = Scrapetor::Session.new(
+  #     cookies:     true,          # ephemeral tempfile jar
+  #     user_agent:  "MyBot/1.0",
+  #     rate_limit:  0.5,           # min seconds between same-host requests
+  #     retry:       3,
+  #     headers:     { "Accept-Language" => "en-US" },
+  #   )
+  #   doc = session.fetch("https://example.com/login")
+  #   session.post("https://example.com/login", form: { user: "x", pass: "y" })
+  #   doc = session.fetch("https://example.com/dashboard")
+  #
+  # Cookies set during the login persist for the dashboard call.
+  class Session
+    DEFAULT_HEADERS = {
+      "Accept"          => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+      "Accept-Language" => "en-US,en;q=0.5",
+    }.freeze
+    attr_reader :cookie_jar_path
+    def initialize(cookies: true,
+                   user_agent: nil,
+                   headers: {},
+                   basic_auth: nil,
+                   bearer_token: nil,
+                   proxy: nil,
+                   ca_path: nil,
+                   rate_limit: nil,
+                   retry: 0,
+                   backoff: 0.3,
+                   max_backoff: 10.0,
+                   timeout_ms: 30_000,
+                   follow_redirects: true,
+                   insecure: false,
+                   transcode_charset: true)
+      Scrapetor::Fetcher.ensure_available!
+      @cookie_jar_path =
+        case cookies
+        when String then cookies
+        when true   then ephemeral_jar_path
+        when false, nil then nil
+        else raise ArgumentError, "cookies: must be String/true/false"
+        end
+      @defaults = {
+        user_agent: user_agent || Scrapetor::Fetcher::DEFAULT_USER_AGENT,
+        headers: DEFAULT_HEADERS.merge(headers),
+        basic_auth: basic_auth,
+        bearer_token: bearer_token,
+        proxy: proxy,
+        ca_path: ca_path,
+        retry: binding.local_variable_get(:retry),
+        backoff: backoff,
+        max_backoff: max_backoff,
+        timeout_ms: timeout_ms,
+        follow_redirects: follow_redirects,
+        insecure: insecure,
+      }.compact
+      @defaults[:transcode_utf8] = transcode_charset
+      @defaults[:rate_limit_ms] = (rate_limit * 1000).to_i if rate_limit
+    end
+    %w[get post put patch delete head].each do |verb|
+      define_method(verb) do |url, **opts|
+        merged = merge_opts(opts)
+        Scrapetor::Fetcher.public_send(verb, url, **merged)
+      end
+    end
+    # GET + parse to a Document.
+    def fetch(url, **opts)
+      resp = get(url, **opts)
+      raise Scrapetor::Fetcher::FetchError.new(
+        "Session.fetch #{url} -> HTTP #{resp[:status]}",
+        status: resp[:status], response: resp
+      ) if resp[:status] < 200 || resp[:status] >= 400
+      Scrapetor.parse(resp[:body], base_url: resp[:final_url])
+    end
+    # parallel_get respects the session's defaults (cookies, headers,
+    # auth, per-host rate limit). The native batch honours
+    # rate_limit_ms per-host via a shared C-side throttle table, so N
+    # parallel workers hitting one host all queue at that gate while
+    # different hosts run concurrently.
+    def parallel_get(urls, **opts)
+      merged = merge_opts(opts)
+      Scrapetor::Fetcher.parallel_get(urls, **merged)
+    end
+    def close
+      File.delete(@cookie_jar_path) if @cookie_jar_path && File.exist?(@cookie_jar_path) && @ephemeral
+    rescue StandardError
+      # tempfile may have already been GC'd; ignore
+    end
+    private
+    def ephemeral_jar_path
+      @ephemeral = true
+      f = Tempfile.new(["scrapetor_jar", ".txt"])
+      f.close
+      path = f.path
+      ObjectSpace.define_finalizer(self, self.class.send(:make_jar_finalizer, path))
+      path
+    end
+    def self.make_jar_finalizer(path)
+      proc { File.delete(path) if File.exist?(path) rescue nil }
+    end
+    def merge_opts(opts)
+      m = @defaults.merge(opts) do |_, ours, theirs|
+        if ours.is_a?(Hash) && theirs.is_a?(Hash)
+          ours.merge(theirs)
+        else
+          theirs.nil? ? ours : theirs
+        end
+      end
+      if @cookie_jar_path
+        m[:cookiejar]  ||= @cookie_jar_path
+        m[:cookiefile] ||= @cookie_jar_path
+      end
+      m
+    end
+  end
+end

data/lib/scrapetor/sitemap.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+require "stringio"
+module Scrapetor
+  # Sitemap.xml ingestion. Handles both <urlset> (URL listings) and
+  # <sitemapindex> (nested sitemap references), streaming so a huge
+  # sitemap doesn't have to fit in memory at once.
+  #
+  #   Scrapetor::Sitemap.urls("https://example.com/sitemap.xml") do |url, meta|
+  #     puts url, meta[:lastmod], meta[:priority]
+  #   end
+  #
+  # Or, return an array:
+  #
+  #   Scrapetor::Sitemap.urls("https://example.com/sitemap.xml").to_a
+  module Sitemap
+    # Stream-iterate every URL in the sitemap. Recurses into
+    # <sitemapindex> entries automatically. Yields (url, meta) where
+    # meta carries :lastmod / :changefreq / :priority when present.
+    def self.urls(source, depth: 0, max_depth: 5, &block)
+      return enum_for(:urls, source, depth: depth, max_depth: max_depth) unless block
+      raise ArgumentError, "sitemap recursion too deep" if depth > max_depth
+      io = open_source(source)
+      Scrapetor.stream(io, outer: "url") do |doc|
+        loc = doc.at_css("loc")&.text&.strip
+        next unless loc && !loc.empty?
+        meta = {
+          lastmod:    doc.at_css("lastmod")&.text&.strip,
+          changefreq: doc.at_css("changefreq")&.text&.strip,
+          priority:   doc.at_css("priority")&.text&.strip,
+        }
+        yield loc, meta
+      end
+      # If the file was a sitemapindex instead, the <url> stream above
+      # found nothing. Re-open and scan for <sitemap><loc>.
+      child_io = open_source(source)
+      Scrapetor.stream(child_io, outer: "sitemap") do |doc|
+        child_loc = doc.at_css("loc")&.text&.strip
+        next unless child_loc
+        urls(child_loc, depth: depth + 1, max_depth: max_depth, &block)
+      end
+    end
+    def self.open_source(source)
+      return source if source.respond_to?(:read)
+      return StringIO.new(source) if source.is_a?(String) && !source.start_with?("http")
+      resp = Scrapetor::Fetcher.get(source.to_s)
+      StringIO.new(resp[:body])
+    end
+  end
+end

data/lib/scrapetor/stream.rb ADDED Viewed

@@ -0,0 +1,111 @@
+# frozen_string_literal: true
+require "stringio"
+module Scrapetor
+  # Streaming parser. Reads HTML incrementally from an IO and yields one
+  # complete row at a time. Peak memory stays bounded to roughly
+  # max(read_chunk, longest_row_in_bytes) regardless of total document
+  # size, so multi-gigabyte fixtures, paginated dumps, and slow socket
+  # feeds work without buffering the whole thing.
+  #
+  # The "row" boundary is byte-scanned in C — no DOM is built for the
+  # outer-document context. Once a row is found, its HTML slice is
+  # parsed as a fragment through the standard native path so all the
+  # normal Document / Element / extract APIs are available.
+  #
+  #   Scrapetor.stream(io, outer: "div.result") do |doc|
+  #     puts doc.at_css(".title")&.text
+  #   end
+  #
+  # With a schema, each row is run through the native extractor and
+  # yielded as a Hash:
+  #
+  #   Scrapetor.stream(io, outer: "li.product", fields: {
+  #     title: ".title::text",
+  #     price: ".price::text",
+  #   }) do |row|
+  #     puts row[:title]
+  #   end
+  #
+  # The outer pattern accepts:
+  #   - "tag"          (any element of that name)
+  #   - "tag.class"    (element with that class token)
+  #   - "tag#id"       (element with that id; may be combined with classes)
+  # A bare ".class" is not supported; provide a tag for byte scanning.
+  class Stream
+    DEFAULT_CHUNK = 64 * 1024
+    def initialize(io, outer:, fields: nil, chunk_size: DEFAULT_CHUNK)
+      tag, id, classes = self.class.parse_outer(outer)
+      @native = Scrapetor::Native::Stream.new(tag, id, classes)
+      @io = io
+      @fields = fields
+      @chunk_size = chunk_size
+    end
+    def each
+      return enum_for(:each) unless block_given?
+      loop do
+        # Pull every row currently available in the buffer.
+        while (row_html = @native.next_row)
+          yield materialise(row_html)
+        end
+        break if @native.done?
+        chunk = @io.read(@chunk_size)
+        if chunk.nil? || chunk.empty?
+          @native.set_eof
+          # Final drain after EOF — buffer may still have buffered rows.
+          while (row_html = @native.next_row)
+            yield materialise(row_html)
+          end
+          break
+        else
+          @native.feed(chunk)
+        end
+      end
+      self
+    end
+    # Accepts:
+    #   "tag"                 -> [tag, nil, []]
+    #   "tag.class"           -> [tag, nil, ["class"]]
+    #   "tag.cls1.cls2"       -> [tag, nil, ["cls1", "cls2"]]
+    #   "tag#id"              -> [tag, "id", []]
+    #   "tag#id.cls1"         -> [tag, "id", ["cls1"]]
+    #   "tag.cls#id"          -> [tag, "id", ["cls"]]   (any order after tag)
+    def self.parse_outer(outer)
+      m = outer.match(/\A([a-zA-Z][\w-]*)((?:[.#][\w-]+)*)\z/)
+      raise ArgumentError,
+            "Scrapetor.stream outer must be 'tag', 'tag.class', 'tag#id', " \
+            "or 'tag#id.cls1.cls2' (got #{outer.inspect})" unless m
+      tag = m[1]
+      tail = m[2]
+      id = nil
+      classes = []
+      tail.scan(/([.#])([\w-]+)/).each do |sigil, name|
+        if sigil == "#"
+          raise ArgumentError,
+                "Scrapetor.stream outer: only one #id is supported (got #{outer.inspect})" if id
+          id = name
+        else
+          classes << name
+        end
+      end
+      [tag, id, classes]
+    end
+    private
+    def materialise(row_html)
+      doc = Scrapetor.parse(row_html)
+      return doc unless @fields
+      root = doc.css("*").first || doc
+      root.extract(@fields)
+    end
+  end
+  def self.stream(source, outer:, fields: nil, chunk_size: Stream::DEFAULT_CHUNK, &block)
+    io = source.respond_to?(:read) ? source : StringIO.new(source)
+    Stream.new(io, outer: outer, fields: fields, chunk_size: chunk_size).each(&block)
+  end
+end

data/lib/scrapetor/structured_data.rb ADDED Viewed

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+require "json"
+module Scrapetor
+  # Extract structured-data signals every SEO/RAG pipeline needs:
+  # JSON-LD, OpenGraph, Twitter Cards, Schema.org microdata.
+  #
+  # These are deterministic and fast — no DOM walk beyond `doc.css(...)`
+  # which is delegated to the backing tokenizer.
+  module StructuredData
+    JSON_LD_SELECTOR = 'script[type="application/ld+json"]'.freeze
+    def self.json_ld(doc)
+      out = []
+      doc.css(JSON_LD_SELECTOR).each do |script|
+        body = script.text
+        next if body.nil? || body.strip.empty?
+        begin
+          parsed = JSON.parse(body)
+        rescue JSON::ParserError
+          next
+        end
+        if parsed.is_a?(Array)
+          out.concat(parsed)
+        elsif parsed.is_a?(Hash) && parsed["@graph"].is_a?(Array)
+          out.concat(parsed["@graph"])
+        else
+          out << parsed
+        end
+      end
+      out
+    end
+    def self.opengraph(doc)
+      collect_meta(doc, prefix: "og:")
+    end
+    def self.twitter_card(doc)
+      collect_meta(doc, prefix: "twitter:")
+    end
+    def self.schema_org(doc, type: nil)
+      list = json_ld(doc)
+      return list if type.nil?
+      target = type.to_s
+      list.select do |item|
+        next false unless item.is_a?(Hash)
+        t = item["@type"]
+        case t
+        when String then t == target
+        when Array  then t.include?(target)
+        else false
+        end
+      end
+    end
+    def self.collect_meta(doc, prefix:)
+      h = {}
+      doc.css("meta").each do |meta|
+        # OpenGraph uses `property=`; Twitter Cards use `name=`. Some sites
+        # do both. Check both.
+        key = meta.attr("property") || meta.attr("name")
+        next if key.nil?
+        next unless key.start_with?(prefix)
+        val = meta.attr("content")
+        next if val.nil?
+        short_key = key[prefix.length..]
+        h[short_key] = val if !h.key?(short_key)
+      end
+      h
+    end
+  end
+end

data/lib/scrapetor/template_registry.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+module Scrapetor
+  # Phase 1: in-process registry mapping structural fingerprints to
+  # compiled extraction plans (Schema instances). Phase 8 (per plan.md)
+  # promotes this to an mmap-backed cross-process store.
+  class TemplateRegistry
+    def initialize
+      @plans = {}
+    end
+    def store(fingerprint, plan)
+      @plans[fingerprint] = plan
+    end
+    def fetch(fingerprint)
+      @plans[fingerprint]
+    end
+    def size
+      @plans.size
+    end
+  end
+end

data/lib/scrapetor/text_node.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+module Scrapetor
+  # Result type for `::text` and `::attr(name)` pseudo-element queries.
+  #
+  # Scrapy / Parsel-style code expects strings directly from these
+  # selectors (`doc.css("h3::text").get`), but Nokogiri-style scrapers
+  # routinely chain a `.text` / `.content` accessor onto each result
+  # (`doc.css("h3::text").first.text` or `node.at("a::attr(href)").text`).
+  # Returning a bare String breaks the Nokogiri-style call path with
+  # NoMethodError, even though the String already _is_ the text we
+  # would have returned.
+  #
+  # TextNode is a thin String subclass that closes the gap: it equals,
+  # compares, splits, and concatenates exactly like a String, and adds
+  # the Node-shaped accessors (`text`, `content`, `inner_text`, `name`,
+  # `element?`, `text?`) plus the Parsel-shaped `get` / `getall`. The
+  # underlying byte string is the actual text content; the extra methods
+  # all return self (or trivial derivatives), so chaining stays cheap.
+  class TextNode < String
+    def text;       String.new(self); end
+    alias inner_text text
+    alias content    text
+    # Parsel-style accessors.
+    def get;        String.new(self); end
+    def getall;     [String.new(self)]; end
+    # Node-shape predicates so duck-typing checks (`n.element?`,
+    # `n.text?`, `n.name == "#text"`) don't blow up.
+    def name;       "#text"; end
+    def element?;   false; end
+    def text?;      true; end
+    def comment?;   false; end
+    def document?;  false; end
+    def cdata?;     false; end
+    def to_html;    self.to_s; end
+    alias outer_html to_html
+    alias inner_html to_html
+    # No-op mutation API. Heterogeneous selectors like
+    # `.foo > ::text, .bar` can hand a TextNode to a caller that
+    # assumes an Element interface (e.g.
+    # `node.inner_html = node.inner_html.gsub(...)`). The reassignment
+    # would crash on bare String; we accept the write silently so the
+    # subsequent `.text` read still works. The mutation is intentionally
+    # dropped — TextNode wraps frozen content of the original element.
+    def inner_html=(_v); _v; end
+    def content=(_v);    _v; end
+    def []=(*_args);     nil; end
+    def add_class(_k);    self; end
+    def remove_class(*_); self; end
+    def remove;           self; end
+    def unlink;           self; end
+    # Containing element (the node whose text/attribute this TextNode
+    # represents). Set by the css() boundary when we know the parent;
+    # left nil otherwise. Production code chains
+    # `result.at(::text).parent.css(...)` to navigate to siblings of
+    # the text node, mirroring the Nokogiri shape where text nodes
+    # carry a `.parent` back-reference.
+    attr_accessor :parent_node
+    def parent;                 @parent_node; end
+    def next_sibling;           nil; end
+    def previous_sibling;       nil; end
+    def next_element_sibling;   nil; end
+    def previous_element_sibling; nil; end
+    def children;               []; end
+    def element_children;       []; end
+    def attributes;             {}; end
+    def attribute_nodes;        []; end
+    def attribute(_name);       nil; end
+    def keys;                   []; end
+    def values;                 []; end
+    def classes;                []; end
+    def has_class?(_klass);     false; end
+    def [](*args)
+      # String byte/range subscript when called with a single non-string
+      # argument; nil for attribute-style String access.
+      if args.size == 1 && args.first.is_a?(String)
+        nil
+      elsif args.size == 1 && args.first.is_a?(Symbol)
+        nil
+      else
+        super
+      end
+    end
+    def css(_selector);         []; end
+    def at_css(_selector);      nil; end
+    def at(_selector);          nil; end
+    def search(_selector);      []; end
+    def xpath(*_args);          []; end
+    def at_xpath(*_args);       nil; end
+    def inspect
+      "#<Scrapetor::TextNode #{super}>"
+    end
+  end
+end

data/lib/scrapetor/url.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+require "uri"
+module Scrapetor
+  module URL
+    ABSOLUTE = %r{\A[a-zA-Z][\w+.\-]*://}.freeze
+    def self.absolute(href, base = nil)
+      return nil if href.nil?
+      h = href.to_s
+      return h if h.match?(ABSOLUTE)
+      return h if base.nil?
+      begin
+        URI.join(base.to_s, h).to_s
+      rescue URI::InvalidURIError, ArgumentError
+        h
+      end
+    end
+  end
+end

data/lib/scrapetor/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+module Scrapetor
+  # Version string of the scrapetor gem.
+  VERSION = "0.2.0"
+end