RubyGems - scrapetor - Versions diffs - 0.2.0 - Mend

scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +242 -0
data/LICENSE +21 -0
data/README.md +440 -0
data/bin/scrapetor +190 -0
data/bin/scrapetor-bench +5 -0
data/ext/scrapetor/README.md +53 -0
data/ext/scrapetor/native/extconf.rb +67 -0
data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
data/ext/scrapetor/native/scrapetor_http.c +2591 -0
data/ext/scrapetor/native/scrapetor_native.c +1156 -0
data/lib/scrapetor/builder.rb +158 -0
data/lib/scrapetor/cleaner.rb +10 -0
data/lib/scrapetor/comment_node.rb +67 -0
data/lib/scrapetor/document.rb +457 -0
data/lib/scrapetor/dom/parser.rb +69 -0
data/lib/scrapetor/dom/selectors.rb +208 -0
data/lib/scrapetor/dom.rb +563 -0
data/lib/scrapetor/encoding.rb +85 -0
data/lib/scrapetor/entities.rb +90 -0
data/lib/scrapetor/errors.rb +12 -0
data/lib/scrapetor/extractor.rb +147 -0
data/lib/scrapetor/fetcher.rb +390 -0
data/lib/scrapetor/fingerprint.rb +29 -0
data/lib/scrapetor/form.rb +141 -0
data/lib/scrapetor/http.rb +114 -0
data/lib/scrapetor/microdata.rb +132 -0
data/lib/scrapetor/money.rb +30 -0
data/lib/scrapetor/native.rb +291 -0
data/lib/scrapetor/native_dom.rb +2258 -0
data/lib/scrapetor/node.rb +539 -0
data/lib/scrapetor/node_set.rb +301 -0
data/lib/scrapetor/page_type.rb +95 -0
data/lib/scrapetor/pagination.rb +109 -0
data/lib/scrapetor/persistent_cache.rb +130 -0
data/lib/scrapetor/robots.rb +159 -0
data/lib/scrapetor/sax.rb +285 -0
data/lib/scrapetor/schema.rb +144 -0
data/lib/scrapetor/selector.rb +576 -0
data/lib/scrapetor/session.rb +141 -0
data/lib/scrapetor/sitemap.rb +52 -0
data/lib/scrapetor/stream.rb +111 -0
data/lib/scrapetor/structured_data.rb +74 -0
data/lib/scrapetor/template_registry.rb +24 -0
data/lib/scrapetor/text_node.rb +101 -0
data/lib/scrapetor/url.rb +21 -0
data/lib/scrapetor/version.rb +5 -0
data/lib/scrapetor/xpath.rb +1603 -0
data/lib/scrapetor.rb +167 -0
data/scrapetor.gemspec +77 -0
metadata +200 -0

data/lib/scrapetor/form.rb ADDED Viewed

@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require "uri"
+module Scrapetor
+  # HTML form helper. Pulls fields + default values out of a `<form>`
+  # element, lets the caller override or add values, and submits via
+  # the right method/action.
+  #
+  #   doc  = Scrapetor::Fetcher.fetch("https://example.com/login")
+  #   form = Scrapetor::Form.new(doc.at_css("form#login"),
+  #                              base_url: "https://example.com/login")
+  #   form["username"] = "alice"
+  #   form["password"] = "secret"
+  #   resp = form.submit                  # uses Scrapetor::Fetcher
+  #
+  # Captures every named control's default value (incl. <select> /
+  # <input type=hidden|checkbox|radio> / <textarea>); pre-loaded
+  # fields like CSRF tokens carry forward automatically. Buttons are
+  # NOT included unless explicitly set — the caller decides which
+  # submit button "fired".
+  class Form
+    attr_reader :action, :method, :enctype, :fields
+    def initialize(form_node, base_url: nil, http: nil)
+      raise ArgumentError, "form_node is required" if form_node.nil?
+      @form    = form_node
+      @base    = base_url
+      @http    = http
+      @method  = (form_node["method"] || form_node[:method] || "GET").upcase
+      @enctype = (form_node["enctype"] || form_node[:enctype] || "application/x-www-form-urlencoded").downcase
+      raw_action = form_node["action"] || form_node[:action] || ""
+      @action = if raw_action.empty?
+                  base_url
+                elsif base_url
+                  begin
+                    URI.join(base_url, raw_action).to_s
+                  rescue URI::InvalidURIError
+                    raw_action
+                  end
+                else
+                  raw_action
+                end
+      @fields = capture_defaults(form_node)
+    end
+    def [](name);            @fields[name.to_s]; end
+    def []=(name, value);    @fields[name.to_s] = value.to_s; end
+    def delete(name);        @fields.delete(name.to_s); end
+    def merge!(hash);        hash.each { |k, v| self[k] = v }; self; end
+    # Returns the params Hash that would be submitted, with all the
+    # captured defaults plus user overrides. Useful for inspection
+    # before #submit fires the request.
+    def to_h
+      @fields.dup
+    end
+    def submit(extra: {}, **fetcher_opts)
+      params = @fields.merge(extra.transform_keys(&:to_s))
+      client = @http || Scrapetor::Fetcher
+      case @method
+      when "GET"
+        url = append_query(@action, params)
+        client.get(url, **fetcher_opts)
+      when "POST"
+        if @enctype.include?("multipart")
+          client.post(@action, multipart: params, **fetcher_opts)
+        else
+          client.post(@action, form: params, **fetcher_opts)
+        end
+      else
+        # PUT/PATCH/DELETE via form are non-standard but supported.
+        verb = @method.downcase.to_sym
+        client.send(verb, @action,
+                    body: URI.encode_www_form(params),
+                    **fetcher_opts.merge(
+                      headers: (fetcher_opts[:headers] || {}).merge(
+                        "Content-Type" => "application/x-www-form-urlencoded"
+                      )
+                    ))
+      end
+    end
+    private
+    def capture_defaults(form)
+      out = {}
+      # <input>
+      form.css("input").each do |inp|
+        name = (inp["name"] || inp[:name])&.to_s
+        next if name.nil? || name.empty?
+        type = (inp["type"] || inp[:type] || "text").to_s.downcase
+        case type
+        when "submit", "button", "image", "reset", "file"
+          # Skip — submit buttons are caller-driven; file inputs
+          # need explicit Fetcher.upload_file via :extra.
+          next
+        when "checkbox", "radio"
+          # Default-checked controls contribute their value; others
+          # don't. Falls back to "on" per HTML spec.
+          if inp["checked"] || inp[:checked]
+            out[name] = (inp["value"] || inp[:value] || "on").to_s
+          end
+        else
+          out[name] = (inp["value"] || inp[:value] || "").to_s
+        end
+      end
+      # <select>
+      form.css("select").each do |sel|
+        name = (sel["name"] || sel[:name])&.to_s
+        next if name.nil? || name.empty?
+        # First check for an option marked selected; fall back to
+        # the first option (HTML semantics for single-select).
+        selected = sel.css("option").find { |o| o["selected"] || o[:selected] }
+        selected ||= sel.at_css("option")
+        out[name] = selected ? (selected["value"] || selected[:value] || selected.text).to_s : ""
+      end
+      # <textarea>
+      form.css("textarea").each do |t|
+        name = (t["name"] || t[:name])&.to_s
+        next if name.nil? || name.empty?
+        out[name] = t.text.to_s
+      end
+      out
+    end
+    def append_query(url, params)
+      return url if params.empty?
+      uri = URI(url)
+      existing = uri.query ? URI.decode_www_form(uri.query) : []
+      override_names = params.keys.to_set
+      existing.reject! { |k, _| override_names.include?(k) }
+      merged = existing + params.to_a
+      uri.query = URI.encode_www_form(merged)
+      uri.to_s
+    end
+  end
+end
+require "set"

data/lib/scrapetor/http.rb ADDED Viewed

@@ -0,0 +1,114 @@
+# frozen_string_literal: true
+require "net/http"
+require "uri"
+module Scrapetor
+  # Convenience HTTP fetcher built on `Net::HTTP` (Ruby stdlib — no
+  # external runtime dep).
+  #
+  #   doc = Scrapetor.fetch("https://example.com/products")
+  #   doc.css(".product").map { |p| p.at(".title").text }
+  #
+  # Handles 3xx redirects, sets a sensible User-Agent, applies the
+  # response's encoding to the parsed document, and uses the request URL
+  # as `base_url` for absolute-URL helpers.
+  #
+  # For production scraping you'll usually want a real HTTP client
+  # (HTTPX, Typhoeus, Faraday) with connection pooling, retries, and
+  # cookie storage. `Scrapetor.fetch` is intentionally minimal — it's
+  # here so simple scripts and the CLI don't need extra deps.
+  module HTTP
+    DEFAULT_HEADERS = {
+      "User-Agent"      => "Scrapetor/#{Scrapetor::VERSION} (+https://scrapetor.org)",
+      "Accept"          => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+      "Accept-Language" => "en-US,en;q=0.5",
+      "Accept-Encoding" => "identity"
+    }.freeze
+    MAX_REDIRECTS = 5
+    class FetchError < Scrapetor::Error; end
+    class TooManyRedirects < FetchError; end
+    def self.get(url, headers: {}, follow_redirects: true, max_redirects: MAX_REDIRECTS, open_timeout: 10, read_timeout: 30)
+      uri = URI(url.to_s)
+      raise FetchError, "unsupported scheme: #{uri.scheme.inspect}" unless %w[http https].include?(uri.scheme)
+      hops = 0
+      loop do
+        req = Net::HTTP::Get.new(uri.request_uri)
+        DEFAULT_HEADERS.each { |k, v| req[k] = v }
+        headers.each { |k, v| req[k.to_s] = v.to_s }
+        net = Net::HTTP.new(uri.host, uri.port)
+        net.use_ssl     = (uri.scheme == "https")
+        net.open_timeout = open_timeout
+        net.read_timeout = read_timeout
+        resp = net.start { |h| h.request(req) }
+        case resp
+        when Net::HTTPSuccess
+          return Response.new(resp, uri)
+        when Net::HTTPRedirection
+          raise TooManyRedirects, "exceeded #{max_redirects} redirects" if hops >= max_redirects
+          raise FetchError, "redirect with no Location header" unless resp["location"]
+          uri = URI.join(uri.to_s, resp["location"])
+          hops += 1
+          next if follow_redirects
+          return Response.new(resp, uri)
+        else
+          raise FetchError, "HTTP #{resp.code} #{resp.message} for #{uri}"
+        end
+      end
+    end
+    # Fetch + parse + return a `Scrapetor::Document` whose `base_url` is
+    # the final URL after redirects.
+    def self.fetch(url, **opts)
+      resp = get(url, **opts)
+      Scrapetor.parse(resp.body, base_url: resp.final_url.to_s)
+    end
+    # Fetch + extract.
+    def self.fetch_extract(url, schema, **opts)
+      resp = get(url, **opts)
+      Scrapetor.parse(resp.body, base_url: resp.final_url.to_s).extract(schema)
+    end
+    class Response
+      attr_reader :net_response, :final_url
+      def initialize(net_response, final_url)
+        @net_response = net_response
+        @final_url    = final_url
+      end
+      def body
+        @net_response.body
+      end
+      def status
+        @net_response.code.to_i
+      end
+      def headers
+        @net_response.to_hash
+      end
+      def [](header_name)
+        @net_response[header_name]
+      end
+    end
+  end
+  # Module-level shortcut. Most users only want this.
+  # Forwards `url` and every keyword option unchanged to HTTP.fetch
+  # and returns its result.
+  def self.fetch(url, **opts)
+    HTTP.fetch(url, **opts)
+  end
+  # Module-level shortcut for HTTP.fetch_extract: forwards `url`,
+  # `schema`, and every keyword option unchanged and returns its result.
+  def self.fetch_extract(url, schema, **opts)
+    HTTP.fetch_extract(url, schema, **opts)
+  end
+end

data/lib/scrapetor/microdata.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+module Scrapetor
+  # Microdata extractor (HTML5 itemscope / itemprop / itemtype).
+  #
+  # Walks the DOM looking for itemscope elements and emits a nested
+  # hash structure of items + properties. The format mirrors what
+  # https://schema.org/docs/datamodel.html describes:
+  #
+  #   {
+  #     "type"       => "https://schema.org/Product",  # from itemtype
+  #     "id"         => "...",                          # from itemid
+  #     "properties" => {
+  #       "name"  => "Widget",
+  #       "price" => "19.99",
+  #       "offer" => { "type" => "https://schema.org/Offer", ... }
+  #     }
+  #   }
+  module Microdata
+    def self.extract(doc)
+      items = []
+      doc.css("[itemscope]").each do |node|
+        # Skip nested items — they'll be reached via the parent's properties.
+        next if has_itemscope_ancestor?(node)
+        items << build_item(node)
+      end
+      items
+    end
+    def self.has_itemscope_ancestor?(node)
+      ancestor = node.parent
+      while ancestor
+        return true if ancestor.respond_to?(:[]) && ancestor["itemscope"]
+        ancestor = ancestor.respond_to?(:parent) ? ancestor.parent : nil
+      end
+      false
+    end
+    def self.build_item(node)
+      item = {}
+      item["type"] = node["itemtype"] if node["itemtype"]
+      item["id"]   = node["itemid"]   if node["itemid"]
+      props = {}
+      gather_props(node, props)
+      item["properties"] = props
+      item
+    end
+    def self.gather_props(scope, props)
+      scope.css("[itemprop]").each do |el|
+        # Only direct descendants in microdata terms: an itemprop on a
+        # descendant of a nested itemscope belongs to the nested item.
+        next if descendant_of_nested_itemscope?(el, scope)
+        names = (el["itemprop"] || "").split(/\s+/).reject(&:empty?)
+        next if names.empty?
+        value = property_value(el)
+        names.each do |n|
+          if props.key?(n)
+            props[n] = [props[n]] unless props[n].is_a?(Array)
+            props[n] << value
+          else
+            props[n] = value
+          end
+        end
+      end
+    end
+    def self.descendant_of_nested_itemscope?(el, scope)
+      cur = el.parent
+      while cur && cur != scope
+        return true if cur.respond_to?(:[]) && cur["itemscope"]
+        cur = cur.respond_to?(:parent) ? cur.parent : nil
+      end
+      false
+    end
+    def self.property_value(el)
+      if el["itemscope"]
+        return build_item(el)
+      end
+      tag = el.respond_to?(:name) ? el.name.to_s.downcase : ""
+      case tag
+      when "meta"                  then el["content"]
+      when "audio", "embed", "iframe", "img", "source", "track", "video"
+        el["src"]
+      when "a", "area", "link"     then el["href"]
+      when "object"                then el["data"]
+      when "data"                  then el["value"] || el.text
+      when "meter"                 then el["value"] || el.text
+      when "time"                  then el["datetime"] || el.text
+      else
+        text = el.text.to_s
+        text.gsub(/\s+/, " ").strip
+      end
+    end
+  end
+  # RDFa extractor — minimal implementation covering the typical
+  # subset used on the web (property, content, datatype, typeof).
+  module RDFa
+    def self.extract(doc)
+      out = []
+      doc.css("[typeof]").each do |node|
+        item = {
+          "type"       => node["typeof"],
+          "about"      => node["about"] || node["resource"],
+          "properties" => collect_props(node)
+        }
+        out << item
+      end
+      out
+    end
+    def self.collect_props(scope)
+      props = {}
+      scope.css("[property]").each do |el|
+        names = (el["property"] || "").split(/\s+/).reject(&:empty?)
+        value = el["content"] || el.text.to_s.strip
+        names.each do |n|
+          if props.key?(n)
+            props[n] = [props[n]] unless props[n].is_a?(Array)
+            props[n] << value
+          else
+            props[n] = value
+          end
+        end
+      end
+      props
+    end
+  end
+end

data/lib/scrapetor/money.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+module Scrapetor
+  module Money
+    NUMERIC = /-?\d[\d.,]*/.freeze
+    THOUSAND_GROUPED_COMMAS = /\A-?\d{1,3}(?:,\d{3})+\z/.freeze
+    THOUSAND_GROUPED_DOTS = /\A-?\d{1,3}(?:\.\d{3})+\z/.freeze
+    def self.parse(s)
+      return nil if s.nil?
+      m = s.to_s.match(NUMERIC)
+      return nil unless m
+      num = m[0]
+      dots = num.count(".")
+      commas = num.count(",")
+      if dots > 0 && commas > 0
+        if num.rindex(".") > num.rindex(",")
+          num = num.delete(",")
+        else
+          num = num.delete(".").tr(",", ".")
+        end
+      elsif commas > 0
+        num = THOUSAND_GROUPED_COMMAS.match?(num) ? num.delete(",") : num.tr(",", ".")
+      elsif dots > 1
+        num = THOUSAND_GROUPED_DOTS.match?(num) ? num.delete(".") : num
+      end
+      num.to_f
+    end
+  end
+end