RubyGems - scrapetor - Versions diffs - 0.2.0 - Mend

scrapetor 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +242 -0
data/LICENSE +21 -0
data/README.md +440 -0
data/bin/scrapetor +190 -0
data/bin/scrapetor-bench +5 -0
data/ext/scrapetor/README.md +53 -0
data/ext/scrapetor/native/extconf.rb +67 -0
data/ext/scrapetor/native/scrapetor_dom.c +6346 -0
data/ext/scrapetor/native/scrapetor_http.c +2591 -0
data/ext/scrapetor/native/scrapetor_native.c +1156 -0
data/lib/scrapetor/builder.rb +158 -0
data/lib/scrapetor/cleaner.rb +10 -0
data/lib/scrapetor/comment_node.rb +67 -0
data/lib/scrapetor/document.rb +457 -0
data/lib/scrapetor/dom/parser.rb +69 -0
data/lib/scrapetor/dom/selectors.rb +208 -0
data/lib/scrapetor/dom.rb +563 -0
data/lib/scrapetor/encoding.rb +85 -0
data/lib/scrapetor/entities.rb +90 -0
data/lib/scrapetor/errors.rb +12 -0
data/lib/scrapetor/extractor.rb +147 -0
data/lib/scrapetor/fetcher.rb +390 -0
data/lib/scrapetor/fingerprint.rb +29 -0
data/lib/scrapetor/form.rb +141 -0
data/lib/scrapetor/http.rb +114 -0
data/lib/scrapetor/microdata.rb +132 -0
data/lib/scrapetor/money.rb +30 -0
data/lib/scrapetor/native.rb +291 -0
data/lib/scrapetor/native_dom.rb +2258 -0
data/lib/scrapetor/node.rb +539 -0
data/lib/scrapetor/node_set.rb +301 -0
data/lib/scrapetor/page_type.rb +95 -0
data/lib/scrapetor/pagination.rb +109 -0
data/lib/scrapetor/persistent_cache.rb +130 -0
data/lib/scrapetor/robots.rb +159 -0
data/lib/scrapetor/sax.rb +285 -0
data/lib/scrapetor/schema.rb +144 -0
data/lib/scrapetor/selector.rb +576 -0
data/lib/scrapetor/session.rb +141 -0
data/lib/scrapetor/sitemap.rb +52 -0
data/lib/scrapetor/stream.rb +111 -0
data/lib/scrapetor/structured_data.rb +74 -0
data/lib/scrapetor/template_registry.rb +24 -0
data/lib/scrapetor/text_node.rb +101 -0
data/lib/scrapetor/url.rb +21 -0
data/lib/scrapetor/version.rb +5 -0
data/lib/scrapetor/xpath.rb +1603 -0
data/lib/scrapetor.rb +167 -0
data/scrapetor.gemspec +77 -0
metadata +200 -0

data/lib/scrapetor/robots.rb ADDED Viewed

@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+require "uri"
+module Scrapetor
+  # robots.txt parser + path-match decider.
+  #
+  #   r = Scrapetor::Robots.fetch_for("https://example.com")
+  #   r.allowed?("https://example.com/private")
+  #   r.crawl_delay
+  #   r.sitemaps
+  #
+  # Implements the de-facto Google / RFC 9309 longest-match semantics:
+  # the most-specific (longest pattern) Allow/Disallow rule wins.
+  # User-agent matching is case-insensitive prefix; '*' is the fallback.
+  class Robots
+    Rule = Struct.new(:type, :pattern) # type: :allow or :disallow
+    attr_reader :sitemaps
+    def initialize(body, user_agent: "*")
+      @ua = user_agent
+      @groups = {}      # ua_pattern (lowercased) => Array<Rule>
+      @delays = {}      # ua_pattern => Float
+      @sitemaps = []
+      parse!(body.to_s)
+    end
+    def allowed?(url)
+      s = url.to_s
+      path =
+        if s.start_with?("/")
+          s
+        else
+          uri = URI(s)
+          (uri.path.empty? ? "/" : uri.path) + (uri.query ? "?#{uri.query}" : "")
+        end
+      rules = applicable_rules
+      return true if rules.empty?
+      # Find the longest matching pattern (Google convention; RFC 9309
+      # also says the most specific match wins).
+      best = nil
+      rules.each do |r|
+        next unless path_matches?(path, r.pattern)
+        if best.nil? || r.pattern.length > best.pattern.length
+          best = r
+        end
+      end
+      best.nil? || best.type == :allow
+    end
+    def disallowed?(url)
+      !allowed?(url)
+    end
+    def crawl_delay
+      ua = ua_for(@ua)
+      @delays[ua] || @delays["*"]
+    end
+    def self.fetch_for(origin, user_agent: "*", **opts)
+      uri = URI(origin.to_s)
+      url = "#{uri.scheme}://#{uri.host}#{uri.port == uri.default_port ? "" : ":#{uri.port}"}/robots.txt"
+      resp = Scrapetor::Fetcher.get(url, raise_for_status: false, **opts)
+      body = resp[:status] == 200 ? resp[:body] : ""
+      new(body, user_agent: user_agent)
+    end
+    private
+    def applicable_rules
+      ua = ua_for(@ua)
+      @groups[ua] || @groups["*"] || []
+    end
+    # Pick the most-specific UA group whose name is a case-insensitive
+    # prefix of @ua, or '*' as fallback.
+    def ua_for(ua)
+      ua_lc = ua.to_s.downcase
+      best = nil
+      @groups.each_key do |key|
+        next if key == "*"
+        if ua_lc.start_with?(key) && (best.nil? || key.length > best.length)
+          best = key
+        end
+      end
+      best || "*"
+    end
+    # robots.txt allows '*' wildcards and '$' end-anchor inside patterns.
+    # Translate to regex once per call; for hot-path callers, cache.
+    def path_matches?(path, pattern)
+      regex = pattern_cache(pattern)
+      regex.match?(path)
+    end
+    def pattern_cache(pattern)
+      @pattern_cache ||= {}
+      @pattern_cache[pattern] ||= compile_pattern(pattern)
+    end
+    def compile_pattern(pattern)
+      buf = +"\\A"
+      i = 0
+      while i < pattern.length
+        ch = pattern[i]
+        if ch == "*"
+          buf << ".*"
+        elsif ch == "$" && i == pattern.length - 1
+          buf << "\\z"
+        else
+          buf << Regexp.escape(ch)
+        end
+        i += 1
+      end
+      Regexp.new(buf)
+    end
+    def parse!(body)
+      current_uas = []
+      buffer = []
+      flush = lambda do
+        current_uas.each { |u| (@groups[u] ||= []).concat(buffer) }
+        buffer = []
+      end
+      body.each_line do |line|
+        # Strip trailing newline before slicing off the inline comment.
+        # Using \z against an each_line chunk would leave the '#' run
+        # in place because '.' doesn't span newlines.
+        line = line.chomp.sub(/#.*\z/, "").strip
+        next if line.empty?
+        key, val = line.split(":", 2)
+        next unless val
+        key = key.strip.downcase
+        val = val.strip
+        case key
+        when "user-agent"
+          flush.call unless buffer.empty?
+          if current_uas.empty? || current_uas.last == val.downcase
+            current_uas << val.downcase
+          else
+            current_uas = [val.downcase]
+          end
+        when "disallow"
+          # An empty disallow means "allow all"; skip — empty pattern would match everything.
+          buffer << Rule.new(:disallow, val) unless val.empty?
+        when "allow"
+          buffer << Rule.new(:allow, val) unless val.empty?
+        when "crawl-delay"
+          d = val.to_f
+          current_uas.each { |u| @delays[u] = d } if d > 0
+        when "sitemap"
+          @sitemaps << val unless val.empty?
+        end
+      end
+      flush.call unless buffer.empty?
+    end
+  end
+end

data/lib/scrapetor/sax.rb ADDED Viewed

@@ -0,0 +1,285 @@
+# frozen_string_literal: true
+module Scrapetor
+  # Pure-Ruby SAX-style streaming HTML parser.
+  #
+  # The hot path for production extraction is the C streaming engine
+  # behind `doc.extract`. This module exists for the cases where you
+  # genuinely want token-by-token control — debugging, custom incremental
+  # processors, conversion to other formats.
+  #
+  # Usage:
+  #
+  #   class MyHandler < Scrapetor::SAX::Document
+  #     def start_element(name, attrs); puts "<#{name}>"; end
+  #     def end_element(name);          puts "</#{name}>"; end
+  #     def characters(text);            puts text; end
+  #     def comment(text);               puts "<!--#{text}-->"; end
+  #     def doctype(name);               puts "<!DOCTYPE #{name}>"; end
+  #   end
+  #
+  #   Scrapetor::SAX::Parser.new(MyHandler.new).parse(html)
+  module SAX
+    # Subclass to selectively override callbacks. All default to no-ops.
+    class Document
+      def start_document; end
+      def end_document;   end
+      def start_element(name, attrs); end
+      def end_element(name); end
+      def characters(text); end
+      def comment(text); end
+      def doctype(name); end
+      def cdata_block(text); end
+      def error(msg); end
+      def warning(msg); end
+    end
+    class Parser
+      def initialize(handler)
+        @handler = handler
+      end
+      def parse(html)
+        Tokenizer.new(html).each_event do |event|
+          type, *args = event
+          case type
+          when :doc_start   then @handler.start_document
+          when :doc_end     then @handler.end_document
+          when :start       then @handler.start_element(args[0], args[1])
+          when :end         then @handler.end_element(args[0])
+          when :text        then @handler.characters(args[0])
+          when :comment     then @handler.comment(args[0])
+          when :doctype     then @handler.doctype(args[0])
+          when :cdata       then @handler.cdata_block(args[0])
+          end
+        end
+        self
+      end
+      def parse_file(path)
+        parse(File.read(path))
+      end
+      def parse_io(io)
+        parse(io.read)
+      end
+    end
+    # Standalone tokenizer — yields events without going through a handler.
+    # Useful when you just want an enumerator:
+    #
+    #   Scrapetor::SAX::Tokenizer.new(html).each_event do |type, *args|
+    #     # ...
+    #   end
+    class Tokenizer
+      VOID = %w[
+        area base br col embed hr img input link meta source track wbr
+      ].freeze
+      RAW_TEXT = %w[script style].freeze
+      def initialize(html)
+        @html = Scrapetor::Encoding.to_utf8(html)
+        @pos  = 0
+        @len  = @html.bytesize
+      end
+      def each_event(&block)
+        return enum_for(:each_event) unless block_given?
+        block.call([:doc_start])
+        while @pos < @len
+          ch = byte(@pos)
+          if ch == 0x3C # '<'
+            handle_open(&block)
+          else
+            handle_text(&block)
+          end
+        end
+        block.call([:doc_end])
+        self
+      end
+      private
+      def byte(i)
+        @html.getbyte(i)
+      end
+      def slice(s, e)
+        @html.byteslice(s, e - s) || ""
+      end
+      def handle_text(&block)
+        start = @pos
+        while @pos < @len && byte(@pos) != 0x3C
+          @pos += 1
+        end
+        text = slice(start, @pos)
+        block.call([:text, text]) unless text.empty?
+      end
+      def handle_open(&block)
+        return unless @pos + 1 < @len
+        nxt = byte(@pos + 1)
+        # Comment
+        if nxt == 0x21 && @pos + 3 < @len && byte(@pos + 2) == 0x2D && byte(@pos + 3) == 0x2D
+          start = @pos + 4
+          e = @html.index("-->", start)
+          if e.nil?
+            @pos = @len
+            return
+          end
+          block.call([:comment, slice(start, e)])
+          @pos = e + 3
+          return
+        end
+        # Doctype or bogus !
+        if nxt == 0x21
+          gt = @html.index(">", @pos)
+          if gt.nil?
+            @pos = @len
+            return
+          end
+          decl = slice(@pos + 2, gt)
+          if decl =~ /\A\s*DOCTYPE\b\s*([^\s>]+)?/i
+            block.call([:doctype, ($1 || "").downcase])
+          end
+          @pos = gt + 1
+          return
+        end
+        # End tag
+        if nxt == 0x2F # '/'
+          @pos += 2
+          name_start = @pos
+          while @pos < @len && name_char?(byte(@pos))
+            @pos += 1
+          end
+          name = slice(name_start, @pos).downcase
+          # Skip to '>'
+          while @pos < @len && byte(@pos) != 0x3E
+            @pos += 1
+          end
+          @pos += 1 if @pos < @len
+          block.call([:end, name]) unless name.empty?
+          return
+        end
+        # Start tag
+        if name_start?(nxt)
+          @pos += 1
+          name_start = @pos
+          while @pos < @len && name_char?(byte(@pos))
+            @pos += 1
+          end
+          name = slice(name_start, @pos).downcase
+          attrs = parse_attrs
+          self_closing = consume_close
+          block.call([:start, name, attrs])
+          if VOID.include?(name) || self_closing
+            block.call([:end, name])
+          elsif RAW_TEXT.include?(name)
+            # Raw text content until matching </name>
+            text_start = @pos
+            needle = "</#{name}"
+            close_idx = @html.downcase.index(needle, @pos)
+            close_idx ||= @len
+            block.call([:text, slice(text_start, close_idx)]) if close_idx > text_start
+            @pos = close_idx
+            # consume </name ... >
+            if @pos < @len
+              while @pos < @len && byte(@pos) != 0x3E
+                @pos += 1
+              end
+              @pos += 1 if @pos < @len
+              block.call([:end, name])
+            end
+          end
+          return
+        end
+        # Literal '<' followed by non-name — emit as text
+        block.call([:text, "<"])
+        @pos += 1
+      end
+      def parse_attrs
+        attrs = {}
+        while @pos < @len
+          skip_ws
+          break if @pos >= @len
+          ch = byte(@pos)
+          break if ch == 0x3E   # '>'
+          break if ch == 0x2F   # '/' (self-closing marker)
+          # Attribute name
+          name_start = @pos
+          while @pos < @len
+            nc = byte(@pos)
+            break if nc == 0x3D || nc == 0x3E || nc == 0x2F || ws?(nc)
+            @pos += 1
+          end
+          aname = slice(name_start, @pos).downcase
+          next if aname.empty?
+          skip_ws
+          value = nil
+          if @pos < @len && byte(@pos) == 0x3D
+            @pos += 1
+            skip_ws
+            if @pos < @len
+              q = byte(@pos)
+              if q == 0x22 || q == 0x27
+                @pos += 1
+                val_start = @pos
+                while @pos < @len && byte(@pos) != q
+                  @pos += 1
+                end
+                value = slice(val_start, @pos)
+                @pos += 1 if @pos < @len
+              else
+                val_start = @pos
+                while @pos < @len && !ws?(byte(@pos)) && byte(@pos) != 0x3E
+                  @pos += 1
+                end
+                value = slice(val_start, @pos)
+              end
+            end
+          end
+          attrs[aname] = value || ""
+        end
+        attrs
+      end
+      def consume_close
+        self_closing = false
+        if @pos < @len && byte(@pos) == 0x2F
+          self_closing = true
+          @pos += 1
+        end
+        while @pos < @len && byte(@pos) != 0x3E
+          @pos += 1
+        end
+        @pos += 1 if @pos < @len
+        self_closing
+      end
+      def skip_ws
+        @pos += 1 while @pos < @len && ws?(byte(@pos))
+      end
+      def name_start?(b)
+        (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A) || b == 0x5F
+      end
+      def name_char?(b)
+        (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A) ||
+          (b >= 0x30 && b <= 0x39) || b == 0x2D || b == 0x5F || b == 0x3A
+      end
+      def ws?(b)
+        b == 0x20 || b == 0x09 || b == 0x0A || b == 0x0D || b == 0x0C || b == 0x0B
+      end
+    end
+  end
+end

data/lib/scrapetor/schema.rb ADDED Viewed

@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+module Scrapetor
+  class Schema
+    Field = Struct.new(
+      :name, :selector, :attr, :attr_str, :type, :clean, :multi,
+      :normalize_url, :default, :required, :transform, :delimiter
+    )
+    Group = Struct.new(:name, :selector, :fields, :groups)
+    attr_reader :fields, :groups
+    def initialize
+      @fields = []
+      @groups = []
+    end
+    def self.build(&block)
+      s = new
+      s.instance_eval(&block) if block
+      s
+    end
+    # field :name, from: SELECTOR, attr: SYM, type: SYM,
+    #              clean: BOOL, multi: BOOL, normalize_url: BOOL,
+    #              default: VALUE, required: BOOL,
+    #              transform: PROC, delimiter: STRING_OR_REGEX
+    #
+    # from: may be a String selector or an Array of selectors (tried in
+    # order until one matches).
+    #
+    # Types: :text :integer :float :money :url :date :json :html :list
+    #        :boolean :array (alias for multi:true)
+    def field(name,
+              from:,
+              attr: nil,
+              type: :text,
+              clean: false,
+              multi: false,
+              normalize_url: false,
+              default: nil,
+              required: false,
+              transform: nil,
+              delimiter: /\s*,\s*/)
+      multi = true if type == :array
+      type  = :text if type == :array
+      @fields << Field.new(
+        name, from, attr, attr && attr.to_s, type, clean, multi,
+        normalize_url, default, required, transform, delimiter
+      )
+    end
+    def repeated(selector, as:, &block)
+      sub = self.class.build(&block)
+      @groups << Group.new(as, selector, sub.fields, sub.groups)
+    end
+    # ----- Cross-process plan cache -----
+    #
+    # Serialize a schema to a binary blob (Marshal) so a worker can
+    # restore the compiled descriptor without re-parsing the Ruby DSL.
+    # Schemas using `transform:` (procs) can't be dumped — those plans
+    # must be rebuilt from source.
+    def dump
+      Marshal.dump(self.class.dumpable(self))
+    end
+    def self.load(blob)
+      new_from_h(Marshal.load(blob)) # rubocop:disable Security/MarshalLoad
+    end
+    def self.dump_to_file(schema, path)
+      File.binwrite(path, schema.dump)
+      path
+    end
+    def self.load_file(path)
+      load(File.binread(path))
+    end
+    # Convert a schema to a portable Hash (no procs).
+    def self.dumpable(schema)
+      {
+        fields: schema.fields.map { |f| field_to_h(f) },
+        groups: schema.groups.map { |g| group_to_h(g) }
+      }
+    end
+    def self.field_to_h(f)
+      raise SchemaError, "transform: blocks can't be serialized" if f.transform
+      {
+        name:          f.name,
+        selector:      f.selector,
+        attr:          f.attr,
+        attr_str:      f.attr_str,
+        type:          f.type,
+        clean:         f.clean,
+        multi:         f.multi,
+        normalize_url: f.normalize_url,
+        default:       f.default,
+        required:      f.required,
+        delimiter:     f.delimiter
+      }
+    end
+    def self.group_to_h(g)
+      {
+        name:     g.name,
+        selector: g.selector,
+        fields:   g.fields.map { |f| field_to_h(f) },
+        groups:   g.groups.map { |sub| group_to_h(sub) }
+      }
+    end
+    def self.new_from_h(h)
+      schema = new
+      h[:fields].each { |fh| schema.fields << field_from_h(fh) }
+      h[:groups].each { |gh| schema.groups << group_from_h(gh) }
+      schema
+    end
+    def self.field_from_h(h)
+      Field.new(
+        h[:name], h[:selector], h[:attr], h[:attr_str], h[:type],
+        h[:clean], h[:multi], h[:normalize_url], h[:default],
+        h[:required], nil, h[:delimiter]
+      )
+    end
+    def self.group_from_h(h)
+      Group.new(
+        h[:name],
+        h[:selector],
+        h[:fields].map { |fh| field_from_h(fh) },
+        h[:groups].map { |gh| group_from_h(gh) }
+      )
+    end
+    def to_h
+      self.class.dumpable(self)
+    end
+  end
+end