crawlr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,286 @@
+ # frozen_string_literal: true
+
+ require "async"
+ require "async/timeout"
+ require "async/http/internet"
+ require "http/cookie_jar"
+
+ module Crawlr
+   # Handles fetching documents via async HTTP with proxy and cookie support.
+   #
+   # The HTTPInterface class provides a high-level async HTTP client specifically
+   # designed for web scraping. It supports proxy rotation, cookie management,
+   # configurable timeouts, and transforms raw HTTP responses into a simplified
+   # response structure suitable for content processing.
+   #
+   # @example Basic HTTP fetching
+   #   config = Crawlr::Config.new(timeout: 10)
+   #   http = Crawlr::HTTPInterface.new(config)
+   #
+   #   response = http.get('https://example.com')
+   #   puts response.status #=> 200
+   #   puts response.body #=> HTML content
+   #
+   # @example With cookie support
+   #   config = Crawlr::Config.new(allow_cookies: true)
+   #   http = Crawlr::HTTPInterface.new(config)
+   #
+   #   # Cookies are automatically managed across requests
+   #   login_response = http.get('https://site.com/login')
+   #   profile_response = http.get('https://site.com/profile') # Uses login cookies
+   #
+   # @example With proxy rotation
+   #   config = Crawlr::Config.new(
+   #     proxies: ['http://proxy1:8080', 'socks5://proxy2:1080'],
+   #     proxy_strategy: :round_robin
+   #   )
+   #   http = Crawlr::HTTPInterface.new(config)
+   #
+   #   response = http.get('https://example.com') # Uses proxy1
+   #   response = http.get('https://example.com') # Uses proxy2
+   #
+   # @example With request hooks
+   #   response = http.get('https://api.example.com') do |url, headers|
+   #     headers['Authorization'] = "Bearer #{get_token()}"
+   #     headers['X-Request-ID'] = SecureRandom.uuid
+   #   end
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class HTTPInterface
+     # Simplified HTTP response structure for internal use
+     #
+     # @!attribute [r] url
+     #   @return [String] The requested URL
+     # @!attribute [r] status
+     #   @return [Integer] HTTP status code
+     # @!attribute [r] headers
+     #   @return [Hash] HTTP response headers
+     # @!attribute [r] version
+     #   @return [String] HTTP protocol version
+     # @!attribute [r] body
+     #   @return [String, nil] Response body content
+     Response = Struct.new(:url, :status, :headers, :version, :body)
+
+     # @return [Crawlr::Config] Configuration object containing HTTP settings
+     attr_reader :config
+
+     # Initializes a new HTTPInterface with the given configuration
+     #
+     # Sets up cookie management (if enabled) and proxy rotation state.
+     # The cookie jar persists across all requests made by this interface instance.
+     #
+     # @param config [Crawlr::Config] Configuration object with HTTP settings
+     # @option config [Boolean] :allow_cookies Enable cookie jar management
+     # @option config [Array<String>] :proxies List of proxy URLs
+     # @option config [Symbol] :proxy_strategy Proxy selection strategy (:round_robin, :random)
+     # @option config [Integer] :timeout Request timeout in seconds
+     # @option config [Hash] :headers Default headers for all requests
+     #
+     # @example
+     #   config = Crawlr::Config.new(
+     #     allow_cookies: true,
+     #     timeout: 15,
+     #     proxies: ['http://proxy.example.com:8080']
+     #   )
+     #   http = Crawlr::HTTPInterface.new(config)
+     def initialize(config)
+       @config = config
+       @cookie_jar = @config.allow_cookies ? HTTP::CookieJar.new : nil
+       @proxy_index = 0
+     end
+
+     # Performs an HTTP GET request with full async support and cookie management
+     #
+     # This method handles the complete HTTP request lifecycle including:
+     # - Proxy selection and connection setup
+     # - Cookie retrieval and attachment
+     # - Request header customization via block
+     # - Async execution with timeout handling
+     # - Response cookie parsing and storage
+     # - Resource cleanup and connection closing
+     #
+     # @param url [String] The URL to fetch
+     # @yield [url, headers] Optional block for request customization
+     # @yieldparam url [String] The URL being requested
+     # @yieldparam headers [Hash] Mutable headers hash for customization
+     # @return [HTTPInterface::Response] Simplified response object
+     # @raise [Async::TimeoutError] When request exceeds configured timeout
+     # @raise [URI::InvalidURIError] When URL is malformed
+     # @raise [StandardError] For other HTTP-related errors
+     #
+     # @example Basic GET request
+     #   response = http.get('https://example.com/api/data')
+     #   if response.status == 200
+     #     data = JSON.parse(response.body)
+     #   end
+     #
+     # @example With custom headers
+     #   response = http.get('https://api.service.com/endpoint') do |url, headers|
+     #     headers['Accept'] = 'application/json'
+     #     headers['X-API-Key'] = ENV['API_KEY']
+     #     headers['User-Agent'] = 'MyBot/1.0'
+     #   end
+     #
+     # @example With authentication
+     #   response = http.get('https://secure.site.com/data') do |url, headers|
+     #     token = authenticate_user(url)
+     #     headers['Authorization'] = "Bearer #{token}"
+     #   end
+     #
+     # @example Error handling
+     #   begin
+     #     response = http.get('https://unreliable.com/data')
+     #   rescue Async::TimeoutError
+     #     puts "Request timed out"
+     #   rescue StandardError => e
+     #     puts "Request failed: #{e.message}"
+     #   end
+     def get(url)
+       Crawlr.logger.debug "Fetching #{url}"
+
+       uri = URI.parse(url)
+       proxy_url = next_proxy
+       internet = build_internet_connection(proxy_url)
+
+       request_headers = @config.headers.dup
+
+       if @config.allow_cookies
+         cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
+         request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+       end
+
+       yield(url, request_headers) if block_given?
+
+       raw_response = nil
+       begin
+         Sync do |task|
+           raw_response = task.with_timeout(@config.timeout) do
+             internet.get(url, request_headers)
+           end
+         end
+
+         parse_and_set_cookies(uri, raw_response) if @config.allow_cookies && raw_response
+         make_response_struct(url, raw_response)
+       rescue Async::TimeoutError
+         Crawlr.logger.warn "Timeout fetching #{url} after #{@config.timeout}sec"
+         raise
+       ensure
+         raw_response&.close
+         internet&.close
+         Crawlr.logger.debug "Done fetching #{url}"
+       end
+     end
+
+     private
+
+     # Builds an async HTTP connection with optional proxy support
+     #
+     # Creates either a direct internet connection or a proxied connection
+     # based on the provided proxy URL. Supports HTTP and SOCKS5 proxies.
+     #
+     # @param proxy [String, nil] Proxy URL or nil for direct connection
+     # @return [Async::HTTP::Internet, Async::HTTP::Client] HTTP connection object
+     # @raise [URI::InvalidURIError] When proxy URL is malformed
+     # @api private
+     #
+     # @example Direct connection
+     #   connection = build_internet_connection(nil)
+     #
+     # @example HTTP proxy
+     #   connection = build_internet_connection('http://proxy.example.com:8080')
+     #
+     # @example SOCKS proxy with authentication
+     #   connection = build_internet_connection('socks5://user:pass@proxy.example.com:1080')
+     def build_internet_connection(proxy = nil)
+       if proxy
+         # Expected format: "http://user:pass@host:port" or "socks5://host:port"
+         uri = URI.parse(proxy)
+         Crawlr.logger.debug "Using proxy: #{uri}"
+         # Async::HTTP::Proxy requires target endpoint
+         endpoint = Async::HTTP::Endpoint.parse(uri.to_s)
+         Async::HTTP::Client.new(endpoint)
+       else
+         Async::HTTP::Internet.new
+       end
+     end
+
+     # Selects the next proxy according to the configured strategy
+     #
+     # Implements proxy rotation strategies to distribute requests across
+     # multiple proxy servers. Maintains state for round-robin selection.
+     #
+     # @return [String, nil] Next proxy URL or nil if no proxies configured
+     # @raise [StandardError] When proxy_strategy is unknown
+     # @api private
+     #
+     # @example Round-robin selection
+     #   proxy = next_proxy # Returns first proxy
+     #   proxy = next_proxy # Returns second proxy
+     #   proxy = next_proxy # Wraps back to first proxy
+     #
+     # @example Random selection
+     #   # config.proxy_strategy = :random
+     #   proxy = next_proxy # Returns random proxy from list
+     def next_proxy
+       return nil if @config.proxies.empty?
+
+       case @config.proxy_strategy
+       when :round_robin
+         proxy = @config.proxies[@proxy_index % @config.proxies.size]
+         @proxy_index += 1
+         proxy
+       when :random
+         @config.proxies.sample
+       else
+         raise "Unknown proxy strategy: #{@config.proxy_strategy}"
+       end
+     end
+
+     # Creates a simplified response struct from the raw HTTP response
+     #
+     # Transforms the async-http response object into a simplified structure
+     # that's easier to work with in the scraping framework. Safely handles
+     # body reading with error recovery.
+     #
+     # @param url [String] The original request URL
+     # @param response [Async::HTTP::Response] Raw async-http response object
+     # @return [HTTPInterface::Response] Simplified response struct
+     # @api private
+     def make_response_struct(url, response)
+       body = begin
+         response.read
+       rescue StandardError
+         nil
+       end
+
+       Response.new(url, response.status, response.headers, response.version, body)
+     end
+
+     # Parses and stores cookies from HTTP response headers
+     #
+     # Extracts Set-Cookie headers from the response and adds them to the
+     # internal cookie jar for use in subsequent requests. Handles multiple
+     # cookies and logs cookie information for debugging.
+     #
+     # @param uri [URI] The request URI for cookie domain/path context
+     # @param response [Async::HTTP::Response] HTTP response containing cookies
+     # @return [void]
+     # @api private
+     #
+     # @example Cookie processing
+     #   # Response contains: Set-Cookie: session_id=abc123; Domain=.example.com; Path=/
+     #   parse_and_set_cookies(uri, response)
+     #   # Cookie is stored and will be sent with future requests to example.com
+     def parse_and_set_cookies(uri, response)
+       set_cookies = response.headers["set-cookie"]
+       Array(set_cookies).each do |set_cookie|
+         HTTP::Cookie.parse(set_cookie.to_s, uri).each do |cookie|
+           @cookie_jar.add(cookie)
+           Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
+                               " domain=#{cookie.domain}, path=#{cookie.path}"
+         end
+       end
+     end
+   end
+ end
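
To show how the class above fits together end to end, here is a brief usage sketch. This is illustrative caller code, not part of the package: it assumes the gem is loaded with require "crawlr" and that Crawlr::Config accepts the keyword arguments shown in the YARD examples (timeout:, allow_cookies:, proxies:, proxy_strategy:).

    # Hypothetical caller code; Config arguments are assumed from the examples above.
    require "crawlr"

    config = Crawlr::Config.new(
      timeout: 10,
      allow_cookies: true,
      proxies: ["http://proxy1:8080"],
      proxy_strategy: :round_robin
    )
    http = Crawlr::HTTPInterface.new(config)

    begin
      # The block mirrors the documented request hook: headers can be mutated per request.
      response = http.get("https://example.com") do |_url, headers|
        headers["User-Agent"] = "MyBot/1.0"
      end
      puts "#{response.status} #{response.body&.bytesize} bytes"
    rescue Async::TimeoutError
      warn "request timed out"
    end

The second file in the diff follows.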
@@ -0,0 +1,242 @@
+ # frozen_string_literal: true
+
+ require "nokogiri"
+
+ module Crawlr
+   # Document parsing and callback execution engine.
+   #
+   # The Parser module provides the core document processing functionality for
+   # the Crawlr framework. It efficiently parses HTML and XML content using
+   # Nokogiri and executes registered callbacks on matching elements. The module
+   # optimizes performance by grouping callbacks by document format to minimize
+   # parsing overhead.
+   #
+   # @example Basic callback execution
+   #   content = '<html><body><h1>Title</h1><p>Content</p></body></html>'
+   #
+   #   callbacks = [
+   #     {
+   #       format: :html,
+   #       selector_type: :css,
+   #       selector: 'h1',
+   #       block: ->(node, ctx) { ctx.titles << node.text }
+   #     }
+   #   ]
+   #
+   #   context = OpenStruct.new(titles: [])
+   #   Crawlr::Parser.apply_callbacks(
+   #     content: content,
+   #     callbacks: callbacks,
+   #     context: context
+   #   )
+   #   puts context.titles #=> ["Title"]
+   #
+   # @example Mixed HTML and XML parsing
+   #   callbacks = [
+   #     {
+   #       format: :html,
+   #       selector_type: :css,
+   #       selector: '.product',
+   #       block: ->(node, ctx) { process_html_product(node, ctx) }
+   #     },
+   #     {
+   #       format: :xml,
+   #       selector_type: :xpath,
+   #       selector: '//item[@type="product"]',
+   #       block: ->(node, ctx) { process_xml_product(node, ctx) }
+   #     }
+   #   ]
+   #
+   #   Crawlr::Parser.apply_callbacks(
+   #     content: xml_content,
+   #     callbacks: callbacks,
+   #     context: scraping_context
+   #   )
+   #
+   # @example Performance optimization with format grouping
+   #   # Multiple callbacks for same format - document parsed only once
+   #   callbacks = [
+   #     { format: :html, selector_type: :css, selector: 'title', block: title_proc },
+   #     { format: :html, selector_type: :css, selector: 'meta', block: meta_proc },
+   #     { format: :html, selector_type: :xpath, selector: '//a[@href]', block: link_proc }
+   #   ]
+   #
+   #   # HTML content parsed once, all callbacks executed on same document
+   #   Crawlr::Parser.apply_callbacks(content: html, callbacks: callbacks, context: ctx)
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   module Parser
+     # Applies registered callbacks to parsed document content
+     #
+     # This method is the main entry point for document processing. It efficiently
+     # handles multiple callbacks by grouping them by document format, ensuring
+     # that each piece of content is parsed only once per format regardless of
+     # how many callbacks are registered for that format.
+     #
+     # The method performs the following operations:
+     # 1. Groups callbacks by document format (:html or :xml)
+     # 2. Parses content once per format using appropriate Nokogiri parser
+     # 3. Executes all callbacks for each format on the parsed document
+     # 4. Extracts matching nodes using CSS or XPath selectors
+     # 5. Calls callback blocks with matched nodes and context
+     #
+     # @param content [String] Raw HTML or XML content to parse
+     # @param callbacks [Array<Hash>] Array of callback configuration hashes
+     # @param context [Object] Context object passed to callback blocks
+     # @option callbacks [Symbol] :format Document format (:html or :xml, defaults to :html)
+     # @option callbacks [Symbol] :selector_type Selector type (:css or :xpath)
+     # @option callbacks [String] :selector CSS or XPath selector string
+     # @option callbacks [Proc] :block Callback block to execute on matching nodes
+     # @return [void]
+     #
+     # @example Single callback execution
+     #   callbacks = [{
+     #     format: :html,
+     #     selector_type: :css,
+     #     selector: '.article-title',
+     #     block: ->(node, ctx) { ctx.titles << node.text.strip }
+     #   }]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: html_content,
+     #     callbacks: callbacks,
+     #     context: context_object
+     #   )
+     #
+     # @example Multiple callbacks with different selectors
+     #   callbacks = [
+     #     {
+     #       format: :html,
+     #       selector_type: :css,
+     #       selector: 'h1, h2, h3',
+     #       block: ->(node, ctx) { ctx.headings << { text: node.text, level: node.name } }
+     #     },
+     #     {
+     #       format: :html,
+     #       selector_type: :xpath,
+     #       selector: '//a[@href and text()]',
+     #       block: ->(node, ctx) { ctx.links << { url: node['href'], text: node.text } }
+     #     }
+     #   ]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: page_html,
+     #     callbacks: callbacks,
+     #     context: scraping_context
+     #   )
+     #
+     # @example XML feed processing
+     #   callbacks = [{
+     #     format: :xml,
+     #     selector_type: :xpath,
+     #     selector: '//item/title',
+     #     block: ->(node, ctx) { ctx.feed_titles << node.text }
+     #   }]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: rss_xml,
+     #     callbacks: callbacks,
+     #     context: feed_context
+     #   )
+     #
+     # @example Complex data extraction
+     #   callbacks = [{
+     #     format: :html,
+     #     selector_type: :css,
+     #     selector: '.product-card',
+     #     block: ->(node, ctx) {
+     #       product = {
+     #         name: node.css('.product-name').text,
+     #         price: node.css('.price').text,
+     #         image: node.css('img')&.first&.[]('src')
+     #       }
+     #       ctx.products << product
+     #     }
+     #   }]
+     #
+     #   Crawlr::Parser.apply_callbacks(
+     #     content: product_page_html,
+     #     callbacks: callbacks,
+     #     context: product_context
+     #   )
+     def self.apply_callbacks(content:, callbacks:, context:)
+       # Group callbacks by format to minimize parsing
+       callbacks_by_format = callbacks.group_by { |cb| cb[:format] || :html }
+
+       callbacks_by_format.each do |format, format_callbacks|
+         doc = parse_content(format, content)
+
+         format_callbacks.each do |callback|
+           Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+           nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+           nodes.each { |node| callback[:block].call(node, context) }
+         end
+       end
+     end
+
+     # Parses content using the appropriate Nokogiri parser
+     #
+     # Creates a Nokogiri document object using either the HTML or XML parser
+     # based on the specified format. The HTML parser is more lenient and
+     # handles malformed markup better, while the XML parser is stricter and
+     # preserves XML-specific features.
+     #
+     # @param format [Symbol] Document format (:html or :xml)
+     # @param content [String] Raw document content to parse
+     # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document] Parsed document
+     # @raise [ArgumentError] When format is not :html or :xml
+     # @api private
+     #
+     # @example HTML parsing
+     #   doc = parse_content(:html, '<html><body>Hello</body></html>')
+     #   doc.class #=> Nokogiri::HTML::Document
+     #
+     # @example XML parsing
+     #   doc = parse_content(:xml, '<?xml version="1.0"?><root><item>data</item></root>')
+     #   doc.class #=> Nokogiri::XML::Document
+     private_class_method def self.parse_content(format, content)
+       case format
+       when :html then Nokogiri::HTML(content)
+       when :xml then Nokogiri::XML(content)
+       else raise ArgumentError, "Unsupported format #{format}"
+       end
+     end
+
+     # Extracts nodes from parsed document using specified selector
+     #
+     # Executes CSS or XPath selectors against the parsed document to find
+     # matching elements. Returns a NodeSet that can be iterated over to
+     # process each matching element.
+     #
+     # @param doc [Nokogiri::HTML::Document, Nokogiri::XML::Document] Parsed document
+     # @param selector_type [Symbol] Type of selector (:css or :xpath)
+     # @param selector [String] Selector expression to find matching nodes
+     # @return [Nokogiri::XML::NodeSet] Collection of matching nodes
+     # @raise [ArgumentError] When selector_type is not :css or :xpath
+     # @api private
+     #
+     # @example CSS selector extraction
+     #   nodes = extract_nodes(doc, :css, '.product-title')
+     #   nodes.each { |node| puts node.text }
+     #
+     # @example XPath selector extraction
+     #   nodes = extract_nodes(doc, :xpath, '//div[@class="content"]//p')
+     #   nodes.each { |node| process_paragraph(node) }
+     #
+     # @example Complex CSS selector
+     #   nodes = extract_nodes(doc, :css, 'article > header h1, article > header h2')
+     #   # Returns all h1 and h2 elements inside headers that are direct children of an article
+     #
+     # @example XPath with attributes
+     #   nodes = extract_nodes(doc, :xpath, '//a[@href and contains(@class, "external")]')
+     #   # Returns all links that have an href attribute and a class containing "external"
+     private_class_method def self.extract_nodes(doc, selector_type, selector)
+       case selector_type
+       when :css then doc.css(selector)
+       when :xpath then doc.xpath(selector)
+       else raise ArgumentError, "Unsupported selector type #{selector_type}"
+       end
+     end
+   end
+ end
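
Finally, a hypothetical end-to-end sketch (not part of the diff) showing the two files working together: HTTPInterface fetches a page and Parser runs a single CSS callback over the body. The Context struct and the Crawlr::Config arguments are assumptions made for illustration only.

    # Hypothetical glue code; assumes Config exposes the timeout: option used above.
    require "crawlr"

    Context = Struct.new(:links)

    http    = Crawlr::HTTPInterface.new(Crawlr::Config.new(timeout: 10))
    context = Context.new([])

    callbacks = [{
      format: :html,
      selector_type: :css,
      selector: "a[href]",
      block: ->(node, ctx) { ctx.links << node["href"] }
    }]

    response = http.get("https://example.com")
    # The body can be nil if reading failed, so guard before parsing.
    if response.body
      Crawlr::Parser.apply_callbacks(content: response.body, callbacks: callbacks, context: context)
    end
    puts context.links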