crawlr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,331 @@
+ # frozen_string_literal: true
+
+ require "uri"
+
+ module Crawlr
+   # Robots.txt parser and compliance checker for respectful web scraping.
+   #
+   # The Robots class implements full robots.txt specification compliance,
+   # including user-agent matching, path pattern matching with wildcards,
+   # allow/disallow precedence rules, and crawl-delay directives. It helps
+   # ensure that scrapers respect website crawling policies and avoid
+   # making unwanted requests.
+   #
+   # @example Basic robots.txt compliance
+   #   robots = Crawlr::Robots.new
+   #
+   #   # Parse robots.txt content
+   #   robots_content = <<~ROBOTS
+   #     User-agent: *
+   #     Disallow: /private/
+   #     Allow: /public/
+   #     Crawl-delay: 1
+   #   ROBOTS
+   #
+   #   robots.parse('https://example.com', robots_content)
+   #
+   #   # Check URL permissions
+   #   robots.allowed?('https://example.com/public/page', 'MyBot/1.0') #=> true
+   #   robots.allowed?('https://example.com/private/data', 'MyBot/1.0') #=> false
+   #
+   # @example Complex user-agent matching
+   #   robots_content = <<~ROBOTS
+   #     User-agent: Googlebot
+   #     Disallow: /admin/
+   #
+   #     User-agent: *
+   #     Disallow: /
+   #     Allow: /public/
+   #   ROBOTS
+   #
+   #   robots.parse('https://site.com', robots_content)
+   #
+   #   robots.allowed?('https://site.com/admin/', 'Googlebot/2.1') #=> false
+   #   robots.allowed?('https://site.com/public/', 'Googlebot/2.1') #=> true
+   #   robots.allowed?('https://site.com/anything/', 'OtherBot/1.0') #=> false
+   #
+   # @example Wildcard pattern matching
+   #   robots_content = <<~ROBOTS
+   #     User-agent: *
+   #     Disallow: /*.pdf$
+   #     Disallow: /temp/*
+   #     Allow: /temp/public/*
+   #   ROBOTS
+   #
+   #   robots.parse('https://example.com', robots_content)
+   #
+   #   robots.allowed?('https://example.com/document.pdf', 'Bot') #=> false
+   #   robots.allowed?('https://example.com/temp/secret.txt', 'Bot') #=> false
+   #   robots.allowed?('https://example.com/temp/public/file.txt', 'Bot') #=> true
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Robots
+     # Represents a robots.txt rule for a specific user-agent
+     #
+     # @!attribute [r] user_agent
+     #   @return [String] User-agent pattern this rule applies to
+     # @!attribute [r] allow
+     #   @return [Array<String>] Array of allowed path patterns
+     # @!attribute [r] disallow
+     #   @return [Array<String>] Array of disallowed path patterns
+     # @!attribute [r] crawl_delay
+     #   @return [String, nil] Crawl delay in seconds for this user-agent
+     Rule = Struct.new(:user_agent, :allow, :disallow, :crawl_delay)
+
+     # @return [Hash<String, Array<Rule>>] Internal store of parsed robots.txt rules by domain
+     attr_reader :store
+
+     # Initializes a new Robots instance
+     #
+     # Creates an empty store for caching parsed robots.txt files by domain.
+     # Each domain's robots.txt is parsed once and cached for subsequent
+     # permission checks.
+     #
+     # @example
+     #   robots = Crawlr::Robots.new
+     def initialize
+       @store = {}
+     end
+
+     # Checks if robots.txt has been parsed and cached for a given origin
+     #
+     # @param origin [String] The origin URL (scheme + host + port)
+     # @return [Boolean] true if robots.txt data exists for this origin
+     #
+     # @example
+     #   robots.exists?('https://example.com') #=> false
+     #   robots.parse('https://example.com', robots_content)
+     #   robots.exists?('https://example.com') #=> true
+     def exists?(origin)
+       @store.key?((URI.parse(origin).host || origin).downcase)
+     end
+
+     # Determines if a URL is allowed to be crawled according to robots.txt rules
+     #
+     # This method implements the full robots.txt specification including:
+     # - User-agent matching with prefix matching and wildcards
+     # - Path pattern matching with wildcards and end anchors
+     # - Allow/disallow precedence with longest match wins
+     # - Graceful fallback when no robots.txt exists
+     #
+     # @param url [String] The full URL to check for crawling permission
+     # @param user_agent [String] The user-agent string to match against rules
+     # @return [Boolean] true if the URL is allowed to be crawled
+     #
+     # @example Basic permission checking
+     #   robots.allowed?('https://example.com/page.html', 'MyBot/1.0')
+     #
+     # @example With specific user-agent rules
+     #   # robots.txt contains specific rules for "MyBot"
+     #   robots.allowed?('https://site.com/admin/', 'MyBot/2.0') #=> depends on rules
+     #   robots.allowed?('https://site.com/admin/', 'OtherBot') #=> uses wildcard rules
+     #
+     # @example Pattern matching examples
+     #   # robots.txt: Disallow: /*.pdf$
+     #   robots.allowed?('https://site.com/doc.pdf', 'Bot') #=> false
+     #   robots.allowed?('https://site.com/doc.pdf.html', 'Bot') #=> true
+     #
+     #   # robots.txt: Disallow: /temp/*
+     #   robots.allowed?('https://site.com/temp/file.txt', 'Bot') #=> false
+     #   robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
+     def allowed?(url, user_agent)
+       rule = get_rule(url, user_agent)
+       return true unless rule # if no robots.txt or no rule, allow
+
+       path = URI.parse(url).path
+       matched = []
+
+       # Collect allow/disallow patterns that match the request path (robots.txt prefix-style matching)
+       rule.allow.each do |pattern|
+         matched << [:allow, pattern] if robots_match?(pattern, path)
+       end
+
+       rule.disallow.each do |pattern|
+         matched << [:disallow, pattern] if robots_match?(pattern, path)
+       end
+
+       return true if matched.empty?
+
+       # Longest match wins
+       action, = matched.max_by { |_, p| p.length }
+       action == :allow
+     end
+
+     # Parses robots.txt content and stores rules for the given URL's domain
+     #
+     # Extracts and processes all robots.txt directives including:
+     # - User-agent declarations
+     # - Allow and Disallow rules
+     # - Crawl-delay directives
+     # - Sitemap declarations
+     # - Comment and empty line handling
+     #
+     # @param url [String] The URL where this robots.txt was fetched from
+     # @param content [String] Raw robots.txt file content
+     # @return [void]
+     #
+     # @example Parse standard robots.txt
+     #   robots_content = <<~ROBOTS
+     #     # This is a comment
+     #     User-agent: *
+     #     Disallow: /private/
+     #     Allow: /public/
+     #     Crawl-delay: 2
+     #
+     #     User-agent: Googlebot
+     #     Allow: /
+     #
+     #     Sitemap: https://example.com/sitemap.xml
+     #   ROBOTS
+     #
+     #   robots.parse('https://example.com/robots.txt', robots_content)
+     #
+     # @example Parse with wildcards and patterns
+     #   robots_content = <<~ROBOTS
+     #     User-agent: *
+     #     Disallow: /*.json$
+     #     Disallow: /api/v*/private/
+     #     Allow: /api/v*/public/
+     #   ROBOTS
+     #
+     #   robots.parse('https://api.example.com', robots_content)
+     def parse(url, content)
+       uri = URI.parse(url)
+       domain = uri.host.downcase
+       hash = parse_to_hash(content)
+
+       rules = []
+       hash[:rules].each do |user_agent, rule|
+         rules << Rule.new(user_agent, rule[:allow], rule[:disallow], rule[:crawl_delay])
+       end
+
+       @store[domain] ||= rules
+     end
+
+     private
+
+     # Finds the most applicable rule for a URL and user-agent combination
+     #
+     # Implements the robots.txt user-agent matching algorithm:
+     # 1. Find rules with user-agent prefix matching (case-insensitive)
+     # 2. If no matches, fall back to wildcard (*) rules
+     # 3. Return the most specific match (longest user-agent string)
+     #
+     # @param url [String] URL to find rules for
+     # @param user_agent [String] User-agent to match
+     # @return [Rule, nil] Most applicable rule or nil if none found
+     # @api private
+     def get_rule(url, user_agent)
+       uri = URI.parse(url)
+       domain = uri.host.downcase
+       rules = @store[domain]
+       return nil unless rules
+
+       # Case-insensitive prefix match
+       applicable_rules = rules.select do |rule|
+         next if rule.user_agent.nil?
+
+         user_agent.downcase.start_with?(rule.user_agent.downcase)
+       end
+
+       # Fallback to wildcard
+       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
+
+       # Most specific (longest UA name) wins
+       applicable_rules.max_by { |r| r.user_agent.length }
+     end
+
+     # Tests if a robots.txt pattern matches a given path
+     #
+     # Implements robots.txt pattern matching including:
+     # - Prefix matching of patterns against the URL path
+     # - Wildcard (*) support for matching any character sequence
+     # - End anchor ($) support for exact suffix matching
+     #
+     # @param pattern [String] robots.txt path pattern (may include wildcards and anchors)
+     # @param path [String] URL path to test against pattern
+     # @return [Boolean] true if pattern matches the path
+     # @api private
+     #
+     # @example Wildcard patterns
+     #   robots_match?('/temp/*', '/temp/file.txt') #=> true
+     #   robots_match?('/temp/*', '/temporary/') #=> false
+     #
+     # @example End anchor patterns
+     #   robots_match?('*.pdf$', '/document.pdf') #=> true
+     #   robots_match?('*.pdf$', '/document.pdf.html') #=> false
+     #
+     # @example Prefix matching
+     #   robots_match?('/admin/', '/admin/') #=> true
+     #   robots_match?('/admin/', '/admin/page.html') #=> true
+     def robots_match?(pattern, path)
+       return false if pattern.empty?
+
+       # Handle `$` end anchor (match the exact end of the path)
+       anchored = pattern.end_with?("$")
+       pattern = pattern.chomp("$") if anchored
+
+       # robots.txt patterns are prefix matches; `*` matches any character sequence
+       regexp_body = Regexp.escape(pattern).gsub('\*', ".*")
+       regexp = anchored ? /\A#{regexp_body}\z/ : /\A#{regexp_body}/
+       path.match?(regexp)
+     end
+
+     # Parses robots.txt content into a structured hash format
+     #
+     # Processes the raw robots.txt file line by line, handling:
+     # - User-agent declarations and grouping
+     # - Allow/Disallow rule accumulation
+     # - Crawl-delay value extraction
+     # - Sitemap URL collection
+     # - Comment and whitespace filtering
+     #
+     # @param content [String] Raw robots.txt file content
+     # @return [Hash] Structured hash with :sitemap and :rules keys
+     # @api private
+     #
+     # @example Return structure
+     #   {
+     #     sitemap: ['https://example.com/sitemap.xml'],
+     #     rules: {
+     #       '*' => { allow: ['/public/'], disallow: ['/private/'], crawl_delay: '1' },
+     #       'Googlebot' => { allow: ['/'], disallow: [], crawl_delay: nil }
+     #     }
+     #   }
+     def parse_to_hash(content)
+       robots_hash = {
+         sitemap: [],
+         rules: {}
+       }
+
+       curr_user_agents = []
+
+       content.each_line do |line|
+         clean_line = line.strip
+         next if clean_line.empty? || clean_line.start_with?("#")
+
+         key, value = clean_line.split(":", 2).map(&:strip)
+         next unless key && value
+
+         key = key.downcase
+
+         case key
+         when "sitemap"
+           robots_hash[:sitemap] << value
+         when "user-agent"
+           curr_user_agents = [value]
+           robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+         when "allow"
+           curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+         when "disallow"
+           curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+         when "crawl-delay"
+           curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
+         end
+       end
+
+       robots_hash
+     end
+   end
+ end
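The Robots class above covers the whole parse-and-check cycle. The sketch below shows one way calling code might wire it up; the Net::HTTP fetch and the require path are illustrative assumptions (the gem's own HTTP layer is not part of this diff), and Crawl-delay is read through the public store reader because no dedicated accessor is exposed.

    require "net/http"
    require "uri"
    require "crawlr/robots" # assumed load path; adjust to however the gem is loaded

    robots = Crawlr::Robots.new
    origin = "https://example.com"

    # Fetch and parse robots.txt once per origin (plain Net::HTTP here for illustration)
    unless robots.exists?(origin)
      robots.parse(origin, Net::HTTP.get(URI.parse("#{origin}/robots.txt")))
    end

    url = "#{origin}/public/page"
    if robots.allowed?(url, "MyBot/1.0")
      # The parsed Rule structs are reachable through #store, keyed by downcased host;
      # crawl_delay values are stored as raw strings.
      delay = robots.store[URI.parse(origin).host.downcase]&.find { |r| r.user_agent == "*" }&.crawl_delay
      sleep(delay.to_f) if delay
      # ... fetch `url` with whatever HTTP client the surrounding crawler uses ...
    end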
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module Crawlr
+   VERSION = "0.1.0"
+ end
@@ -0,0 +1,190 @@
+ # frozen_string_literal: true
+
+ require "concurrent"
+
+ module Crawlr
+   # Thread-safe visit tracking system for URL deduplication and history management.
+   #
+   # The Visits class maintains a record of visited URLs to prevent duplicate
+   # requests during scraping sessions. It uses concurrent data structures to
+   # ensure thread safety in parallel scraping environments and implements
+   # memory management through configurable visit limits with automatic cache
+   # reset when limits are reached.
+   #
+   # @example Basic visit tracking
+   #   config = Crawlr::Config.new(allow_url_revisit: false, max_visited: 1000)
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   visits.new?('https://example.com/page1') #=> true (first time)
+   #   visits.register('https://example.com/page1')
+   #   visits.new?('https://example.com/page1') #=> false (already visited)
+   #
+   # @example With URL revisiting allowed
+   #   config = Crawlr::Config.new(allow_url_revisit: true)
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   visits.new?('https://example.com/page') #=> true (always allowed)
+   #   visits.register('https://example.com/page')
+   #   visits.new?('https://example.com/page') #=> true (revisiting allowed)
+   #
+   # @example Memory management with limits
+   #   config = Crawlr::Config.new(max_visited: 5)
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   # Add URLs up to limit
+   #   (1..5).each do |i|
+   #     visits.register("https://example.com/page#{i}")
+   #   end
+   #
+   #   # Next check triggers cache reset
+   #   visits.new?('https://example.com/page6') #=> true (cache was reset)
+   #   visits.stats[:visited_count] #=> 0 (cache cleared)
+   #
+   # @example Thread-safe parallel scraping
+   #   visits = Crawlr::Visits.new(config)
+   #
+   #   # Safe to use across multiple threads
+   #   threads = 10.times.map do |i|
+   #     Thread.new do
+   #       url = "https://example.com/thread#{i}/page"
+   #       if visits.new?(url)
+   #         visits.register(url)
+   #         scrape_page(url)
+   #       end
+   #     end
+   #   end
+   #
+   #   threads.each(&:join)
+   #
+   # @author [Your Name]
+   # @since 0.1.0
+   class Visits
+     # Initializes a new Visits tracker with the given configuration
+     #
+     # Creates a thread-safe concurrent map for storing visited URLs and
+     # configures behavior based on the provided settings for revisiting
+     # and memory management.
+     #
+     # @param config [Crawlr::Config] Configuration object with visit tracking settings
+     # @option config [Boolean] :allow_url_revisit Whether to allow revisiting URLs
+     # @option config [Integer] :max_visited Maximum URLs to track before cache reset
+     #
+     # @example
+     #   config = Crawlr::Config.new(
+     #     allow_url_revisit: false,
+     #     max_visited: 10_000
+     #   )
+     #   visits = Crawlr::Visits.new(config)
+     def initialize(config)
+       @config = config
+       @visited = Concurrent::Map.new
+     end
+
+     # Registers a URL as visited in the tracking system
+     #
+     # Marks the given URL as visited by storing it in the concurrent map.
+     # This method is thread-safe and can be called from multiple threads
+     # simultaneously without risk of data corruption.
+     #
+     # @param url [String] The URL to mark as visited
+     # @return [Boolean] Always returns true (the stored value)
+     #
+     # @example
+     #   visits.register('https://example.com/page')
+     #   visits.register('https://api.example.com/data?id=123')
+     def register(url)
+       @visited[url] = true
+     end
+
+     # Checks if the visit tracking system is empty
+     #
+     # Useful for determining if this is the first URL being processed
+     # or if the cache has been recently cleared. Can be used to apply
+     # different behavior for initial requests (like skipping delays).
+     #
+     # @return [Boolean] true if no URLs have been visited or cache is empty
+     #
+     # @example
+     #   visits.blank? #=> true (no visits yet)
+     #   visits.register('https://example.com')
+     #   visits.blank? #=> false (has visits)
+     def blank?
+       @visited.keys.empty?
+     end
+
+     # Returns statistics about the visit tracking system
+     #
+     # Provides metrics about the current state of visit tracking including
+     # the number of URLs currently stored and the configured maximum limit.
+     # Useful for monitoring memory usage and debugging scraping behavior.
+     #
+     # @return [Hash<Symbol, Integer>] Statistics hash containing visit metrics
+     # @option return [Integer] :visited_count Number of URLs currently tracked
+     # @option return [Integer] :max_visited Maximum URLs before cache reset
+     #
+     # @example
+     #   stats = visits.stats
+     #   puts "Visited #{stats[:visited_count]} / #{stats[:max_visited]} URLs"
+     #
+     #   if stats[:visited_count] > stats[:max_visited] * 0.8
+     #     puts "Approaching visit limit, cache will reset soon"
+     #   end
+     def stats
+       {
+         visited_count: @visited.size,
+         max_visited: @config.max_visited
+       }
+     end
+
+     # Determines if a URL is new (not previously visited)
+     #
+     # This method implements the core visit deduplication logic including:
+     # - Automatic cache reset when maximum visit limit is reached
+     # - Configurable URL revisiting behavior
+     # - Thread-safe duplicate detection
+     # - Logging for debugging and monitoring
+     #
+     # The method performs memory management by clearing the visited cache
+     # when the configured maximum is reached, preventing unbounded memory
+     # growth during long-running scraping sessions.
+     #
+     # @param url [String] URL to check for previous visits
+     # @return [Boolean] true if URL is new or revisiting is allowed, false if already visited
+     #
+     # @example Basic deduplication
+     #   visits.new?('https://example.com/page1') #=> true
+     #   visits.register('https://example.com/page1')
+     #   visits.new?('https://example.com/page1') #=> false
+     #
+     # @example With revisiting enabled
+     #   # config.allow_url_revisit = true
+     #   visits.new?('https://example.com/page') #=> true (always)
+     #
+     # @example Memory limit handling
+     #   # When max_visited limit is reached
+     #   visits.new?('https://example.com/new') #=> true (cache reset)
+     #   # Previous visits are forgotten after reset
+     #
+     # @example In parallel scraping context
+     #   # Thread-safe checking across multiple workers
+     #   if visits.new?(discovered_url)
+     #     visits.register(discovered_url)
+     #     process_url(discovered_url)
+     #   else
+     #     skip_duplicate(discovered_url)
+     #   end
+     def new?(url)
+       # Reset if max visited reached
+       if @visited.size >= @config.max_visited
+         Crawlr.logger.warn "Reached max visited URLs limit (#{@config.max_visited}). Resetting visited cache."
+         @visited.clear
+       end
+
+       return true if @config.allow_url_revisit
+       return true unless @visited.key?(url)
+
+       Crawlr.logger.debug "Already visited #{url}; Skipping"
+       false
+     end
+   end
+ end
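As a usage sketch, the loop below shows how Visits might gate a simple crawl queue. Crawlr::Config is referenced by the doc examples above but is not included in this diff, so the keyword arguments here are assumptions taken from those examples; fetch_page is a hypothetical placeholder for the caller's own fetch step.

    require "crawlr"        # sets up Crawlr.logger, which Visits#new? uses
    require "crawlr/visits" # assumed load path

    config = Crawlr::Config.new(allow_url_revisit: false, max_visited: 10_000)
    visits = Crawlr::Visits.new(config)

    queue = [
      "https://example.com/",
      "https://example.com/about",
      "https://example.com/" # duplicate; skipped and logged at debug level
    ]

    queue.each do |url|
      next unless visits.new?(url)

      visits.register(url)
      # fetch_page(url) would go here (placeholder for the caller's fetch/parse step)
    end

    p visits.stats #=> {:visited_count=>2, :max_visited=>10000}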
data/lib/crawlr.rb ADDED
@@ -0,0 +1,17 @@
+ # frozen_string_literal: true
+
+ require "logger"
+ require_relative "crawlr/version"
+
+ # A Ruby scraping framework for parsing HTML and XML documents
+ # @author [Your Name]
+ # @since 0.1.0
+ module Crawlr
+   class Error < StandardError; end
+
+   class << self
+     attr_accessor :logger
+   end
+
+   self.logger = Logger.new($stdout, level: Logger::INFO)
+ end
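Because the logger is exposed as a writable module attribute, callers can swap it out; a minimal sketch, assuming the gem is loaded with require "crawlr":

    require "crawlr"

    # Send crawl logs to a file and enable the debug lines emitted by Visits#new?
    Crawlr.logger = Logger.new("crawl.log", level: Logger::DEBUG)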
data/sig/crawlr.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Crawlr
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end