crawlr 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -2
- data/CHANGELOG.md +9 -0
- data/README.md +1 -1
- data/lib/crawlr/callbacks.rb +1 -3
- data/lib/crawlr/cookie_jar.rb +21 -0
- data/lib/crawlr/domains.rb +9 -6
- data/lib/crawlr/http_interface.rb +21 -16
- data/lib/crawlr/parser.rb +7 -6
- data/lib/crawlr/robots.rb +62 -44
- data/lib/crawlr/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 461033ddf39311187e8137c08382c3eb25a6f6a88e8324a0e305fffbbd35c6cc
+  data.tar.gz: aa70f5cb9f87cca95192e6447fe615b2357675c8ab0710b38c0197af4c0a91c6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fcbada68009ff7aa92e4ed37b57f9dde86132f6dd99a16f4f5999b4461b78a5b8d365b768d76f969dcb6aa34bcec795f0813576cb47bdbc24223bfe39b1e04a4
+  data.tar.gz: 6b22f9947b34c32c5ec5e1d2b684f7b0e37f8c49adf047bc66cdd24f7adbf8174826bdab5cdfc88f8115e2cc946f5edc300d7047a475001aebc30dd1c8b7dbc0
data/.rubocop.yml
CHANGED
@@ -1,9 +1,22 @@
 AllCops:
   TargetRubyVersion: 3.1
   SuggestExtensions: false
-
+  Exclude:
+    - examples/*.rb
+    - spec/**/*.rb
+Metrics/MethodLength:
+  Max: 20
+Metrics/ClassLength:
+  Max: 300
+Metrics/AbcSize:
+  Max: 30
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Layout/LineLength:
+  Max: 130
 Style/StringLiterals:
   EnforcedStyle: double_quotes
-
 Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
 ## [Unreleased]
 
+## [0.2.3] - 2025-10-03
+
+- Introduce custom HTTPInterface cookie_jar wrapped in concurrent map for safe reads/writes when batch visiting async
+- Rubocop related updates
+
+## [0.2.2] - 2025-10-01
+
+- Refactor robots.rb and parser.rb to address a few rubocop complaints
+
 ## [0.2.1] - 2025-09-30
 
 - Fix paginated_visit to properly handle provided url queries (if present)
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [](https://badge.fury.io/rb/crawlr)
-[](https://github.com/aristorap/crawlr/actions/workflows/
+[](https://github.com/aristorap/crawlr/actions/workflows/main.yml)
 
 ## ✨ Features
 
data/lib/crawlr/callbacks.rb
CHANGED
@@ -141,9 +141,7 @@ module Crawlr
       raise ArgumentError, "Unsupported format: #{format}" unless ALLOWED_FORMATS.include?(format)
 
       selector_type, selector = parse_input(input)
-      unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
-        raise ArgumentError, "Unsupported selector type: #{selector_type}"
-      end
+      raise ArgumentError, "Unsupported selector type: #{selector_type}" unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
 
       register(format, selector_type, selector, &block)
     end
data/lib/crawlr/cookie_jar.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require "http/cookie_jar"
+
+module Crawlr
+  # A thread-safe wrapper around the HTTP::CookieJar class
+  class CookieJar
+    def initialize
+      @jar = HTTP::CookieJar.new
+      @lock = Concurrent::ReadWriteLock.new
+    end
+
+    def add(cookie)
+      @lock.with_write_lock { @jar.add(cookie) }
+    end
+
+    def cookies(uri)
+      @lock.with_read_lock { @jar.cookies(uri) }
+    end
+  end
+end
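The added `Crawlr::CookieJar` is a small read/write-locked wrapper: `add` takes the exclusive write lock, `cookies` takes the shared read lock, so concurrent tasks can read a host's cookies while writes stay serialized. Below is a minimal sketch of exercising such a jar concurrently; the thread-based driver and require paths are illustrative assumptions (the gem itself calls it from async fetches), not code from the package.

```ruby
# Illustrative only: hammer a Crawlr::CookieJar from plain threads.
# Assumes the crawlr, http-cookie and concurrent-ruby gems are installed.
require "uri"
require "concurrent"
require "http/cookie"
require "crawlr/cookie_jar"

uri = URI("https://example.com/")
jar = Crawlr::CookieJar.new

# Writers take the exclusive write lock inside #add ...
writers = 5.times.map do |i|
  Thread.new do
    cookie = HTTP::Cookie.parse("session_#{i}=value_#{i}; Path=/", uri).first
    jar.add(cookie)
  end
end

# ... while readers share the read lock inside #cookies.
readers = 5.times.map { Thread.new { jar.cookies(uri).map(&:name) } }

(writers + readers).each(&:join)
puts jar.cookies(uri).size # => 5 once every writer has finished
```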
data/lib/crawlr/domains.rb
CHANGED
@@ -77,12 +77,7 @@ module Crawlr
     # domains.allowed?('https://any-domain.com') #=> true
     def allowed?(url)
       return true if @allowed_domains.empty? && @domain_glob.empty?
-
-      unless @domain_glob.empty?
-        @domain_glob.each do |glob|
-          return true if File.fnmatch?(glob, url)
-        end
-      end
+      return true if !@domain_glob.empty? && matches_domain_glob?(url)
 
       uri = URI(url)
       base_name = uri.host.sub("www.", "")
@@ -114,6 +109,14 @@ module Crawlr
 
     private
 
+    def matches_domain_glob?(url)
+      @domain_glob.each do |glob|
+        return true if File.fnmatch?(glob, url)
+      end
+
+      false
+    end
+
     # Extracts and normalizes domain names from the configuration
     #
     # Processes the list of allowed domains by:
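For context on the extracted `matches_domain_glob?`: it simply asks whether any configured glob `File.fnmatch?`-es the full URL. Below is a standalone sketch of that check; the globs and URLs are made-up examples, not values from the gem. Note that without `File::FNM_PATHNAME` a `*` also matches across `/`, which is what lets one glob cover whole paths.

```ruby
# Illustrative only: the glob check behind Domains#matches_domain_glob?.
# The globs below are hypothetical configuration values.
domain_globs = ["https://*.example.com/*", "https://blog.example.org/*"]

def matches_domain_glob?(globs, url)
  # Default fnmatch flags let "*" span "/" as well as ordinary characters.
  globs.any? { |glob| File.fnmatch?(glob, url) }
end

p matches_domain_glob?(domain_globs, "https://shop.example.com/items/1") # => true
p matches_domain_glob?(domain_globs, "https://example.net/items/1")      # => false
```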
data/lib/crawlr/http_interface.rb
CHANGED
@@ -3,7 +3,7 @@
 require "async"
 require "async/timeout"
 require "async/http/internet"
-
+require_relative "cookie_jar"
 
 module Crawlr
   # Handles fetching documents via async HTTP with proxy and cookie support.
@@ -85,7 +85,7 @@ module Crawlr
     # http = Crawlr::HTTPInterface.new(config)
     def initialize(config)
       @config = config
-      @
+      @cookie_jars = Concurrent::Map.new if @config.allow_cookies
       @proxy_index = 0
     end
 
@@ -135,7 +135,7 @@ module Crawlr
     # rescue StandardError => e
     #   puts "Request failed: #{e.message}"
     # end
-    def get(url)
+    def get(url) # rubocop:disable Metrics/MethodLength
       Crawlr.logger.debug "Fetching #{url}"
 
       uri = URI.parse(url)
@@ -143,13 +143,9 @@ module Crawlr
       internet = build_internet_connection(proxy_url)
 
       request_headers = @config.headers.dup
+      handle_cookies(uri, request_headers)
 
-      if
-        cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
-        request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
-      end
-
-      yield(url, request_headers) if block_given?
+      yield(url, request_headers) if block_given? # Used for request customization hook
 
       raw_response = nil
       begin
@@ -272,14 +268,23 @@ module Crawlr
     #   parse_and_set_cookies(uri, response)
     #   # Cookie is stored and will be sent with future requests to example.com
     def parse_and_set_cookies(uri, response)
-
-      Array(
-        HTTP::Cookie.parse(set_cookie.to_s, uri).each
-          @cookie_jar.add(cookie)
-          Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
-                              " domain=#{cookie.domain}, path=#{cookie.path}"
-        end
+      jar = cookie_jar_for(uri)
+      Array(response.headers["set-cookie"]).each do |set_cookie|
+        HTTP::Cookie.parse(set_cookie.to_s, uri).each { |cookie| jar.add(cookie) }
       end
     end
+
+    # Get or create a thread-safe jar for a domain
+    def cookie_jar_for(uri)
+      @cookie_jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }
+    end
+
+    def handle_cookies(uri, request_headers)
+      return unless @config.allow_cookies
+
+      jar = cookie_jar_for(uri)
+      cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
+      request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+    end
   end
 end
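The reworked cookie handling replaces the single shared jar with one `Crawlr::CookieJar` per host, stored in a `Concurrent::Map` and created lazily via `compute_if_absent`, so parallel batch visits to one host share a jar while different hosts never contend. Below is a stripped-down sketch of that lookup pattern; the placeholder object stands in for `Crawlr::CookieJar` so the snippet runs on its own, and the script around it is illustrative rather than gem code.

```ruby
# Illustrative only: the per-host jar lookup used by HTTPInterface#cookie_jar_for.
require "uri"
require "concurrent"

cookie_jars = Concurrent::Map.new

jar_for = lambda do |uri|
  # compute_if_absent stores one value per key, so concurrent callers asking
  # for the same host all get back the same jar instance.
  cookie_jars.compute_if_absent(uri.host) { Object.new } # Crawlr::CookieJar.new in the gem
end

uri  = URI("https://example.com/page")
jars = 10.times.map { Thread.new { jar_for.call(uri) } }.map(&:value)

puts jars.uniq(&:object_id).size                      # => 1  (one jar per host)
puts cookie_jars.key?(URI("https://other.org/").host) # => false (jars are created lazily)
```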
data/lib/crawlr/parser.rb
CHANGED
@@ -165,15 +165,16 @@ module Crawlr
 
       callbacks_by_format.each do |format, format_callbacks|
         doc = parse_content(format, content)
-
-        format_callbacks.each do |callback|
-          Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
-          nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
-          nodes.each { |node| callback[:block].call(node, context) }
-        end
+        format_callbacks.each { |callback| apply_callback(doc, callback, context) }
       end
     end
 
+    private_class_method def self.apply_callback(doc, callback, context)
+      Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+      nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+      nodes.each { |node| callback[:block].call(node, context) }
+    end
+
     # Parses content using the appropriate Nokogiri parser
     #
     # Creates a Nokogiri document object using either the HTML or XML parser
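The parser change only moves the per-callback work into a private `apply_callback` class method: select the nodes for the callback's selector, then invoke its block with each node and the shared context. Below is a simplified stand-in showing that dispatch shape; it is CSS-only and uses Nokogiri directly, whereas the gem routes through `extract_nodes` and also supports XPath.

```ruby
# Illustrative only: a CSS-only stand-in for Parser.apply_callback.
require "nokogiri"

def apply_callback(doc, callback, context)
  nodes = doc.css(callback[:selector]) # extract_nodes(...) in the gem
  nodes.each { |node| callback[:block].call(node, context) }
end

doc = Nokogiri::HTML(<<~HTML)
  <ul><li>first</li><li>second</li></ul>
HTML

callback = {
  selector_type: :css,
  selector: "li",
  block: ->(node, ctx) { ctx[:items] << node.text }
}

context = { items: [] }
apply_callback(doc, callback, context)
p context[:items] # => ["first", "second"]
```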
data/lib/crawlr/robots.rb
CHANGED
@@ -130,25 +130,13 @@ module Crawlr
     # robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
     def allowed?(url, user_agent)
       rule = get_rule(url, user_agent)
-      return true unless rule
+      return true unless rule
 
       path = URI.parse(url).path
-      matched =
-
-      # Match allow/disallow using fnmatch (robots.txt style)
-      rule.allow.each do |pattern|
-        matched << [:allow, pattern] if robots_match?(pattern, path)
-      end
-
-      rule.disallow.each do |pattern|
-        matched << [:disallow, pattern] if robots_match?(pattern, path)
-      end
-
+      matched = matched_rules(rule, path)
       return true if matched.empty?
 
-
-      action, = matched.max_by { |_, p| p.length }
-      action == :allow
+      longest_match_allows?(matched)
     end
 
     # Parses robots.txt content and stores rules for the given URL's domain
@@ -204,6 +192,25 @@ module Crawlr
 
     private
 
+    def matched_rules(rule, path)
+      matched = []
+
+      rule.allow.each do |pattern|
+        matched << [:allow, pattern] if robots_match?(pattern, path)
+      end
+
+      rule.disallow.each do |pattern|
+        matched << [:disallow, pattern] if robots_match?(pattern, path)
+      end
+
+      matched
+    end
+
+    def longest_match_allows?(matched)
+      action, = matched.max_by { |_, pattern| pattern.length }
+      action == :allow
+    end
+
     # Finds the most applicable rule for a URL and user-agent combination
     #
     # Implements the robots.txt user-agent matching algorithm:
@@ -222,11 +229,7 @@ module Crawlr
       return nil unless rules
 
       # Case-insensitive prefix match
-      applicable_rules = rules
-        next if rule.user_agent.nil?
-
-        user_agent.downcase.start_with?(rule.user_agent.downcase)
-      end
+      applicable_rules = rules_by_prefix_match(user_agent, rules)
 
       # Fallback to wildcard
       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
@@ -235,6 +238,14 @@ module Crawlr
       applicable_rules.max_by { |r| r.user_agent.length }
     end
 
+    def rules_by_prefix_match(user_agent, rules)
+      rules.select do |rule|
+        next if rule.user_agent.nil?
+
+        user_agent.downcase.start_with?(rule.user_agent.downcase)
+      end
+    end
+
     # Tests if a robots.txt pattern matches a given path
     #
     # Implements robots.txt pattern matching including:
@@ -291,38 +302,45 @@ module Crawlr
     #   }
     # }
     def parse_to_hash(content)
-      robots_hash = {
-        sitemap: [],
-        rules: {}
-      }
-
+      robots_hash = { sitemap: [], rules: {} }
       curr_user_agents = []
 
       content.each_line do |line|
-
-        next if clean_line.empty? || clean_line.start_with?("#")
-
-        key, value = clean_line.split(":", 2).map(&:strip)
+        key, value = parse_line(line)
         next unless key && value
 
-
-
-        case key
-        when "sitemap"
-          robots_hash[:sitemap] << value
-        when "user-agent"
-          curr_user_agents = [value]
-          robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
-        when "allow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
-        when "disallow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
-        when "crawl-delay"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
-        end
+        curr_user_agents = apply_rule(robots_hash, key, value, curr_user_agents)
       end
 
       robots_hash
     end
+
+    def parse_line(line)
+      clean_line = line.strip
+      return if clean_line.empty? || clean_line.start_with?("#")
+
+      key, value = clean_line.split(":", 2).map(&:strip)
+      return unless key && value
+
+      [key.downcase, value]
+    end
+
+    def apply_rule(robots_hash, key, value, curr_user_agents)
+      case key
+      when "sitemap"
+        robots_hash[:sitemap] << value
+      when "user-agent"
+        curr_user_agents = [value]
+        robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+      when "allow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+      when "disallow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+      when "crawl-delay"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
      end
+
+      curr_user_agents
+    end
   end
 end
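The extracted `matched_rules` / `longest_match_allows?` pair implements the usual robots.txt precedence: collect every allow and disallow pattern that matches the path, then let the longest matching pattern decide, allowing the URL only when that winner is an allow rule. Below is a worked sketch of just that decision; rules are plain hashes here and `robots_match?` is reduced to a prefix check, whereas the gem's matcher also handles `*` wildcards and `$` anchors.

```ruby
# Illustrative only: the longest-match decision inside Robots#allowed?.
def robots_match?(pattern, path)
  path.start_with?(pattern) # simplified; no * or $ handling here
end

def matched_rules(rule, path)
  matched = []
  rule[:allow].each    { |p| matched << [:allow, p]    if robots_match?(p, path) }
  rule[:disallow].each { |p| matched << [:disallow, p] if robots_match?(p, path) }
  matched
end

def longest_match_allows?(matched)
  action, = matched.max_by { |_, pattern| pattern.length }
  action == :allow
end

rule = { allow: ["/private/public/"], disallow: ["/private/"] }

p longest_match_allows?(matched_rules(rule, "/private/public/page.html")) # => true:  longer allow wins
p longest_match_allows?(matched_rules(rule, "/private/secret.html"))      # => false: only the disallow matches
```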
data/lib/crawlr/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -174,6 +174,7 @@ files:
 - lib/crawlr/collector.rb
 - lib/crawlr/config.rb
 - lib/crawlr/context.rb
+- lib/crawlr/cookie_jar.rb
 - lib/crawlr/domains.rb
 - lib/crawlr/hooks.rb
 - lib/crawlr/http_interface.rb