crawlr 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -2
- data/CHANGELOG.md +5 -0
- data/lib/crawlr/callbacks.rb +1 -3
- data/lib/crawlr/cookie_jar.rb +21 -0
- data/lib/crawlr/domains.rb +9 -6
- data/lib/crawlr/http_interface.rb +21 -16
- data/lib/crawlr/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 461033ddf39311187e8137c08382c3eb25a6f6a88e8324a0e305fffbbd35c6cc
+  data.tar.gz: aa70f5cb9f87cca95192e6447fe615b2357675c8ab0710b38c0197af4c0a91c6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fcbada68009ff7aa92e4ed37b57f9dde86132f6dd99a16f4f5999b4461b78a5b8d365b768d76f969dcb6aa34bcec795f0813576cb47bdbc24223bfe39b1e04a4
+  data.tar.gz: 6b22f9947b34c32c5ec5e1d2b684f7b0e37f8c49adf047bc66cdd24f7adbf8174826bdab5cdfc88f8115e2cc946f5edc300d7047a475001aebc30dd1c8b7dbc0
data/.rubocop.yml
CHANGED
@@ -1,9 +1,22 @@
 AllCops:
   TargetRubyVersion: 3.1
   SuggestExtensions: false
-
+  Exclude:
+    - examples/*.rb
+    - spec/**/*.rb
+Metrics/MethodLength:
+  Max: 20
+Metrics/ClassLength:
+  Max: 300
+Metrics/AbcSize:
+  Max: 30
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Layout/LineLength:
+  Max: 130
 Style/StringLiterals:
   EnforcedStyle: double_quotes
-
 Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
 ## [Unreleased]
 
+## [0.2.3] - 2025-10-03
+
+- Introduce custom HTTPInterface cookie_jar wrapped in concurrent map for safe reads/writes when batch visiting async
+- Rubocop related updates
+
 ## [0.2.2] - 2025-10-01
 
 - Refactor robots.rb and parser.rb to address a few rubocop complaints
data/lib/crawlr/callbacks.rb
CHANGED
@@ -141,9 +141,7 @@ module Crawlr
       raise ArgumentError, "Unsupported format: #{format}" unless ALLOWED_FORMATS.include?(format)
 
       selector_type, selector = parse_input(input)
-      unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
-        raise ArgumentError, "Unsupported selector type: #{selector_type}"
-      end
+      raise ArgumentError, "Unsupported selector type: #{selector_type}" unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
 
       register(format, selector_type, selector, &block)
     end
data/lib/crawlr/cookie_jar.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require "http/cookie_jar"
+
+module Crawlr
+  # A thread-safe wrapper around the HTTP::CookieJar class
+  class CookieJar
+    def initialize
+      @jar = HTTP::CookieJar.new
+      @lock = Concurrent::ReadWriteLock.new
+    end
+
+    def add(cookie)
+      @lock.with_write_lock { @jar.add(cookie) }
+    end
+
+    def cookies(uri)
+      @lock.with_read_lock { @jar.cookies(uri) }
+    end
+  end
+end
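Reviewer note: a minimal sketch of what the new wrapper buys under concurrency. It assumes the `concurrent-ruby` and `http-cookie` gems are loaded (the class depends on `Concurrent::ReadWriteLock` and `HTTP::CookieJar`); the URI, cookie names, and require path below are illustrative:

```ruby
require "uri"
require "concurrent"
require "http/cookie"
require_relative "lib/crawlr/cookie_jar" # path assumed for illustration

uri = URI("https://example.com/")
jar = Crawlr::CookieJar.new

# Concurrent writers: each #add runs under the write lock, so the
# underlying HTTP::CookieJar is never mutated from two threads at once.
threads = 10.times.map do |i|
  Thread.new { jar.add(HTTP::Cookie.new("session_#{i}", "v", origin: uri)) }
end
threads.each(&:join)

puts jar.cookies(uri).size #=> 10
```

Reads take the shared lock, so lookups from many fetchers can proceed in parallel and only block while a write is in flight.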
data/lib/crawlr/domains.rb
CHANGED
@@ -77,12 +77,7 @@ module Crawlr
     # domains.allowed?('https://any-domain.com') #=> true
     def allowed?(url)
       return true if @allowed_domains.empty? && @domain_glob.empty?
-
-      unless @domain_glob.empty?
-        @domain_glob.each do |glob|
-          return true if File.fnmatch?(glob, url)
-        end
-      end
+      return true if !@domain_glob.empty? && matches_domain_glob?(url)
 
       uri = URI(url)
       base_name = uri.host.sub("www.", "")
@@ -114,6 +109,14 @@ module Crawlr
 
     private
 
+    def matches_domain_glob?(url)
+      @domain_glob.each do |glob|
+        return true if File.fnmatch?(glob, url)
+      end
+
+      false
+    end
+
     # Extracts and normalizes domain names from the configuration
     #
     # Processes the list of allowed domains by:
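Reviewer note: the extracted `matches_domain_glob?` helper applies `File.fnmatch?` to the whole URL string. With no flags passed, `*` also matches `/`, so the globs behave like plain wildcards over the URL rather than path-aware patterns. A quick illustration with made-up patterns:

```ruby
# File.fnmatch? globs against the URL as an ordinary string.
glob = "https://*.example.com/*"

File.fnmatch?(glob, "https://shop.example.com/products") #=> true
File.fnmatch?(glob, "https://example.com/products")      #=> false (no ".example.com" substring to anchor on)
File.fnmatch?(glob, "https://x.com/a.example.com/b")     #=> true ("*" crosses "/" when FNM_PATHNAME is not set)
```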
data/lib/crawlr/http_interface.rb
CHANGED
@@ -3,7 +3,7 @@
 require "async"
 require "async/timeout"
 require "async/http/internet"
-
+require_relative "cookie_jar"
 
 module Crawlr
   # Handles fetching documents via async HTTP with proxy and cookie support.
@@ -85,7 +85,7 @@ module Crawlr
     # http = Crawlr::HTTPInterface.new(config)
     def initialize(config)
       @config = config
-      @cookie_jar = HTTP::CookieJar.new if @config.allow_cookies
+      @cookie_jars = Concurrent::Map.new if @config.allow_cookies
       @proxy_index = 0
     end
 
@@ -135,7 +135,7 @@ module Crawlr
     # rescue StandardError => e
     #   puts "Request failed: #{e.message}"
     # end
-    def get(url)
+    def get(url) # rubocop:disable Metrics/MethodLength
      Crawlr.logger.debug "Fetching #{url}"
 
       uri = URI.parse(url)
@@ -143,13 +143,9 @@ module Crawlr
       internet = build_internet_connection(proxy_url)
 
       request_headers = @config.headers.dup
+      handle_cookies(uri, request_headers)
 
-      if @config.allow_cookies
-        cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
-        request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
-      end
-
-      yield(url, request_headers) if block_given?
+      yield(url, request_headers) if block_given? # Used for request customization hook
 
       raw_response = nil
       begin
@@ -272,14 +268,23 @@ module Crawlr
     #   parse_and_set_cookies(uri, response)
     #   # Cookie is stored and will be sent with future requests to example.com
     def parse_and_set_cookies(uri, response)
-
-      Array(response.headers["set-cookie"]).each do |set_cookie|
-        HTTP::Cookie.parse(set_cookie.to_s, uri).each do |cookie|
-          @cookie_jar.add(cookie)
-          Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
-                              " domain=#{cookie.domain}, path=#{cookie.path}"
-        end
+      jar = cookie_jar_for(uri)
+      Array(response.headers["set-cookie"]).each do |set_cookie|
+        HTTP::Cookie.parse(set_cookie.to_s, uri).each { |cookie| jar.add(cookie) }
       end
     end
+
+    # Get or create a thread-safe jar for a domain
+    def cookie_jar_for(uri)
+      @cookie_jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }
+    end
+
+    def handle_cookies(uri, request_headers)
+      return unless @config.allow_cookies
+
+      jar = cookie_jar_for(uri)
+      cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
+      request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+    end
   end
 end
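Reviewer note: the per-host sharding in `cookie_jar_for` leans on `Concurrent::Map#compute_if_absent`, which is atomic, so fetchers racing on the same host end up sharing one jar instead of each constructing their own. A self-contained sketch of that behavior; a plain object stands in for `Crawlr::CookieJar` to keep the example dependency-light:

```ruby
require "concurrent"

jars  = Concurrent::Map.new
built = Concurrent::AtomicFixnum.new(0)

# Twenty threads race for the same host; the block runs exactly once.
threads = 20.times.map do
  Thread.new do
    jars.compute_if_absent("example.com") do
      built.increment
      Object.new # stand-in for Crawlr::CookieJar.new
    end
  end
end
threads.each(&:join)

puts built.value #=> 1
puts jars.size   #=> 1
```

Keying the map by `uri.host` also isolates cookies per domain, so a slow write for one host never blocks reads for another.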
data/lib/crawlr/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.3
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -174,6 +174,7 @@ files:
 - lib/crawlr/collector.rb
 - lib/crawlr/config.rb
 - lib/crawlr/context.rb
+- lib/crawlr/cookie_jar.rb
 - lib/crawlr/domains.rb
 - lib/crawlr/hooks.rb
 - lib/crawlr/http_interface.rb