crawlr 0.2.2 → 0.2.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b74c6111f3df0866bf50f28c5510b637c74264fe8809d57092d2694d1694c974
-  data.tar.gz: 93799e89ba870575b86d9f70f4938c145cefa8aea8effc60d0bbacc1e9919a87
+  metadata.gz: 461033ddf39311187e8137c08382c3eb25a6f6a88e8324a0e305fffbbd35c6cc
+  data.tar.gz: aa70f5cb9f87cca95192e6447fe615b2357675c8ab0710b38c0197af4c0a91c6
 SHA512:
-  metadata.gz: 492e82dddbc07a130135137c94f561307f0692a405f630eec51500cd8b4046ad7a3147f010758bdfa91f1b2aa6dd23135fdcbea5a9ba62c76c2715525a117fbb
-  data.tar.gz: ad76ec821b6b4929779c1823107c1b21ca8180ec3fea3b71402d66a8853eddeb986bd8895f0146ce8e14d88d9090d3df696b8015c8b51b0c1f5ca224b36b6986
+  metadata.gz: fcbada68009ff7aa92e4ed37b57f9dde86132f6dd99a16f4f5999b4461b78a5b8d365b768d76f969dcb6aa34bcec795f0813576cb47bdbc24223bfe39b1e04a4
+  data.tar.gz: 6b22f9947b34c32c5ec5e1d2b684f7b0e37f8c49adf047bc66cdd24f7adbf8174826bdab5cdfc88f8115e2cc946f5edc300d7047a475001aebc30dd1c8b7dbc0
data/.rubocop.yml CHANGED
@@ -1,9 +1,22 @@
 AllCops:
   TargetRubyVersion: 3.1
   SuggestExtensions: false
-
+  Exclude:
+    - examples/*.rb
+    - spec/**/*.rb
+Metrics/MethodLength:
+  Max: 20
+Metrics/ClassLength:
+  Max: 300
+Metrics/AbcSize:
+  Max: 30
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Layout/LineLength:
+  Max: 130
 Style/StringLiterals:
   EnforcedStyle: double_quotes
-
 Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
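The new Metrics limits apply project-wide; where a single method legitimately exceeds one, this release leans on an inline disable comment rather than raising the global ceiling (see the `# rubocop:disable Metrics/MethodLength` annotation on `HTTPInterface#get` further down in this diff). A minimal sketch of that pattern, with a hypothetical method name:

```ruby
# Hypothetical example: silence one cop for one long-but-cohesive method
# instead of raising Metrics/MethodLength's project-wide Max of 20.
def fetch_all(urls) # rubocop:disable Metrics/MethodLength
  # ...method body longer than 20 lines...
end
```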
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
 ## [Unreleased]
 
+## [0.2.3] - 2025-10-03
+
+- Introduce custom HTTPInterface cookie_jar wrapped in concurrent map for safe reads/writes when batch visiting async
+- Rubocop related updates
+
 ## [0.2.2] - 2025-10-01
 
 - Refactor robots.rb and parser.rb to address a few rubocop complaints
@@ -141,9 +141,7 @@ module Crawlr
       raise ArgumentError, "Unsupported format: #{format}" unless ALLOWED_FORMATS.include?(format)
 
       selector_type, selector = parse_input(input)
-      unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
-        raise ArgumentError, "Unsupported selector type: #{selector_type}"
-      end
+      raise ArgumentError, "Unsupported selector type: #{selector_type}" unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
 
       register(format, selector_type, selector, &block)
     end
data/lib/crawlr/cookie_jar.rb ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require "http/cookie_jar"
+
+module Crawlr
+  # A thread-safe wrapper around the HTTP::CookieJar class
+  class CookieJar
+    def initialize
+      @jar = HTTP::CookieJar.new
+      @lock = Concurrent::ReadWriteLock.new
+    end
+
+    def add(cookie)
+      @lock.with_write_lock { @jar.add(cookie) }
+    end
+
+    def cookies(uri)
+      @lock.with_read_lock { @jar.cookies(uri) }
+    end
+  end
+end
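A minimal usage sketch of the new wrapper (the gem already uses concurrent-ruby, which provides `Concurrent::ReadWriteLock`); the URL and cookie values here are illustrative only:

```ruby
require "concurrent" # Concurrent::ReadWriteLock used inside Crawlr::CookieJar
require "http/cookie"
require "crawlr"     # assumes the released gem is installed
require "uri"

jar = Crawlr::CookieJar.new
uri = URI("https://example.com/")

# Writers and readers may run from different threads or async tasks;
# the ReadWriteLock serializes writes while allowing parallel reads.
10.times.map do |i|
  Thread.new do
    jar.add(HTTP::Cookie.new("c#{i}", "v", domain: "example.com", path: "/"))
    jar.cookies(uri) # safe concurrent read
  end
end.each(&:join)

puts jar.cookies(uri).size #=> 10
```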
data/lib/crawlr/domains.rb CHANGED
@@ -77,12 +77,7 @@ module Crawlr
     #   domains.allowed?('https://any-domain.com') #=> true
     def allowed?(url)
       return true if @allowed_domains.empty? && @domain_glob.empty?
-
-      unless @domain_glob.empty?
-        @domain_glob.each do |glob|
-          return true if File.fnmatch?(glob, url)
-        end
-      end
+      return true if !@domain_glob.empty? && matches_domain_glob?(url)
 
       uri = URI(url)
       base_name = uri.host.sub("www.", "")
@@ -114,6 +109,14 @@ module Crawlr
 
     private
 
+    def matches_domain_glob?(url)
+      @domain_glob.each do |glob|
+        return true if File.fnmatch?(glob, url)
+      end
+
+      false
+    end
+
     # Extracts and normalizes domain names from the configuration
     #
     # Processes the list of allowed domains by:
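For context on the extracted helper: `File.fnmatch?` compares each configured glob against the whole URL string, so patterns generally need to account for scheme and path. A small illustration (the pattern is hypothetical):

```ruby
# Without File::FNM_PATHNAME, "*" also matches "/" characters.
glob = "https://*.example.com/*"

File.fnmatch?(glob, "https://shop.example.com/items") #=> true
File.fnmatch?(glob, "https://example.com/items")      #=> false (no subdomain dot)
```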
data/lib/crawlr/http_interface.rb CHANGED
@@ -3,7 +3,7 @@
 require "async"
 require "async/timeout"
 require "async/http/internet"
-require "http/cookie_jar"
+require_relative "cookie_jar"
 
 module Crawlr
   # Handles fetching documents via async HTTP with proxy and cookie support.
@@ -85,7 +85,7 @@ module Crawlr
     #   http = Crawlr::HTTPInterface.new(config)
     def initialize(config)
       @config = config
-      @cookie_jar = @config.allow_cookies ? HTTP::CookieJar.new : nil
+      @cookie_jars = Concurrent::Map.new if @config.allow_cookies
       @proxy_index = 0
     end
 
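`Concurrent::Map#compute_if_absent` is documented as atomic: the factory block runs at most once per key, so concurrent fetches racing on the same host all receive the same jar (the `cookie_jar_for` helper below relies on this). A standalone sketch:

```ruby
require "concurrent"

jars = Concurrent::Map.new
created = Concurrent::AtomicFixnum.new(0)

# Twenty threads race on the same key; the factory block runs once.
20.times.map do
  Thread.new do
    jars.compute_if_absent("example.com") { created.increment; Object.new }
  end
end.each(&:join)

puts created.value #=> 1
puts jars.size     #=> 1
```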
@@ -135,7 +135,7 @@
     #   rescue StandardError => e
     #     puts "Request failed: #{e.message}"
     #   end
-    def get(url)
+    def get(url) # rubocop:disable Metrics/MethodLength
      Crawlr.logger.debug "Fetching #{url}"
 
       uri = URI.parse(url)
@@ -143,13 +143,9 @@
       internet = build_internet_connection(proxy_url)
 
       request_headers = @config.headers.dup
+      handle_cookies(uri, request_headers)
 
-      if @config.allow_cookies
-        cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
-        request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
-      end
-
-      yield(url, request_headers) if block_given?
+      yield(url, request_headers) if block_given? # Used for request customization hook
 
       raw_response = nil
       begin
@@ -272,14 +268,23 @@
     #   parse_and_set_cookies(uri, response)
     #   # Cookie is stored and will be sent with future requests to example.com
     def parse_and_set_cookies(uri, response)
-      set_cookies = response.headers["set-cookie"]
-      Array(set_cookies).each do |set_cookie|
-        HTTP::Cookie.parse(set_cookie.to_s, uri).each do |cookie|
-          @cookie_jar.add(cookie)
-          Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
-                              " domain=#{cookie.domain}, path=#{cookie.path}"
-        end
+      jar = cookie_jar_for(uri)
+      Array(response.headers["set-cookie"]).each do |set_cookie|
+        HTTP::Cookie.parse(set_cookie.to_s, uri).each { |cookie| jar.add(cookie) }
       end
     end
+
+    # Get or create a thread-safe jar for a domain
+    def cookie_jar_for(uri)
+      @cookie_jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }
+    end
+
+    def handle_cookies(uri, request_headers)
+      return unless @config.allow_cookies
+
+      jar = cookie_jar_for(uri)
+      cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
+      request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+    end
   end
 end
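Pieced together, the per-host cookie flow now looks roughly like this (names taken from the diff; the `jars` map, `request_headers` hash, and Set-Cookie string are stand-ins for a real `HTTPInterface` instance and response):

```ruby
require "concurrent"
require "http/cookie"
require "crawlr" # assumes the released gem is installed
require "uri"

jars = Concurrent::Map.new # plays the role of @cookie_jars
request_headers = {}
uri = URI("https://example.com/login")

# cookie_jar_for: one thread-safe jar per host, created at most once.
jar = jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }

# parse_and_set_cookies: store cookies from a Set-Cookie response header.
HTTP::Cookie.parse("sid=abc123; Path=/; HttpOnly", uri).each { |cookie| jar.add(cookie) }

# handle_cookies: attach matching cookies to the next request for this host.
cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?

puts request_headers["cookie"] #=> "sid=abc123"
```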
data/lib/crawlr/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Crawlr
-  VERSION = "0.2.2"
+  VERSION = "0.2.3"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.3
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -174,6 +174,7 @@ files:
 - lib/crawlr/collector.rb
 - lib/crawlr/config.rb
 - lib/crawlr/context.rb
+- lib/crawlr/cookie_jar.rb
 - lib/crawlr/domains.rb
 - lib/crawlr/hooks.rb
 - lib/crawlr/http_interface.rb