crawlr 0.2.1 → 0.2.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fe3c5b1d19db6a4fda1bd66a9e2c62a1b2bdb80c361fe06e84023a6bf3f024bb
-  data.tar.gz: 6f26c3350a3cbf7e967899d8f5490312d83caa8ad9223cefcb5ad8423bec1e97
+  metadata.gz: 461033ddf39311187e8137c08382c3eb25a6f6a88e8324a0e305fffbbd35c6cc
+  data.tar.gz: aa70f5cb9f87cca95192e6447fe615b2357675c8ab0710b38c0197af4c0a91c6
 SHA512:
-  metadata.gz: 4c58780044aa20341737127823958728deb6b3574c781cb804db45e5c81971678058f779657b379bfa566c0608d273c72ec8331e2226885f71e3d476af1c0076
-  data.tar.gz: a094872a4ad346cae330a6daa894c6a49a72e7082f9279fa878dd14d09f7fdbccad5617433e683d15309eb4b1f14bcc05aa59cd47f2f7a9c460a5b2728530ad0
+  metadata.gz: fcbada68009ff7aa92e4ed37b57f9dde86132f6dd99a16f4f5999b4461b78a5b8d365b768d76f969dcb6aa34bcec795f0813576cb47bdbc24223bfe39b1e04a4
+  data.tar.gz: 6b22f9947b34c32c5ec5e1d2b684f7b0e37f8c49adf047bc66cdd24f7adbf8174826bdab5cdfc88f8115e2cc946f5edc300d7047a475001aebc30dd1c8b7dbc0
data/.rubocop.yml CHANGED
@@ -1,9 +1,22 @@
 AllCops:
   TargetRubyVersion: 3.1
   SuggestExtensions: false
-
+  Exclude:
+    - examples/*.rb
+    - spec/**/*.rb
+Metrics/MethodLength:
+  Max: 20
+Metrics/ClassLength:
+  Max: 300
+Metrics/AbcSize:
+  Max: 30
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Layout/LineLength:
+  Max: 130
 Style/StringLiterals:
   EnforcedStyle: double_quotes
-
 Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
data/CHANGELOG.md CHANGED
@@ -1,5 +1,14 @@
 ## [Unreleased]
 
+## [0.2.3] - 2025-10-03
+
+- Introduce custom HTTPInterface cookie_jar wrapped in concurrent map for safe reads/writes when batch visiting async
+- Rubocop related updates
+
+## [0.2.2] - 2025-10-01
+
+- Refactor robots.rb and parser.rb to address a few rubocop complaints
+
 ## [0.2.1] - 2025-09-30
 
 - Fix paginated_visit to properly handle provided url queries (if present)
data/README.md CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [![Gem Version](https://badge.fury.io/rb/crawlr.svg)](https://badge.fury.io/rb/crawlr)
-[![Ruby](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml/badge.svg)](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml)
+[![Ruby](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml/badge.svg)](https://github.com/aristorap/crawlr/actions/workflows/main.yml)
 
 ## ✨ Features
 
@@ -141,9 +141,7 @@ module Crawlr
       raise ArgumentError, "Unsupported format: #{format}" unless ALLOWED_FORMATS.include?(format)
 
       selector_type, selector = parse_input(input)
-      unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
-        raise ArgumentError, "Unsupported selector type: #{selector_type}"
-      end
+      raise ArgumentError, "Unsupported selector type: #{selector_type}" unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
 
       register(format, selector_type, selector, &block)
     end
data/lib/crawlr/cookie_jar.rb ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require "http/cookie_jar"
+
+module Crawlr
+  # A thread-safe wrapper around the HTTP::CookieJar class
+  class CookieJar
+    def initialize
+      @jar = HTTP::CookieJar.new
+      @lock = Concurrent::ReadWriteLock.new
+    end
+
+    def add(cookie)
+      @lock.with_write_lock { @jar.add(cookie) }
+    end
+
+    def cookies(uri)
+      @lock.with_read_lock { @jar.cookies(uri) }
+    end
+  end
+end
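For context, a minimal standalone sketch of how a jar like this behaves when many async tasks write to it at once (the batch-visit scenario the changelog mentions). Everything outside the class itself is illustrative: the require path, gem setup, and cookie values are assumptions, not part of the released package.

```ruby
# Illustrative only: exercise Crawlr::CookieJar from concurrent async tasks.
# Assumes the async, concurrent-ruby, and http-cookie gems are installed and
# that the file shown above is loadable via this (hypothetical) relative path.
require "uri"
require "async"
require "concurrent"
require "http/cookie"
require_relative "lib/crawlr/cookie_jar"

jar = Crawlr::CookieJar.new
uri = URI("https://example.com/")

Async do |task|
  20.times do |i|
    task.async do
      # Each add takes the write lock, so interleaved writers can't corrupt the jar.
      jar.add(HTTP::Cookie.new("session_#{i}", "v#{i}", origin: uri))
    end
  end
end

# Reads take the read lock; all 20 cookies are visible once the reactor finishes.
puts jar.cookies(uri).size #=> 20
```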
data/lib/crawlr/domains.rb CHANGED
@@ -77,12 +77,7 @@ module Crawlr
     #   domains.allowed?('https://any-domain.com') #=> true
     def allowed?(url)
       return true if @allowed_domains.empty? && @domain_glob.empty?
-
-      unless @domain_glob.empty?
-        @domain_glob.each do |glob|
-          return true if File.fnmatch?(glob, url)
-        end
-      end
+      return true if !@domain_glob.empty? && matches_domain_glob?(url)
 
       uri = URI(url)
       base_name = uri.host.sub("www.", "")
@@ -114,6 +109,14 @@ module Crawlr
 
     private
 
+    def matches_domain_glob?(url)
+      @domain_glob.each do |glob|
+        return true if File.fnmatch?(glob, url)
+      end
+
+      false
+    end
+
     # Extracts and normalizes domain names from the configuration
     #
     # Processes the list of allowed domains by:
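As a quick illustration of the `File.fnmatch?` check the extracted `matches_domain_glob?` helper performs, here is a standalone sketch; the glob patterns and URLs are invented, and the helper here takes the glob list as an argument rather than reading `@domain_glob`.

```ruby
# Standalone sketch of glob-based URL matching (patterns are made up).
globs = ["https://*.example.com/*", "https://blog.example.org/*"]

def matches_domain_glob?(globs, url)
  # True if any glob matches the full URL; File.fnmatch? with default flags
  # lets "*" span path separators, so whole URLs can be matched.
  globs.any? { |glob| File.fnmatch?(glob, url) }
end

matches_domain_glob?(globs, "https://shop.example.com/items") #=> true
matches_domain_glob?(globs, "https://example.net/")           #=> false
```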
data/lib/crawlr/http_interface.rb CHANGED
@@ -3,7 +3,7 @@
 require "async"
 require "async/timeout"
 require "async/http/internet"
-require "http/cookie_jar"
+require_relative "cookie_jar"
 
 module Crawlr
   # Handles fetching documents via async HTTP with proxy and cookie support.
@@ -85,7 +85,7 @@ module Crawlr
     #   http = Crawlr::HTTPInterface.new(config)
     def initialize(config)
       @config = config
-      @cookie_jar = @config.allow_cookies ? HTTP::CookieJar.new : nil
+      @cookie_jars = Concurrent::Map.new if @config.allow_cookies
       @proxy_index = 0
     end
 
@@ -135,7 +135,7 @@ module Crawlr
     #   rescue StandardError => e
     #     puts "Request failed: #{e.message}"
     #   end
-    def get(url)
+    def get(url) # rubocop:disable Metrics/MethodLength
       Crawlr.logger.debug "Fetching #{url}"
 
       uri = URI.parse(url)
@@ -143,13 +143,9 @@ module Crawlr
       internet = build_internet_connection(proxy_url)
 
       request_headers = @config.headers.dup
+      handle_cookies(uri, request_headers)
 
-      if @config.allow_cookies
-        cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
-        request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
-      end
-
-      yield(url, request_headers) if block_given?
+      yield(url, request_headers) if block_given? # Used for request customization hook
 
       raw_response = nil
       begin
@@ -272,14 +268,23 @@ module Crawlr
     #   parse_and_set_cookies(uri, response)
     #   # Cookie is stored and will be sent with future requests to example.com
     def parse_and_set_cookies(uri, response)
-      set_cookies = response.headers["set-cookie"]
-      Array(set_cookies).each do |set_cookie|
-        HTTP::Cookie.parse(set_cookie.to_s, uri).each do |cookie|
-          @cookie_jar.add(cookie)
-          Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
-                              " domain=#{cookie.domain}, path=#{cookie.path}"
-        end
+      jar = cookie_jar_for(uri)
+      Array(response.headers["set-cookie"]).each do |set_cookie|
+        HTTP::Cookie.parse(set_cookie.to_s, uri).each { |cookie| jar.add(cookie) }
       end
     end
+
+    # Get or create a thread-safe jar for a domain
+    def cookie_jar_for(uri)
+      @cookie_jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }
+    end
+
+    def handle_cookies(uri, request_headers)
+      return unless @config.allow_cookies
+
+      jar = cookie_jar_for(uri)
+      cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
+      request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+    end
   end
 end
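The per-host jar lookup above leans on `Concurrent::Map#compute_if_absent`, which runs its block at most once per key even when several callers race for the same host. A small standalone sketch of that pattern, using plain arrays in place of `Crawlr::CookieJar` so it runs by itself:

```ruby
# Standalone sketch of the compute_if_absent pattern (requires concurrent-ruby).
require "concurrent"

jars = Concurrent::Map.new

def jar_for(jars, host)
  # The block only runs when the key is missing, so every caller asking for
  # the same host gets back the very same jar object.
  jars.compute_if_absent(host) { [] } # stand-in for Crawlr::CookieJar.new
end

jar_for(jars, "example.com") << "session=abc"
jar_for(jars, "example.com") << "theme=dark"

p jar_for(jars, "example.com") #=> ["session=abc", "theme=dark"]
p jars.size                    #=> 1 (one jar per host)
```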
data/lib/crawlr/parser.rb CHANGED
@@ -165,15 +165,16 @@ module Crawlr
 
       callbacks_by_format.each do |format, format_callbacks|
         doc = parse_content(format, content)
-
-        format_callbacks.each do |callback|
-          Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
-          nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
-          nodes.each { |node| callback[:block].call(node, context) }
-        end
+        format_callbacks.each { |callback| apply_callback(doc, callback, context) }
       end
     end
 
+    private_class_method def self.apply_callback(doc, callback, context)
+      Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+      nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+      nodes.each { |node| callback[:block].call(node, context) }
+    end
+
     # Parses content using the appropriate Nokogiri parser
     #
     # Creates a Nokogiri document object using either the HTML or XML parser
data/lib/crawlr/robots.rb CHANGED
@@ -130,25 +130,13 @@ module Crawlr
     #   robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
     def allowed?(url, user_agent)
       rule = get_rule(url, user_agent)
-      return true unless rule # if no robots.txt or no rule, allow
+      return true unless rule
 
       path = URI.parse(url).path
-      matched = []
-
-      # Match allow/disallow using fnmatch (robots.txt style)
-      rule.allow.each do |pattern|
-        matched << [:allow, pattern] if robots_match?(pattern, path)
-      end
-
-      rule.disallow.each do |pattern|
-        matched << [:disallow, pattern] if robots_match?(pattern, path)
-      end
-
+      matched = matched_rules(rule, path)
       return true if matched.empty?
 
-      # Longest match wins
-      action, = matched.max_by { |_, p| p.length }
-      action == :allow
+      longest_match_allows?(matched)
     end
 
     # Parses robots.txt content and stores rules for the given URL's domain
@@ -204,6 +192,25 @@
 
     private
 
+    def matched_rules(rule, path)
+      matched = []
+
+      rule.allow.each do |pattern|
+        matched << [:allow, pattern] if robots_match?(pattern, path)
+      end
+
+      rule.disallow.each do |pattern|
+        matched << [:disallow, pattern] if robots_match?(pattern, path)
+      end
+
+      matched
+    end
+
+    def longest_match_allows?(matched)
+      action, = matched.max_by { |_, pattern| pattern.length }
+      action == :allow
+    end
+
     # Finds the most applicable rule for a URL and user-agent combination
     #
     # Implements the robots.txt user-agent matching algorithm:
@@ -222,11 +229,7 @@
       return nil unless rules
 
       # Case-insensitive prefix match
-      applicable_rules = rules.select do |rule|
-        next if rule.user_agent.nil?
-
-        user_agent.downcase.start_with?(rule.user_agent.downcase)
-      end
+      applicable_rules = rules_by_prefix_match(user_agent, rules)
 
       # Fallback to wildcard
       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
@@ -235,6 +238,14 @@
       applicable_rules.max_by { |r| r.user_agent.length }
     end
 
+    def rules_by_prefix_match(user_agent, rules)
+      rules.select do |rule|
+        next if rule.user_agent.nil?
+
+        user_agent.downcase.start_with?(rule.user_agent.downcase)
+      end
+    end
+
     # Tests if a robots.txt pattern matches a given path
     #
     # Implements robots.txt pattern matching including:
@@ -291,38 +302,45 @@
     #     }
     #   }
     def parse_to_hash(content)
-      robots_hash = {
-        sitemap: [],
-        rules: {}
-      }
-
+      robots_hash = { sitemap: [], rules: {} }
       curr_user_agents = []
 
       content.each_line do |line|
-        clean_line = line.strip
-        next if clean_line.empty? || clean_line.start_with?("#")
-
-        key, value = clean_line.split(":", 2).map(&:strip)
+        key, value = parse_line(line)
         next unless key && value
 
-        key = key.downcase
-
-        case key
-        when "sitemap"
-          robots_hash[:sitemap] << value
-        when "user-agent"
-          curr_user_agents = [value]
-          robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
-        when "allow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
-        when "disallow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
-        when "crawl-delay"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
-        end
+        curr_user_agents = apply_rule(robots_hash, key, value, curr_user_agents)
       end
 
       robots_hash
     end
+
+    def parse_line(line)
+      clean_line = line.strip
+      return if clean_line.empty? || clean_line.start_with?("#")
+
+      key, value = clean_line.split(":", 2).map(&:strip)
+      return unless key && value
+
+      [key.downcase, value]
+    end
+
+    def apply_rule(robots_hash, key, value, curr_user_agents)
+      case key
+      when "sitemap"
+        robots_hash[:sitemap] << value
+      when "user-agent"
+        curr_user_agents = [value]
+        robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+      when "allow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+      when "disallow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+      when "crawl-delay"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
      end
+
+      curr_user_agents
+    end
   end
 end
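To see how the extracted `longest_match_allows?` helper resolves conflicting rules, here is a toy evaluation with invented patterns: when both an allow and a disallow rule match a path, the longer (more specific) pattern wins.

```ruby
# Toy walk-through of "longest match wins"; the patterns and path are invented.
matched = [
  [:disallow, "/private"],          # matches /private/reports
  [:allow,    "/private/reports"]   # also matches, and is longer
]

action, = matched.max_by { |_, pattern| pattern.length }
puts action == :allow #=> true, so /private/reports would be crawlable
```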
data/lib/crawlr/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Crawlr
-  VERSION = "0.2.1"
+  VERSION = "0.2.3"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -174,6 +174,7 @@ files:
 - lib/crawlr/collector.rb
 - lib/crawlr/config.rb
 - lib/crawlr/context.rb
+- lib/crawlr/cookie_jar.rb
 - lib/crawlr/domains.rb
 - lib/crawlr/hooks.rb
 - lib/crawlr/http_interface.rb