crawlr 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -2
- data/CHANGELOG.md +9 -0
- data/README.md +1 -1
- data/lib/crawlr/callbacks.rb +1 -3
- data/lib/crawlr/cookie_jar.rb +21 -0
- data/lib/crawlr/domains.rb +9 -6
- data/lib/crawlr/http_interface.rb +21 -16
- data/lib/crawlr/parser.rb +7 -6
- data/lib/crawlr/robots.rb +62 -44
- data/lib/crawlr/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 461033ddf39311187e8137c08382c3eb25a6f6a88e8324a0e305fffbbd35c6cc
+  data.tar.gz: aa70f5cb9f87cca95192e6447fe615b2357675c8ab0710b38c0197af4c0a91c6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fcbada68009ff7aa92e4ed37b57f9dde86132f6dd99a16f4f5999b4461b78a5b8d365b768d76f969dcb6aa34bcec795f0813576cb47bdbc24223bfe39b1e04a4
+  data.tar.gz: 6b22f9947b34c32c5ec5e1d2b684f7b0e37f8c49adf047bc66cdd24f7adbf8174826bdab5cdfc88f8115e2cc946f5edc300d7047a475001aebc30dd1c8b7dbc0
data/.rubocop.yml
CHANGED
@@ -1,9 +1,22 @@
 AllCops:
   TargetRubyVersion: 3.1
   SuggestExtensions: false
-
+  Exclude:
+    - examples/*.rb
+    - spec/**/*.rb
+Metrics/MethodLength:
+  Max: 20
+Metrics/ClassLength:
+  Max: 300
+Metrics/AbcSize:
+  Max: 30
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Layout/LineLength:
+  Max: 130
 Style/StringLiterals:
   EnforcedStyle: double_quotes
-
 Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,14 @@
 ## [Unreleased]
 
+## [0.2.3] - 2025-10-03
+
+- Introduce custom HTTPInterface cookie_jar wrapped in concurrent map for safe reads/writes when batch visiting async
+- Rubocop related updates
+
+## [0.2.2] - 2025-10-01
+
+- Refactor robots.rb and parser.rb to address a few rubocop complaints
+
 ## [0.2.1] - 2025-09-30
 
 - Fix paginated_visit to properly handle provided url queries (if present)
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [](https://badge.fury.io/rb/crawlr)
-[](https://github.com/aristorap/crawlr/actions/workflows/
+[](https://github.com/aristorap/crawlr/actions/workflows/main.yml)
 
 ## ✨ Features
 
data/lib/crawlr/callbacks.rb
CHANGED
@@ -141,9 +141,7 @@ module Crawlr
       raise ArgumentError, "Unsupported format: #{format}" unless ALLOWED_FORMATS.include?(format)
 
       selector_type, selector = parse_input(input)
-      unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
-        raise ArgumentError, "Unsupported selector type: #{selector_type}"
-      end
+      raise ArgumentError, "Unsupported selector type: #{selector_type}" unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
 
       register(format, selector_type, selector, &block)
     end
data/lib/crawlr/cookie_jar.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require "http/cookie_jar"
+
+module Crawlr
+  # A thread-safe wrapper around the HTTP::CookieJar class
+  class CookieJar
+    def initialize
+      @jar = HTTP::CookieJar.new
+      @lock = Concurrent::ReadWriteLock.new
+    end
+
+    def add(cookie)
+      @lock.with_write_lock { @jar.add(cookie) }
+    end
+
+    def cookies(uri)
+      @lock.with_read_lock { @jar.cookies(uri) }
+    end
+  end
+end
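The added `Crawlr::CookieJar` is a small read/write-locked wrapper: `add` takes the exclusive write lock, `cookies` takes the shared read lock, so concurrent tasks can read a host's cookies while writes stay serialized. Below is a minimal sketch of exercising such a jar concurrently; the thread-based driver and require paths are illustrative assumptions (the gem itself calls it from async fetches), not code from the package.

```ruby
# Illustrative only: hammer a Crawlr::CookieJar from plain threads.
# Assumes the crawlr, http-cookie and concurrent-ruby gems are installed.
require "uri"
require "concurrent"
require "http/cookie"
require "crawlr/cookie_jar"

uri = URI("https://example.com/")
jar = Crawlr::CookieJar.new

# Writers take the exclusive write lock inside #add ...
writers = 5.times.map do |i|
  Thread.new do
    cookie = HTTP::Cookie.parse("session_#{i}=value_#{i}; Path=/", uri).first
    jar.add(cookie)
  end
end

# ... while readers share the read lock inside #cookies.
readers = 5.times.map { Thread.new { jar.cookies(uri).map(&:name) } }

(writers + readers).each(&:join)
puts jar.cookies(uri).size # => 5 once every writer has finished
```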
data/lib/crawlr/domains.rb
CHANGED
@@ -77,12 +77,7 @@ module Crawlr
     # domains.allowed?('https://any-domain.com') #=> true
     def allowed?(url)
       return true if @allowed_domains.empty? && @domain_glob.empty?
-
-      unless @domain_glob.empty?
-        @domain_glob.each do |glob|
-          return true if File.fnmatch?(glob, url)
-        end
-      end
+      return true if !@domain_glob.empty? && matches_domain_glob?(url)
 
       uri = URI(url)
       base_name = uri.host.sub("www.", "")
@@ -114,6 +109,14 @@ module Crawlr
 
     private
 
+    def matches_domain_glob?(url)
+      @domain_glob.each do |glob|
+        return true if File.fnmatch?(glob, url)
+      end
+
+      false
+    end
+
     # Extracts and normalizes domain names from the configuration
     #
     # Processes the list of allowed domains by:
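For context on the extracted `matches_domain_glob?`: it simply asks whether any configured glob `File.fnmatch?`-es the full URL. Below is a standalone sketch of that check; the globs and URLs are made-up examples, not values from the gem. Note that without `File::FNM_PATHNAME` a `*` also matches across `/`, which is what lets one glob cover whole paths.

```ruby
# Illustrative only: the glob check behind Domains#matches_domain_glob?.
# The globs below are hypothetical configuration values.
domain_globs = ["https://*.example.com/*", "https://blog.example.org/*"]

def matches_domain_glob?(globs, url)
  # Default fnmatch flags let "*" span "/" as well as ordinary characters.
  globs.any? { |glob| File.fnmatch?(glob, url) }
end

p matches_domain_glob?(domain_globs, "https://shop.example.com/items/1") # => true
p matches_domain_glob?(domain_globs, "https://example.net/items/1")      # => false
```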
data/lib/crawlr/http_interface.rb
CHANGED
@@ -3,7 +3,7 @@
 require "async"
 require "async/timeout"
 require "async/http/internet"
-
+require_relative "cookie_jar"
 
 module Crawlr
   # Handles fetching documents via async HTTP with proxy and cookie support.
@@ -85,7 +85,7 @@ module Crawlr
     # http = Crawlr::HTTPInterface.new(config)
     def initialize(config)
       @config = config
-      @
+      @cookie_jars = Concurrent::Map.new if @config.allow_cookies
       @proxy_index = 0
     end
 
@@ -135,7 +135,7 @@ module Crawlr
     # rescue StandardError => e
     #   puts "Request failed: #{e.message}"
     # end
-    def get(url)
+    def get(url) # rubocop:disable Metrics/MethodLength
       Crawlr.logger.debug "Fetching #{url}"
 
       uri = URI.parse(url)
@@ -143,13 +143,9 @@ module Crawlr
       internet = build_internet_connection(proxy_url)
 
       request_headers = @config.headers.dup
+      handle_cookies(uri, request_headers)
 
-      if
-        cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
-        request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
-      end
-
-      yield(url, request_headers) if block_given?
+      yield(url, request_headers) if block_given? # Used for request customization hook
 
       raw_response = nil
       begin
@@ -272,14 +268,23 @@ module Crawlr
     #   parse_and_set_cookies(uri, response)
     #   # Cookie is stored and will be sent with future requests to example.com
     def parse_and_set_cookies(uri, response)
-
-      Array(
-        HTTP::Cookie.parse(set_cookie.to_s, uri).each
-          @cookie_jar.add(cookie)
-          Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
-                              " domain=#{cookie.domain}, path=#{cookie.path}"
-        end
+      jar = cookie_jar_for(uri)
+      Array(response.headers["set-cookie"]).each do |set_cookie|
+        HTTP::Cookie.parse(set_cookie.to_s, uri).each { |cookie| jar.add(cookie) }
       end
     end
+
+    # Get or create a thread-safe jar for a domain
+    def cookie_jar_for(uri)
+      @cookie_jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }
+    end
+
+    def handle_cookies(uri, request_headers)
+      return unless @config.allow_cookies
+
+      jar = cookie_jar_for(uri)
+      cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
+      request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+    end
   end
 end
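The reworked cookie handling replaces the single shared jar with one `Crawlr::CookieJar` per host, stored in a `Concurrent::Map` and created lazily via `compute_if_absent`, so parallel batch visits to one host share a jar while different hosts never contend. Below is a stripped-down sketch of that lookup pattern; the placeholder object stands in for `Crawlr::CookieJar` so the snippet runs on its own, and the script around it is illustrative rather than gem code.

```ruby
# Illustrative only: the per-host jar lookup used by HTTPInterface#cookie_jar_for.
require "uri"
require "concurrent"

cookie_jars = Concurrent::Map.new

jar_for = lambda do |uri|
  # compute_if_absent stores one value per key, so concurrent callers asking
  # for the same host all get back the same jar instance.
  cookie_jars.compute_if_absent(uri.host) { Object.new } # Crawlr::CookieJar.new in the gem
end

uri  = URI("https://example.com/page")
jars = 10.times.map { Thread.new { jar_for.call(uri) } }.map(&:value)

puts jars.uniq(&:object_id).size                      # => 1  (one jar per host)
puts cookie_jars.key?(URI("https://other.org/").host) # => false (jars are created lazily)
```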
data/lib/crawlr/parser.rb
CHANGED
@@ -165,15 +165,16 @@ module Crawlr
 
       callbacks_by_format.each do |format, format_callbacks|
         doc = parse_content(format, content)
-
-        format_callbacks.each do |callback|
-          Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
-          nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
-          nodes.each { |node| callback[:block].call(node, context) }
-        end
+        format_callbacks.each { |callback| apply_callback(doc, callback, context) }
       end
     end
 
+    private_class_method def self.apply_callback(doc, callback, context)
+      Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+      nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+      nodes.each { |node| callback[:block].call(node, context) }
+    end
+
     # Parses content using the appropriate Nokogiri parser
     #
     # Creates a Nokogiri document object using either the HTML or XML parser
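The parser change only moves the per-callback work into a private `apply_callback` class method: select the nodes for the callback's selector, then invoke its block with each node and the shared context. Below is a simplified stand-in showing that dispatch shape; it is CSS-only and uses Nokogiri directly, whereas the gem routes through `extract_nodes` and also supports XPath.

```ruby
# Illustrative only: a CSS-only stand-in for Parser.apply_callback.
require "nokogiri"

def apply_callback(doc, callback, context)
  nodes = doc.css(callback[:selector]) # extract_nodes(...) in the gem
  nodes.each { |node| callback[:block].call(node, context) }
end

doc = Nokogiri::HTML(<<~HTML)
  <ul><li>first</li><li>second</li></ul>
HTML

callback = {
  selector_type: :css,
  selector: "li",
  block: ->(node, ctx) { ctx[:items] << node.text }
}

context = { items: [] }
apply_callback(doc, callback, context)
p context[:items] # => ["first", "second"]
```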
data/lib/crawlr/robots.rb
CHANGED
@@ -130,25 +130,13 @@ module Crawlr
     # robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
     def allowed?(url, user_agent)
       rule = get_rule(url, user_agent)
-      return true unless rule
+      return true unless rule
 
       path = URI.parse(url).path
-      matched =
-
-      # Match allow/disallow using fnmatch (robots.txt style)
-      rule.allow.each do |pattern|
-        matched << [:allow, pattern] if robots_match?(pattern, path)
-      end
-
-      rule.disallow.each do |pattern|
-        matched << [:disallow, pattern] if robots_match?(pattern, path)
-      end
-
+      matched = matched_rules(rule, path)
       return true if matched.empty?
 
-
-      action, = matched.max_by { |_, p| p.length }
-      action == :allow
+      longest_match_allows?(matched)
     end
 
     # Parses robots.txt content and stores rules for the given URL's domain
@@ -204,6 +192,25 @@ module Crawlr
 
     private
 
+    def matched_rules(rule, path)
+      matched = []
+
+      rule.allow.each do |pattern|
+        matched << [:allow, pattern] if robots_match?(pattern, path)
+      end
+
+      rule.disallow.each do |pattern|
+        matched << [:disallow, pattern] if robots_match?(pattern, path)
+      end
+
+      matched
+    end
+
+    def longest_match_allows?(matched)
+      action, = matched.max_by { |_, pattern| pattern.length }
+      action == :allow
+    end
+
     # Finds the most applicable rule for a URL and user-agent combination
     #
     # Implements the robots.txt user-agent matching algorithm:
@@ -222,11 +229,7 @@ module Crawlr
       return nil unless rules
 
       # Case-insensitive prefix match
-      applicable_rules = rules
-        next if rule.user_agent.nil?
-
-        user_agent.downcase.start_with?(rule.user_agent.downcase)
-      end
+      applicable_rules = rules_by_prefix_match(user_agent, rules)
 
       # Fallback to wildcard
       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
@@ -235,6 +238,14 @@ module Crawlr
       applicable_rules.max_by { |r| r.user_agent.length }
     end
 
+    def rules_by_prefix_match(user_agent, rules)
+      rules.select do |rule|
+        next if rule.user_agent.nil?
+
+        user_agent.downcase.start_with?(rule.user_agent.downcase)
+      end
+    end
+
     # Tests if a robots.txt pattern matches a given path
     #
     # Implements robots.txt pattern matching including:
@@ -291,38 +302,45 @@ module Crawlr
     #   }
     # }
     def parse_to_hash(content)
-      robots_hash = {
-        sitemap: [],
-        rules: {}
-      }
-
+      robots_hash = { sitemap: [], rules: {} }
       curr_user_agents = []
 
       content.each_line do |line|
-
-        next if clean_line.empty? || clean_line.start_with?("#")
-
-        key, value = clean_line.split(":", 2).map(&:strip)
+        key, value = parse_line(line)
         next unless key && value
 
-
-
-        case key
-        when "sitemap"
-          robots_hash[:sitemap] << value
-        when "user-agent"
-          curr_user_agents = [value]
-          robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
-        when "allow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
-        when "disallow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
-        when "crawl-delay"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
-        end
+        curr_user_agents = apply_rule(robots_hash, key, value, curr_user_agents)
       end
 
       robots_hash
     end
+
+    def parse_line(line)
+      clean_line = line.strip
+      return if clean_line.empty? || clean_line.start_with?("#")
+
+      key, value = clean_line.split(":", 2).map(&:strip)
+      return unless key && value
+
+      [key.downcase, value]
+    end
+
+    def apply_rule(robots_hash, key, value, curr_user_agents)
+      case key
+      when "sitemap"
+        robots_hash[:sitemap] << value
+      when "user-agent"
+        curr_user_agents = [value]
+        robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+      when "allow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+      when "disallow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+      when "crawl-delay"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
      end
+
+      curr_user_agents
+    end
   end
 end
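The extracted `matched_rules` / `longest_match_allows?` pair implements the usual robots.txt precedence: collect every allow and disallow pattern that matches the path, then let the longest matching pattern decide, allowing the URL only when that winner is an allow rule. Below is a worked sketch of just that decision; rules are plain hashes here and `robots_match?` is reduced to a prefix check, whereas the gem's matcher also handles `*` wildcards and `$` anchors.

```ruby
# Illustrative only: the longest-match decision inside Robots#allowed?.
def robots_match?(pattern, path)
  path.start_with?(pattern) # simplified; no * or $ handling here
end

def matched_rules(rule, path)
  matched = []
  rule[:allow].each    { |p| matched << [:allow, p]    if robots_match?(p, path) }
  rule[:disallow].each { |p| matched << [:disallow, p] if robots_match?(p, path) }
  matched
end

def longest_match_allows?(matched)
  action, = matched.max_by { |_, pattern| pattern.length }
  action == :allow
end

rule = { allow: ["/private/public/"], disallow: ["/private/"] }

p longest_match_allows?(matched_rules(rule, "/private/public/page.html")) # => true:  longer allow wins
p longest_match_allows?(matched_rules(rule, "/private/secret.html"))      # => false: only the disallow matches
```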
data/lib/crawlr/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -174,6 +174,7 @@ files:
 - lib/crawlr/collector.rb
 - lib/crawlr/config.rb
 - lib/crawlr/context.rb
+- lib/crawlr/cookie_jar.rb
 - lib/crawlr/domains.rb
 - lib/crawlr/hooks.rb
 - lib/crawlr/http_interface.rb