crawlr 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +15 -2
- data/CHANGELOG.md +5 -0
- data/lib/crawlr/callbacks.rb +1 -3
- data/lib/crawlr/cookie_jar.rb +21 -0
- data/lib/crawlr/domains.rb +9 -6
- data/lib/crawlr/http_interface.rb +21 -16
- data/lib/crawlr/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 461033ddf39311187e8137c08382c3eb25a6f6a88e8324a0e305fffbbd35c6cc
+  data.tar.gz: aa70f5cb9f87cca95192e6447fe615b2357675c8ab0710b38c0197af4c0a91c6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fcbada68009ff7aa92e4ed37b57f9dde86132f6dd99a16f4f5999b4461b78a5b8d365b768d76f969dcb6aa34bcec795f0813576cb47bdbc24223bfe39b1e04a4
+  data.tar.gz: 6b22f9947b34c32c5ec5e1d2b684f7b0e37f8c49adf047bc66cdd24f7adbf8174826bdab5cdfc88f8115e2cc946f5edc300d7047a475001aebc30dd1c8b7dbc0
data/.rubocop.yml
CHANGED
@@ -1,9 +1,22 @@
 AllCops:
   TargetRubyVersion: 3.1
   SuggestExtensions: false
-
+  Exclude:
+    - examples/*.rb
+    - spec/**/*.rb
+Metrics/MethodLength:
+  Max: 20
+Metrics/ClassLength:
+  Max: 300
+Metrics/AbcSize:
+  Max: 30
+Metrics/CyclomaticComplexity:
+  Max: 10
+Metrics/PerceivedComplexity:
+  Max: 10
+Layout/LineLength:
+  Max: 130
 Style/StringLiterals:
   EnforcedStyle: double_quotes
-
 Style/StringLiteralsInInterpolation:
   EnforcedStyle: double_quotes
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
 ## [Unreleased]
 
+## [0.2.3] - 2025-10-03
+
+- Introduce custom HTTPInterface cookie_jar wrapped in concurrent map for safe reads/writes when batch visiting async
+- Rubocop related updates
+
 ## [0.2.2] - 2025-10-01
 
 - Refactor robots.rb and parser.rb to address a few rubocop complaints
data/lib/crawlr/callbacks.rb
CHANGED
@@ -141,9 +141,7 @@ module Crawlr
       raise ArgumentError, "Unsupported format: #{format}" unless ALLOWED_FORMATS.include?(format)
 
       selector_type, selector = parse_input(input)
-      unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
-        raise ArgumentError, "Unsupported selector type: #{selector_type}"
-      end
+      raise ArgumentError, "Unsupported selector type: #{selector_type}" unless ALLOWED_SELECTOR_TYPES.include?(selector_type)
 
       register(format, selector_type, selector, &block)
     end
data/lib/crawlr/cookie_jar.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+require "http/cookie_jar"
+
+module Crawlr
+  # A thread-safe wrapper around the HTTP::CookieJar class
+  class CookieJar
+    def initialize
+      @jar = HTTP::CookieJar.new
+      @lock = Concurrent::ReadWriteLock.new
+    end
+
+    def add(cookie)
+      @lock.with_write_lock { @jar.add(cookie) }
+    end
+
+    def cookies(uri)
+      @lock.with_read_lock { @jar.cookies(uri) }
+    end
+  end
+end
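Reviewer note: a minimal sketch of what the new wrapper buys under concurrency. It assumes the `concurrent-ruby` and `http-cookie` gems are loaded (the class depends on `Concurrent::ReadWriteLock` and `HTTP::CookieJar`); the URI, cookie names, and require path below are illustrative:

```ruby
require "uri"
require "concurrent"
require "http/cookie"
require_relative "lib/crawlr/cookie_jar" # path assumed for illustration

uri = URI("https://example.com/")
jar = Crawlr::CookieJar.new

# Concurrent writers: each #add runs under the write lock, so the
# underlying HTTP::CookieJar is never mutated from two threads at once.
threads = 10.times.map do |i|
  Thread.new { jar.add(HTTP::Cookie.new("session_#{i}", "v", origin: uri)) }
end
threads.each(&:join)

puts jar.cookies(uri).size #=> 10
```

Reads take the shared lock, so lookups from many fetchers can proceed in parallel and only block while a write is in flight.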
data/lib/crawlr/domains.rb
CHANGED
@@ -77,12 +77,7 @@ module Crawlr
     # domains.allowed?('https://any-domain.com') #=> true
     def allowed?(url)
       return true if @allowed_domains.empty? && @domain_glob.empty?
-
-      unless @domain_glob.empty?
-        @domain_glob.each do |glob|
-          return true if File.fnmatch?(glob, url)
-        end
-      end
+      return true if !@domain_glob.empty? && matches_domain_glob?(url)
 
       uri = URI(url)
       base_name = uri.host.sub("www.", "")
@@ -114,6 +109,14 @@ module Crawlr
 
     private
 
+    def matches_domain_glob?(url)
+      @domain_glob.each do |glob|
+        return true if File.fnmatch?(glob, url)
+      end
+
+      false
+    end
+
     # Extracts and normalizes domain names from the configuration
     #
     # Processes the list of allowed domains by:
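Reviewer note: the extracted `matches_domain_glob?` helper applies `File.fnmatch?` to the whole URL string. With no flags passed, `*` also matches `/`, so the globs behave like plain wildcards over the URL rather than path-aware patterns. A quick illustration with made-up patterns:

```ruby
# File.fnmatch? globs against the URL as an ordinary string.
glob = "https://*.example.com/*"

File.fnmatch?(glob, "https://shop.example.com/products") #=> true
File.fnmatch?(glob, "https://example.com/products")      #=> false (no ".example.com" substring to anchor on)
File.fnmatch?(glob, "https://x.com/a.example.com/b")     #=> true ("*" crosses "/" when FNM_PATHNAME is not set)
```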
data/lib/crawlr/http_interface.rb
CHANGED
@@ -3,7 +3,7 @@
 require "async"
 require "async/timeout"
 require "async/http/internet"
-
+require_relative "cookie_jar"
 
 module Crawlr
   # Handles fetching documents via async HTTP with proxy and cookie support.
@@ -85,7 +85,7 @@ module Crawlr
     # http = Crawlr::HTTPInterface.new(config)
     def initialize(config)
       @config = config
-      @cookie_jar = HTTP::CookieJar.new if @config.allow_cookies
+      @cookie_jars = Concurrent::Map.new if @config.allow_cookies
       @proxy_index = 0
     end
 
@@ -135,7 +135,7 @@ module Crawlr
     # rescue StandardError => e
     #   puts "Request failed: #{e.message}"
     # end
-    def get(url)
+    def get(url) # rubocop:disable Metrics/MethodLength
      Crawlr.logger.debug "Fetching #{url}"
 
       uri = URI.parse(url)
@@ -143,13 +143,9 @@ module Crawlr
       internet = build_internet_connection(proxy_url)
 
       request_headers = @config.headers.dup
+      handle_cookies(uri, request_headers)
 
-      if @config.allow_cookies
-        cookie_header = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
-        request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
-      end
-
-      yield(url, request_headers) if block_given?
+      yield(url, request_headers) if block_given? # Used for request customization hook
 
       raw_response = nil
       begin
@@ -272,14 +268,23 @@ module Crawlr
     #   parse_and_set_cookies(uri, response)
     #   # Cookie is stored and will be sent with future requests to example.com
     def parse_and_set_cookies(uri, response)
-
-      Array(response.headers["set-cookie"]).each do |set_cookie|
-        HTTP::Cookie.parse(set_cookie.to_s, uri).each do |cookie|
-          @cookie_jar.add(cookie)
-          Crawlr.logger.debug "Received cookie: #{cookie.name}=#{cookie.value};" \
-                              " domain=#{cookie.domain}, path=#{cookie.path}"
-        end
+      jar = cookie_jar_for(uri)
+      Array(response.headers["set-cookie"]).each do |set_cookie|
+        HTTP::Cookie.parse(set_cookie.to_s, uri).each { |cookie| jar.add(cookie) }
       end
     end
+
+    # Get or create a thread-safe jar for a domain
+    def cookie_jar_for(uri)
+      @cookie_jars.compute_if_absent(uri.host) { Crawlr::CookieJar.new }
+    end
+
+    def handle_cookies(uri, request_headers)
+      return unless @config.allow_cookies
+
+      jar = cookie_jar_for(uri)
+      cookie_header = HTTP::Cookie.cookie_value(jar.cookies(uri))
+      request_headers["cookie"] = cookie_header if cookie_header && !cookie_header.empty?
+    end
   end
 end
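Reviewer note: the per-host sharding in `cookie_jar_for` leans on `Concurrent::Map#compute_if_absent`, which is atomic, so fetchers racing on the same host end up sharing one jar instead of each constructing their own. A self-contained sketch of that behavior; a plain object stands in for `Crawlr::CookieJar` to keep the example dependency-light:

```ruby
require "concurrent"

jars  = Concurrent::Map.new
built = Concurrent::AtomicFixnum.new(0)

# Twenty threads race for the same host; the block runs exactly once.
threads = 20.times.map do
  Thread.new do
    jars.compute_if_absent("example.com") do
      built.increment
      Object.new # stand-in for Crawlr::CookieJar.new
    end
  end
end
threads.each(&:join)

puts built.value #=> 1
puts jars.size   #=> 1
```

Keying the map by `uri.host` also isolates cookies per domain, so a slow write for one host never blocks reads for another.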
data/lib/crawlr/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.2.3
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -174,6 +174,7 @@ files:
 - lib/crawlr/collector.rb
 - lib/crawlr/config.rb
 - lib/crawlr/context.rb
+- lib/crawlr/cookie_jar.rb
 - lib/crawlr/domains.rb
 - lib/crawlr/hooks.rb
 - lib/crawlr/http_interface.rb