crawlr 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: fe3c5b1d19db6a4fda1bd66a9e2c62a1b2bdb80c361fe06e84023a6bf3f024bb
- data.tar.gz: 6f26c3350a3cbf7e967899d8f5490312d83caa8ad9223cefcb5ad8423bec1e97
+ metadata.gz: b74c6111f3df0866bf50f28c5510b637c74264fe8809d57092d2694d1694c974
+ data.tar.gz: 93799e89ba870575b86d9f70f4938c145cefa8aea8effc60d0bbacc1e9919a87
  SHA512:
- metadata.gz: 4c58780044aa20341737127823958728deb6b3574c781cb804db45e5c81971678058f779657b379bfa566c0608d273c72ec8331e2226885f71e3d476af1c0076
- data.tar.gz: a094872a4ad346cae330a6daa894c6a49a72e7082f9279fa878dd14d09f7fdbccad5617433e683d15309eb4b1f14bcc05aa59cd47f2f7a9c460a5b2728530ad0
+ metadata.gz: 492e82dddbc07a130135137c94f561307f0692a405f630eec51500cd8b4046ad7a3147f010758bdfa91f1b2aa6dd23135fdcbea5a9ba62c76c2715525a117fbb
+ data.tar.gz: ad76ec821b6b4929779c1823107c1b21ca8180ec3fea3b71402d66a8853eddeb986bd8895f0146ce8e14d88d9090d3df696b8015c8b51b0c1f5ca224b36b6986
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
  ## [Unreleased]
 
+ ## [0.2.2] - 2025-10-01
+
+ - Refactor robots.rb and parser.rb to address a few rubocop complaints
+
  ## [0.2.1] - 2025-09-30
 
  - Fix paginated_visit to properly handle provided url queries (if present)
data/README.md CHANGED
@@ -3,7 +3,7 @@
  A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
  [![Gem Version](https://badge.fury.io/rb/crawlr.svg)](https://badge.fury.io/rb/crawlr)
- [![Ruby](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml/badge.svg)](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml)
+ [![Ruby](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml/badge.svg)](https://github.com/aristorap/crawlr/actions/workflows/main.yml)
 
  ## ✨ Features
 
data/lib/crawlr/parser.rb CHANGED
@@ -165,15 +165,16 @@ module Crawlr
 
        callbacks_by_format.each do |format, format_callbacks|
          doc = parse_content(format, content)
-
-         format_callbacks.each do |callback|
-           Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
-           nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
-           nodes.each { |node| callback[:block].call(node, context) }
-         end
+         format_callbacks.each { |callback| apply_callback(doc, callback, context) }
        end
      end
 
+     private_class_method def self.apply_callback(doc, callback, context)
+       Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+       nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+       nodes.each { |node| callback[:block].call(node, context) }
+     end
+
      # Parses content using the appropriate Nokogiri parser
      #
      # Creates a Nokogiri document object using either the HTML or XML parser
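For reference, this refactor only moves the per-callback work into a private class method; callbacks still carry `:selector_type`, `:selector`, and `:block`, as the diff shows. Below is a minimal, self-contained sketch of that dispatch pattern; the sample document, callback list, and context array are hypothetical, and Nokogiri is used directly instead of crawlr's `parse_content`/`extract_nodes` helpers.

```ruby
require "nokogiri"

# Hypothetical, standalone illustration of the callback shape used above.
doc = Nokogiri::HTML('<ul><li class="item">a</li><li class="item">b</li></ul>')

callbacks = [
  { selector_type: :css, selector: "li.item", block: ->(node, ctx) { ctx << node.text } }
]

context = []
callbacks.each do |callback|
  # crawlr's extract_nodes is approximated here by a plain CSS/XPath lookup.
  nodes = callback[:selector_type] == :css ? doc.css(callback[:selector]) : doc.xpath(callback[:selector])
  nodes.each { |node| callback[:block].call(node, context) }
end

context #=> ["a", "b"]
```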
data/lib/crawlr/robots.rb CHANGED
@@ -130,25 +130,13 @@ module Crawlr
    # robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
    def allowed?(url, user_agent)
      rule = get_rule(url, user_agent)
-     return true unless rule # if no robots.txt or no rule, allow
+     return true unless rule
 
      path = URI.parse(url).path
-     matched = []
-
-     # Match allow/disallow using fnmatch (robots.txt style)
-     rule.allow.each do |pattern|
-       matched << [:allow, pattern] if robots_match?(pattern, path)
-     end
-
-     rule.disallow.each do |pattern|
-       matched << [:disallow, pattern] if robots_match?(pattern, path)
-     end
-
+     matched = matched_rules(rule, path)
      return true if matched.empty?
 
-     # Longest match wins
-     action, = matched.max_by { |_, p| p.length }
-     action == :allow
+     longest_match_allows?(matched)
    end
 
    # Parses robots.txt content and stores rules for the given URL's domain
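The refactored `allowed?` keeps the robots.txt "longest match wins" behaviour: every matching Allow/Disallow pattern is collected, and the longest pattern decides. A rough standalone sketch of that rule follows; it is illustrative only, `robots_match?` is simplified to a prefix check, and the rule is a plain hash rather than crawlr's rule object.

```ruby
# Illustrative only: collect matching rules, then let the longest pattern win.
def matched_rules(rule, path)
  matched = []
  rule[:allow].each    { |pattern| matched << [:allow, pattern] if path.start_with?(pattern) }
  rule[:disallow].each { |pattern| matched << [:disallow, pattern] if path.start_with?(pattern) }
  matched
end

def longest_match_allows?(matched)
  action, = matched.max_by { |_, pattern| pattern.length }
  action == :allow
end

rule = { allow: ["/temporary/"], disallow: ["/temp"] }
matched = matched_rules(rule, "/temporary/page")
longest_match_allows?(matched) #=> true, since "/temporary/" is longer than "/temp"
```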
@@ -204,6 +192,25 @@ module Crawlr
 
    private
 
+   def matched_rules(rule, path)
+     matched = []
+
+     rule.allow.each do |pattern|
+       matched << [:allow, pattern] if robots_match?(pattern, path)
+     end
+
+     rule.disallow.each do |pattern|
+       matched << [:disallow, pattern] if robots_match?(pattern, path)
+     end
+
+     matched
+   end
+
+   def longest_match_allows?(matched)
+     action, = matched.max_by { |_, pattern| pattern.length }
+     action == :allow
+   end
+
    # Finds the most applicable rule for a URL and user-agent combination
    #
    # Implements the robots.txt user-agent matching algorithm:
@@ -222,11 +229,7 @@ module Crawlr
      return nil unless rules
 
      # Case-insensitive prefix match
-     applicable_rules = rules.select do |rule|
-       next if rule.user_agent.nil?
-
-       user_agent.downcase.start_with?(rule.user_agent.downcase)
-     end
+     applicable_rules = rules_by_prefix_match(user_agent, rules)
 
      # Fallback to wildcard
      applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
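The extracted `rules_by_prefix_match` preserves the selection order shown here: case-insensitive prefix match on the User-agent token, fall back to the `*` group if nothing matched, then pick the most specific (longest) token. A small illustrative sketch, with rule objects simplified to hashes and a made-up user agent:

```ruby
# Illustrative sketch of user-agent selection: prefix match, wildcard fallback,
# then the longest (most specific) token wins.
rules = [
  { user_agent: "*", disallow: ["/private"] },
  { user_agent: "Googlebot", disallow: ["/no-google"] }
]

user_agent = "googlebot-news"

applicable = rules.select do |rule|
  next if rule[:user_agent].nil?

  user_agent.downcase.start_with?(rule[:user_agent].downcase)
end
applicable = rules.select { |rule| rule[:user_agent] == "*" } if applicable.empty?

applicable.max_by { |r| r[:user_agent].length } #=> the "Googlebot" group
```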
@@ -235,6 +238,14 @@ module Crawlr
      applicable_rules.max_by { |r| r.user_agent.length }
    end
 
+   def rules_by_prefix_match(user_agent, rules)
+     rules.select do |rule|
+       next if rule.user_agent.nil?
+
+       user_agent.downcase.start_with?(rule.user_agent.downcase)
+     end
+   end
+
    # Tests if a robots.txt pattern matches a given path
    #
    # Implements robots.txt pattern matching including:
@@ -291,38 +302,45 @@ module Crawlr
    # }
    # }
    def parse_to_hash(content)
-     robots_hash = {
-       sitemap: [],
-       rules: {}
-     }
-
+     robots_hash = { sitemap: [], rules: {} }
      curr_user_agents = []
 
      content.each_line do |line|
-       clean_line = line.strip
-       next if clean_line.empty? || clean_line.start_with?("#")
-
-       key, value = clean_line.split(":", 2).map(&:strip)
+       key, value = parse_line(line)
        next unless key && value
 
-       key = key.downcase
-
-       case key
-       when "sitemap"
-         robots_hash[:sitemap] << value
-       when "user-agent"
-         curr_user_agents = [value]
-         robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
-       when "allow"
-         curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
-       when "disallow"
-         curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
-       when "crawl-delay"
-         curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
-       end
+       curr_user_agents = apply_rule(robots_hash, key, value, curr_user_agents)
      end
 
      robots_hash
    end
+
+   def parse_line(line)
+     clean_line = line.strip
+     return if clean_line.empty? || clean_line.start_with?("#")
+
+     key, value = clean_line.split(":", 2).map(&:strip)
+     return unless key && value
+
+     [key.downcase, value]
+   end
+
+   def apply_rule(robots_hash, key, value, curr_user_agents)
+     case key
+     when "sitemap"
+       robots_hash[:sitemap] << value
+     when "user-agent"
+       curr_user_agents = [value]
+       robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+     when "allow"
+       curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+     when "disallow"
+       curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+     when "crawl-delay"
+       curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
+     end
+
+     curr_user_agents
+   end
  end
end
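Taken together, `parse_line` and `apply_rule` rebuild the same hash shape described in the doc comment above. A rough illustration of the expected result for a small robots.txt follows; the sample input is made up, and note that `crawl_delay` stays a raw string, since the code stores the value as-is.

```ruby
content = <<~ROBOTS
  # example robots.txt
  User-agent: *
  Disallow: /private
  Allow: /private/public
  Crawl-delay: 2
  Sitemap: https://example.com/sitemap.xml
ROBOTS

# Expected shape of parse_to_hash(content), based on the code above:
# {
#   sitemap: ["https://example.com/sitemap.xml"],
#   rules: {
#     "*" => { allow: ["/private/public"], disallow: ["/private"], crawl_delay: "2" }
#   }
# }
```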
data/lib/crawlr/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Crawlr
-   VERSION = "0.2.1"
+   VERSION = "0.2.2"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: crawlr
  version: !ruby/object:Gem::Version
-   version: 0.2.1
+   version: 0.2.2
  platform: ruby
  authors:
  - Aristotelis Rapai