crawlr 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/crawlr/parser.rb +7 -6
- data/lib/crawlr/robots.rb +62 -44
- data/lib/crawlr/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b74c6111f3df0866bf50f28c5510b637c74264fe8809d57092d2694d1694c974
+  data.tar.gz: 93799e89ba870575b86d9f70f4938c145cefa8aea8effc60d0bbacc1e9919a87
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 492e82dddbc07a130135137c94f561307f0692a405f630eec51500cd8b4046ad7a3147f010758bdfa91f1b2aa6dd23135fdcbea5a9ba62c76c2715525a117fbb
+  data.tar.gz: ad76ec821b6b4929779c1823107c1b21ca8180ec3fea3b71402d66a8853eddeb986bd8895f0146ce8e14d88d9090d3df696b8015c8b51b0c1f5ca224b36b6986
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [](https://badge.fury.io/rb/crawlr)
-[](https://github.com/aristorap/crawlr/actions/workflows/
+[](https://github.com/aristorap/crawlr/actions/workflows/main.yml)
 
 ## ✨ Features
 
data/lib/crawlr/parser.rb
CHANGED
@@ -165,15 +165,16 @@ module Crawlr
 
       callbacks_by_format.each do |format, format_callbacks|
         doc = parse_content(format, content)
-
-        format_callbacks.each do |callback|
-          Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
-          nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
-          nodes.each { |node| callback[:block].call(node, context) }
-        end
+        format_callbacks.each { |callback| apply_callback(doc, callback, context) }
       end
     end
 
+    private_class_method def self.apply_callback(doc, callback, context)
+      Crawlr.logger.debug "Applying callback: #{callback[:selector_type]} #{callback[:selector]}"
+      nodes = extract_nodes(doc, callback[:selector_type], callback[:selector])
+      nodes.each { |node| callback[:block].call(node, context) }
+    end
+
     # Parses content using the appropriate Nokogiri parser
     #
     # Creates a Nokogiri document object using either the HTML or XML parser
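Note: the parser change above only extracts the per-callback work into a private `apply_callback` helper; behavior is unchanged, and the one-line `each` keeps the callback application unit-testable on its own. Below is a minimal, self-contained sketch of that flow against a Nokogiri document. The callback hash keys (`:selector_type`, `:selector`, `:block`) come from the diff, but `demo_apply_callback`, the CSS/XPath dispatch, and the sample HTML are illustrative assumptions rather than crawlr's actual `extract_nodes` implementation.

```ruby
require "nokogiri"

# Illustrative stand-in for the extracted apply_callback helper (not crawlr's API):
# select nodes with the callback's selector, then hand each node to its block.
def demo_apply_callback(doc, callback, context)
  nodes =
    case callback[:selector_type]
    when :css   then doc.css(callback[:selector])
    when :xpath then doc.xpath(callback[:selector])
    else []
    end
  nodes.each { |node| callback[:block].call(node, context) }
end

doc = Nokogiri::HTML('<ul><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>')

# Callback shape taken from the diff: :selector_type, :selector, :block.
callback = {
  selector_type: :css,
  selector: "li a",
  block: ->(node, ctx) { ctx[:links] << node["href"] }
}

context = { links: [] }
demo_apply_callback(doc, callback, context)
p context[:links] # => ["/a", "/b"]
```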
data/lib/crawlr/robots.rb
CHANGED
@@ -130,25 +130,13 @@ module Crawlr
     # robots.allowed?('https://site.com/temporary/', 'Bot') #=> true
     def allowed?(url, user_agent)
       rule = get_rule(url, user_agent)
-      return true unless rule
+      return true unless rule
 
       path = URI.parse(url).path
-      matched = []
-
-      # Match allow/disallow using fnmatch (robots.txt style)
-      rule.allow.each do |pattern|
-        matched << [:allow, pattern] if robots_match?(pattern, path)
-      end
-
-      rule.disallow.each do |pattern|
-        matched << [:disallow, pattern] if robots_match?(pattern, path)
-      end
-
+      matched = matched_rules(rule, path)
       return true if matched.empty?
 
-
-      action, = matched.max_by { |_, p| p.length }
-      action == :allow
+      longest_match_allows?(matched)
     end
 
     # Parses robots.txt content and stores rules for the given URL's domain
@@ -204,6 +192,25 @@ module Crawlr
 
     private
 
+    def matched_rules(rule, path)
+      matched = []
+
+      rule.allow.each do |pattern|
+        matched << [:allow, pattern] if robots_match?(pattern, path)
+      end
+
+      rule.disallow.each do |pattern|
+        matched << [:disallow, pattern] if robots_match?(pattern, path)
+      end
+
+      matched
+    end
+
+    def longest_match_allows?(matched)
+      action, = matched.max_by { |_, pattern| pattern.length }
+      action == :allow
+    end
+
     # Finds the most applicable rule for a URL and user-agent combination
     #
     # Implements the robots.txt user-agent matching algorithm:
@@ -222,11 +229,7 @@ module Crawlr
       return nil unless rules
 
       # Case-insensitive prefix match
-      applicable_rules = rules.select do |rule|
-        next if rule.user_agent.nil?
-
-        user_agent.downcase.start_with?(rule.user_agent.downcase)
-      end
+      applicable_rules = rules_by_prefix_match(user_agent, rules)
 
       # Fallback to wildcard
       applicable_rules = rules.select { |rule| rule.user_agent == "*" } if applicable_rules.empty?
@@ -235,6 +238,14 @@ module Crawlr
       applicable_rules.max_by { |r| r.user_agent.length }
     end
 
+    def rules_by_prefix_match(user_agent, rules)
+      rules.select do |rule|
+        next if rule.user_agent.nil?
+
+        user_agent.downcase.start_with?(rule.user_agent.downcase)
+      end
+    end
+
     # Tests if a robots.txt pattern matches a given path
     #
     # Implements robots.txt pattern matching including:
@@ -291,38 +302,45 @@ module Crawlr
     # }
     # }
     def parse_to_hash(content)
-      robots_hash = {
-        sitemap: [],
-        rules: {}
-      }
-
+      robots_hash = { sitemap: [], rules: {} }
       curr_user_agents = []
 
       content.each_line do |line|
-
-        next if clean_line.empty? || clean_line.start_with?("#")
-
-        key, value = clean_line.split(":", 2).map(&:strip)
+        key, value = parse_line(line)
         next unless key && value
 
-
-
-        case key
-        when "sitemap"
-          robots_hash[:sitemap] << value
-        when "user-agent"
-          curr_user_agents = [value]
-          robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
-        when "allow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
-        when "disallow"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
-        when "crawl-delay"
-          curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
-        end
+        curr_user_agents = apply_rule(robots_hash, key, value, curr_user_agents)
       end
 
      robots_hash
     end
+
+    def parse_line(line)
+      clean_line = line.strip
+      return if clean_line.empty? || clean_line.start_with?("#")
+
+      key, value = clean_line.split(":", 2).map(&:strip)
+      return unless key && value
+
+      [key.downcase, value]
+    end
+
+    def apply_rule(robots_hash, key, value, curr_user_agents)
+      case key
+      when "sitemap"
+        robots_hash[:sitemap] << value
+      when "user-agent"
+        curr_user_agents = [value]
+        robots_hash[:rules][value] ||= { allow: [], disallow: [], crawl_delay: nil }
+      when "allow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:allow] << value }
+      when "disallow"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:disallow] << value }
+      when "crawl-delay"
+        curr_user_agents.each { |ua| robots_hash[:rules][ua][:crawl_delay] = value }
+      end
+
+      curr_user_agents
+    end
   end
 end
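Note: the robots.rb refactor splits `allowed?` into `matched_rules` and `longest_match_allows?` without changing the decision rule: every allow/disallow pattern that matches the path is collected, and the longest matched pattern wins. A rough standalone sketch of that precedence follows; the `Rule` struct, the sample patterns, and the `File.fnmatch`-based matcher are illustrative stand-ins, since crawlr's own `robots_match?` implements the full robots.txt pattern syntax.

```ruby
# Standalone sketch of the longest-match-wins precedence preserved by the refactor.
# Rule, the sample patterns, and the fnmatch-based matcher are illustrative stand-ins.
Rule = Struct.new(:allow, :disallow)

# Rough approximation of robots-style matching: treat the pattern as a prefix,
# letting fnmatch's "*" cover the rest of the path.
def robots_match?(pattern, path)
  File.fnmatch("#{pattern}*", path)
end

# Collect every allow/disallow pattern that matches the path (as matched_rules does).
def matched_rules(rule, path)
  matched = []
  rule.allow.each    { |p| matched << [:allow, p] if robots_match?(p, path) }
  rule.disallow.each { |p| matched << [:disallow, p] if robots_match?(p, path) }
  matched
end

# The longest matched pattern decides the outcome (as longest_match_allows? does).
def longest_match_allows?(matched)
  action, = matched.max_by { |_, pattern| pattern.length }
  action == :allow
end

rule = Rule.new(["/private/reports/"], ["/private/"])

["/private/reports/q3.html", "/private/notes.txt", "/public/index.html"].each do |path|
  matched = matched_rules(rule, path)
  verdict = matched.empty? || longest_match_allows?(matched)
  puts "#{path} -> #{verdict ? 'allowed' : 'blocked'}"
end
# /private/reports/q3.html -> allowed  (the longer Allow pattern wins)
# /private/notes.txt -> blocked        (only Disallow: /private/ matches)
# /public/index.html -> allowed        (no rule matches)
```

Longest-match precedence is what lets a specific `Allow: /private/reports/` carve an exception out of a broader `Disallow: /private/`, and an empty match list defaults to allowed.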
data/lib/crawlr/version.rb
CHANGED