crawlr 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/README.md +26 -36
- data/examples/basic_visit.rb +19 -0
- data/examples/nested_visit.rb +36 -0
- data/examples/paginated_visit.rb +27 -0
- data/lib/crawlr/collector.rb +18 -7
- data/lib/crawlr/config.rb +0 -1
- data/lib/crawlr/domains.rb +0 -1
- data/lib/crawlr/http_interface.rb +0 -1
- data/lib/crawlr/parser.rb +0 -1
- data/lib/crawlr/robots.rb +0 -1
- data/lib/crawlr/version.rb +1 -1
- data/lib/crawlr/visits.rb +0 -1
- data/lib/crawlr.rb +3 -1
- metadata +6 -4
- data/sig/crawlr.rbs +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fe3c5b1d19db6a4fda1bd66a9e2c62a1b2bdb80c361fe06e84023a6bf3f024bb
+  data.tar.gz: 6f26c3350a3cbf7e967899d8f5490312d83caa8ad9223cefcb5ad8423bec1e97
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4c58780044aa20341737127823958728deb6b3574c781cb804db45e5c81971678058f779657b379bfa566c0608d273c72ec8331e2226885f71e3d476af1c0076
+  data.tar.gz: a094872a4ad346cae330a6daa894c6a49a72e7082f9279fa878dd14d09f7fdbccad5617433e683d15309eb4b1f14bcc05aa59cd47f2f7a9c460a5b2728530ad0
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,16 @@
 ## [Unreleased]
 
+## [0.2.1] - 2025-09-30
+
+- Fix paginated_visit to properly handle provided url queries (if present)
+- Update paginated_visit batch size parameter to respect max_depth (if max_depth set > 0)
+
+## [0.2.0] - 2025-09-30
+
+- Tidied up documentation and inline comments
+- Fixed small bugs caused by typos
+- Added a few examples demonstrating usage
+
 ## [0.1.0] - 2025-09-29
 
 - Initial release
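To make the 0.2.1 notes concrete, the sketch below (not part of the released package) calls `paginated_visit` in the same form as the bundled `examples/paginated_visit.rb`, but against a hypothetical start URL that already carries a `sort` query parameter; the expected query handling and the `max_depth` cap on the batch size are inferences from the changelog wording.

```ruby
require "crawlr"

# Hypothetical target URL and selector; per the 0.2.1 entries, the existing
# "sort" parameter should survive pagination, and with max_depth > 0 the
# number of pages scheduled per batch is capped by max_depth.
clct = Crawlr::Collector.new(max_depth: 3)
mu = Mutex.new
titles = []

clct.paginated_visit("https://example.com/articles?sort=newest") do |c|
  c.on_html(:css, "h2.article-title") do |node, _ctx|
    mu.synchronize { titles << node.text.strip }
  end
end

puts "Collected #{titles.size} titles"
```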
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [](https://badge.fury.io/rb/crawlr)
-[](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml)
 
 ## ✨ Features
 
@@ -71,20 +71,20 @@ collector.visit('https://example.com')
 
 ```ruby
 collector = Crawlr::Collector.new
-
+products = []
 # Extract product information
-collector.
-
-
-
-
-
-
-
-
+collector.visit('https://shop.example.com/products') do |c|
+  c.on_html(:css, '.product') do |product, ctx|
+    data = {
+      name: product.css('.product-name').text.strip,
+      price: product.css('.price').text.strip,
+      image: product.css('img')&.first&.[]('src')
+    }
+
+    products << data
+  end
 end
-
-collector.visit('https://shop.example.com/products')
+# do something with data
 ```
 
 ### API Scraping with Pagination
@@ -94,14 +94,16 @@ collector = Crawlr::Collector.new(
   max_parallelism: 10,
   timeout: 30
 )
+mu = Mutex.new
+items = Array.new
 
-collector.on_xml(:css, 'item') do |item,
-
-  ctx.items << {
+collector.on_xml(:css, 'item') do |item, _ctx|
+  data = {
     id: item.css('id').text,
     title: item.css('title').text,
     published: item.css('published').text
   }
+  mu.synchronize { items << data }
 end
 
 # Automatically handles pagination with ?page=1, ?page=2, etc.
@@ -168,13 +170,11 @@ end
 # Process responses after each request
 collector.hook(:after_visit) do |url, response|
   puts "Got #{response.status} from #{url}"
-  log_response_time(url, response.headers['X-Response-Time'])
 end
 
 # Handle errors gracefully
 collector.hook(:on_error) do |url, error|
   puts "Failed to scrape #{url}: #{error.message}"
-  error_tracker.record(url, error)
 end
 ```
 
@@ -182,15 +182,11 @@ end
 
 ```ruby
 collector.on_html(:xpath, '//div[@class="content"]//p[position() <= 3]') do |paragraph, ctx|
-  #
-  ctx.content_paragraphs ||= []
-  ctx.content_paragraphs << paragraph.text.strip
+  # Do stuff
 end
 
 collector.on_xml(:xpath, '//item[price > 100]/title') do |title, ctx|
-  #
-  ctx.expensive_items ||= []
-  ctx.expensive_items << title.text
+  # Do stuff
 end
 ```
 
@@ -199,16 +195,12 @@ end
 ```ruby
 collector = Crawlr::Collector.new(allow_cookies: true)
 
-#
-collector.on_html(:css, 'form[action="/login"]') do |form, ctx|
-  # Cookies from login will be automatically used in subsequent requests
-end
-
+# First visit will set cookies tor following requests
 collector.visit('https://site.com/login')
 collector.visit('https://site.com/protected-content') # Uses login cookies
 ```
 
-###
+### Stats
 
 ```ruby
 collector = Crawlr::Collector.new
@@ -297,7 +289,7 @@ yard server
 
 ## 🤝 Contributing
 
-1. Fork it (https://github.com/
+1. Fork it (https://github.com/aristorap/crawlr/fork)
 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
 3. Make your changes with tests
 4. Ensure all tests pass (`bundle exec rspec`)
@@ -313,13 +305,11 @@ This gem is available as open source under the terms of the [MIT License](https:
 
 - Built with [Nokogiri](https://nokogiri.org/) for HTML/XML parsing
 - Uses [Async](https://github.com/socketry/async) for high-performance concurrency
-- Inspired by
-
+- Inspired by Golang's [Colly](https://go-colly.org) framework and modern Ruby practices
 ## 📞 Support
 
-- 📖 [Documentation](https://
-- 🐛 [Issue Tracker](https://github.com/
-- 💬 [Discussions](https://github.com/yourusername/crawlr/discussions)
+- 📖 [Documentation TBD](https://aristorap.github.io/crawlr)
+- 🐛 [Issue Tracker](https://github.com/aristorap/crawlr/issues)
 
 ---
 
data/examples/basic_visit.rb
ADDED
@@ -0,0 +1,19 @@
+require_relative "../lib/crawlr"
+
+# Create a new collector instance
+clct = Crawlr::Collector.new
+gems = []
+
+# Visit the RubyGems popular releases page
+clct.visit("https://rubygems.org/releases/popular") do |collector|
+  # Extract gem links using a CSS selector
+  # The callback will be executed for each matched node
+  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+    link = node["href"]
+    gems << ctx.resolve_url(link) if link
+  end
+end
+
+# Print results
+puts "Found #{gems.size} gems"
+gems.each { |g| puts g }
data/examples/nested_visit.rb
ADDED
@@ -0,0 +1,36 @@
+require_relative "../lib/crawlr"
+
+# Create a new collector instance
+clct = Crawlr::Collector.new(
+  max_depth: 2,       # Limit unbounded crawls
+  random_delay: 1,    # Maximum random delay between requests
+  max_parallelism: 5  # Maximum concurrent requests
+)
+
+# Create a map to store gem metadata
+# Use a thread-safe map due to parallel processing
+gems_meta = Concurrent::Map.new
+
+# Visit the RubyGems popular releases page
+clct.visit("https://rubygems.org/releases/popular") do |c|
+  # Grab main container
+  c.on_html(:css, ".main--interior") do |node, ctx|
+    # Grab all gem links
+    gems = []
+    node.css("a.gems__gem").each do |a|
+      gems << ctx.resolve_url(a["href"])
+    end
+    # Visit each gem page
+    c.visit(gems, ctx.increment_depth) # Use context helper method to set depth for accurate tracking
+  end
+
+  # This callback will be matched on the individual gem pages
+  c.on_html(:css, "h2.gem__downloads__heading:nth-child(1) > span:nth-child(1)") do |node, ctx|
+    gems_meta[ctx.page_url] = node.text
+  end
+end
+
+# Print results
+puts "Found #{gems_meta.size} gems"
+
+gems_meta.each_pair { |k, v| puts "#{k} => #{v}" }
data/examples/paginated_visit.rb
ADDED
@@ -0,0 +1,27 @@
+require_relative "../lib/crawlr"
+
+# Create a new collector instance
+clct = Crawlr::Collector.new(
+  max_depth: 2,
+  random_delay: 1,
+  max_parallelism: 5
+)
+mu = Mutex.new
+gems = []
+
+# Visit the RubyGems popular releases page with pagination
+# Set max depth in collector config to limit crawl depth
+clct.paginated_visit("https://rubygems.org/releases/popular") do |collector|
+  # Extract gem links using a CSS selector
+  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+    link = node["href"]
+    if link
+      full_link = ctx.resolve_url(link) # Resolve relative URL using context helper method
+      mu.synchronize { gems << full_link }
+    end
+  end
+end
+
+# Print results
+puts "Found #{gems.size} gems"
+gems.each { |g| puts g }
data/lib/crawlr/collector.rb
CHANGED
@@ -53,7 +53,6 @@ module Crawlr
   # puts "Failed to scrape #{url}: #{error.message}"
   # end
   #
-  # @author [Your Name]
   # @since 0.1.0
   class Collector
     # @return [Crawlr::Config] The configuration object for this collector
@@ -214,7 +213,7 @@ module Crawlr
       return unless valid_url?(url)
 
       yield self if block_given?
-
+      fetch_robots_txt(url) unless @config.ignore_robots_txt
       return unless can_visit?(url, @config.headers)
 
       pages_to_visit = build_initial_pages(url, query, batch_size, start_page)
@@ -571,17 +570,19 @@
     end
 
     def build_initial_pages(url, query, batch_size, start_page)
-
+      uri = URI.parse(url)
+      max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
+
       if start_page == 1
-        [url] + (max_batch - 1).times.map { |i|
+        [url] + (max_batch - 1).times.map { |i| build_page_url(uri, query, i + 2) }
       else
-        max_batch.times.map { |i|
+        max_batch.times.map { |i| build_page_url(uri, query, i + start_page) }
       end
     end
 
     def process_page_batches(pages, current_depth, batch_size, query)
       scheduled_depth = current_depth
-      max_batch = [@config.max_depth, batch_size].min
+      max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
 
       loop do
         break if reached_max_depth?(scheduled_depth)
@@ -626,7 +627,17 @@
     end
 
     def generate_next_pages(batch, scheduled_depth, max_batch, query)
-
+      uri = URI.parse(batch.first)
+      (0...max_batch).map { |i| build_page_url(uri, query, i + scheduled_depth + 1) }
+    end
+
+    def build_page_url(uri, query, value)
+      new_uri = uri.dup
+      params = URI.decode_www_form(new_uri.query || "")
+      params.reject! { |k, _| k == query }
+      params << [query, value]
+      new_uri.query = URI.encode_www_form(params)
+      new_uri.to_s
     end
   end
 end
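The URL handling introduced by `build_page_url` can be sanity-checked outside the gem. The snippet below restates the same `URI` logic from the hunk above as a standalone method and runs it on a made-up URL; it is an illustration of the behavior, not code shipped in the package.

```ruby
require "uri"

# Standalone restatement of the build_page_url logic shown above:
# drop any existing occurrence of the pagination parameter, re-append it
# with the requested value, and leave unrelated parameters untouched.
def build_page_url(uri, query, value)
  new_uri = uri.dup
  params = URI.decode_www_form(new_uri.query || "")
  params.reject! { |k, _| k == query }
  params << [query, value]
  new_uri.query = URI.encode_www_form(params)
  new_uri.to_s
end

uri = URI.parse("https://example.com/items?sort=asc&page=9")
puts build_page_url(uri, "page", 3)
# => https://example.com/items?sort=asc&page=3
```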
data/lib/crawlr/config.rb
CHANGED
data/lib/crawlr/domains.rb
CHANGED
data/lib/crawlr/parser.rb
CHANGED
@@ -64,7 +64,6 @@ module Crawlr
   # # HTML content parsed once, all callbacks executed on same document
   # Crawlr::Parser.apply_callbacks(content: html, callbacks: callbacks, context: ctx)
   #
-  # @author [Your Name]
   # @since 0.1.0
   module Parser
     # Applies registered callbacks to parsed document content
data/lib/crawlr/robots.rb
CHANGED
@@ -58,7 +58,6 @@ module Crawlr
   # robots.allowed?('https://example.com/temp/secret.txt', 'Bot') #=> false
   # robots.allowed?('https://example.com/temp/public/file.txt', 'Bot') #=> true
   #
-  # @author [Your Name]
   # @since 0.1.0
   class Robots
     # Represents a robots.txt rule for a specific user-agent
data/lib/crawlr/version.rb
CHANGED
data/lib/crawlr/visits.rb
CHANGED
data/lib/crawlr.rb
CHANGED
@@ -1,9 +1,11 @@
 # frozen_string_literal: true
 
 require_relative "crawlr/version"
+require_relative "crawlr/collector"
+
+require "logger"
 
 # A Ruby scraping framework for parsing HTML and XML documents
-# @author [Your Name]
 # @since 0.1.0
 module Crawlr
   class Error < StandardError; end
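A small consequence of the added requires, inferred from this hunk: loading the gem's top-level file is now enough to reach the collector class, so the quick check below should work without requiring `crawlr/collector` explicitly.

```ruby
# Inference from the diff above: lib/crawlr.rb now pulls in the collector
# and the standard Logger, so one require is sufficient.
require "crawlr"

puts Crawlr::Collector #=> Crawlr::Collector
puts defined?(Logger)  #=> constant
```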
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.1
+  version: 0.2.1
 platform: ruby
 authors:
 - Aristotelis Rapai
 bindir: exe
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: async
@@ -166,6 +166,9 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/basic_visit.rb
+- examples/nested_visit.rb
+- examples/paginated_visit.rb
 - lib/crawlr.rb
 - lib/crawlr/callbacks.rb
 - lib/crawlr/collector.rb
@@ -178,7 +181,6 @@ files:
 - lib/crawlr/robots.rb
 - lib/crawlr/version.rb
 - lib/crawlr/visits.rb
-- sig/crawlr.rbs
 homepage: https://github.com/aristorap/crawlr
 licenses:
 - MIT
@@ -203,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.7.2
 specification_version: 4
 summary: A powerful, async Ruby web scraping framework
 test_files: []
data/sig/crawlr.rbs
DELETED