crawlr 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +26 -36
- data/examples/basic_visit.rb +19 -0
- data/examples/nested_visit.rb +36 -0
- data/examples/paginated_visit.rb +27 -0
- data/lib/crawlr/collector.rb +1 -2
- data/lib/crawlr/config.rb +0 -1
- data/lib/crawlr/domains.rb +0 -1
- data/lib/crawlr/http_interface.rb +0 -1
- data/lib/crawlr/parser.rb +0 -1
- data/lib/crawlr/robots.rb +0 -1
- data/lib/crawlr/version.rb +1 -1
- data/lib/crawlr/visits.rb +0 -1
- data/lib/crawlr.rb +3 -1
- data/rubygems.rb +18 -0
- metadata +5 -2
- data/sig/crawlr.rbs +0 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f4a2b21633eead87fe3b879552db225aecc2975cad779fd86413f2531cd3f079
+  data.tar.gz: 9b20eb81f931b0f514609e9b699a85a8f25d2e64e0fb7f8f6845343d4872a893
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3e5d343dd502ed23343ad0e6bfbe9fbe6b8696954e171a181ae385c8c679d60cfeeeec4d65cdbb9e841731664ae27fa4034405c8265ab12f967470e91321208a
+  data.tar.gz: ecb186a9d6e9a5f34a4e429b1c3971a1eaf7f708537d698557fab3272010c0f4fb953362b64ed498556867a0938387c70cc24f0b7f41563026be7f1157f373a1
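The new checksums can be reproduced against a locally downloaded artifact. A minimal sketch, assuming `gem fetch crawlr --version 0.2.0` has placed `crawlr-0.2.0.gem` in the current directory (a `.gem` file is a tar archive whose entries include `metadata.gz` and `data.tar.gz`):

```ruby
require "digest"
require "rubygems/package"

# Hash the metadata.gz and data.tar.gz entries of the downloaded gem
# and compare the output against the SHA256 values in checksums.yaml.
File.open("crawlr-0.2.0.gem", "rb") do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end
end
```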
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.
 
 [](https://badge.fury.io/rb/crawlr)
-[](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml)
 
 ## ✨ Features
 
@@ -71,20 +71,20 @@ collector.visit('https://example.com')
 
 ```ruby
 collector = Crawlr::Collector.new
-
+products = []
 # Extract product information
-collector.
+collector.visit('https://shop.example.com/products') do |c|
+  c.on_html(:css, '.product') do |product, ctx|
+    data = {
+      name: product.css('.product-name').text.strip,
+      price: product.css('.price').text.strip,
+      image: product.css('img')&.first&.[]('src')
+    }
+
+    products << data
+  end
 end
-
-collector.visit('https://shop.example.com/products')
+# do something with data
 ```
 
 ### API Scraping with Pagination
@@ -94,14 +94,16 @@ collector = Crawlr::Collector.new(
   max_parallelism: 10,
   timeout: 30
 )
+mu = Mutex.new
+items = Array.new
 
-collector.on_xml(:css, 'item') do |item,
-
-ctx.items << {
+collector.on_xml(:css, 'item') do |item, _ctx|
+  data = {
     id: item.css('id').text,
     title: item.css('title').text,
     published: item.css('published').text
   }
+  mu.synchronize { items << data }
 end
 
 # Automatically handles pagination with ?page=1, ?page=2, etc.
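The rewritten example guards the shared `items` array with a `Mutex` because callbacks may run concurrently when `max_parallelism` is above 1. A stdlib `Thread::Queue` is an alternative that is thread-safe without explicit locking; a sketch under the same callback API:

```ruby
collector = Crawlr::Collector.new(max_parallelism: 10, timeout: 30)
queue = Thread::Queue.new # safe for concurrent pushes, no Mutex needed

collector.on_xml(:css, 'item') do |item, _ctx|
  queue << { id: item.css('id').text, title: item.css('title').text }
end

# Once the crawl has finished, drain the queue into a plain array
items = []
items << queue.pop until queue.empty?
```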
@@ -168,13 +170,11 @@ end
 # Process responses after each request
 collector.hook(:after_visit) do |url, response|
   puts "Got #{response.status} from #{url}"
-  log_response_time(url, response.headers['X-Response-Time'])
 end
 
 # Handle errors gracefully
 collector.hook(:on_error) do |url, error|
   puts "Failed to scrape #{url}: #{error.message}"
-  error_tracker.record(url, error)
 end
 ```
 
@@ -182,15 +182,11 @@ end
 
 ```ruby
 collector.on_html(:xpath, '//div[@class="content"]//p[position() <= 3]') do |paragraph, ctx|
-  #
-  ctx.content_paragraphs ||= []
-  ctx.content_paragraphs << paragraph.text.strip
+  # Do stuff
 end
 
 collector.on_xml(:xpath, '//item[price > 100]/title') do |title, ctx|
-  #
-  ctx.expensive_items ||= []
-  ctx.expensive_items << title.text
+  # Do stuff
 end
 ```
 
@@ -199,16 +195,12 @@ end
 ```ruby
 collector = Crawlr::Collector.new(allow_cookies: true)
 
-#
-collector.on_html(:css, 'form[action="/login"]') do |form, ctx|
-  # Cookies from login will be automatically used in subsequent requests
-end
-
+# First visit will set cookies for following requests
 collector.visit('https://site.com/login')
 collector.visit('https://site.com/protected-content') # Uses login cookies
 ```
 
-###
+### Stats
 
 ```ruby
 collector = Crawlr::Collector.new
@@ -297,7 +289,7 @@ yard server
 
 ## 🤝 Contributing
 
-1. Fork it (https://github.com/
+1. Fork it (https://github.com/aristorap/crawlr/fork)
 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
 3. Make your changes with tests
 4. Ensure all tests pass (`bundle exec rspec`)
@@ -313,13 +305,11 @@ This gem is available as open source under the terms of the [MIT License](https:
 
 - Built with [Nokogiri](https://nokogiri.org/) for HTML/XML parsing
 - Uses [Async](https://github.com/socketry/async) for high-performance concurrency
-- Inspired by
-
+- Inspired by Golang's [Colly](https://go-colly.org) framework and modern Ruby practices
 ## 📞 Support
 
-- 📖 [Documentation](https://
-- 🐛 [Issue Tracker](https://github.com/
-- 💬 [Discussions](https://github.com/yourusername/crawlr/discussions)
+- 📖 [Documentation TBD](https://aristorap.github.io/crawlr)
+- 🐛 [Issue Tracker](https://github.com/aristorap/crawlr/issues)
 
 ---
 
data/examples/basic_visit.rb
ADDED
@@ -0,0 +1,19 @@
+require_relative "../lib/crawlr"
+
+# Create a new collector instance
+clct = Crawlr::Collector.new
+gems = []
+
+# Visit the RubyGems popular releases page
+clct.visit("https://rubygems.org/releases/popular") do |collector|
+  # Extract gem links using a CSS selector
+  # The callback will be executed for each matched node
+  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+    link = node["href"]
+    gems << ctx.resolve_url(link) if link
+  end
+end
+
+# Print results
+puts "Found #{gems.size} gems"
+gems.each { |g| puts g }
data/examples/nested_visit.rb
ADDED
@@ -0,0 +1,36 @@
+require_relative "../lib/crawlr"
+
+# Create a new collector instance
+clct = Crawlr::Collector.new(
+  max_depth: 2,        # Limit unbounded crawls
+  random_delay: 1,     # Maximum random delay between requests
+  max_parallelism: 5   # Maximum concurrent requests
+)
+
+# Create a map to store gem metadata
+# Use a thread-safe map due to parallel processing
+gems_meta = Concurrent::Map.new
+
+# Visit the RubyGems popular releases page
+clct.visit("https://rubygems.org/releases/popular") do |c|
+  # Grab main container
+  c.on_html(:css, ".main--interior") do |node, ctx|
+    # Grab all gem links
+    gems = []
+    node.css("a.gems__gem").each do |a|
+      gems << ctx.resolve_url(a["href"])
+    end
+    # Visit each gem page
+    c.visit(gems, ctx.increment_depth) # Use context helper method to set depth for accurate tracking
+  end
+
+  # This callback will be matched on the individual gem pages
+  c.on_html(:css, "h2.gem__downloads__heading:nth-child(1) > span:nth-child(1)") do |node, ctx|
+    gems_meta[ctx.page_url] = node.text
+  end
+end
+
+# Print results
+puts "Found #{gems_meta.size} gems"
+
+gems_meta.each_pair { |k, v| puts "#{k} => #{v}" }
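`Concurrent::Map` comes from the concurrent-ruby gem; whether crawlr declares that dependency is not visible in this diff. A sketch of the same bookkeeping using only the standard library, a `Hash` guarded by a `Mutex`:

```ruby
require_relative "../lib/crawlr"

mu = Mutex.new
gems_meta = {}

clct = Crawlr::Collector.new(max_parallelism: 5)
clct.visit("https://rubygems.org/releases/popular") do |c|
  c.on_html(:css, "h2.gem__downloads__heading:nth-child(1) > span:nth-child(1)") do |node, ctx|
    # The mutex serializes writes from callbacks running in parallel
    mu.synchronize { gems_meta[ctx.page_url] = node.text }
  end
end
```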
data/examples/paginated_visit.rb
ADDED
@@ -0,0 +1,27 @@
+require_relative "../lib/crawlr"
+
+# Create a new collector instance
+clct = Crawlr::Collector.new(
+  max_depth: 2,
+  random_delay: 1,
+  max_parallelism: 5
+)
+mu = Mutex.new
+gems = []
+
+# Visit the RubyGems popular releases page with pagination
+# Set max depth in collector config to limit crawl depth
+clct.paginated_visit("https://rubygems.org/releases/popular") do |collector|
+  # Extract gem links using a CSS selector
+  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+    link = node["href"]
+    if link
+      full_link = ctx.resolve_url(link) # Resolve relative URL using context helper method
+      mu.synchronize { gems << full_link }
+    end
+  end
+end
+
+# Print results
+puts "Found #{gems.size} gems"
+gems.each { |g| puts g }
data/lib/crawlr/collector.rb
CHANGED
@@ -53,7 +53,6 @@ module Crawlr
 #   puts "Failed to scrape #{url}: #{error.message}"
 # end
 #
-# @author [Your Name]
 # @since 0.1.0
 class Collector
   # @return [Crawlr::Config] The configuration object for this collector
@@ -214,7 +213,7 @@ module Crawlr
   return unless valid_url?(url)
 
   yield self if block_given?
-
+  fetch_robots_txt(url) unless @config.ignore_robots_txt
   return unless can_visit?(url, @config.headers)
 
   pages_to_visit = build_initial_pages(url, query, batch_size, start_page)
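This change fetches robots.txt ahead of the `can_visit?` check whenever robots handling is enabled. The user-facing switch is presumably the constructor option backing `@config.ignore_robots_txt`; the exact keyword name is an assumption, not confirmed by this diff:

```ruby
# Assumed option name, inferred from @config.ignore_robots_txt above
collector = Crawlr::Collector.new(ignore_robots_txt: true)
collector.visit("https://example.com") # no robots.txt request is made first
```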
data/lib/crawlr/config.rb
CHANGED
data/lib/crawlr/domains.rb
CHANGED
data/lib/crawlr/parser.rb
CHANGED
@@ -64,7 +64,6 @@ module Crawlr
 # # HTML content parsed once, all callbacks executed on same document
 # Crawlr::Parser.apply_callbacks(content: html, callbacks: callbacks, context: ctx)
 #
-# @author [Your Name]
 # @since 0.1.0
 module Parser
   # Applies registered callbacks to parsed document content
data/lib/crawlr/robots.rb
CHANGED
@@ -58,7 +58,6 @@ module Crawlr
 # robots.allowed?('https://example.com/temp/secret.txt', 'Bot') #=> false
 # robots.allowed?('https://example.com/temp/public/file.txt', 'Bot') #=> true
 #
-# @author [Your Name]
 # @since 0.1.0
 class Robots
   # Represents a robots.txt rule for a specific user-agent
data/lib/crawlr/version.rb
CHANGED
data/lib/crawlr/visits.rb
CHANGED
data/lib/crawlr.rb
CHANGED
@@ -1,9 +1,11 @@
 # frozen_string_literal: true
 
 require_relative "crawlr/version"
+require_relative "crawlr/collector"
+
+require "logger"
 
 # A Ruby scraping framework for parsing HTML and XML documents
-# @author [Your Name]
 # @since 0.1.0
 module Crawlr
   class Error < StandardError; end
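With `crawlr/collector` now required from the entry point, a single `require` reaches the main class after installing the gem:

```ruby
require "crawlr"

# Collector is loaded by lib/crawlr.rb itself as of 0.2.0,
# so no additional require is needed.
collector = Crawlr::Collector.new
```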
data/rubygems.rb
ADDED
@@ -0,0 +1,18 @@
+require "lib/crawlr"
+
+clct = Crawlr::Collector.new
+gems = []
+
+clct.visit("https://rubygems.org/releases/popular") do |collector|
+  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+    link = node["href"]
+    full_link = ctx.resolve_url(link) if link
+    gems << full_link
+  end
+end
+
+puts "Found #{gems.size} gems"
+
+gems.each do |gem|
+  puts gem
+end
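As shipped, this script starts with `require "lib/crawlr"`, which fails on modern Ruby because the current directory is not on the load path, and it pushes `nil` into `gems` whenever a matched node has no `href`. A corrected sketch, assuming it is run from the repository root:

```ruby
require_relative "lib/crawlr" # resolve relative to this file, not the load path

clct = Crawlr::Collector.new
gems = []

clct.visit("https://rubygems.org/releases/popular") do |collector|
  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
    link = node["href"]
    gems << ctx.resolve_url(link) if link # skip nodes without an href
  end
end

puts "Found #{gems.size} gems"
gems.each { |g| puts g }
```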
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Aristotelis Rapai
@@ -166,6 +166,9 @@ files:
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/basic_visit.rb
+- examples/nested_visit.rb
+- examples/paginated_visit.rb
 - lib/crawlr.rb
 - lib/crawlr/callbacks.rb
 - lib/crawlr/collector.rb
@@ -178,7 +181,7 @@ files:
 - lib/crawlr/robots.rb
 - lib/crawlr/version.rb
 - lib/crawlr/visits.rb
-- sig/crawlr.rbs
+- rubygems.rb
 homepage: https://github.com/aristorap/crawlr
 licenses:
 - MIT
data/sig/crawlr.rbs
DELETED