crawlr 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 7962445e19428525184ea2fb8dfcb76612c3143fd2764be3dd376c9bcb65ae69
- data.tar.gz: b784eb2b27f6b170ac67c4a9c9113fc7e7ed4fb443fcd3145d3be5e24ab1194e
+ metadata.gz: fe3c5b1d19db6a4fda1bd66a9e2c62a1b2bdb80c361fe06e84023a6bf3f024bb
+ data.tar.gz: 6f26c3350a3cbf7e967899d8f5490312d83caa8ad9223cefcb5ad8423bec1e97
  SHA512:
- metadata.gz: bd8296ebd6bdc77bbf7a4200d9f211721a137bb74073e76fb8eae44007e05bcb894abdb5c4cb92efe28af0bd8c14b9d734a5d33420ffada3c7debcd4794027e3
- data.tar.gz: ba6608820012fada66dbbf1026e7d52a8aa29290a714e658a0a4d904b6f6c7b685bc287353cd9e1319561b4ca990fd6d68542f3747727388130eb390060d0b33
+ metadata.gz: 4c58780044aa20341737127823958728deb6b3574c781cb804db45e5c81971678058f779657b379bfa566c0608d273c72ec8331e2226885f71e3d476af1c0076
+ data.tar.gz: a094872a4ad346cae330a6daa894c6a49a72e7082f9279fa878dd14d09f7fdbccad5617433e683d15309eb4b1f14bcc05aa59cd47f2f7a9c460a5b2728530ad0
data/CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
  ## [Unreleased]

+ ## [0.2.1] - 2025-09-30
+
+ - Fix paginated_visit to properly handle provided url queries (if present)
+ - Update paginated_visit batch size parameter to respect max_depth (if max_depth set > 0)
+
+ ## [0.2.0] - 2025-09-30
+
+ - Tidied up documentation and inline comments
+ - Fixed small bugs caused by typos
+ - Added a few examples demonstrating usage
+
  ## [0.1.0] - 2025-09-29

  - Initial release
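
The 0.2.1 entries above are easier to picture with a concrete call. The sketch below is illustrative only: the feed URL and CSS selector are invented, and the pagination parameter name (`page`) is assumed from the README's "?page=1, ?page=2" note. It shows the two behaviours the entries describe — a start URL that already carries a query string keeps it, and the page batch is only capped by `max_depth` when `max_depth` is greater than 0.

```ruby
require "crawlr"

collector = Crawlr::Collector.new(
  max_depth: 3,       # batch size is capped by max_depth only because it is > 0 here
  max_parallelism: 5
)

# The start URL already has a query string; as of 0.2.1 the existing
# `sort=new` parameter is preserved and only the page parameter is appended,
# e.g. https://example.com/articles?sort=new&page=2, ...&page=3, and so on.
collector.paginated_visit("https://example.com/articles?sort=new") do |c|
  c.on_html(:css, ".article .title") do |node, _ctx|
    puts node.text.strip
  end
end
```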
data/README.md CHANGED
@@ -3,7 +3,7 @@
  A powerful, async Ruby web scraping framework designed for respectful and efficient data extraction. Built with modern Ruby practices, crawlr provides a clean API for scraping websites while respecting robots.txt, managing cookies, rotating proxies, and handling complex scraping scenarios.

  [![Gem Version](https://badge.fury.io/rb/crawlr.svg)](https://badge.fury.io/rb/crawlr)
- [![Ruby](https://github.com/yourusername/crawlr/actions/workflows/ruby.yml/badge.svg)](https://github.com/yourusername/crawlr/actions/workflows/ruby.yml)
+ [![Ruby](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml/badge.svg)](https://github.com/aristorap/crawlr/actions/workflows/ruby.yml)

  ## ✨ Features

@@ -71,20 +71,20 @@ collector.visit('https://example.com')

  ```ruby
  collector = Crawlr::Collector.new
-
+ products = []
  # Extract product information
- collector.on_html(:css, '.product') do |product, ctx|
- data = {
- name: product.css('.product-name').text.strip,
- price: product.css('.price').text.strip,
- image: product.css('img')&.first&.[]('src')
- }
-
- ctx.products ||= []
- ctx.products << data
+ collector.visit('https://shop.example.com/products') do |c|
+ c.on_html(:css, '.product') do |product, ctx|
+ data = {
+ name: product.css('.product-name').text.strip,
+ price: product.css('.price').text.strip,
+ image: product.css('img')&.first&.[]('src')
+ }
+
+ products << data
+ end
  end
-
- collector.visit('https://shop.example.com/products')
+ # do something with data
  ```

  ### API Scraping with Pagination
@@ -94,14 +94,16 @@ collector = Crawlr::Collector.new(
  max_parallelism: 10,
  timeout: 30
  )
+ mu = Mutex.new
+ items = Array.new

- collector.on_xml(:css, 'item') do |item, ctx|
- ctx.items ||= []
- ctx.items << {
+ collector.on_xml(:css, 'item') do |item, _ctx|
+ data = {
  id: item.css('id').text,
  title: item.css('title').text,
  published: item.css('published').text
  }
+ mu.synchronize { items << data }
  end

  # Automatically handles pagination with ?page=1, ?page=2, etc.
@@ -168,13 +170,11 @@ end
  # Process responses after each request
  collector.hook(:after_visit) do |url, response|
  puts "Got #{response.status} from #{url}"
- log_response_time(url, response.headers['X-Response-Time'])
  end

  # Handle errors gracefully
  collector.hook(:on_error) do |url, error|
  puts "Failed to scrape #{url}: #{error.message}"
- error_tracker.record(url, error)
  end
  ```

@@ -182,15 +182,11 @@ end

  ```ruby
  collector.on_html(:xpath, '//div[@class="content"]//p[position() <= 3]') do |paragraph, ctx|
- # Extract first 3 paragraphs from content divs
- ctx.content_paragraphs ||= []
- ctx.content_paragraphs << paragraph.text.strip
+ # Do stuff
  end

  collector.on_xml(:xpath, '//item[price > 100]/title') do |title, ctx|
- # Extract titles of expensive items from XML feeds
- ctx.expensive_items ||= []
- ctx.expensive_items << title.text
+ # Do stuff
  end
  ```

@@ -199,16 +195,12 @@ end
  ```ruby
  collector = Crawlr::Collector.new(allow_cookies: true)

- # Login first
- collector.on_html(:css, 'form[action="/login"]') do |form, ctx|
- # Cookies from login will be automatically used in subsequent requests
- end
-
+ # First visit will set cookies for following requests
  collector.visit('https://site.com/login')
  collector.visit('https://site.com/protected-content') # Uses login cookies
  ```

- ### Monitoring and Statistics
+ ### Stats

  ```ruby
  collector = Crawlr::Collector.new
@@ -297,7 +289,7 @@ yard server

  ## 🤝 Contributing

- 1. Fork it (https://github.com/yourusername/crawlr/fork)
+ 1. Fork it (https://github.com/aristorap/crawlr/fork)
  2. Create your feature branch (`git checkout -b feature/amazing-feature`)
  3. Make your changes with tests
  4. Ensure all tests pass (`bundle exec rspec`)
@@ -313,13 +305,11 @@ This gem is available as open source under the terms of the [MIT License](https:

  - Built with [Nokogiri](https://nokogiri.org/) for HTML/XML parsing
  - Uses [Async](https://github.com/socketry/async) for high-performance concurrency
- - Inspired by Python's Scrapy framework and modern Ruby practices
-
+ - Inspired by Golang's [Colly](https://go-colly.org) framework and modern Ruby practices
  ## 📞 Support

- - 📖 [Documentation](https://yourusername.github.io/crawlr)
- - 🐛 [Issue Tracker](https://github.com/yourusername/crawlr/issues)
- - 💬 [Discussions](https://github.com/yourusername/crawlr/discussions)
+ - 📖 [Documentation TBD](https://aristorap.github.io/crawlr)
+ - 🐛 [Issue Tracker](https://github.com/aristorap/crawlr/issues)

  ---

data/examples/basic_visit.rb ADDED
@@ -0,0 +1,19 @@
+ require_relative "../lib/crawlr"
+
+ # Create a new collector instance
+ clct = Crawlr::Collector.new
+ gems = []
+
+ # Visit the RubyGems popular releases page
+ clct.visit("https://rubygems.org/releases/popular") do |collector|
+ # Extract gem links using a CSS selector
+ # The callback will be executed for each matched node
+ collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+ link = node["href"]
+ gems << ctx.resolve_url(link) if link
+ end
+ end
+
+ # Print results
+ puts "Found #{gems.size} gems"
+ gems.each { |g| puts g }
data/examples/nested_visit.rb ADDED
@@ -0,0 +1,36 @@
+ require_relative "../lib/crawlr"
+
+ # Create a new collector instance
+ clct = Crawlr::Collector.new(
+ max_depth: 2, # Limit unbounded crawls
+ random_delay: 1, # Maximum random delay between requests
+ max_parallelism: 5 # Maximum concurrent requests
+ )
+
+ # Create a map to store gem metadata
+ # Use a thread-safe map due to parallel processing
+ gems_meta = Concurrent::Map.new
+
+ # Visit the RubyGems popular releases page
+ clct.visit("https://rubygems.org/releases/popular") do |c|
+ # Grab main container
+ c.on_html(:css, ".main--interior") do |node, ctx|
+ # Grab all gem links
+ gems = []
+ node.css("a.gems__gem").each do |a|
+ gems << ctx.resolve_url(a["href"])
+ end
+ # Visit each gem page
+ c.visit(gems, ctx.increment_depth) # Use context helper method to set depth for accurate tracking
+ end
+
+ # This callback will be matched on the individual gem pages
+ c.on_html(:css, "h2.gem__downloads__heading:nth-child(1) > span:nth-child(1)") do |node, ctx|
+ gems_meta[ctx.page_url] = node.text
+ end
+ end
+
+ # Print results
+ puts "Found #{gems_meta.size} gems"
+
+ gems_meta.each_pair { |k, v| puts "#{k} => #{v}" }
data/examples/paginated_visit.rb ADDED
@@ -0,0 +1,27 @@
+ require_relative "../lib/crawlr"
+
+ # Create a new collector instance
+ clct = Crawlr::Collector.new(
+ max_depth: 2,
+ random_delay: 1,
+ max_parallelism: 5
+ )
+ mu = Mutex.new
+ gems = []
+
+ # Visit the RubyGems popular releases page with pagination
+ # Set max depth in collector config to limit crawl depth
+ clct.paginated_visit("https://rubygems.org/releases/popular") do |collector|
+ # Extract gem links using a CSS selector
+ collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
+ link = node["href"]
+ if link
+ full_link = ctx.resolve_url(link) # Resolve relative URL using context helper method
+ mu.synchronize { gems << full_link }
+ end
+ end
+ end
+
+ # Print results
+ puts "Found #{gems.size} gems"
+ gems.each { |g| puts g }
data/lib/crawlr/collector.rb CHANGED
@@ -53,7 +53,6 @@ module Crawlr
  # puts "Failed to scrape #{url}: #{error.message}"
  # end
  #
- # @author [Your Name]
  # @since 0.1.0
  class Collector
  # @return [Crawlr::Config] The configuration object for this collector
@@ -214,7 +213,7 @@ module Crawlr
  return unless valid_url?(url)

  yield self if block_given?
- fetch_robots(url) unless @config.ignore_robots_txt
+ fetch_robots_txt(url) unless @config.ignore_robots_txt
  return unless can_visit?(url, @config.headers)

  pages_to_visit = build_initial_pages(url, query, batch_size, start_page)
@@ -571,17 +570,19 @@ module Crawlr
  end

  def build_initial_pages(url, query, batch_size, start_page)
- max_batch = [@config.max_depth, batch_size].min
+ uri = URI.parse(url)
+ max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
+
  if start_page == 1
- [url] + (max_batch - 1).times.map { |i| "#{url}?#{query}=#{i + 2}" }
+ [url] + (max_batch - 1).times.map { |i| build_page_url(uri, query, i + 2) }
  else
- max_batch.times.map { |i| "#{url}?#{query}=#{i + start_page}" }
+ max_batch.times.map { |i| build_page_url(uri, query, i + start_page) }
  end
  end

  def process_page_batches(pages, current_depth, batch_size, query)
  scheduled_depth = current_depth
- max_batch = [@config.max_depth, batch_size].min
+ max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min

  loop do
  break if reached_max_depth?(scheduled_depth)
@@ -626,7 +627,17 @@ module Crawlr
  end

  def generate_next_pages(batch, scheduled_depth, max_batch, query)
- max_batch.times.map { |i| "#{batch.first}?#{query}=#{i + scheduled_depth + 1}" }
+ uri = URI.parse(batch.first)
+ (0...max_batch).map { |i| build_page_url(uri, query, i + scheduled_depth + 1) }
+ end
+
+ def build_page_url(uri, query, value)
+ new_uri = uri.dup
+ params = URI.decode_www_form(new_uri.query || "")
+ params.reject! { |k, _| k == query }
+ params << [query, value]
+ new_uri.query = URI.encode_www_form(params)
+ new_uri.to_s
  end
  end
  end
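
For reference, the new page-URL helper above can be exercised on its own. The snippet below restates its body outside the class (the sample URL is invented; the logic mirrors the `build_page_url` lines in the hunk) to show that a pre-existing query string survives pagination, which is the 0.2.1 fix noted in the changelog.

```ruby
require "uri"

# Standalone restatement of the build_page_url logic added above, for illustration only.
# In the gem this is a private method on Crawlr::Collector.
def build_page_url(uri, query, value)
  new_uri = uri.dup
  params = URI.decode_www_form(new_uri.query || "")
  params.reject! { |k, _| k == query }  # drop any existing value for the page parameter
  params << [query, value]              # append the new page number
  new_uri.query = URI.encode_www_form(params)
  new_uri.to_s
end

uri = URI.parse("https://example.com/feed?sort=new&page=1")
puts build_page_url(uri, "page", 2)
# => https://example.com/feed?sort=new&page=2  (the existing sort=new parameter is preserved)
```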
data/lib/crawlr/config.rb CHANGED
@@ -32,7 +32,6 @@ module Crawlr
  # max_parallelism: 10
  # )
  #
- # @author [Your Name]
  # @since 0.1.0
  class Config
  # @return [Integer] HTTP request timeout in seconds
@@ -35,7 +35,6 @@ module Crawlr
  #
  # domains.allowed?('https://any-site.com') #=> true
  #
- # @author [Your Name]
  # @since 0.1.0
  class Domains
  # Initializes a new Domains instance with the given configuration
@@ -45,7 +45,6 @@ module Crawlr
  # headers['X-Request-ID'] = SecureRandom.uuid
  # end
  #
- # @author [Your Name]
  # @since 0.1.0
  class HTTPInterface
  # Simplified HTTP response structure for internal use
data/lib/crawlr/parser.rb CHANGED
@@ -64,7 +64,6 @@ module Crawlr
  # # HTML content parsed once, all callbacks executed on same document
  # Crawlr::Parser.apply_callbacks(content: html, callbacks: callbacks, context: ctx)
  #
- # @author [Your Name]
  # @since 0.1.0
  module Parser
  # Applies registered callbacks to parsed document content
data/lib/crawlr/robots.rb CHANGED
@@ -58,7 +58,6 @@ module Crawlr
  # robots.allowed?('https://example.com/temp/secret.txt', 'Bot') #=> false
  # robots.allowed?('https://example.com/temp/public/file.txt', 'Bot') #=> true
  #
- # @author [Your Name]
  # @since 0.1.0
  class Robots
  # Represents a robots.txt rule for a specific user-agent
data/lib/crawlr/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Crawlr
- VERSION = "0.1.0"
+ VERSION = "0.2.1"
  end
data/lib/crawlr/visits.rb CHANGED
@@ -56,7 +56,6 @@ module Crawlr
  #
  # threads.each(&:join)
  #
- # @author [Your Name]
  # @since 0.1.0
  class Visits
  # Initializes a new Visits tracker with the given configuration
data/lib/crawlr.rb CHANGED
@@ -1,9 +1,11 @@
  # frozen_string_literal: true

  require_relative "crawlr/version"
+ require_relative "crawlr/collector"
+
+ require "logger"

  # A Ruby scraping framework for parsing HTML and XML documents
- # @author [Your Name]
  # @since 0.1.0
  module Crawlr
  class Error < StandardError; end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: crawlr
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Aristotelis Rapai
  bindir: exe
  cert_chain: []
- date: 2025-09-29 00:00:00.000000000 Z
+ date: 1980-01-02 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: async
@@ -166,6 +166,9 @@ files:
  - LICENSE.txt
  - README.md
  - Rakefile
+ - examples/basic_visit.rb
+ - examples/nested_visit.rb
+ - examples/paginated_visit.rb
  - lib/crawlr.rb
  - lib/crawlr/callbacks.rb
  - lib/crawlr/collector.rb
@@ -178,7 +181,6 @@ files:
  - lib/crawlr/robots.rb
  - lib/crawlr/version.rb
  - lib/crawlr/visits.rb
- - sig/crawlr.rbs
  homepage: https://github.com/aristorap/crawlr
  licenses:
  - MIT
@@ -203,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.6.3
+ rubygems_version: 3.7.2
  specification_version: 4
  summary: A powerful, async Ruby web scraping framework
  test_files: []
data/sig/crawlr.rbs DELETED
@@ -1,4 +0,0 @@
- module Crawlr
- VERSION: String
- # See the writing guide of rbs: https://github.com/ruby/rbs#guides
- end