crawlr 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f4a2b21633eead87fe3b879552db225aecc2975cad779fd86413f2531cd3f079
-  data.tar.gz: 9b20eb81f931b0f514609e9b699a85a8f25d2e64e0fb7f8f6845343d4872a893
+  metadata.gz: fe3c5b1d19db6a4fda1bd66a9e2c62a1b2bdb80c361fe06e84023a6bf3f024bb
+  data.tar.gz: 6f26c3350a3cbf7e967899d8f5490312d83caa8ad9223cefcb5ad8423bec1e97
 SHA512:
-  metadata.gz: 3e5d343dd502ed23343ad0e6bfbe9fbe6b8696954e171a181ae385c8c679d60cfeeeec4d65cdbb9e841731664ae27fa4034405c8265ab12f967470e91321208a
-  data.tar.gz: ecb186a9d6e9a5f34a4e429b1c3971a1eaf7f708537d698557fab3272010c0f4fb953362b64ed498556867a0938387c70cc24f0b7f41563026be7f1157f373a1
+  metadata.gz: 4c58780044aa20341737127823958728deb6b3574c781cb804db45e5c81971678058f779657b379bfa566c0608d273c72ec8331e2226885f71e3d476af1c0076
+  data.tar.gz: a094872a4ad346cae330a6daa894c6a49a72e7082f9279fa878dd14d09f7fdbccad5617433e683d15309eb4b1f14bcc05aa59cd47f2f7a9c460a5b2728530ad0
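These digests cover the two archives packed inside the `.gem` file (`metadata.gz` and `data.tar.gz`). As an illustration only, here is a minimal verification sketch using Ruby's standard library; it assumes you have unpacked the downloaded `crawlr-0.2.1.gem` with `tar -xf` and gunzipped `checksums.yaml.gz` into the current directory.

```ruby
# Minimal sketch: compare the published SHA256 digests against the
# archives extracted from the downloaded .gem package.
require "digest"
require "yaml"

checksums = YAML.safe_load(File.read("checksums.yaml"))

%w[metadata.gz data.tar.gz].each do |file|
  actual   = Digest::SHA256.file(file).hexdigest
  expected = checksums["SHA256"][file]
  puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```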
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
 ## [Unreleased]
 
+## [0.2.1] - 2025-09-30
+
+- Fix paginated_visit to properly handle provided url queries (if present)
+- Update paginated_visit batch size parameter to respect max_depth (if max_depth set > 0)
+
 ## [0.2.0] - 2025-09-30
 
 - Tidied up documentation and inline comments
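Both entries correspond to the pagination helpers changed in the library hunk below. The second one changes how the effective batch size is derived: a `max_depth` of 0 is treated as "no limit", so the requested batch size is used unchanged, while a positive `max_depth` caps it. A standalone sketch of that rule, with a hypothetical method name and illustrative values:

```ruby
# Effective batch size rule introduced in 0.2.1:
# max_depth == 0 means "unlimited", so batch_size is kept as-is;
# otherwise the smaller of the two values wins.
def effective_batch(max_depth, batch_size)
  max_depth.zero? ? batch_size : [max_depth, batch_size].min
end

effective_batch(0, 10)  # => 10 (no depth limit, keep requested batch size)
effective_batch(3, 10)  # => 3  (depth cap limits the batch)
effective_batch(25, 10) # => 10
```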
@@ -570,17 +570,19 @@ module Crawlr
     end
 
     def build_initial_pages(url, query, batch_size, start_page)
-      max_batch = [@config.max_depth, batch_size].min
+      uri = URI.parse(url)
+      max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
+
       if start_page == 1
-        [url] + (max_batch - 1).times.map { |i| "#{url}?#{query}=#{i + 2}" }
+        [url] + (max_batch - 1).times.map { |i| build_page_url(uri, query, i + 2) }
       else
-        max_batch.times.map { |i| "#{url}?#{query}=#{i + start_page}" }
+        max_batch.times.map { |i| build_page_url(uri, query, i + start_page) }
       end
     end
 
     def process_page_batches(pages, current_depth, batch_size, query)
       scheduled_depth = current_depth
-      max_batch = [@config.max_depth, batch_size].min
+      max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
 
       loop do
         break if reached_max_depth?(scheduled_depth)
@@ -625,7 +627,17 @@ module Crawlr
     end
 
     def generate_next_pages(batch, scheduled_depth, max_batch, query)
-      max_batch.times.map { |i| "#{batch.first}?#{query}=#{i + scheduled_depth + 1}" }
+      uri = URI.parse(batch.first)
+      (0...max_batch).map { |i| build_page_url(uri, query, i + scheduled_depth + 1) }
+    end
+
+    def build_page_url(uri, query, value)
+      new_uri = uri.dup
+      params = URI.decode_www_form(new_uri.query || "")
+      params.reject! { |k, _| k == query }
+      params << [query, value]
+      new_uri.query = URI.encode_www_form(params)
+      new_uri.to_s
+    end
   end
 end
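The first changelog entry corresponds to the new `build_page_url` helper above: instead of blindly appending `?page=N`, the URL is parsed, any existing value for the pagination parameter is dropped, and the query string is re-encoded, so other parameters already present in the seed URL survive. A self-contained sketch of the same technique; the example URL and parameter name are illustrative, not taken from the gem:

```ruby
require "uri"

# Rebuild the query string, keeping existing parameters and replacing
# only the pagination key, as the 0.2.1 helper does.
def build_page_url(uri, query, value)
  new_uri = uri.dup
  params = URI.decode_www_form(new_uri.query || "")
  params.reject! { |k, _| k == query }
  params << [query, value]
  new_uri.query = URI.encode_www_form(params)
  new_uri.to_s
end

seed = URI.parse("https://example.com/articles?sort=newest&page=1")
puts build_page_url(seed, "page", 3)
# => https://example.com/articles?sort=newest&page=3
```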
data/lib/crawlr/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Crawlr
-  VERSION = "0.2.0"
+  VERSION = "0.2.1"
 end
metadata CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Aristotelis Rapai
 bindir: exe
 cert_chain: []
-date: 2025-09-29 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: async
@@ -181,7 +181,6 @@ files:
 - lib/crawlr/robots.rb
 - lib/crawlr/version.rb
 - lib/crawlr/visits.rb
-- rubygems.rb
 homepage: https://github.com/aristorap/crawlr
 licenses:
 - MIT
@@ -206,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.6.3
+rubygems_version: 3.7.2
 specification_version: 4
 summary: A powerful, async Ruby web scraping framework
 test_files: []
data/rubygems.rb DELETED
@@ -1,18 +0,0 @@
-require "lib/crawlr"
-
-clct = Crawlr::Collector.new
-gems = []
-
-clct.visit("https://rubygems.org/releases/popular") do |collector|
-  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
-    link = node["href"]
-    full_link = ctx.resolve_url(link) if link
-    gems << full_link
-  end
-end
-
-puts "Found #{gems.size} gems"
-
-gems.each do |gem|
-  puts gem
-end