crawlr 0.2.0 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/lib/crawlr/collector.rb +17 -5
- data/lib/crawlr/version.rb +1 -1
- metadata +3 -4
- data/rubygems.rb +0 -18
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fe3c5b1d19db6a4fda1bd66a9e2c62a1b2bdb80c361fe06e84023a6bf3f024bb
+  data.tar.gz: 6f26c3350a3cbf7e967899d8f5490312d83caa8ad9223cefcb5ad8423bec1e97
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4c58780044aa20341737127823958728deb6b3574c781cb804db45e5c81971678058f779657b379bfa566c0608d273c72ec8331e2226885f71e3d476af1c0076
+  data.tar.gz: a094872a4ad346cae330a6daa894c6a49a72e7082f9279fa878dd14d09f7fdbccad5617433e683d15309eb4b1f14bcc05aa59cd47f2f7a9c460a5b2728530ad0
```
data/CHANGELOG.md
CHANGED

```diff
@@ -1,5 +1,10 @@
 ## [Unreleased]
 
+## [0.2.1] - 2025-09-30
+
+- Fix paginated_visit to properly handle provided url queries (if present)
+- Update paginated_visit batch size parameter to respect max_depth (if max_depth set > 0)
+
 ## [0.2.0] - 2025-09-30
 
 - Tidied up documentation and inline comments
```
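The second 0.2.1 entry maps to a one-line change visible in the collector diff below: `max_batch` now falls back to the full `batch_size` when `max_depth` is 0, which appears to mean "no depth limit". A minimal sketch of the new clamping rule, with the surrounding Collector plumbing stripped away:

```ruby
# Sketch of the batch-size clamping rule introduced in 0.2.1.
# The pre-0.2.1 expression was [max_depth, batch_size].min, which
# collapsed the batch to 0 whenever max_depth was 0 (unlimited depth).
def max_batch(max_depth, batch_size)
  max_depth.zero? ? batch_size : [max_depth, batch_size].min
end

max_batch(0, 10) # => 10  full batch when depth is unlimited
max_batch(3, 10) # => 3   depth cap wins
max_batch(8, 5)  # => 5   batch size wins
```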
data/lib/crawlr/collector.rb
CHANGED

```diff
@@ -570,17 +570,19 @@ module Crawlr
     end
 
     def build_initial_pages(url, query, batch_size, start_page)
-
+      uri = URI.parse(url)
+      max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
+
       if start_page == 1
-        [url] + (max_batch - 1).times.map { |i|
+        [url] + (max_batch - 1).times.map { |i| build_page_url(uri, query, i + 2) }
       else
-        max_batch.times.map { |i|
+        max_batch.times.map { |i| build_page_url(uri, query, i + start_page) }
       end
     end
 
     def process_page_batches(pages, current_depth, batch_size, query)
       scheduled_depth = current_depth
-      max_batch = [@config.max_depth, batch_size].min
+      max_batch = @config.max_depth.zero? ? batch_size : [@config.max_depth, batch_size].min
 
       loop do
         break if reached_max_depth?(scheduled_depth)
@@ -625,7 +627,17 @@ module Crawlr
     end
 
     def generate_next_pages(batch, scheduled_depth, max_batch, query)
-
+      uri = URI.parse(batch.first)
+      (0...max_batch).map { |i| build_page_url(uri, query, i + scheduled_depth + 1) }
+    end
+
+    def build_page_url(uri, query, value)
+      new_uri = uri.dup
+      params = URI.decode_www_form(new_uri.query || "")
+      params.reject! { |k, _| k == query }
+      params << [query, value]
+      new_uri.query = URI.encode_www_form(params)
+      new_uri.to_s
+    end
   end
 end
```
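These hunks route every generated page URL through the new `build_page_url` helper, which strips any existing occurrence of the pagination parameter before appending the requested one; that is the "handle provided url queries" fix from the changelog. Below is a runnable sketch of the helper, with the same body as in the hunk above; the example URL and the `page` parameter name are made up for illustration:

```ruby
require "uri"

# Standalone copy of the build_page_url helper added in this diff,
# shown outside the Collector class. `query` is the name of the
# pagination parameter (e.g. "page").
def build_page_url(uri, query, value)
  new_uri = uri.dup
  params = URI.decode_www_form(new_uri.query || "")
  params.reject! { |k, _| k == query } # drop a pre-existing page param
  params << [query, value]             # append the requested page number
  new_uri.query = URI.encode_www_form(params)
  new_uri.to_s
end

uri = URI.parse("https://example.com/search?q=ruby&page=1")
build_page_url(uri, "page", 3)
# => "https://example.com/search?q=ruby&page=3"
```

Other query parameters (`q=ruby` here) survive intact, and a `page=1` already present in the seed URL no longer produces a duplicated `page` parameter.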
data/lib/crawlr/version.rb
CHANGED

(diff body not captured in this view; per the summary above it is the one-line version bump from 0.2.0 to 0.2.1)
metadata
CHANGED

```diff
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: crawlr
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Aristotelis Rapai
 bindir: exe
 cert_chain: []
-date:
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: async
@@ -181,7 +181,6 @@ files:
 - lib/crawlr/robots.rb
 - lib/crawlr/version.rb
 - lib/crawlr/visits.rb
-- rubygems.rb
 homepage: https://github.com/aristorap/crawlr
 licenses:
 - MIT
@@ -206,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.7.2
 specification_version: 4
 summary: A powerful, async Ruby web scraping framework
 test_files: []
```
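The `date: 1980-01-02` value is not a packaging mistake: recent RubyGems versions stamp gems with this fixed placeholder date (unless SOURCE_DATE_EPOCH is set) so that rebuilding the same gem produces reproducible output.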
data/rubygems.rb
DELETED

```diff
@@ -1,18 +0,0 @@
-require "lib/crawlr"
-
-clct = Crawlr::Collector.new
-gems = []
-
-clct.visit("https://rubygems.org/releases/popular") do |collector|
-  collector.on_html(:css, ".main--interior a.gems__gem") do |node, ctx|
-    link = node["href"]
-    full_link = ctx.resolve_url(link) if link
-    gems << full_link
-  end
-end
-
-puts "Found #{gems.size} gems"
-
-gems.each do |gem|
-  puts gem
-end
```