scraper_utils 0.14.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/docs/misc_utilities.md +0 -6
- data/lib/scraper_utils/host_throttler.rb +4 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +4 -5
- data/lib/scraper_utils/misc_utils.rb +4 -0
- data/lib/scraper_utils/spec_support.rb +70 -1
- data/lib/scraper_utils/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
|
|
4
|
+
data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
|
|
7
|
+
data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.15.0 - 2026-03-05
|
|
4
|
+
|
|
5
|
+
* Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
|
|
6
|
+
* Fix pre_connect_hook hostname extraction to use `request['Host']` header
|
|
7
|
+
|
|
3
8
|
## 0.14.1 - 2026-03-04
|
|
4
9
|
|
|
5
10
|
* Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
|
data/docs/misc_utilities.md
CHANGED
|
@@ -18,11 +18,5 @@ The throttle automatically:
|
|
|
18
18
|
- Pauses before next request based on previous timing
|
|
19
19
|
- Caps pause at 120s maximum
|
|
20
20
|
|
|
21
|
-
Override the next pause duration manually if needed:
|
|
22
|
-
|
|
23
|
-
```ruby
|
|
24
|
-
ScraperUtils::MiscUtils.pause_duration = 2.0
|
|
25
|
-
```
|
|
26
|
-
|
|
27
21
|
**Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
|
|
28
22
|
each request is made and thus does not need to be wrapped with the helper.
|
|
data/lib/scraper_utils/host_throttler.rb
CHANGED
|
@@ -25,6 +25,10 @@ module ScraperUtils
|
|
|
25
25
|
@request_started_at = {} # hostname => Time
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
def will_pause_till(hostname)
|
|
29
|
+
@next_request_at[hostname]
|
|
30
|
+
end
|
|
31
|
+
|
|
28
32
|
# Sleep until this host's throttle window has elapsed.
|
|
29
33
|
# Records when the request actually started.
|
|
30
34
|
# @param hostname [String]
|
|
data/lib/scraper_utils/mechanize_utils/agent_config.rb
CHANGED
|
@@ -177,7 +177,7 @@ module ScraperUtils
|
|
|
177
177
|
end
|
|
178
178
|
|
|
179
179
|
def pre_connect_hook(_agent, request)
|
|
180
|
-
hostname = request.respond_to?(:
|
|
180
|
+
hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
|
|
181
181
|
@throttler.before_request(hostname)
|
|
182
182
|
if DebugUtils.verbose?
|
|
183
183
|
ScraperUtils::LogUtils.log(
|
|
@@ -191,7 +191,8 @@ module ScraperUtils
|
|
|
191
191
|
|
|
192
192
|
status = response.respond_to?(:code) ? response.code.to_i : nil
|
|
193
193
|
overloaded = [429, 500, 503].include?(status)
|
|
194
|
-
|
|
194
|
+
hostname = uri.host || 'unknown'
|
|
195
|
+
@throttler.after_request(hostname, overloaded: overloaded)
|
|
195
196
|
|
|
196
197
|
if DebugUtils.basic?
|
|
197
198
|
ScraperUtils::LogUtils.log(
|
|
@@ -206,9 +207,7 @@ module ScraperUtils
|
|
|
206
207
|
# Mechanize errors often carry the URI in the message; fall back to 'unknown'
|
|
207
208
|
hostname = if error.respond_to?(:uri)
|
|
208
209
|
error.uri.host
|
|
209
|
-
|
|
210
|
-
'unknown'
|
|
211
|
-
end
|
|
210
|
+
end || 'unknown'
|
|
212
211
|
@throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
|
|
213
212
|
end
|
|
214
213
|
|
|
data/lib/scraper_utils/spec_support.rb
CHANGED
|
@@ -62,6 +62,13 @@ module ScraperUtils
|
|
|
62
62
|
'certificate', 'approval', 'consent', 'permit'
|
|
63
63
|
].freeze
|
|
64
64
|
|
|
65
|
+
|
|
66
|
+
def self.fetch_url_head(url)
|
|
67
|
+
agent = Mechanize.new
|
|
68
|
+
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
69
|
+
agent.head(url)
|
|
70
|
+
end
|
|
71
|
+
|
|
65
72
|
def self.fetch_url_with_redirects(url)
|
|
66
73
|
agent = Mechanize.new
|
|
67
74
|
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
@@ -104,7 +111,16 @@ module ScraperUtils
|
|
|
104
111
|
geocodable = results
|
|
105
112
|
.map { |record| record["address"] }
|
|
106
113
|
.uniq
|
|
107
|
-
.count
|
|
114
|
+
.count do |text|
|
|
115
|
+
ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
|
|
116
|
+
if !ok && DebugUtils.verbose?
|
|
117
|
+
ScraperUtils::LogUtils.log(
|
|
118
|
+
"Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
|
|
119
|
+
)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
ok
|
|
123
|
+
end
|
|
108
124
|
puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
|
|
109
125
|
"(#{(100.0 * geocodable / results.count).round(1)}%)"
|
|
110
126
|
expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
|
|
@@ -223,6 +239,22 @@ module ScraperUtils
|
|
|
223
239
|
end
|
|
224
240
|
end
|
|
225
241
|
|
|
242
|
+
# Validates that info_urls are present (respond to a HEAD request with a 200 to 299 status)
|
|
243
|
+
# @param results [Array<Hash>] The results from scraping an authority
|
|
244
|
+
# @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
|
|
245
|
+
# @param variation [Integer] The variation allowed in addition to percentage (default:3)
|
|
246
|
+
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
247
|
+
# @raise RuntimeError if insufficient detail checks pass
|
|
248
|
+
def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
|
|
249
|
+
if defined?(VCR)
|
|
250
|
+
VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
|
|
251
|
+
check_info_url_is_present(results, percentage, variation, &block)
|
|
252
|
+
end
|
|
253
|
+
else
|
|
254
|
+
check_info_url_is_present(results, percentage, variation, &block)
|
|
255
|
+
end
|
|
256
|
+
end
|
|
257
|
+
|
|
226
258
|
# Validates that info_urls have expected details (unique URLs with content validation)
|
|
227
259
|
# @param results [Array<Hash>] The results from scraping an authority
|
|
228
260
|
# @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
|
|
@@ -281,6 +313,43 @@ module ScraperUtils
|
|
|
281
313
|
|
|
282
314
|
private
|
|
283
315
|
|
|
316
|
+
def self.check_info_url_is_present(results, percentage, variation, &block)
|
|
317
|
+
count = 0
|
|
318
|
+
failed = 0
|
|
319
|
+
fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
|
|
320
|
+
|
|
321
|
+
fib_indices.each do |index|
|
|
322
|
+
record = results[index]
|
|
323
|
+
info_url = record["info_url"]
|
|
324
|
+
puts "Checking info_url[#{index}]: #{info_url} is present..."
|
|
325
|
+
|
|
326
|
+
begin
|
|
327
|
+
page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
|
|
328
|
+
status = page.code.to_i
|
|
329
|
+
rescue Mechanize::ResponseCodeError => e
|
|
330
|
+
status = e.response_code.to_i
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
if [403, 429].include?(status)
|
|
334
|
+
puts " Bot protection detected - skipping"
|
|
335
|
+
next
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
count += 1
|
|
339
|
+
if status.between?(200, 299)
|
|
340
|
+
puts " OK: #{status}" if ENV['DEBUG']
|
|
341
|
+
else
|
|
342
|
+
failed += 1
|
|
343
|
+
puts " Failed: #{status}"
|
|
344
|
+
min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
|
|
345
|
+
passed = count - failed
|
|
346
|
+
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
|
|
351
|
+
end
|
|
352
|
+
|
|
284
353
|
def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
|
|
285
354
|
count = 0
|
|
286
355
|
failed = 0
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.14.1
|
|
4
|
+
version: 0.15.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
@@ -114,7 +114,7 @@ metadata:
|
|
|
114
114
|
allowed_push_host: https://rubygems.org
|
|
115
115
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
116
116
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
117
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
|
|
117
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
|
|
118
118
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
119
119
|
rubygems_mfa_required: 'true'
|
|
120
120
|
post_install_message:
|