scraper_utils 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 03b44a667992331d6e36bb6eca68afc286205846d7be06263694fed52b5e2d30
4
- data.tar.gz: 9f0dd276223f1b22dd688453e1769199cbda34efa5141d58e546a8ddcb85c795
3
+ metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
4
+ data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
5
5
  SHA512:
6
- metadata.gz: b42e0be0f9e42d9a83588cf7dcbb98ec079d01262340d2e6fef8ac7201c3d80faa645351631f60f767186721a58580f4f1e5e09c130a3a32aebb4f301dbfbdfc
7
- data.tar.gz: e3cec3345d0af13026259600a54e417efd0c36394f1bc22ecac1a25573551a3a2e51482b060ad1b72ed7ba4850d55bf9f8032321d1b8c1ae6eab581244e92410
6
+ metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
7
+ data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.15.0 - 2026-03-05
4
+
5
+ * Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
6
+ * Fix pre_connect_hook hostname extraction to use `request['Host']` header
7
+
3
8
  ## 0.14.1 - 2026-03-04
4
9
 
5
10
  * Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
@@ -18,11 +18,5 @@ The throttle automatically:
18
18
  - Pauses before next request based on previous timing
19
19
  - Caps pause at 120s maximum
20
20
 
21
- Override the next pause duration manually if needed:
22
-
23
- ```ruby
24
- ScraperUtils::MiscUtils.pause_duration = 2.0
25
- ```
26
-
27
21
  **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
28
22
  each request is made and thus does not need to be wrapped with the helper.
@@ -25,6 +25,10 @@ module ScraperUtils
25
25
  @request_started_at = {} # hostname => Time
26
26
  end
27
27
 
28
+ def will_pause_till(hostname)
29
+ @next_request_at[hostname]
30
+ end
31
+
28
32
  # Sleep until this host's throttle window has elapsed.
29
33
  # Records when the request actually started.
30
34
  # @param hostname [String]
@@ -177,7 +177,7 @@ module ScraperUtils
177
177
  end
178
178
 
179
179
  def pre_connect_hook(_agent, request)
180
- hostname = request.respond_to?(:uri) ? request.uri.host : 'unknown'
180
+ hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
181
181
  @throttler.before_request(hostname)
182
182
  if DebugUtils.verbose?
183
183
  ScraperUtils::LogUtils.log(
@@ -191,7 +191,8 @@ module ScraperUtils
191
191
 
192
192
  status = response.respond_to?(:code) ? response.code.to_i : nil
193
193
  overloaded = [429, 500, 503].include?(status)
194
- @throttler.after_request(uri.host, overloaded: overloaded)
194
+ hostname = uri.host || 'unknown'
195
+ @throttler.after_request(hostname, overloaded: overloaded)
195
196
 
196
197
  if DebugUtils.basic?
197
198
  ScraperUtils::LogUtils.log(
@@ -206,9 +207,7 @@ module ScraperUtils
206
207
  # Mechanize errors often carry the URI in the message; fall back to 'unknown'
207
208
  hostname = if error.respond_to?(:uri)
208
209
  error.uri.host
209
- else
210
- 'unknown'
211
- end
210
+ end || 'unknown'
212
211
  @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
213
212
  end
214
213
 
@@ -27,6 +27,10 @@ module ScraperUtils
27
27
  @throttler = nil
28
28
  end
29
29
 
30
+ def will_pause_till
31
+ throttler.will_pause_till(THROTTLE_HOSTNAME)
32
+ end
33
+
30
34
  private
31
35
 
32
36
  def throttler
@@ -62,6 +62,13 @@ module ScraperUtils
62
62
  'certificate', 'approval', 'consent', 'permit'
63
63
  ].freeze
64
64
 
65
+
66
+ def self.fetch_url_head(url)
67
+ agent = Mechanize.new
68
+ # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and re-get the url
69
+ agent.head(url)
70
+ end
71
+
65
72
  def self.fetch_url_with_redirects(url)
66
73
  agent = Mechanize.new
67
74
  # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
@@ -104,7 +111,16 @@ module ScraperUtils
104
111
  geocodable = results
105
112
  .map { |record| record["address"] }
106
113
  .uniq
107
- .count { |text| ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case }
114
+ .count do |text|
115
+ ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
116
+ if !ok && DebugUtils.verbose?
117
+ ScraperUtils::LogUtils.log(
118
+ "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
119
+ )
120
+ end
121
+
122
+ ok
123
+ end
108
124
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
109
125
  "(#{(100.0 * geocodable / results.count).round(1)}%)"
110
126
  expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
@@ -223,6 +239,22 @@ module ScraperUtils
223
239
  end
224
240
  end
225
241
 
242
+ # Validates that info_urls are present (respond to a HEAD request with a 200 to 299 status)
243
+ # @param results [Array<Hash>] The results from scraping an authority
244
+ # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
245
+ # @param variation [Integer] The variation allowed in addition to percentage (default:3)
246
+ # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
247
+ # @raise RuntimeError if insufficient detail checks pass
248
+ def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
249
+ if defined?(VCR)
250
+ VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
251
+ check_info_url_is_present(results, percentage, variation, &block)
252
+ end
253
+ else
254
+ check_info_url_is_present(results, percentage, variation, &block)
255
+ end
256
+ end
257
+
226
258
  # Validates that info_urls have expected details (unique URLs with content validation)
227
259
  # @param results [Array<Hash>] The results from scraping an authority
228
260
  # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
@@ -281,6 +313,43 @@ module ScraperUtils
281
313
 
282
314
  private
283
315
 
316
+ def self.check_info_url_is_present(results, percentage, variation, &block)
317
+ count = 0
318
+ failed = 0
319
+ fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
320
+
321
+ fib_indices.each do |index|
322
+ record = results[index]
323
+ info_url = record["info_url"]
324
+ puts "Checking info_url[#{index}]: #{info_url} is present..."
325
+
326
+ begin
327
+ page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
328
+ status = page.code.to_i
329
+ rescue Mechanize::ResponseCodeError => e
330
+ status = e.response_code.to_i
331
+ end
332
+
333
+ if [403, 429].include?(status)
334
+ puts " Bot protection detected - skipping"
335
+ next
336
+ end
337
+
338
+ count += 1
339
+ if status.between?(200, 299)
340
+ puts " OK: #{status}" if ENV['DEBUG']
341
+ else
342
+ failed += 1
343
+ puts " Failed: #{status}"
344
+ min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
345
+ passed = count - failed
346
+ raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
347
+ end
348
+ end
349
+
350
+ puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
351
+ end
352
+
284
353
  def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
285
354
  count = 0
286
355
  failed = 0
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.14.1"
4
+ VERSION = "0.15.0"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
@@ -114,7 +114,7 @@ metadata:
114
114
  allowed_push_host: https://rubygems.org
115
115
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
116
116
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
117
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
117
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
118
118
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
119
119
  rubygems_mfa_required: 'true'
120
120
  post_install_message: