scraper_utils 0.13.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3bce8cc5a624f9904ebf8bb35ccb5c5c6c831e28ed56f88d3baf3b8d19fbbd13
4
- data.tar.gz: 0a481566e846a4274796b0542fb64a805f486065ed08045724cea7bc3d46710d
3
+ metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
4
+ data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
5
5
  SHA512:
6
- metadata.gz: 231c167ffe232daacbc862b8c3dd2c0c71be6b8fc2ff061f4f36d88f2e2185a454eb0aa79653c7a99a2ed65c9857d961059456f8403af8c1ed39623cc8e2db6a
7
- data.tar.gz: f287f85cdd4cc11cf17c3e5d34d5493e2809f255f3a3544bc881e756f3379c897dd70dbba5ebf16b30837bb8612f42f704872e06c6bec1cad87845606fce6231
6
+ metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
7
+ data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33
data/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.15.0 - 2026-03-05
4
+
5
+ * Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
6
+ * Fix pre_connect_hook hostname extraction to use `request['Host']` header
7
+
8
+ ## 0.14.1 - 2026-03-04
9
+
10
+ * Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
11
+ `ScraperUtils::SpecSupport.geocodable?` to validate addresses that don't have postcodes nor capitalised suburb names
12
+ * Can pass `ignore_case: true` to relax the requirement for either postcode or uppercase suburb when you don't want to
13
+ pass known suburbs.
14
+ * Move Throttling to HostThrottler
15
+
3
16
  ## 0.13.1 - 2026-02-21
4
17
 
5
18
  * Added PaValidation that validates based
@@ -18,11 +18,5 @@ The throttle automatically:
18
18
  - Pauses before next request based on previous timing
19
19
  - Caps pause at 120s maximum
20
20
 
21
- Override the next pause duration manually if needed:
22
-
23
- ```ruby
24
- ScraperUtils::MiscUtils.pause_duration = 2.0
25
- ```
26
-
27
21
  **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
28
22
  each request is made and thus does not need to be wrapped with the helper.
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Tracks per-host next-allowed-request time so that time spent parsing
5
+ # and saving records counts toward the crawl delay rather than being
6
+ # added on top of it.
7
+ #
8
+ # Usage:
9
+ # throttler = HostThrottler.new(crawl_delay: 1.0, max_load: 50.0)
10
+ # throttler.before_request(hostname) # sleep until ready
11
+ # # ... make request ...
12
+ # throttler.after_request(hostname) # record timing, schedule next slot
13
+ # throttler.after_request(hostname, overloaded: true) # double delay + 5s
14
+ class HostThrottler
15
+ MAX_DELAY = 120.0
16
+
17
+ # @param crawl_delay [Float] minimum seconds between requests per host
18
+ # @param max_load [Float] target server load percentage (10..100);
19
+ # 50 means response_time == pause_time
20
+ def initialize(crawl_delay: 0.0, max_load: nil)
21
+ @crawl_delay = crawl_delay.to_f
22
+ # Clamp between 10 (delay 9x response) and 100 (no extra delay)
23
+ @max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
24
+ @next_request_at = {} # hostname => Time
25
+ @request_started_at = {} # hostname => Time
26
+ end
27
+
28
+ def will_pause_till(hostname)
29
+ @next_request_at[hostname]
30
+ end
31
+
32
+ # Sleep until this host's throttle window has elapsed.
33
+ # Records when the request actually started.
34
+ # @param hostname [String]
35
+ # @return [void]
36
+ def before_request(hostname)
37
+ target = @next_request_at[hostname]
38
+ if target
39
+ remaining = target - Time.now
40
+ sleep(remaining) if remaining > 0
41
+ end
42
+ @request_started_at[hostname] = Time.now
43
+ end
44
+
45
+ # Calculate and store the next allowed request time for this host.
46
+ # @param hostname [String]
47
+ # @param overloaded [Boolean] true when the server signalled overload
48
+ # (HTTP 429/500/503); doubles the normal delay and adds 5 seconds.
49
+ # @return [void]
50
+ def after_request(hostname, overloaded: false)
51
+ started = @request_started_at[hostname] || Time.now
52
+ response_time = Time.now - started
53
+
54
+ delay = @crawl_delay
55
+ if @max_load
56
+ delay += (100.0 - @max_load) * response_time / @max_load
57
+ end
58
+
59
+ if overloaded
60
+ delay = delay + response_time * 2 + 5.0
61
+ end
62
+
63
+ delay = delay.round(3).clamp(0.0, MAX_DELAY)
64
+ @next_request_at[hostname] = Time.now + delay
65
+
66
+ if DebugUtils.basic?
67
+ msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
68
+ msg += " OVERLOADED" if overloaded
69
+ msg += ", Will delay #{delay}s before next request"
70
+ LogUtils.log(msg)
71
+ end
72
+ end
73
+
74
+ # Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
75
+ # @param error [Exception]
76
+ # @return [Boolean]
77
+ def self.overload_error?(error)
78
+ code = if error.respond_to?(:response) && error.response.respond_to?(:code)
79
+ error.response.code.to_i # HTTParty style
80
+ elsif error.respond_to?(:response_code)
81
+ error.response_code.to_i # Mechanize style
82
+ end
83
+ [429, 500, 503].include?(code)
84
+ end
85
+ end
86
+ end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "mechanize"
4
4
  require "ipaddr"
5
+ require_relative "../host_throttler"
5
6
 
6
7
  module ScraperUtils
7
8
  module MechanizeUtils
@@ -76,8 +77,7 @@ module ScraperUtils
76
77
  attr_reader :user_agent
77
78
 
78
79
  # Give access for testing
79
-
80
- attr_reader :max_load, :crawl_delay
80
+ attr_reader :max_load, :crawl_delay, :throttler
81
81
 
82
82
  # Creates Mechanize agent configuration with sensible defaults overridable via configure
83
83
  # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -107,6 +107,7 @@ module ScraperUtils
107
107
  @crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
108
108
  # Clamp between 10 (delay 9 x response) and 100 (no delay)
109
109
  @max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
110
+ @throttler = HostThrottler.new(crawl_delay: @crawl_delay, max_load: @max_load)
110
111
 
111
112
  # Validate proxy URL format if proxy will be used
112
113
  @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
@@ -155,6 +156,7 @@ module ScraperUtils
155
156
 
156
157
  agent.pre_connect_hooks << method(:pre_connect_hook)
157
158
  agent.post_connect_hooks << method(:post_connect_hook)
159
+ agent.error_hooks << method(:error_hook) if agent.respond_to?(:error_hooks)
158
160
  end
159
161
 
160
162
  private
@@ -175,38 +177,40 @@ module ScraperUtils
175
177
  end
176
178
 
177
179
  def pre_connect_hook(_agent, request)
178
- @connection_started_at = Time.now
179
- return unless DebugUtils.verbose?
180
-
181
- ScraperUtils::LogUtils.log(
182
- "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
183
- )
180
+ hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
181
+ @throttler.before_request(hostname)
182
+ if DebugUtils.verbose?
183
+ ScraperUtils::LogUtils.log(
184
+ "Pre Connect request: #{request.inspect}"
185
+ )
186
+ end
184
187
  end
185
188
 
186
189
  def post_connect_hook(_agent, uri, response, _body)
187
190
  raise ArgumentError, "URI must be present in post-connect hook" unless uri
188
191
 
189
- response_time = Time.now - @connection_started_at
190
-
191
- response_delay = @crawl_delay || 0.0
192
- if @crawl_delay ||@max_load
193
- response_delay += response_time
194
- if @max_load && @max_load >= 1
195
- response_delay += (100.0 - @max_load) * response_time / @max_load
196
- end
197
- response_delay = response_delay.round(3)
198
- end
192
+ status = response.respond_to?(:code) ? response.code.to_i : nil
193
+ overloaded = [429, 500, 503].include?(status)
194
+ hostname = uri.host || 'unknown'
195
+ @throttler.after_request(hostname, overloaded: overloaded)
199
196
 
200
197
  if DebugUtils.basic?
201
198
  ScraperUtils::LogUtils.log(
202
- "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
203
- "after #{response_time} seconds#{response_delay > 0.0 ? ", pausing for #{response_delay} seconds" : ""}"
199
+ "Post Connect uri: #{uri.inspect}, response: #{response.inspect}"
204
200
  )
205
201
  end
206
- sleep(response_delay) if response_delay > 0.0
207
202
  response
208
203
  end
209
204
 
205
+ def error_hook(_agent, error)
206
+ # Best-effort: record the error against whatever host we can find
207
+ # Mechanize errors often carry the URI in the message; fall back to 'unknown'
208
+ hostname = if error.respond_to?(:uri)
209
+ error.uri.host
210
+ end || 'unknown'
211
+ @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
212
+ end
213
+
210
214
  def verify_proxy_works(agent)
211
215
  $stderr.flush
212
216
  $stdout.flush
@@ -1,23 +1,40 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "host_throttler"
4
+
3
5
  module ScraperUtils
4
6
  # Misc Standalone Utilities
5
7
  module MiscUtils
6
- MAX_PAUSE = 120.0
8
+ THROTTLE_HOSTNAME = "block"
7
9
 
8
10
  class << self
9
- attr_accessor :pause_duration
10
-
11
- # Throttle block to be nice to servers we are scraping
12
- def throttle_block(extra_delay: 0.5)
13
- if @pause_duration&.positive?
14
- puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
15
- sleep(@pause_duration)
11
+ # Throttle block to be nice to servers we are scraping.
12
+ # Time spent inside the block (parsing, saving) counts toward the delay.
13
+ def throttle_block
14
+ throttler.before_request(THROTTLE_HOSTNAME)
15
+ begin
16
+ result = yield
17
+ throttler.after_request(THROTTLE_HOSTNAME)
18
+ result
19
+ rescue StandardError => e
20
+ throttler.after_request(THROTTLE_HOSTNAME, overloaded: HostThrottler.overload_error?(e))
21
+ raise
16
22
  end
17
- start_time = Time.now.to_f
18
- result = yield
19
- @pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
20
- result
23
+ end
24
+
25
+ # Reset the internal throttler (useful in tests)
26
+ def reset_throttler!
27
+ @throttler = nil
28
+ end
29
+
30
+ def will_pause_till
31
+ throttler.will_pause_till(THROTTLE_HOSTNAME)
32
+ end
33
+
34
+ private
35
+
36
+ def throttler
37
+ @throttler ||= HostThrottler.new
21
38
  end
22
39
  end
23
40
  end
@@ -62,6 +62,13 @@ module ScraperUtils
62
62
  'certificate', 'approval', 'consent', 'permit'
63
63
  ].freeze
64
64
 
65
+
66
+ def self.fetch_url_head(url)
67
+ agent = Mechanize.new
68
+ # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
69
+ agent.head(url)
70
+ end
71
+
65
72
  def self.fetch_url_with_redirects(url)
66
73
  agent = Mechanize.new
67
74
  # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
@@ -95,14 +102,25 @@ module ScraperUtils
95
102
  # @param results [Array<Hash>] The results from scraping an authority
96
103
  # @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
97
104
  # @param variation [Integer] The variation allowed in addition to percentage (default:3)
105
+ # @param ignore_case [Boolean] Ignores case which relaxes suburb check
106
+ # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
98
107
  # @raise RuntimeError if insufficient addresses are geocodable
99
- def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
108
+ def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
100
109
  return nil if results.empty?
101
110
 
102
111
  geocodable = results
103
112
  .map { |record| record["address"] }
104
113
  .uniq
105
- .count { |text| ScraperUtils::SpecSupport.geocodable? text }
114
+ .count do |text|
115
+ ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
116
+ if !ok && DebugUtils.verbose?
117
+ ScraperUtils::LogUtils.log(
118
+ "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
119
+ )
120
+ end
121
+
122
+ ok
123
+ end
106
124
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
107
125
  "(#{(100.0 * geocodable / results.count).round(1)}%)"
108
126
  expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
@@ -115,8 +133,10 @@ module ScraperUtils
115
133
  # Check if an address is likely to be geocodable by analyzing its format.
116
134
  # This is a bit stricter than needed - typically assert >= 75% match
117
135
  # @param address [String] The address to check
136
+ # @param ignore_case [Boolean] Ignores case which relaxes suburb check
137
+ # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
118
138
  # @return [Boolean] True if the address appears to be geocodable.
119
- def self.geocodable?(address, ignore_case: false)
139
+ def self.geocodable?(address, ignore_case: false, known_suburbs: [])
120
140
  return false if address.nil? || address.empty?
121
141
  check_address = ignore_case ? address.upcase : address
122
142
 
@@ -129,16 +149,17 @@ module ScraperUtils
129
149
 
130
150
  uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
131
151
  has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
152
+ has_known_suburb = known_suburbs.any? { |suburb| address.include?(suburb) }
132
153
 
133
154
  if ENV["DEBUG"]
134
155
  missing = []
135
156
  missing << "street type" unless has_street_type
136
- missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
157
+ missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
137
158
  missing << "state" unless has_state
138
159
  puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
139
160
  end
140
161
 
141
- has_street_type && (has_postcode || has_uppercase_suburb) && has_state
162
+ has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
142
163
  end
143
164
 
144
165
  PLACEHOLDERS = [
@@ -218,6 +239,22 @@ module ScraperUtils
218
239
  end
219
240
  end
220
241
 
242
+ # Validates that info_urls are present (respond to a HEAD request with 200 to 299 status)
243
+ # @param results [Array<Hash>] The results from scraping an authority
244
+ # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
245
+ # @param variation [Integer] The variation allowed in addition to percentage (default:3)
246
+ # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
247
+ # @raise RuntimeError if insufficient detail checks pass
248
+ def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
249
+ if defined?(VCR)
250
+ VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
251
+ check_info_url_is_present(results, percentage, variation, &block)
252
+ end
253
+ else
254
+ check_info_url_is_present(results, percentage, variation, &block)
255
+ end
256
+ end
257
+
221
258
  # Validates that info_urls have expected details (unique URLs with content validation)
222
259
  # @param results [Array<Hash>] The results from scraping an authority
223
260
  # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
@@ -276,6 +313,43 @@ module ScraperUtils
276
313
 
277
314
  private
278
315
 
316
+ def self.check_info_url_is_present(results, percentage, variation, &block)
317
+ count = 0
318
+ failed = 0
319
+ fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
320
+
321
+ fib_indices.each do |index|
322
+ record = results[index]
323
+ info_url = record["info_url"]
324
+ puts "Checking info_url[#{index}]: #{info_url} is present..."
325
+
326
+ begin
327
+ page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
328
+ status = page.code.to_i
329
+ rescue Mechanize::ResponseCodeError => e
330
+ status = e.response_code.to_i
331
+ end
332
+
333
+ if [403, 429].include?(status)
334
+ puts " Bot protection detected - skipping"
335
+ next
336
+ end
337
+
338
+ count += 1
339
+ if status.between?(200, 299)
340
+ puts " OK: #{status}" if ENV['DEBUG']
341
+ else
342
+ failed += 1
343
+ puts " Failed: #{status}"
344
+ min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
345
+ passed = count - failed
346
+ raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
347
+ end
348
+ end
349
+
350
+ puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
351
+ end
352
+
279
353
  def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
280
354
  count = 0
281
355
  failed = 0
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.13.1"
4
+ VERSION = "0.15.0"
5
5
  end
data/lib/scraper_utils.rb CHANGED
@@ -5,12 +5,13 @@ require "scraper_utils/version"
5
5
  # Public Apis (responsible for requiring their own dependencies)
6
6
  require "scraper_utils/authority_utils"
7
7
  require "scraper_utils/data_quality_monitor"
8
- require "scraper_utils/pa_validation"
9
8
  require "scraper_utils/db_utils"
10
9
  require "scraper_utils/debug_utils"
10
+ require "scraper_utils/host_throttler"
11
11
  require "scraper_utils/log_utils"
12
12
  require "scraper_utils/maths_utils"
13
13
  require "scraper_utils/misc_utils"
14
+ require "scraper_utils/pa_validation"
14
15
  require "scraper_utils/spec_support"
15
16
 
16
17
  # Mechanize utilities
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.1
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-21 00:00:00.000000000 Z
11
+ date: 2026-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -97,6 +97,7 @@ files:
97
97
  - lib/scraper_utils/data_quality_monitor.rb
98
98
  - lib/scraper_utils/db_utils.rb
99
99
  - lib/scraper_utils/debug_utils.rb
100
+ - lib/scraper_utils/host_throttler.rb
100
101
  - lib/scraper_utils/log_utils.rb
101
102
  - lib/scraper_utils/maths_utils.rb
102
103
  - lib/scraper_utils/mechanize_utils.rb
@@ -113,7 +114,7 @@ metadata:
113
114
  allowed_push_host: https://rubygems.org
114
115
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
115
116
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
116
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.13.1
117
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
117
118
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
118
119
  rubygems_mfa_required: 'true'
119
120
  post_install_message: