scraper_utils 0.13.1 → 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3bce8cc5a624f9904ebf8bb35ccb5c5c6c831e28ed56f88d3baf3b8d19fbbd13
4
- data.tar.gz: 0a481566e846a4274796b0542fb64a805f486065ed08045724cea7bc3d46710d
3
+ metadata.gz: 03b44a667992331d6e36bb6eca68afc286205846d7be06263694fed52b5e2d30
4
+ data.tar.gz: 9f0dd276223f1b22dd688453e1769199cbda34efa5141d58e546a8ddcb85c795
5
5
  SHA512:
6
- metadata.gz: 231c167ffe232daacbc862b8c3dd2c0c71be6b8fc2ff061f4f36d88f2e2185a454eb0aa79653c7a99a2ed65c9857d961059456f8403af8c1ed39623cc8e2db6a
7
- data.tar.gz: f287f85cdd4cc11cf17c3e5d34d5493e2809f255f3a3544bc881e756f3379c897dd70dbba5ebf16b30837bb8612f42f704872e06c6bec1cad87845606fce6231
6
+ metadata.gz: b42e0be0f9e42d9a83588cf7dcbb98ec079d01262340d2e6fef8ac7201c3d80faa645351631f60f767186721a58580f4f1e5e09c130a3a32aebb4f301dbfbdfc
7
+ data.tar.gz: e3cec3345d0af13026259600a54e417efd0c36394f1bc22ecac1a25573551a3a2e51482b060ad1b72ed7ba4850d55bf9f8032321d1b8c1ae6eab581244e92410
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.14.1 - 2026-03-04
4
+
5
+ * Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
6
+ `ScraperUtils::SpecSupport.geocodable?` to validate addresses that don't have postcodes nor capitalised suburb names
7
+ * Can pass `ignore_case: true` to relax the requirement for either a postcode or an uppercase suburb when you don't want to
8
+ pass known suburbs.
9
+ * Move Throttling to HostThrottler
10
+
3
11
  ## 0.13.1 - 2026-02-21
4
12
 
5
13
  * Added PaValidation that validates based
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Tracks per-host next-allowed-request time so that time spent parsing
5
+ # and saving records counts toward the crawl delay rather than being
6
+ # added on top of it.
7
+ #
8
+ # Usage:
9
+ # throttler = HostThrottler.new(crawl_delay: 1.0, max_load: 50.0)
10
+ # throttler.before_request(hostname) # sleep until ready
11
+ # # ... make request ...
12
+ # throttler.after_request(hostname) # record timing, schedule next slot
13
+ # throttler.after_request(hostname, overloaded: true) # double delay + 5s
14
+ class HostThrottler
15
+ MAX_DELAY = 120.0
16
+
17
+ # @param crawl_delay [Float] minimum seconds between requests per host
18
+ # @param max_load [Float] target server load percentage (10..100);
19
+ # 50 means response_time == pause_time
20
+ def initialize(crawl_delay: 0.0, max_load: nil)
21
+ @crawl_delay = crawl_delay.to_f
22
+ # Clamp between 10 (delay 9x response) and 100 (no extra delay)
23
+ @max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
24
+ @next_request_at = {} # hostname => Time
25
+ @request_started_at = {} # hostname => Time
26
+ end
27
+
28
+ # Sleep until this host's throttle window has elapsed.
29
+ # Records when the request actually started.
30
+ # @param hostname [String]
31
+ # @return [void]
32
+ def before_request(hostname)
33
+ target = @next_request_at[hostname]
34
+ if target
35
+ remaining = target - Time.now
36
+ sleep(remaining) if remaining > 0
37
+ end
38
+ @request_started_at[hostname] = Time.now
39
+ end
40
+
41
+ # Calculate and store the next allowed request time for this host.
42
+ # @param hostname [String]
43
+ # @param overloaded [Boolean] true when the server signalled overload
44
+ # (HTTP 429/500/503); doubles the normal delay and adds 5 seconds.
45
+ # @return [void]
46
+ def after_request(hostname, overloaded: false)
47
+ started = @request_started_at[hostname] || Time.now
48
+ response_time = Time.now - started
49
+
50
+ delay = @crawl_delay
51
+ if @max_load
52
+ delay += (100.0 - @max_load) * response_time / @max_load
53
+ end
54
+
55
+ if overloaded
56
+ delay = delay + response_time * 2 + 5.0
57
+ end
58
+
59
+ delay = delay.round(3).clamp(0.0, MAX_DELAY)
60
+ @next_request_at[hostname] = Time.now + delay
61
+
62
+ if DebugUtils.basic?
63
+ msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
64
+ msg += " OVERLOADED" if overloaded
65
+ msg += ", Will delay #{delay}s before next request"
66
+ LogUtils.log(msg)
67
+ end
68
+ end
69
+
70
+ # Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
71
+ # @param error [Exception]
72
+ # @return [Boolean]
73
+ def self.overload_error?(error)
74
+ code = if error.respond_to?(:response) && error.response.respond_to?(:code)
75
+ error.response.code.to_i # HTTParty style
76
+ elsif error.respond_to?(:response_code)
77
+ error.response_code.to_i # Mechanize style
78
+ end
79
+ [429, 500, 503].include?(code)
80
+ end
81
+ end
82
+ end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "mechanize"
4
4
  require "ipaddr"
5
+ require_relative "../host_throttler"
5
6
 
6
7
  module ScraperUtils
7
8
  module MechanizeUtils
@@ -76,8 +77,7 @@ module ScraperUtils
76
77
  attr_reader :user_agent
77
78
 
78
79
  # Give access for testing
79
-
80
- attr_reader :max_load, :crawl_delay
80
+ attr_reader :max_load, :crawl_delay, :throttler
81
81
 
82
82
  # Creates Mechanize agent configuration with sensible defaults overridable via configure
83
83
  # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -107,6 +107,7 @@ module ScraperUtils
107
107
  @crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
108
108
  # Clamp between 10 (delay 9 x response) and 100 (no delay)
109
109
  @max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
110
+ @throttler = HostThrottler.new(crawl_delay: @crawl_delay, max_load: @max_load)
110
111
 
111
112
  # Validate proxy URL format if proxy will be used
112
113
  @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
@@ -155,6 +156,7 @@ module ScraperUtils
155
156
 
156
157
  agent.pre_connect_hooks << method(:pre_connect_hook)
157
158
  agent.post_connect_hooks << method(:post_connect_hook)
159
+ agent.error_hooks << method(:error_hook) if agent.respond_to?(:error_hooks)
158
160
  end
159
161
 
160
162
  private
@@ -175,38 +177,41 @@ module ScraperUtils
175
177
  end
176
178
 
177
179
  def pre_connect_hook(_agent, request)
178
- @connection_started_at = Time.now
179
- return unless DebugUtils.verbose?
180
-
181
- ScraperUtils::LogUtils.log(
182
- "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
183
- )
180
+ hostname = request.respond_to?(:uri) ? request.uri.host : 'unknown'
181
+ @throttler.before_request(hostname)
182
+ if DebugUtils.verbose?
183
+ ScraperUtils::LogUtils.log(
184
+ "Pre Connect request: #{request.inspect}"
185
+ )
186
+ end
184
187
  end
185
188
 
186
189
  def post_connect_hook(_agent, uri, response, _body)
187
190
  raise ArgumentError, "URI must be present in post-connect hook" unless uri
188
191
 
189
- response_time = Time.now - @connection_started_at
190
-
191
- response_delay = @crawl_delay || 0.0
192
- if @crawl_delay ||@max_load
193
- response_delay += response_time
194
- if @max_load && @max_load >= 1
195
- response_delay += (100.0 - @max_load) * response_time / @max_load
196
- end
197
- response_delay = response_delay.round(3)
198
- end
192
+ status = response.respond_to?(:code) ? response.code.to_i : nil
193
+ overloaded = [429, 500, 503].include?(status)
194
+ @throttler.after_request(uri.host, overloaded: overloaded)
199
195
 
200
196
  if DebugUtils.basic?
201
197
  ScraperUtils::LogUtils.log(
202
- "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
203
- "after #{response_time} seconds#{response_delay > 0.0 ? ", pausing for #{response_delay} seconds" : ""}"
198
+ "Post Connect uri: #{uri.inspect}, response: #{response.inspect}"
204
199
  )
205
200
  end
206
- sleep(response_delay) if response_delay > 0.0
207
201
  response
208
202
  end
209
203
 
204
+ def error_hook(_agent, error)
205
+ # Best-effort: record the error against whatever host we can find
206
+ # Mechanize errors often carry the URI in the message; fall back to 'unknown'
207
+ hostname = if error.respond_to?(:uri)
208
+ error.uri.host
209
+ else
210
+ 'unknown'
211
+ end
212
+ @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
213
+ end
214
+
210
215
  def verify_proxy_works(agent)
211
216
  $stderr.flush
212
217
  $stdout.flush
@@ -1,23 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "host_throttler"
4
+
3
5
  module ScraperUtils
4
6
  # Misc Standalone Utilities
5
7
  module MiscUtils
6
- MAX_PAUSE = 120.0
8
+ THROTTLE_HOSTNAME = "block"
7
9
 
8
10
  class << self
9
- attr_accessor :pause_duration
10
-
11
- # Throttle block to be nice to servers we are scraping
12
- def throttle_block(extra_delay: 0.5)
13
- if @pause_duration&.positive?
14
- puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
15
- sleep(@pause_duration)
11
+ # Throttle block to be nice to servers we are scraping.
12
+ # Time spent inside the block (parsing, saving) counts toward the delay.
13
+ def throttle_block
14
+ throttler.before_request(THROTTLE_HOSTNAME)
15
+ begin
16
+ result = yield
17
+ throttler.after_request(THROTTLE_HOSTNAME)
18
+ result
19
+ rescue StandardError => e
20
+ throttler.after_request(THROTTLE_HOSTNAME, overloaded: HostThrottler.overload_error?(e))
21
+ raise
16
22
  end
17
- start_time = Time.now.to_f
18
- result = yield
19
- @pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
20
- result
23
+ end
24
+
25
+ # Reset the internal throttler (useful in tests)
26
+ def reset_throttler!
27
+ @throttler = nil
28
+ end
29
+
30
+ private
31
+
32
+ def throttler
33
+ @throttler ||= HostThrottler.new
21
34
  end
22
35
  end
23
36
  end
@@ -95,14 +95,16 @@ module ScraperUtils
95
95
  # @param results [Array<Hash>] The results from scraping an authority
96
96
  # @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
97
97
  # @param variation [Integer] The variation allowed in addition to percentage (default:3)
98
+ # @param ignore_case [Boolean] Ignores case which relaxes suburb check
99
+ # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
98
100
  # @raise RuntimeError if insufficient addresses are geocodable
99
- def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
101
+ def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
100
102
  return nil if results.empty?
101
103
 
102
104
  geocodable = results
103
105
  .map { |record| record["address"] }
104
106
  .uniq
105
- .count { |text| ScraperUtils::SpecSupport.geocodable? text }
107
+ .count { |text| ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case }
106
108
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
107
109
  "(#{(100.0 * geocodable / results.count).round(1)}%)"
108
110
  expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
@@ -115,8 +117,10 @@ module ScraperUtils
115
117
  # Check if an address is likely to be geocodable by analyzing its format.
116
118
  # This is a bit stricter than needed - typically assert >= 75% match
117
119
  # @param address [String] The address to check
120
+ # @param ignore_case [Boolean] Ignores case which relaxes suburb check
121
+ # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
118
122
  # @return [Boolean] True if the address appears to be geocodable.
119
- def self.geocodable?(address, ignore_case: false)
123
+ def self.geocodable?(address, ignore_case: false, known_suburbs: [])
120
124
  return false if address.nil? || address.empty?
121
125
  check_address = ignore_case ? address.upcase : address
122
126
 
@@ -129,16 +133,17 @@ module ScraperUtils
129
133
 
130
134
  uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
131
135
  has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
136
+ has_known_suburb = known_suburbs.any? { |suburb| address.include?(suburb) }
132
137
 
133
138
  if ENV["DEBUG"]
134
139
  missing = []
135
140
  missing << "street type" unless has_street_type
136
- missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
141
+ missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
137
142
  missing << "state" unless has_state
138
143
  puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
139
144
  end
140
145
 
141
- has_street_type && (has_postcode || has_uppercase_suburb) && has_state
146
+ has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
142
147
  end
143
148
 
144
149
  PLACEHOLDERS = [
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.13.1"
4
+ VERSION = "0.14.1"
5
5
  end
data/lib/scraper_utils.rb CHANGED
@@ -5,12 +5,13 @@ require "scraper_utils/version"
5
5
  # Public Apis (responsible for requiring their own dependencies)
6
6
  require "scraper_utils/authority_utils"
7
7
  require "scraper_utils/data_quality_monitor"
8
- require "scraper_utils/pa_validation"
9
8
  require "scraper_utils/db_utils"
10
9
  require "scraper_utils/debug_utils"
10
+ require "scraper_utils/host_throttler"
11
11
  require "scraper_utils/log_utils"
12
12
  require "scraper_utils/maths_utils"
13
13
  require "scraper_utils/misc_utils"
14
+ require "scraper_utils/pa_validation"
14
15
  require "scraper_utils/spec_support"
15
16
 
16
17
  # Mechanize utilities
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.1
4
+ version: 0.14.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-21 00:00:00.000000000 Z
11
+ date: 2026-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -97,6 +97,7 @@ files:
97
97
  - lib/scraper_utils/data_quality_monitor.rb
98
98
  - lib/scraper_utils/db_utils.rb
99
99
  - lib/scraper_utils/debug_utils.rb
100
+ - lib/scraper_utils/host_throttler.rb
100
101
  - lib/scraper_utils/log_utils.rb
101
102
  - lib/scraper_utils/maths_utils.rb
102
103
  - lib/scraper_utils/mechanize_utils.rb
@@ -113,7 +114,7 @@ metadata:
113
114
  allowed_push_host: https://rubygems.org
114
115
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
115
116
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
116
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.13.1
117
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
117
118
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
118
119
  rubygems_mfa_required: 'true'
119
120
  post_install_message: