scraper_utils 0.13.1 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/lib/scraper_utils/host_throttler.rb +82 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +26 -21
- data/lib/scraper_utils/misc_utils.rb +25 -12
- data/lib/scraper_utils/spec_support.rb +10 -5
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +2 -1
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 03b44a667992331d6e36bb6eca68afc286205846d7be06263694fed52b5e2d30
|
|
4
|
+
data.tar.gz: 9f0dd276223f1b22dd688453e1769199cbda34efa5141d58e546a8ddcb85c795
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b42e0be0f9e42d9a83588cf7dcbb98ec079d01262340d2e6fef8ac7201c3d80faa645351631f60f767186721a58580f4f1e5e09c130a3a32aebb4f301dbfbdfc
|
|
7
|
+
data.tar.gz: e3cec3345d0af13026259600a54e417efd0c36394f1bc22ecac1a25573551a3a2e51482b060ad1b72ed7ba4850d55bf9f8032321d1b8c1ae6eab581244e92410
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.14.1 - 2026-03-04
|
|
4
|
+
|
|
5
|
+
* Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
|
|
6
|
+
`ScraperUtils::SpecSupport.geocodable?` to validate addresses that don't have postcodes nor capitalised suburb names
|
|
7
|
+
* Can pass ignore_case: true to relax the requirement for either postcode or uppercase suburb when you don't want to
|
|
8
|
+
pass `known_suburbs`.
|
|
9
|
+
* Move Throttling to HostThrottler
|
|
10
|
+
|
|
3
11
|
## 0.13.1 - 2026-02-21
|
|
4
12
|
|
|
5
13
|
* Added PaValidation that validates based
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true

module ScraperUtils
  # Tracks per-host next-allowed-request time so that time spent parsing
  # and saving records counts toward the crawl delay rather than being
  # added on top of it.
  #
  # Usage:
  #   throttler = HostThrottler.new(crawl_delay: 1.0, max_load: 50.0)
  #   throttler.before_request(hostname) # sleep until ready
  #   # ... make request ...
  #   throttler.after_request(hostname) # record timing, schedule next slot
  #   throttler.after_request(hostname, overloaded: true) # double delay + 5s
  class HostThrottler
    # Upper bound (seconds) on any computed delay, however overloaded the server.
    MAX_DELAY = 120.0

    # HTTP statuses treated as "server overloaded" signals.
    OVERLOAD_STATUSES = [429, 500, 503].freeze

    # @param crawl_delay [Float] minimum seconds between requests per host
    # @param max_load [Float] target server load percentage (10..100);
    #   50 means response_time == pause_time
    def initialize(crawl_delay: 0.0, max_load: nil)
      @crawl_delay = crawl_delay.to_f
      # Clamp between 10 (delay 9x response) and 100 (no extra delay)
      @max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
      @next_request_at = {} # hostname => Time
      @request_started_at = {} # hostname => Time
    end

    # Sleep until this host's throttle window has elapsed.
    # Records when the request actually started.
    # @param hostname [String]
    # @return [void]
    def before_request(hostname)
      target = @next_request_at[hostname]
      if target
        remaining = target - Time.now
        sleep(remaining) if remaining > 0
      end
      @request_started_at[hostname] = Time.now
    end

    # Calculate and store the next allowed request time for this host.
    # @param hostname [String]
    # @param overloaded [Boolean] true when the server signalled overload
    #   (HTTP 429/500/503); doubles the normal delay and adds 5 seconds.
    # @return [void]
    def after_request(hostname, overloaded: false)
      # Consume the start time so a stray second call can't reuse a stale value.
      started = @request_started_at.delete(hostname) || Time.now
      response_time = Time.now - started

      delay = @crawl_delay
      if @max_load
        # At max_load 50 the pause equals the response time; lower targets pause longer.
        delay += (100.0 - @max_load) * response_time / @max_load
      end

      # Match the documented contract: double the normal delay and add 5 seconds
      # on overload (was `delay + response_time * 2 + 5.0`, which scaled with
      # response time instead of doubling the delay as documented above).
      delay = delay * 2 + 5.0 if overloaded

      delay = delay.round(3).clamp(0.0, MAX_DELAY)
      @next_request_at[hostname] = Time.now + delay

      if DebugUtils.basic?
        msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
        msg += " OVERLOADED" if overloaded
        msg += ", Will delay #{delay}s before next request"
        LogUtils.log(msg)
      end
    end

    # Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
    # @param error [Exception]
    # @return [Boolean]
    def self.overload_error?(error)
      code = if error.respond_to?(:response) && error.response.respond_to?(:code)
               error.response.code.to_i # HTTParty style
             elsif error.respond_to?(:response_code)
               error.response_code.to_i # Mechanize style
             end
      OVERLOAD_STATUSES.include?(code)
    end
  end
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "mechanize"
|
|
4
4
|
require "ipaddr"
|
|
5
|
+
require_relative "../host_throttler"
|
|
5
6
|
|
|
6
7
|
module ScraperUtils
|
|
7
8
|
module MechanizeUtils
|
|
@@ -76,8 +77,7 @@ module ScraperUtils
|
|
|
76
77
|
attr_reader :user_agent
|
|
77
78
|
|
|
78
79
|
# Give access for testing
|
|
79
|
-
|
|
80
|
-
attr_reader :max_load, :crawl_delay
|
|
80
|
+
attr_reader :max_load, :crawl_delay, :throttler
|
|
81
81
|
|
|
82
82
|
# Creates Mechanize agent configuration with sensible defaults overridable via configure
|
|
83
83
|
# @param timeout [Integer, nil] Timeout for agent connections (default: 60)
|
|
@@ -107,6 +107,7 @@ module ScraperUtils
|
|
|
107
107
|
@crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
|
|
108
108
|
# Clamp between 10 (delay 9 x response) and 100 (no delay)
|
|
109
109
|
@max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
|
|
110
|
+
@throttler = HostThrottler.new(crawl_delay: @crawl_delay, max_load: @max_load)
|
|
110
111
|
|
|
111
112
|
# Validate proxy URL format if proxy will be used
|
|
112
113
|
@australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
|
|
@@ -155,6 +156,7 @@ module ScraperUtils
|
|
|
155
156
|
|
|
156
157
|
agent.pre_connect_hooks << method(:pre_connect_hook)
|
|
157
158
|
agent.post_connect_hooks << method(:post_connect_hook)
|
|
159
|
+
agent.error_hooks << method(:error_hook) if agent.respond_to?(:error_hooks)
|
|
158
160
|
end
|
|
159
161
|
|
|
160
162
|
private
|
|
@@ -175,38 +177,41 @@ module ScraperUtils
|
|
|
175
177
|
end
|
|
176
178
|
|
|
177
179
|
# Mechanize pre-connect hook: throttles per host before the request goes out.
# @param _agent [Mechanize] unused
# @param request [#uri] the outgoing request; its host keys the throttler
#   (falls back to 'unknown' when no uri is available)
def pre_connect_hook(_agent, request)
  host = if request.respond_to?(:uri)
           request.uri.host
         else
           'unknown'
         end
  @throttler.before_request(host)
  ScraperUtils::LogUtils.log("Pre Connect request: #{request.inspect}") if DebugUtils.verbose?
end
|
|
185
188
|
|
|
186
189
|
# Mechanize post-connect hook: records the response with the throttler so the
# next request to this host is scheduled, flagging overload statuses.
# @param _agent [Mechanize] unused
# @param uri [URI] the URI that was fetched (required)
# @param response [#code] the HTTP response
# @param _body [String] unused
# @return [Object] the response, unchanged
# @raise [ArgumentError] when uri is nil
def post_connect_hook(_agent, uri, response, _body)
  raise ArgumentError, "URI must be present in post-connect hook" unless uri

  server_overloaded = response.respond_to?(:code) &&
                      [429, 500, 503].include?(response.code.to_i)
  @throttler.after_request(uri.host, overloaded: server_overloaded)

  if DebugUtils.basic?
    ScraperUtils::LogUtils.log(
      "Post Connect uri: #{uri.inspect}, response: #{response.inspect}"
    )
  end
  response
end
|
|
209
203
|
|
|
204
|
+
# Mechanize error hook: best-effort throttle bookkeeping when a request fails.
# Records the failure against whatever host can be recovered from the error,
# applying the overload back-off when the error carries a 429/500/503 status.
# @param _agent [Mechanize] unused
# @param error [Exception] the error raised during the request
# @return [void]
def error_hook(_agent, error)
  # Best-effort: record the error against whatever host we can find.
  # Guard the uri itself too — some errors respond to :uri but carry nil,
  # which previously raised NoMethodError inside the error hook.
  hostname = if error.respond_to?(:uri) && error.uri
               error.uri.host
             else
               'unknown'
             end
  @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
end
|
|
214
|
+
|
|
210
215
|
def verify_proxy_works(agent)
|
|
211
216
|
$stderr.flush
|
|
212
217
|
$stdout.flush
|
|
@@ -1,23 +1,36 @@
|
|
|
1
1
|
# frozen_string_literal: true

require_relative "host_throttler"

module ScraperUtils
  # Misc Standalone Utilities
  module MiscUtils
    # Pseudo-hostname used to key the shared throttler for block throttling.
    THROTTLE_HOSTNAME = "block"

    class << self
      # Throttle block to be nice to servers we are scraping.
      # Time spent inside the block (parsing, saving) counts toward the delay.
      # Errors are re-raised after being recorded with the throttler, which
      # backs off further when the error looks like a server overload.
      def throttle_block
        throttler.before_request(THROTTLE_HOSTNAME)
        begin
          yield.tap { throttler.after_request(THROTTLE_HOSTNAME) }
        rescue StandardError => error
          throttler.after_request(THROTTLE_HOSTNAME, overloaded: HostThrottler.overload_error?(error))
          raise
        end
      end

      # Reset the internal throttler (useful in tests)
      def reset_throttler!
        @throttler = nil
      end

      private

      # Lazily-created shared throttler with default settings.
      def throttler
        @throttler ||= HostThrottler.new
      end
    end
  end
end
|
|
@@ -95,14 +95,16 @@ module ScraperUtils
|
|
|
95
95
|
# @param results [Array<Hash>] The results from scraping an authority
|
|
96
96
|
# @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
|
|
97
97
|
# @param variation [Integer] The variation allowed in addition to percentage (default:3)
|
|
98
|
+
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
99
|
+
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
98
100
|
# @raise RuntimeError if insufficient addresses are geocodable
|
|
99
|
-
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
|
|
101
|
+
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
|
|
100
102
|
return nil if results.empty?
|
|
101
103
|
|
|
102
104
|
geocodable = results
|
|
103
105
|
.map { |record| record["address"] }
|
|
104
106
|
.uniq
|
|
105
|
-
.count { |text| ScraperUtils::SpecSupport.geocodable? text }
|
|
107
|
+
.count { |text| ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case }
|
|
106
108
|
puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
|
|
107
109
|
"(#{(100.0 * geocodable / results.count).round(1)}%)"
|
|
108
110
|
expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
|
|
@@ -115,8 +117,10 @@ module ScraperUtils
|
|
|
115
117
|
# Check if an address is likely to be geocodable by analyzing its format.
|
|
116
118
|
# This is a bit stricter than needed - typically assert >= 75% match
|
|
117
119
|
# @param address [String] The address to check
|
|
120
|
+
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
121
|
+
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
118
122
|
# @return [Boolean] True if the address appears to be geocodable.
|
|
119
|
-
def self.geocodable?(address, ignore_case: false)
|
|
123
|
+
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
|
|
120
124
|
return false if address.nil? || address.empty?
|
|
121
125
|
check_address = ignore_case ? address.upcase : address
|
|
122
126
|
|
|
@@ -129,16 +133,17 @@ module ScraperUtils
|
|
|
129
133
|
|
|
130
134
|
uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
|
|
131
135
|
has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
|
|
136
|
+
has_known_suburb = known_suburbs.any? { |suburb| address.include?(suburb) }
|
|
132
137
|
|
|
133
138
|
if ENV["DEBUG"]
|
|
134
139
|
missing = []
|
|
135
140
|
missing << "street type" unless has_street_type
|
|
136
|
-
missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
|
|
141
|
+
missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
|
|
137
142
|
missing << "state" unless has_state
|
|
138
143
|
puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
|
|
139
144
|
end
|
|
140
145
|
|
|
141
|
-
has_street_type && (has_postcode || has_uppercase_suburb) && has_state
|
|
146
|
+
has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
|
|
142
147
|
end
|
|
143
148
|
|
|
144
149
|
PLACEHOLDERS = [
|
data/lib/scraper_utils.rb
CHANGED
|
@@ -5,12 +5,13 @@ require "scraper_utils/version"
|
|
|
5
5
|
# Public Apis (responsible for requiring their own dependencies)
|
|
6
6
|
require "scraper_utils/authority_utils"
|
|
7
7
|
require "scraper_utils/data_quality_monitor"
|
|
8
|
-
require "scraper_utils/pa_validation"
|
|
9
8
|
require "scraper_utils/db_utils"
|
|
10
9
|
require "scraper_utils/debug_utils"
|
|
10
|
+
require "scraper_utils/host_throttler"
|
|
11
11
|
require "scraper_utils/log_utils"
|
|
12
12
|
require "scraper_utils/maths_utils"
|
|
13
13
|
require "scraper_utils/misc_utils"
|
|
14
|
+
require "scraper_utils/pa_validation"
|
|
14
15
|
require "scraper_utils/spec_support"
|
|
15
16
|
|
|
16
17
|
# Mechanize utilities
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.14.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -97,6 +97,7 @@ files:
|
|
|
97
97
|
- lib/scraper_utils/data_quality_monitor.rb
|
|
98
98
|
- lib/scraper_utils/db_utils.rb
|
|
99
99
|
- lib/scraper_utils/debug_utils.rb
|
|
100
|
+
- lib/scraper_utils/host_throttler.rb
|
|
100
101
|
- lib/scraper_utils/log_utils.rb
|
|
101
102
|
- lib/scraper_utils/maths_utils.rb
|
|
102
103
|
- lib/scraper_utils/mechanize_utils.rb
|
|
@@ -113,7 +114,7 @@ metadata:
|
|
|
113
114
|
allowed_push_host: https://rubygems.org
|
|
114
115
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
115
116
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
116
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
|
|
117
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
|
|
117
118
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
118
119
|
rubygems_mfa_required: 'true'
|
|
119
120
|
post_install_message:
|