scraper_utils 0.12.1 → 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -3
- data/lib/scraper_utils/db_utils.rb +9 -19
- data/lib/scraper_utils/host_throttler.rb +82 -0
- data/lib/scraper_utils/log_utils.rb +6 -14
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +27 -21
- data/lib/scraper_utils/misc_utils.rb +25 -12
- data/lib/scraper_utils/pa_validation.rb +88 -0
- data/lib/scraper_utils/spec_support.rb +34 -16
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 03b44a667992331d6e36bb6eca68afc286205846d7be06263694fed52b5e2d30
|
|
4
|
+
data.tar.gz: 9f0dd276223f1b22dd688453e1769199cbda34efa5141d58e546a8ddcb85c795
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b42e0be0f9e42d9a83588cf7dcbb98ec079d01262340d2e6fef8ac7201c3d80faa645351631f60f767186721a58580f4f1e5e09c130a3a32aebb4f301dbfbdfc
|
|
7
|
+
data.tar.gz: e3cec3345d0af13026259600a54e417efd0c36394f1bc22ecac1a25573551a3a2e51482b060ad1b72ed7ba4850d55bf9f8032321d1b8c1ae6eab581244e92410
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,29 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.14.1 - 2026-03-04
|
|
4
|
+
|
|
5
|
+
* Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
|
|
6
|
+
`ScraperUtils::SpecSupport.geocodable?` to validate addresses that have neither a postcode nor an uppercase suburb name
|
|
7
|
+
* Can pass ignore_case: true to relax the requirement for either postcode or uppercase suburb when you don't want to
|
|
8
|
+
pass `known_suburbs`.
|
|
9
|
+
* Move Throttling to HostThrottler
|
|
10
|
+
|
|
11
|
+
## 0.13.1 - 2026-02-21
|
|
12
|
+
|
|
13
|
+
* Added PaValidation that validates based
|
|
14
|
+
on [How to write a scraper](https://www.planningalerts.org.au/how_to_write_a_scraper)
|
|
15
|
+
* `ScraperUtils::PaValidation.validate_record!` raises an exception if the record is invalid (it calls `validate_record` internally)
|
|
16
|
+
* `ScraperUtils::PaValidation.validate_record` returns an Array of error messages if record is invalid, otherwise nil
|
|
17
|
+
* Added `ScraperUtils::SpecSupport.validate_unique_references!` which validates that all references are unique
|
|
18
|
+
* Note: due to saving records based on the unique reference, any duplicates are overwritten and are never presented to
|
|
19
|
+
PA, so this is basically checking that you are not losing records due to an incorrect reference
|
|
20
|
+
* Refactored `DbUtils.save_record` to use PaValidation
|
|
21
|
+
* Merged `cleanup_old_records` from LogUtils into the same method in DbUtils, bringing across the `force` named param
|
|
22
|
+
* `LogUtils.cleanup_old_records` now warns that it is deprecated
|
|
23
|
+
* Increased test coverage
|
|
24
|
+
* Fixed edge case in `ScraperUtils::MechanizeUtils::AgentConfig#verify_proxy_works` - it now raises an exception on json
|
|
25
|
+
parse error
|
|
26
|
+
|
|
3
27
|
## 0.12.1 - 2026-02-18
|
|
4
28
|
|
|
5
29
|
* Added override for the threshold of when to abandon scraping due to unprocessable records
|
|
@@ -34,15 +58,18 @@
|
|
|
34
58
|
|
|
35
59
|
## 0.9.0 - 2025-07-11
|
|
36
60
|
|
|
37
|
-
**Significant cleanup - removed code we ended up not using as none of the councils are actually concerned about server
|
|
61
|
+
**Significant cleanup - removed code we ended up not using as none of the councils are actually concerned about server
|
|
62
|
+
load**
|
|
38
63
|
|
|
39
64
|
* Refactored example code into simple callable methods
|
|
40
65
|
* Expand test for geocodeable addresses to include comma between postcode and state at the end of the address.
|
|
41
66
|
|
|
42
67
|
### Added
|
|
68
|
+
|
|
43
69
|
- `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` - validates percentage of geocodable addresses
|
|
44
70
|
- `ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!` - validates percentage of reasonable descriptions
|
|
45
|
-
- `ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!` - validates single global info_url usage and
|
|
71
|
+
- `ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!` - validates single global info_url usage and
|
|
72
|
+
availability
|
|
46
73
|
- `ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!` - validates info_urls contain expected content
|
|
47
74
|
- `ScraperUtils::MathsUtils.fibonacci_series` - generates fibonacci sequence up to max value
|
|
48
75
|
- `bot_check_expected` parameter to info_url validation methods for handling reCAPTCHA/Cloudflare protection
|
|
@@ -53,10 +80,12 @@
|
|
|
53
80
|
- .editorconfig as an example for scrapers
|
|
54
81
|
|
|
55
82
|
### Fixed
|
|
83
|
+
|
|
56
84
|
- Typo in `geocodable?` method debug output (`has_suburb_stats` → `has_suburb_states`)
|
|
57
85
|
- Code example in `docs/enhancing_specs.md`
|
|
58
86
|
|
|
59
87
|
### Updated
|
|
88
|
+
|
|
60
89
|
- `ScraperUtils::SpecSupport.acceptable_description?` - Accept 1 or 2 word descriptors with planning specific terms
|
|
61
90
|
- Code example in `docs/enhancing_specs.md` to reflect new support methods
|
|
62
91
|
- Code examples
|
|
@@ -68,6 +97,7 @@
|
|
|
68
97
|
- Added extra street types
|
|
69
98
|
|
|
70
99
|
### Removed
|
|
100
|
+
|
|
71
101
|
- Unused CycleUtils
|
|
72
102
|
- Unused DateRangeUtils
|
|
73
103
|
- Unused RandomizeUtils
|
|
@@ -150,7 +180,8 @@ Fixed broken v0.2.0
|
|
|
150
180
|
|
|
151
181
|
## 0.2.0 - 2025-02-28
|
|
152
182
|
|
|
153
|
-
Added FiberScheduler, enabled complient mode with delays by default and simplified usage removing third retry without
|
|
183
|
+
Added FiberScheduler, enabled complient mode with delays by default and simplified usage removing third retry without
|
|
184
|
+
proxy
|
|
154
185
|
|
|
155
186
|
## 0.1.0 - 2025-02-23
|
|
156
187
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "uri"
|
|
3
4
|
require "scraperwiki"
|
|
4
5
|
|
|
5
6
|
module ScraperUtils
|
|
@@ -27,23 +28,10 @@ module ScraperUtils
|
|
|
27
28
|
# @raise [ScraperUtils::UnprocessableRecord] If record fails validation
|
|
28
29
|
# @return [void]
|
|
29
30
|
def self.save_record(record)
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
required_fields.each do |field|
|
|
33
|
-
if record[field].to_s.empty?
|
|
34
|
-
raise ScraperUtils::UnprocessableRecord, "Missing required field: #{field}"
|
|
35
|
-
end
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
# Validate date formats
|
|
39
|
-
%w[date_scraped date_received on_notice_from on_notice_to].each do |date_field|
|
|
40
|
-
Date.parse(record[date_field]) unless record[date_field].to_s.empty?
|
|
41
|
-
rescue ArgumentError
|
|
42
|
-
raise ScraperUtils::UnprocessableRecord,
|
|
43
|
-
"Invalid date format for #{date_field}: #{record[date_field].inspect}"
|
|
44
|
-
end
|
|
31
|
+
record = record.transform_keys(&:to_s)
|
|
32
|
+
ScraperUtils::PaValidation.validate_record!(record)
|
|
45
33
|
|
|
46
|
-
# Determine primary key based on presence of authority_label
|
|
34
|
+
# Determine the primary key based on the presence of authority_label
|
|
47
35
|
primary_key = if record.key?("authority_label")
|
|
48
36
|
%w[authority_label council_reference]
|
|
49
37
|
else
|
|
@@ -58,7 +46,7 @@ module ScraperUtils
|
|
|
58
46
|
end
|
|
59
47
|
|
|
60
48
|
# Clean up records older than 30 days and approx once a month vacuum the DB
|
|
61
|
-
def self.cleanup_old_records
|
|
49
|
+
def self.cleanup_old_records(force: false)
|
|
62
50
|
cutoff_date = (Date.today - 30).to_s
|
|
63
51
|
vacuum_cutoff_date = (Date.today - 35).to_s
|
|
64
52
|
|
|
@@ -70,15 +58,17 @@ module ScraperUtils
|
|
|
70
58
|
deleted_count = stats["count"]
|
|
71
59
|
oldest_date = stats["oldest"]
|
|
72
60
|
|
|
73
|
-
return unless deleted_count.positive? || ENV["VACUUM"]
|
|
61
|
+
return unless deleted_count.positive? || ENV["VACUUM"] || force
|
|
74
62
|
|
|
75
63
|
LogUtils.log "Deleting #{deleted_count} applications scraped between #{oldest_date} and #{cutoff_date}"
|
|
76
64
|
ScraperWiki.sqliteexecute("DELETE FROM data WHERE date_scraped < ?", [cutoff_date])
|
|
77
65
|
|
|
78
|
-
return unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"]
|
|
66
|
+
return unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
|
|
79
67
|
|
|
80
68
|
LogUtils.log " Running VACUUM to reclaim space..."
|
|
81
69
|
ScraperWiki.sqliteexecute("VACUUM")
|
|
70
|
+
rescue SqliteMagic::NoSuchTable => e
|
|
71
|
+
ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
|
|
82
72
|
end
|
|
83
73
|
end
|
|
84
74
|
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ScraperUtils
|
|
4
|
+
# Tracks per-host next-allowed-request time so that time spent parsing
|
|
5
|
+
# and saving records counts toward the crawl delay rather than being
|
|
6
|
+
# added on top of it.
|
|
7
|
+
#
|
|
8
|
+
# Usage:
|
|
9
|
+
# throttler = HostThrottler.new(crawl_delay: 1.0, max_load: 50.0)
|
|
10
|
+
# throttler.before_request(hostname) # sleep until ready
|
|
11
|
+
# # ... make request ...
|
|
12
|
+
# throttler.after_request(hostname) # record timing, schedule next slot
|
|
13
|
+
# throttler.after_request(hostname, overloaded: true) # adds 2x response time + 5s to the delay
|
|
14
|
+
class HostThrottler
|
|
15
|
+
MAX_DELAY = 120.0
|
|
16
|
+
|
|
17
|
+
# @param crawl_delay [Float] minimum seconds between requests per host
|
|
18
|
+
# @param max_load [Float] target server load percentage (10..100);
|
|
19
|
+
# 50 means response_time == pause_time
|
|
20
|
+
def initialize(crawl_delay: 0.0, max_load: nil)
|
|
21
|
+
@crawl_delay = crawl_delay.to_f
|
|
22
|
+
# Clamp between 10 (delay 9x response) and 100 (no extra delay)
|
|
23
|
+
@max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
|
|
24
|
+
@next_request_at = {} # hostname => Time
|
|
25
|
+
@request_started_at = {} # hostname => Time
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
# Sleep until this host's throttle window has elapsed.
|
|
29
|
+
# Records when the request actually started.
|
|
30
|
+
# @param hostname [String]
|
|
31
|
+
# @return [void]
|
|
32
|
+
def before_request(hostname)
|
|
33
|
+
target = @next_request_at[hostname]
|
|
34
|
+
if target
|
|
35
|
+
remaining = target - Time.now
|
|
36
|
+
sleep(remaining) if remaining > 0
|
|
37
|
+
end
|
|
38
|
+
@request_started_at[hostname] = Time.now
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
# Calculate and store the next allowed request time for this host.
|
|
42
|
+
# @param hostname [String]
|
|
43
|
+
# @param overloaded [Boolean] true when the server signalled overload
|
|
44
|
+
# (HTTP 429/500/503); adds twice the response time plus 5 seconds to the normal delay.
|
|
45
|
+
# @return [void]
|
|
46
|
+
def after_request(hostname, overloaded: false)
|
|
47
|
+
started = @request_started_at[hostname] || Time.now
|
|
48
|
+
response_time = Time.now - started
|
|
49
|
+
|
|
50
|
+
delay = @crawl_delay
|
|
51
|
+
if @max_load
|
|
52
|
+
delay += (100.0 - @max_load) * response_time / @max_load
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
if overloaded
|
|
56
|
+
delay = delay + response_time * 2 + 5.0
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
delay = delay.round(3).clamp(0.0, MAX_DELAY)
|
|
60
|
+
@next_request_at[hostname] = Time.now + delay
|
|
61
|
+
|
|
62
|
+
if DebugUtils.basic?
|
|
63
|
+
msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
|
|
64
|
+
msg += " OVERLOADED" if overloaded
|
|
65
|
+
msg += ", Will delay #{delay}s before next request"
|
|
66
|
+
LogUtils.log(msg)
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
# Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
|
|
71
|
+
# @param error [Exception]
|
|
72
|
+
# @return [Boolean]
|
|
73
|
+
def self.overload_error?(error)
|
|
74
|
+
code = if error.respond_to?(:response) && error.response.respond_to?(:code)
|
|
75
|
+
error.response.code.to_i # HTTParty style
|
|
76
|
+
elsif error.respond_to?(:response_code)
|
|
77
|
+
error.response_code.to_i # Mechanize style
|
|
78
|
+
end
|
|
79
|
+
[429, 500, 503].include?(code)
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
@@ -85,7 +85,7 @@ module ScraperUtils
|
|
|
85
85
|
failed
|
|
86
86
|
)
|
|
87
87
|
|
|
88
|
-
cleanup_old_records
|
|
88
|
+
DbUtils::cleanup_old_records
|
|
89
89
|
end
|
|
90
90
|
|
|
91
91
|
# Extracts the first relevant line from backtrace that's from our project
|
|
@@ -225,21 +225,13 @@ module ScraperUtils
|
|
|
225
225
|
)
|
|
226
226
|
end
|
|
227
227
|
|
|
228
|
+
# Moved to DbUtils
|
|
229
|
+
# :nocov:
|
|
228
230
|
def self.cleanup_old_records(force: false)
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
@last_cutoff = cutoff
|
|
233
|
-
|
|
234
|
-
[SUMMARY_TABLE, LOG_TABLE].each do |table|
|
|
235
|
-
ScraperWiki.sqliteexecute(
|
|
236
|
-
"DELETE FROM #{table} WHERE date(run_at) < date(?)",
|
|
237
|
-
[cutoff]
|
|
238
|
-
)
|
|
239
|
-
rescue SqliteMagic::NoSuchTable => e
|
|
240
|
-
ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
|
|
241
|
-
end
|
|
231
|
+
warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.", category: :deprecated
|
|
232
|
+
ScraperUtils::DbUtils.cleanup_old_records(force: force)
|
|
242
233
|
end
|
|
234
|
+
# :nocov:
|
|
243
235
|
|
|
244
236
|
# Extracts meaningful backtrace - 3 lines from ruby/gem and max 6 in total
|
|
245
237
|
def self.extract_meaningful_backtrace(error)
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "mechanize"
|
|
4
4
|
require "ipaddr"
|
|
5
|
+
require_relative "../host_throttler"
|
|
5
6
|
|
|
6
7
|
module ScraperUtils
|
|
7
8
|
module MechanizeUtils
|
|
@@ -76,8 +77,7 @@ module ScraperUtils
|
|
|
76
77
|
attr_reader :user_agent
|
|
77
78
|
|
|
78
79
|
# Give access for testing
|
|
79
|
-
|
|
80
|
-
attr_reader :max_load, :crawl_delay
|
|
80
|
+
attr_reader :max_load, :crawl_delay, :throttler
|
|
81
81
|
|
|
82
82
|
# Creates Mechanize agent configuration with sensible defaults overridable via configure
|
|
83
83
|
# @param timeout [Integer, nil] Timeout for agent connections (default: 60)
|
|
@@ -107,6 +107,7 @@ module ScraperUtils
|
|
|
107
107
|
@crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
|
|
108
108
|
# Clamp between 10 (delay 9 x response) and 100 (no delay)
|
|
109
109
|
@max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
|
|
110
|
+
@throttler = HostThrottler.new(crawl_delay: @crawl_delay, max_load: @max_load)
|
|
110
111
|
|
|
111
112
|
# Validate proxy URL format if proxy will be used
|
|
112
113
|
@australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
|
|
@@ -155,6 +156,7 @@ module ScraperUtils
|
|
|
155
156
|
|
|
156
157
|
agent.pre_connect_hooks << method(:pre_connect_hook)
|
|
157
158
|
agent.post_connect_hooks << method(:post_connect_hook)
|
|
159
|
+
agent.error_hooks << method(:error_hook) if agent.respond_to?(:error_hooks)
|
|
158
160
|
end
|
|
159
161
|
|
|
160
162
|
private
|
|
@@ -175,38 +177,41 @@ module ScraperUtils
|
|
|
175
177
|
end
|
|
176
178
|
|
|
177
179
|
def pre_connect_hook(_agent, request)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
180
|
+
hostname = request.respond_to?(:uri) ? request.uri.host : 'unknown'
|
|
181
|
+
@throttler.before_request(hostname)
|
|
182
|
+
if DebugUtils.verbose?
|
|
183
|
+
ScraperUtils::LogUtils.log(
|
|
184
|
+
"Pre Connect request: #{request.inspect}"
|
|
185
|
+
)
|
|
186
|
+
end
|
|
184
187
|
end
|
|
185
188
|
|
|
186
189
|
def post_connect_hook(_agent, uri, response, _body)
|
|
187
190
|
raise ArgumentError, "URI must be present in post-connect hook" unless uri
|
|
188
191
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
if @crawl_delay ||@max_load
|
|
193
|
-
response_delay += response_time
|
|
194
|
-
if @max_load && @max_load >= 1
|
|
195
|
-
response_delay += (100.0 - @max_load) * response_time / @max_load
|
|
196
|
-
end
|
|
197
|
-
response_delay = response_delay.round(3)
|
|
198
|
-
end
|
|
192
|
+
status = response.respond_to?(:code) ? response.code.to_i : nil
|
|
193
|
+
overloaded = [429, 500, 503].include?(status)
|
|
194
|
+
@throttler.after_request(uri.host, overloaded: overloaded)
|
|
199
195
|
|
|
200
196
|
if DebugUtils.basic?
|
|
201
197
|
ScraperUtils::LogUtils.log(
|
|
202
|
-
"Post Connect uri: #{uri.inspect}, response: #{response.inspect}
|
|
203
|
-
"after #{response_time} seconds#{response_delay > 0.0 ? ", pausing for #{response_delay} seconds" : ""}"
|
|
198
|
+
"Post Connect uri: #{uri.inspect}, response: #{response.inspect}"
|
|
204
199
|
)
|
|
205
200
|
end
|
|
206
|
-
sleep(response_delay) if response_delay > 0.0
|
|
207
201
|
response
|
|
208
202
|
end
|
|
209
203
|
|
|
204
|
+
def error_hook(_agent, error)
|
|
205
|
+
# Best-effort: record the error against whatever host we can find
|
|
206
|
+
# Use the host from the error's uri when the error exposes one; otherwise fall back to 'unknown'
|
|
207
|
+
hostname = if error.respond_to?(:uri)
|
|
208
|
+
error.uri.host
|
|
209
|
+
else
|
|
210
|
+
'unknown'
|
|
211
|
+
end
|
|
212
|
+
@throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
|
|
213
|
+
end
|
|
214
|
+
|
|
210
215
|
def verify_proxy_works(agent)
|
|
211
216
|
$stderr.flush
|
|
212
217
|
$stdout.flush
|
|
@@ -227,6 +232,7 @@ module ScraperUtils
|
|
|
227
232
|
rescue JSON::ParserError => e
|
|
228
233
|
puts "Couldn't parse public_headers: #{e}! Raw response:"
|
|
229
234
|
puts my_headers.inspect
|
|
235
|
+
raise "Couldn't parse public_headers as JSON: #{e}!"
|
|
230
236
|
end
|
|
231
237
|
rescue Timeout::Error => e # Includes Net::OpenTimeout
|
|
232
238
|
raise "Proxy check timed out: #{e}"
|
|
@@ -1,23 +1,36 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "host_throttler"
|
|
4
|
+
|
|
3
5
|
module ScraperUtils
|
|
4
6
|
# Misc Standalone Utilities
|
|
5
7
|
module MiscUtils
|
|
6
|
-
|
|
8
|
+
THROTTLE_HOSTNAME = "block"
|
|
7
9
|
|
|
8
10
|
class << self
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
11
|
+
# Throttle block to be nice to servers we are scraping.
|
|
12
|
+
# Time spent inside the block (parsing, saving) counts toward the delay.
|
|
13
|
+
def throttle_block
|
|
14
|
+
throttler.before_request(THROTTLE_HOSTNAME)
|
|
15
|
+
begin
|
|
16
|
+
result = yield
|
|
17
|
+
throttler.after_request(THROTTLE_HOSTNAME)
|
|
18
|
+
result
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
throttler.after_request(THROTTLE_HOSTNAME, overloaded: HostThrottler.overload_error?(e))
|
|
21
|
+
raise
|
|
16
22
|
end
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Reset the internal throttler (useful in tests)
|
|
26
|
+
def reset_throttler!
|
|
27
|
+
@throttler = nil
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def throttler
|
|
33
|
+
@throttler ||= HostThrottler.new
|
|
21
34
|
end
|
|
22
35
|
end
|
|
23
36
|
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "uri"
|
|
4
|
+
require "date"
|
|
5
|
+
|
|
6
|
+
module ScraperUtils
|
|
7
|
+
# Validates scraper records match Planning Alerts requirements before submission.
|
|
8
|
+
# Use in specs to catch problems early rather than waiting for PA's import.
|
|
9
|
+
module PaValidation
|
|
10
|
+
REQUIRED_FIELDS = %w[council_reference address description date_scraped].freeze
|
|
11
|
+
|
|
12
|
+
# Validates a single record (hash with string keys) against PA's rules.
|
|
13
|
+
# @param record [Hash] The record to validate
|
|
14
|
+
# @raise [ScraperUtils::UnprocessableRecord] if there are error messages
|
|
15
|
+
def self.validate_record!(record)
|
|
16
|
+
errors = validate_record(record)
|
|
17
|
+
raise(ScraperUtils::UnprocessableRecord, errors.join("; ")) if errors&.any?
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# Validates a single record (hash with string keys) against PA's rules.
|
|
21
|
+
# @param record [Hash] The record to validate
|
|
22
|
+
# @return [Array<String>, nil] Array of error messages, or nil if valid
|
|
23
|
+
def self.validate_record(record)
|
|
24
|
+
record = record.transform_keys(&:to_s)
|
|
25
|
+
errors = []
|
|
26
|
+
|
|
27
|
+
validate_presence(record, errors)
|
|
28
|
+
validate_info_url(record, errors)
|
|
29
|
+
validate_dates(record, errors)
|
|
30
|
+
|
|
31
|
+
errors.empty? ? nil : errors
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
private
|
|
35
|
+
|
|
36
|
+
def self.validate_presence(record, errors)
|
|
37
|
+
REQUIRED_FIELDS.each do |field|
|
|
38
|
+
errors << "#{field} can't be blank" if record[field].to_s.strip.empty?
|
|
39
|
+
end
|
|
40
|
+
errors << "info_url can't be blank" if record["info_url"].to_s.strip.empty?
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def self.validate_info_url(record, errors)
|
|
44
|
+
url = record["info_url"].to_s.strip
|
|
45
|
+
return if url.empty? # already caught by presence check
|
|
46
|
+
|
|
47
|
+
begin
|
|
48
|
+
uri = URI.parse(url)
|
|
49
|
+
unless uri.is_a?(URI::HTTP) && uri.host.to_s != ""
|
|
50
|
+
errors << "info_url must be a valid http\/https URL with host"
|
|
51
|
+
end
|
|
52
|
+
rescue URI::InvalidURIError
|
|
53
|
+
errors << "info_url must be a valid http\/https URL"
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def self.validate_dates(record, errors)
|
|
58
|
+
today = Date.today
|
|
59
|
+
|
|
60
|
+
date_scraped = parse_date(record["date_scraped"])
|
|
61
|
+
errors << "Invalid date format for date_scraped: #{record["date_scraped"].inspect} is not a valid ISO 8601 date" if record["date_scraped"] && date_scraped.nil?
|
|
62
|
+
|
|
63
|
+
date_received = parse_date(record["date_received"])
|
|
64
|
+
if record["date_received"] && date_received.nil?
|
|
65
|
+
errors << "Invalid date format for date_received: #{record["date_received"].inspect} is not a valid ISO 8601 date"
|
|
66
|
+
elsif date_received && date_received.to_date > today
|
|
67
|
+
errors << "Invalid date for date_received: #{record["date_received"].inspect} is in the future"
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
%w[on_notice_from on_notice_to].each do |field|
|
|
71
|
+
val = parse_date(record[field])
|
|
72
|
+
errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date" if record[field] && val.nil?
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Returns value unchanged if it is already a Date or Time, or parses a YYYY-MM-DD string into a Date.
|
|
77
|
+
# Returns nil if unparseable or blank.
|
|
78
|
+
def self.parse_date(value)
|
|
79
|
+
return nil if value.nil? || value == ""
|
|
80
|
+
return value if value.is_a?(Date) || value.is_a?(Time)
|
|
81
|
+
return nil unless value.is_a?(String) && value =~ /\A\d{4}-\d{2}-\d{2}\z/
|
|
82
|
+
|
|
83
|
+
Date.parse(value)
|
|
84
|
+
rescue ArgumentError
|
|
85
|
+
nil
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|
|
@@ -78,30 +78,49 @@ module ScraperUtils
|
|
|
78
78
|
"#{prefix}#{authority_labels.first}#{suffix}"
|
|
79
79
|
end
|
|
80
80
|
|
|
81
|
+
# Finds records with duplicate [authority_label, council_reference] keys.
|
|
82
|
+
# @param records [Array<Hash>] All records to check
|
|
83
|
+
# @raise [UnprocessableSite] if more than one record shares the same [authority_label, council_reference] key (returns nil if all are unique)
|
|
84
|
+
def self.validate_unique_references!(records)
|
|
85
|
+
groups = records.group_by do |r|
|
|
86
|
+
[r["authority_label"], r["council_reference"]&.downcase]
|
|
87
|
+
end
|
|
88
|
+
duplicates = groups.select { |_k, g| g.size > 1 }
|
|
89
|
+
return if duplicates.empty?
|
|
90
|
+
|
|
91
|
+
raise UnprocessableSite, "Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
|
|
92
|
+
end
|
|
93
|
+
|
|
81
94
|
# Validates enough addresses are geocodable
|
|
82
95
|
# @param results [Array<Hash>] The results from scraping an authority
|
|
83
96
|
# @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
|
|
84
97
|
# @param variation [Integer] The variation allowed in addition to percentage (default:3)
|
|
98
|
+
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
99
|
+
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
85
100
|
# @raise RuntimeError if insufficient addresses are geocodable
|
|
86
|
-
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
|
|
101
|
+
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
|
|
87
102
|
return nil if results.empty?
|
|
88
103
|
|
|
89
104
|
geocodable = results
|
|
90
105
|
.map { |record| record["address"] }
|
|
91
106
|
.uniq
|
|
92
|
-
.count { |text| ScraperUtils::SpecSupport.geocodable? text }
|
|
107
|
+
.count { |text| ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case }
|
|
93
108
|
puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
|
|
94
109
|
"(#{(100.0 * geocodable / results.count).round(1)}%)"
|
|
95
110
|
expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
|
|
96
|
-
|
|
111
|
+
unless geocodable >= expected
|
|
112
|
+
raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
|
|
113
|
+
end
|
|
97
114
|
geocodable
|
|
98
115
|
end
|
|
99
116
|
|
|
100
117
|
# Check if an address is likely to be geocodable by analyzing its format.
|
|
101
118
|
# This is a bit stricter than needed - typically assert >= 75% match
|
|
102
119
|
# @param address [String] The address to check
|
|
120
|
+
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
121
|
+
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
103
122
|
# @return [Boolean] True if the address appears to be geocodable.
|
|
104
|
-
def self.geocodable?(address, ignore_case: false)
|
|
123
|
+
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
|
|
105
124
|
return false if address.nil? || address.empty?
|
|
106
125
|
check_address = ignore_case ? address.upcase : address
|
|
107
126
|
|
|
@@ -114,16 +133,17 @@ module ScraperUtils
|
|
|
114
133
|
|
|
115
134
|
uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
|
|
116
135
|
has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
|
|
136
|
+
has_known_suburb = known_suburbs.any? { |suburb| address.include?(suburb) }
|
|
117
137
|
|
|
118
138
|
if ENV["DEBUG"]
|
|
119
139
|
missing = []
|
|
120
140
|
missing << "street type" unless has_street_type
|
|
121
|
-
missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
|
|
141
|
+
missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
|
|
122
142
|
missing << "state" unless has_state
|
|
123
143
|
puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
|
|
124
144
|
end
|
|
125
145
|
|
|
126
|
-
has_street_type && (has_postcode || has_uppercase_suburb) && has_state
|
|
146
|
+
has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
|
|
127
147
|
end
|
|
128
148
|
|
|
129
149
|
PLACEHOLDERS = [
|
|
@@ -157,7 +177,7 @@ module ScraperUtils
|
|
|
157
177
|
puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
|
|
158
178
|
"(#{(100.0 * descriptions / results.count).round(1)}%)"
|
|
159
179
|
expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
|
|
160
|
-
raise "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
|
|
180
|
+
raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
|
|
161
181
|
descriptions
|
|
162
182
|
end
|
|
163
183
|
|
|
@@ -278,7 +298,7 @@ module ScraperUtils
|
|
|
278
298
|
next
|
|
279
299
|
end
|
|
280
300
|
|
|
281
|
-
raise "Expected 200 response, got #{page.code}" unless page.code == "200"
|
|
301
|
+
raise UnprocessableRecord, "Expected 200 response, got #{page.code}" unless page.code == "200"
|
|
282
302
|
|
|
283
303
|
page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
|
|
284
304
|
|
|
@@ -310,12 +330,10 @@ module ScraperUtils
|
|
|
310
330
|
min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
|
|
311
331
|
passed = count - failed
|
|
312
332
|
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
|
|
317
|
-
end
|
|
318
|
-
|
|
319
|
-
end
|
|
320
|
-
end
|
|
333
|
+
end
|
|
334
|
+
end
|
|
321
335
|
|
|
336
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
|
|
337
|
+
end
|
|
338
|
+
end
|
|
339
|
+
end
|
data/lib/scraper_utils.rb
CHANGED
|
@@ -7,9 +7,11 @@ require "scraper_utils/authority_utils"
|
|
|
7
7
|
require "scraper_utils/data_quality_monitor"
|
|
8
8
|
require "scraper_utils/db_utils"
|
|
9
9
|
require "scraper_utils/debug_utils"
|
|
10
|
+
require "scraper_utils/host_throttler"
|
|
10
11
|
require "scraper_utils/log_utils"
|
|
11
12
|
require "scraper_utils/maths_utils"
|
|
12
13
|
require "scraper_utils/misc_utils"
|
|
14
|
+
require "scraper_utils/pa_validation"
|
|
13
15
|
require "scraper_utils/spec_support"
|
|
14
16
|
|
|
15
17
|
# Mechanize utilities
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.14.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -97,11 +97,13 @@ files:
|
|
|
97
97
|
- lib/scraper_utils/data_quality_monitor.rb
|
|
98
98
|
- lib/scraper_utils/db_utils.rb
|
|
99
99
|
- lib/scraper_utils/debug_utils.rb
|
|
100
|
+
- lib/scraper_utils/host_throttler.rb
|
|
100
101
|
- lib/scraper_utils/log_utils.rb
|
|
101
102
|
- lib/scraper_utils/maths_utils.rb
|
|
102
103
|
- lib/scraper_utils/mechanize_utils.rb
|
|
103
104
|
- lib/scraper_utils/mechanize_utils/agent_config.rb
|
|
104
105
|
- lib/scraper_utils/misc_utils.rb
|
|
106
|
+
- lib/scraper_utils/pa_validation.rb
|
|
105
107
|
- lib/scraper_utils/spec_support.rb
|
|
106
108
|
- lib/scraper_utils/version.rb
|
|
107
109
|
- scraper_utils.gemspec
|
|
@@ -112,7 +114,7 @@ metadata:
|
|
|
112
114
|
allowed_push_host: https://rubygems.org
|
|
113
115
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
114
116
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
115
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
|
|
117
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
|
|
116
118
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
117
119
|
rubygems_mfa_required: 'true'
|
|
118
120
|
post_install_message:
|