scraper_utils 0.14.1 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 03b44a667992331d6e36bb6eca68afc286205846d7be06263694fed52b5e2d30
4
- data.tar.gz: 9f0dd276223f1b22dd688453e1769199cbda34efa5141d58e546a8ddcb85c795
3
+ metadata.gz: 9a1001f794ef04c587bb726157c66fc637fbb8525bac1c5be93a138e7f0a8266
4
+ data.tar.gz: f92023b5362c6b64ae74d0bf43cf613b02849687a46ec7fbb6b51c4b7ad397dc
5
5
  SHA512:
6
- metadata.gz: b42e0be0f9e42d9a83588cf7dcbb98ec079d01262340d2e6fef8ac7201c3d80faa645351631f60f767186721a58580f4f1e5e09c130a3a32aebb4f301dbfbdfc
7
- data.tar.gz: e3cec3345d0af13026259600a54e417efd0c36394f1bc22ecac1a25573551a3a2e51482b060ad1b72ed7ba4850d55bf9f8032321d1b8c1ae6eab581244e92410
6
+ metadata.gz: 88e952e952d59011018ca4721bde72d49c913beccccf098d62bb4d1313d0ca3bf94678ff27db5ba4cef3a674fefbebd067a5008e5f36a2029f2a9c8ac1689b15
7
+ data.tar.gz: 35601498d9d110d5d365aa7c1fddcfa74a86fde4b93537b44f8e00bb84f664ba455c642256c0032e221b484d986ea39b2d3ab743c94102b10c7bed1c397139d5
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.16.0 - 2026-04-08
4
+ * Use defaults from AgentConfig for `throttle_block`, and allow defaults to be overriden
5
+
6
+ ## 0.15.0 - 2026-03-05
7
+
8
+ * Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
9
+ * Fix pre_connect_hook hostname extraction to use `request['Host']` header
10
+
3
11
  ## 0.14.1 - 2026-03-04
4
12
 
5
13
  * Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
@@ -27,21 +27,20 @@ class Scraper
27
27
  begin
28
28
  ScraperUtils::DataQualityMonitor.start_authority(authority_label)
29
29
  YourScraper.scrape(authority_label) do |record|
30
- begin
31
- record["authority_label"] = authority_label.to_s
32
- ScraperUtils::DbUtils.save_record(record)
33
- rescue ScraperUtils::UnprocessableRecord => e
34
- # Log bad record but continue processing unless too many have occurred
35
- ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
36
- unprocessable_record_details << [e, record]
37
- end
30
+ record["authority_label"] = authority_label.to_s
31
+ ScraperUtils::DbUtils.save_record(record)
32
+ rescue ScraperUtils::UnprocessableRecord => e
33
+ # Log bad record but continue processing unless too many have occurred
34
+ ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
35
+ unprocessable_record_details << [e, record]
38
36
  end
39
37
  rescue StandardError => e
40
38
  warn "#{authority_label}: ERROR: #{e}"
41
39
  warn e.backtrace
42
40
  fatal_exception = e
43
41
  end
44
- [authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details, fatal_exception]
42
+ [authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details,
43
+ fatal_exception]
45
44
  end
46
45
 
47
46
  # Process authorities in parallel
@@ -54,7 +53,7 @@ class Scraper
54
53
  scrape_authority(authority_label, attempt)
55
54
  end.each do |authority_label, saves, unprocessable, fatal_exception|
56
55
  # Runs in main process
57
- status = fatal_exception ? 'FAILED' : 'OK'
56
+ status = fatal_exception ? "FAILED" : "OK"
58
57
  puts "Saving results of #{authority_label}: #{saves.size} records, #{unprocessable.size} unprocessable #{status}"
59
58
 
60
59
  saves.each do |record|
@@ -65,11 +64,11 @@ class Scraper
65
64
  exceptions[authority_label] = e
66
65
  end
67
66
 
68
- if fatal_exception
69
- puts " Warning: #{authority_label} failed with: #{fatal_exception.message}"
70
- puts " Saved #{saves.size} records before failure"
71
- exceptions[authority_label] = fatal_exception
72
- end
67
+ next unless fatal_exception
68
+
69
+ puts " Warning: #{authority_label} failed with: #{fatal_exception.message}"
70
+ puts " Saved #{saves.size} records before failure"
71
+ exceptions[authority_label] = fatal_exception
73
72
  end
74
73
 
75
74
  exceptions
@@ -96,7 +95,7 @@ class Scraper
96
95
  unless exceptions.empty?
97
96
  puts "\n***************************************************"
98
97
  puts "Now retrying authorities which earlier had failures"
99
- puts exceptions.keys.join(", ").to_s
98
+ puts exceptions.keys.join(", ")
100
99
  puts "***************************************************"
101
100
 
102
101
  start_time = Time.now
@@ -118,7 +117,7 @@ end
118
117
  if __FILE__ == $PROGRAM_NAME
119
118
  ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
120
119
 
121
- process_count = (ENV['MORPH_PROCESSES'] || Etc.nprocessors * 2).to_i
120
+ process_count = (ENV["MORPH_PROCESSES"] || (Etc.nprocessors * 2)).to_i
122
121
 
123
122
  Scraper.run(Scraper.selected_authorities, process_count: process_count)
124
123
  end
@@ -22,13 +22,11 @@ class Scraper
22
22
  # REPLACE section with:
23
23
  ScraperUtils::DataQualityMonitor.start_authority(authority_label)
24
24
  YourScraper.scrape(authority_label) do |record|
25
- begin
26
- record["authority_label"] = authority_label.to_s
27
- ScraperUtils::DbUtils.save_record(record)
28
- rescue ScraperUtils::UnprocessableRecord => e
29
- ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
30
- exceptions[authority_label] = e
31
- end
25
+ record["authority_label"] = authority_label.to_s
26
+ ScraperUtils::DbUtils.save_record(record)
27
+ rescue ScraperUtils::UnprocessableRecord => e
28
+ ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
29
+ exceptions[authority_label] = e
32
30
  end
33
31
  # END OF REPLACE
34
32
  rescue StandardError => e
@@ -61,7 +59,7 @@ class Scraper
61
59
  puts "Now retrying authorities which earlier had failures"
62
60
  puts exceptions.keys.join(", ")
63
61
  puts "***************************************************"
64
- ENV['DEBUG'] ||= '1'
62
+ ENV["DEBUG"] ||= "1"
65
63
 
66
64
  start_time = Time.now
67
65
  exceptions = scrape(exceptions.keys, 2)
@@ -85,12 +83,11 @@ if __FILE__ == $PROGRAM_NAME
85
83
  # some: url-for-issue Summary Reason
86
84
  # councils: url-for-issue Summary Reason
87
85
 
88
- if ENV['MORPH_EXPECT_BAD'].nil?
89
- default_expect_bad = {
90
- }
91
- puts 'Default EXPECT_BAD:', default_expect_bad.to_yaml if default_expect_bad.any?
86
+ if ENV["MORPH_EXPECT_BAD"].nil?
87
+ default_expect_bad = {}
88
+ puts "Default EXPECT_BAD:", default_expect_bad.to_yaml if default_expect_bad.any?
92
89
 
93
- ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(',')
90
+ ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(",")
94
91
  end
95
92
  # If the sites have many unusable records - raise defaults
96
93
  # ENV['MORPH_UNPROCESSABLE_BASE'] ||= "10"
@@ -18,11 +18,5 @@ The throttle automatically:
18
18
  - Pauses before next request based on previous timing
19
19
  - Caps pause at 120s maximum
20
20
 
21
- Override the next pause duration manually if needed:
22
-
23
- ```ruby
24
- ScraperUtils::MiscUtils.pause_duration = 2.0
25
- ```
26
-
27
21
  **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
28
22
  each request is made and thus does not need to be wrapped with the helper.
@@ -26,7 +26,7 @@ if File.exist?(config_file)
26
26
  config = YAML.safe_load(File.read(config_file), symbolize_names: true)
27
27
  options.merge!(config) if config
28
28
  puts "Loaded config from #{config_file}"
29
- rescue => e
29
+ rescue StandardError => e
30
30
  puts "Warning: Could not load #{config_file}: #{e.message}"
31
31
  end
32
32
  end
@@ -38,19 +38,23 @@ OptionParser.new do |opts|
38
38
  options[:database] = db
39
39
  end
40
40
 
41
- opts.on("-g", "--geocodable-percentage N", Integer, "Min percentage of geocodable addresses (default: 50)") do |n|
41
+ opts.on("-g", "--geocodable-percentage N", Integer,
42
+ "Min percentage of geocodable addresses (default: 50)") do |n|
42
43
  options[:geocodable_percentage] = n
43
44
  end
44
45
 
45
- opts.on("-r", "--description-percentage N", Integer, "Min percentage of reasonable descriptions (default: 50)") do |n|
46
+ opts.on("-r", "--description-percentage N", Integer,
47
+ "Min percentage of reasonable descriptions (default: 50)") do |n|
46
48
  options[:description_percentage] = n
47
49
  end
48
50
 
49
- opts.on("-u", "--info-url-percentage N", Integer, "Min percentage for info URL validation (default: 75)") do |n|
51
+ opts.on("-u", "--info-url-percentage N", Integer,
52
+ "Min percentage for info URL validation (default: 75)") do |n|
50
53
  options[:info_url_percentage] = n
51
54
  end
52
55
 
53
- opts.on("-v", "--variation N", Integer, "Variation tolerance for all validations (default: 3)") do |n|
56
+ opts.on("-v", "--variation N", Integer,
57
+ "Variation tolerance for all validations (default: 3)") do |n|
54
58
  options[:geocodable_variation] = n
55
59
  options[:description_variation] = n
56
60
  options[:info_url_variation] = n
@@ -60,11 +64,13 @@ OptionParser.new do |opts|
60
64
  options[:bot_check_expected] = true
61
65
  end
62
66
 
63
- opts.on("-i", "--global-info-url URL", "Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
67
+ opts.on("-i", "--global-info-url URL",
68
+ "Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
64
69
  options[:global_info_url] = url
65
70
  end
66
71
 
67
- opts.on("-c", "--config FILE", "Load config from YAML file (default: .scraper_validation.yml)") do |file|
72
+ opts.on("-c", "--config FILE",
73
+ "Load config from YAML file (default: .scraper_validation.yml)") do |file|
68
74
  config_file = file
69
75
  end
70
76
 
@@ -142,7 +148,6 @@ begin
142
148
 
143
149
  puts
144
150
  puts "✅ All validations passed!"
145
-
146
151
  rescue RuntimeError => e
147
152
  puts
148
153
  puts "❌ Validation failed: #{e.message}"
@@ -30,8 +30,13 @@ module ScraperUtils
30
30
  # Initial base of 5.01 (override using MORPH_UNPROCESSABLE_BASE)
31
31
  # Initial percentage of 10% (override using MORPH_UNPROCESSABLE_PERCENTAGE)
32
32
  def self.threshold(authority_label)
33
- ENV.fetch('MORPH_UNPROCESSABLE_BASE', 5.01).to_f +
34
- (@stats[authority_label][:saved].to_i * ENV.fetch('MORPH_UNPROCESSABLE_PERCENTAGE', 10.0).to_f / 100.0) if @stats&.fetch(authority_label, nil)
33
+ if @stats&.fetch(
34
+ authority_label, nil
35
+ )
36
+ ENV.fetch("MORPH_UNPROCESSABLE_BASE", 5.01).to_f +
37
+ (@stats[authority_label][:saved].to_i * ENV.fetch("MORPH_UNPROCESSABLE_PERCENTAGE",
38
+ 10.0).to_f / 100.0)
39
+ end
35
40
  end
36
41
 
37
42
  # Logs an unprocessable record and raises an exception if error threshold is exceeded
@@ -44,7 +49,7 @@ module ScraperUtils
44
49
  def self.log_unprocessable_record(exception, record)
45
50
  authority_label = extract_authority(record)
46
51
  @stats[authority_label][:unprocessed] += 1
47
- details = if record&.key?('council_reference') && record&.key?('address')
52
+ details = if record&.key?("council_reference") && record&.key?("address")
48
53
  "#{record['council_reference']} - #{record['address']}"
49
54
  else
50
55
  record.inspect
@@ -64,7 +69,7 @@ module ScraperUtils
64
69
  def self.log_saved_record(record)
65
70
  authority_label = extract_authority(record)
66
71
  @stats[authority_label][:saved] += 1
67
- ScraperUtils::LogUtils.log "Saving record #{authority_label&.empty? ? '' : "for #{authority_label}: "}#{record['council_reference']} - #{record['address']}"
72
+ ScraperUtils::LogUtils.log "Saving record #{"for #{authority_label}: " unless authority_label&.empty?}#{record['council_reference']} - #{record['address']}"
68
73
  end
69
74
  end
70
75
  end
@@ -63,12 +63,16 @@ module ScraperUtils
63
63
  LogUtils.log "Deleting #{deleted_count} applications scraped between #{oldest_date} and #{cutoff_date}"
64
64
  ScraperWiki.sqliteexecute("DELETE FROM data WHERE date_scraped < ?", [cutoff_date])
65
65
 
66
- return unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
66
+ unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
67
+ return
68
+ end
67
69
 
68
70
  LogUtils.log " Running VACUUM to reclaim space..."
69
71
  ScraperWiki.sqliteexecute("VACUUM")
70
72
  rescue SqliteMagic::NoSuchTable => e
71
- ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
73
+ if ScraperUtils::DebugUtils.trace?
74
+ ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records"
75
+ end
72
76
  end
73
77
  end
74
78
  end
@@ -18,7 +18,7 @@ module ScraperUtils
18
18
  # Checks DEBUG and MORPH_DEBUG env variables
19
19
  # @return [Integer] Debug level
20
20
  def self.debug_level
21
- debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
21
+ debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, "0"))
22
22
  debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
23
23
  end
24
24
 
@@ -48,7 +48,6 @@ module ScraperUtils
48
48
  debug?(TRACE_LEVEL)
49
49
  end
50
50
 
51
-
52
51
  # Logs details of an HTTP request when debug mode is enabled
53
52
  #
54
53
  # @param http_method [String] HTTP http_method (GET, POST, etc.)
@@ -21,10 +21,14 @@ module ScraperUtils
21
21
  @crawl_delay = crawl_delay.to_f
22
22
  # Clamp between 10 (delay 9x response) and 100 (no extra delay)
23
23
  @max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
24
- @next_request_at = {} # hostname => Time
24
+ @next_request_at = {} # hostname => Time
25
25
  @request_started_at = {} # hostname => Time
26
26
  end
27
27
 
28
+ def will_pause_till(hostname)
29
+ @next_request_at[hostname]
30
+ end
31
+
28
32
  # Sleep until this host's throttle window has elapsed.
29
33
  # Records when the request actually started.
30
34
  # @param hostname [String]
@@ -48,23 +52,19 @@ module ScraperUtils
48
52
  response_time = Time.now - started
49
53
 
50
54
  delay = @crawl_delay
51
- if @max_load
52
- delay += (100.0 - @max_load) * response_time / @max_load
53
- end
55
+ delay += (100.0 - @max_load) * response_time / @max_load if @max_load
54
56
 
55
- if overloaded
56
- delay = delay + response_time * 2 + 5.0
57
- end
57
+ delay = delay + (response_time * 2) + 5.0 if overloaded
58
58
 
59
59
  delay = delay.round(3).clamp(0.0, MAX_DELAY)
60
60
  @next_request_at[hostname] = Time.now + delay
61
61
 
62
- if DebugUtils.basic?
63
- msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
64
- msg += " OVERLOADED" if overloaded
65
- msg += ", Will delay #{delay}s before next request"
66
- LogUtils.log(msg)
67
- end
62
+ return unless DebugUtils.basic?
63
+
64
+ msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
65
+ msg += " OVERLOADED" if overloaded
66
+ msg += ", Will delay #{delay}s before next request"
67
+ LogUtils.log(msg)
68
68
  end
69
69
 
70
70
  # Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
@@ -14,7 +14,7 @@ module ScraperUtils
14
14
  # @param message [String] the message to log
15
15
  # @return [void]
16
16
  def self.log(message, authority = nil)
17
- authority ||= ENV['AUTHORITY']
17
+ authority ||= ENV.fetch("AUTHORITY", nil)
18
18
  $stderr.flush
19
19
  if authority
20
20
  puts "[#{authority}] #{message}"
@@ -85,7 +85,7 @@ module ScraperUtils
85
85
  failed
86
86
  )
87
87
 
88
- DbUtils::cleanup_old_records
88
+ DbUtils.cleanup_old_records
89
89
  end
90
90
 
91
91
  # Extracts the first relevant line from backtrace that's from our project
@@ -104,15 +104,15 @@ module ScraperUtils
104
104
  format = options[:format] || false
105
105
 
106
106
  # Normalize the root directory path with a trailing slash
107
- pwd = File.join(pwd, '')
107
+ pwd = File.join(pwd, "")
108
108
 
109
109
  backtrace.each do |line|
110
- next if line.include?('/gems/') ||
111
- line.include?('/vendor/') ||
112
- line.include?('/ruby/')
110
+ next if line.include?("/gems/") ||
111
+ line.include?("/vendor/") ||
112
+ line.include?("/ruby/")
113
113
 
114
114
  if line.start_with?(pwd)
115
- relative_path = line.sub(pwd, '')
115
+ relative_path = line.sub(pwd, "")
116
116
  return format ? " [#{relative_path}]" : relative_path
117
117
  end
118
118
  end
@@ -138,7 +138,7 @@ module ScraperUtils
138
138
  puts "\nScraping Summary:"
139
139
  summary_format = "%-20s %6s %6s %s"
140
140
 
141
- puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
141
+ puts format(summary_format, "Authority", "OK", "Bad", "Exception")
142
142
  puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)
143
143
 
144
144
  authorities.each do |authority|
@@ -149,7 +149,8 @@ module ScraperUtils
149
149
 
150
150
  expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
151
151
  exception_msg = if exceptions[authority]
152
- location = self.project_backtrace_line(exceptions[authority].backtrace, format: true)
152
+ location = project_backtrace_line(exceptions[authority].backtrace,
153
+ format: true)
153
154
  "#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
154
155
  else
155
156
  "-"
@@ -174,12 +175,12 @@ module ScraperUtils
174
175
 
175
176
  # Check for authorities with unexpected errors
176
177
  unexpected_errors = authorities
177
- .select { |authority| exceptions[authority] }
178
- .reject { |authority| expect_bad.include?(authority) }
178
+ .select { |authority| exceptions[authority] }
179
+ .reject { |authority| expect_bad.include?(authority) }
179
180
 
180
181
  if unexpected_errors.any?
181
182
  errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
182
- "(Add to MORPH_EXPECT_BAD?)"
183
+ "(Add to MORPH_EXPECT_BAD?)"
183
184
  unexpected_errors.each do |authority|
184
185
  error = exceptions[authority]
185
186
  errors << " #{authority}: #{error.class} - #{error}"
@@ -228,7 +229,8 @@ module ScraperUtils
228
229
  # Moved to DbUtils
229
230
  # :nocov:
230
231
  def self.cleanup_old_records(force: false)
231
- warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.", category: :deprecated
232
+ warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.",
233
+ category: :deprecated
232
234
  ScraperUtils::DbUtils.cleanup_old_records(force: force)
233
235
  end
234
236
  # :nocov:
@@ -239,7 +241,9 @@ module ScraperUtils
239
241
 
240
242
  lines = []
241
243
  error.backtrace.each do |line|
242
- lines << line if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
244
+ if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
245
+ lines << line
246
+ end
243
247
  break if lines.length >= 6
244
248
  end
245
249
 
@@ -11,7 +11,8 @@ module ScraperUtils
11
11
  def self.fibonacci_series(max)
12
12
  result = []
13
13
  # Start with the basic Fibonacci sequence
14
- last_fib, this_fib = 1, 0
14
+ last_fib = 1
15
+ this_fib = 0
15
16
  while this_fib <= max
16
17
  result << this_fib
17
18
  yield this_fib if block_given?
@@ -61,12 +61,12 @@ module ScraperUtils
61
61
  # Reset all configuration options to their default values
62
62
  # @return [void]
63
63
  def reset_defaults!
64
- @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
65
- @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
66
- @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
67
- @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
68
- @default_crawl_delay = ENV.fetch('MORPH_CLIENT_CRAWL_DELAY', DEFAULT_CRAWL_DELAY)
69
- @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD)
64
+ @default_timeout = ENV.fetch("MORPH_CLIENT_TIMEOUT", DEFAULT_TIMEOUT).to_i # 60
65
+ @default_disable_ssl_certificate_check = !ENV.fetch("MORPH_DISABLE_SSL_CHECK", nil).to_s.empty? # false
66
+ @default_australian_proxy = !ENV.fetch("MORPH_USE_PROXY", nil).to_s.empty? # false
67
+ @default_user_agent = ENV.fetch("MORPH_USER_AGENT", nil) # Uses Mechanize user agent
68
+ @default_crawl_delay = ENV.fetch("MORPH_CLIENT_CRAWL_DELAY", DEFAULT_CRAWL_DELAY)
69
+ @default_max_load = ENV.fetch("MORPH_MAX_LOAD", DEFAULT_MAX_LOAD)
70
70
  end
71
71
  end
72
72
 
@@ -113,10 +113,10 @@ module ScraperUtils
113
113
  @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
114
114
  if @australian_proxy
115
115
  uri = begin
116
- URI.parse(ScraperUtils.australian_proxy.to_s)
117
- rescue URI::InvalidURIError => e
118
- raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
119
- end
116
+ URI.parse(ScraperUtils.australian_proxy.to_s)
117
+ rescue URI::InvalidURIError => e
118
+ raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
119
+ end
120
120
  unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
121
121
  raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
122
122
  end
@@ -177,13 +177,13 @@ module ScraperUtils
177
177
  end
178
178
 
179
179
  def pre_connect_hook(_agent, request)
180
- hostname = request.respond_to?(:uri) ? request.uri.host : 'unknown'
180
+ hostname = (request.respond_to?(:[]) && request["Host"]) || "unknown"
181
181
  @throttler.before_request(hostname)
182
- if DebugUtils.verbose?
183
- ScraperUtils::LogUtils.log(
184
- "Pre Connect request: #{request.inspect}"
185
- )
186
- end
182
+ return unless DebugUtils.verbose?
183
+
184
+ ScraperUtils::LogUtils.log(
185
+ "Pre Connect request: #{request.inspect}"
186
+ )
187
187
  end
188
188
 
189
189
  def post_connect_hook(_agent, uri, response, _body)
@@ -191,7 +191,8 @@ module ScraperUtils
191
191
 
192
192
  status = response.respond_to?(:code) ? response.code.to_i : nil
193
193
  overloaded = [429, 500, 503].include?(status)
194
- @throttler.after_request(uri.host, overloaded: overloaded)
194
+ hostname = uri.host || "unknown"
195
+ @throttler.after_request(hostname, overloaded: overloaded)
195
196
 
196
197
  if DebugUtils.basic?
197
198
  ScraperUtils::LogUtils.log(
@@ -204,11 +205,7 @@ module ScraperUtils
204
205
  def error_hook(_agent, error)
205
206
  # Best-effort: record the error against whatever host we can find
206
207
  # Mechanize errors often carry the URI in the message; fall back to 'unknown'
207
- hostname = if error.respond_to?(:uri)
208
- error.uri.host
209
- else
210
- 'unknown'
211
- end
208
+ hostname = (error.uri.host if error.respond_to?(:uri)) || "unknown"
212
209
  @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
213
210
  end
214
211
 
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "host_throttler"
4
+ require_relative "mechanize_utils/agent_config"
4
5
 
5
6
  module ScraperUtils
6
7
  # Misc Standalone Utilities
@@ -8,6 +9,14 @@ module ScraperUtils
8
9
  THROTTLE_HOSTNAME = "block"
9
10
 
10
11
  class << self
12
+ attr_accessor :default_crawl_delay, :default_max_load
13
+
14
+ def reset_defaults!
15
+ @default_crawl_delay = MechanizeUtils::AgentConfig.default_crawl_delay
16
+ @default_max_load = MechanizeUtils::AgentConfig.default_max_load
17
+ reset_throttler!
18
+ end
19
+
11
20
  # Throttle block to be nice to servers we are scraping.
12
21
  # Time spent inside the block (parsing, saving) counts toward the delay.
13
22
  def throttle_block
@@ -27,11 +36,22 @@ module ScraperUtils
27
36
  @throttler = nil
28
37
  end
29
38
 
39
+ def will_pause_till
40
+ throttler.will_pause_till(THROTTLE_HOSTNAME)
41
+ end
42
+
30
43
  private
31
44
 
32
45
  def throttler
33
- @throttler ||= HostThrottler.new
46
+ @throttler ||= HostThrottler.new(
47
+ crawl_delay: default_crawl_delay,
48
+ max_load: default_max_load
49
+ )
34
50
  end
35
51
  end
52
+
53
+ # Initialise defaults after AgentConfig is loaded
54
+ require_relative "mechanize_utils/agent_config"
55
+ reset_defaults!
36
56
  end
37
57
  end
@@ -31,8 +31,6 @@ module ScraperUtils
31
31
  errors.empty? ? nil : errors
32
32
  end
33
33
 
34
- private
35
-
36
34
  def self.validate_presence(record, errors)
37
35
  REQUIRED_FIELDS.each do |field|
38
36
  errors << "#{field} can't be blank" if record[field].to_s.strip.empty?
@@ -47,10 +45,10 @@ module ScraperUtils
47
45
  begin
48
46
  uri = URI.parse(url)
49
47
  unless uri.is_a?(URI::HTTP) && uri.host.to_s != ""
50
- errors << "info_url must be a valid http\/https URL with host"
48
+ errors << "info_url must be a valid http/https URL with host"
51
49
  end
52
50
  rescue URI::InvalidURIError
53
- errors << "info_url must be a valid http\/https URL"
51
+ errors << "info_url must be a valid http/https URL"
54
52
  end
55
53
  end
56
54
 
@@ -58,18 +56,22 @@ module ScraperUtils
58
56
  today = Date.today
59
57
 
60
58
  date_scraped = parse_date(record["date_scraped"])
61
- errors << "Invalid date format for date_scraped: #{record["date_scraped"].inspect} is not a valid ISO 8601 date" if record["date_scraped"] && date_scraped.nil?
59
+ if record["date_scraped"] && date_scraped.nil?
60
+ errors << "Invalid date format for date_scraped: #{record['date_scraped'].inspect} is not a valid ISO 8601 date"
61
+ end
62
62
 
63
63
  date_received = parse_date(record["date_received"])
64
64
  if record["date_received"] && date_received.nil?
65
- errors << "Invalid date format for date_received: #{record["date_received"].inspect} is not a valid ISO 8601 date"
65
+ errors << "Invalid date format for date_received: #{record['date_received'].inspect} is not a valid ISO 8601 date"
66
66
  elsif date_received && date_received.to_date > today
67
- errors << "Invalid date for date_received: #{record["date_received"].inspect} is in the future"
67
+ errors << "Invalid date for date_received: #{record['date_received'].inspect} is in the future"
68
68
  end
69
69
 
70
70
  %w[on_notice_from on_notice_to].each do |field|
71
71
  val = parse_date(record[field])
72
- errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date" if record[field] && val.nil?
72
+ if record[field] && val.nil?
73
+ errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date"
74
+ end
73
75
  end
74
76
  end
75
77
 
@@ -47,34 +47,43 @@ module ScraperUtils
47
47
 
48
48
  PLANNING_KEYWORDS = [
49
49
  # Building types
50
- 'dwelling', 'house', 'unit', 'building', 'structure', 'facility',
50
+ "dwelling", "house", "unit", "building", "structure", "facility",
51
51
  # Modifications
52
- 'addition', 'extension', 'renovation', 'alteration', 'modification',
53
- 'replacement', 'upgrade', 'improvement',
52
+ "addition", "extension", "renovation", "alteration", "modification",
53
+ "replacement", "upgrade", "improvement",
54
54
  # Specific structures
55
- 'carport', 'garage', 'shed', 'pool', 'deck', 'patio', 'pergola',
56
- 'verandah', 'balcony', 'fence', 'wall', 'driveway',
55
+ "carport", "garage", "shed", "pool", "deck", "patio", "pergola",
56
+ "verandah", "balcony", "fence", "wall", "driveway",
57
57
  # Development types
58
- 'subdivision', 'demolition', 'construction', 'development',
58
+ "subdivision", "demolition", "construction", "development",
59
59
  # Services/utilities
60
- 'signage', 'telecommunications', 'stormwater', 'water', 'sewer',
60
+ "signage", "telecommunications", "stormwater", "water", "sewer",
61
61
  # Approvals/certificates
62
- 'certificate', 'approval', 'consent', 'permit'
62
+ "certificate", "approval", "consent", "permit"
63
63
  ].freeze
64
64
 
65
+ def self.fetch_url_head(url)
66
+ agent = Mechanize.new
67
+ # FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
68
+ agent.head(url)
69
+ end
70
+
65
71
  def self.fetch_url_with_redirects(url)
66
72
  agent = Mechanize.new
67
- # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
73
+ # FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
68
74
  agent.get(url)
69
75
  end
70
76
 
71
- def self.authority_label(results, prefix: '', suffix: '')
77
+ def self.authority_label(results, prefix: "", suffix: "")
72
78
  return nil if results.nil?
73
79
 
74
- authority_labels = results.map { |record| record['authority_label'] }.compact.uniq
80
+ authority_labels = results.map { |record| record["authority_label"] }.compact.uniq
75
81
  return nil if authority_labels.empty?
76
82
 
77
- raise "Expected one authority_label, not #{authority_labels.inspect}" if authority_labels.size > 1
83
+ if authority_labels.size > 1
84
+ raise "Expected one authority_label, not #{authority_labels.inspect}"
85
+ end
86
+
78
87
  "#{prefix}#{authority_labels.first}#{suffix}"
79
88
  end
80
89
 
@@ -88,7 +97,8 @@ module ScraperUtils
88
97
  duplicates = groups.select { |_k, g| g.size > 1 }
89
98
  return if duplicates.empty?
90
99
 
91
- raise UnprocessableSite, "Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
100
+ raise UnprocessableSite,
101
+ "Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
92
102
  end
93
103
 
94
104
  # Validates enough addresses are geocodable
@@ -98,19 +108,32 @@ module ScraperUtils
98
108
  # @param ignore_case [Boolean] Ignores case which relaxes suburb check
99
109
  # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
100
110
  # @raise RuntimeError if insufficient addresses are geocodable
101
- def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
111
+ def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
112
+ ignore_case: false, known_suburbs: [])
102
113
  return nil if results.empty?
103
114
 
104
115
  geocodable = results
105
- .map { |record| record["address"] }
106
- .uniq
107
- .count { |text| ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case }
116
+ .map { |record| record["address"] }
117
+ .uniq
118
+ .count do |text|
119
+ ok = ScraperUtils::SpecSupport.geocodable? text,
120
+ known_suburbs: known_suburbs, ignore_case: ignore_case
121
+ if !ok && DebugUtils.verbose?
122
+ ScraperUtils::LogUtils.log(
123
+ "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
124
+ )
125
+ end
126
+
127
+ ok
128
+ end
108
129
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
109
- "(#{(100.0 * geocodable / results.count).round(1)}%)"
110
- expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
130
+ "(#{(100.0 * geocodable / results.count).round(1)}%)"
131
+ expected = [(((percentage.to_f / 100.0) * results.count) - variation), 1].max
111
132
  unless geocodable >= expected
112
- raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
133
+ raise UnprocessableSite,
134
+ "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
113
135
  end
136
+
114
137
  geocodable
115
138
  end
116
139
 
@@ -122,10 +145,13 @@ module ScraperUtils
122
145
  # @return [Boolean] True if the address appears to be geocodable.
123
146
  def self.geocodable?(address, ignore_case: false, known_suburbs: [])
124
147
  return false if address.nil? || address.empty?
148
+
125
149
  check_address = ignore_case ? address.upcase : address
126
150
 
127
151
  # Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
128
- has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
152
+ has_state = AUSTRALIAN_STATES.any? do |state|
153
+ check_address.end_with?(" #{state}") || check_address.include?(" #{state} ")
154
+ end
129
155
  has_postcode = address.match?(AUSTRALIAN_POSTCODES)
130
156
 
131
157
  # Using the pre-compiled patterns
@@ -138,9 +164,13 @@ module ScraperUtils
138
164
  if ENV["DEBUG"]
139
165
  missing = []
140
166
  missing << "street type" unless has_street_type
141
- missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
167
+ unless has_postcode || has_uppercase_suburb || has_known_suburb
168
+ missing << "postcode/Uppercase suburb/Known suburb"
169
+ end
142
170
  missing << "state" unless has_state
143
- puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
171
+ if missing.any?
172
+ puts " address: #{address} is not geocodable, missing #{missing.join(', ')}"
173
+ end
144
174
  end
145
175
 
146
176
  has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
@@ -167,17 +197,21 @@ module ScraperUtils
167
197
  return nil if results.empty?
168
198
 
169
199
  descriptions = results
170
- .map { |record| record["description"] }
171
- .uniq
172
- .count do |text|
200
+ .map { |record| record["description"] }
201
+ .uniq
202
+ .count do |text|
173
203
  selected = ScraperUtils::SpecSupport.reasonable_description? text
174
204
  puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
175
205
  selected
176
206
  end
177
207
  puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
178
- "(#{(100.0 * descriptions / results.count).round(1)}%)"
179
- expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
180
- raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
208
+ "(#{(100.0 * descriptions / results.count).round(1)}%)"
209
+ expected = [((percentage.to_f / 100.0) * results.count) - variation, 1].max
210
+ unless descriptions >= expected
211
+ raise UnprocessableSite,
212
+ "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}"
213
+ end
214
+
181
215
  descriptions
182
216
  end
183
217
 
@@ -200,7 +234,8 @@ module ScraperUtils
200
234
  # @param bot_check_expected [Boolean] Whether bot protection is acceptable
201
235
  # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
202
236
  # @raise RuntimeError if records don't use the expected URL or it doesn't return 200
203
- def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false, &block)
237
+ def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
238
+ &block)
204
239
  info_urls = results.map { |record| record["info_url"] }.uniq
205
240
 
206
241
  unless info_urls.size == 1
@@ -223,6 +258,22 @@ module ScraperUtils
223
258
  end
224
259
  end
225
260
 
261
+ # Validates that info_urls are present (respond to HEAD requests with a 200 to 299 status)
262
+ # @param results [Array<Hash>] The results from scraping an authority
263
+ # @param percentage [Integer] The min percentage of info_url checks expected to pass (default:75)
264
+ # @param variation [Integer] The variation allowed in addition to percentage (default:3)
265
+ # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
266
+ # @raise RuntimeError if insufficient info_url checks pass
267
+ def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
268
+ if defined?(VCR)
269
+ VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
270
+ check_info_url_is_present(results, percentage, variation, &block)
271
+ end
272
+ else
273
+ check_info_url_is_present(results, percentage, variation, &block)
274
+ end
275
+ end
276
+
226
277
  # Validates that info_urls have expected details (unique URLs with content validation)
227
278
  # @param results [Array<Hash>] The results from scraping an authority
228
279
  # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
@@ -230,7 +281,8 @@ module ScraperUtils
230
281
  # @param bot_check_expected [Boolean] Whether bot protection is acceptable
231
282
  # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
232
283
  # @raise RuntimeError if insufficient detail checks pass
233
- def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false, &block)
284
+ def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
285
+ bot_check_expected: false, &block)
234
286
  if defined?(VCR)
235
287
  VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
236
288
  check_info_url_details(results, percentage, variation, bot_check_expected, &block)
@@ -252,15 +304,15 @@ module ScraperUtils
252
304
 
253
305
  # Check for common bot protection indicators
254
306
  bot_indicators = [
255
- 'recaptcha',
256
- 'cloudflare',
257
- 'are you human',
258
- 'bot detection',
259
- 'security check',
260
- 'verify you are human',
261
- 'access denied',
262
- 'blocked',
263
- 'captcha'
307
+ "recaptcha",
308
+ "cloudflare",
309
+ "are you human",
310
+ "bot detection",
311
+ "security check",
312
+ "verify you are human",
313
+ "access denied",
314
+ "blocked",
315
+ "captcha"
264
316
  ]
265
317
 
266
318
  bot_indicators.any? { |indicator| body_lower.include?(indicator) }
@@ -276,10 +328,51 @@ module ScraperUtils
276
328
  return
277
329
  end
278
330
 
279
- raise "Expected 200 response from the one expected info_url, got #{page.code}" unless page.code == "200"
331
+ return if page.code == "200"
332
+
333
+ raise "Expected 200 response from the one expected info_url, got #{page.code}"
280
334
  end
281
335
 
282
- private
336
+ def self.check_info_url_is_present(results, percentage, variation, &block)
337
+ count = 0
338
+ failed = 0
339
+ fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
340
+
341
+ fib_indices.each do |index|
342
+ record = results[index]
343
+ info_url = record["info_url"]
344
+ puts "Checking info_url[#{index}]: #{info_url} is present..."
345
+
346
+ begin
347
+ page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
348
+ status = page.code.to_i
349
+ rescue Mechanize::ResponseCodeError => e
350
+ status = e.response_code.to_i
351
+ end
352
+
353
+ if [403, 429].include?(status)
354
+ puts " Bot protection detected - skipping"
355
+ next
356
+ end
357
+
358
+ count += 1
359
+ if status.between?(200, 299)
360
+ puts " OK: #{status}" if ENV["DEBUG"]
361
+ else
362
+ failed += 1
363
+ puts " Failed: #{status}"
364
+ min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
365
+ passed = count - failed
366
+ if passed < min_required
367
+ raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
368
+ end
369
+ end
370
+ end
371
+
372
+ return unless count > 0
373
+
374
+ puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!"
375
+ end
283
376
 
284
377
  def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
285
378
  count = 0
@@ -298,7 +391,10 @@ module ScraperUtils
298
391
  next
299
392
  end
300
393
 
301
- raise UnprocessableRecord, "Expected 200 response, got #{page.code}" unless page.code == "200"
394
+ unless page.code == "200"
395
+ raise UnprocessableRecord,
396
+ "Expected 200 response, got #{page.code}"
397
+ end
302
398
 
303
399
  page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
304
400
 
@@ -306,34 +402,40 @@ module ScraperUtils
306
402
  count += 1
307
403
  expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
308
404
  expected2 = case attribute
309
- when 'council_reference'
310
- expected.sub(/\ADA\s*-\s*/, '')
311
- when 'address'
312
- expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, '') # Handle Lismore post-code/state swap
405
+ when "council_reference"
406
+ expected.sub(/\ADA\s*-\s*/, "")
407
+ when "address"
408
+ expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, "") # Handle Lismore post-code/state swap
313
409
  else
314
410
  expected
315
411
  end
316
412
  expected3 = case attribute
317
- when 'address'
318
- expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, '')
413
+ when "address"
414
+ expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, "")
319
415
  else
320
416
  expected
321
- end.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-')
322
- next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-').include?(expected3)
417
+ end.gsub(/\s*,\s*/, " ").gsub(/\s*-\s*/, "-")
418
+ next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, " ").gsub(
419
+ /\s*-\s*/, "-"
420
+ ).include?(expected3)
323
421
 
324
422
  failed += 1
325
- desc2 = expected2 == expected ? '' : " or #{expected2.inspect}"
326
- desc3 = expected3 == expected ? '' : " or #{expected3.inspect}"
423
+ desc2 = expected2 == expected ? "" : " or #{expected2.inspect}"
424
+ desc3 = expected3 == expected ? "" : " or #{expected3.inspect}"
327
425
  puts " Missing: #{expected.inspect}#{desc2}#{desc3}"
328
- puts " IN: #{page_body}" if ENV['DEBUG']
426
+ puts " IN: #{page_body}" if ENV["DEBUG"]
329
427
 
330
- min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
428
+ min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
331
429
  passed = count - failed
332
- raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
430
+ if passed < min_required
431
+ raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
432
+ end
333
433
  end
334
434
  end
335
435
 
336
- puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
436
+ return unless count > 0
437
+
438
+ puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!"
337
439
  end
338
440
  end
339
441
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.14.1"
4
+ VERSION = "0.16.0"
5
5
  end
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
13
13
 
14
14
  spec.summary = "planningalerts scraper utilities"
15
15
  spec.description = "Utilities to help make planningalerts scrapers, " \
16
- "especially multi authority scrapers, easier to develop, run and debug."
16
+ "especially multi authority scrapers, easier to develop, run and debug."
17
17
  spec.homepage = "https://github.com/ianheggie-oaf/#{spec.name}"
18
18
  spec.license = "MIT"
19
19
 
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
23
23
  spec.metadata["homepage_uri"] = spec.homepage
24
24
  spec.metadata["source_code_uri"] = spec.homepage
25
25
  spec.metadata["documentation_uri"] = "https://rubydoc.info/gems/#{spec.name}/#{ScraperUtils::VERSION}"
26
- spec.metadata["changelog_uri"] = "#{spec.metadata["source_code_uri"]}/blob/main/CHANGELOG.md"
26
+ spec.metadata["changelog_uri"] = "#{spec.metadata['source_code_uri']}/blob/main/CHANGELOG.md"
27
27
  else
28
28
  raise "RubyGems 2.0 or newer is required to protect against " \
29
- "public gem pushes."
29
+ "public gem pushes."
30
30
  end
31
31
 
32
32
  # Specify which files should be added to the gem when it is released.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.1
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-04 00:00:00.000000000 Z
11
+ date: 2026-04-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -114,7 +114,7 @@ metadata:
114
114
  allowed_push_host: https://rubygems.org
115
115
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
116
116
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
117
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
117
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.16.0
118
118
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
119
119
  rubygems_mfa_required: 'true'
120
120
  post_install_message: