RubyGems - scraper_utils - Versions diffs - 0.15.0 → 0.16.0 - Mend

scraper_utils 0.15.0 → 0.16.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/CHANGELOG.md +3 -0
data/docs/example_parallel_scraper.rb +16 -17
data/docs/example_scraper.rb +10 -13
data/exe/validate_scraper_data +13 -8
data/lib/scraper_utils/data_quality_monitor.rb +9 -4
data/lib/scraper_utils/db_utils.rb +6 -2
data/lib/scraper_utils/debug_utils.rb +1 -2
data/lib/scraper_utils/host_throttler.rb +9 -13
data/lib/scraper_utils/log_utils.rb +18 -14
data/lib/scraper_utils/maths_utils.rb +2 -1
data/lib/scraper_utils/mechanize_utils/agent_config.rb +18 -20
data/lib/scraper_utils/misc_utils.rb +17 -1
data/lib/scraper_utils/pa_validation.rb +10 -8
data/lib/scraper_utils/spec_support.rb +106 -73
data/lib/scraper_utils/version.rb +1 -1
data/scraper_utils.gemspec +3 -3
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
-  data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
+  metadata.gz: 9a1001f794ef04c587bb726157c66fc637fbb8525bac1c5be93a138e7f0a8266
+  data.tar.gz: f92023b5362c6b64ae74d0bf43cf613b02849687a46ec7fbb6b51c4b7ad397dc
 SHA512:
-  metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
-  data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33
+  metadata.gz: 88e952e952d59011018ca4721bde72d49c913beccccf098d62bb4d1313d0ca3bf94678ff27db5ba4cef3a674fefbebd067a5008e5f36a2029f2a9c8ac1689b15
+  data.tar.gz: 35601498d9d110d5d365aa7c1fddcfa74a86fde4b93537b44f8e00bb84f664ba455c642256c0032e221b484d986ea39b2d3ab743c94102b10c7bed1c397139d5

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,8 @@
 # Changelog
+## 0.16.0 - 2026-04-08
+* Use defaults from AgentConfig for `throttle_block`, and allow defaults to be overriden
 ## 0.15.0 - 2026-03-05
 * Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests

data/docs/example_parallel_scraper.rb CHANGED Viewed

@@ -27,21 +27,20 @@ class Scraper
     begin
       ScraperUtils::DataQualityMonitor.start_authority(authority_label)
       YourScraper.scrape(authority_label) do |record|
-        begin
-          record["authority_label"] = authority_label.to_s
-          ScraperUtils::DbUtils.save_record(record)
-        rescue ScraperUtils::UnprocessableRecord => e
-          # Log bad record but continue processing unless too many have occurred
-          ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-          unprocessable_record_details << [e, record]
-        end
+        record["authority_label"] = authority_label.to_s
+        ScraperUtils::DbUtils.save_record(record)
+      rescue ScraperUtils::UnprocessableRecord => e
+        # Log bad record but continue processing unless too many have occurred
+        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+        unprocessable_record_details << [e, record]
       end
     rescue StandardError => e
       warn "#{authority_label}: ERROR: #{e}"
       warn e.backtrace
       fatal_exception = e
     end
-    [authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details, fatal_exception]
+    [authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details,
+     fatal_exception]
   end
   # Process authorities in parallel
@@ -54,7 +53,7 @@ class Scraper
       scrape_authority(authority_label, attempt)
     end.each do |authority_label, saves, unprocessable, fatal_exception|
       # Runs in main process
-      status = fatal_exception ? 'FAILED' : 'OK'
+      status = fatal_exception ? "FAILED" : "OK"
       puts "Saving results of #{authority_label}: #{saves.size} records, #{unprocessable.size} unprocessable #{status}"
       saves.each do |record|
@@ -65,11 +64,11 @@ class Scraper
         exceptions[authority_label] = e
       end
-      if fatal_exception
-        puts "  Warning: #{authority_label} failed with: #{fatal_exception.message}"
-        puts "  Saved #{saves.size} records before failure"
-        exceptions[authority_label] = fatal_exception
-      end
+      next unless fatal_exception
+      puts "  Warning: #{authority_label} failed with: #{fatal_exception.message}"
+      puts "  Saved #{saves.size} records before failure"
+      exceptions[authority_label] = fatal_exception
     end
     exceptions
@@ -96,7 +95,7 @@ class Scraper
     unless exceptions.empty?
       puts "\n***************************************************"
       puts "Now retrying authorities which earlier had failures"
-      puts exceptions.keys.join(", ").to_s
+      puts exceptions.keys.join(", ")
       puts "***************************************************"
       start_time = Time.now
@@ -118,7 +117,7 @@ end
 if __FILE__ == $PROGRAM_NAME
   ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
-  process_count = (ENV['MORPH_PROCESSES'] || Etc.nprocessors * 2).to_i
+  process_count = (ENV["MORPH_PROCESSES"] || (Etc.nprocessors * 2)).to_i
   Scraper.run(Scraper.selected_authorities, process_count: process_count)
 end

data/docs/example_scraper.rb CHANGED Viewed

@@ -22,13 +22,11 @@ class Scraper
       # REPLACE section with:
       ScraperUtils::DataQualityMonitor.start_authority(authority_label)
       YourScraper.scrape(authority_label) do |record|
-        begin
-          record["authority_label"] = authority_label.to_s
-          ScraperUtils::DbUtils.save_record(record)
-        rescue ScraperUtils::UnprocessableRecord => e
-          ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-          exceptions[authority_label] = e
-        end
+        record["authority_label"] = authority_label.to_s
+        ScraperUtils::DbUtils.save_record(record)
+      rescue ScraperUtils::UnprocessableRecord => e
+        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+        exceptions[authority_label] = e
       end
       # END OF REPLACE
     rescue StandardError => e
@@ -61,7 +59,7 @@ class Scraper
       puts "Now retrying authorities which earlier had failures"
       puts exceptions.keys.join(", ")
       puts "***************************************************"
-      ENV['DEBUG'] ||= '1'
+      ENV["DEBUG"] ||= "1"
       start_time = Time.now
       exceptions = scrape(exceptions.keys, 2)
@@ -85,12 +83,11 @@ if __FILE__ == $PROGRAM_NAME
   # some: url-for-issue Summary Reason
   # councils: url-for-issue Summary Reason
-  if ENV['MORPH_EXPECT_BAD'].nil?
-    default_expect_bad = {
-    }
-    puts 'Default EXPECT_BAD:', default_expect_bad.to_yaml if default_expect_bad.any?
+  if ENV["MORPH_EXPECT_BAD"].nil?
+    default_expect_bad = {}
+    puts "Default EXPECT_BAD:", default_expect_bad.to_yaml if default_expect_bad.any?
-    ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(',')
+    ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(",")
   end
   # If the sites have many unusable records - raise defaults
   # ENV['MORPH_UNPROCESSABLE_BASE'] ||= "10"

data/exe/validate_scraper_data CHANGED Viewed

@@ -26,7 +26,7 @@ if File.exist?(config_file)
     config = YAML.safe_load(File.read(config_file), symbolize_names: true)
     options.merge!(config) if config
     puts "Loaded config from #{config_file}"
-  rescue => e
+  rescue StandardError => e
     puts "Warning: Could not load #{config_file}: #{e.message}"
   end
 end
@@ -38,19 +38,23 @@ OptionParser.new do |opts|
     options[:database] = db
   end
-  opts.on("-g", "--geocodable-percentage N", Integer, "Min percentage of geocodable addresses (default: 50)") do |n|
+  opts.on("-g", "--geocodable-percentage N", Integer,
+          "Min percentage of geocodable addresses (default: 50)") do |n|
     options[:geocodable_percentage] = n
   end
-  opts.on("-r", "--description-percentage N", Integer, "Min percentage of reasonable descriptions (default: 50)") do |n|
+  opts.on("-r", "--description-percentage N", Integer,
+          "Min percentage of reasonable descriptions (default: 50)") do |n|
     options[:description_percentage] = n
   end
-  opts.on("-u", "--info-url-percentage N", Integer, "Min percentage for info URL validation (default: 75)") do |n|
+  opts.on("-u", "--info-url-percentage N", Integer,
+          "Min percentage for info URL validation (default: 75)") do |n|
     options[:info_url_percentage] = n
   end
-  opts.on("-v", "--variation N", Integer, "Variation tolerance for all validations (default: 3)") do |n|
+  opts.on("-v", "--variation N", Integer,
+          "Variation tolerance for all validations (default: 3)") do |n|
     options[:geocodable_variation] = n
     options[:description_variation] = n
     options[:info_url_variation] = n
@@ -60,11 +64,13 @@ OptionParser.new do |opts|
     options[:bot_check_expected] = true
   end
-  opts.on("-i", "--global-info-url URL", "Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
+  opts.on("-i", "--global-info-url URL",
+          "Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
     options[:global_info_url] = url
   end
-  opts.on("-c", "--config FILE", "Load config from YAML file (default: .scraper_validation.yml)") do |file|
+  opts.on("-c", "--config FILE",
+          "Load config from YAML file (default: .scraper_validation.yml)") do |file|
     config_file = file
   end
@@ -142,7 +148,6 @@ begin
   puts
   puts "✅ All validations passed!"
 rescue RuntimeError => e
   puts
   puts "❌ Validation failed: #{e.message}"

data/lib/scraper_utils/data_quality_monitor.rb CHANGED Viewed

@@ -30,8 +30,13 @@ module ScraperUtils
     # Initial base of 5.01 (override using MORPH_UNPROCESSABLE_BASE)
     # Initial percentage of 10% (override using MORPH_UNPROCESSABLE_PERCENTAGE)
     def self.threshold(authority_label)
-      ENV.fetch('MORPH_UNPROCESSABLE_BASE', 5.01).to_f +
-        (@stats[authority_label][:saved].to_i * ENV.fetch('MORPH_UNPROCESSABLE_PERCENTAGE', 10.0).to_f / 100.0) if @stats&.fetch(authority_label, nil)
+      if @stats&.fetch(
+        authority_label, nil
+      )
+        ENV.fetch("MORPH_UNPROCESSABLE_BASE", 5.01).to_f +
+          (@stats[authority_label][:saved].to_i * ENV.fetch("MORPH_UNPROCESSABLE_PERCENTAGE",
+                                                            10.0).to_f / 100.0)
+      end
     end
     # Logs an unprocessable record and raises an exception if error threshold is exceeded
@@ -44,7 +49,7 @@ module ScraperUtils
     def self.log_unprocessable_record(exception, record)
       authority_label = extract_authority(record)
       @stats[authority_label][:unprocessed] += 1
-      details = if record&.key?('council_reference') && record&.key?('address')
+      details = if record&.key?("council_reference") && record&.key?("address")
                   "#{record['council_reference']} - #{record['address']}"
                 else
                   record.inspect
@@ -64,7 +69,7 @@ module ScraperUtils
     def self.log_saved_record(record)
       authority_label = extract_authority(record)
       @stats[authority_label][:saved] += 1
-      ScraperUtils::LogUtils.log "Saving record #{authority_label&.empty? ? '' : "for #{authority_label}: "}#{record['council_reference']} - #{record['address']}"
+      ScraperUtils::LogUtils.log "Saving record #{"for #{authority_label}: " unless authority_label&.empty?}#{record['council_reference']} - #{record['address']}"
     end
   end
 end

data/lib/scraper_utils/db_utils.rb CHANGED Viewed

@@ -63,12 +63,16 @@ module ScraperUtils
       LogUtils.log "Deleting #{deleted_count} applications scraped between #{oldest_date} and #{cutoff_date}"
       ScraperWiki.sqliteexecute("DELETE FROM data WHERE date_scraped < ?", [cutoff_date])
-      return unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
+      unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
+        return
+      end
       LogUtils.log "  Running VACUUM to reclaim space..."
       ScraperWiki.sqliteexecute("VACUUM")
     rescue SqliteMagic::NoSuchTable => e
-      ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records" if ScraperUtils::DebugUtils.trace?
+      if ScraperUtils::DebugUtils.trace?
+        ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records"
+      end
     end
   end
 end

data/lib/scraper_utils/debug_utils.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module ScraperUtils
     # Checks DEBUG and MORPH_DEBUG env variables
     # @return [Integer] Debug level
     def self.debug_level
-      debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
+      debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, "0"))
       debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
     end
@@ -48,7 +48,6 @@ module ScraperUtils
       debug?(TRACE_LEVEL)
     end
     # Logs details of an HTTP request when debug mode is enabled
     #
     # @param http_method [String] HTTP http_method (GET, POST, etc.)

data/lib/scraper_utils/host_throttler.rb CHANGED Viewed

@@ -21,7 +21,7 @@ module ScraperUtils
       @crawl_delay = crawl_delay.to_f
       # Clamp between 10 (delay 9x response) and 100 (no extra delay)
       @max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
-      @next_request_at = {}   # hostname => Time
+      @next_request_at = {} # hostname => Time
       @request_started_at = {} # hostname => Time
     end
@@ -52,23 +52,19 @@ module ScraperUtils
       response_time = Time.now - started
       delay = @crawl_delay
-      if @max_load
-        delay += (100.0 - @max_load) * response_time / @max_load
-      end
+      delay += (100.0 - @max_load) * response_time / @max_load if @max_load
-      if overloaded
-        delay = delay + response_time * 2 + 5.0
-      end
+      delay = delay + (response_time * 2) + 5.0 if overloaded
       delay = delay.round(3).clamp(0.0, MAX_DELAY)
       @next_request_at[hostname] = Time.now + delay
-      if DebugUtils.basic?
-        msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
-        msg += " OVERLOADED" if overloaded
-        msg += ", Will delay #{delay}s before next request"
-        LogUtils.log(msg)
-      end
+      return unless DebugUtils.basic?
+      msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
+      msg += " OVERLOADED" if overloaded
+      msg += ", Will delay #{delay}s before next request"
+      LogUtils.log(msg)
     end
     # Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.

data/lib/scraper_utils/log_utils.rb CHANGED Viewed

@@ -14,7 +14,7 @@ module ScraperUtils
     # @param message [String] the message to log
     # @return [void]
     def self.log(message, authority = nil)
-      authority ||= ENV['AUTHORITY']
+      authority ||= ENV.fetch("AUTHORITY", nil)
       $stderr.flush
       if authority
         puts "[#{authority}] #{message}"
@@ -85,7 +85,7 @@ module ScraperUtils
         failed
       )
-      DbUtils::cleanup_old_records
+      DbUtils.cleanup_old_records
     end
     # Extracts the first relevant line from backtrace that's from our project
@@ -104,15 +104,15 @@ module ScraperUtils
       format = options[:format] || false
       # Normalize the root directory path with a trailing slash
-      pwd = File.join(pwd, '')
+      pwd = File.join(pwd, "")
       backtrace.each do |line|
-        next if line.include?('/gems/') ||
-                line.include?('/vendor/') ||
-                line.include?('/ruby/')
+        next if line.include?("/gems/") ||
+                line.include?("/vendor/") ||
+                line.include?("/ruby/")
         if line.start_with?(pwd)
-          relative_path = line.sub(pwd, '')
+          relative_path = line.sub(pwd, "")
           return format ? " [#{relative_path}]" : relative_path
         end
       end
@@ -138,7 +138,7 @@ module ScraperUtils
       puts "\nScraping Summary:"
       summary_format = "%-20s %6s %6s %s"
-      puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
+      puts format(summary_format, "Authority", "OK", "Bad", "Exception")
       puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)
       authorities.each do |authority|
@@ -149,7 +149,8 @@ module ScraperUtils
         expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
         exception_msg = if exceptions[authority]
-                          location = self.project_backtrace_line(exceptions[authority].backtrace, format: true)
+                          location = project_backtrace_line(exceptions[authority].backtrace,
+                                                            format: true)
                           "#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
                         else
                           "-"
@@ -174,12 +175,12 @@ module ScraperUtils
       # Check for authorities with unexpected errors
       unexpected_errors = authorities
-                            .select { |authority| exceptions[authority] }
-                            .reject { |authority| expect_bad.include?(authority) }
+                          .select { |authority| exceptions[authority] }
+                          .reject { |authority| expect_bad.include?(authority) }
       if unexpected_errors.any?
         errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-          "(Add to MORPH_EXPECT_BAD?)"
+                  "(Add to MORPH_EXPECT_BAD?)"
         unexpected_errors.each do |authority|
           error = exceptions[authority]
           errors << "  #{authority}: #{error.class} - #{error}"
@@ -228,7 +229,8 @@ module ScraperUtils
     # Moved to DbUtils
     # :nocov:
     def self.cleanup_old_records(force: false)
-      warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.", category: :deprecated
+      warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.",
+           category: :deprecated
       ScraperUtils::DbUtils.cleanup_old_records(force: force)
     end
     # :nocov:
@@ -239,7 +241,9 @@ module ScraperUtils
       lines = []
       error.backtrace.each do |line|
-        lines << line if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
+        if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
+          lines << line
+        end
         break if lines.length >= 6
       end

data/lib/scraper_utils/maths_utils.rb CHANGED Viewed

@@ -11,7 +11,8 @@ module ScraperUtils
     def self.fibonacci_series(max)
       result = []
       # Start with the basic Fibonacci sequence
-      last_fib, this_fib = 1, 0
+      last_fib = 1
+      this_fib = 0
       while this_fib <= max
         result << this_fib
         yield this_fib if block_given?

data/lib/scraper_utils/mechanize_utils/agent_config.rb CHANGED Viewed

@@ -61,12 +61,12 @@ module ScraperUtils
         # Reset all configuration options to their default values
         # @return [void]
         def reset_defaults!
-          @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
-          @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
-          @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
-          @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
-          @default_crawl_delay = ENV.fetch('MORPH_CLIENT_CRAWL_DELAY', DEFAULT_CRAWL_DELAY)
-          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD)
+          @default_timeout = ENV.fetch("MORPH_CLIENT_TIMEOUT", DEFAULT_TIMEOUT).to_i # 60
+          @default_disable_ssl_certificate_check = !ENV.fetch("MORPH_DISABLE_SSL_CHECK", nil).to_s.empty? # false
+          @default_australian_proxy = !ENV.fetch("MORPH_USE_PROXY", nil).to_s.empty? # false
+          @default_user_agent = ENV.fetch("MORPH_USER_AGENT", nil) # Uses Mechanize user agent
+          @default_crawl_delay = ENV.fetch("MORPH_CLIENT_CRAWL_DELAY", DEFAULT_CRAWL_DELAY)
+          @default_max_load = ENV.fetch("MORPH_MAX_LOAD", DEFAULT_MAX_LOAD)
         end
       end
@@ -113,10 +113,10 @@ module ScraperUtils
         @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
         if @australian_proxy
           uri = begin
-                  URI.parse(ScraperUtils.australian_proxy.to_s)
-                rescue URI::InvalidURIError => e
-                  raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
-                end
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
+          end
           unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
             raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
           end
@@ -177,13 +177,13 @@ module ScraperUtils
       end
       def pre_connect_hook(_agent, request)
-        hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
+        hostname = (request.respond_to?(:[]) && request["Host"]) || "unknown"
         @throttler.before_request(hostname)
-        if DebugUtils.verbose?
-          ScraperUtils::LogUtils.log(
-            "Pre Connect request: #{request.inspect}"
-          )
-        end
+        return unless DebugUtils.verbose?
+        ScraperUtils::LogUtils.log(
+          "Pre Connect request: #{request.inspect}"
+        )
       end
       def post_connect_hook(_agent, uri, response, _body)
@@ -191,7 +191,7 @@ module ScraperUtils
         status = response.respond_to?(:code) ? response.code.to_i : nil
         overloaded = [429, 500, 503].include?(status)
-        hostname = uri.host || 'unknown'
+        hostname = uri.host || "unknown"
         @throttler.after_request(hostname, overloaded: overloaded)
         if DebugUtils.basic?
@@ -205,9 +205,7 @@ module ScraperUtils
       def error_hook(_agent, error)
         # Best-effort: record the error against whatever host we can find
         # Mechanize errors often carry the URI in the message; fall back to 'unknown'
-        hostname = if error.respond_to?(:uri)
-                     error.uri.host
-                   end || 'unknown'
+        hostname = (error.uri.host if error.respond_to?(:uri)) || "unknown"
         @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
       end

data/lib/scraper_utils/misc_utils.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative "host_throttler"
+require_relative "mechanize_utils/agent_config"
 module ScraperUtils
   # Misc Standalone Utilities
@@ -8,6 +9,14 @@ module ScraperUtils
     THROTTLE_HOSTNAME = "block"
     class << self
+      attr_accessor :default_crawl_delay, :default_max_load
+      def reset_defaults!
+        @default_crawl_delay = MechanizeUtils::AgentConfig.default_crawl_delay
+        @default_max_load = MechanizeUtils::AgentConfig.default_max_load
+        reset_throttler!
+      end
       # Throttle block to be nice to servers we are scraping.
       # Time spent inside the block (parsing, saving) counts toward the delay.
       def throttle_block
@@ -34,8 +43,15 @@ module ScraperUtils
       private
       def throttler
-        @throttler ||= HostThrottler.new
+        @throttler ||= HostThrottler.new(
+          crawl_delay: default_crawl_delay,
+          max_load: default_max_load
+        )
       end
     end
+    # Initialise defaults after AgentConfig is loaded
+    require_relative "mechanize_utils/agent_config"
+    reset_defaults!
   end
 end

data/lib/scraper_utils/pa_validation.rb CHANGED Viewed

@@ -31,8 +31,6 @@ module ScraperUtils
       errors.empty? ? nil : errors
     end
-    private
     def self.validate_presence(record, errors)
       REQUIRED_FIELDS.each do |field|
         errors << "#{field} can't be blank" if record[field].to_s.strip.empty?
@@ -47,10 +45,10 @@ module ScraperUtils
       begin
         uri = URI.parse(url)
         unless uri.is_a?(URI::HTTP) && uri.host.to_s != ""
-          errors << "info_url must be a valid http\/https URL with host"
+          errors << "info_url must be a valid http/https URL with host"
         end
       rescue URI::InvalidURIError
-        errors << "info_url must be a valid http\/https URL"
+        errors << "info_url must be a valid http/https URL"
       end
     end
@@ -58,18 +56,22 @@ module ScraperUtils
       today = Date.today
       date_scraped = parse_date(record["date_scraped"])
-      errors << "Invalid date format for date_scraped: #{record["date_scraped"].inspect} is not a valid ISO 8601 date" if record["date_scraped"] && date_scraped.nil?
+      if record["date_scraped"] && date_scraped.nil?
+        errors << "Invalid date format for date_scraped: #{record['date_scraped'].inspect} is not a valid ISO 8601 date"
+      end
       date_received = parse_date(record["date_received"])
       if record["date_received"] && date_received.nil?
-        errors << "Invalid date format for date_received: #{record["date_received"].inspect} is not a valid ISO 8601 date"
+        errors << "Invalid date format for date_received: #{record['date_received'].inspect} is not a valid ISO 8601 date"
       elsif date_received && date_received.to_date > today
-        errors << "Invalid date for date_received: #{record["date_received"].inspect} is in the future"
+        errors << "Invalid date for date_received: #{record['date_received'].inspect} is in the future"
       end
       %w[on_notice_from on_notice_to].each do |field|
         val = parse_date(record[field])
-        errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date" if record[field] && val.nil?
+        if record[field] && val.nil?
+          errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date"
+        end
       end
     end

data/lib/scraper_utils/spec_support.rb CHANGED Viewed

@@ -47,41 +47,43 @@ module ScraperUtils
     PLANNING_KEYWORDS = [
       # Building types
-      'dwelling', 'house', 'unit', 'building', 'structure', 'facility',
+      "dwelling", "house", "unit", "building", "structure", "facility",
       # Modifications
-      'addition', 'extension', 'renovation', 'alteration', 'modification',
-      'replacement', 'upgrade', 'improvement',
+      "addition", "extension", "renovation", "alteration", "modification",
+      "replacement", "upgrade", "improvement",
       # Specific structures
-      'carport', 'garage', 'shed', 'pool', 'deck', 'patio', 'pergola',
-      'verandah', 'balcony', 'fence', 'wall', 'driveway',
+      "carport", "garage", "shed", "pool", "deck", "patio", "pergola",
+      "verandah", "balcony", "fence", "wall", "driveway",
       # Development types
-      'subdivision', 'demolition', 'construction', 'development',
+      "subdivision", "demolition", "construction", "development",
       # Services/utilities
-      'signage', 'telecommunications', 'stormwater', 'water', 'sewer',
+      "signage", "telecommunications", "stormwater", "water", "sewer",
       # Approvals/certificates
-      'certificate', 'approval', 'consent', 'permit'
+      "certificate", "approval", "consent", "permit"
     ].freeze
     def self.fetch_url_head(url)
       agent = Mechanize.new
-      # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
+      # FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
       agent.head(url)
     end
     def self.fetch_url_with_redirects(url)
       agent = Mechanize.new
-      # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
+      # FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
       agent.get(url)
     end
-    def self.authority_label(results, prefix: '', suffix: '')
+    def self.authority_label(results, prefix: "", suffix: "")
       return nil if results.nil?
-      authority_labels = results.map { |record| record['authority_label'] }.compact.uniq
+      authority_labels = results.map { |record| record["authority_label"] }.compact.uniq
       return nil if authority_labels.empty?
-      raise "Expected one authority_label, not #{authority_labels.inspect}" if authority_labels.size > 1
+      if authority_labels.size > 1
+        raise "Expected one authority_label, not #{authority_labels.inspect}"
+      end
       "#{prefix}#{authority_labels.first}#{suffix}"
     end
@@ -95,7 +97,8 @@ module ScraperUtils
       duplicates = groups.select { |_k, g| g.size > 1 }
       return if duplicates.empty?
-      raise UnprocessableSite, "Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
+      raise UnprocessableSite,
+            "Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
     end
     # Validates enough addresses are geocodable
@@ -105,28 +108,32 @@ module ScraperUtils
     # @param ignore_case [Boolean] Ignores case which relaxes suburb check
     # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
     # @raise RuntimeError if insufficient addresses are geocodable
-    def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
+    def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
+                                                ignore_case: false, known_suburbs: [])
       return nil if results.empty?
       geocodable = results
-                     .map { |record| record["address"] }
-                     .uniq
-                     .count do |text|
-                       ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
-                        if !ok && DebugUtils.verbose?
-                          ScraperUtils::LogUtils.log(
-                            "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
-                          )
-                        end
-                       ok
-                       end
+                   .map { |record| record["address"] }
+                   .uniq
+                   .count do |text|
+        ok = ScraperUtils::SpecSupport.geocodable? text,
+                                                   known_suburbs: known_suburbs, ignore_case: ignore_case
+        if !ok && DebugUtils.verbose?
+          ScraperUtils::LogUtils.log(
+            "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
+          )
+        end
+        ok
+      end
       puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
-             "(#{(100.0 * geocodable / results.count).round(1)}%)"
-      expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
+           "(#{(100.0 * geocodable / results.count).round(1)}%)"
+      expected = [(((percentage.to_f / 100.0) * results.count) - variation), 1].max
       unless geocodable >= expected
-        raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
+        raise UnprocessableSite,
+              "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
       end
       geocodable
     end
@@ -138,10 +145,13 @@ module ScraperUtils
     # @return [Boolean] True if the address appears to be geocodable.
     def self.geocodable?(address, ignore_case: false, known_suburbs: [])
       return false if address.nil? || address.empty?
       check_address = ignore_case ? address.upcase : address
       # Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
-      has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
+      has_state = AUSTRALIAN_STATES.any? do |state|
+        check_address.end_with?(" #{state}") || check_address.include?(" #{state} ")
+      end
       has_postcode = address.match?(AUSTRALIAN_POSTCODES)
       # Using the pre-compiled patterns
@@ -154,9 +164,13 @@ module ScraperUtils
       if ENV["DEBUG"]
         missing = []
         missing << "street type" unless has_street_type
-        missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
+        unless has_postcode || has_uppercase_suburb || has_known_suburb
+          missing << "postcode/Uppercase suburb/Known suburb"
+        end
         missing << "state" unless has_state
-        puts "  address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
+        if missing.any?
+          puts "  address: #{address} is not geocodable, missing #{missing.join(', ')}"
+        end
       end
       has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
@@ -183,17 +197,21 @@ module ScraperUtils
       return nil if results.empty?
       descriptions = results
-                       .map { |record| record["description"] }
-                       .uniq
-                       .count do |text|
+                     .map { |record| record["description"] }
+                     .uniq
+                     .count do |text|
         selected = ScraperUtils::SpecSupport.reasonable_description? text
         puts "  description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
         selected
       end
       puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
-             "(#{(100.0 * descriptions / results.count).round(1)}%)"
-      expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
-      raise UnprocessableSite, "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
+           "(#{(100.0 * descriptions / results.count).round(1)}%)"
+      expected = [((percentage.to_f / 100.0) * results.count) - variation, 1].max
+      unless descriptions >= expected
+        raise UnprocessableSite,
+              "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}"
+      end
       descriptions
     end
@@ -216,7 +234,8 @@ module ScraperUtils
     # @param bot_check_expected [Boolean] Whether bot protection is acceptable
     # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
     # @raise RuntimeError if records don't use the expected URL or it doesn't return 200
-    def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false, &block)
+    def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
+                                               &block)
       info_urls = results.map { |record| record["info_url"] }.uniq
       unless info_urls.size == 1
@@ -262,7 +281,8 @@ module ScraperUtils
     # @param bot_check_expected [Boolean] Whether bot protection is acceptable
     # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
     # @raise RuntimeError if insufficient detail checks pass
-    def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false, &block)
+    def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
+                                                       bot_check_expected: false, &block)
       if defined?(VCR)
         VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
           check_info_url_details(results, percentage, variation, bot_check_expected, &block)
@@ -284,15 +304,15 @@ module ScraperUtils
       # Check for common bot protection indicators
       bot_indicators = [
-        'recaptcha',
-        'cloudflare',
-        'are you human',
-        'bot detection',
-        'security check',
-        'verify you are human',
-        'access denied',
-        'blocked',
-        'captcha'
+        "recaptcha",
+        "cloudflare",
+        "are you human",
+        "bot detection",
+        "security check",
+        "verify you are human",
+        "access denied",
+        "blocked",
+        "captcha"
       ]
       bot_indicators.any? { |indicator| body_lower.include?(indicator) }
@@ -308,10 +328,10 @@ module ScraperUtils
         return
       end
-      raise "Expected 200 response from the one expected info_url, got #{page.code}" unless page.code == "200"
-    end
+      return if page.code == "200"
-    private
+      raise "Expected 200 response from the one expected info_url, got #{page.code}"
+    end
     def self.check_info_url_is_present(results, percentage, variation, &block)
       count = 0
@@ -337,17 +357,21 @@ module ScraperUtils
         count += 1
         if status.between?(200, 299)
-          puts "  OK: #{status}" if ENV['DEBUG']
+          puts "  OK: #{status}" if ENV["DEBUG"]
         else
           failed += 1
           puts "  Failed: #{status}"
-          min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
+          min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
           passed = count - failed
-          raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
+          if passed < min_required
+            raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
+          end
         end
       end
-      puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
+      return unless count > 0
+      puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!"
     end
     def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
@@ -367,7 +391,10 @@ module ScraperUtils
           next
         end
-        raise UnprocessableRecord, "Expected 200 response, got #{page.code}" unless page.code == "200"
+        unless page.code == "200"
+          raise UnprocessableRecord,
+                "Expected 200 response, got #{page.code}"
+        end
         page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
@@ -375,34 +402,40 @@ module ScraperUtils
           count += 1
           expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
           expected2 = case attribute
-                      when 'council_reference'
-                        expected.sub(/\ADA\s*-\s*/, '')
-                      when 'address'
-                        expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, '') # Handle Lismore post-code/state swap
+                      when "council_reference"
+                        expected.sub(/\ADA\s*-\s*/, "")
+                      when "address"
+                        expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, "") # Handle Lismore post-code/state swap
                       else
                         expected
                       end
           expected3 = case attribute
-                      when 'address'
-                        expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, '')
+                      when "address"
+                        expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, "")
                       else
                         expected
-                      end.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-')
-          next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-').include?(expected3)
+                      end.gsub(/\s*,\s*/, " ").gsub(/\s*-\s*/, "-")
+          next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, " ").gsub(
+            /\s*-\s*/, "-"
+          ).include?(expected3)
           failed += 1
-          desc2 = expected2 == expected ? '' : " or #{expected2.inspect}"
-          desc3 = expected3 == expected ? '' : " or #{expected3.inspect}"
+          desc2 = expected2 == expected ? "" : " or #{expected2.inspect}"
+          desc3 = expected3 == expected ? "" : " or #{expected3.inspect}"
           puts "  Missing: #{expected.inspect}#{desc2}#{desc3}"
-          puts "    IN: #{page_body}" if ENV['DEBUG']
+          puts "    IN: #{page_body}" if ENV["DEBUG"]
-          min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
+          min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
           passed = count - failed
-          raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
+          if passed < min_required
+            raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
+          end
         end
       end
-      puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
+      return unless count > 0
+      puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!"
     end
   end
 end

data/lib/scraper_utils/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ScraperUtils
-  VERSION = "0.15.0"
+  VERSION = "0.16.0"
 end

data/scraper_utils.gemspec CHANGED Viewed

@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
   spec.summary = "planningalerts scraper utilities"
   spec.description = "Utilities to help make planningalerts scrapers, " \
-    "especially multi authority scrapers, easier to develop, run and debug."
+                     "especially multi authority scrapers, easier to develop, run and debug."
   spec.homepage = "https://github.com/ianheggie-oaf/#{spec.name}"
   spec.license = "MIT"
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
     spec.metadata["homepage_uri"] = spec.homepage
     spec.metadata["source_code_uri"] = spec.homepage
     spec.metadata["documentation_uri"] = "https://rubydoc.info/gems/#{spec.name}/#{ScraperUtils::VERSION}"
-    spec.metadata["changelog_uri"] = "#{spec.metadata["source_code_uri"]}/blob/main/CHANGELOG.md"
+    spec.metadata["changelog_uri"] = "#{spec.metadata['source_code_uri']}/blob/main/CHANGELOG.md"
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
-            "public gem pushes."
+          "public gem pushes."
   end
   # Specify which files should be added to the gem when it is released.

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.15.0
+  version: 0.16.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-03-04 00:00:00.000000000 Z
+date: 2026-04-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -114,7 +114,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.16.0
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message: