scraper_utils 0.13.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3bce8cc5a624f9904ebf8bb35ccb5c5c6c831e28ed56f88d3baf3b8d19fbbd13
4
- data.tar.gz: 0a481566e846a4274796b0542fb64a805f486065ed08045724cea7bc3d46710d
3
+ metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
4
+ data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
5
5
  SHA512:
6
- metadata.gz: 231c167ffe232daacbc862b8c3dd2c0c71be6b8fc2ff061f4f36d88f2e2185a454eb0aa79653c7a99a2ed65c9857d961059456f8403af8c1ed39623cc8e2db6a
7
- data.tar.gz: f287f85cdd4cc11cf17c3e5d34d5493e2809f255f3a3544bc881e756f3379c897dd70dbba5ebf16b30837bb8612f42f704872e06c6bec1cad87845606fce6231
6
+ metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
7
+ data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33
data/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.15.0 - 2026-03-05
4
+
5
+ * Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
6
+ * Fix pre_connect_hook hostname extraction to use `request['Host']` header
7
+
8
+ ## 0.14.1 - 2026-03-04
9
+
10
+ * Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
11
+ `ScraperUtils::SpecSupport.geocodable?` to validate addresses that don't have postcodes nor capitalised suburb names
12
+ * Can pass `ignore_case: true` to relax the requirement for either postcode or uppercase suburb when you don't want to
13
+ pass known suburbs.
14
+ * Move Throttling to HostThrottler
15
+
3
16
  ## 0.13.1 - 2026-02-21
4
17
 
5
18
  * Added PaValidation that validates based
@@ -18,11 +18,5 @@ The throttle automatically:
18
18
  - Pauses before next request based on previous timing
19
19
  - Caps pause at 120s maximum
20
20
 
21
- Override the next pause duration manually if needed:
22
-
23
- ```ruby
24
- ScraperUtils::MiscUtils.pause_duration = 2.0
25
- ```
26
-
27
21
  **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
28
22
  each request is made and thus does not need to be wrapped with the helper.
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Tracks per-host next-allowed-request time so that time spent parsing
5
+ # and saving records counts toward the crawl delay rather than being
6
+ # added on top of it.
7
+ #
8
+ # Usage:
9
+ # throttler = HostThrottler.new(crawl_delay: 1.0, max_load: 50.0)
10
+ # throttler.before_request(hostname) # sleep until ready
11
+ # # ... make request ...
12
+ # throttler.after_request(hostname) # record timing, schedule next slot
13
+ # throttler.after_request(hostname, overloaded: true) # double delay + 5s
14
+ class HostThrottler
15
+ MAX_DELAY = 120.0
16
+
17
+ # @param crawl_delay [Float] minimum seconds between requests per host
18
+ # @param max_load [Float] target server load percentage (10..100);
19
+ # 50 means response_time == pause_time
20
+ def initialize(crawl_delay: 0.0, max_load: nil)
21
+ @crawl_delay = crawl_delay.to_f
22
+ # Clamp between 10 (delay 9x response) and 100 (no extra delay)
23
+ @max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
24
+ @next_request_at = {} # hostname => Time
25
+ @request_started_at = {} # hostname => Time
26
+ end
27
+
28
+ def will_pause_till(hostname)
29
+ @next_request_at[hostname]
30
+ end
31
+
32
+ # Sleep until this host's throttle window has elapsed.
33
+ # Records when the request actually started.
34
+ # @param hostname [String]
35
+ # @return [void]
36
+ def before_request(hostname)
37
+ target = @next_request_at[hostname]
38
+ if target
39
+ remaining = target - Time.now
40
+ sleep(remaining) if remaining > 0
41
+ end
42
+ @request_started_at[hostname] = Time.now
43
+ end
44
+
45
+ # Calculate and store the next allowed request time for this host.
46
+ # @param hostname [String]
47
+ # @param overloaded [Boolean] true when the server signalled overload
48
+ # (HTTP 429/500/503); doubles the normal delay and adds 5 seconds.
49
+ # @return [void]
50
+ def after_request(hostname, overloaded: false)
51
+ started = @request_started_at[hostname] || Time.now
52
+ response_time = Time.now - started
53
+
54
+ delay = @crawl_delay
55
+ if @max_load
56
+ delay += (100.0 - @max_load) * response_time / @max_load
57
+ end
58
+
59
+ if overloaded
60
+ delay = delay + response_time * 2 + 5.0
61
+ end
62
+
63
+ delay = delay.round(3).clamp(0.0, MAX_DELAY)
64
+ @next_request_at[hostname] = Time.now + delay
65
+
66
+ if DebugUtils.basic?
67
+ msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
68
+ msg += " OVERLOADED" if overloaded
69
+ msg += ", Will delay #{delay}s before next request"
70
+ LogUtils.log(msg)
71
+ end
72
+ end
73
+
74
+ # Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
75
+ # @param error [Exception]
76
+ # @return [Boolean]
77
+ def self.overload_error?(error)
78
+ code = if error.respond_to?(:response) && error.response.respond_to?(:code)
79
+ error.response.code.to_i # HTTParty style
80
+ elsif error.respond_to?(:response_code)
81
+ error.response_code.to_i # Mechanize style
82
+ end
83
+ [429, 500, 503].include?(code)
84
+ end
85
+ end
86
+ end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "mechanize"
4
4
  require "ipaddr"
5
+ require_relative "../host_throttler"
5
6
 
6
7
  module ScraperUtils
7
8
  module MechanizeUtils
@@ -76,8 +77,7 @@ module ScraperUtils
76
77
  attr_reader :user_agent
77
78
 
78
79
  # Give access for testing
79
-
80
- attr_reader :max_load, :crawl_delay
80
+ attr_reader :max_load, :crawl_delay, :throttler
81
81
 
82
82
  # Creates Mechanize agent configuration with sensible defaults overridable via configure
83
83
  # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -107,6 +107,7 @@ module ScraperUtils
107
107
  @crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
108
108
  # Clamp between 10 (delay 9 x response) and 100 (no delay)
109
109
  @max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
110
+ @throttler = HostThrottler.new(crawl_delay: @crawl_delay, max_load: @max_load)
110
111
 
111
112
  # Validate proxy URL format if proxy will be used
112
113
  @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
@@ -155,6 +156,7 @@ module ScraperUtils
155
156
 
156
157
  agent.pre_connect_hooks << method(:pre_connect_hook)
157
158
  agent.post_connect_hooks << method(:post_connect_hook)
159
+ agent.error_hooks << method(:error_hook) if agent.respond_to?(:error_hooks)
158
160
  end
159
161
 
160
162
  private
@@ -175,38 +177,40 @@ module ScraperUtils
175
177
  end
176
178
 
177
179
  def pre_connect_hook(_agent, request)
178
- @connection_started_at = Time.now
179
- return unless DebugUtils.verbose?
180
-
181
- ScraperUtils::LogUtils.log(
182
- "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
183
- )
180
+ hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
181
+ @throttler.before_request(hostname)
182
+ if DebugUtils.verbose?
183
+ ScraperUtils::LogUtils.log(
184
+ "Pre Connect request: #{request.inspect}"
185
+ )
186
+ end
184
187
  end
185
188
 
186
189
  def post_connect_hook(_agent, uri, response, _body)
187
190
  raise ArgumentError, "URI must be present in post-connect hook" unless uri
188
191
 
189
- response_time = Time.now - @connection_started_at
190
-
191
- response_delay = @crawl_delay || 0.0
192
- if @crawl_delay ||@max_load
193
- response_delay += response_time
194
- if @max_load && @max_load >= 1
195
- response_delay += (100.0 - @max_load) * response_time / @max_load
196
- end
197
- response_delay = response_delay.round(3)
198
- end
192
+ status = response.respond_to?(:code) ? response.code.to_i : nil
193
+ overloaded = [429, 500, 503].include?(status)
194
+ hostname = uri.host || 'unknown'
195
+ @throttler.after_request(hostname, overloaded: overloaded)
199
196
 
200
197
  if DebugUtils.basic?
201
198
  ScraperUtils::LogUtils.log(
202
- "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
203
- "after #{response_time} seconds#{response_delay > 0.0 ? ", pausing for #{response_delay} seconds" : ""}"
199
+ "Post Connect uri: #{uri.inspect}, response: #{response.inspect}"
204
200
  )
205
201
  end
206
- sleep(response_delay) if response_delay > 0.0
207
202
  response
208
203
  end
209
204
 
205
+ def error_hook(_agent, error)
206
+ # Best-effort: record the error against whatever host we can find
207
+ # Mechanize errors often carry the URI in the message; fall back to 'unknown'
208
+ hostname = if error.respond_to?(:uri)
209
+ error.uri.host
210
+ end || 'unknown'
211
+ @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
212
+ end
213
+
210
214
  def verify_proxy_works(agent)
211
215
  $stderr.flush
212
216
  $stdout.flush
@@ -1,23 +1,40 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "host_throttler"
4
+
3
5
  module ScraperUtils
4
6
  # Misc Standalone Utilities
5
7
  module MiscUtils
6
- MAX_PAUSE = 120.0
8
+ THROTTLE_HOSTNAME = "block"
7
9
 
8
10
  class << self
9
- attr_accessor :pause_duration
10
-
11
- # Throttle block to be nice to servers we are scraping
12
- def throttle_block(extra_delay: 0.5)
13
- if @pause_duration&.positive?
14
- puts "Pausing #{@pause_duration}s" if ScraperUtils::DebugUtils.trace?
15
- sleep(@pause_duration)
11
+ # Throttle block to be nice to servers we are scraping.
12
+ # Time spent inside the block (parsing, saving) counts toward the delay.
13
+ def throttle_block
14
+ throttler.before_request(THROTTLE_HOSTNAME)
15
+ begin
16
+ result = yield
17
+ throttler.after_request(THROTTLE_HOSTNAME)
18
+ result
19
+ rescue StandardError => e
20
+ throttler.after_request(THROTTLE_HOSTNAME, overloaded: HostThrottler.overload_error?(e))
21
+ raise
16
22
  end
17
- start_time = Time.now.to_f
18
- result = yield
19
- @pause_duration = (Time.now.to_f - start_time + extra_delay).round(3).clamp(0.0, MAX_PAUSE)
20
- result
23
+ end
24
+
25
+ # Reset the internal throttler (useful in tests)
26
+ def reset_throttler!
27
+ @throttler = nil
28
+ end
29
+
30
+ def will_pause_till
31
+ throttler.will_pause_till(THROTTLE_HOSTNAME)
32
+ end
33
+
34
+ private
35
+
36
+ def throttler
37
+ @throttler ||= HostThrottler.new
21
38
  end
22
39
  end
23
40
  end
@@ -62,6 +62,13 @@ module ScraperUtils
62
62
  'certificate', 'approval', 'consent', 'permit'
63
63
  ].freeze
64
64
 
65
+
66
+ def self.fetch_url_head(url)
67
+ agent = Mechanize.new
68
+ # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
69
+ agent.head(url)
70
+ end
71
+
65
72
  def self.fetch_url_with_redirects(url)
66
73
  agent = Mechanize.new
67
74
  # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
@@ -95,14 +102,25 @@ module ScraperUtils
95
102
  # @param results [Array<Hash>] The results from scraping an authority
96
103
  # @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
97
104
  # @param variation [Integer] The variation allowed in addition to percentage (default:3)
105
+ # @param ignore_case [Boolean] Ignores case which relaxes suburb check
106
+ # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
98
107
  # @raise RuntimeError if insufficient addresses are geocodable
99
- def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
108
+ def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
100
109
  return nil if results.empty?
101
110
 
102
111
  geocodable = results
103
112
  .map { |record| record["address"] }
104
113
  .uniq
105
- .count { |text| ScraperUtils::SpecSupport.geocodable? text }
114
+ .count do |text|
115
+ ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
116
+ if !ok && DebugUtils.verbose?
117
+ ScraperUtils::LogUtils.log(
118
+ "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
119
+ )
120
+ end
121
+
122
+ ok
123
+ end
106
124
  puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
107
125
  "(#{(100.0 * geocodable / results.count).round(1)}%)"
108
126
  expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
@@ -115,8 +133,10 @@ module ScraperUtils
115
133
  # Check if an address is likely to be geocodable by analyzing its format.
116
134
  # This is a bit stricter than needed - typically assert >= 75% match
117
135
  # @param address [String] The address to check
136
+ # @param ignore_case [Boolean] Ignores case which relaxes suburb check
137
+ # @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
118
138
  # @return [Boolean] True if the address appears to be geocodable.
119
- def self.geocodable?(address, ignore_case: false)
139
+ def self.geocodable?(address, ignore_case: false, known_suburbs: [])
120
140
  return false if address.nil? || address.empty?
121
141
  check_address = ignore_case ? address.upcase : address
122
142
 
@@ -129,16 +149,17 @@ module ScraperUtils
129
149
 
130
150
  uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
131
151
  has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
152
+ has_known_suburb = known_suburbs.any? { |suburb| address.include?(suburb) }
132
153
 
133
154
  if ENV["DEBUG"]
134
155
  missing = []
135
156
  missing << "street type" unless has_street_type
136
- missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
157
+ missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
137
158
  missing << "state" unless has_state
138
159
  puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
139
160
  end
140
161
 
141
- has_street_type && (has_postcode || has_uppercase_suburb) && has_state
162
+ has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
142
163
  end
143
164
 
144
165
  PLACEHOLDERS = [
@@ -218,6 +239,22 @@ module ScraperUtils
218
239
  end
219
240
  end
220
241
 
242
+ # Validates that info_urls are present (respond to a HEAD request with 200 to 299 status)
243
+ # @param results [Array<Hash>] The results from scraping an authority
244
+ # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
245
+ # @param variation [Integer] The variation allowed in addition to percentage (default:3)
246
+ # @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
247
+ # @raise RuntimeError if insufficient detail checks pass
248
+ def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
249
+ if defined?(VCR)
250
+ VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
251
+ check_info_url_is_present(results, percentage, variation, &block)
252
+ end
253
+ else
254
+ check_info_url_is_present(results, percentage, variation, &block)
255
+ end
256
+ end
257
+
221
258
  # Validates that info_urls have expected details (unique URLs with content validation)
222
259
  # @param results [Array<Hash>] The results from scraping an authority
223
260
  # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
@@ -276,6 +313,43 @@ module ScraperUtils
276
313
 
277
314
  private
278
315
 
316
+ def self.check_info_url_is_present(results, percentage, variation, &block)
317
+ count = 0
318
+ failed = 0
319
+ fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
320
+
321
+ fib_indices.each do |index|
322
+ record = results[index]
323
+ info_url = record["info_url"]
324
+ puts "Checking info_url[#{index}]: #{info_url} is present..."
325
+
326
+ begin
327
+ page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
328
+ status = page.code.to_i
329
+ rescue Mechanize::ResponseCodeError => e
330
+ status = e.response_code.to_i
331
+ end
332
+
333
+ if [403, 429].include?(status)
334
+ puts " Bot protection detected - skipping"
335
+ next
336
+ end
337
+
338
+ count += 1
339
+ if status.between?(200, 299)
340
+ puts " OK: #{status}" if ENV['DEBUG']
341
+ else
342
+ failed += 1
343
+ puts " Failed: #{status}"
344
+ min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
345
+ passed = count - failed
346
+ raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
347
+ end
348
+ end
349
+
350
+ puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
351
+ end
352
+
279
353
  def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
280
354
  count = 0
281
355
  failed = 0
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module ScraperUtils
4
- VERSION = "0.13.1"
4
+ VERSION = "0.15.0"
5
5
  end
data/lib/scraper_utils.rb CHANGED
@@ -5,12 +5,13 @@ require "scraper_utils/version"
5
5
  # Public Apis (responsible for requiring their own dependencies)
6
6
  require "scraper_utils/authority_utils"
7
7
  require "scraper_utils/data_quality_monitor"
8
- require "scraper_utils/pa_validation"
9
8
  require "scraper_utils/db_utils"
10
9
  require "scraper_utils/debug_utils"
10
+ require "scraper_utils/host_throttler"
11
11
  require "scraper_utils/log_utils"
12
12
  require "scraper_utils/maths_utils"
13
13
  require "scraper_utils/misc_utils"
14
+ require "scraper_utils/pa_validation"
14
15
  require "scraper_utils/spec_support"
15
16
 
16
17
  # Mechanize utilities
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper_utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.1
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ian Heggie
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-21 00:00:00.000000000 Z
11
+ date: 2026-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -97,6 +97,7 @@ files:
97
97
  - lib/scraper_utils/data_quality_monitor.rb
98
98
  - lib/scraper_utils/db_utils.rb
99
99
  - lib/scraper_utils/debug_utils.rb
100
+ - lib/scraper_utils/host_throttler.rb
100
101
  - lib/scraper_utils/log_utils.rb
101
102
  - lib/scraper_utils/maths_utils.rb
102
103
  - lib/scraper_utils/mechanize_utils.rb
@@ -113,7 +114,7 @@ metadata:
113
114
  allowed_push_host: https://rubygems.org
114
115
  homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
115
116
  source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
116
- documentation_uri: https://rubydoc.info/gems/scraper_utils/0.13.1
117
+ documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
117
118
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
118
119
  rubygems_mfa_required: 'true'
119
120
  post_install_message: