RubyGems - scraper_utils - Versions diffs - 0.14.1 → 0.15.0 - Mend

scraper_utils 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/docs/misc_utilities.md +0 -6
data/lib/scraper_utils/host_throttler.rb +4 -0
data/lib/scraper_utils/mechanize_utils/agent_config.rb +4 -5
data/lib/scraper_utils/misc_utils.rb +4 -0
data/lib/scraper_utils/spec_support.rb +70 -1
data/lib/scraper_utils/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 03b44a667992331d6e36bb6eca68afc286205846d7be06263694fed52b5e2d30
-  data.tar.gz: 9f0dd276223f1b22dd688453e1769199cbda34efa5141d58e546a8ddcb85c795
+  metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
+  data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
 SHA512:
-  metadata.gz: b42e0be0f9e42d9a83588cf7dcbb98ec079d01262340d2e6fef8ac7201c3d80faa645351631f60f767186721a58580f4f1e5e09c130a3a32aebb4f301dbfbdfc
-  data.tar.gz: e3cec3345d0af13026259600a54e417efd0c36394f1bc22ecac1a25573551a3a2e51482b060ad1b72ed7ba4850d55bf9f8032321d1b8c1ae6eab581244e92410
+  metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
+  data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,10 @@
 # Changelog
+## 0.15.0 - 2026-03-05
+* Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
+* Fix pre_connect_hook hostname extraction to use `request['Host']` header
 ## 0.14.1 - 2026-03-04
 * Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and

data/docs/misc_utilities.md CHANGED Viewed

@@ -18,11 +18,5 @@ The throttle automatically:
 - Pauses before next request based on previous timing
 - Caps pause at 120s maximum
-Override the next pause duration manually if needed:
-```ruby
-ScraperUtils::MiscUtils.pause_duration = 2.0
-```
 **Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
 each request is made and thus does not need to be wrapped with the helper.

data/lib/scraper_utils/host_throttler.rb CHANGED Viewed

@@ -25,6 +25,10 @@ module ScraperUtils
       @request_started_at = {} # hostname => Time
     end
+    def will_pause_till(hostname)
+      @next_request_at[hostname]
+    end
     # Sleep until this host's throttle window has elapsed.
     # Records when the request actually started.
     # @param hostname [String]

data/lib/scraper_utils/mechanize_utils/agent_config.rb CHANGED Viewed

@@ -177,7 +177,7 @@ module ScraperUtils
       end
       def pre_connect_hook(_agent, request)
-        hostname = request.respond_to?(:uri) ? request.uri.host : 'unknown'
+        hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
         @throttler.before_request(hostname)
         if DebugUtils.verbose?
           ScraperUtils::LogUtils.log(
@@ -191,7 +191,8 @@ module ScraperUtils
         status = response.respond_to?(:code) ? response.code.to_i : nil
         overloaded = [429, 500, 503].include?(status)
-        @throttler.after_request(uri.host, overloaded: overloaded)
+        hostname = uri.host || 'unknown'
+        @throttler.after_request(hostname, overloaded: overloaded)
         if DebugUtils.basic?
           ScraperUtils::LogUtils.log(
@@ -206,9 +207,7 @@ module ScraperUtils
         # Mechanize errors often carry the URI (via `error.uri`); fall back to 'unknown' when it is absent or has no host
         hostname = if error.respond_to?(:uri)
                      error.uri.host
-                   else
-                     'unknown'
-                   end
+                   end || 'unknown'
         @throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
       end

data/lib/scraper_utils/misc_utils.rb CHANGED Viewed

@@ -27,6 +27,10 @@ module ScraperUtils
         @throttler = nil
       end
+      def will_pause_till
+        throttler.will_pause_till(THROTTLE_HOSTNAME)
+      end
       private
       def throttler

data/lib/scraper_utils/spec_support.rb CHANGED Viewed

@@ -62,6 +62,13 @@ module ScraperUtils
       'certificate', 'approval', 'consent', 'permit'
     ].freeze
+    def self.fetch_url_head(url)
+      agent = Mechanize.new
+      # FIXME: Allow injection of a check that agrees to terms (to set a cookie) when needed, then re-fetches the URL
+      agent.head(url)
+    end
     def self.fetch_url_with_redirects(url)
       agent = Mechanize.new
       # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
@@ -104,7 +111,16 @@ module ScraperUtils
       geocodable = results
                      .map { |record| record["address"] }
                      .uniq
-                     .count { |text| ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case }
+                     .count do |text|
+                       ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
+                        if !ok && DebugUtils.verbose?
+                          ScraperUtils::LogUtils.log(
+                            "Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
+                          )
+                        end
+                       ok
+                       end
       puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
              "(#{(100.0 * geocodable / results.count).round(1)}%)"
       expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
@@ -223,6 +239,22 @@ module ScraperUtils
       end
     end
+    # Validates that info_urls are present (respond to a HEAD request with a 200 to 299 status)
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of info_url checks expected to pass (default: 75)
+    # @param variation [Integer] The variation allowed in addition to percentage (default: 3)
+    # @yield [String] Optional block given the info_url to customize URL fetching (e.g., handle terms agreement); its result must respond to `code`
+    # @raise [RuntimeError] if too few info_url checks pass
+    def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
+      if defined?(VCR)
+        VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
+          check_info_url_is_present(results, percentage, variation, &block)
+        end
+      else
+        check_info_url_is_present(results, percentage, variation, &block)
+      end
+    end
     # Validates that info_urls have expected details (unique URLs with content validation)
     # @param results [Array<Hash>] The results from scraping an authority
     # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
@@ -281,6 +313,43 @@ module ScraperUtils
     private
+    def self.check_info_url_is_present(results, percentage, variation, &block)
+      count = 0
+      failed = 0
+      fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
+      fib_indices.each do |index|
+        record = results[index]
+        info_url = record["info_url"]
+        puts "Checking info_url[#{index}]: #{info_url} is present..."
+        begin
+          page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
+          status = page.code.to_i
+        rescue Mechanize::ResponseCodeError => e
+          status = e.response_code.to_i
+        end
+        if [403, 429].include?(status)
+          puts "  Bot protection detected - skipping"
+          next
+        end
+        count += 1
+        if status.between?(200, 299)
+          puts "  OK: #{status}" if ENV['DEBUG']
+        else
+          failed += 1
+          puts "  Failed: #{status}"
+          min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
+          passed = count - failed
+          raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
+        end
+      end
+      puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
+    end
     def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
       count = 0
       failed = 0

data/lib/scraper_utils/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module ScraperUtils
-  VERSION = "0.14.1"
+  VERSION = "0.15.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.14.1
+  version: 0.15.0
 platform: ruby
 authors:
 - Ian Heggie
@@ -114,7 +114,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message: