scraper_utils 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -8
- data/CHANGELOG.md +5 -0
- data/GUIDELINES.md +75 -0
- data/Gemfile +1 -1
- data/IMPLEMENTATION.md +33 -0
- data/README.md +226 -131
- data/SPECS.md +25 -0
- data/bin/console +1 -0
- data/bin/setup +2 -1
- data/lib/scraper_utils/adaptive_delay.rb +65 -0
- data/lib/scraper_utils/authority_utils.rb +2 -2
- data/lib/scraper_utils/data_quality_monitor.rb +53 -0
- data/lib/scraper_utils/db_utils.rb +2 -1
- data/lib/scraper_utils/debug_utils.rb +13 -20
- data/lib/scraper_utils/fiber_scheduler.rb +206 -0
- data/lib/scraper_utils/log_utils.rb +57 -26
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +255 -0
- data/lib/scraper_utils/mechanize_utils.rb +23 -29
- data/lib/scraper_utils/robots_checker.rb +144 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -0
- data/scraper_utils.gemspec +3 -8
- metadata +13 -74
data/lib/scraper_utils/mechanize_utils/agent_config.rb
ADDED
@@ -0,0 +1,255 @@
+# frozen_string_literal: true
+
+require "mechanize"
+require "ipaddr"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Configuration for a Mechanize agent with sensible defaults and configurable settings.
+    # Supports global configuration through {.configure} and per-instance overrides.
+    #
+    # @example Setting global defaults
+    #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
+    #     config.default_timeout = 90
+    #     config.default_random_delay = 5
+    #   end
+    #
+    # @example Creating an instance with defaults
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new
+    #
+    # @example Overriding specific settings
+    #   config = ScraperUtils::MechanizeUtils::AgentConfig.new(
+    #     timeout: 120,
+    #     random_delay: 10
+    #   )
+    class AgentConfig
+      # Class-level defaults that can be modified
+      class << self
+        # @return [Integer] Default timeout in seconds for agent connections
+        attr_accessor :default_timeout
+
+        # @return [Boolean] Default setting for compliance with headers and robots.txt
+        attr_accessor :default_compliant_mode
+
+        # @return [Integer, nil] Default average random delay in seconds
+        attr_accessor :default_random_delay
+
+        # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
+        attr_accessor :default_max_load
+
+        # @return [Boolean] Default setting for SSL certificate verification
+        attr_accessor :default_disable_ssl_certificate_check
+
+        # @return [Boolean] Default flag for Australian proxy preference
+        attr_accessor :default_australian_proxy
+
+        # @return [String, nil] Default Mechanize user agent
+        attr_accessor :default_user_agent
+
+        # Configure default settings for all AgentConfig instances
+        # @yield [self] Yields self for configuration
+        # @example
+        #   AgentConfig.configure do |config|
+        #     config.default_timeout = 90
+        #     config.default_random_delay = 5
+        #     config.default_max_load = 15
+        #   end
+        # @return [void]
+        def configure
+          yield self if block_given?
+        end
+
+        # Reset all configuration options to their default values
+        # @return [void]
+        def reset_defaults!
+          @default_timeout = 60
+          @default_compliant_mode = true
+          @default_random_delay = 3
+          @default_max_load = 20.0
+          @default_disable_ssl_certificate_check = false
+          @default_australian_proxy = nil
+          @default_user_agent = nil
+        end
+      end
+
+      # Set defaults on load
+      reset_defaults!
+
+
+      # @return [String] User agent string
+      attr_reader :user_agent
+
+      # Give access for testing
+
+      attr_reader :max_load
+      attr_reader :min_random
+      attr_reader :max_random
+
+      # Creates configuration for a Mechanize agent with sensible defaults
+      # @param timeout [Integer, nil] Timeout for agent connections (default: 60 unless changed)
+      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true unless changed)
+      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3 unless changed)
+      # @param max_load [Float, nil] Maximum server load percentage (nil = no response delay, default: 20%)
+      #   When compliant_mode is true, max_load is capped at 33%
+      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false unless changed)
+      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false unless changed)
+      # @param user_agent [String, nil] Configure Mechanize user agent
+      def initialize(timeout: nil,
+                     compliant_mode: nil,
+                     random_delay: nil,
+                     max_load: nil,
+                     disable_ssl_certificate_check: nil,
+                     australian_proxy: false,
+                     user_agent: nil)
+        @timeout = timeout.nil? ? self.class.default_timeout : timeout
+        @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
+        @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
+        @max_load = max_load.nil? ? self.class.default_max_load : max_load
+        @max_load = [@max_load || 20.0, 33.0].min if @compliant_mode
+        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent
+
+        @disable_ssl_certificate_check = disable_ssl_certificate_check.nil? ?
+                                           self.class.default_disable_ssl_certificate_check :
+                                           disable_ssl_certificate_check
+        @australian_proxy = australian_proxy.nil? ? self.class.default_australian_proxy : australian_proxy
+
+        # Validate proxy URL format if proxy will be used
+        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
+        if @australian_proxy
+          uri = begin
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
+          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
+            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
+          end
+          unless uri.host && uri.port
+            raise URI::InvalidURIError, "Proxy URL must include host and port"
+          end
+        end
+
+        if @random_delay
+          @min_random = Math.sqrt(@random_delay * 3.0 / 13.0).round(3)
+          @max_random = (3 * @min_random).round(3)
+        end
+
+        today = Date.today.strftime("%Y-%m-%d")
+        @user_agent = ENV['MORPH_USER_AGENT']&.sub("TODAY", today)
+        if @compliant_mode
+          version = ScraperUtils::VERSION
+          @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
+        end
+
+        @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
+        @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
+        display_options
+      end
+
+      # Configures a Mechanize agent with these settings
+      # @param agent [Mechanize] The agent to configure
+      # @return [void]
+      def configure_agent(agent)
+        agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if @disable_ssl_certificate_check
+
+        if @timeout
+          agent.open_timeout = @timeout
+          agent.read_timeout = @timeout
+        end
+        if @compliant_mode
+          agent.user_agent = user_agent
+          agent.request_headers ||= {}
+          agent.request_headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+          agent.request_headers["Upgrade-Insecure-Requests"] = "1"
+        end
+        if @australian_proxy
+          agent.agent.set_proxy(ScraperUtils.australian_proxy)
+          agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
+          verify_proxy_works(agent)
+        end
+
+        @connection_started_at = nil
+        agent.pre_connect_hooks << method(:pre_connect_hook)
+        agent.post_connect_hooks << method(:post_connect_hook)
+      end
+
+      private
+
+      def display_options
+        display_args = []
+        display_args << "timeout=#{@timeout}" if @timeout
+        if @australian_proxy
+          display_args << "australian_proxy=#{@australian_proxy.inspect}"
+        elsif ScraperUtils.australian_proxy.to_s.empty?
+          display_args << "#{ScraperUtils::AUSTRALIAN_PROXY_ENV_VAR} not set"
+        else
+          display_args << "australian_proxy=#{@australian_proxy.inspect}"
+        end
+        display_args << "compliant_mode" if @compliant_mode
+        display_args << "random_delay=#{@random_delay}" if @random_delay
+        display_args << "max_load=#{@max_load}%" if @max_load
+        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
+        display_args << "default args" if display_args.empty?
+        ScraperUtils::FiberScheduler.log "Configuring Mechanize agent with #{display_args.join(', ')}"
+      end
+
+      def pre_connect_hook(_agent, request)
+        @connection_started_at = Time.now
+        ScraperUtils::FiberScheduler.log "Pre Connect request: #{request.inspect} at #{@connection_started_at}" if ENV["DEBUG"]
+      end
+
+      def post_connect_hook(_agent, uri, response, _body)
+        raise ArgumentError, "URI must be present in post-connect hook" unless uri
+
+        response_time = Time.now - @connection_started_at
+        if ENV["DEBUG"]
+          ScraperUtils::FiberScheduler.log "Post Connect uri: #{uri.inspect}, response: #{response.inspect} after #{response_time} seconds"
+        end
+
+        if @robots_checker&.disallowed?(uri)
+          raise ScraperUtils::UnprocessableSite,
+                "URL is disallowed by robots.txt specific rules: #{uri}"
+        end
+
+        delays = {
+          robot_txt: @robots_checker&.crawl_delay&.round(3),
+          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+          random: (@min_random ? (rand(@min_random..@max_random) ** 2).round(3) : nil)
+        }
+        @delay = delays.values.compact.max
+        if @delay&.positive?
+          puts "Delaying #{@delay} seconds, max of #{delays.inspect}" if ENV["DEBUG"]
+          sleep(@delay)
+        end
+
+        response
+      end
+
+      def verify_proxy_works(agent)
+        my_ip = MechanizeUtils.public_ip(agent)
+        begin
+          IPAddr.new(my_ip)
+        rescue IPAddr::InvalidAddressError => e
+          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
+        end
+        ScraperUtils::FiberScheduler.log "Proxy is using IP address: #{my_ip.inspect}"
+        my_headers = MechanizeUtils::public_headers(agent)
+        begin
+          # Check response is JSON just to be safe!
+          headers = JSON.parse(my_headers)
+          puts "Proxy is passing headers:"
+          puts JSON.pretty_generate(headers['headers'])
+        rescue JSON::ParserError => e
+          puts "Couldn't parse public_headers: #{e}! Raw response:"
+          puts my_headers.inspect
+        end
+      rescue Net::OpenTimeout, Timeout::Error => e
+        raise "Proxy check timed out: #{e}"
+      rescue Errno::ECONNREFUSED, Net::HTTP::Persistent::Error => e
+        raise "Failed to connect to proxy: #{e}"
+      rescue Mechanize::ResponseCodeError => e
+        raise "Proxy check error: #{e}"
+      end
+    end
+  end
+end
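Usage sketch (not part of the published diff): the new AgentConfig is designed to take global defaults via .configure and allow per-instance overrides, as the @example comments above show. The values below are illustrative only.

  # Global defaults for every agent configured afterwards (example values)
  ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
    config.default_timeout = 90        # seconds
    config.default_random_delay = 5    # average delay in seconds
    config.default_max_load = 15       # target server load percentage
  end

  # Per-instance keyword arguments still override the defaults above
  config = ScraperUtils::MechanizeUtils::AgentConfig.new(timeout: 120, random_delay: 10)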
data/lib/scraper_utils/mechanize_utils.rb
CHANGED
@@ -1,32 +1,23 @@
 # frozen_string_literal: true
 
 require "mechanize"
+require "ipaddr"
+require "scraper_utils/mechanize_utils/agent_config"
 
 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
   module MechanizeUtils
     PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
+    HEADERS_ECHO_URL = "https://httpbin.org/headers"
 
-    # Creates and configures a Mechanize agent
-    #
-    # @param timeout [Integer, nil] Timeout for agent connections
-    # @param australian_proxy [Boolean] Whether to use an Australian proxy
+    # Creates and configures a Mechanize agent
+    # @param (see AgentConfig#initialize)
     # @return [Mechanize] Configured Mechanize agent
-    def self.mechanize_agent(
+    def self.mechanize_agent(**options)
       agent = Mechanize.new
-
-
-
-      # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
-      # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
-      # the real password.
-        agent.agent.set_proxy(ScraperUtils.australian_proxy)
-      end
-      if timeout
-        agent.open_timeout = timeout
-        agent.read_timeout = timeout
-      end
-      public_ip(agent) if use_proxy
+      config = AgentConfig.new(**options)
+      config.configure_agent(agent)
+      agent.instance_variable_set(:@scraper_utils_config, config)
       agent
     end
 
@@ -47,24 +38,27 @@ module ScraperUtils
         text = element.inner_text
         return "Maintenance: #{text}" if text&.match?(/maintenance/i)
       end
-
-      # Not in maintenance mode
       nil
     end
 
     # Retrieves and logs the public IP address
     #
-    # @param agent [Mechanize] Mechanize agent to use for IP lookup
-    # @param force [Boolean] Force a new IP lookup,
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+    # @param force [Boolean] Force a new IP lookup, by clearing cache first
    # @return [String] The public IP address
-    def self.public_ip(agent, force: false)
+    def self.public_ip(agent = nil, force: false)
      @public_ip = nil if force
-      @public_ip ||=
-
-
-
-
-
+      @public_ip ||= agent&.get(PUBLIC_IP_URL)&.body&.strip if agent
+    end
+
+    # Retrieves and logs the headers that make it through the proxy
+    #
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+    # @param force [Boolean] Force a new IP lookup, by clearing cache first
+    # @return [String] The list of headers in json format
+    def self.public_headers(agent = nil, force: false)
+      @public_headers = nil if force
+      @public_headers ||= agent&.get(HEADERS_ECHO_URL)&.body&.strip if agent
     end
   end
 end
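Usage sketch (not part of the published diff): mechanize_agent now simply forwards its keyword options to AgentConfig, so a scraper can request a fully configured agent in one call. The option values and URL below are illustrative.

  require "scraper_utils"

  agent = ScraperUtils::MechanizeUtils.mechanize_agent(
    timeout: 60,
    compliant_mode: true,
    australian_proxy: false
  )
  page = agent.get("https://example.com/planning-applications") # illustrative URL
  puts ScraperUtils::MechanizeUtils.public_ip(agent)             # cached after the first lookup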
data/lib/scraper_utils/robots_checker.rb
ADDED
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # robots.txt checker with deliberately simplistic rules
+  class RobotsChecker
+    # @return [String] Lowercased user_agent for matching
+    attr_reader :user_agent
+
+    # Initialize with full user agent string like:
+    # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+    # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+    # Checks for
+    # * Disallow for User-agent: bot_name and
+    # * Crawl-delay from either User-agent: bot name or * (default)
+    def initialize(user_agent)
+      @user_agent = extract_user_agent(user_agent).downcase
+      if ENV["DEBUG"]
+        ScraperUtils::FiberScheduler.log "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+      end
+      @rules = {} # domain -> {rules: [], delay: int}
+      @delay = nil # Delay from last robots.txt check
+    end
+
+    # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+    # @param url [String] The full URL to check
+    # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+    def disallowed?(url)
+      return false unless url
+
+      uri = URI(url)
+      domain = "#{uri.scheme}://#{uri.host}"
+      path = uri.path || "/"
+
+      # Get or fetch robots.txt rules
+      rules = get_rules(domain)
+      return false unless rules # If we can't get robots.txt, assume allowed
+
+      # Store any delay found for this domain
+      @delay = rules[:our_delay]
+
+      # Check rules specific to our user agent
+      matches_any_rule?(path, rules[:our_rules])
+    end
+
+    # Returns the crawl delay (if any) that applied to the last URL checked
+    # Should be called after disallowed? to get relevant delay
+    # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+    def crawl_delay
+      @delay
+    end
+
+    private
+
+    def extract_user_agent(user_agent)
+      if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+        user_agent = ::Regexp.last_match(2)&.strip
+      end
+      user_agent&.strip
+    end
+
+    def matches_any_rule?(path, rules)
+      rules&.any? { |rule| path.start_with?(rule) }
+    end
+
+    def get_rules(domain)
+      return @rules[domain] if @rules.key?(domain)
+
+      begin
+        response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+        return nil unless response.code.start_with?("2") # 2xx response
+
+        rules = parse_robots_txt(response.body)
+        @rules[domain] = rules
+        rules
+      rescue StandardError => e
+        ScraperUtils::FiberScheduler.log "Warning: Failed to fetch robots.txt for #{domain}: #{e.message}" if ENV["DEBUG"]
+        nil
+      end
+    end
+
+    # Parse robots.txt content into structured rules
+    # Only collects rules for our specific user agent and generic crawl-delay
+    # @param content [String] The robots.txt content
+    # @return [Hash] Hash containing :our_rules and :our_delay
+    def parse_robots_txt(content)
+      sections = [] # Array of {agent:, rules:[], delay:} hashes
+      current_section = nil
+
+      content.each_line do |line|
+        line = line.strip.downcase
+        next if line.empty? || line.start_with?("#")
+
+        if line.start_with?("user-agent:")
+          agent = line.split(":", 2).last.strip
+          # Check if this is a continuation of the previous section
+          if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+            current_section[:agents] << agent
+          else
+            current_section = { agents: [agent], rules: [], delay: nil }
+            sections << current_section
+          end
+          next
+        end
+
+        next unless current_section # Skip rules before first user-agent
+
+        if line.start_with?("disallow:")
+          path = line.split(":", 2).last.strip
+          current_section[:rules] << path unless path.empty?
+        elsif line.start_with?("crawl-delay:")
+          delay = line.split(":", 2).last.strip.to_i
+          current_section[:delay] = delay if delay.positive?
+        end
+      end
+
+      # Sort sections by most specific agent match first
+      matched_section = sections.find do |section|
+        section[:agents].any? do |agent|
+          # Our user agent starts with the agent from robots.txt
+          @user_agent.start_with?(agent) ||
+            # Or the agent from robots.txt starts with our user agent
+            # (handles ScraperUtils matching ScraperUtils/1.0)
+            agent.start_with?(@user_agent)
+        end
+      end
+
+      # Use matched section or fall back to wildcard
+      if matched_section
+        {
+          our_rules: matched_section[:rules],
+          our_delay: matched_section[:delay]
+        }
+      else
+        # Find default section
+        default_section = sections.find { |s| s[:agents].include?("*") }
+        {
+          our_rules: [],
+          our_delay: default_section&.dig(:delay)
+        }
+      end
+    end
+  end
+end
+
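Usage sketch (not part of the published diff): AgentConfig drives RobotsChecker from its post-connect hook, but the class can also be used directly. The user agent string and URL below are illustrative.

  checker = ScraperUtils::RobotsChecker.new(
    "Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-27; +https://github.com/ianheggie-oaf/scraper_utils)"
  )
  url = "https://example.com/planning/application/123" # illustrative
  raise ScraperUtils::UnprocessableSite, "Disallowed: #{url}" if checker.disallowed?(url)

  delay = checker.crawl_delay       # nil unless robots.txt specifies one
  sleep(delay) if delay&.positive?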
data/lib/scraper_utils.rb
CHANGED
@@ -1,10 +1,13 @@
 # frozen_string_literal: true
 
+require "scraper_utils/adaptive_delay"
 require "scraper_utils/authority_utils"
+require "scraper_utils/data_quality_monitor"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
 require "scraper_utils/log_utils"
 require "scraper_utils/mechanize_utils"
+require "scraper_utils/robots_checker"
 require "scraper_utils/version"
 
 # Utilities for planningalerts scrapers
data/scraper_utils.gemspec
CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
 
   spec.summary = "planningalerts scraper utilities"
   spec.description = "Utilities to help make planningalerts scrapers, " \
-
+                     "+especially multis easier to develop, run and debug."
   spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
   spec.license = "MIT"
 
@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
   # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
   else
     raise "RubyGems 2.0 or newer is required to protect against " \
-
+          "public gem pushes."
   end
 
   # Specify which files should be added to the gem when it is released.
@@ -40,10 +40,5 @@ Gem::Specification.new do |spec|
   spec.add_dependency "mechanize"
   spec.add_dependency "nokogiri"
   spec.add_dependency "sqlite3"
-
-  spec.add_development_dependency "rake"
-  spec.add_development_dependency "rspec"
-  spec.add_development_dependency "rubocop"
-  spec.add_development_dependency "simplecov"
-  spec.add_development_dependency "simplecov-console"
+  spec.metadata["rubygems_mfa_required"] = "true"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-02-
+date: 2025-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -52,76 +52,6 @@ dependencies:
     - - ">="
     - !ruby/object:Gem::Version
       version: '0'
-- !ruby/object:Gem::Dependency
-  name: rake
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: rspec
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: rubocop
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: simplecov
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-- !ruby/object:Gem::Dependency
-  name: simplecov-console
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-    - !ruby/object:Gem::Version
-      version: '0'
 description: Utilities to help make planningalerts scrapers, +especially multis easier
   to develop, run and debug.
 email:
@@ -134,18 +64,27 @@ files:
 - ".rspec"
 - ".rubocop.yml"
 - ".travis.yml"
+- CHANGELOG.md
+- GUIDELINES.md
 - Gemfile
+- IMPLEMENTATION.md
 - LICENSE.txt
 - README.md
 - Rakefile
+- SPECS.md
 - bin/console
 - bin/setup
 - lib/scraper_utils.rb
+- lib/scraper_utils/adaptive_delay.rb
 - lib/scraper_utils/authority_utils.rb
+- lib/scraper_utils/data_quality_monitor.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
+- lib/scraper_utils/fiber_scheduler.rb
 - lib/scraper_utils/log_utils.rb
 - lib/scraper_utils/mechanize_utils.rb
+- lib/scraper_utils/mechanize_utils/agent_config.rb
+- lib/scraper_utils/robots_checker.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
 homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -155,6 +94,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
+  rubygems_mfa_required: 'true'
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -170,8 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-
-rubygems_version: 2.7.6.2
+rubygems_version: 3.4.10
 signing_key:
 specification_version: 4
 summary: planningalerts scraper utilities