scraper_utils 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +7 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +40 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +7 -7
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +11 -14
- metadata +16 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
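
The headline change: `FiberScheduler`, `AdaptiveDelay` and `RobotsChecker` are replaced by the new `ScraperUtils::Scheduler` (with `AdaptiveDelay` and `RobotsChecker` moving under `MechanizeUtils`). A minimal sketch of the new top-level API, based on the `scheduler.rb` source below — the authority list and `scrape_authority` helper are hypothetical placeholders, not part of the gem:

```ruby
require "scraper_utils"

# Hypothetical authority list and per-authority scrape helper.
AUTHORITIES = %i[example_city example_shire].freeze

def scrape_authority(authority)
  # ... fetch and save records for this authority ...
end

AUTHORITIES.each do |authority|
  ScraperUtils::Scheduler.register_operation(authority) do
    scrape_authority(authority)
  end
end

# Runs all registered fibers to completion and returns exceptions by authority
exceptions = ScraperUtils::Scheduler.run_operations
exceptions.each { |authority, e| warn "#{authority} failed: #{e.message}" }
```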
data/lib/scraper_utils/scheduler/thread_response.rb
ADDED

```ruby
# frozen_string_literal: true

module ScraperUtils
  module Scheduler
    # Encapsulates a response from an asynchronous command execution
    class ThreadResponse
      # @return [Symbol] The authority from the original command
      attr_reader :authority

      # @return [Object, nil] The result of the command
      attr_reader :result

      # @return [Exception, nil] Any error that occurred during execution
      attr_reader :error

      # @return [Float] The time taken to execute the command in seconds
      attr_reader :time_taken

      # @return [Time, nil] Optionally delay the next process
      attr_accessor :delay_till

      # Initialize a new async response
      #
      # @param authority [Symbol] The authority from the original command
      # @param result [Object, nil] The result of the command
      # @param error [Exception, nil] Any error that occurred during execution
      # @param time_taken [Float] The time taken to execute the command in seconds
      def initialize(authority, result, error, time_taken)
        @authority = authority
        @result = result
        @error = error
        @time_taken = time_taken
        @delay_till = nil
      end

      # Check if the command execution was successful
      #
      # @return [Boolean] true if successful, false otherwise
      def success?
        @error.nil?
      end

      # Return result or raise error
      # @return [Object] Result of request
      def result!
        return @result if success?

        raise @error
      end

      # Provide a readable inspection of the response
      # @return [String] Readable representation
      def inspect
        status = success? ? "success" : "FAILED"
        error_info = success? ? "" : " - #{error.class}: #{error.message}"
        "#<#{self.class} authority=#{authority} #{status}#{error_info} time=#{time_taken}>"
      end
    end
  end
end
```
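
A short sketch of how this value object behaves (the authority symbol and values are hypothetical):

```ruby
ok = ScraperUtils::Scheduler::ThreadResponse.new(:example_authority, "page html", nil, 0.42)
ok.success? # => true
ok.result!  # => "page html"
ok.inspect  # => "#<ScraperUtils::Scheduler::ThreadResponse authority=example_authority success time=0.42>"

failed = ScraperUtils::Scheduler::ThreadResponse.new(:example_authority, nil, RuntimeError.new("boom"), 30.0)
failed.success? # => false
failed.result!  # raises RuntimeError: boom
```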
data/lib/scraper_utils/scheduler.rb
ADDED

```ruby
# frozen_string_literal: true

require "fiber"

require_relative "scheduler/constants"
require_relative "scheduler/operation_registry"
require_relative "scheduler/operation_worker"

# value objects
require_relative "scheduler/process_request"
require_relative "scheduler/thread_request"

module ScraperUtils
  # A utility module to coordinate the scheduling of work,
  # * interleaving multiple operations (scraping of an authority's site)
  #   using Fibers (cooperative concurrency) so your code and the libraries you call don't have to be thread safe
  # * performing mechanize network I/O in parallel using Threads
  #
  # Process flow
  # 0. operation_workers start with response = true as the first resume passes args to block and response is ignored
  # 1. resumes fiber of operation_worker with the last response when `Time.now` >= resume_at
  # 2. worker fiber calls {Scheduler.execute_request}
  #    a. sets resume_at based on calculated delay and waiting_for_response
  #    b. pushes request onto local request queue if parallel, otherwise
  #       executes request immediately in fiber and passes response to save_thread_response
  #    c. fiber yields true to main fiber to indicate it wants to continue after resume_at / response arrives
  # 3. one thread for each fiber (if parallel), thread:
  #    a. pops request
  #    b. executes request
  #    c. pushes response onto global response queue (includes response_time)
  # 4. main fiber - schedule_all loop
  #    a. pops any responses and calls save_thread_response on operation_worker
  #    b. resumes(true) operation_worker (fiber) when `Time.now` >= resume_at and not waiting_for_response
  # 5. When worker fiber is finished it returns false to indicate it is finished
  #    OR when shutdown is called resume(false) is called to indicate worker fiber should not continue
  #
  # save_thread_response:
  # * Updates running average and calculates next_resume_at
  #
  # fiber aborts processing if 2nd argument is true
  # fiber returns nil when finished
  #
  # Workers:
  # * Push process requests onto individual request queues for their thread to process, and yield(true) to scheduler
  #
  # Thread safe Implementation:
  # * Uses fibers for each authority with its own mechanize agent so operations don't need to be thread safe
  # * Only Mechanize requests are run in threads in parallel whilst they wait for network response
  # * Uses message passing (using Queue's) to avoid having to share state between threads.
  # * Execute request does not return till the response has been received from the thread,
  #   so the fiber's mechanize agent that is shared with the thread isn't used in multiple threads at once
  # * Only one execute request per authority fiber can be in the thread request queue at any one time
  module Scheduler
    # @!group Main fiber / thread Api
    # These methods should only be called from the main (initial) fiber

    class << self
      # Controls if network I/O requests will be processed in parallel using threads
      #
      # @return [Boolean] true if processing network I/O in parallel using threads, otherwise false
      # @note Defaults to true unless the MORPH_DISABLE_THREADS ENV variable is set
      attr_accessor :threaded

      # @return (see #threaded)
      alias threaded? threaded

      # Controls the maximum number of concurrent workers (fibers with optional threads)
      #
      # @return [Integer] max concurrent workers using fibers and threads, defaults to MORPH_MAX_WORKERS env variable or 50
      attr_accessor :max_workers

      # @return [Hash{Symbol => Exception}] exceptions by authority
      attr_reader :exceptions

      # Returns the run_operations timeout
      # On timeout a message will be output and the ruby program will exit with exit code 124.
      #
      # @return [Integer] Overall process timeout in seconds (default MORPH_RUN_TIMEOUT ENV value or 6 hours)
      attr_accessor :run_timeout

      # Private accessors for internal use

      private

      attr_reader :initial_resume_at, :operation_registry, :reset, :response_queue, :totals
    end

    # Resets the scheduler state. Use before retrying failed authorities.
    def self.reset!
      @operation_registry&.shutdown
      @operation_registry = nil
      @response_queue.close if @response_queue
      @threaded = ENV["MORPH_DISABLE_THREADS"].to_s.empty?
      @max_workers = [1, ENV.fetch('MORPH_MAX_WORKERS', Constants::DEFAULT_MAX_WORKERS).to_i].max
      @exceptions = {}
      @totals = Hash.new { 0 }
      @initial_resume_at = Time.now
      @response_queue = Thread::Queue.new if self.threaded?
      @operation_registry = OperationRegistry.new
      @reset = true
      @run_timeout = ENV.fetch('MORPH_RUN_TIMEOUT', Constants::DEFAULT_TIMEOUT).to_i
      nil
    end

    # reset on class load
    reset!

    # Registers a block to scrape for a specific authority
    #
    # Block yields(:delay) when operation.resume_at is in the future, and returns :finished when finished
    # @param authority [Symbol] the name of the authority being processed
    # @yield to the block containing the scraping operation to be run in the fiber
    def self.register_operation(authority, &block)
      fiber = Fiber.new do |continue|
        begin
          raise "Terminated fiber for #{authority} before block run" unless continue

          block.call
        rescue StandardError => e
          # Store exception against the authority
          exceptions[authority] = e
        ensure
          # Clean up when done regardless of success/failure
          operation_registry&.deregister
        end
        # no further requests
        nil
      end

      operation = operation_registry&.register(fiber, authority)

      if DebugUtils.basic?
        LogUtils.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
      end
      if operation_registry&.size >= @max_workers
        LogUtils.log "Running batch of #{operation_registry&.size} operations immediately"
        run_operations
      end
      # return operation for ease of testing
      operation
    end

    # Run all registered operations until completion
    #
    # @return [Hash] Exceptions that occurred during execution
    def self.run_operations
      monitor_run_time = Thread.new do
        sleep run_timeout
        desc = "#{(run_timeout / 3600.0).round(1)} hours"
        desc = "#{(run_timeout / 60.0).round(1)} minutes" if run_timeout < 100 * 60
        desc = "#{run_timeout} seconds" if run_timeout < 100
        LogUtils.log "ERROR: Script exceeded maximum allowed runtime of #{desc}!\n" \
                     "Forcibly terminating process!"
        Process.exit!(124)
      end
      count = operation_registry&.size

      # Main scheduling loop - process till there is nothing left to do
      until @operation_registry.empty?
        save_thread_responses
        resume_next_operation
      end

      report_summary(count)

      exceptions
    ensure
      # Kill the monitoring thread if we finish normally
      monitor_run_time.kill if monitor_run_time.alive?
      monitor_run_time.join(2)
    end

    # ===========================================================
    # @!group Fiber Api
    # These methods should be called from the worker's own fiber but can be called from the main fiber

    # Execute Mechanize network request in parallel using the fiber's thread
    # This allows multiple network I/O requests to be waiting for a response in parallel
    # whilst responses that have arrived can be processed by their fibers.
    #
    # @example Replace this code in your scraper
    #   page = agent.get(url_period(url, period, webguest))
    #
    # @example With this code
    #   page = ScraperUtils::Scheduler.execute_request(agent, :get, [url_period(url, period, webguest)])
    #
    # @param client [MechanizeClient] client to be used to process request
    # @param method_name [Symbol] method to be called on client
    # @param args [Array] Arguments to be used with method call
    # @return [Object] response from method call on client
    def self.execute_request(client, method_name, args)
      operation = current_operation
      # execute immediately if not in a worker fiber
      return client.send(method_name, *args) unless operation

      request = Scheduler::ProcessRequest.new(operation.authority, client, method_name, args)
      LogUtils.log "Submitting request #{request.inspect}" if DebugUtils.basic?
      response = operation.submit_request(request)
      unless response.is_a?(ThreadResponse)
        raise "Expected ThreadResponse, got: #{response.inspect}"
      end
      response.result!
    end

    # Gets the authority associated with the current fiber or thread
    #
    # @return [Symbol, nil] the authority name or nil if not in a fiber
    def self.current_authority
      current_operation&.authority
    end

    # @!endgroup
    # ===========================================================

    private

    # Save results from threads in operation state so more operation fibers can be resumed
    def self.save_thread_responses
      while (thread_response = get_response)
        operation = @operation_registry&.find(thread_response.authority)
        operation&.save_thread_response(thread_response)
        next if operation

        LogUtils.log "WARNING: orphaned thread response ignored: #{thread_response.inspect}", thread_response.authority
      end
    end

    # Resume next operation or sleep POLL_PERIOD if none are ready
    def self.resume_next_operation
      delay = Constants::POLL_PERIOD
      # Find the operation that is ready to run with the earliest resume_at
      can_resume_operations = @operation_registry&.can_resume
      operation = can_resume_operations&.first

      if !operation
        # All the fibers must be waiting for responses, so sleep a bit to allow the responses to arrive
        @operation_registry&.cleanup_zombies
        sleep(delay)
        @totals[:wait_response] += delay
      else
        delay = [(operation.resume_at - Time.now).to_f, delay].min
        if delay.positive?
          # Wait a bit for a fiber to be ready to run
          sleep(delay)
          waiting_for_delay = delay * can_resume_operations&.size.to_f / (@operation_registry&.size || 1)
          @totals[:wait_delay] += waiting_for_delay
          @totals[:wait_response] += delay - waiting_for_delay
        else
          @totals[:resume_count] += 1
          # resume fiber with response to last request that is ready to be resumed now
          operation.resume
        end
        operation
      end
    end

    # Return the next response, returns nil if queue is empty
    #
    # @return [ThreadResponse, nil] Result of request execution
    def self.get_response(non_block = true)
      return nil if @response_queue.nil? || (non_block && @response_queue.empty?)

      @response_queue.pop(non_block)
    end

    def self.current_operation
      @operation_registry&.find
    end

    def self.report_summary(count)
      wait_delay_percent = 0
      wait_response_percent = 0
      delay_requested = [@totals[:wait_delay], @totals[:wait_response]].sum
      if delay_requested.positive?
        wait_delay_percent = (100.0 * @totals[:wait_delay] / delay_requested).round(1)
        wait_response_percent = (100.0 * @totals[:wait_response] / delay_requested).round(1)
      end
      puts
      LogUtils.log "Scheduler processed #{@totals[:resume_count]} calls for #{count} registrations, " \
                   "with #{wait_delay_percent}% of #{delay_requested.round(1)} seconds spent keeping under max_load, " \
                   "and #{wait_response_percent}% waiting for network I/O requests."
      puts
    end
  end
end
```
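
Tying the `@example` tags above together, a sketch of a worker fiber routing its Mechanize I/O through `execute_request` — the URLs and `process` helper are hypothetical, and each registered operation gets its own agent, per the thread-safety notes in the module docs:

```ruby
require "mechanize"
require "scraper_utils"

agent = Mechanize.new # one agent per registered operation / fiber

ScraperUtils::Scheduler.register_operation(:example_authority) do
  ["https://example.com/page1", "https://example.com/page2"].each do |url|
    # Blocks only this fiber (not the whole process) until the thread
    # returns a ThreadResponse; other authorities' fibers keep running.
    page = ScraperUtils::Scheduler.execute_request(agent, :get, [url])
    process(page) # hypothetical record-extraction helper
  end
end
ScraperUtils::Scheduler.run_operations
```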
data/lib/scraper_utils.rb
CHANGED

```diff
@@ -1,20 +1,21 @@
 # frozen_string_literal: true
 
-require "scraper_utils/
+require "scraper_utils/version"
+
+# Public Apis (responsible for requiring their own dependencies)
 require "scraper_utils/authority_utils"
 require "scraper_utils/cycle_utils"
 require "scraper_utils/data_quality_monitor"
 require "scraper_utils/date_range_utils"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
-require "scraper_utils/fiber_scheduler"
 require "scraper_utils/log_utils"
+require "scraper_utils/randomize_utils"
+require "scraper_utils/scheduler"
+
+# Mechanize utilities
 require "scraper_utils/mechanize_actions"
-require "scraper_utils/mechanize_utils/agent_config"
 require "scraper_utils/mechanize_utils"
-require "scraper_utils/randomize_utils"
-require "scraper_utils/robots_checker"
-require "scraper_utils/version"
 
 # Utilities for planningalerts scrapers
 module ScraperUtils
@@ -22,17 +23,13 @@ module ScraperUtils
   AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"
 
   # Fatal Error
-  class Error < StandardError
-  end
+  class Error < StandardError; end
 
   # Fatal error with the site - retrying won't help
-  class UnprocessableSite < Error
-  end
+  class UnprocessableSite < Error; end
 
-  #
-
-  class UnprocessableRecord < Error
-  end
+  # Fatal Error for a record - other records may be processable
+  class UnprocessableRecord < Error; end
 
   def self.australian_proxy
     ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
```
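
A sketch of the clarified error hierarchy in use — the record shape and check are hypothetical:

```ruby
record = { "council_reference" => nil } # hypothetical record

begin
  if record["council_reference"].to_s.empty?
    # UnprocessableRecord aborts this record without failing the whole scrape
    raise ScraperUtils::UnprocessableRecord, "record is missing council_reference"
  end
rescue ScraperUtils::UnprocessableRecord => e
  warn "Skipping record: #{e.message}" # other records may still be processable
end
```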
metadata
CHANGED

```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.5.1
+  version: 0.6.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-03-
+date: 2025-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -64,6 +64,7 @@ files:
 - ".rspec"
 - ".rubocop.yml"
 - ".travis.yml"
+- ".yardopts"
 - CHANGELOG.md
 - GUIDELINES.md
 - Gemfile
@@ -73,30 +74,39 @@ files:
 - Rakefile
 - SPECS.md
 - bin/console
+- bin/rspec
 - bin/setup
 - docs/debugging.md
 - docs/example_scrape_with_fibers.rb
 - docs/example_scraper.rb
+- docs/fibers_and_threads.md
 - docs/getting_started.md
 - docs/interleaving_requests.md
 - docs/mechanize_utilities.md
+- docs/parallel_requests.md
 - docs/randomizing_requests.md
 - docs/reducing_server_load.md
 - lib/scraper_utils.rb
-- lib/scraper_utils/adaptive_delay.rb
 - lib/scraper_utils/authority_utils.rb
 - lib/scraper_utils/cycle_utils.rb
 - lib/scraper_utils/data_quality_monitor.rb
 - lib/scraper_utils/date_range_utils.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
-- lib/scraper_utils/fiber_scheduler.rb
 - lib/scraper_utils/log_utils.rb
 - lib/scraper_utils/mechanize_actions.rb
 - lib/scraper_utils/mechanize_utils.rb
+- lib/scraper_utils/mechanize_utils/adaptive_delay.rb
 - lib/scraper_utils/mechanize_utils/agent_config.rb
+- lib/scraper_utils/mechanize_utils/robots_checker.rb
 - lib/scraper_utils/randomize_utils.rb
-- lib/scraper_utils/
+- lib/scraper_utils/scheduler.rb
+- lib/scraper_utils/scheduler/constants.rb
+- lib/scraper_utils/scheduler/operation_registry.rb
+- lib/scraper_utils/scheduler/operation_worker.rb
+- lib/scraper_utils/scheduler/process_request.rb
+- lib/scraper_utils/scheduler/thread_request.rb
+- lib/scraper_utils/scheduler/thread_response.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
 homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -106,7 +116,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.5.1
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.6.0
  changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message:
```
data/lib/scraper_utils/adaptive_delay.rb
REMOVED

```ruby
# frozen_string_literal: true

require "uri"

module ScraperUtils
  # Adapts delays between requests based on server response times.
  # Target delay is proportional to response time based on max_load setting.
  # Uses an exponential moving average to smooth variations in response times.
  class AdaptiveDelay
    DEFAULT_MIN_DELAY = 0.0
    DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize

    attr_reader :min_delay, :max_delay, :max_load

    # Creates a new adaptive delay calculator
    #
    # @param min_delay [Float] Minimum delay between requests in seconds
    # @param max_delay [Float] Maximum delay between requests in seconds
    # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
    #   Lower values are more conservative (e.g., 20% = 4x response time delay)
    def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
      @delays = {} # domain -> last delay used
      @min_delay = min_delay.to_f
      @max_delay = max_delay.to_f
      @max_load = max_load.to_f.clamp(1.0, 99.0)
      @response_multiplier = (100.0 - @max_load) / @max_load

      return unless DebugUtils.basic?

      ScraperUtils::FiberScheduler.log(
        "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
        "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
      )
    end

    # @param uri [URI::Generic, String] The URL to extract the domain from
    # @return [String] The domain in the format "scheme://host"
    def domain(uri)
      uri = URI(uri) unless uri.is_a?(URI)
      "#{uri.scheme}://#{uri.host}".downcase
    end

    # @param uri [URI::Generic, String] URL to get delay for
    # @return [Float] Current delay for the domain, or min_delay if no delay set
    def delay(uri)
      @delays[domain(uri)] || @min_delay
    end

    # @param uri [URI::Generic, String] URL the response came from
    # @param response_time [Float] Time in seconds the server took to respond
    # @return [Float] The calculated delay to use with the next request
    def next_delay(uri, response_time)
      uris_domain = domain(uri)
      target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
      current_delay = @delays[uris_domain] || target_delay
      delay = ((9.0 * current_delay) + target_delay) / 10.0
      delay = delay.clamp(@min_delay, @max_delay)

      if DebugUtils.basic?
        ScraperUtils::FiberScheduler.log(
          "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
          "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
        )
      end

      @delays[uris_domain] = delay
      delay
    end
  end
end
```
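
For reference, a worked example of the exponential moving average this removed class implemented (run against 0.5.1, where `ScraperUtils::AdaptiveDelay` still exists; the numbers follow directly from the formulas above):

```ruby
calc = ScraperUtils::AdaptiveDelay.new(max_load: 20.0)
# response_multiplier = (100 - 20) / 20 = 4.0

# First response from a domain takes 0.5s: target = 0.5 * 4.0 = 2.0s.
# With no previous delay recorded, current_delay seeds at the target,
# so the smoothed delay is (9 * 2.0 + 2.0) / 10 = 2.0s.
calc.next_delay("https://example.com/a", 0.5) # => 2.0

# A faster 0.1s response gives target = 0.4s; the EMA moves one tenth
# of the way towards it: (9 * 2.0 + 0.4) / 10 = 1.84s.
calc.next_delay("https://example.com/b", 0.1) # => 1.84
```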