scraper_utils 0.5.1 → 0.6.0
This diff compares the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the package versions as they appear in that registry.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +7 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +40 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +7 -7
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +11 -14
- metadata +16 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
--- /dev/null
+++ b/data/lib/scraper_utils/mechanize_utils/robots_checker.rb
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  module MechanizeUtils
+    # robots.txt checker with deliberately simplistic rules
+    class RobotsChecker
+      # @return [String] Lowercased user_agent for matching
+      attr_reader :user_agent
+
+      # Initialize with full user agent string like:
+      # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+      # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+      # Checks for
+      # * Disallow for User-agent: bot_name and
+      # * Crawl-delay from either User-agent: bot name or * (default)
+      def initialize(user_agent)
+        @user_agent = extract_user_agent(user_agent).downcase
+        if DebugUtils.basic?
+          ScraperUtils::LogUtils.log(
+            "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+          )
+        end
+        @rules = {} # domain -> {rules: [], delay: int}
+        @delay = nil # Delay from last robots.txt check
+      end
+
+      # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+      # @param url [String] The full URL to check
+      # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+      def disallowed?(url)
+        return false unless url
+
+        uri = URI(url)
+        domain = "#{uri.scheme}://#{uri.host}"
+        path = uri.path || "/"
+
+        # Get or fetch robots.txt rules
+        rules = get_rules(domain)
+        return false unless rules # If we can't get robots.txt, assume allowed
+
+        # Store any delay found for this domain
+        @delay = rules[:our_delay]
+
+        # Check rules specific to our user agent
+        matches_any_rule?(path, rules[:our_rules])
+      end
+
+      # Returns the crawl delay (if any) that applied to the last URL checked
+      # Should be called after disallowed? to get relevant delay
+      # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+      def crawl_delay
+        @delay
+      end
+
+      private
+
+      def extract_user_agent(user_agent)
+        if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+          user_agent = ::Regexp.last_match(2)&.strip
+        end
+        user_agent&.strip
+      end
+
+      def matches_any_rule?(path, rules)
+        rules&.any? { |rule| path.start_with?(rule) }
+      end
+
+      def get_rules(domain)
+        return @rules[domain] if @rules.key?(domain)
+
+        begin
+          response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+          return nil unless response.code.start_with?("2") # 2xx response
+
+          rules = parse_robots_txt(response.body)
+          @rules[domain] = rules
+          rules
+        rescue StandardError => e
+          if DebugUtils.basic?
+            ScraperUtils::LogUtils.log(
+              "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+            )
+          end
+          nil
+        end
+      end
+
+      # Parse robots.txt content into structured rules
+      # Only collects rules for our specific user agent and generic crawl-delay
+      # @param content [String] The robots.txt content
+      # @return [Hash] Hash containing :our_rules and :our_delay
+      def parse_robots_txt(content)
+        sections = [] # Array of {agent:, rules:[], delay:} hashes
+        current_section = nil
+
+        content.each_line do |line|
+          line = line.strip.downcase
+          next if line.empty? || line.start_with?("#")
+
+          if line.start_with?("user-agent:")
+            agent = line.split(":", 2).last.strip
+            # Check if this is a continuation of the previous section
+            if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+              current_section[:agents] << agent
+            else
+              current_section = { agents: [agent], rules: [], delay: nil }
+              sections << current_section
+            end
+            next
+          end
+
+          next unless current_section # Skip rules before first user-agent
+
+          if line.start_with?("disallow:")
+            path = line.split(":", 2).last.strip
+            current_section[:rules] << path unless path.empty?
+          elsif line.start_with?("crawl-delay:")
+            delay = line.split(":", 2).last.strip.to_i
+            current_section[:delay] = delay if delay.positive?
+          end
+        end
+
+        # Sort sections by most specific agent match first
+        matched_section = sections.find do |section|
+          section[:agents].any? do |agent|
+            # Our user agent starts with the agent from robots.txt
+            @user_agent.start_with?(agent) ||
+              # Or the agent from robots.txt starts with our user agent
+              # (handles ScraperUtils matching ScraperUtils/1.0)
+              agent.start_with?(@user_agent)
+          end
+        end
+
+        # Use matched section or fall back to wildcard
+        if matched_section
+          {
+            our_rules: matched_section[:rules],
+            our_delay: matched_section[:delay]
+          }
+        else
+          # Find default section
+          default_section = sections.find { |s| s[:agents].include?("*") }
+          {
+            our_rules: [],
+            our_delay: default_section&.dig(:delay)
+          }
+        end
+      end
+    end
+  end
+end
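The checker's whole public surface is the three calls shown above: construct it with the scraper's full user-agent string, ask `disallowed?` per URL, then read `crawl_delay` for whatever delay applied to that check. A minimal usage sketch (the user-agent string and URL are illustrative):

```ruby
require "scraper_utils"

checker = ScraperUtils::MechanizeUtils::RobotsChecker.new(
  "Mozilla/5.0 (compatible; ScraperUtils/0.6.0; +https://github.com/ianheggie-oaf/scraper_utils)"
)

url = "https://example.com/planning/applications" # illustrative URL
if checker.disallowed?(url)
  puts "Skipping #{url} - disallowed for our user agent"
else
  sleep(checker.crawl_delay || 0) # delay from the robots.txt consulted by disallowed?
  # ... fetch the page ...
end
```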
--- a/data/lib/scraper_utils/mechanize_utils.rb
+++ b/data/lib/scraper_utils/mechanize_utils.rb
@@ -2,7 +2,10 @@
 
 require "mechanize"
 require "ipaddr"
-
+
+require_relative "mechanize_utils/adaptive_delay"
+require_relative "mechanize_utils/agent_config"
+require_relative "mechanize_utils/robots_checker"
 
 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
@@ -43,8 +46,8 @@ module ScraperUtils
 
     # Retrieves and logs the public IP address
     #
-    # @param agent [Mechanize, nil] Mechanize agent to use for IP
-    # @param force [Boolean] Force a new IP
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP find or nil when clearing cache
+    # @param force [Boolean] Force a new IP find, by clearing cache first
    # @return [String, nil] The public IP address
     def self.public_ip(agent = nil, force: false)
       @public_ip = nil if force
@@ -57,8 +60,8 @@ module ScraperUtils
 
     # Retrieves and logs the headers that make it through the proxy
     #
-    # @param agent [Mechanize, nil] Mechanize agent to use for IP
-    # @param force [Boolean] Force a new IP
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP find or nil when clearing cache
+    # @param force [Boolean] Force a new IP find, by clearing cache first
     # @return [String, nil] The list of headers in json format
     def self.public_headers(agent = nil, force: false)
       @public_headers = nil if force
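The reworded `@param` docs pin down the cache semantics: `force: true` clears the memoised value before the lookup, and a `nil` agent with `force: true` simply empties the cache. A short sketch (`mechanize_agent` is assumed here to be the agent constructor provided alongside `agent_config.rb`):

```ruby
require "scraper_utils"

agent = ScraperUtils::MechanizeUtils.mechanize_agent # assumed constructor
ScraperUtils::MechanizeUtils.public_ip(agent)              # fetches and caches the public IP
ScraperUtils::MechanizeUtils.public_ip(agent, force: true) # clears the cache, then fetches again
ScraperUtils::MechanizeUtils.public_ip(nil, force: true)   # only clears the cache
```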
--- a/data/lib/scraper_utils/randomize_utils.rb
+++ b/data/lib/scraper_utils/randomize_utils.rb
@@ -4,31 +4,34 @@ module ScraperUtils
   # Provides utilities for randomizing processing order in scrapers,
   # particularly helpful for distributing load and avoiding predictable patterns
   module RandomizeUtils
-
-
-
-
-
-
-      return collection.to_a if sequential?
+    class << self
+      # Controls if processing order can be randomized
+      #
+      # @return [Boolean] true if all processing is done sequentially, otherwise false
+      # @note Defaults to true unless the MORPH_DISABLE_RANDOM ENV variable is set
+      attr_accessor :random
 
-
+      # Reports if processing order will be randomized
+      #
+      # @return (see #random)
+      alias random? random
     end
 
-
-
-    # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
-    def self.sequential?
-      @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
-      @sequential || false
+    def self.reset!
+      @random = ENV["MORPH_DISABLE_RANDOM"].to_s.empty?
     end
 
-    #
+    # reset on class load
+    reset!
+
+    # Returns a randomized version of the input collection unless `.sequential?` is true.
     #
-    # @param
-    # @return [
-    def self.
-
+    # @param collection [Array, Enumerable] Collection of items
+    # @return [Array] Randomized unless {.sequential?} is true, otherwise original order
+    def self.randomize_order(collection)
+      return collection.to_a.shuffle if random?
+
+      collection.to_a
     end
   end
 end
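The control knob flips polarity here: 0.5.1 opted out of shuffling via `MORPH_PROCESS_SEQUENTIALLY`, while 0.6.0 shuffles by default and opts out via `MORPH_DISABLE_RANDOM`, with `reset!` re-reading the variable. A sketch of the new surface (the authority names are illustrative):

```ruby
require "scraper_utils"

authorities = %i[authority_a authority_b authority_c]     # illustrative names
ScraperUtils::RandomizeUtils.randomize_order(authorities) # shuffled by default

ENV["MORPH_DISABLE_RANDOM"] = "1"
ScraperUtils::RandomizeUtils.reset!                       # re-read the ENV variable
ScraperUtils::RandomizeUtils.random?                      # => false
ScraperUtils::RandomizeUtils.randomize_order(authorities) # original order
```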
--- /dev/null
+++ b/data/lib/scraper_utils/scheduler/operation_registry.rb
@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+
+require "fiber"
+
+require_relative "operation_worker"
+
+module ScraperUtils
+  module Scheduler
+    # Registry of all active OperationWorkers registered to be processed
+    class OperationRegistry
+
+      def initialize
+        @operations = {}
+        @fiber_ids = {}
+      end
+
+      def register(fiber, authority)
+        authority = authority.to_sym
+        operation = OperationWorker.new(fiber, authority, @response_queue)
+        @operations[authority] = operation
+        @fiber_ids[operation.fiber.object_id] = operation
+      end
+
+      # Remove yourself from registry, called from fiber
+      def deregister
+        operation = find
+        return unless operation
+
+        operation.close
+        # Remove operation from registry since shutdown has done all it can to shut down the thread and fiber
+        @operations.delete(operation.authority)
+        @fiber_ids.delete(operation.fiber.object_id)
+      end
+
+      def current_authority
+        find(Fiber.current.object_id)&.authority
+      end
+
+      # Find OperationWorker
+      # @param key [Integer, String, nil] Fiber's object_id or authority (default current fiber's object_id)
+      # @return [OperationWorker, nil] Returns worker or nil if not found
+      def find(key = nil)
+        key ||= Fiber.current.object_id
+        if key.is_a?(Symbol)
+          @operations[key]
+        elsif key.is_a?(Integer)
+          @fiber_ids[key]
+        end
+      end
+
+      # Removes operations
+      def shutdown
+        operations.each do |_key, operation|
+          operation.shutdown
+        end
+      end
+
+      # Returns true if there are no registered operations
+      def empty?
+        @operations.empty?
+      end
+
+      # Returns number of registered operations
+      def size
+        @operations.size
+      end
+
+      # Find operations that can be resumed in resume_at order (may include future resume_at)
+      #
+      # @return [Array{OperationWorker}] Operations that are alive and have a response to use with resume
+      def can_resume
+        @operations
+          .values
+          .select { |op| op.can_resume? }
+          .sort_by(&:resume_at)
+      end
+
+      # Cleanup dead fibers that haven't removed themselves so we don't loop forever
+      def cleanup_zombies
+        dead_operations = @operations.values.reject(&:alive?)
+
+        dead_operations.each do |operation|
+          LogUtils.log "WARNING: removing dead operation for #{operation.authority} - it should have cleaned up after itself!"
+          operation.shutdown
+          @operations.delete(operation.authority)
+          @fiber_ids.delete(operation.fiber.object_id)
+        end
+      end
+
+      # Save the thread response into the thread and mark that it can continue
+      def process_thread_response(response)
+        operation = find(response.authority)
+        operation&.save_thread_response response
+      end
+
+      private
+
+      attr_accessor :operations
+    end
+  end
+end
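The registry indexes each worker twice, by authority `Symbol` and by the fiber's `object_id`, so `find` works both from the main fiber (which knows the authority) and from inside a worker fiber (which only knows `Fiber.current`). A hypothetical direct use, purely to show the dual lookup; in practice the Scheduler owns the registry:

```ruby
require "scraper_utils"

# Hypothetical direct use - the Scheduler normally owns the registry.
registry = ScraperUtils::Scheduler::OperationRegistry.new

fiber = Fiber.new { |response| Fiber.yield } # placeholder operation body
registry.register(fiber, :example_authority) # must run on the main fiber

registry.find(:example_authority) # lookup by authority Symbol
registry.find(fiber.object_id)    # lookup by the fiber's object_id
registry.size                     # => 1
```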
--- /dev/null
+++ b/data/lib/scraper_utils/scheduler/operation_worker.rb
@@ -0,0 +1,199 @@
+# frozen_string_literal: true
+
+require_relative "constants"
+require_relative 'process_request'
+
+module ScraperUtils
+  module Scheduler
+    # Handles the processing of a registered operation and associated fiber and thread state
+    class OperationWorker
+
+      class NotReadyError < RuntimeError; end
+
+      # @return [Fiber] The fiber
+      attr_reader :fiber
+
+      # @return [Symbol] The authority name associated with this fiber
+      attr_reader :authority
+
+      # @return [Time] When the fiber should be delayed till / ready to resume at
+      attr_accessor :resume_at
+
+      # @return [ThreadResponse, nil] The response to be passed on the next resume
+      attr_accessor :response
+
+      # @return [Boolean] Waiting for a response
+      attr_reader :waiting_for_response
+
+      # @return [Thread] Thread used
+      attr_reader :thread
+
+      # @return [Thread::Queue] The request queue for the thread
+      attr_reader :request_queue
+
+      def self.next_resume_at
+        @next_resume_at = [@next_resume_at, Time.now - 0.001].compact.max + 0.001
+      end
+
+      # Fiber has not finished running
+      def alive?
+        fiber.alive?
+      end
+
+      # Worker has the necessary state to be resumed
+      def can_resume?
+        !@response.nil? && !@resume_at.nil? && alive?
+      end
+
+      # Save thread response from main or worker fiber
+      def save_thread_response(response)
+        raise "#{authority} Wasn't waiting for response! Got: #{response.inspect}" unless @waiting_for_response
+        @response = response
+        @waiting_for_response = false
+        @resume_at = [response&.delay_till, Time.now].compact.max
+        if DebugUtils.basic?
+          log "Received #{response&.class&.name || 'nil response'} from thread for fiber #{authority} in #{response&.time_taken&.round(3)}s"
+        end
+        response
+      end
+
+      # close resources from worker fiber
+      # Called by worker fiber just before it exits
+      def close
+        validate_fiber(main: false)
+        # Signal thread to finish processing, then wait for it
+        @request_queue&.close
+        @thread&.join(60)
+        # drop references for GC
+        @request_queue = nil
+        @thread = nil
+        # make can_resume? false
+        clear_resume_state
+      end
+
+      # ===================================================
+      # @! Main Fiber API
+
+      # Initialize a new Worker Fiber and Thread, called from the main Fiber
+      #
+      # The Thread executes ThreadRequest objects from the request_queue and pushes
+      # responses to the global response_queue.
+      #
+      # @param fiber [Fiber] Fiber to process authority block
+      # @param authority [Symbol] Authority label
+      # @param response_queue [Thread::Queue, nil] Queue for thread responses if enabled
+      def initialize(fiber, authority, response_queue)
+        raise(ArgumentError, "Fiber and Authority must be provided") unless fiber && authority
+        validate_fiber(main: true)
+
+        @fiber = fiber
+        @authority = authority
+        @response_queue = response_queue
+        @fiber.instance_variable_set(:@operation_worker, self)
+        if response_queue
+          @request_queue = Thread::Queue.new
+          @thread = Thread.new do
+            Thread.current[:current_authority] = authority
+            while (request = @request_queue&.pop)
+              @response_queue.push request.execute
+            end
+          end
+        end
+        @resume_at = self.class.next_resume_at
+        @waiting_for_response = false
+        # First resume response is ignored
+        @response = true
+      end
+
+      # Resume an operation fiber and queue request if there is any from main fiber
+      #
+      # @return [ThreadRequest, nil] request returned by resume or nil if finished
+      def resume
+        raise ClosedQueueError unless alive?
+        raise NotReadyError, "Cannot resume #{authority} without response!" unless @response
+        validate_fiber(main: true)
+
+        request = @fiber.resume(@response)
+        # submit the next request for processing
+        submit_request(request) if request
+        request
+      end
+
+      # Shutdown worker called from main fiber
+      def shutdown
+        validate_fiber(main: true)
+
+        clear_resume_state
+        if @fiber&.alive?
+          # Trigger fiber to raise an error and thus call deregister
+          @fiber.resume(nil)
+        end
+      end
+
+      # ===================================================
+      # @! Worker Fiber API
+
+      # Queue a thread request to be executed from worker fiber
+      # otherwise locally if parallel processing is disabled
+      #
+      # Process flow if parallel enabled:
+      # 1. This method:
+      #    a. pushes request onto local @request_queue
+      #    b. calls Fiber.yield(true) so Scheduler can run other fibers
+      # 2. Meanwhile, this fiber's thread:
+      #    a. pops request off queue
+      #    b. processes request
+      #    c. pushes response to global response queue
+      # 3. Meanwhile, Scheduler on Main fiber:
+      #    a. pops response from response queue as they arrive
+      #       * calls {#save_thread_response} on associated worker to save each response
+      #    c. calls {#resume} on worker when it is its turn (based on resume_at) and it can_resume (has @response)
+      #
+      # If parallel processing is not enabled, then the processing occurs in the worker's fiber
+      #
+      # @param request [ThreadRequest] The request to be processed in thread
+      def submit_request(request)
+        raise NotReadyError, "Cannot make a second request before the first has responded!" if @waiting_for_response
+        raise ArgumentError, "Must be passed a valid ThreadRequest! Got: #{request.inspect}" unless request.is_a? ThreadRequest
+        validate_fiber(main: false)
+
+        @response = nil
+        @waiting_for_response = true
+        if @request_queue
+          @request_queue&.push request
+          response = Fiber.yield true
+          raise "Terminated fiber for #{authority} as requested" unless response
+        else
+          response = save_thread_response request.execute
+        end
+        response
+      end
+
+      private
+
+      def validate_fiber(main: false)
+        required_fiber = main ? Constants::MAIN_FIBER : @fiber
+        current_id = Fiber.current.object_id
+        return if current_id == required_fiber.object_id
+
+        desc = main ? 'main' : 'worker'
+        we_are = if current_id == Constants::MAIN_FIBER.object_id
+                   'main'
+                 elsif current_id == @fiber.object_id
+                   'worker'
+                 else
+                   'other'
+                 end
+        raise ArgumentError,
+              "Must be run within the #{desc} not #{we_are} fiber!"
+      end
+
+      # Clear resume state so the operation won't be resumed
+      def clear_resume_state
+        @resume_at = nil
+        @response = nil
+        @waiting_for_response = false
+      end
+    end
+  end
+end
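The numbered flow in the `submit_request` comment is the heart of the design: the worker fiber parks on `Fiber.yield` while its dedicated thread does the blocking work, so one process can overlap many slow requests. A stripped-down, self-contained illustration of that same queue-and-yield pattern (plain Ruby, not the gem's code):

```ruby
request_queue  = Thread::Queue.new
response_queue = Thread::Queue.new

worker_thread = Thread.new do
  while (request = request_queue.pop)   # blocks until a request arrives (nil on close)
    response_queue.push(request.call)   # 2a-2c: do the slow work, push the response
  end
end

worker_fiber = Fiber.new do
  request_queue.push(-> { :slow_result }) # 1a: queue the request
  response = Fiber.yield                  # 1b: hand control back to the scheduler
  puts "fiber received #{response.inspect}"
end

worker_fiber.resume           # scheduler starts the fiber; it yields after queueing
result = response_queue.pop   # 3a: scheduler collects the thread's response
worker_fiber.resume(result)   # 3c: resume the fiber with its response
request_queue.close
worker_thread.join
```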
--- /dev/null
+++ b/data/lib/scraper_utils/scheduler/process_request.rb
@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+
+require_relative "thread_request"
+
+module ScraperUtils
+  module Scheduler
+    # Encapsulates a request to be executed (usually asynchronously by the ThreadPool)
+    class ProcessRequest < ThreadRequest
+      # @return [Object] The object to call the method on
+      attr_reader :subject
+
+      # @return [Symbol] The method to call on the subject
+      attr_reader :method_name
+
+      # @return [Array] The arguments to pass to the method
+      attr_reader :args
+
+      # Initialize a new async request
+      #
+      # @param authority [Symbol, nil] Authority for correlating requests and responses
+      #   nil is used when threads are disabled to process locally without duplicating code
+      # @param subject [Object] The object to call the method on
+      # @param method_name [Symbol] The method to call on the subject
+      # @param args [Array] The arguments to pass to the method
+      # @raise [ArgumentError] If any required parameter is missing or invalid
+      def initialize(authority, subject, method_name, args)
+        super(authority)
+        @subject = subject
+        @method_name = method_name
+        @args = args
+
+        validate!
+      end
+
+      # Execute the request by calling the method on the subject
+      # If the subject has an instance variable @delay_till then that is added to the response
+      # @return [ThreadResponse] The result of the request
+      def execute
+        result = execute_block do
+          subject.send(method_name, *args)
+        end
+        result.delay_till = subject.instance_variable_get(:@delay_till)
+        result
+      end
+
+      private
+
+      # Validate that all required parameters are present and valid
+      #
+      # @raise [ArgumentError] If any parameter is missing or invalid
+      def validate!
+        raise ArgumentError, "Subject must be provided" unless @subject
+        raise ArgumentError, "Method name must be provided" unless @method_name
+        raise ArgumentError, "Args must be an array" unless @args.is_a?(Array)
+        raise ArgumentError, "Subject must respond to method" unless @subject&.respond_to?(@method_name)
+      end
+    end
+  end
+end
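`ProcessRequest` is a read-only bundle of subject, method name and args; `execute` replays the call via `ThreadRequest#execute_block`, which wraps the result (or exception) and timing in a `ThreadResponse`. A sketch with a hypothetical stand-in subject:

```ruby
require "scraper_utils"

# Hypothetical subject standing in for a Mechanize agent.
class FakeFetcher
  def get(url)
    "<html>#{url}</html>"
  end
end

request = ScraperUtils::Scheduler::ProcessRequest.new(
  :example_authority, FakeFetcher.new, :get, ["https://example.com/"]
)
response = request.execute # => ThreadResponse wrapping the result, error and time taken
```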
--- /dev/null
+++ b/data/lib/scraper_utils/scheduler/thread_request.rb
@@ -0,0 +1,51 @@
+# frozen_string_literal: true
+
+require_relative "thread_response"
+
+module ScraperUtils
+  module Scheduler
+    # Encapsulates a request that is pushed to the fiber's request queue to be executed by the Fiber's Thread
+    # The response is returned via the Scheduler's response queue
+    # @see {ProcessRequest}
+    class ThreadRequest
+      # @return [Symbol] Authority for correlating requests and responses
+      attr_reader :authority
+
+      # Initialize a new thread request
+      #
+      # @param authority [Symbol, nil] Authority for correlating requests and responses
+      def initialize(authority)
+        @authority = authority
+      end
+
+      # Execute a request and return ThreadResponse - use helper method `.execute_block`
+      def execute
+        raise NotImplementedError, "Implement in subclass"
+      end
+
+      # Execute a request by calling the block
+      # @return [ThreadResponse] The result of the request
+      def execute_block
+        start_time = Time.now
+        begin
+          result = yield
+          elapsed_time = Time.now - start_time
+          ThreadResponse.new(
+            authority,
+            result,
+            nil,
+            elapsed_time
+          )
+        rescue => e
+          elapsed_time = Time.now - start_time
+          ThreadResponse.new(
+            authority,
+            nil,
+            e,
+            elapsed_time
+          )
+        end
+      end
+    end
+  end
+end
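`ThreadRequest` is the abstract base: subclasses implement `execute`, typically via the `execute_block` helper, which times the block and returns a `ThreadResponse` carrying either the result or the raised exception. A hypothetical subclass showing the contract:

```ruby
require "scraper_utils"

# Hypothetical subclass, only to illustrate the execute/execute_block contract.
class SleepRequest < ScraperUtils::Scheduler::ThreadRequest
  def initialize(authority, seconds)
    super(authority)
    @seconds = seconds
  end

  def execute
    execute_block do
      sleep(@seconds) # any exception raised here becomes the response's error
      :done
    end
  end
end

SleepRequest.new(:example_authority, 0.1).execute # => ThreadResponse with result :done
```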