scraper_utils 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +19 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +39 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/enhancing_specs.md +100 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +9 -8
- data/docs/mechanize_utilities.md +4 -4
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +80 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +35 -34
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/spec_support.rb +67 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +12 -14
- metadata +18 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/mechanize_utils/agent_config.rb:

```diff
@@ -24,9 +24,9 @@ module ScraperUtils
   # )
   class AgentConfig
     DEFAULT_TIMEOUT = 60
-    DEFAULT_RANDOM_DELAY =
-    DEFAULT_MAX_LOAD =
-    MAX_LOAD_CAP =
+    DEFAULT_RANDOM_DELAY = 0
+    DEFAULT_MAX_LOAD = 50.0
+    MAX_LOAD_CAP = 80.0
 
     # Class-level defaults that can be modified
     class << self
@@ -67,10 +67,10 @@ module ScraperUtils
       # Reset all configuration options to their default values
       # @return [void]
       def reset_defaults!
-        @default_timeout = ENV.fetch('
+        @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
         @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-        @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i #
-        @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f #
+        @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
+        @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 50.0
         @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
         @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
         @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
```
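The class-level defaults above are read from Morph-style environment variables when the class loads. A minimal sketch of overriding them and re-reading (the values are invented; only the MORPH_* names and defaults come from the diff):

```ruby
require "scraper_utils"

# Invented values; only the MORPH_* names and their defaults come from the diff above.
ENV["MORPH_CLIENT_TIMEOUT"] = "30" # instead of DEFAULT_TIMEOUT (60)
ENV["MORPH_RANDOM_DELAY"]   = "5"  # instead of DEFAULT_RANDOM_DELAY (0)
ENV["MORPH_MAX_LOAD"]       = "20" # instead of DEFAULT_MAX_LOAD (50.0)

# Re-read the defaults after changing ENV
ScraperUtils::MechanizeUtils::AgentConfig.reset_defaults!
```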
data/lib/scraper_utils/mechanize_utils/agent_config.rb (continued):

```diff
@@ -85,7 +85,7 @@ module ScraperUtils
 
     # Give access for testing
 
-    attr_reader :max_load, :
+    attr_reader :max_load, :random_range
 
     # Creates Mechanize agent configuration with sensible defaults overridable via configure
     # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -125,21 +125,21 @@ module ScraperUtils
       @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
       if @australian_proxy
         uri = begin
-
-
-
-
+          URI.parse(ScraperUtils.australian_proxy.to_s)
+        rescue URI::InvalidURIError => e
+          raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+        end
         unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
           raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
         end
-        unless uri.host && uri.port
+        unless !uri.host.to_s.empty? && uri.port&.positive?
           raise URI::InvalidURIError, "Proxy URL must include host and port"
         end
       end
 
-      if @random_delay
-
-        @
+      if @random_delay&.positive?
+        min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
+        @random_range = min_random.round(3)..(3 * min_random).round(3)
       end
 
       today = Date.today.strftime("%Y-%m-%d")
```
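The single @random_delay value is replaced by a @random_range chosen so that the squared uniform sample used later averages back to the configured delay. A standalone sanity check (not gem code; random_delay = 5 is an invented example value):

```ruby
# For x uniform on min..3*min, E[x**2] = 13 * min**2 / 3, so picking
# min = sqrt(random_delay * 3.0 / 13.0) makes the squared sample average random_delay.
random_delay = 5
min_random = Math.sqrt(random_delay * 3.0 / 13.0)
random_range = min_random.round(3)..(3 * min_random).round(3)

samples = Array.new(100_000) { rand(random_range)**2 }
puts format("mean delay %.2f s (target %d s)", samples.sum / samples.size, random_delay)
```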
data/lib/scraper_utils/mechanize_utils/agent_config.rb (continued):

```diff
@@ -177,7 +177,6 @@ module ScraperUtils
         verify_proxy_works(agent)
       end
 
-      @connection_started_at = nil
       agent.pre_connect_hooks << method(:pre_connect_hook)
       agent.post_connect_hooks << method(:post_connect_hook)
     end
@@ -193,11 +192,11 @@ module ScraperUtils
         "australian_proxy=#{@australian_proxy.inspect}"
       end
       display_args << "compliant_mode" if @compliant_mode
-      display_args << "random_delay=#{@random_delay}" if @random_delay
+      display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
       display_args << "max_load=#{@max_load}%" if @max_load
       display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
       display_args << "default args" if display_args.empty?
-      ScraperUtils::
+      ScraperUtils::LogUtils.log(
        "Configuring Mechanize agent with #{display_args.join(', ')}"
       )
     end
@@ -206,7 +205,7 @@ module ScraperUtils
       @connection_started_at = Time.now
       return unless DebugUtils.verbose?
 
-      ScraperUtils::
+      ScraperUtils::LogUtils.log(
        "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
       )
     end
@@ -216,9 +215,9 @@ module ScraperUtils
 
       response_time = Time.now - @connection_started_at
       if DebugUtils.basic?
-        ScraperUtils::
+        ScraperUtils::LogUtils.log(
          "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
-
+          "after #{response_time} seconds"
         )
       end
 
@@ -227,33 +226,35 @@ module ScraperUtils
         "URL is disallowed by robots.txt specific rules: #{uri}"
       end
 
-
-
-
-
-
-
+      @delay_till = nil
+      @delay = @robots_checker&.crawl_delay&.round(3)
+      debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
+      unless @delay&.positive?
+        delays = {
+          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+          random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
+        }
+        @delay = [delays[:max_load], delays[:random]].compact.sum
+        debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
+      end
       if @delay&.positive?
-
-        ScraperUtils::
-        $stdout.flush
-        ScraperUtils::FiberScheduler.delay(@delay)
+        @delay_till = Time.now + @delay
+        ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
       end
-
       response
     end
 
     def verify_proxy_works(agent)
       $stderr.flush
       $stdout.flush
-
+      LogUtils.log "Checking proxy works..."
       my_ip = MechanizeUtils.public_ip(agent)
       begin
         IPAddr.new(my_ip)
       rescue IPAddr::InvalidAddressError => e
         raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
       end
-      ScraperUtils::
+      ScraperUtils::LogUtils.log "Proxy is using IP address: #{my_ip.inspect}"
       my_headers = MechanizeUtils.public_headers(agent)
       begin
         # Check response is JSON just to be safe!
```
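Putting the post_connect_hook change together: a positive robots.txt crawl delay wins outright, otherwise the adaptive (max_load) and random components are summed, and the hook now records @delay_till rather than sleeping via the removed FiberScheduler. A rough sketch with invented numbers, mirroring the branch structure rather than calling the gem:

```ruby
# Invented numbers; mirrors the logic added above rather than calling the gem.
crawl_delay = nil                              # robots.txt Crawl-delay (wins when positive)
adaptive    = 1.2                              # e.g. @adaptive_delay.next_delay(uri, response_time)
random      = (rand(1.074..3.222)**2).round(3) # squared sample from @random_range (random_delay = 5)

delay = if crawl_delay&.positive?
          crawl_delay
        else
          [adaptive, random].compact.sum
        end
delay_till = Time.now + delay # stored as @delay_till; the hook no longer sleeps itself
puts "delay=#{delay}s, until #{delay_till}"
```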
data/lib/scraper_utils/mechanize_utils/robots_checker.rb (new file):

```diff
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  module MechanizeUtils
+    # robots.txt checker with deliberately simplistic rules
+    class RobotsChecker
+      # @return [String] Lowercased user_agent for matching
+      attr_reader :user_agent
+
+      # Initialize with full user agent string like:
+      # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+      # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+      # Checks for
+      # * Disallow for User-agent: bot_name and
+      # * Crawl-delay from either User-agent: bot name or * (default)
+      def initialize(user_agent)
+        @user_agent = extract_user_agent(user_agent).downcase
+        if DebugUtils.basic?
+          ScraperUtils::LogUtils.log(
+            "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+          )
+        end
+        @rules = {} # domain -> {rules: [], delay: int}
+        @delay = nil # Delay from last robots.txt check
+      end
+
+      # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+      # @param url [String] The full URL to check
+      # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+      def disallowed?(url)
+        return false unless url
+
+        uri = URI(url)
+        domain = "#{uri.scheme}://#{uri.host}"
+        path = uri.path || "/"
+
+        # Get or fetch robots.txt rules
+        rules = get_rules(domain)
+        return false unless rules # If we can't get robots.txt, assume allowed
+
+        # Store any delay found for this domain
+        @delay = rules[:our_delay]
+
+        # Check rules specific to our user agent
+        matches_any_rule?(path, rules[:our_rules])
+      end
+
+      # Returns the crawl delay (if any) that applied to the last URL checked
+      # Should be called after disallowed? to get relevant delay
+      # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+      def crawl_delay
+        @delay
+      end
+
+      private
+
+      def extract_user_agent(user_agent)
+        if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+          user_agent = ::Regexp.last_match(2)&.strip
+        end
+        user_agent&.strip
+      end
+
+      def matches_any_rule?(path, rules)
+        rules&.any? { |rule| path.start_with?(rule) }
+      end
+
+      def get_rules(domain)
+        return @rules[domain] if @rules.key?(domain)
+
+        begin
+          response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+          return nil unless response.code.start_with?("2") # 2xx response
+
+          rules = parse_robots_txt(response.body)
+          @rules[domain] = rules
+          rules
+        rescue StandardError => e
+          if DebugUtils.basic?
+            ScraperUtils::LogUtils.log(
+              "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+            )
+          end
+          nil
+        end
+      end
+
+      # Parse robots.txt content into structured rules
+      # Only collects rules for our specific user agent and generic crawl-delay
+      # @param content [String] The robots.txt content
+      # @return [Hash] Hash containing :our_rules and :our_delay
+      def parse_robots_txt(content)
+        sections = [] # Array of {agent:, rules:[], delay:} hashes
+        current_section = nil
+
+        content.each_line do |line|
+          line = line.strip.downcase
+          next if line.empty? || line.start_with?("#")
+
+          if line.start_with?("user-agent:")
+            agent = line.split(":", 2).last.strip
+            # Check if this is a continuation of the previous section
+            if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+              current_section[:agents] << agent
+            else
+              current_section = { agents: [agent], rules: [], delay: nil }
+              sections << current_section
+            end
+            next
+          end
+
+          next unless current_section # Skip rules before first user-agent
+
+          if line.start_with?("disallow:")
+            path = line.split(":", 2).last.strip
+            current_section[:rules] << path unless path.empty?
+          elsif line.start_with?("crawl-delay:")
+            delay = line.split(":", 2).last.strip.to_i
+            current_section[:delay] = delay if delay.positive?
+          end
+        end
+
+        # Sort sections by most specific agent match first
+        matched_section = sections.find do |section|
+          section[:agents].any? do |agent|
+            # Our user agent starts with the agent from robots.txt
+            @user_agent.start_with?(agent) ||
+              # Or the agent from robots.txt starts with our user agent
+              # (handles ScraperUtils matching ScraperUtils/1.0)
+              agent.start_with?(@user_agent)
+          end
+        end
+
+        # Use matched section or fall back to wildcard
+        if matched_section
+          {
+            our_rules: matched_section[:rules],
+            our_delay: matched_section[:delay]
+          }
+        else
+          # Find default section
+          default_section = sections.find { |s| s[:agents].include?("*") }
+          {
+            our_rules: [],
+            our_delay: default_section&.dig(:delay)
+          }
+        end
+      end
+    end
+  end
+end
```
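A short usage sketch for the new RobotsChecker (the user agent string and URL are invented; it assumes the gem is loaded):

```ruby
require "scraper_utils"

checker = ScraperUtils::MechanizeUtils::RobotsChecker.new(
  "Mozilla/5.0 (compatible; ScraperUtils/0.7.0; +https://github.com/ianheggie-oaf/scraper_utils)"
)

url = "https://example.com/planning/applications" # invented URL
if checker.disallowed?(url)
  puts "Skipping #{url} - disallowed by robots.txt for our user agent"
else
  puts "Allowed; crawl_delay: #{checker.crawl_delay.inspect} seconds"
end
```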
data/lib/scraper_utils/mechanize_utils.rb:

```diff
@@ -2,7 +2,10 @@
 
 require "mechanize"
 require "ipaddr"
-
+
+require_relative "mechanize_utils/adaptive_delay"
+require_relative "mechanize_utils/agent_config"
+require_relative "mechanize_utils/robots_checker"
 
 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
@@ -43,8 +46,8 @@ module ScraperUtils
 
     # Retrieves and logs the public IP address
     #
-    # @param agent [Mechanize, nil] Mechanize agent to use for IP
-    # @param force [Boolean] Force a new IP
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP find or nil when clearing cache
+    # @param force [Boolean] Force a new IP find, by clearing cache first
     # @return [String, nil] The public IP address
     def self.public_ip(agent = nil, force: false)
       @public_ip = nil if force
@@ -57,8 +60,8 @@ module ScraperUtils
 
     # Retrieves and logs the headers that make it through the proxy
    #
-    # @param agent [Mechanize, nil] Mechanize agent to use for IP
-    # @param force [Boolean] Force a new IP
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP find or nil when clearing cache
+    # @param force [Boolean] Force a new IP find, by clearing cache first
     # @return [String, nil] The list of headers in json format
     def self.public_headers(agent = nil, force: false)
       @public_headers = nil if force
```
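Illustrative calls for the two helpers whose docs changed above (a sketch only; it assumes a plain Mechanize agent is acceptable and the gem is loaded):

```ruby
require "mechanize"
require "scraper_utils"

agent = Mechanize.new
puts ScraperUtils::MechanizeUtils.public_ip(agent)              # fetched once, then cached
puts ScraperUtils::MechanizeUtils.public_ip(agent, force: true) # clear the cache and fetch again
puts ScraperUtils::MechanizeUtils.public_headers(agent)         # headers seen upstream, as JSON
```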
data/lib/scraper_utils/randomize_utils.rb:

```diff
@@ -4,31 +4,34 @@ module ScraperUtils
   # Provides utilities for randomizing processing order in scrapers,
   # particularly helpful for distributing load and avoiding predictable patterns
   module RandomizeUtils
-
-
-
-
-
-
-      return collection.to_a if sequential?
+    class << self
+      # Controls if processing order can be randomized
+      #
+      # @return [Boolean] true if all processing is done sequentially, otherwise false
+      # @note Defaults to true unless the MORPH_DISABLE_RANDOM ENV variable is set
+      attr_accessor :random
 
-
+      # Reports if processing order will be randomized
+      #
+      # @return (see #random)
+      alias random? random
     end
 
-
-
-    # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
-    def self.sequential?
-      @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
-      @sequential || false
+    def self.reset!
+      @random = ENV["MORPH_DISABLE_RANDOM"].to_s.empty?
     end
 
-    #
+    # reset on class load
+    reset!
+
+    # Returns a randomized version of the input collection unless `.sequential?` is true.
     #
-    # @param
-    # @return [
-    def self.
-
+    # @param collection [Array, Enumerable] Collection of items
+    # @return [Array] Randomized unless {.sequential?} is true, otherwise original order
+    def self.randomize_order(collection)
+      return collection.to_a.shuffle if random?
+
+      collection.to_a
     end
   end
 end
```
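Typical use of the reworked RandomizeUtils, based only on the methods shown above (the authority list is invented):

```ruby
require "scraper_utils"

authorities = %i[ballina bega_valley broken_hill] # invented list
ScraperUtils::RandomizeUtils.randomize_order(authorities).each do |authority|
  # ... scrape authority ...
end

# Keep the original order (e.g. in specs) either via ENV before load:
#   MORPH_DISABLE_RANDOM=1
# or directly via the accessor shown above:
ScraperUtils::RandomizeUtils.random = false
```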
data/lib/scraper_utils/scheduler/operation_registry.rb (new file):

```diff
@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+
+require "fiber"
+
+require_relative "operation_worker"
+
+module ScraperUtils
+  module Scheduler
+    # Registry of all active OperationWorkers registered to be processed
+    class OperationRegistry
+
+      def initialize
+        @operations = {}
+        @fiber_ids = {}
+      end
+
+      def register(fiber, authority)
+        authority = authority.to_sym
+        operation = OperationWorker.new(fiber, authority, @response_queue)
+        @operations[authority] = operation
+        @fiber_ids[operation.fiber.object_id] = operation
+      end
+
+      # Remove yourself from registry, called from fiber
+      def deregister
+        operation = find
+        return unless operation
+
+        operation.close
+        # Remove operation from registry since shutdown has done all it can to shut down the thread and fiber
+        @operations.delete(operation.authority)
+        @fiber_ids.delete(operation.fiber.object_id)
+      end
+
+      def current_authority
+        find(Fiber.current.object_id)&.authority
+      end
+
+      # Find OperationWorker
+      # @param key [Integer, String, nil] Fiber's object_id or authority (default current fiber's object_id)
+      # @return [OperationWorker, nil] Returns worker or nil if not found
+      def find(key = nil)
+        key ||= Fiber.current.object_id
+        if key.is_a?(Symbol)
+          @operations[key]
+        elsif key.is_a?(Integer)
+          @fiber_ids[key]
+        end
+      end
+
+      # Removes operations
+      def shutdown
+        operations.each do |_key, operation|
+          operation.shutdown
+        end
+      end
+
+      # Returns true if there are no registered operations
+      def empty?
+        @operations.empty?
+      end
+
+      # Returns number of registered operations
+      def size
+        @operations.size
+      end
+
+      # Find operations that can be resumed in resume_at order (may include future resume_at)
+      #
+      # @return [Array{OperationWorker}] Operations that are alive and have a response to use with resume
+      def can_resume
+        @operations
+          .values
+          .select { |op| op.can_resume? }
+          .sort_by(&:resume_at)
+      end
+
+      # Cleanup dead fibers that haven't removed themselves so we don't loop forever
+      def cleanup_zombies
+        dead_operations = @operations.values.reject(&:alive?)
+
+        dead_operations.each do |operation|
+          LogUtils.log "WARNING: removing dead operation for #{operation.authority} - it should have cleaned up after itself!"
+          operation.shutdown
+          @operations.delete(operation.authority)
+          @fiber_ids.delete(operation.fiber.object_id)
+        end
+      end
+
+      # Save the thread response into the thread and mark that it can continue
+      def process_thread_response(response)
+        operation = find(response.authority)
+        operation&.save_thread_response response
+      end
+
+      private
+
+      attr_accessor :operations
+    end
+  end
+end
```
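The registry keeps two indexes, by authority symbol and by the fiber's object_id, and find dispatches on the key class. A standalone sketch of that pattern (not the gem's code; names invented):

```ruby
require "fiber"

# Two indexes over the same records, mirroring OperationRegistry's @operations / @fiber_ids.
operations = {} # authority (Symbol)        => record
fiber_ids  = {} # fiber.object_id (Integer) => record

fiber = Fiber.new { :done }
record = { authority: :example_authority, fiber: fiber }
operations[record[:authority]] = record
fiber_ids[record[:fiber].object_id] = record

# find(key)-style lookup: Symbols hit the authority index, Integers the fiber index,
# and the default key is the calling fiber's object_id.
find = lambda do |key = Fiber.current.object_id|
  key.is_a?(Symbol) ? operations[key] : fiber_ids[key]
end

p find.call(:example_authority)[:fiber].equal?(fiber) # => true
p find.call(fiber.object_id)[:authority]              # => :example_authority
```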