scraper_utils 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +7 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +40 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +7 -7
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +11 -14
- metadata +16 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/date_range_utils.rb

```diff
@@ -3,7 +3,7 @@
 module ScraperUtils
   class DateRangeUtils
     MERGE_ADJACENT_RANGES = true
-    PERIODS = [2, 3,
+    PERIODS = [2, 3, 4].freeze
 
     class << self
       # @return [Integer] Default number of days to cover
@@ -33,7 +33,7 @@ module ScraperUtils
      def reset_defaults!
        @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
        @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
-       @default_max_period = ENV.fetch('MORPH_MAX_PERIOD',
+       @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 2).to_i # 3
      end
    end
 
@@ -46,8 +46,8 @@ module ScraperUtils
    # Generates one or more date ranges to check the most recent daily through to checking each max_period
    # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
    # @param days [Integer, nil] create ranges that cover the last `days` dates
-   # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
-   # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+   # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates (minimum 1)
+   # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days (1..4)
    # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
    # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
    #
@@ -58,7 +58,7 @@ module ScraperUtils
    def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
      _calculate_date_ranges(
        Integer(days || self.class.default_days),
-       Integer(everytime || self.class.default_everytime),
+       [1, Integer(everytime || self.class.default_everytime)].max,
        Integer(max_period || self.class.default_max_period),
        today || Time.now(in: '+09:30').to_date
      )
@@ -76,84 +76,43 @@ module ScraperUtils
        # cover everything everytime
        return [[today + 1 - days, today, "everything"]]
      end
-
      max_period = valid_periods.max
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+     @max_period_used = max_period
+
+     one_half = ((days - everytime) / 2).to_i
+     one_third = ((days - everytime) / 3).to_i
+     two_ninths = (2 * (days - everytime) / 9).to_i
+     run_ranges =
+       case max_period
+       when 2
+         [
+           [[to_date - (one_half + everytime), to_date, "#{max_period}#0+everytime"]],
+           [[to_date - days, to_date - (one_half + everytime), "#{max_period}#1"], [to_date - everytime, to_date, "everytime"]]
+         ]
+       when 3
+         [
+           [[to_date - days - 1, to_date + two_ninths - days, "3#0"], [to_date - (one_third + everytime), to_date, "2#0+everytime"]],
+           [[to_date + two_ninths - days, to_date + 2 * two_ninths - days, "3#1"], [to_date - everytime, to_date, "everytime"]],
+           [[to_date + 2 * two_ninths - days, to_date, "3#2+2#0+everytime"]],
+           [[to_date - days - 1, to_date + two_ninths - days, "3#3"], [to_date - everytime, to_date, "everytime"]],
+           [[to_date + two_ninths - days, to_date + 2 * two_ninths - days, "3#4"], [to_date - (one_third + everytime), to_date, "2#2+everytime"]],
+           [[to_date + 2 * two_ninths - days, to_date - (one_third + everytime), "3#5"], [to_date - everytime, to_date, "everytime"]]
+         ]
+       else
+         [
+           [[to_date - (one_half + everytime), to_date, "2#0+everytime"]],
+           [[to_date - days - 2, to_date - (one_half + everytime), "4#0"], [to_date - everytime, to_date, "everytime"]],
+           [[to_date - (one_half + everytime), to_date, "2#1+everytime"]],
+           [[to_date - everytime, to_date, "everytime"]]
+         ]
       end
-
-       break unless days.positive?
-
-         this_period = [days, period].min
-         break if this_period <= 0
-
-         earliest_from = to_date - days
-         # we are working from the oldest back towards today
-         if run_number % period == index
-           from = to_date - index - (this_period - 1)
-           from = earliest_from if from < earliest_from
-           to = [today, to_date - index].min
-           break if from > to
+     run_number = today.to_date.jd % run_ranges.size
 
-
-           if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
-             # extend adjacent range
-             ranges.last[0] = [from, ranges.last[0]].min
-             ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
-           else
-             to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
-             ranges << [from, to, "#{period}\##{index}"]
-           end
-         end
-         days -= this_period
-         to_date -= this_period
-       end
-     end
-     # remainder of range at max_period, whatever that is
+     ranges = run_ranges[run_number]
      if days.positive? && ScraperUtils::DebugUtils.trace?
-
-     end
-     index = -1
-     while days.positive?
-       index += 1
-       this_period = [days, max_period].min
-       break if this_period <= 0
-
-       earliest_from = to_date - days
-       if (run_number % max_period) == (index % max_period)
-         from = to_date - index - (this_period - 1)
-         from = earliest_from if from < earliest_from
-         to = to_date - index
-         break if from > to
-
-         @max_period_used = [this_period, @max_period_used].max
-         if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
-           # extend adjacent range
-           ranges.last[0] = [from, ranges.last[0]].min
-           ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
-         else
-           to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
-           ranges << [from, to, "#{this_period}\##{index}"]
-         end
-       end
-       days -= this_period
-       to_date -= this_period
+       LogUtils.log "DEBUG: #{max_period} ranges: #{ranges.inspect}"
      end
-     ranges
+     ranges
    end
  end
 end
```
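The date-range scheduling moves from the old incremental while-loop to a precomputed table of run ranges selected by the Julian day number. For reference, a minimal usage sketch of the reworked method (the argument values here are illustrative, not defaults):

```ruby
require "date"
require "scraper_utils"

# Ask for ranges covering the last 30 days, always re-checking the newest 4,
# and checking the remainder at least every 3 days.
ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges(
  days: 30, everytime: 4, max_period: 3, today: Date.today
)

ranges.each do |from_date, to_date, comment|
  puts "search #{from_date} .. #{to_date} (#{comment})"
end
```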
data/lib/scraper_utils/debug_utils.rb

```diff
@@ -51,17 +51,17 @@ module ScraperUtils
 
     # Logs details of an HTTP request when debug mode is enabled
     #
-    # @param
+    # @param http_method [String] HTTP http_method (GET, POST, etc.)
     # @param url [String] Request URL
     # @param parameters [Hash, nil] Optional request parameters
     # @param headers [Hash, nil] Optional request headers
     # @param body [Hash, nil] Optional request body
     # @return [void]
-    def self.debug_request(
+    def self.debug_request(http_method, url, parameters: nil, headers: nil, body: nil)
      return unless basic?
 
      puts
-
+      LogUtils.log "🔍 #{http_method.upcase} #{url}"
      puts "Parameters:", JSON.pretty_generate(parameters) if parameters
      puts "Headers:", JSON.pretty_generate(headers) if headers
      puts "Body:", JSON.pretty_generate(body) if body
@@ -77,7 +77,7 @@ module ScraperUtils
      return unless trace?
 
      puts
-
+      LogUtils.log "🔍 DEBUG: #{message}"
      puts "Current URL: #{page.uri}"
      puts "Page title: #{page.at('title').text.strip}" if page.at("title")
      puts "",
@@ -98,7 +98,7 @@ module ScraperUtils
      return unless trace?
 
      puts
-
+      LogUtils.log "🔍 DEBUG: #{message}"
      puts "Looking for selector: #{selector}"
      element = page.at(selector)
      if element
```
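With the widened `debug_request` signature, a call site might look like the sketch below (URL and parameters are illustrative; output only appears when the gem's debug level allows it, per the `basic?` guard above):

```ruby
# Hypothetical call site showing the new positional http_method/url arguments.
ScraperUtils::DebugUtils.debug_request(
  "GET",
  "https://example.com/planning/applications",
  parameters: { page: 2 },
  headers: { "Accept" => "application/json" }
)
```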
data/lib/scraper_utils/log_utils.rb

```diff
@@ -9,6 +9,21 @@ module ScraperUtils
     LOG_TABLE = "scrape_log"
     LOG_RETENTION_DAYS = 30
 
+    # Logs a message, automatically prefixing with authority name if in a fiber
+    #
+    # @param message [String] the message to log
+    # @return [void]
+    def self.log(message, authority = nil)
+      authority ||= Scheduler.current_authority
+      $stderr.flush
+      if authority
+        puts "[#{authority}] #{message}"
+      else
+        puts message
+      end
+      $stdout.flush
+    end
+
     # Log details about a scraping run for one or more authorities
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
```
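A small sketch of the new helper in use; the authority prefix is applied automatically when `Scheduler.current_authority` is set inside a scheduled fiber, or can be passed explicitly (the `:albury` symbol below is just an example value):

```ruby
ScraperUtils::LogUtils.log "Collected 12 records"
# => "Collected 12 records"  (or "[authority] Collected 12 records" inside a scheduled fiber)

ScraperUtils::LogUtils.log "Collected 12 records", :albury
# => "[albury] Collected 12 records"
```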
data/lib/scraper_utils/mechanize_actions.rb

```diff
@@ -9,7 +9,7 @@ module ScraperUtils
  #
  #   actions = [
  #     [:click, "Next Page"],
-  #     [:click, ["Option A", "
+  #     [:click, ["Option A", "xpath://div[@id='results']/a", "css:.some-button"]] # Will select one randomly
  #   ]
  #
  #   processor = ScraperUtils::MechanizeActions.new(agent)
@@ -50,7 +50,7 @@ module ScraperUtils
    # @example Action format
    #   actions = [
    #     [:click, "Link Text"],                     # Click on link with this text
-   #     [:click, ["Option A", "Option B"]],
+   #     [:click, ["Option A", "text:Option B"]],   # Click on one of these options (randomly selected)
    #     [:click, "css:.some-button"],              # Use CSS selector
    #     [:click, "xpath://div[@id='results']/a"],  # Use XPath selector
    #     [:block, ->(page, args, agent, results) { [page, { custom_results: 'data' }] }] # Custom block
@@ -67,8 +67,7 @@ module ScraperUtils
        when :click
          handle_click(current_page, args)
        when :block
-
-          block.call(current_page, args, agent, @results.dup)
+          handle_block(current_page, args)
        else
          raise ArgumentError, "Unknown action type: #{action_type}"
        end
@@ -81,6 +80,18 @@ module ScraperUtils
 
    private
 
+    # Process a block action
+    #
+    # @param page [Mechanize::Page] The current page
+    # @param args [Array] The block and its arguments
+    # @return [Array<Mechanize::Page, Hash>] The resulting page and status
+    def handle_block(page, args)
+      block = args.shift
+      # Apply replacements to all remaining arguments
+      processed_args = args.map { |arg| apply_replacements(arg) }
+      block.call(page, processed_args.first, agent, @results.dup)
+    end
+
    # Handle a click action
    #
    # @param page [Mechanize::Page] The current page
@@ -105,16 +116,34 @@ module ScraperUtils
    # Select an element on the page based on selector string
    #
    # @param page [Mechanize::Page] The page to search in
-   # @param selector_string [String] The selector string
+   # @param selector_string [String] The selector string, optionally with "css:", "xpath:" or "text:" prefix
    # @return [Mechanize::Element, nil] The selected element or nil if not found
    def select_element(page, selector_string)
      # Handle different selector types based on prefixes
      if selector_string.start_with?("css:")
        selector = selector_string.sub(/^css:/, '')
-
+        # We need to convert Nokogiri elements to Mechanize elements for clicking
+        css_element = page.at_css(selector)
+        return nil unless css_element
+
+        # If it's a link, find the matching Mechanize link
+        if css_element.name.downcase == 'a' && css_element['href']
+          return page.links.find { |link| link.href == css_element['href'] }
+        end
+
+        return css_element
      elsif selector_string.start_with?("xpath:")
        selector = selector_string.sub(/^xpath:/, '')
-
+        # We need to convert Nokogiri elements to Mechanize elements for clicking
+        xpath_element = page.at_xpath(selector)
+        return nil unless xpath_element
+
+        # If it's a link, find the matching Mechanize link
+        if xpath_element.name.downcase == 'a' && xpath_element['href']
+          return page.links.find { |link| link.href == xpath_element['href'] }
+        end
+
+        return xpath_element
      else
        # Default to text: for links
        selector = selector_string.sub(/^text:/, '')
@@ -133,7 +162,7 @@ module ScraperUtils
        end
      end
 
-     # Get the link with the shortest (closest matching) text then the longest href
+     # Get the link with the a. shortest (closest matching) text and then b. the longest href
      matching_links.min_by { |l| [l.text.strip.length, -l.href.length] }
    end
  end
```
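Putting the selector prefixes and the reworked block handling together, an actions array might look like the sketch below. The constructor call is taken from the class docs shown in this diff; the name of the processing method itself is not shown here and is assumed, as flagged in the comment:

```ruby
require "mechanize"
require "scraper_utils"

agent = Mechanize.new # or an agent configured via ScraperUtils::MechanizeUtils
processor = ScraperUtils::MechanizeActions.new(agent)
page = agent.get("https://example.com/planning") # hypothetical starting page

actions = [
  [:click, "Planning applications"],                          # match link by text (default "text:" behaviour)
  [:click, ["css:.next-button", "xpath://a[@rel='next']"]],   # one of these selectors picked at random
  [:block, ->(pg, args, agt, results) { [pg, { note: "custom step" }] }]
]

# NOTE: the processing method name is assumed from the class-level docs,
# which only show the constructor in this diff; check the gem documentation.
final_page = processor.process(page, actions)
```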
data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb (new file)

```diff
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module ScraperUtils
+  module MechanizeUtils
+    # Adapts delays between requests based on server response times.
+    # Target delay is proportional to response time based on max_load setting.
+    # Uses an exponential moving average to smooth variations in response times.
+    class AdaptiveDelay
+      DEFAULT_MIN_DELAY = 0.0
+      DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+      attr_reader :min_delay, :max_delay, :max_load
+
+      # Creates a new adaptive delay calculator
+      #
+      # @param min_delay [Float] Minimum delay between requests in seconds
+      # @param max_delay [Float] Maximum delay between requests in seconds
+      # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+      #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+      def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+        @delays = {} # domain -> last delay used
+        @min_delay = min_delay.to_f
+        @max_delay = max_delay.to_f
+        @max_load = max_load.to_f.clamp(1.0, 99.0)
+        @response_multiplier = (100.0 - @max_load) / @max_load
+
+        return unless DebugUtils.basic?
+
+        ScraperUtils::LogUtils.log(
+          "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+          "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+        )
+      end
+
+      # @param uri [URI::Generic, String] URL to get delay for
+      # @return [Float] Current delay for the domain, or min_delay if no delay set
+      def delay(uri)
+        @delays[domain(uri)] || @min_delay
+      end
+
+      # Returns the next_delay calculated from a smoothed average of response_time to use less than max_load% of server
+      #
+      # @param uri [URI::Generic, String] URL the response came from
+      # @param response_time [Float] Time in seconds the server took to respond
+      # @return [Float] The calculated delay to use with the next request
+      def next_delay(uri, response_time)
+        uris_domain = domain(uri)
+        # calculate target_delay to achieve desired max_load% using pre-calculated multiplier
+        target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+        # Initialise average from initial_response_time rather than zero to start with reasonable approximation
+        current_delay = @delays[uris_domain] || target_delay
+        # exponential smooth the delay to smooth out wild swings (Equivalent to an RC low pass filter)
+        delay = ((3.0 * current_delay) + target_delay) / 4.0
+        delay = delay.clamp(@min_delay, @max_delay)
+
+        if DebugUtils.basic?
+          ScraperUtils::LogUtils.log(
+            "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+            "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+          )
+        end
+
+        @delays[uris_domain] = delay
+        delay
+      end
+
+      private
+
+      # @param uri [URI::Generic, String] The URL to extract the domain from
+      # @return [String] The domain in the format "scheme://host"
+      def domain(uri)
+        uri = URI(uri) unless uri.is_a?(URI)
+        "#{uri.scheme}://#{uri.host}".downcase
+      end
+    end
+  end
+end
```
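A minimal sketch of exercising the new `AdaptiveDelay` class directly (the URL and response time are made up; inside the gem it is driven from the post-connect hook shown in the agent_config changes below):

```ruby
require "scraper_utils"

# Aim to keep the target server at ~20% load: the target delay is about 4x the
# observed response time, smoothed with an exponential moving average and capped at max_delay.
delay_calc = ScraperUtils::MechanizeUtils::AdaptiveDelay.new(min_delay: 0.5, max_delay: 30.0, max_load: 20.0)

url = "https://example.com/planning/applications" # hypothetical URL
sleep_for = delay_calc.next_delay(url, 0.8)        # server took 0.8s to respond
puts "next delay for this domain: #{sleep_for.round(2)}s"
puts "current stored delay: #{delay_calc.delay(url).round(2)}s"
```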
data/lib/scraper_utils/mechanize_utils/agent_config.rb

```diff
@@ -24,7 +24,7 @@ module ScraperUtils
    #   )
    class AgentConfig
      DEFAULT_TIMEOUT = 60
-     DEFAULT_RANDOM_DELAY =
+     DEFAULT_RANDOM_DELAY = 0
      DEFAULT_MAX_LOAD = 33.3
      MAX_LOAD_CAP = 50.0
 
@@ -67,7 +67,7 @@ module ScraperUtils
        # Reset all configuration options to their default values
        # @return [void]
        def reset_defaults!
-         @default_timeout = ENV.fetch('
+         @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
          @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
          @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 5
          @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 33.3
@@ -85,7 +85,7 @@ module ScraperUtils
 
      # Give access for testing
 
-     attr_reader :max_load, :
+     attr_reader :max_load, :random_range
 
      # Creates Mechanize agent configuration with sensible defaults overridable via configure
      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -125,21 +125,21 @@ module ScraperUtils
        @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
        if @australian_proxy
          uri = begin
-
-
-
-
+            URI.parse(ScraperUtils.australian_proxy.to_s)
+          rescue URI::InvalidURIError => e
+            raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+          end
          unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
            raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
          end
-         unless uri.host && uri.port
+         unless !uri.host.to_s.empty? && uri.port&.positive?
            raise URI::InvalidURIError, "Proxy URL must include host and port"
          end
        end
 
-       if @random_delay
-
-         @
+       if @random_delay&.positive?
+         min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
+         @random_range = min_random.round(3)..(3 * min_random).round(3)
        end
 
        today = Date.today.strftime("%Y-%m-%d")
@@ -177,7 +177,6 @@ module ScraperUtils
          verify_proxy_works(agent)
        end
 
-       @connection_started_at = nil
        agent.pre_connect_hooks << method(:pre_connect_hook)
        agent.post_connect_hooks << method(:post_connect_hook)
      end
@@ -193,11 +192,11 @@ module ScraperUtils
                           "australian_proxy=#{@australian_proxy.inspect}"
        end
        display_args << "compliant_mode" if @compliant_mode
-       display_args << "random_delay=#{@random_delay}" if @random_delay
+       display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
        display_args << "max_load=#{@max_load}%" if @max_load
        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
        display_args << "default args" if display_args.empty?
-       ScraperUtils::
+       ScraperUtils::LogUtils.log(
          "Configuring Mechanize agent with #{display_args.join(', ')}"
        )
      end
@@ -206,7 +205,7 @@ module ScraperUtils
        @connection_started_at = Time.now
        return unless DebugUtils.verbose?
 
-       ScraperUtils::
+       ScraperUtils::LogUtils.log(
          "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
        )
      end
@@ -216,9 +215,9 @@ module ScraperUtils
 
        response_time = Time.now - @connection_started_at
        if DebugUtils.basic?
-         ScraperUtils::
+         ScraperUtils::LogUtils.log(
            "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
-
+           "after #{response_time} seconds"
          )
        end
 
@@ -227,33 +226,35 @@ module ScraperUtils
                "URL is disallowed by robots.txt specific rules: #{uri}"
        end
 
-
-
-
-
-
-
+       @delay_till = nil
+       @delay = @robots_checker&.crawl_delay&.round(3)
+       debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
+       unless @delay&.positive?
+         delays = {
+           max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+           random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
+         }
+         @delay = [delays[:max_load], delays[:random]].compact.sum
+         debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
+       end
        if @delay&.positive?
-
-         ScraperUtils::
-         $stdout.flush
-         ScraperUtils::FiberScheduler.delay(@delay)
+         @delay_till = Time.now + @delay
+         ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
        end
-
        response
      end
 
      def verify_proxy_works(agent)
        $stderr.flush
        $stdout.flush
-
+       LogUtils.log "Checking proxy works..."
        my_ip = MechanizeUtils.public_ip(agent)
        begin
          IPAddr.new(my_ip)
        rescue IPAddr::InvalidAddressError => e
          raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
        end
-       ScraperUtils::
+       ScraperUtils::LogUtils.log "Proxy is using IP address: #{my_ip.inspect}"
        my_headers = MechanizeUtils.public_headers(agent)
        begin
          # Check response is JSON just to be safe!
```