scraper_utils 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +5 -8
- data/CHANGELOG.md +14 -0
- data/GUIDELINES.md +75 -0
- data/Gemfile +6 -3
- data/IMPLEMENTATION.md +33 -0
- data/README.md +226 -177
- data/SPECS.md +25 -0
- data/bin/console +1 -0
- data/bin/setup +2 -1
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +70 -0
- data/lib/scraper_utils/authority_utils.rb +2 -2
- data/lib/scraper_utils/data_quality_monitor.rb +64 -0
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +1 -2
- data/lib/scraper_utils/debug_utils.rb +63 -23
- data/lib/scraper_utils/fiber_scheduler.rb +229 -0
- data/lib/scraper_utils/log_utils.rb +58 -25
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +276 -0
- data/lib/scraper_utils/mechanize_utils.rb +32 -30
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +149 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +6 -10
- data/scraper_utils.gemspec +3 -8
- metadata +17 -74
data/SPECS.md
ADDED
@@ -0,0 +1,25 @@
+SPECS
+=====
+
+These project specific Specifications go into further details than the
+installation and usage notes in `README.md`.
+
+ASK for clarification of any apparent conflicts with IMPLEMENTATION, GUIDELINES or project instructions.
+
+## Core Design Principles
+
+### Error Handling
+- Record-level errors abort only that record's processing
+- Allow up to 5 + 10% unprocessable records before failing
+- External service reliability (e.g., robots.txt) should not block core functionality
+
+### Rate Limiting
+- Honor site-specific rate limits when clearly specified
+- Apply adaptive delays based on response times
+- Use randomized delays to avoid looking like a bot
+- Support proxy configuration for geolocation needs
+
+### Testing
+- Ensure components are independently testable
+- Avoid timing-based tests in favor of logic validation
+- Keep test scenarios focused and under 20 lines
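
The "5 + 10% unprocessable records" rule above is plain arithmetic on the running count of saved records. A minimal illustrative sketch (the numbers are made up; the gem's actual check, which uses 5.01 to avoid float-equality edge cases, appears in lib/scraper_utils/data_quality_monitor.rb further down in this diff):

  saved = 40
  unprocessed = 9
  threshold = 5 + (saved * 0.1)          # => 9.0
  abort_site = unprocessed > threshold   # => false; a 10th unprocessable record would abort
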
data/bin/console
CHANGED
data/bin/setup
CHANGED
data/docs/example_scrape_with_fibers.rb
ADDED
@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+
+# Example scrape method updated to use ScraperUtils::FiberScheduler
+
+def scrape(authorities, attempt)
+  ScraperUtils::FiberScheduler.reset!
+  exceptions = {}
+  authorities.each do |authority_label|
+    ScraperUtils::FiberScheduler.register_operation(authority_label) do
+      ScraperUtils::FiberScheduler.log(
+        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
+      )
+      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+      YourScraper.scrape(authority_label) do |record|
+        record["authority_label"] = authority_label.to_s
+        ScraperUtils::DbUtils.save_record(record)
+      rescue ScraperUtils::UnprocessableRecord => e
+        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+        exceptions[authority_label] = e
+        # Continues processing other records
+      end
+    rescue StandardError => e
+      warn "#{authority_label}: ERROR: #{e}"
+      warn e.backtrace || "No backtrace available"
+      exceptions[authority_label] = e
+    end
+    # end of register_operation block
+  end
+  ScraperUtils::FiberScheduler.run_all
+  exceptions
+end
data/docs/example_scraper.rb
ADDED
@@ -0,0 +1,93 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$LOAD_PATH << "./lib"
+
+require "scraper_utils"
+require "technology_one_scraper"
+
+# Main Scraper class
+class Scraper
+  AUTHORITIES = YourScraper::AUTHORITIES
+
+  # ADD: attempt argument
+  def scrape(authorities, attempt)
+    exceptions = {}
+    # ADD: Report attempt number
+    authorities.each do |authority_label|
+      puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
+
+      begin
+        # REPLACE:
+        # YourScraper.scrape(authority_label) do |record|
+        #   record["authority_label"] = authority_label.to_s
+        #   YourScraper.log(record)
+        #   ScraperWiki.save_sqlite(%w[authority_label council_reference], record)
+        # end
+        # WITH:
+        ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+        YourScraper.scrape(authority_label) do |record|
+          begin
+            record["authority_label"] = authority_label.to_s
+            ScraperUtils::DbUtils.save_record(record)
+          rescue ScraperUtils::UnprocessableRecord => e
+            ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+            exceptions[authority_label] = e
+          end
+        end
+        # END OF REPLACE
+      end
+    rescue StandardError => e
+      warn "#{authority_label}: ERROR: #{e}"
+      warn e.backtrace
+      exceptions[authority_label] = e
+    end
+
+    exceptions
+  end
+
+  def self.selected_authorities
+    ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+  end
+
+  def self.run(authorities)
+    puts "Scraping authorities: #{authorities.join(', ')}"
+    start_time = Time.now
+    exceptions = scrape(authorities, 1)
+    # Set start_time and attempt to the call above and log run below
+    ScraperUtils::LogUtils.log_scraping_run(
+      start_time,
+      1,
+      authorities,
+      exceptions
+    )
+
+    unless exceptions.empty?
+      puts "\n***************************************************"
+      puts "Now retrying authorities which earlier had failures"
+      puts exceptions.keys.join(", ").to_s
+      puts "***************************************************"
+
+      start_time = Time.now
+      exceptions = scrape(exceptions.keys, 2)
+      # Set start_time and attempt to the call above and log run below
+      ScraperUtils::LogUtils.log_scraping_run(
+        start_time,
+        2,
+        authorities,
+        exceptions
+      )
+    end
+
+    # Report on results, raising errors for unexpected conditions
+    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
+  end
+end
+
+if __FILE__ == $PROGRAM_NAME
+  # Default to list of authorities we can't or won't fix in code, explain why
+  # wagga: url redirects and then reports Application error
+
+  ENV["MORPH_EXPECT_BAD"] ||= "wagga"
+  Scraper.run(Scraper.selected_authorities)
+end
data/lib/scraper_utils/adaptive_delay.rb
ADDED
@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+
+require "uri"
+
+module ScraperUtils
+  # Adapts delays between requests based on server response times.
+  # Target delay is proportional to response time based on max_load setting.
+  # Uses an exponential moving average to smooth variations in response times.
+  class AdaptiveDelay
+    DEFAULT_MIN_DELAY = 0.0
+    DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+    attr_reader :min_delay, :max_delay, :max_load
+
+    # Creates a new adaptive delay calculator
+    #
+    # @param min_delay [Float] Minimum delay between requests in seconds
+    # @param max_delay [Float] Maximum delay between requests in seconds
+    # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+    #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+    def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+      @delays = {} # domain -> last delay used
+      @min_delay = min_delay.to_f
+      @max_delay = max_delay.to_f
+      @max_load = max_load.to_f.clamp(1.0, 99.0)
+      @response_multiplier = (100.0 - @max_load) / @max_load
+
+      return unless DebugUtils.basic?
+
+      ScraperUtils::FiberScheduler.log(
+        "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+        "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+      )
+    end
+
+    # @param uri [URI::Generic, String] The URL to extract the domain from
+    # @return [String] The domain in the format "scheme://host"
+    def domain(uri)
+      uri = URI(uri) unless uri.is_a?(URI)
+      "#{uri.scheme}://#{uri.host}".downcase
+    end
+
+    # @param uri [URI::Generic, String] URL to get delay for
+    # @return [Float] Current delay for the domain, or min_delay if no delay set
+    def delay(uri)
+      @delays[domain(uri)] || @min_delay
+    end
+
+    # @param uri [URI::Generic, String] URL the response came from
+    # @param response_time [Float] Time in seconds the server took to respond
+    # @return [Float] The calculated delay to use with the next request
+    def next_delay(uri, response_time)
+      uris_domain = domain(uri)
+      target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+      current_delay = @delays[uris_domain] || target_delay
+      delay = ((9.0 * current_delay) + target_delay) / 10.0
+      delay = delay.clamp(@min_delay, @max_delay)
+
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+          "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+        )
+      end
+
+      @delays[uris_domain] = delay
+      delay
+    end
+  end
+end
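
As the class comment above states, AdaptiveDelay keeps an exponential moving average per domain of a target delay proportional to the response time; with the default max_load of 20%, the multiplier is (100 - 20) / 20 = 4. A rough usage sketch, assuming the caller applies the returned delay itself (the URL and response time below are illustrative):

  require "scraper_utils"

  delay_calc = ScraperUtils::AdaptiveDelay.new(max_load: 20.0) # target = 4x response time
  response_time = 0.5                                          # seconds the server took
  wait = delay_calc.next_delay("https://example.com/page", response_time)
  sleep(wait) # first call for a domain seeds the average at the 2.0s target, so ~2.0s here
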
data/lib/scraper_utils/authority_utils.rb
CHANGED
@@ -3,13 +3,13 @@
 module ScraperUtils
   # Utilities for managing and selecting authorities
   module AuthorityUtils
+    AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
+
     # Selects authorities based on environment variable or returns all authorities
     #
     # @param all_authorities [Array<Symbol>] Full list of available authorities
     # @return [Array<Symbol>] Selected subset of authorities or all authorities
     # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
-    AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
-
     def self.selected_authorities(all_authorities)
       if ENV[AUTHORITIES_ENV_VAR]
         authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
data/lib/scraper_utils/data_quality_monitor.rb
ADDED
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # Monitors data quality during scraping by tracking successful vs failed record processing
+  # Automatically triggers an exception if the error rate exceeds a threshold
+  class DataQualityMonitor
+    # Get the statistics for all authorities
+    # @return [Hash, nil] Hash of statistics per authority or nil if none started
+    class << self
+      attr_reader :stats
+    end
+
+    # Notes the start of processing an authority and clears any previous stats
+    #
+    # @param authority_label [Symbol] The authority we are processing
+    # @return [void]
+    def self.start_authority(authority_label)
+      @stats ||= {}
+      @stats[authority_label] = { saved: 0, unprocessed: 0 }
+    end
+
+    # Extracts authority label and ensures stats are setup for record
+    def self.extract_authority(record)
+      authority_label = (record&.key?("authority_label") ? record["authority_label"] : "").to_sym
+      @stats ||= {}
+      @stats[authority_label] ||= { saved: 0, unprocessed: 0 }
+      authority_label
+    end
+
+    def self.threshold(authority_label)
+      5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
+    end
+
+    # Logs an unprocessable record and raises an exception if error threshold is exceeded
+    # The threshold is 5 + 10% of saved records
+    #
+    # @param exception [Exception] The exception that caused the record to be unprocessable
+    # @param record [Hash, nil] The record that couldn't be processed
+    # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
+    # @return [void]
+    def self.log_unprocessable_record(exception, record)
+      authority_label = extract_authority(record)
+      @stats[authority_label][:unprocessed] += 1
+      ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+        'address', nil
+      ) || record.inspect}: #{exception}"
+      return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
+
+      raise ScraperUtils::UnprocessableSite,
+            "Too many unprocessable_records for #{authority_label}: " \
+            "#{@stats[authority_label].inspect} - aborting processing of site!"
+    end
+
+    # Logs a successfully saved record
+    #
+    # @param record [Hash] The record that was saved
+    # @return [void]
+    def self.log_saved_record(record)
+      authority_label = extract_authority(record)
+      @stats[authority_label][:saved] += 1
+      ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
+    end
+  end
+end
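
A short usage sketch of the monitor as it is wired up in the example scrapers earlier in this diff (the authority label and record contents are illustrative):

  ScraperUtils::DataQualityMonitor.start_authority(:example_council)

  record = { "authority_label" => "example_council", "address" => "1 Example St" }
  ScraperUtils::DataQualityMonitor.log_saved_record(record)

  ScraperUtils::DataQualityMonitor.stats[:example_council]
  # => { saved: 1, unprocessed: 0 }
  # log_unprocessable_record raises ScraperUtils::UnprocessableSite once the
  # unprocessed count exceeds 5.01 + 0.1 * saved for that authority
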
data/lib/scraper_utils/date_range_utils.rb
ADDED
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  class DateRangeUtils
+    MERGE_ADJACENT_RANGES = true
+    PERIODS = [2, 3, 5, 8].freeze
+
+    class << self
+      # @return [Integer] Default number of days to cover
+      attr_accessor :default_days
+
+      # @return [Integer] Default days to always include in ranges
+      attr_accessor :default_everytime
+
+      # @return [Integer, nil] Default max days between any one date being in a range
+      attr_accessor :default_max_period
+
+      # Configure default settings for all DateRangeUtils instances
+      # @yield [self] Yields self for configuration
+      # @example
+      #   AgentConfig.configure do |config|
+      #     config.default_everytime = 3
+      #     config.default_days = 35
+      #     config.default_max_period = 5
+      #   end
+      # @return [void]
+      def configure
+        yield self if block_given?
+      end
+
+      # Reset all configuration options to their default values
+      # @return [void]
+      def reset_defaults!
+        @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
+        @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
+        @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
+      end
+    end
+
+    # Set defaults on load
+    reset_defaults!
+
+    attr_reader :max_period_used
+    attr_reader :extended_max_period
+
+    # Generates one or more date ranges to check the most recent daily through to checking each max_period
+    # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
+    # @param days [Integer, nil] create ranges that cover the last `days` dates
+    # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
+    # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+    # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
+    # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
+    #
+    # Uses a Fibonacci sequence to create a natural progression of check frequencies.
+    # Newer data is checked more frequently, with periods between checks growing
+    # according to the Fibonacci sequence (2, 3, 5, 8, 13...) until reaching max_period.
+    # This creates an efficient schedule that mimics natural information decay patterns.
+    def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
+      _calculate_date_ranges(
+        Integer(days || self.class.default_days),
+        Integer(everytime || self.class.default_everytime),
+        Integer(max_period || self.class.default_max_period),
+        today || Time.now(in: '+09:30').to_date
+      )
+    end
+
+    private
+
+    def _calculate_date_ranges(days, everytime, max_period, today)
+      @max_period_used = 1
+      to_date = today
+      valid_periods = PERIODS.select { |p| p <= max_period }
+      if !max_period.positive? || !days.positive?
+        return []
+      elsif valid_periods.empty? || everytime >= days
+        # cover everything everytime
+        return [[today + 1 - days, today, "everything"]]
+      end
+
+      max_period = valid_periods.max
+
+      run_number = today.to_date.jd
+      ranges = []
+      if everytime.positive?
+        ranges << [to_date + 1 - everytime, to_date, "everytime"]
+        days -= everytime
+        to_date -= everytime
+      end
+
+      periods = valid_periods.dup
+      loop do
+        period = periods.shift
+        break if period.nil? || period >= max_period || !days.positive?

+        if DebugUtils.trace?
+          FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
+        end
+        period.times do |index|
+          break unless days.positive?
+
+          this_period = [days, period].min
+          break if this_period <= 0
+
+          earliest_from = to_date - days
+          # we are working from the oldest back towards today
+          if run_number % period == index
+            from = to_date - index - (this_period - 1)
+            from = earliest_from if from < earliest_from
+            to = [today, to_date - index].min
+            break if from > to
+
+            @max_period_used = [this_period, @max_period_used].max
+            if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+              # extend adjacent range
+              ranges.last[0] = [from, ranges.last[0]].min
+              ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
+            else
+              to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+              ranges << [from, to, "#{period}\##{index}"]
+            end
+          end
+          days -= this_period
+          to_date -= this_period
+        end
+      end
+      # remainder of range at max_period, whatever that is
+      if days.positive? && ScraperUtils::DebugUtils.trace?
+        FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
+      end
+      index = -1
+      while days.positive?
+        index += 1
+        this_period = [days, max_period].min
+        break if this_period <= 0
+
+        earliest_from = to_date - days
+        if (run_number % max_period) == (index % max_period)
+          from = to_date - index - (this_period - 1)
+          from = earliest_from if from < earliest_from
+          to = to_date - index
+          break if from > to
+
+          @max_period_used = [this_period, @max_period_used].max
+          if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+            # extend adjacent range
+            ranges.last[0] = [from, ranges.last[0]].min
+            ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
+          else
+            to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+            ranges << [from, to, "#{this_period}\##{index}"]
+          end
+        end
+        days -= this_period
+        to_date -= this_period
+      end
+      ranges.reverse
+    end
+  end
+end
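
A hedged usage sketch of the new DateRangeUtils class: calculate_date_ranges returns [from_date, to_date, comment] triples, with the most recent `everytime` days always covered and older days rotated through the 2/3/5/8-day periods, so the exact ranges vary with the Julian day number of `today` (the argument values below are illustrative):

  require "scraper_utils"

  ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges(
    days: 33, everytime: 4, max_period: 3
  )
  ranges.each do |from_date, to_date, comment|
    puts "Check #{from_date} .. #{to_date} (#{comment})"
    # e.g. pass from_date/to_date to the site's date-filtered search
  end
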
data/lib/scraper_utils/db_utils.rb
CHANGED
@@ -33,9 +33,8 @@ module ScraperUtils
                     else
                       ["council_reference"]
                     end
-
-      puts "Saving record #{record['council_reference']} - #{record['address']}"
       ScraperWiki.save_sqlite(primary_key, record)
+      ScraperUtils::DataQualityMonitor.log_saved_record(record)
    end
  end
end
data/lib/scraper_utils/debug_utils.rb
CHANGED
@@ -5,6 +5,50 @@ require "json"
 module ScraperUtils
   # Utilities for debugging web scraping processes
   module DebugUtils
+    DEBUG_ENV_VAR = "DEBUG"
+    MORPH_DEBUG_ENV_VAR = "MORPH_DEBUG"
+
+    # Debug level constants
+    DISABLED_LEVEL = 0
+    BASIC_LEVEL = 1
+    VERBOSE_LEVEL = 2
+    TRACE_LEVEL = 3
+
+    # Get current debug level (0 = disabled, 1 = basic, 2 = verbose, 3 = trace)
+    # Checks DEBUG and MORPH_DEBUG env variables
+    # @return [Integer] Debug level
+    def self.debug_level
+      debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
+      debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
+    end
+
+    # Check if debug is enabled at specified level or higher
+    #
+    # @param level [Integer] Minimum debug level to check for
+    # @return [Boolean] true if debugging at specified level is enabled
+    def self.debug?(level = BASIC_LEVEL)
+      debug_level >= level
+    end
+
+    # Check if basic debug output or higher is enabled
+    # @return [Boolean] true if debugging is enabled
+    def self.basic?
+      debug?(BASIC_LEVEL)
+    end
+
+    # Check if verbose debug output or higher is enabled
+    # @return [Boolean] true if verbose debugging is enabled
+    def self.verbose?
+      debug?(VERBOSE_LEVEL)
+    end
+
+    # Check if debug tracing or higher is enabled
+    # @return [Boolean] true if debugging is enabled at trace level
+    def self.trace?
+      debug?(TRACE_LEVEL)
+    end
+
+
     # Logs details of an HTTP request when debug mode is enabled
     #
     # @param method [String] HTTP method (GET, POST, etc.)
@@ -14,21 +58,14 @@ module ScraperUtils
     # @param body [Hash, nil] Optional request body
     # @return [void]
     def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
-      return unless
+      return unless basic?
 
-      puts
-
-
-
-
-
-      puts "Headers:"
-      puts JSON.pretty_generate(headers)
-      end
-      return unless body
-
-      puts "Body:"
-      puts JSON.pretty_generate(body)
+      puts
+      FiberScheduler.log "š #{method.upcase} #{url}"
+      puts "Parameters:", JSON.pretty_generate(parameters) if parameters
+      puts "Headers:", JSON.pretty_generate(headers) if headers
+      puts "Body:", JSON.pretty_generate(body) if body
+      $stdout.flush
     end
 
     # Logs details of a web page when debug mode is enabled
@@ -37,17 +74,18 @@ module ScraperUtils
     # @param message [String] Context or description for the debug output
     # @return [void]
     def self.debug_page(page, message)
-      return unless
+      return unless trace?
 
-      puts
-
+      puts
+      FiberScheduler.log "š DEBUG: #{message}"
       puts "Current URL: #{page.uri}"
       puts "Page title: #{page.at('title').text.strip}" if page.at("title")
       puts "",
-           "Page content:"
-
-
-
+           "Page content:",
+           "-" * 40,
+           page.body,
+           "-" * 40
+      $stdout.flush
     end
 
     # Logs details about a specific page selector when debug mode is enabled
@@ -57,9 +95,10 @@ module ScraperUtils
     # @param message [String] Context or description for the debug output
     # @return [void]
     def self.debug_selector(page, selector, message)
-      return unless
+      return unless trace?
 
-      puts
+      puts
+      FiberScheduler.log "š DEBUG: #{message}"
       puts "Looking for selector: #{selector}"
       element = page.at(selector)
       if element
@@ -71,6 +110,7 @@ module ScraperUtils
         puts page.body
         puts "-" * 40
       end
+      $stdout.flush
     end
   end
 end
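
The reworked debug helpers key off the DEBUG and MORPH_DEBUG environment variables via debug_level. A small sketch of guarding extra diagnostics in a scraper (assuming DEBUG itself is not set, so MORPH_DEBUG is used):

  require "scraper_utils"

  ENV["MORPH_DEBUG"] = "2" # 0 = disabled, 1 = basic, 2 = verbose, 3 = trace

  if ScraperUtils::DebugUtils.verbose?
    puts "Verbose diagnostics enabled (level #{ScraperUtils::DebugUtils.debug_level})"
  end
  # debug_page and debug_selector only emit output at trace level (3)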