scraper_utils 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,32 +1,23 @@
  # frozen_string_literal: true

  require "mechanize"
+ require "ipaddr"
+ require "scraper_utils/mechanize_utils/agent_config"

  module ScraperUtils
    # Utilities for configuring and using Mechanize for web scraping
    module MechanizeUtils
      PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
+     HEADERS_ECHO_URL = "https://httpbin.org/headers"

-     # Creates and configures a Mechanize agent with optional proxy and timeout
-     #
-     # @param timeout [Integer, nil] Timeout for agent connections
-     # @param australian_proxy [Boolean] Whether to use an Australian proxy
+     # Creates and configures a Mechanize agent
+     # @param (see AgentConfig#initialize)
      # @return [Mechanize] Configured Mechanize agent
-     def self.mechanize_agent(timeout: nil, use_proxy: true)
+     def self.mechanize_agent(**options)
        agent = Mechanize.new
-       agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
-       use_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
-       if use_proxy
-         # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
-         # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
-         # the real password.
-         agent.agent.set_proxy(ScraperUtils.australian_proxy)
-       end
-       if timeout
-         agent.open_timeout = timeout
-         agent.read_timeout = timeout
-       end
-       public_ip(agent) if use_proxy
+       config = AgentConfig.new(**options)
+       config.configure_agent(agent)
+       agent.instance_variable_set(:@scraper_utils_config, config)
        agent
      end

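Usage note: agent options now flow through AgentConfig rather than explicit keyword arguments. A minimal sketch of calling the new entry point, assuming the gem is installed; the exact options AgentConfig#initialize accepts are defined in agent_config.rb, which is not part of this diff, so none are passed here and the target URL is a placeholder:

    require "scraper_utils"

    # Any keyword options given here would be forwarded to AgentConfig#initialize.
    agent = ScraperUtils::MechanizeUtils.mechanize_agent
    page = agent.get("https://example.com/")
    puts page.title
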
@@ -47,24 +38,35 @@ module ScraperUtils
          text = element.inner_text
          return "Maintenance: #{text}" if text&.match?(/maintenance/i)
        end
-
-       # Not in maintenance mode
        nil
      end

      # Retrieves and logs the public IP address
      #
-     # @param agent [Mechanize] Mechanize agent to use for IP lookup
-     # @param force [Boolean] Force a new IP lookup, bypassing cache
-     # @return [String] The public IP address
-     def self.public_ip(agent, force: false)
+     # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+     # @param force [Boolean] Force a new IP lookup, by clearing cache first
+     # @return [String, nil] The public IP address
+     def self.public_ip(agent = nil, force: false)
        @public_ip = nil if force
-       @public_ip ||=
-         begin
-           ip = agent.get(PUBLIC_IP_URL).body.strip
-           puts "Public IP: #{ip}"
-           ip
-         end
+       @public_ip ||= begin
+         response = agent&.get(PUBLIC_IP_URL)
+         response&.body&.strip
+       end
+       @public_ip
+     end
+
+     # Retrieves and logs the headers that make it through the proxy
+     #
+     # @param agent [Mechanize, nil] Mechanize agent to use for IP lookup or nil when clearing cache
+     # @param force [Boolean] Force a new IP lookup, by clearing cache first
+     # @return [String, nil] The list of headers in json format
+     def self.public_headers(agent = nil, force: false)
+       @public_headers = nil if force
+       @public_headers ||= begin
+         response = agent&.get(HEADERS_ECHO_URL)
+         response&.body&.strip
+       end
+       @public_headers
      end
    end
  end
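Usage note: both helpers now accept a nil agent (handy for clearing the cache) and memoise their result. A sketch under the assumption that the Akamai IP echo service and httpbin.org are reachable from the scraper:

    require "scraper_utils"

    agent = ScraperUtils::MechanizeUtils.mechanize_agent

    # Cached after the first lookup; force: true clears the cache first.
    puts ScraperUtils::MechanizeUtils.public_ip(agent)

    # JSON string describing the request headers as seen by httpbin.org.
    puts ScraperUtils::MechanizeUtils.public_headers(agent, force: true)

    # With no agent, force: true simply clears the cached value and returns nil.
    ScraperUtils::MechanizeUtils.public_ip(nil, force: true)
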
@@ -0,0 +1,34 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # Provides utilities for randomizing processing order in scrapers,
+   # particularly helpful for distributing load and avoiding predictable patterns
+   module RandomizeUtils
+     # Returns a randomized version of the input collection when in production mode,
+     # or the original collection when in test/sequential mode
+     #
+     # @param collection [Array, Enumerable] Collection of items to potentially randomize
+     # @return [Array] Randomized or original collection depending on environment
+     def self.randomize_order(collection)
+       return collection.to_a if sequential?
+
+       collection.to_a.shuffle
+     end
+
+     # Checks if sequential processing is enabled
+     #
+     # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
+     def self.sequential?
+       @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
+       @sequential || false
+     end
+
+     # Explicitly set sequential mode for testing
+     #
+     # @param value [Boolean, nil] true to enable sequential mode, false to disable, nil to clear cache
+     # @return [Boolean, nil]
+     def self.sequential=(value)
+       @sequential = value
+     end
+   end
+ end
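Usage note: a sketch of driving RandomizeUtils from a multi-authority scraper; the authority names below are placeholders, and MORPH_PROCESS_SEQUENTIALLY is the only environment switch the module reads:

    require "scraper_utils"

    authorities = %i[authority_a authority_b authority_c] # placeholder names

    # Shuffled normally; returned unchanged when sequential mode is enabled.
    ScraperUtils::RandomizeUtils.randomize_order(authorities).each do |authority|
      puts "Scraping #{authority}..."
    end

    # In tests, force deterministic ordering, then clear the cached flag.
    ScraperUtils::RandomizeUtils.sequential = true
    ScraperUtils::RandomizeUtils.sequential = nil
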
@@ -0,0 +1,149 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # robots.txt checker with deliberately simplistic rules
+   class RobotsChecker
+     # @return [String] Lowercased user_agent for matching
+     attr_reader :user_agent
+
+     # Initialize with full user agent string like:
+     # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+     # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+     # Checks for
+     # * Disallow for User-agent: bot_name and
+     # * Crawl-delay from either User-agent: bot name or * (default)
+     def initialize(user_agent)
+       @user_agent = extract_user_agent(user_agent).downcase
+       if DebugUtils.basic?
+         ScraperUtils::FiberScheduler.log(
+           "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+         )
+       end
+       @rules = {} # domain -> {rules: [], delay: int}
+       @delay = nil # Delay from last robots.txt check
+     end
+
+     # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+     # @param url [String] The full URL to check
+     # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+     def disallowed?(url)
+       return false unless url
+
+       uri = URI(url)
+       domain = "#{uri.scheme}://#{uri.host}"
+       path = uri.path || "/"
+
+       # Get or fetch robots.txt rules
+       rules = get_rules(domain)
+       return false unless rules # If we can't get robots.txt, assume allowed
+
+       # Store any delay found for this domain
+       @delay = rules[:our_delay]
+
+       # Check rules specific to our user agent
+       matches_any_rule?(path, rules[:our_rules])
+     end
+
+     # Returns the crawl delay (if any) that applied to the last URL checked
+     # Should be called after disallowed? to get relevant delay
+     # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+     def crawl_delay
+       @delay
+     end
+
+     private
+
+     def extract_user_agent(user_agent)
+       if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+         user_agent = ::Regexp.last_match(2)&.strip
+       end
+       user_agent&.strip
+     end
+
+     def matches_any_rule?(path, rules)
+       rules&.any? { |rule| path.start_with?(rule) }
+     end
+
+     def get_rules(domain)
+       return @rules[domain] if @rules.key?(domain)
+
+       begin
+         response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+         return nil unless response.code.start_with?("2") # 2xx response
+
+         rules = parse_robots_txt(response.body)
+         @rules[domain] = rules
+         rules
+       rescue StandardError => e
+         if DebugUtils.basic?
+           ScraperUtils::FiberScheduler.log(
+             "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+           )
+         end
+         nil
+       end
+     end
+
+     # Parse robots.txt content into structured rules
+     # Only collects rules for our specific user agent and generic crawl-delay
+     # @param content [String] The robots.txt content
+     # @return [Hash] Hash containing :our_rules and :our_delay
+     def parse_robots_txt(content)
+       sections = [] # Array of {agent:, rules:[], delay:} hashes
+       current_section = nil
+
+       content.each_line do |line|
+         line = line.strip.downcase
+         next if line.empty? || line.start_with?("#")
+
+         if line.start_with?("user-agent:")
+           agent = line.split(":", 2).last.strip
+           # Check if this is a continuation of the previous section
+           if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+             current_section[:agents] << agent
+           else
+             current_section = { agents: [agent], rules: [], delay: nil }
+             sections << current_section
+           end
+           next
+         end
+
+         next unless current_section # Skip rules before first user-agent
+
+         if line.start_with?("disallow:")
+           path = line.split(":", 2).last.strip
+           current_section[:rules] << path unless path.empty?
+         elsif line.start_with?("crawl-delay:")
+           delay = line.split(":", 2).last.strip.to_i
+           current_section[:delay] = delay if delay.positive?
+         end
+       end
+
+       # Sort sections by most specific agent match first
+       matched_section = sections.find do |section|
+         section[:agents].any? do |agent|
+           # Our user agent starts with the agent from robots.txt
+           @user_agent.start_with?(agent) ||
+             # Or the agent from robots.txt starts with our user agent
+             # (handles ScraperUtils matching ScraperUtils/1.0)
+             agent.start_with?(@user_agent)
+         end
+       end
+
+       # Use matched section or fall back to wildcard
+       if matched_section
+         {
+           our_rules: matched_section[:rules],
+           our_delay: matched_section[:delay]
+         }
+       else
+         # Find default section
+         default_section = sections.find { |s| s[:agents].include?("*") }
+         {
+           our_rules: [],
+           our_delay: default_section&.dig(:delay)
+         }
+       end
+     end
+   end
+ end
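Usage note: a sketch of how the checker might wrap page fetches; the user-agent string and URL are illustrative only, and get_rules relies on Net::HTTP, so net/http needs to be loaded by whatever requires this class:

    require "net/http"
    require "scraper_utils"

    user_agent = "Mozilla/5.0 (compatible; ScraperUtils/0.3.0; +https://github.com/ianheggie-oaf/scraper_utils)"
    checker = ScraperUtils::RobotsChecker.new(user_agent)

    url = "https://example.com/planning/applications" # placeholder URL
    if checker.disallowed?(url)
      puts "Skipping #{url} - disallowed for our user agent"
    else
      delay = checker.crawl_delay # delay that applied to the last URL checked, if any
      sleep(delay) if delay
      # fetch the page here ...
    end
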
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module ScraperUtils
-   VERSION = "0.1.0"
+   VERSION = "0.3.0"
  end
data/lib/scraper_utils.rb CHANGED
@@ -1,10 +1,16 @@
  # frozen_string_literal: true

+ require "scraper_utils/adaptive_delay"
  require "scraper_utils/authority_utils"
+ require "scraper_utils/data_quality_monitor"
  require "scraper_utils/db_utils"
  require "scraper_utils/debug_utils"
+ require "scraper_utils/fiber_scheduler"
  require "scraper_utils/log_utils"
+ require "scraper_utils/mechanize_utils/agent_config"
  require "scraper_utils/mechanize_utils"
+ require "scraper_utils/randomize_utils"
+ require "scraper_utils/robots_checker"
  require "scraper_utils/version"

  # Utilities for planningalerts scrapers
@@ -12,9 +18,6 @@ module ScraperUtils
    # Constants for configuration on Morph.io
    AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"

-   # Enable debug locally, not on morph.io
-   DEBUG_ENV_VAR = "DEBUG"
-
    # Fatal Error
    class Error < StandardError
    end
@@ -28,13 +31,6 @@ module ScraperUtils
    class UnprocessableRecord < Error
    end

-   # Check if debug mode is enabled
-   #
-   # @return [Boolean] Whether debug mode is active
-   def self.debug?
-     !ENV[DEBUG_ENV_VAR].to_s.empty?
-   end
-
    def self.australian_proxy
      ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
      ap.empty? ? nil : ap
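Usage note: australian_proxy returns nil when MORPH_AUSTRALIAN_PROXY is unset, so callers can branch on its presence rather than re-reading ENV. A small sketch:

    require "scraper_utils"

    if ScraperUtils.australian_proxy
      puts "Australian proxy configured via MORPH_AUSTRALIAN_PROXY"
    else
      puts "No Australian proxy configured; requests go out directly"
    end
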
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|

    spec.summary = "planningalerts scraper utilities"
    spec.description = "Utilities to help make planningalerts scrapers, " \
-                      "+especially multis easier to develop, run and debug."
+                      "+especially multis easier to develop, run and debug."
    spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
    spec.license = "MIT"

@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
      # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
    else
      raise "RubyGems 2.0 or newer is required to protect against " \
-           "public gem pushes."
+           "public gem pushes."
    end

    # Specify which files should be added to the gem when it is released.
@@ -40,10 +40,5 @@ Gem::Specification.new do |spec|
    spec.add_dependency "mechanize"
    spec.add_dependency "nokogiri"
    spec.add_dependency "sqlite3"
-
-   spec.add_development_dependency "rake"
-   spec.add_development_dependency "rspec"
-   spec.add_development_dependency "rubocop"
-   spec.add_development_dependency "simplecov"
-   spec.add_development_dependency "simplecov-console"
+   spec.metadata["rubygems_mfa_required"] = "true"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scraper_utils
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.3.0
  platform: ruby
  authors:
  - Ian Heggie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-02-22 00:00:00.000000000 Z
+ date: 2025-03-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
@@ -52,76 +52,6 @@ dependencies:
      - - ">="
        - !ruby/object:Gem::Version
          version: '0'
- - !ruby/object:Gem::Dependency
-   name: rake
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rspec
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: rubocop
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: simplecov
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
- - !ruby/object:Gem::Dependency
-   name: simplecov-console
-   requirement: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
-   type: :development
-   prerelease: false
-   version_requirements: !ruby/object:Gem::Requirement
-     requirements:
-     - - ">="
-       - !ruby/object:Gem::Version
-         version: '0'
  description: Utilities to help make planningalerts scrapers, +especially multis easier
    to develop, run and debug.
  email:
134
64
  - ".rspec"
135
65
  - ".rubocop.yml"
136
66
  - ".travis.yml"
67
+ - CHANGELOG.md
68
+ - GUIDELINES.md
137
69
  - Gemfile
70
+ - IMPLEMENTATION.md
138
71
  - LICENSE.txt
139
72
  - README.md
140
73
  - Rakefile
74
+ - SPECS.md
141
75
  - bin/console
142
76
  - bin/setup
77
+ - docs/example_scrape_with_fibers.rb
78
+ - docs/example_scraper.rb
143
79
  - lib/scraper_utils.rb
80
+ - lib/scraper_utils/adaptive_delay.rb
144
81
  - lib/scraper_utils/authority_utils.rb
82
+ - lib/scraper_utils/data_quality_monitor.rb
83
+ - lib/scraper_utils/date_range_utils.rb
145
84
  - lib/scraper_utils/db_utils.rb
146
85
  - lib/scraper_utils/debug_utils.rb
86
+ - lib/scraper_utils/fiber_scheduler.rb
147
87
  - lib/scraper_utils/log_utils.rb
148
88
  - lib/scraper_utils/mechanize_utils.rb
89
+ - lib/scraper_utils/mechanize_utils/agent_config.rb
90
+ - lib/scraper_utils/randomize_utils.rb
91
+ - lib/scraper_utils/robots_checker.rb
149
92
  - lib/scraper_utils/version.rb
150
93
  - scraper_utils.gemspec
151
94
  homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -155,6 +98,7 @@ metadata:
    allowed_push_host: https://rubygems.org
    homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
    source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
+   rubygems_mfa_required: 'true'
  post_install_message:
  rdoc_options: []
  require_paths:
@@ -170,8 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6.2
+ rubygems_version: 3.4.10
  signing_key:
  specification_version: 4
  summary: planningalerts scraper utilities