scraper_utils 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: db12d36e0d3be635eba2c00dbe149f4d177ddc5e538a08fcd9038a026feaee91
4
+ data.tar.gz: 8d2f140b7fff7e02d90df19ac196018f8719cd73d85067519ee3e931f679f619
5
+ SHA512:
6
+ metadata.gz: 7138204493653a872aafcf4a1f8b78d8d5129c70d79a54d6ca10aa1440fc60362edc270522cc0d66c13a7694527a502d75d3dec36cb21f2240fceea85367eec4
7
+ data.tar.gz: 83dffaedd054ed40c7a269c4fd3db270892bc0fa20c4b7d1a904a075cb990bee51004ccb9c0cb86840d87a631655207301b8af1f7a5572f0893d9317a1b90aa5
data/.gitignore ADDED
@@ -0,0 +1,77 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+
12
+ # Temp files
13
+ ,*
14
+ *.bak
15
+ /tmp/
16
+
17
+ # IDEs and AI assistants
18
+ /.aider*
19
+ /.idea
20
+ /.vscode*
21
+
22
+ # Ignore vim files:
23
+ *~
24
+ *.swp
25
+ *.swo
26
+
27
+ # Ignore patch files
28
+ *.patch
29
+ *.rej
30
+
31
+ # Used by direnv / dotenv library to load environment variables.
32
+ .env*
33
+
34
+ # Ignore Byebug command history file.
35
+ .byebug_history
36
+
37
+ ## Specific to RubyMotion:
38
+ .dat*
39
+ .repl_history
40
+ build/
41
+ *.bridgesupport
42
+ build-iPhoneOS/
43
+ build-iPhoneSimulator/
44
+
45
+ ## Specific to RubyMotion (use of CocoaPods):
46
+ #
47
+ # We recommend against adding the Pods directory to your .gitignore. However
48
+ # you should judge for yourself, the pros and cons are mentioned at:
49
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
50
+ #
51
+ # vendor/Pods/
52
+
53
+ ## Documentation cache and generated files:
54
+ /.yardoc/
55
+ /_yardoc/
56
+ /doc/
57
+ /rdoc/
58
+
59
+ ## Environment normalization:
60
+ /.bundle/
61
+ /vendor/bundle
62
+ /lib/bundler/man/
63
+
64
+ # for a library or gem, you might want to ignore these files since the code is
65
+ # intended to run in multiple environments; otherwise, check them in:
66
+ Gemfile.lock
67
+ .ruby-version
68
+ .ruby-gemset
69
+
70
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
71
+ .rvmrc
72
+
73
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
74
+ .rubocop-https?--*
75
+
76
+ # rspec reports
77
+ /.rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,82 @@
1
+ AllCops:
2
+ Exclude:
3
+ - bin/*
4
+ # This is a temporary dumping ground for authority specific
5
+ # code that we're probably just initially copying across from
6
+ # other scrapers. So, we don't care about the formatting and style
7
+ # initially.
8
+ # TODO: Remove this once we've removed all the code from here
9
+ - lib/technology_one_scraper/authority/*
10
+
11
+ # Bumping max line length to something a little more reasonable
12
+ Layout/LineLength:
13
+ Max: 100
14
+
15
+ # We prefer double quotes here and we're making liberal use of multi-line
16
+ # strings, so it makes sense to enforce those to be consistent too
17
+ Style/StringLiterals:
18
+ EnforcedStyle: double_quotes
19
+ ConsistentQuotesInMultiline: true
20
+
21
+ # This one I disagree with. Putting separators in large numbers makes sense
22
+ # in some circumstances but in others (e.g. an id in a database table)
23
+ # it's just nonsensical. Also, I think this one might be a bit US-centric.
24
+ Style/NumericLiterals:
25
+ Enabled: false
26
+
27
+ # Disable a bunch of metrics to do with code complexity. These are all
28
+ # a bit hard-nosed. Maybe after we've done a pass with Code Climate we
29
+ # can revisit these
30
+ Metrics/AbcSize:
31
+ Enabled: false
32
+
33
+ Metrics/BlockLength:
34
+ Enabled: false
35
+
36
+ Metrics/ClassLength:
37
+ Enabled: false
38
+
39
+ Metrics/CyclomaticComplexity:
40
+ Enabled: false
41
+
42
+ Metrics/MethodLength:
43
+ Enabled: false
44
+
45
+ Metrics/ModuleLength:
46
+ Enabled: false
47
+
48
+ Metrics/ParameterLists:
49
+ Enabled: false
50
+
51
+ Metrics/PerceivedComplexity:
52
+ Enabled: false
53
+
54
+ Layout/EmptyLinesAroundAttributeAccessor:
55
+ Enabled: true
56
+
57
+ Layout/SpaceAroundMethodCallOperator:
58
+ Enabled: true
59
+
60
+ Lint/DeprecatedOpenSSLConstant:
61
+ Enabled: true
62
+
63
+ Lint/RaiseException:
64
+ Enabled: true
65
+
66
+ Lint/StructNewOverride:
67
+ Enabled: true
68
+
69
+ Style/ExponentialNotation:
70
+ Enabled: true
71
+
72
+ Style/HashEachMethods:
73
+ Enabled: true
74
+
75
+ Style/HashTransformKeys:
76
+ Enabled: true
77
+
78
+ Style/HashTransformValues:
79
+ Enabled: true
80
+
81
+ Style/SlicingWithRange:
82
+ Enabled: true
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.8
7
+ before_install: gem install bundler -v 1.17.3
data/Gemfile ADDED
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ platform = if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("3.0.0")
6
+ :heroku16
7
+ elsif Gem::Version.new(RUBY_VERSION) < Gem::Version.new("3.3.0")
8
+ :heroku18
9
+ end
10
+
11
+ ruby case platform
12
+ when :heroku16 then "~> 2.5.8"
13
+ when :heroku18 then "~> 3.2.2"
14
+ else "~> 3.3.7"
15
+ end
16
+
17
+ gem "mechanize", platform && (platform == :heroku16 ? "~> 2.7.0" : "~> 2.8.5")
18
+ gem "nokogiri", platform && (platform == :heroku16 ? "~> 1.11.2" : "~> 1.15.0")
19
+ gem "sqlite3", platform && (platform == :heroku16 ? "~> 1.4.0" : "~> 1.6.3")
20
+
21
+ # Unable to list in gemspec - Include it in your project's Gemfile when using this gem
22
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
23
+ branch: "morph_defaults"
24
+
25
+ # development and test gems
26
+ gem "rake", platform && (platform == :heroku16 ? "~> 12.3.3" : "~> 13.0")
27
+ gem "rspec", platform && (platform == :heroku16 ? "~> 3.9.0" : "~> 3.12")
28
+ gem "rubocop", platform && (platform == :heroku16 ? "~> 0.80.0" : "~> 1.57")
29
+ gem "simplecov", platform && (platform == :heroku16 ? "~> 0.18.0" : "~> 0.22.0")
30
+ # gem "simplecov-console" listed in gemspec
31
+ gem "webmock", platform && (platform == :heroku16 ? "~> 3.14.0" : "~> 3.19.0")
32
+
33
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Ian Heggie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,271 @@
1
+ ScraperUtils (Ruby)
2
+ ===================
3
+
4
+ Utilities to help make planningalerts scrapers, especially multis, easier to develop, run and debug.
5
+
6
+ WARNING: This is still under development! Breaking changes may occur in version 0!
7
+
8
+ ## Installation
9
+
10
+ Add these lines to your application's Gemfile:
11
+
12
+ ```ruby
13
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
14
+ gem "scraper_utils"
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself for testing:
22
+
23
+ $ gem install scraper_utils
24
+
25
+ ## Usage
26
+
27
+ ### Environment variables
28
+
29
+ Optionally filter authorities via an environment variable in morph > scraper > settings or
30
+ in your dev environment:
31
+
32
+ ```bash
33
+ export MORPH_AUTHORITIES=noosa,wagga
34
+ ```
35
+
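+ For example, a minimal sketch using `ScraperUtils::AuthorityUtils.selected_authorities` (here `AUTHORITIES` is assumed to be your scraper's own hash of authority configurations):
+ 
+ ```ruby
+ # Returns only the authorities named in MORPH_AUTHORITIES when it is set,
+ # otherwise all of them; unknown names raise ScraperUtils::Error.
+ authorities = ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+ puts "Scraping: #{authorities.join(', ')}"
+ ```
+ 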
36
+ ### Example updated `scraper.rb` file
37
+
38
+ Update your `scraper.rb` as per the following example:
39
+
40
+ ```ruby
41
+ #!/usr/bin/env ruby
42
+ # frozen_string_literal: true
43
+
44
+ $LOAD_PATH << "./lib"
45
+
46
+ require "scraper_utils"
47
+ require "technology_one_scraper"
48
+
49
+ # Main Scraper class
50
+ class Scraper
51
+ AUTHORITIES = TechnologyOneScraper::AUTHORITIES
52
+
53
+ def self.scrape(authorities, attempt)
54
+ results = {}
55
+ authorities.each do |authority_label|
56
+ these_results = results[authority_label] = {}
57
+ begin
58
+ records_scraped = 0
59
+ unprocessable_records = 0
60
+ # Allow 5 + 10% unprocessable records
61
+ too_many_unprocessable = -5.0
62
+ use_proxy = AUTHORITIES[authority_label][:australian_proxy] && ScraperUtils.australian_proxy
63
+ next if attempt > 2 && !use_proxy
64
+
65
+ puts "",
66
+ "Collecting feed data for #{authority_label}, attempt: #{attempt}" \
67
+ "#{use_proxy ? ' (via proxy)' : ''} ..."
68
+ # Change scrape to accept a use_proxy flag and return an unprocessable flag
69
+ # it should rescue ScraperUtils::UnprocessableRecord raised deeper in the scraping code and
70
+ # set unprocessable
71
+ TechnologyOneScraper.scrape(use_proxy, authority_label) do |record, unprocessable|
72
+ unless unprocessable
73
+ begin
74
+ record["authority_label"] = authority_label.to_s
75
+ ScraperUtils::DbUtils.save_record(record)
76
+ rescue ScraperUtils::UnprocessableRecord => e
77
+ # validation error
78
+ unprocessable = true
79
+ these_results[:error] = e
80
+ end
81
+ end
82
+ if unprocessable
83
+ unprocessable_records += 1
84
+ these_results[:unprocessable_records] = unprocessable_records
85
+ too_many_unprocessable += 1
86
+ raise "Too many unprocessable records" if too_many_unprocessable.positive?
87
+ else
88
+ records_scraped += 1
89
+ these_results[:records_scraped] = records_scraped
90
+ too_many_unprocessable -= 0.1
91
+ end
92
+ end
93
+ rescue StandardError => e
94
+ warn "#{authority_label}: ERROR: #{e}"
95
+ warn e.backtrace || "No backtrace available"
96
+ these_results[:error] = e
97
+ end
98
+ end
99
+ results
100
+ end
101
+
102
+ def self.selected_authorities
103
+ ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
104
+ end
105
+
106
+ def self.run(authorities)
107
+ puts "Scraping authorities: #{authorities.join(', ')}"
108
+ start_time = Time.now
109
+ results = scrape(authorities, 1)
110
+ ScraperUtils::LogUtils.log_scraping_run(
111
+ start_time,
112
+ 1,
113
+ authorities,
114
+ results
115
+ )
116
+
117
+ retry_errors = results.select do |_auth, result|
118
+ result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
119
+ end.keys
120
+
121
+ unless retry_errors.empty?
122
+ puts "",
123
+ "***************************************************"
124
+ puts "Now retrying authorities which earlier had failures"
125
+ puts retry_errors.join(", ").to_s
126
+ puts "***************************************************"
127
+
128
+ start_retry = Time.now
129
+ retry_results = scrape(retry_errors, 2)
130
+ ScraperUtils::LogUtils.log_scraping_run(
131
+ start_retry,
132
+ 2,
133
+ retry_errors,
134
+ retry_results
135
+ )
136
+
137
+ retry_results.each do |auth, result|
138
+ unless result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
139
+ results[auth] = result
140
+ end
141
+ end
142
+ retry_no_proxy = retry_results.select do |_auth, result|
143
+ result[:used_proxy] && result[:error] &&
144
+ !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
145
+ end.keys
146
+
147
+ unless retry_no_proxy.empty?
148
+ puts "",
149
+ "*****************************************************************"
150
+ puts "Now retrying authorities which earlier had failures without proxy"
151
+ puts retry_no_proxy.join(", ").to_s
152
+ puts "*****************************************************************"
153
+
154
+ start_retry = Time.now
155
+ second_retry_results = scrape(retry_no_proxy, 3)
156
+ ScraperUtils::LogUtils.log_scraping_run(
157
+ start_retry,
158
+ 3,
159
+ retry_no_proxy,
160
+ second_retry_results
161
+ )
162
+ second_retry_results.each do |auth, result|
163
+ unless result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
164
+ results[auth] = result
165
+ end
166
+ end
167
+ end
168
+ end
169
+
170
+ # Report on results, raising errors for unexpected conditions
171
+ ScraperUtils::LogUtils.report_on_results(authorities, results)
172
+ end
173
+ end
174
+
175
+ if __FILE__ == $PROGRAM_NAME
176
+ # Default to list of authorities we can't or won't fix in code, explain why
177
+ # wagga: url redirects and reports Application error, main site says to use NSW Planning Portal from 1 July 2021
178
+ # which doesn't list any DA's for wagga wagga!
179
+
180
+ ENV["MORPH_EXPECT_BAD"] ||= "wagga"
181
+ Scraper.run(Scraper.selected_authorities)
182
+ end
183
+ ```
184
+
185
+ Then deeper in your code update:
186
+
187
+ * Change scrape to accept a `use_proxy` flag and return an `unprocessable` flag
188
+ * it should rescue `ScraperUtils::UnprocessableRecord` raised deeper in the scraping code and
189
+ set and yield `unprocessable`, e.g. `TechnologyOneScraper.scrape(use_proxy, authority_label) do |record, unprocessable|`
190
+
191
+ ```ruby
192
+ require "scraper_utils"
193
+ #...
194
+ module TechnologyOneScraper
195
+ # Note the extra parameter: use_proxy
196
+ def self.scrape(use_proxy, authority)
197
+ raise "Unexpected authority: #{authority}" unless AUTHORITIES.key?(authority)
198
+
199
+ scrape_period(use_proxy, AUTHORITIES[authority]) do |record, unprocessable|
200
+ yield record, unprocessable
201
+ end
202
+ end
203
+
204
+ # ... rest of code ...
205
+
206
+ # Note the extra parameters: use_proxy and timeout
207
+ def self.scrape_period(use_proxy,
208
+ url:, period:, webguest: "P1.WEBGUEST", disable_ssl_certificate_check: false,
209
+ australian_proxy: false, timeout: nil
210
+ )
211
+ agent = ScraperUtils::MechanizeUtils.mechanize_agent(use_proxy: use_proxy, timeout: timeout)
212
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if disable_ssl_certificate_check
213
+
214
+ # ... rest of code ...
215
+
216
+ # Update yield to return unprocessable as well as record
217
+
218
+ end
219
+ # ... rest of code ...
220
+ end
221
+ ```
222
+
223
+ ### Debugging Techniques
224
+
225
+ The following code will print debugging info if you set:
226
+
227
+ ```bash
228
+ export DEBUG=1
229
+ ```
230
+
231
+ Add the following immediately before requesting or examining pages:
232
+
233
+ ```ruby
234
+ require 'scraper_utils'
235
+
236
+ # Debug an HTTP request
237
+ ScraperUtils::DebugUtils.debug_request(
238
+ "GET",
239
+ "https://example.com/planning-apps",
240
+ parameters: { year: 2023 },
241
+ headers: { "Accept" => "application/json" }
242
+ )
243
+
244
+ # Debug a web page
245
+ ScraperUtils::DebugUtils.debug_page(page, "Checking search results page")
246
+
247
+ # Debug a specific page selector
248
+ ScraperUtils::DebugUtils.debug_selector(page, '.results-table', "Looking for development applications")
249
+ ```
250
+
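+ These helpers only produce output when debug mode is enabled; a minimal sketch of the check they all use (`ScraperUtils.debug?`):
+ 
+ ```ruby
+ # True when the DEBUG environment variable is set to a non-empty value
+ ScraperUtils.debug? # => true after `export DEBUG=1`, false otherwise
+ ```
+ 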
251
+ ## Development
252
+
253
+ After checking out the repo, run `bin/setup` to install dependencies.
254
+ Then, run `rake spec` to run the tests.
255
+
256
+ You can also run `bin/console` for an interactive prompt that will allow you to experiment.
257
+
258
+ To install this gem onto your local machine, run `bundle exec rake install`.
259
+
260
+ To release a new version, update the version number in `version.rb`, and
261
+ then run `bundle exec rake release`,
262
+ which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
263
+
264
+ ## Contributing
265
+
266
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ianheggie-oaf/scraper_utils
267
+
268
+ ## License
269
+
270
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
271
+
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+
5
+ require "rspec/core/rake_task"
6
+
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scraper_utils"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Utilities for managing and selecting authorities
5
+ module AuthorityUtils
6
+ # Selects authorities based on environment variable or returns all authorities
7
+ #
8
+ # @param all_authorities [Array<Symbol>] Full list of available authorities
9
+ # @return [Array<Symbol>] Selected subset of authorities or all authorities
10
+ # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
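+ # @example Selecting authorities (illustrative usage sketch)
+ #   # With MORPH_AUTHORITIES=noosa,wagga only those two symbols are returned;
+ #   # with it unset, the full list is returned unchanged.
+ #   ScraperUtils::AuthorityUtils.selected_authorities(%i[noosa wagga other])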
11
+ AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
12
+
13
+ def self.selected_authorities(all_authorities)
14
+ if ENV[AUTHORITIES_ENV_VAR]
15
+ authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
16
+ invalid = authorities - all_authorities
17
+ unless invalid.empty?
18
+ raise ScraperUtils::Error,
19
+ "Invalid authorities specified in MORPH_AUTHORITIES: #{invalid.join(', ')}"
20
+ end
21
+
22
+ authorities
23
+ else
24
+ all_authorities
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scraperwiki"
4
+
5
+ module ScraperUtils
6
+ # Utilities for database operations in scrapers
7
+ module DbUtils
8
+ # Saves a record to the SQLite database with validation and logging
9
+ #
10
+ # @param record [Hash] The record to be saved
11
+ # @raise [ScraperUtils::UnprocessableRecord] If record fails validation
12
+ # @return [void]
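+ # @example Saving a record (illustrative only - field values are made up)
+ #   ScraperUtils::DbUtils.save_record(
+ #     "council_reference" => "DA/2025/0001",
+ #     "address" => "1 Example Street, Example NSW 2000",
+ #     "description" => "Example development application",
+ #     "info_url" => "https://example.com/da/2025/0001",
+ #     "date_scraped" => Date.today.to_s
+ #   )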
13
+ def self.save_record(record)
14
+ # Validate required fields
15
+ required_fields = %w[council_reference address description info_url date_scraped]
16
+ required_fields.each do |field|
17
+ if record[field].to_s.empty?
18
+ raise ScraperUtils::UnprocessableRecord, "Missing required field: #{field}"
19
+ end
20
+ end
21
+
22
+ # Validate date formats
23
+ %w[date_scraped date_received on_notice_from on_notice_to].each do |date_field|
24
+ Date.parse(record[date_field]) if record[date_field]
25
+ rescue ArgumentError
26
+ raise ScraperUtils::UnprocessableRecord,
27
+ "Invalid date format for #{date_field}: #{record[date_field]}"
28
+ end
29
+
30
+ # Determine primary key based on presence of authority_label
31
+ primary_key = if record.key?("authority_label")
32
+ %w[authority_label council_reference]
33
+ else
34
+ ["council_reference"]
35
+ end
36
+
37
+ puts "Saving record #{record['council_reference']} - #{record['address']}"
38
+ ScraperWiki.save_sqlite(primary_key, record)
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module ScraperUtils
6
+ # Utilities for debugging web scraping processes
7
+ module DebugUtils
8
+ # Logs details of an HTTP request when debug mode is enabled
9
+ #
10
+ # @param method [String] HTTP method (GET, POST, etc.)
11
+ # @param url [String] Request URL
12
+ # @param parameters [Hash, nil] Optional request parameters
13
+ # @param headers [Hash, nil] Optional request headers
14
+ # @param body [Hash, nil] Optional request body
15
+ # @return [void]
16
+ def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
17
+ return unless ScraperUtils.debug?
18
+
19
+ puts "\n🔍 #{method.upcase} #{url}"
20
+ if parameters
21
+ puts "Parameters:"
22
+ puts JSON.pretty_generate(parameters)
23
+ end
24
+ if headers
25
+ puts "Headers:"
26
+ puts JSON.pretty_generate(headers)
27
+ end
28
+ return unless body
29
+
30
+ puts "Body:"
31
+ puts JSON.pretty_generate(body)
32
+ end
33
+
34
+ # Logs details of a web page when debug mode is enabled
35
+ #
36
+ # @param page [Mechanize::Page] The web page to debug
37
+ # @param message [String] Context or description for the debug output
38
+ # @return [void]
39
+ def self.debug_page(page, message)
40
+ return unless ScraperUtils.debug?
41
+
42
+ puts "",
43
+ "🔍 DEBUG: #{message}"
44
+ puts "Current URL: #{page.uri}"
45
+ puts "Page title: #{page.at('title').text.strip}" if page.at("title")
46
+ puts "",
47
+ "Page content:"
48
+ puts "-" * 40
49
+ puts page.body
50
+ puts "-" * 40
51
+ end
52
+
53
+ # Logs details about a specific page selector when debug mode is enabled
54
+ #
55
+ # @param page [Mechanize::Page] The web page to inspect
56
+ # @param selector [String] CSS selector to look for
57
+ # @param message [String] Context or description for the debug output
58
+ # @return [void]
59
+ def self.debug_selector(page, selector, message)
60
+ return unless ScraperUtils.debug?
61
+
62
+ puts "\n🔍 DEBUG: #{message}"
63
+ puts "Looking for selector: #{selector}"
64
+ element = page.at(selector)
65
+ if element
66
+ puts "Found element:"
67
+ puts element.to_html
68
+ else
69
+ puts "Element not found in:"
70
+ puts "-" * 40
71
+ puts page.body
72
+ puts "-" * 40
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scraperwiki"
4
+
5
+ module ScraperUtils
6
+ # Utilities for logging scraper execution details and outcomes
7
+ module LogUtils
8
+ SUMMARY_TABLE = "scrape_summary"
9
+ LOG_TABLE = "scrape_log"
10
+ LOG_RETENTION_DAYS = 30
11
+
12
+ # Log details about a scraping run for one or more authorities
13
+ # @param start_time [Time] When this scraping attempt was started
14
+ # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
15
+ # @param authorities [Array<Symbol>] List of authorities attempted to scrape
16
+ # @param results [Hash] Results for each authority containing:
17
+ # - :records_scraped [Integer] Number of records successfully scraped
18
+ # - :unprocessable_records [Integer] Optional number of unprocessable records, e.g. regions
19
+ # - :error [Exception, nil] Any exception that occurred during scraping
20
+ # - :proxy_used [Boolean] Whether a proxy was used
21
+ # @return [void]
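+ # @example Logging a single attempt (illustrative only)
+ #   start_time = Time.now
+ #   results = { noosa: { records_scraped: 10, proxy_used: false } }
+ #   ScraperUtils::LogUtils.log_scraping_run(start_time, 1, [:noosa], results)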
22
+ def self.log_scraping_run(start_time, attempt, authorities, results)
23
+ raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
24
+ raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?
25
+
26
+ end_time = Time.now
27
+ duration = (end_time - start_time).round(1)
28
+
29
+ successful = []
30
+ failed = []
31
+ interrupted = []
32
+
33
+ authorities.each do |authority_label|
34
+ result = results[authority_label] || {}
35
+
36
+ status = if result[:records_scraped]&.positive?
37
+ result[:error] ? :interrupted : :successful
38
+ else
39
+ :failed
40
+ end
41
+ case status
42
+ when :successful
43
+ successful << authority_label
44
+ when :interrupted
45
+ interrupted << authority_label
46
+ else
47
+ failed << authority_label
48
+ end
49
+
50
+ record = {
51
+ "run_at" => start_time.iso8601,
52
+ "attempt" => attempt,
53
+ "authority_label" => authority_label.to_s,
54
+ "records_scraped" => result[:records_scraped] || 0,
55
+ "unprocessable_records" => result[:unprocessable_records] || 0,
56
+ "used_proxy" => result[:proxy_used] ? 1 : 0,
57
+ "status" => status.to_s,
58
+ "error_message" => result[:error]&.message,
59
+ "error_class" => result[:error]&.class&.to_s,
60
+ "error_backtrace" => extract_meaningful_backtrace(result[:error])
61
+ }
62
+
63
+ save_log_record(record)
64
+ end
65
+
66
+ # Save summary record for the entire run
67
+ save_summary_record(
68
+ start_time,
69
+ attempt,
70
+ duration,
71
+ successful,
72
+ interrupted,
73
+ failed
74
+ )
75
+
76
+ cleanup_old_records
77
+ end
78
+
79
+ def self.report_on_results(authorities, results)
80
+ expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:to_sym) || []
81
+
82
+ puts "MORPH_EXPECT_BAD=#{ENV['MORPH_EXPECT_BAD']}" if expect_bad.any?
83
+
84
+ errors = []
85
+
86
+ # Check for authorities that were expected to be bad but are now working
87
+ unexpected_working = expect_bad.select do |authority|
88
+ result = results[authority]
89
+ result && result[:records_scraped]&.positive? && result[:error].nil?
90
+ end
91
+
92
+ if unexpected_working.any?
93
+ errors << "WARNING: Remove #{unexpected_working.join(',')} from EXPECT_BAD as it now works!"
94
+ end
95
+
96
+ # Check for authorities with unexpected errors
97
+ unexpected_errors = authorities
98
+ .select { |authority| results[authority]&.dig(:error) }
99
+ .reject { |authority| expect_bad.include?(authority) }
100
+
101
+ if unexpected_errors.any?
102
+ errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
103
+ "(Add to MORPH_EXPECT_BAD?)"
104
+ unexpected_errors.each do |authority|
105
+ error = results[authority][:error]
106
+ errors << " #{authority}: #{error.class} - #{error.message}"
107
+ end
108
+ end
109
+
110
+ if errors.any?
111
+ errors << "See earlier output for details"
112
+ raise errors.join("\n")
113
+ end
114
+
115
+ puts "Exiting with OK status!"
116
+ end
117
+
118
+ def self.save_log_record(record)
119
+ ScraperWiki.save_sqlite(
120
+ %w[authority_label run_at],
121
+ record,
122
+ LOG_TABLE
123
+ )
124
+ end
125
+
126
+ def self.save_summary_record(start_time, attempt, duration,
127
+ successful, interrupted, failed)
128
+ summary = {
129
+ "run_at" => start_time.iso8601,
130
+ "attempt" => attempt,
131
+ "duration" => duration,
132
+ "successful" => successful.join(","),
133
+ "failed" => failed.join(","),
134
+ "interrupted" => interrupted.join(","),
135
+ "successful_count" => successful.size,
136
+ "interrupted_count" => interrupted.size,
137
+ "failed_count" => failed.size
138
+ }
139
+
140
+ ScraperWiki.save_sqlite(
141
+ ["run_at"],
142
+ summary,
143
+ SUMMARY_TABLE
144
+ )
145
+ end
146
+
147
+ def self.cleanup_old_records(force: false)
148
+ cutoff = (Date.today - LOG_RETENTION_DAYS).to_s
149
+ return if !force && @last_cutoff == cutoff
150
+
151
+ @last_cutoff = cutoff
152
+
153
+ [SUMMARY_TABLE, LOG_TABLE].each do |table|
154
+ ScraperWiki.sqliteexecute(
155
+ "DELETE FROM #{table} WHERE date(run_at) < date(?)",
156
+ [cutoff]
157
+ )
158
+ end
159
+ end
160
+
161
+ # Extracts a meaningful backtrace - always the first 2 lines, then only non-vendor lines, max 6 in total
162
+ def self.extract_meaningful_backtrace(error)
163
+ return nil unless error.respond_to?(:backtrace) && error&.backtrace
164
+
165
+ lines = []
166
+ error.backtrace.each do |line|
167
+ lines << line if lines.length < 2 || !line.include?("/vendor/")
168
+ break if lines.length >= 6
169
+ end
170
+
171
+ lines.empty? ? nil : lines.join("\n")
172
+ end
173
+ end
174
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mechanize"
4
+
5
+ module ScraperUtils
6
+ # Utilities for configuring and using Mechanize for web scraping
7
+ module MechanizeUtils
8
+ PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
9
+
10
+ # Creates and configures a Mechanize agent with optional proxy and timeout
11
+ #
12
+ # @param timeout [Integer, nil] Timeout for agent connections
13
+ # @param use_proxy [Boolean] Whether to use the Australian proxy (if configured)
14
+ # @return [Mechanize] Configured Mechanize agent
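+ # @example Creating an agent (illustrative only)
+ #   # Proxies via MORPH_AUSTRALIAN_PROXY only when use_proxy is true and the variable is set
+ #   agent = ScraperUtils::MechanizeUtils.mechanize_agent(timeout: 30, use_proxy: false)
+ #   page = agent.get("https://example.com/")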
15
+ def self.mechanize_agent(timeout: nil, use_proxy: true)
16
+ agent = Mechanize.new
17
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
18
+ use_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
19
+ if use_proxy
20
+ # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
21
+ # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
22
+ # the real password.
23
+ agent.agent.set_proxy(ScraperUtils.australian_proxy)
24
+ end
25
+ if timeout
26
+ agent.open_timeout = timeout
27
+ agent.read_timeout = timeout
28
+ end
29
+ public_ip(agent) if use_proxy
30
+ agent
31
+ end
32
+
33
+ # Returns if the Mechanize agent is using the proxy
34
+ def self.using_proxy?(agent)
35
+ !agent.agent.proxy_uri.nil?
36
+ end
37
+
38
+ # Checks if a page indicates a maintenance mode
39
+ #
40
+ # @param page [Mechanize::Page] The web page to check
41
+ # @return [String, nil] Maintenance message if found, otherwise nil
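+ # @example Aborting on a maintenance page (illustrative only)
+ #   message = ScraperUtils::MechanizeUtils.find_maintenance_message(page)
+ #   raise ScraperUtils::UnprocessableSite, message if message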
42
+ def self.find_maintenance_message(page)
43
+ # Use Nokogiri for parsing because earlier versions of Mechanize
44
+ # do not support the .search method on page objects
45
+ doc = Nokogiri::HTML(page.body)
46
+ doc.css("h1, title").each do |element|
47
+ text = element.inner_text
48
+ return "Maintenance: #{text}" if text&.match?(/maintenance/i)
49
+ end
50
+
51
+ # Not in maintenance mode
52
+ nil
53
+ end
54
+
55
+ # Retrieves and logs the public IP address
56
+ #
57
+ # @param agent [Mechanize] Mechanize agent to use for IP lookup
58
+ # @param force [Boolean] Force a new IP lookup, bypassing cache
59
+ # @return [String] The public IP address
60
+ def self.public_ip(agent, force: false)
61
+ @public_ip = nil if force
62
+ @public_ip ||=
63
+ begin
64
+ ip = agent.get(PUBLIC_IP_URL).body.strip
65
+ puts "Public IP: #{ip}"
66
+ ip
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ VERSION = "0.1.0"
5
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scraper_utils/authority_utils"
4
+ require "scraper_utils/db_utils"
5
+ require "scraper_utils/debug_utils"
6
+ require "scraper_utils/log_utils"
7
+ require "scraper_utils/mechanize_utils"
8
+ require "scraper_utils/version"
9
+
10
+ # Utilities for planningalerts scrapers
11
+ module ScraperUtils
12
+ # Constants for configuration on Morph.io
13
+ AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"
14
+
15
+ # Enable debug locally, not on morph.io
16
+ DEBUG_ENV_VAR = "DEBUG"
17
+
18
+ # Fatal Error
19
+ class Error < StandardError
20
+ end
21
+
22
+ # Fatal error with the site - retrying won't help
23
+ class UnprocessableSite < Error
24
+ end
25
+
26
+ # Content validation errors that should not be retried for that record,
27
+ # but other records may be processable
28
+ class UnprocessableRecord < Error
29
+ end
30
+
31
+ # Check if debug mode is enabled
32
+ #
33
+ # @return [Boolean] Whether debug mode is active
34
+ def self.debug?
35
+ !ENV[DEBUG_ENV_VAR].to_s.empty?
36
+ end
37
+
38
+ def self.australian_proxy
39
+ ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
40
+ ap.empty? ? nil : ap
41
+ end
42
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path("lib", __dir__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require "scraper_utils/version"
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = "scraper_utils"
9
+ spec.version = ScraperUtils::VERSION
10
+ spec.authors = ["Ian Heggie"]
11
+ spec.email = ["ian@heggie.biz"]
12
+ spec.required_ruby_version = ">= 2.5.1"
13
+
14
+ spec.summary = "planningalerts scraper utilities"
15
+ spec.description = "Utilities to help make planningalerts scrapers, " \
16
+ "especially multis easier to develop, run and debug."
17
+ spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
18
+ spec.license = "MIT"
19
+
20
+ if spec.respond_to?(:metadata)
21
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
22
+
23
+ spec.metadata["homepage_uri"] = spec.homepage
24
+ spec.metadata["source_code_uri"] = spec.homepage
25
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
26
+ else
27
+ raise "RubyGems 2.0 or newer is required to protect against " \
28
+ "public gem pushes."
29
+ end
30
+
31
+ # Specify which files should be added to the gem when it is released.
32
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
33
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
34
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
35
+ end
36
+ spec.bindir = "exe"
37
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
38
+ spec.require_paths = ["lib"]
39
+
40
+ spec.add_dependency "mechanize"
41
+ spec.add_dependency "nokogiri"
42
+ spec.add_dependency "sqlite3"
43
+
44
+ spec.add_development_dependency "rake"
45
+ spec.add_development_dependency "rspec"
46
+ spec.add_development_dependency "rubocop"
47
+ spec.add_development_dependency "simplecov"
48
+ spec.add_development_dependency "simplecov-console"
49
+ end
metadata ADDED
@@ -0,0 +1,178 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraper_utils
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ian Heggie
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-02-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: sqlite3
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov-console
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Utilities to help make planningalerts scrapers, especially multis easier
126
+ to develop, run and debug.
127
+ email:
128
+ - ian@heggie.biz
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".gitignore"
134
+ - ".rspec"
135
+ - ".rubocop.yml"
136
+ - ".travis.yml"
137
+ - Gemfile
138
+ - LICENSE.txt
139
+ - README.md
140
+ - Rakefile
141
+ - bin/console
142
+ - bin/setup
143
+ - lib/scraper_utils.rb
144
+ - lib/scraper_utils/authority_utils.rb
145
+ - lib/scraper_utils/db_utils.rb
146
+ - lib/scraper_utils/debug_utils.rb
147
+ - lib/scraper_utils/log_utils.rb
148
+ - lib/scraper_utils/mechanize_utils.rb
149
+ - lib/scraper_utils/version.rb
150
+ - scraper_utils.gemspec
151
+ homepage: https://github.com/ianheggie-oaf/scraper_utils
152
+ licenses:
153
+ - MIT
154
+ metadata:
155
+ allowed_push_host: https://rubygems.org
156
+ homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
157
+ source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
158
+ post_install_message:
159
+ rdoc_options: []
160
+ require_paths:
161
+ - lib
162
+ required_ruby_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 2.5.1
167
+ required_rubygems_version: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ requirements: []
173
+ rubyforge_project:
174
+ rubygems_version: 2.7.6.2
175
+ signing_key:
176
+ specification_version: 4
177
+ summary: planningalerts scraper utilities
178
+ test_files: []