scraper_utils 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: db12d36e0d3be635eba2c00dbe149f4d177ddc5e538a08fcd9038a026feaee91
+   data.tar.gz: 8d2f140b7fff7e02d90df19ac196018f8719cd73d85067519ee3e931f679f619
+ SHA512:
+   metadata.gz: 7138204493653a872aafcf4a1f8b78d8d5129c70d79a54d6ca10aa1440fc60362edc270522cc0d66c13a7694527a502d75d3dec36cb21f2240fceea85367eec4
+   data.tar.gz: 83dffaedd054ed40c7a269c4fd3db270892bc0fa20c4b7d1a904a075cb990bee51004ccb9c0cb86840d87a631655207301b8af1f7a5572f0893d9317a1b90aa5
data/.gitignore ADDED
@@ -0,0 +1,77 @@
+ *.gem
+ *.rbc
+ /.config
+ /coverage/
+ /InstalledFiles
+ /pkg/
+ /spec/reports/
+ /spec/examples.txt
+ /test/tmp/
+ /test/version_tmp/
+
+ # Temp files
+ ,*
+ *.bak
+ /tmp/
+
+ # IDEs and AI assistants
+ /.aider*
+ /.idea
+ /.vscode*
+
+ # Ignore vim files:
+ *~
+ *.swp
+ *.swo
+
+ # Ignore patch files
+ *.patch
+ *.rej
+
+ # Used by direnv / dotenv library to load environment variables.
+ .env*
+
+ # Ignore Byebug command history file.
+ .byebug_history
+
+ ## Specific to RubyMotion:
+ .dat*
+ .repl_history
+ build/
+ *.bridgesupport
+ build-iPhoneOS/
+ build-iPhoneSimulator/
+
+ ## Specific to RubyMotion (use of CocoaPods):
+ #
+ # We recommend against adding the Pods directory to your .gitignore. However
+ # you should judge for yourself, the pros and cons are mentioned at:
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
+ #
+ # vendor/Pods/
+
+ ## Documentation cache and generated files:
+ /.yardoc/
+ /_yardoc/
+ /doc/
+ /rdoc/
+
+ ## Environment normalization:
+ /.bundle/
+ /vendor/bundle
+ /lib/bundler/man/
+
+ # for a library or gem, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ Gemfile.lock
+ .ruby-version
+ .ruby-gemset
+
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+ .rvmrc
+
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
+ .rubocop-https?--*
+
+ # rspec reports
+ /.rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
+ --format documentation
+ --color
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,82 @@
+ AllCops:
+   Exclude:
+     - bin/*
+     # This is a temporary dumping ground for authority-specific
+     # code that we're probably just initially copying across from
+     # other scrapers. So, we don't care about the formatting and style
+     # initially.
+     # TODO: Remove this once we've removed all the code from here
+     - lib/technology_one_scraper/authority/*
+
+ # Bumping max line length to something a little more reasonable
+ Layout/LineLength:
+   Max: 100
+
+ # We prefer double quotes here, and we're making liberal use of multi-line
+ # strings, so it makes sense to enforce those to be consistent too
+ Style/StringLiterals:
+   EnforcedStyle: double_quotes
+   ConsistentQuotesInMultiline: true
+
+ # This one I disagree with. Putting separators in large numbers makes sense
+ # in some circumstances, but in others (for example, an id in a database table)
+ # it's just nonsensical. Also, I think this one might also be a bit US-centric.
+ Style/NumericLiterals:
+   Enabled: false
+
+ # Disable a bunch of metrics to do with code complexity. These are all
+ # a bit hard-nosed. Maybe after we've done a pass with Code Climate we
+ # can revisit these
+ Metrics/AbcSize:
+   Enabled: false
+
+ Metrics/BlockLength:
+   Enabled: false
+
+ Metrics/ClassLength:
+   Enabled: false
+
+ Metrics/CyclomaticComplexity:
+   Enabled: false
+
+ Metrics/MethodLength:
+   Enabled: false
+
+ Metrics/ModuleLength:
+   Enabled: false
+
+ Metrics/ParameterLists:
+   Enabled: false
+
+ Metrics/PerceivedComplexity:
+   Enabled: false
+
+ Layout/EmptyLinesAroundAttributeAccessor:
+   Enabled: true
+
+ Layout/SpaceAroundMethodCallOperator:
+   Enabled: true
+
+ Lint/DeprecatedOpenSSLConstant:
+   Enabled: true
+
+ Lint/RaiseException:
+   Enabled: true
+
+ Lint/StructNewOverride:
+   Enabled: true
+
+ Style/ExponentialNotation:
+   Enabled: true
+
+ Style/HashEachMethods:
+   Enabled: true
+
+ Style/HashTransformKeys:
+   Enabled: true
+
+ Style/HashTransformValues:
+   Enabled: true
+
+ Style/SlicingWithRange:
+   Enabled: true
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
+ ---
+ sudo: false
+ language: ruby
+ cache: bundler
+ rvm:
+   - 2.5.8
+ before_install: gem install bundler -v 1.17.3
data/Gemfile ADDED
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+
+ source "https://rubygems.org"
+
+ platform = if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("3.0.0")
+              :heroku16
+            elsif Gem::Version.new(RUBY_VERSION) < Gem::Version.new("3.3.0")
+              :heroku18
+            end
+
+ ruby case platform
+      when :heroku16 then "~> 2.5.8"
+      when :heroku18 then "~> 3.2.2"
+      else "~> 3.3.7"
+      end
+
+ gem "mechanize", platform && (platform == :heroku16 ? "~> 2.7.0" : "~> 2.8.5")
+ gem "nokogiri", platform && (platform == :heroku16 ? "~> 1.11.2" : "~> 1.15.0")
+ gem "sqlite3", platform && (platform == :heroku16 ? "~> 1.4.0" : "~> 1.6.3")
+
+ # Unable to list in gemspec - include it in your project's Gemfile when using this gem
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
+                    branch: "morph_defaults"
+
+ # development and test gems
+ gem "rake", platform && (platform == :heroku16 ? "~> 12.3.3" : "~> 13.0")
+ gem "rspec", platform && (platform == :heroku16 ? "~> 3.9.0" : "~> 3.12")
+ gem "rubocop", platform && (platform == :heroku16 ? "~> 0.80.0" : "~> 1.57")
+ gem "simplecov", platform && (platform == :heroku16 ? "~> 0.18.0" : "~> 0.22.0")
+ # gem "simplecov-console" listed in gemspec
+ gem "webmock", platform && (platform == :heroku16 ? "~> 3.14.0" : "~> 3.19.0")
+
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2025 Ian Heggie
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,271 @@
+ ScraperUtils (Ruby)
+ ===================
+
+ Utilities to help make planningalerts scrapers, especially multi-authority scrapers ("multis"), easier to develop, run and debug.
+
+ WARNING: This is still under development! Breaking changes may occur in version 0!
+
+ ## Installation
+
+ Add these lines to your application's Gemfile:
+
+ ```ruby
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+ gem "scraper_utils"
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself for testing:
+
+     $ gem install scraper_utils
+
+ ## Usage
+
+ ### Environment variables
+
+ Optionally filter authorities via environment variable in morph > scraper > settings or
+ in your dev environment:
+
+ ```bash
+ export MORPH_AUTHORITIES=noosa,wagga
+ ```
+
+ ### Example updated `scraper.rb` file
+
+ Update your `scraper.rb` as per the following example:
+
+ ```ruby
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ $LOAD_PATH << "./lib"
+
+ require "scraper_utils"
+ require "technology_one_scraper"
+
+ # Main Scraper class
+ class Scraper
+   AUTHORITIES = TechnologyOneScraper::AUTHORITIES
+
+   def self.scrape(authorities, attempt)
+     results = {}
+     authorities.each do |authority_label|
+       these_results = results[authority_label] = {}
+       begin
+         records_scraped = 0
+         unprocessable_records = 0
+         # Allow 5 + 10% unprocessable records
+         too_many_unprocessable = -5.0
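+         # How the tolerance works (illustrative): the counter starts at -5.0,
+         # each unprocessable record adds 1.0 and each good record subtracts 0.1,
+         # so the run only aborts once unprocessable records exceed
+         # 5 plus roughly 10% of the records scraped so far.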
+         use_proxy = AUTHORITIES[authority_label][:australian_proxy] && ScraperUtils.australian_proxy
+         next if attempt > 2 && !use_proxy
+
+         puts "",
+              "Collecting feed data for #{authority_label}, attempt: #{attempt}" \
+              "#{use_proxy ? ' (via proxy)' : ''} ..."
+         # Change scrape to accept a use_proxy flag and return an unprocessable flag;
+         # it should rescue ScraperUtils::UnprocessableRecord raised deeper in the
+         # scraping code and set unprocessable
+         TechnologyOneScraper.scrape(use_proxy, authority_label) do |record, unprocessable|
+           unless unprocessable
+             begin
+               record["authority_label"] = authority_label.to_s
+               ScraperUtils::DbUtils.save_record(record)
+             rescue ScraperUtils::UnprocessableRecord => e
+               # validation error
+               unprocessable = true
+               these_results[:error] = e
+             end
+           end
+           if unprocessable
+             unprocessable_records += 1
+             these_results[:unprocessable_records] = unprocessable_records
+             too_many_unprocessable += 1
+             raise "Too many unprocessable records" if too_many_unprocessable.positive?
+           else
+             records_scraped += 1
+             these_results[:records_scraped] = records_scraped
+             too_many_unprocessable -= 0.1
+           end
+         end
+       rescue StandardError => e
+         warn "#{authority_label}: ERROR: #{e}"
+         warn e.backtrace || "No backtrace available"
+         these_results[:error] = e
+       end
+     end
+     results
+   end
+
+   def self.selected_authorities
+     ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+   end
+
+   def self.run(authorities)
+     puts "Scraping authorities: #{authorities.join(', ')}"
+     start_time = Time.now
+     results = scrape(authorities, 1)
+     ScraperUtils::LogUtils.log_scraping_run(
+       start_time,
+       1,
+       authorities,
+       results
+     )
+
+     retry_errors = results.select do |_auth, result|
+       result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
+     end.keys
+
+     unless retry_errors.empty?
+       puts "",
+            "***************************************************"
+       puts "Now retrying authorities which earlier had failures"
+       puts retry_errors.join(", ").to_s
+       puts "***************************************************"
+
+       start_retry = Time.now
+       retry_results = scrape(retry_errors, 2)
+       ScraperUtils::LogUtils.log_scraping_run(
+         start_retry,
+         2,
+         retry_errors,
+         retry_results
+       )
+
+       retry_results.each do |auth, result|
+         unless result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
+           results[auth] = result
+         end
+       end
+       retry_no_proxy = retry_results.select do |_auth, result|
+         result[:used_proxy] && result[:error] &&
+           !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
+       end.keys
+
+       unless retry_no_proxy.empty?
+         puts "",
+              "*****************************************************************"
+         puts "Now retrying authorities which earlier had failures without proxy"
+         puts retry_no_proxy.join(", ").to_s
+         puts "*****************************************************************"
+
+         start_retry = Time.now
+         second_retry_results = scrape(retry_no_proxy, 3)
+         ScraperUtils::LogUtils.log_scraping_run(
+           start_retry,
+           3,
+           retry_no_proxy,
+           second_retry_results
+         )
+         second_retry_results.each do |auth, result|
+           unless result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
+             results[auth] = result
+           end
+         end
+       end
+     end
+
+     # Report on results, raising errors for unexpected conditions
+     ScraperUtils::LogUtils.report_on_results(authorities, results)
+   end
+ end
+
+ if __FILE__ == $PROGRAM_NAME
+   # Default to list of authorities we can't or won't fix in code, explain why
+   # wagga: url redirects and reports Application error, main site says to use NSW Planning Portal from 1 July 2021
+   #        which doesn't list any DA's for wagga wagga!
+
+   ENV["MORPH_EXPECT_BAD"] ||= "wagga"
+   Scraper.run(Scraper.selected_authorities)
+ end
+ ```
+
+ Then, deeper in your code, update the following:
+
+ * Change `scrape` to accept a `use_proxy` flag and yield an `unprocessable` flag with each record.
+ * It should rescue `ScraperUtils::UnprocessableRecord` raised deeper in the scraping code and
+   set and yield `unprocessable`, e.g. `TechnologyOneScraper.scrape(use_proxy, authority_label) do |record, unprocessable|`
+
+ ```ruby
+ require "scraper_utils"
+ # ...
+ module TechnologyOneScraper
+   # Note the extra parameter: use_proxy
+   def self.scrape(use_proxy, authority)
+     raise "Unexpected authority: #{authority}" unless AUTHORITIES.key?(authority)
+
+     scrape_period(use_proxy, AUTHORITIES[authority]) do |record, unprocessable|
+       yield record, unprocessable
+     end
+   end
+
+   # ... rest of code ...
+
+   # Note the extra parameters: use_proxy and timeout
+   def self.scrape_period(use_proxy,
+                          url:, period:, webguest: "P1.WEBGUEST", disable_ssl_certificate_check: false,
+                          australian_proxy: false, timeout: nil)
+     agent = ScraperUtils::MechanizeUtils.mechanize_agent(use_proxy: use_proxy, timeout: timeout)
+     agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if disable_ssl_certificate_check
+
+     # ... rest of code ...
+
+     # Update yield to return unprocessable as well as record
+
+   end
+   # ... rest of code ...
+ end
+ ```
+
+ ### Debugging Techniques
+
+ The following code will print debugging info if you set:
+
+ ```bash
+ export DEBUG=1
+ ```
+
+ Add the following immediately before requesting or examining pages:
+
+ ```ruby
+ require "scraper_utils"
+
+ # Debug an HTTP request
+ ScraperUtils::DebugUtils.debug_request(
+   "GET",
+   "https://example.com/planning-apps",
+   parameters: { year: 2023 },
+   headers: { "Accept" => "application/json" }
+ )
+
+ # Debug a web page
+ ScraperUtils::DebugUtils.debug_page(page, "Checking search results page")
+
+ # Debug a specific page selector
+ ScraperUtils::DebugUtils.debug_selector(page, '.results-table', "Looking for development applications")
+ ```
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies.
+ Then, run `rake spec` to run the tests.
+
+ You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`.
+
+ To release a new version, update the version number in `version.rb`, and
+ then run `bundle exec rake release`,
+ which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ianheggie-oaf/scraper_utils
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+
data/Rakefile ADDED
@@ -0,0 +1,9 @@
+ # frozen_string_literal: true
+
+ require "bundler/gem_tasks"
+
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "scraper_utils"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/lib/scraper_utils/authority_utils.rb ADDED
@@ -0,0 +1,28 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   # Utilities for managing and selecting authorities
+   module AuthorityUtils
+     AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
+
+     # Selects authorities based on environment variable or returns all authorities
+     #
+     # @param all_authorities [Array<Symbol>] Full list of available authorities
+     # @return [Array<Symbol>] Selected subset of authorities or all authorities
+     # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
+     def self.selected_authorities(all_authorities)
+       if ENV[AUTHORITIES_ENV_VAR]
+         authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
+         invalid = authorities - all_authorities
+         unless invalid.empty?
+           raise ScraperUtils::Error,
+                 "Invalid authorities specified in MORPH_AUTHORITIES: #{invalid.join(', ')}"
+         end
+
+         authorities
+       else
+         all_authorities
+       end
+     end
+   end
+ end
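A minimal usage sketch for `AuthorityUtils.selected_authorities`, based on the code above (the `AUTHORITIES` hash here is hypothetical):

```ruby
require "scraper_utils"

AUTHORITIES = { noosa: {}, wagga: {} } # hypothetical authority config

# With MORPH_AUTHORITIES unset, all keys are returned;
# with MORPH_AUTHORITIES=noosa only [:noosa] is returned,
# and unknown names raise ScraperUtils::Error.
authorities = ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
puts authorities.inspect
```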
data/lib/scraper_utils/db_utils.rb ADDED
@@ -0,0 +1,41 @@
+ # frozen_string_literal: true
+
+ require "scraperwiki"
+
+ module ScraperUtils
+   # Utilities for database operations in scrapers
+   module DbUtils
+     # Saves a record to the SQLite database with validation and logging
+     #
+     # @param record [Hash] The record to be saved
+     # @raise [ScraperUtils::UnprocessableRecord] If record fails validation
+     # @return [void]
+     def self.save_record(record)
+       # Validate required fields
+       required_fields = %w[council_reference address description info_url date_scraped]
+       required_fields.each do |field|
+         if record[field].to_s.empty?
+           raise ScraperUtils::UnprocessableRecord, "Missing required field: #{field}"
+         end
+       end
+
+       # Validate date formats
+       %w[date_scraped date_received on_notice_from on_notice_to].each do |date_field|
+         Date.parse(record[date_field]) if record[date_field]
+       rescue ArgumentError
+         raise ScraperUtils::UnprocessableRecord,
+               "Invalid date format for #{date_field}: #{record[date_field]}"
+       end
+
+       # Determine primary key based on presence of authority_label
+       primary_key = if record.key?("authority_label")
+                       %w[authority_label council_reference]
+                     else
+                       ["council_reference"]
+                     end
+
+       puts "Saving record #{record['council_reference']} - #{record['address']}"
+       ScraperWiki.save_sqlite(primary_key, record)
+     end
+   end
+ end
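A minimal sketch of calling `DbUtils.save_record`; the field values are invented for illustration, but the required fields match the validation above:

```ruby
require "date"
require "scraper_utils"

record = {
  "council_reference" => "DA2025/0001",            # required
  "address" => "1 Example St, Sydney NSW 2000",    # required
  "description" => "Dwelling alterations",         # required
  "info_url" => "https://example.com/DA2025-0001", # required
  "date_scraped" => Date.today.to_s                # required, must parse as a date
}

# Raises ScraperUtils::UnprocessableRecord if a required field is missing
# or a date field fails Date.parse; otherwise saves via ScraperWiki.save_sqlite.
ScraperUtils::DbUtils.save_record(record)
```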
data/lib/scraper_utils/debug_utils.rb ADDED
@@ -0,0 +1,76 @@
+ # frozen_string_literal: true
+
+ require "json"
+
+ module ScraperUtils
+   # Utilities for debugging web scraping processes
+   module DebugUtils
+     # Logs details of an HTTP request when debug mode is enabled
+     #
+     # @param method [String] HTTP method (GET, POST, etc.)
+     # @param url [String] Request URL
+     # @param parameters [Hash, nil] Optional request parameters
+     # @param headers [Hash, nil] Optional request headers
+     # @param body [Hash, nil] Optional request body
+     # @return [void]
+     def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
+       return unless ScraperUtils.debug?
+
+       puts "\n🔍 #{method.upcase} #{url}"
+       if parameters
+         puts "Parameters:"
+         puts JSON.pretty_generate(parameters)
+       end
+       if headers
+         puts "Headers:"
+         puts JSON.pretty_generate(headers)
+       end
+       return unless body
+
+       puts "Body:"
+       puts JSON.pretty_generate(body)
+     end
+
+     # Logs details of a web page when debug mode is enabled
+     #
+     # @param page [Mechanize::Page] The web page to debug
+     # @param message [String] Context or description for the debug output
+     # @return [void]
+     def self.debug_page(page, message)
+       return unless ScraperUtils.debug?
+
+       puts "",
+            "🔍 DEBUG: #{message}"
+       puts "Current URL: #{page.uri}"
+       puts "Page title: #{page.at('title').text.strip}" if page.at("title")
+       puts "",
+            "Page content:"
+       puts "-" * 40
+       puts page.body
+       puts "-" * 40
+     end
+
+     # Logs details about a specific page selector when debug mode is enabled
+     #
+     # @param page [Mechanize::Page] The web page to inspect
+     # @param selector [String] CSS selector to look for
+     # @param message [String] Context or description for the debug output
+     # @return [void]
+     def self.debug_selector(page, selector, message)
+       return unless ScraperUtils.debug?
+
+       puts "\n🔍 DEBUG: #{message}"
+       puts "Looking for selector: #{selector}"
+       element = page.at(selector)
+       if element
+         puts "Found element:"
+         puts element.to_html
+       else
+         puts "Element not found in:"
+         puts "-" * 40
+         puts page.body
+         puts "-" * 40
+       end
+     end
+   end
+ end
data/lib/scraper_utils/log_utils.rb ADDED
@@ -0,0 +1,174 @@
+ # frozen_string_literal: true
+
+ require "scraperwiki"
+
+ module ScraperUtils
+   # Utilities for logging scraper execution details and outcomes
+   module LogUtils
+     SUMMARY_TABLE = "scrape_summary"
+     LOG_TABLE = "scrape_log"
+     LOG_RETENTION_DAYS = 30
+
+     # Log details about a scraping run for one or more authorities
+     # @param start_time [Time] When this scraping attempt was started
+     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
+     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
+     # @param results [Hash] Results for each authority containing:
+     #   - :records_scraped [Integer] Number of records successfully scraped
+     #   - :unprocessable_records [Integer] Optional count of unprocessable records
+     #   - :error [Exception, nil] Any exception that occurred during scraping
+     #   - :proxy_used [Boolean] Whether a proxy was used
+     # @return [void]
+     def self.log_scraping_run(start_time, attempt, authorities, results)
+       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
+       raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?
+
+       end_time = Time.now
+       duration = (end_time - start_time).round(1)
+
+       successful = []
+       failed = []
+       interrupted = []
+
+       authorities.each do |authority_label|
+         result = results[authority_label] || {}
+
+         status = if result[:records_scraped]&.positive?
+                    result[:error] ? :interrupted : :successful
+                  else
+                    :failed
+                  end
+         case status
+         when :successful
+           successful << authority_label
+         when :interrupted
+           interrupted << authority_label
+         else
+           failed << authority_label
+         end
+
+         record = {
+           "run_at" => start_time.iso8601,
+           "attempt" => attempt,
+           "authority_label" => authority_label.to_s,
+           "records_scraped" => result[:records_scraped] || 0,
+           "unprocessable_records" => result[:unprocessable_records] || 0,
+           "used_proxy" => result[:proxy_used] ? 1 : 0,
+           "status" => status.to_s,
+           "error_message" => result[:error]&.message,
+           "error_class" => result[:error]&.class&.to_s,
+           "error_backtrace" => extract_meaningful_backtrace(result[:error])
+         }
+
+         save_log_record(record)
+       end
+
+       # Save summary record for the entire run
+       save_summary_record(
+         start_time,
+         attempt,
+         duration,
+         successful,
+         interrupted,
+         failed
+       )
+
+       cleanup_old_records
+     end
+
+     def self.report_on_results(authorities, results)
+       expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:to_sym) || []
+
+       puts "MORPH_EXPECT_BAD=#{ENV['MORPH_EXPECT_BAD']}" if expect_bad.any?
+
+       errors = []
+
+       # Check for authorities that were expected to be bad but are now working
+       unexpected_working = expect_bad.select do |authority|
+         result = results[authority]
+         result && result[:records_scraped]&.positive? && result[:error].nil?
+       end
+
+       if unexpected_working.any?
+         errors << "WARNING: Remove #{unexpected_working.join(',')} from EXPECT_BAD as it now works!"
+       end
+
+       # Check for authorities with unexpected errors
+       unexpected_errors = authorities
+                           .select { |authority| results[authority]&.dig(:error) }
+                           .reject { |authority| expect_bad.include?(authority) }
+
+       if unexpected_errors.any?
+         errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
+                   "(Add to MORPH_EXPECT_BAD?)"
+         unexpected_errors.each do |authority|
+           error = results[authority][:error]
+           errors << "  #{authority}: #{error.class} - #{error.message}"
+         end
+       end
+
+       if errors.any?
+         errors << "See earlier output for details"
+         raise errors.join("\n")
+       end
+
+       puts "Exiting with OK status!"
+     end
+
+     def self.save_log_record(record)
+       ScraperWiki.save_sqlite(
+         %w[authority_label run_at],
+         record,
+         LOG_TABLE
+       )
+     end
+
+     def self.save_summary_record(start_time, attempt, duration,
+                                  successful, interrupted, failed)
+       summary = {
+         "run_at" => start_time.iso8601,
+         "attempt" => attempt,
+         "duration" => duration,
+         "successful" => successful.join(","),
+         "failed" => failed.join(","),
+         "interrupted" => interrupted.join(","),
+         "successful_count" => successful.size,
+         "interrupted_count" => interrupted.size,
+         "failed_count" => failed.size
+       }
+
+       ScraperWiki.save_sqlite(
+         ["run_at"],
+         summary,
+         SUMMARY_TABLE
+       )
+     end
+
+     def self.cleanup_old_records(force: false)
+       cutoff = (Date.today - LOG_RETENTION_DAYS).to_s
+       return if !force && @last_cutoff == cutoff
+
+       @last_cutoff = cutoff
+
+       [SUMMARY_TABLE, LOG_TABLE].each do |table|
+         ScraperWiki.sqliteexecute(
+           "DELETE FROM #{table} WHERE date(run_at) < date(?)",
+           [cutoff]
+         )
+       end
+     end
+
+     # Extracts a meaningful backtrace - keeps the first frames, skips /vendor/ frames
+     # after that, and returns at most 6 lines in total
+     def self.extract_meaningful_backtrace(error)
+       return nil unless error.respond_to?(:backtrace) && error&.backtrace
+
+       lines = []
+       error.backtrace.each do |line|
+         lines << line if lines.length < 2 || !line.include?("/vendor/")
+         break if lines.length >= 6
+       end
+
+       lines.empty? ? nil : lines.join("\n")
+     end
+   end
+ end
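A sketch of the results hash `LogUtils.log_scraping_run` expects, using made-up figures; the keys match the doc comment above:

```ruby
require "scraper_utils"

start_time = Time.now
# ... run the scrape ...
results = {
  noosa: { records_scraped: 10, proxy_used: false },
  wagga: { records_scraped: 0, error: RuntimeError.new("timeout") }
}

ScraperUtils::LogUtils.log_scraping_run(start_time, 1, %i[noosa wagga], results)
# Raises with a summary if any authority not listed in MORPH_EXPECT_BAD failed:
ScraperUtils::LogUtils.report_on_results(%i[noosa wagga], results)
```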
data/lib/scraper_utils/mechanize_utils.rb ADDED
@@ -0,0 +1,70 @@
+ # frozen_string_literal: true
+
+ require "mechanize"
+
+ module ScraperUtils
+   # Utilities for configuring and using Mechanize for web scraping
+   module MechanizeUtils
+     PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
+
+     # Creates and configures a Mechanize agent with optional proxy and timeout
+     #
+     # @param timeout [Integer, nil] Timeout for agent connections
+     # @param use_proxy [Boolean] Whether to use the Australian proxy (when configured)
+     # @return [Mechanize] Configured Mechanize agent
+     def self.mechanize_agent(timeout: nil, use_proxy: true)
+       agent = Mechanize.new
+       agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       use_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
+       if use_proxy
+         # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
+         # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
+         # the real password.
+         agent.agent.set_proxy(ScraperUtils.australian_proxy)
+       end
+       if timeout
+         agent.open_timeout = timeout
+         agent.read_timeout = timeout
+       end
+       public_ip(agent) if use_proxy
+       agent
+     end
+
+     # Returns whether the Mechanize agent is using the proxy
+     def self.using_proxy?(agent)
+       !agent.agent.proxy_uri.nil?
+     end
+
+     # Checks if a page indicates a maintenance mode
+     #
+     # @param page [Mechanize::Page] The web page to check
+     # @return [String, nil] Maintenance message if found, otherwise nil
+     def self.find_maintenance_message(page)
+       # Use Nokogiri for parsing because earlier versions of Mechanize
+       # do not support the .search method on page objects
+       doc = Nokogiri::HTML(page.body)
+       doc.css("h1, title").each do |element|
+         text = element.inner_text
+         return "Maintenance: #{text}" if text&.match?(/maintenance/i)
+       end
+
+       # Not in maintenance mode
+       nil
+     end
+
+     # Retrieves and logs the public IP address
+     #
+     # @param agent [Mechanize] Mechanize agent to use for IP lookup
+     # @param force [Boolean] Force a new IP lookup, bypassing cache
+     # @return [String] The public IP address
+     def self.public_ip(agent, force: false)
+       @public_ip = nil if force
+       @public_ip ||=
+         begin
+           ip = agent.get(PUBLIC_IP_URL).body.strip
+           puts "Public IP: #{ip}"
+           ip
+         end
+     end
+   end
+ end
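A sketch of configuring an agent and checking a page for maintenance mode, per the methods above (the URL is a placeholder):

```ruby
require "scraper_utils"

# use_proxy is only honoured when MORPH_AUSTRALIAN_PROXY is set
agent = ScraperUtils::MechanizeUtils.mechanize_agent(timeout: 30, use_proxy: false)
page = agent.get("https://example.com/planning-apps") # placeholder URL

if (message = ScraperUtils::MechanizeUtils.find_maintenance_message(page))
  raise ScraperUtils::UnprocessableSite, message
end
```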
data/lib/scraper_utils/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   VERSION = "0.1.0"
+ end
data/lib/scraper_utils.rb ADDED
@@ -0,0 +1,42 @@
+ # frozen_string_literal: true
+
+ require "scraper_utils/authority_utils"
+ require "scraper_utils/db_utils"
+ require "scraper_utils/debug_utils"
+ require "scraper_utils/log_utils"
+ require "scraper_utils/mechanize_utils"
+ require "scraper_utils/version"
+
+ # Utilities for planningalerts scrapers
+ module ScraperUtils
+   # Constants for configuration on Morph.io
+   AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"
+
+   # Enable debug locally, not on morph.io
+   DEBUG_ENV_VAR = "DEBUG"
+
+   # Fatal error
+   class Error < StandardError
+   end
+
+   # Fatal error with the site - retrying won't help
+   class UnprocessableSite < Error
+   end
+
+   # Content validation errors that should not be retried for that record,
+   # but other records may be processable
+   class UnprocessableRecord < Error
+   end
+
+   # Check if debug mode is enabled
+   #
+   # @return [Boolean] Whether debug mode is active
+   def self.debug?
+     !ENV[DEBUG_ENV_VAR].to_s.empty?
+   end
+
+   def self.australian_proxy
+     ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
+     ap.empty? ? nil : ap
+   end
+ end
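For reference, a sketch of how the two environment toggles defined above behave:

```ruby
require "scraper_utils"

ENV["DEBUG"] = "1"
ScraperUtils.debug?           # => true (any non-empty value enables debug output)

ENV["MORPH_AUSTRALIAN_PROXY"] = "http://morph:password@au.proxy.example:8888" # placeholder
ScraperUtils.australian_proxy # => the proxy URL string, or nil when unset/empty
```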
data/scraper_utils.gemspec ADDED
@@ -0,0 +1,49 @@
+ # frozen_string_literal: true
+
+ lib = File.expand_path("lib", __dir__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require "scraper_utils/version"
+
+ Gem::Specification.new do |spec|
+   spec.name = "scraper_utils"
+   spec.version = ScraperUtils::VERSION
+   spec.authors = ["Ian Heggie"]
+   spec.email = ["ian@heggie.biz"]
+   spec.required_ruby_version = ">= 2.5.1"
+
+   spec.summary = "planningalerts scraper utilities"
+   spec.description = "Utilities to help make planningalerts scrapers, " \
+                      "especially multis easier to develop, run and debug."
+   spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
+   spec.license = "MIT"
+
+   if spec.respond_to?(:metadata)
+     spec.metadata["allowed_push_host"] = "https://rubygems.org"
+
+     spec.metadata["homepage_uri"] = spec.homepage
+     spec.metadata["source_code_uri"] = spec.homepage
+     # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
+   else
+     raise "RubyGems 2.0 or newer is required to protect against " \
+           "public gem pushes."
+   end
+
+   # Specify which files should be added to the gem when it is released.
+   # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
+   spec.files = Dir.chdir(File.expand_path(__dir__)) do
+     `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   end
+   spec.bindir = "exe"
+   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "mechanize"
+   spec.add_dependency "nokogiri"
+   spec.add_dependency "sqlite3"
+
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rspec"
+   spec.add_development_dependency "rubocop"
+   spec.add_development_dependency "simplecov"
+   spec.add_development_dependency "simplecov-console"
+ end
metadata ADDED
@@ -0,0 +1,178 @@
+ --- !ruby/object:Gem::Specification
+ name: scraper_utils
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Ian Heggie
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2025-02-22 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: sqlite3
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rubocop
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: simplecov
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: simplecov-console
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Utilities to help make planningalerts scrapers, especially multis easier
+   to develop, run and debug.
+ email:
+ - ian@heggie.biz
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".rubocop.yml"
+ - ".travis.yml"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/console
+ - bin/setup
+ - lib/scraper_utils.rb
+ - lib/scraper_utils/authority_utils.rb
+ - lib/scraper_utils/db_utils.rb
+ - lib/scraper_utils/debug_utils.rb
+ - lib/scraper_utils/log_utils.rb
+ - lib/scraper_utils/mechanize_utils.rb
+ - lib/scraper_utils/version.rb
+ - scraper_utils.gemspec
+ homepage: https://github.com/ianheggie-oaf/scraper_utils
+ licenses:
+ - MIT
+ metadata:
+   allowed_push_host: https://rubygems.org
+   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
+   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 2.5.1
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.7.6.2
+ signing_key:
+ specification_version: 4
+ summary: planningalerts scraper utilities
+ test_files: []