scraper_utils 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: db12d36e0d3be635eba2c00dbe149f4d177ddc5e538a08fcd9038a026feaee91
4
+ data.tar.gz: 8d2f140b7fff7e02d90df19ac196018f8719cd73d85067519ee3e931f679f619
5
+ SHA512:
6
+ metadata.gz: 7138204493653a872aafcf4a1f8b78d8d5129c70d79a54d6ca10aa1440fc60362edc270522cc0d66c13a7694527a502d75d3dec36cb21f2240fceea85367eec4
7
+ data.tar.gz: 83dffaedd054ed40c7a269c4fd3db270892bc0fa20c4b7d1a904a075cb990bee51004ccb9c0cb86840d87a631655207301b8af1f7a5572f0893d9317a1b90aa5
data/.gitignore ADDED
@@ -0,0 +1,77 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+
12
+ # Temp files
13
+ ,*
14
+ *.bak
15
+ /tmp/
16
+
17
+ # IDEs and AI assistants
18
+ /.aider*
19
+ /.idea
20
+ /.vscode*
21
+
22
+ # Ignore vim files:
23
+ *~
24
+ *.swp
25
+ *.swo
26
+
27
+ # Ignore patch files
28
+ *.patch
29
+ *.rej
30
+
31
+ # Used by direnv / dotenv library to load environment variables.
32
+ .env*
33
+
34
+ # Ignore Byebug command history file.
35
+ .byebug_history
36
+
37
+ ## Specific to RubyMotion:
38
+ .dat*
39
+ .repl_history
40
+ build/
41
+ *.bridgesupport
42
+ build-iPhoneOS/
43
+ build-iPhoneSimulator/
44
+
45
+ ## Specific to RubyMotion (use of CocoaPods):
46
+ #
47
+ # We recommend against adding the Pods directory to your .gitignore. However
48
+ # you should judge for yourself, the pros and cons are mentioned at:
49
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
50
+ #
51
+ # vendor/Pods/
52
+
53
+ ## Documentation cache and generated files:
54
+ /.yardoc/
55
+ /_yardoc/
56
+ /doc/
57
+ /rdoc/
58
+
59
+ ## Environment normalization:
60
+ /.bundle/
61
+ /vendor/bundle
62
+ /lib/bundler/man/
63
+
64
+ # for a library or gem, you might want to ignore these files since the code is
65
+ # intended to run in multiple environments; otherwise, check them in:
66
+ Gemfile.lock
67
+ .ruby-version
68
+ .ruby-gemset
69
+
70
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
71
+ .rvmrc
72
+
73
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
74
+ .rubocop-https?--*
75
+
76
+ # rspec reports
77
+ /.rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1,82 @@
1
+ AllCops:
2
+ Exclude:
3
+ - bin/*
4
+ # This is a temporary dumping ground for authority specific
5
+ # code that we're probably just initially copying across from
6
+ # other scrapers. So, we don't care about the formatting and style
7
+ # initially.
8
+ # TODO: Remove this once we've removed all the code from here
9
+ - lib/technology_one_scraper/authority/*
10
+
11
+ # Bumping max line length to something a little more reasonable
12
+ Layout/LineLength:
13
+ Max: 100
14
+
15
+ # We prefer double quotes here and we're making liberal use of multi-line
16
+ # strings, so it makes sense to enforce those to be consistent too
17
+ Style/StringLiterals:
18
+ EnforcedStyle: double_quotes
19
+ ConsistentQuotesInMultiline: true
20
+
21
+ # This one I disagree with. Putting separators in large numbers makes sense
22
+ # in some circumstances but in others (e.g. an id in a database table)
23
+ # it's just nonsensical. Also, I think this one might be a bit US-centric.
24
+ Style/NumericLiterals:
25
+ Enabled: false
26
+
27
+ # Disable a bunch of metrics to do with code complexity. These are all
28
+ # a bit hard-nosed. Maybe after we've done a pass with Code Climate we
29
+ # can revisit these
30
+ Metrics/AbcSize:
31
+ Enabled: false
32
+
33
+ Metrics/BlockLength:
34
+ Enabled: false
35
+
36
+ Metrics/ClassLength:
37
+ Enabled: false
38
+
39
+ Metrics/CyclomaticComplexity:
40
+ Enabled: false
41
+
42
+ Metrics/MethodLength:
43
+ Enabled: false
44
+
45
+ Metrics/ModuleLength:
46
+ Enabled: false
47
+
48
+ Metrics/ParameterLists:
49
+ Enabled: false
50
+
51
+ Metrics/PerceivedComplexity:
52
+ Enabled: false
53
+
54
+ Layout/EmptyLinesAroundAttributeAccessor:
55
+ Enabled: true
56
+
57
+ Layout/SpaceAroundMethodCallOperator:
58
+ Enabled: true
59
+
60
+ Lint/DeprecatedOpenSSLConstant:
61
+ Enabled: true
62
+
63
+ Lint/RaiseException:
64
+ Enabled: true
65
+
66
+ Lint/StructNewOverride:
67
+ Enabled: true
68
+
69
+ Style/ExponentialNotation:
70
+ Enabled: true
71
+
72
+ Style/HashEachMethods:
73
+ Enabled: true
74
+
75
+ Style/HashTransformKeys:
76
+ Enabled: true
77
+
78
+ Style/HashTransformValues:
79
+ Enabled: true
80
+
81
+ Style/SlicingWithRange:
82
+ Enabled: true
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.8
7
+ before_install: gem install bundler -v 1.17.3
data/Gemfile ADDED
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ platform = if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("3.0.0")
6
+ :heroku16
7
+ elsif Gem::Version.new(RUBY_VERSION) < Gem::Version.new("3.3.0")
8
+ :heroku18
9
+ end
10
+
11
+ ruby case platform
12
+ when :heroku16 then "~> 2.5.8"
13
+ when :heroku18 then "~> 3.2.2"
14
+ else "~> 3.3.7"
15
+ end
16
+
17
+ gem "mechanize", platform && (platform == :heroku16 ? "~> 2.7.0" : "~> 2.8.5")
18
+ gem "nokogiri", platform && (platform == :heroku16 ? "~> 1.11.2" : "~> 1.15.0")
19
+ gem "sqlite3", platform && (platform == :heroku16 ? "~> 1.4.0" : "~> 1.6.3")
20
+
21
+ # Unable to list in gemspec - Include it in your project's Gemfile when using this gem
22
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git",
23
+ branch: "morph_defaults"
24
+
25
+ # development and test gems
26
+ gem "rake", platform && (platform == :heroku16 ? "~> 12.3.3" : "~> 13.0")
27
+ gem "rspec", platform && (platform == :heroku16 ? "~> 3.9.0" : "~> 3.12")
28
+ gem "rubocop", platform && (platform == :heroku16 ? "~> 0.80.0" : "~> 1.57")
29
+ gem "simplecov", platform && (platform == :heroku16 ? "~> 0.18.0" : "~> 0.22.0")
30
+ # gem "simplecov-console" listed in gemspec
31
+ gem "webmock", platform && (platform == :heroku16 ? "~> 3.14.0" : "~> 3.19.0")
32
+
33
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Ian Heggie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,271 @@
1
+ ScraperUtils (Ruby)
2
+ ===================
3
+
4
+ Utilities to help make planningalerts scrapers, especially multis, easier to develop, run and debug.
5
+
6
+ WARNING: This is still under development! Breaking changes may occur in version 0!
7
+
8
+ ## Installation
9
+
10
+ Add these lines to your application's Gemfile:
11
+
12
+ ```ruby
13
+ gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
14
+ gem "scraper_utils"
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself for testing:
22
+
23
+ $ gem install scraper_utils
24
+
25
+ ## Usage
26
+
27
+ ### Environment variables
28
+
29
+ Optionally filter authorities via an environment variable in morph > scraper > settings or
30
+ in your dev environment:
31
+
32
+ ```bash
33
+ export MORPH_AUTHORITIES=noosa,wagga
34
+ ```
35
+
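+ For example, a minimal sketch using `ScraperUtils::AuthorityUtils.selected_authorities` (here `AUTHORITIES` is assumed to be your scraper's own hash of authority configurations):
+ 
+ ```ruby
+ # Returns only the authorities named in MORPH_AUTHORITIES when it is set,
+ # otherwise all of them; unknown names raise ScraperUtils::Error.
+ authorities = ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+ puts "Scraping: #{authorities.join(', ')}"
+ ```
+ 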
36
+ ### Example updated `scraper.rb` file
37
+
38
+ Update your `scraper.rb` as per the following example:
39
+
40
+ ```ruby
41
+ #!/usr/bin/env ruby
42
+ # frozen_string_literal: true
43
+
44
+ $LOAD_PATH << "./lib"
45
+
46
+ require "scraper_utils"
47
+ require "technology_one_scraper"
48
+
49
+ # Main Scraper class
50
+ class Scraper
51
+ AUTHORITIES = TechnologyOneScraper::AUTHORITIES
52
+
53
+ def self.scrape(authorities, attempt)
54
+ results = {}
55
+ authorities.each do |authority_label|
56
+ these_results = results[authority_label] = {}
57
+ begin
58
+ records_scraped = 0
59
+ unprocessable_records = 0
60
+ # Allow 5 + 10% unprocessable records
61
+ too_many_unprocessable = -5.0
62
+ use_proxy = AUTHORITIES[authority_label][:australian_proxy] && ScraperUtils.australian_proxy
63
+ next if attempt > 2 && !use_proxy
64
+
65
+ puts "",
66
+ "Collecting feed data for #{authority_label}, attempt: #{attempt}" \
67
+ "#{use_proxy ? ' (via proxy)' : ''} ..."
68
+ # Change scrape to accept a use_proxy flag and return an unprocessable flag
69
+ # it should rescue ScraperUtils::UnprocessableRecord raised deeper in the scraping code and
70
+ # set unprocessable
71
+ TechnologyOneScraper.scrape(use_proxy, authority_label) do |record, unprocessable|
72
+ unless unprocessable
73
+ begin
74
+ record["authority_label"] = authority_label.to_s
75
+ ScraperUtils::DbUtils.save_record(record)
76
+ rescue ScraperUtils::UnprocessableRecord => e
77
+ # validation error
78
+ unprocessable = true
79
+ these_results[:error] = e
80
+ end
81
+ end
82
+ if unprocessable
83
+ unprocessable_records += 1
84
+ these_results[:unprocessable_records] = unprocessable_records
85
+ too_many_unprocessable += 1
86
+ raise "Too many unprocessable records" if too_many_unprocessable.positive?
87
+ else
88
+ records_scraped += 1
89
+ these_results[:records_scraped] = records_scraped
90
+ too_many_unprocessable -= 0.1
91
+ end
92
+ end
93
+ rescue StandardError => e
94
+ warn "#{authority_label}: ERROR: #{e}"
95
+ warn e.backtrace || "No backtrace available"
96
+ these_results[:error] = e
97
+ end
98
+ end
99
+ results
100
+ end
101
+
102
+ def self.selected_authorities
103
+ ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
104
+ end
105
+
106
+ def self.run(authorities)
107
+ puts "Scraping authorities: #{authorities.join(', ')}"
108
+ start_time = Time.now
109
+ results = scrape(authorities, 1)
110
+ ScraperUtils::LogUtils.log_scraping_run(
111
+ start_time,
112
+ 1,
113
+ authorities,
114
+ results
115
+ )
116
+
117
+ retry_errors = results.select do |_auth, result|
118
+ result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
119
+ end.keys
120
+
121
+ unless retry_errors.empty?
122
+ puts "",
123
+ "***************************************************"
124
+ puts "Now retrying authorities which earlier had failures"
125
+ puts retry_errors.join(", ").to_s
126
+ puts "***************************************************"
127
+
128
+ start_retry = Time.now
129
+ retry_results = scrape(retry_errors, 2)
130
+ ScraperUtils::LogUtils.log_scraping_run(
131
+ start_retry,
132
+ 2,
133
+ retry_errors,
134
+ retry_results
135
+ )
136
+
137
+ retry_results.each do |auth, result|
138
+ unless result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
139
+ results[auth] = result
140
+ end
141
+ end
142
+ retry_no_proxy = retry_results.select do |_auth, result|
143
+ result[:used_proxy] && result[:error] &&
144
+ !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
145
+ end.keys
146
+
147
+ unless retry_no_proxy.empty?
148
+ puts "",
149
+ "*****************************************************************"
150
+ puts "Now retrying authorities which earlier had failures without proxy"
151
+ puts retry_no_proxy.join(", ").to_s
152
+ puts "*****************************************************************"
153
+
154
+ start_retry = Time.now
155
+ second_retry_results = scrape(retry_no_proxy, 3)
156
+ ScraperUtils::LogUtils.log_scraping_run(
157
+ start_retry,
158
+ 3,
159
+ retry_no_proxy,
160
+ second_retry_results
161
+ )
162
+ second_retry_results.each do |auth, result|
163
+ unless result[:error] && !result[:error].is_a?(ScraperUtils::UnprocessableRecord)
164
+ results[auth] = result
165
+ end
166
+ end
167
+ end
168
+ end
169
+
170
+ # Report on results, raising errors for unexpected conditions
171
+ ScraperUtils::LogUtils.report_on_results(authorities, results)
172
+ end
173
+ end
174
+
175
+ if __FILE__ == $PROGRAM_NAME
176
+ # Default to list of authorities we can't or won't fix in code, explain why
177
+ # wagga: url redirects and reports Application error, main site says to use NSW Planning Portal from 1 July 2021
178
+ # which doesn't list any DA's for wagga wagga!
179
+
180
+ ENV["MORPH_EXPECT_BAD"] ||= "wagga"
181
+ Scraper.run(Scraper.selected_authorities)
182
+ end
183
+ ```
184
+
185
+ Then deeper in your code update:
186
+
187
+ * Change scrape to accept a `use_proxy` flag and return an `unprocessable` flag
188
+ * it should rescue `ScraperUtils::UnprocessableRecord` raised deeper in the scraping code and
189
+ set and yield `unprocessable`, e.g. `TechnologyOneScraper.scrape(use_proxy, authority_label) do |record, unprocessable|`
190
+
191
+ ```ruby
192
+ require "scraper_utils"
193
+ #...
194
+ module TechnologyOneScraper
195
+ # Note the extra parameter: use_proxy
196
+ def self.scrape(use_proxy, authority)
197
+ raise "Unexpected authority: #{authority}" unless AUTHORITIES.key?(authority)
198
+
199
+ scrape_period(use_proxy, AUTHORITIES[authority]) do |record, unprocessable|
200
+ yield record, unprocessable
201
+ end
202
+ end
203
+
204
+ # ... rest of code ...
205
+
206
+ # Note the extra parameters: use_proxy and timeout
207
+ def self.scrape_period(use_proxy,
208
+ url:, period:, webguest: "P1.WEBGUEST", disable_ssl_certificate_check: false,
209
+ australian_proxy: false, timeout: nil
210
+ )
211
+ agent = ScraperUtils::MechanizeUtils.mechanize_agent(use_proxy: use_proxy, timeout: timeout)
212
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE if disable_ssl_certificate_check
213
+
214
+ # ... rest of code ...
215
+
216
+ # Update yield to return unprocessable as well as record
217
+
218
+ end
219
+ # ... rest of code ...
220
+ end
221
+ ```
222
+
223
+ ### Debugging Techniques
224
+
225
+ The following code will print debugging info if you set:
226
+
227
+ ```bash
228
+ export DEBUG=1
229
+ ```
230
+
231
+ Add the following immediately before requesting or examining pages:
232
+
233
+ ```ruby
234
+ require 'scraper_utils'
235
+
236
+ # Debug an HTTP request
237
+ ScraperUtils::DebugUtils.debug_request(
238
+ "GET",
239
+ "https://example.com/planning-apps",
240
+ parameters: { year: 2023 },
241
+ headers: { "Accept" => "application/json" }
242
+ )
243
+
244
+ # Debug a web page
245
+ ScraperUtils::DebugUtils.debug_page(page, "Checking search results page")
246
+
247
+ # Debug a specific page selector
248
+ ScraperUtils::DebugUtils.debug_selector(page, '.results-table', "Looking for development applications")
249
+ ```
250
+
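+ These helpers only produce output when debug mode is enabled; a minimal sketch of the check they all use (`ScraperUtils.debug?`):
+ 
+ ```ruby
+ # True when the DEBUG environment variable is set to a non-empty value
+ ScraperUtils.debug? # => true after `export DEBUG=1`, false otherwise
+ ```
+ 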
251
+ ## Development
252
+
253
+ After checking out the repo, run `bin/setup` to install dependencies.
254
+ Then, run `rake spec` to run the tests.
255
+
256
+ You can also run `bin/console` for an interactive prompt that will allow you to experiment.
257
+
258
+ To install this gem onto your local machine, run `bundle exec rake install`.
259
+
260
+ To release a new version, update the version number in `version.rb`, and
261
+ then run `bundle exec rake release`,
262
+ which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
263
+
264
+ ## Contributing
265
+
266
+ Bug reports and pull requests are welcome on GitHub at https://github.com/ianheggie-oaf/scraper_utils
267
+
268
+ ## License
269
+
270
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
271
+
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+
5
+ require "rspec/core/rake_task"
6
+
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "scraper_utils"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ # Utilities for managing and selecting authorities
5
+ module AuthorityUtils
6
+ # Selects authorities based on environment variable or returns all authorities
7
+ #
8
+ # @param all_authorities [Array<Symbol>] Full list of available authorities
9
+ # @return [Array<Symbol>] Selected subset of authorities or all authorities
10
+ # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
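+ # @example Selecting authorities (illustrative usage sketch)
+ #   # With MORPH_AUTHORITIES=noosa,wagga only those two symbols are returned;
+ #   # with it unset, the full list is returned unchanged.
+ #   ScraperUtils::AuthorityUtils.selected_authorities(%i[noosa wagga other])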
11
+ AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
12
+
13
+ def self.selected_authorities(all_authorities)
14
+ if ENV[AUTHORITIES_ENV_VAR]
15
+ authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
16
+ invalid = authorities - all_authorities
17
+ unless invalid.empty?
18
+ raise ScraperUtils::Error,
19
+ "Invalid authorities specified in MORPH_AUTHORITIES: #{invalid.join(', ')}"
20
+ end
21
+
22
+ authorities
23
+ else
24
+ all_authorities
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scraperwiki"
4
+
5
+ module ScraperUtils
6
+ # Utilities for database operations in scrapers
7
+ module DbUtils
8
+ # Saves a record to the SQLite database with validation and logging
9
+ #
10
+ # @param record [Hash] The record to be saved
11
+ # @raise [ScraperUtils::UnprocessableRecord] If record fails validation
12
+ # @return [void]
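+ # @example Saving a record (illustrative only - field values are made up)
+ #   ScraperUtils::DbUtils.save_record(
+ #     "council_reference" => "DA/2025/0001",
+ #     "address" => "1 Example Street, Example NSW 2000",
+ #     "description" => "Example development application",
+ #     "info_url" => "https://example.com/da/2025/0001",
+ #     "date_scraped" => Date.today.to_s
+ #   )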
13
+ def self.save_record(record)
14
+ # Validate required fields
15
+ required_fields = %w[council_reference address description info_url date_scraped]
16
+ required_fields.each do |field|
17
+ if record[field].to_s.empty?
18
+ raise ScraperUtils::UnprocessableRecord, "Missing required field: #{field}"
19
+ end
20
+ end
21
+
22
+ # Validate date formats
23
+ %w[date_scraped date_received on_notice_from on_notice_to].each do |date_field|
24
+ Date.parse(record[date_field]) if record[date_field]
25
+ rescue ArgumentError
26
+ raise ScraperUtils::UnprocessableRecord,
27
+ "Invalid date format for #{date_field}: #{record[date_field]}"
28
+ end
29
+
30
+ # Determine primary key based on presence of authority_label
31
+ primary_key = if record.key?("authority_label")
32
+ %w[authority_label council_reference]
33
+ else
34
+ ["council_reference"]
35
+ end
36
+
37
+ puts "Saving record #{record['council_reference']} - #{record['address']}"
38
+ ScraperWiki.save_sqlite(primary_key, record)
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module ScraperUtils
6
+ # Utilities for debugging web scraping processes
7
+ module DebugUtils
8
+ # Logs details of an HTTP request when debug mode is enabled
9
+ #
10
+ # @param method [String] HTTP method (GET, POST, etc.)
11
+ # @param url [String] Request URL
12
+ # @param parameters [Hash, nil] Optional request parameters
13
+ # @param headers [Hash, nil] Optional request headers
14
+ # @param body [Hash, nil] Optional request body
15
+ # @return [void]
16
+ def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
17
+ return unless ScraperUtils.debug?
18
+
19
+ puts "\n🔍 #{method.upcase} #{url}"
20
+ if parameters
21
+ puts "Parameters:"
22
+ puts JSON.pretty_generate(parameters)
23
+ end
24
+ if headers
25
+ puts "Headers:"
26
+ puts JSON.pretty_generate(headers)
27
+ end
28
+ return unless body
29
+
30
+ puts "Body:"
31
+ puts JSON.pretty_generate(body)
32
+ end
33
+
34
+ # Logs details of a web page when debug mode is enabled
35
+ #
36
+ # @param page [Mechanize::Page] The web page to debug
37
+ # @param message [String] Context or description for the debug output
38
+ # @return [void]
39
+ def self.debug_page(page, message)
40
+ return unless ScraperUtils.debug?
41
+
42
+ puts "",
43
+ "🔍 DEBUG: #{message}"
44
+ puts "Current URL: #{page.uri}"
45
+ puts "Page title: #{page.at('title').text.strip}" if page.at("title")
46
+ puts "",
47
+ "Page content:"
48
+ puts "-" * 40
49
+ puts page.body
50
+ puts "-" * 40
51
+ end
52
+
53
+ # Logs details about a specific page selector when debug mode is enabled
54
+ #
55
+ # @param page [Mechanize::Page] The web page to inspect
56
+ # @param selector [String] CSS selector to look for
57
+ # @param message [String] Context or description for the debug output
58
+ # @return [void]
59
+ def self.debug_selector(page, selector, message)
60
+ return unless ScraperUtils.debug?
61
+
62
+ puts "\n🔍 DEBUG: #{message}"
63
+ puts "Looking for selector: #{selector}"
64
+ element = page.at(selector)
65
+ if element
66
+ puts "Found element:"
67
+ puts element.to_html
68
+ else
69
+ puts "Element not found in:"
70
+ puts "-" * 40
71
+ puts page.body
72
+ puts "-" * 40
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scraperwiki"
4
+
5
+ module ScraperUtils
6
+ # Utilities for logging scraper execution details and outcomes
7
+ module LogUtils
8
+ SUMMARY_TABLE = "scrape_summary"
9
+ LOG_TABLE = "scrape_log"
10
+ LOG_RETENTION_DAYS = 30
11
+
12
+ # Log details about a scraping run for one or more authorities
13
+ # @param start_time [Time] When this scraping attempt was started
14
+ # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
15
+ # @param authorities [Array<Symbol>] List of authorities attempted to scrape
16
+ # @param results [Hash] Results for each authority containing:
17
+ # - :records_scraped [Integer] Number of records successfully scraped
18
+ # - :unprocessable_records [Integer] Optional number of unprocessable records, e.g. regions
19
+ # - :error [Exception, nil] Any exception that occurred during scraping
20
+ # - :proxy_used [Boolean] Whether a proxy was used
21
+ # @return [void]
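+ # @example Logging a single attempt (illustrative only)
+ #   start_time = Time.now
+ #   results = { noosa: { records_scraped: 10, proxy_used: false } }
+ #   ScraperUtils::LogUtils.log_scraping_run(start_time, 1, [:noosa], results)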
22
+ def self.log_scraping_run(start_time, attempt, authorities, results)
23
+ raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
24
+ raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?
25
+
26
+ end_time = Time.now
27
+ duration = (end_time - start_time).round(1)
28
+
29
+ successful = []
30
+ failed = []
31
+ interrupted = []
32
+
33
+ authorities.each do |authority_label|
34
+ result = results[authority_label] || {}
35
+
36
+ status = if result[:records_scraped]&.positive?
37
+ result[:error] ? :interrupted : :successful
38
+ else
39
+ :failed
40
+ end
41
+ case status
42
+ when :successful
43
+ successful << authority_label
44
+ when :interrupted
45
+ interrupted << authority_label
46
+ else
47
+ failed << authority_label
48
+ end
49
+
50
+ record = {
51
+ "run_at" => start_time.iso8601,
52
+ "attempt" => attempt,
53
+ "authority_label" => authority_label.to_s,
54
+ "records_scraped" => result[:records_scraped] || 0,
55
+ "unprocessable_records" => result[:unprocessable_records] || 0,
56
+ "used_proxy" => result[:proxy_used] ? 1 : 0,
57
+ "status" => status.to_s,
58
+ "error_message" => result[:error]&.message,
59
+ "error_class" => result[:error]&.class&.to_s,
60
+ "error_backtrace" => extract_meaningful_backtrace(result[:error])
61
+ }
62
+
63
+ save_log_record(record)
64
+ end
65
+
66
+ # Save summary record for the entire run
67
+ save_summary_record(
68
+ start_time,
69
+ attempt,
70
+ duration,
71
+ successful,
72
+ interrupted,
73
+ failed
74
+ )
75
+
76
+ cleanup_old_records
77
+ end
78
+
79
+ def self.report_on_results(authorities, results)
80
+ expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:to_sym) || []
81
+
82
+ puts "MORPH_EXPECT_BAD=#{ENV['MORPH_EXPECT_BAD']}" if expect_bad.any?
83
+
84
+ errors = []
85
+
86
+ # Check for authorities that were expected to be bad but are now working
87
+ unexpected_working = expect_bad.select do |authority|
88
+ result = results[authority]
89
+ result && result[:records_scraped]&.positive? && result[:error].nil?
90
+ end
91
+
92
+ if unexpected_working.any?
93
+ errors << "WARNING: Remove #{unexpected_working.join(',')} from EXPECT_BAD as it now works!"
94
+ end
95
+
96
+ # Check for authorities with unexpected errors
97
+ unexpected_errors = authorities
98
+ .select { |authority| results[authority]&.dig(:error) }
99
+ .reject { |authority| expect_bad.include?(authority) }
100
+
101
+ if unexpected_errors.any?
102
+ errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
103
+ "(Add to MORPH_EXPECT_BAD?)"
104
+ unexpected_errors.each do |authority|
105
+ error = results[authority][:error]
106
+ errors << " #{authority}: #{error.class} - #{error.message}"
107
+ end
108
+ end
109
+
110
+ if errors.any?
111
+ errors << "See earlier output for details"
112
+ raise errors.join("\n")
113
+ end
114
+
115
+ puts "Exiting with OK status!"
116
+ end
117
+
118
+ def self.save_log_record(record)
119
+ ScraperWiki.save_sqlite(
120
+ %w[authority_label run_at],
121
+ record,
122
+ LOG_TABLE
123
+ )
124
+ end
125
+
126
+ def self.save_summary_record(start_time, attempt, duration,
127
+ successful, interrupted, failed)
128
+ summary = {
129
+ "run_at" => start_time.iso8601,
130
+ "attempt" => attempt,
131
+ "duration" => duration,
132
+ "successful" => successful.join(","),
133
+ "failed" => failed.join(","),
134
+ "interrupted" => interrupted.join(","),
135
+ "successful_count" => successful.size,
136
+ "interrupted_count" => interrupted.size,
137
+ "failed_count" => failed.size
138
+ }
139
+
140
+ ScraperWiki.save_sqlite(
141
+ ["run_at"],
142
+ summary,
143
+ SUMMARY_TABLE
144
+ )
145
+ end
146
+
147
+ def self.cleanup_old_records(force: false)
148
+ cutoff = (Date.today - LOG_RETENTION_DAYS).to_s
149
+ return if !force && @last_cutoff == cutoff
150
+
151
+ @last_cutoff = cutoff
152
+
153
+ [SUMMARY_TABLE, LOG_TABLE].each do |table|
154
+ ScraperWiki.sqliteexecute(
155
+ "DELETE FROM #{table} WHERE date(run_at) < date(?)",
156
+ [cutoff]
157
+ )
158
+ end
159
+ end
160
+
161
+ # Extracts a meaningful backtrace - always the first 2 lines, then only non-vendor lines, max 6 in total
162
+ def self.extract_meaningful_backtrace(error)
163
+ return nil unless error.respond_to?(:backtrace) && error&.backtrace
164
+
165
+ lines = []
166
+ error.backtrace.each do |line|
167
+ lines << line if lines.length < 2 || !line.include?("/vendor/")
168
+ break if lines.length >= 6
169
+ end
170
+
171
+ lines.empty? ? nil : lines.join("\n")
172
+ end
173
+ end
174
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mechanize"
4
+
5
+ module ScraperUtils
6
+ # Utilities for configuring and using Mechanize for web scraping
7
+ module MechanizeUtils
8
+ PUBLIC_IP_URL = "https://whatismyip.akamai.com/"
9
+
10
+ # Creates and configures a Mechanize agent with optional proxy and timeout
11
+ #
12
+ # @param timeout [Integer, nil] Timeout for agent connections
13
+ # @param use_proxy [Boolean] Whether to use the Australian proxy (if configured)
14
+ # @return [Mechanize] Configured Mechanize agent
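+ # @example Creating an agent (illustrative only)
+ #   # Proxies via MORPH_AUSTRALIAN_PROXY only when use_proxy is true and the variable is set
+ #   agent = ScraperUtils::MechanizeUtils.mechanize_agent(timeout: 30, use_proxy: false)
+ #   page = agent.get("https://example.com/")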
15
+ def self.mechanize_agent(timeout: nil, use_proxy: true)
16
+ agent = Mechanize.new
17
+ agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
18
+ use_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
19
+ if use_proxy
20
+ # On morph.io set the environment variable MORPH_AUSTRALIAN_PROXY to
21
+ # http://morph:password@au.proxy.oaf.org.au:8888 replacing password with
22
+ # the real password.
23
+ agent.agent.set_proxy(ScraperUtils.australian_proxy)
24
+ end
25
+ if timeout
26
+ agent.open_timeout = timeout
27
+ agent.read_timeout = timeout
28
+ end
29
+ public_ip(agent) if use_proxy
30
+ agent
31
+ end
32
+
33
+ # Returns if the Mechanize agent is using the proxy
34
+ def self.using_proxy?(agent)
35
+ !agent.agent.proxy_uri.nil?
36
+ end
37
+
38
+ # Checks if a page indicates a maintenance mode
39
+ #
40
+ # @param page [Mechanize::Page] The web page to check
41
+ # @return [String, nil] Maintenance message if found, otherwise nil
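+ # @example Aborting on a maintenance page (illustrative only)
+ #   message = ScraperUtils::MechanizeUtils.find_maintenance_message(page)
+ #   raise ScraperUtils::UnprocessableSite, message if message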
42
+ def self.find_maintenance_message(page)
43
+ # Use Nokogiri for parsing because earlier versions of Mechanize
44
+ # do not support the .search method on page objects
45
+ doc = Nokogiri::HTML(page.body)
46
+ doc.css("h1, title").each do |element|
47
+ text = element.inner_text
48
+ return "Maintenance: #{text}" if text&.match?(/maintenance/i)
49
+ end
50
+
51
+ # Not in maintenance mode
52
+ nil
53
+ end
54
+
55
+ # Retrieves and logs the public IP address
56
+ #
57
+ # @param agent [Mechanize] Mechanize agent to use for IP lookup
58
+ # @param force [Boolean] Force a new IP lookup, bypassing cache
59
+ # @return [String] The public IP address
60
+ def self.public_ip(agent, force: false)
61
+ @public_ip = nil if force
62
+ @public_ip ||=
63
+ begin
64
+ ip = agent.get(PUBLIC_IP_URL).body.strip
65
+ puts "Public IP: #{ip}"
66
+ ip
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ VERSION = "0.1.0"
5
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "scraper_utils/authority_utils"
4
+ require "scraper_utils/db_utils"
5
+ require "scraper_utils/debug_utils"
6
+ require "scraper_utils/log_utils"
7
+ require "scraper_utils/mechanize_utils"
8
+ require "scraper_utils/version"
9
+
10
+ # Utilities for planningalerts scrapers
11
+ module ScraperUtils
12
+ # Constants for configuration on Morph.io
13
+ AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"
14
+
15
+ # Enable debug locally, not on morph.io
16
+ DEBUG_ENV_VAR = "DEBUG"
17
+
18
+ # Fatal Error
19
+ class Error < StandardError
20
+ end
21
+
22
+ # Fatal error with the site - retrying won't help
23
+ class UnprocessableSite < Error
24
+ end
25
+
26
+ # Content validation errors that should not be retried for that record,
27
+ # but other records may be processable
28
+ class UnprocessableRecord < Error
29
+ end
30
+
31
+ # Check if debug mode is enabled
32
+ #
33
+ # @return [Boolean] Whether debug mode is active
34
+ def self.debug?
35
+ !ENV[DEBUG_ENV_VAR].to_s.empty?
36
+ end
37
+
38
+ def self.australian_proxy
39
+ ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
40
+ ap.empty? ? nil : ap
41
+ end
42
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path("lib", __dir__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require "scraper_utils/version"
6
+
7
+ Gem::Specification.new do |spec|
8
+ spec.name = "scraper_utils"
9
+ spec.version = ScraperUtils::VERSION
10
+ spec.authors = ["Ian Heggie"]
11
+ spec.email = ["ian@heggie.biz"]
12
+ spec.required_ruby_version = ">= 2.5.1"
13
+
14
+ spec.summary = "planningalerts scraper utilities"
15
+ spec.description = "Utilities to help make planningalerts scrapers, " \
16
+ "especially multis easier to develop, run and debug."
17
+ spec.homepage = "https://github.com/ianheggie-oaf/scraper_utils"
18
+ spec.license = "MIT"
19
+
20
+ if spec.respond_to?(:metadata)
21
+ spec.metadata["allowed_push_host"] = "https://rubygems.org"
22
+
23
+ spec.metadata["homepage_uri"] = spec.homepage
24
+ spec.metadata["source_code_uri"] = spec.homepage
25
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
26
+ else
27
+ raise "RubyGems 2.0 or newer is required to protect against " \
28
+ "public gem pushes."
29
+ end
30
+
31
+ # Specify which files should be added to the gem when it is released.
32
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
33
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
34
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
35
+ end
36
+ spec.bindir = "exe"
37
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
38
+ spec.require_paths = ["lib"]
39
+
40
+ spec.add_dependency "mechanize"
41
+ spec.add_dependency "nokogiri"
42
+ spec.add_dependency "sqlite3"
43
+
44
+ spec.add_development_dependency "rake"
45
+ spec.add_development_dependency "rspec"
46
+ spec.add_development_dependency "rubocop"
47
+ spec.add_development_dependency "simplecov"
48
+ spec.add_development_dependency "simplecov-console"
49
+ end
metadata ADDED
@@ -0,0 +1,178 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scraper_utils
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ian Heggie
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-02-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: sqlite3
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov-console
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Utilities to help make planningalerts scrapers, especially multis easier
126
+ to develop, run and debug.
127
+ email:
128
+ - ian@heggie.biz
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".gitignore"
134
+ - ".rspec"
135
+ - ".rubocop.yml"
136
+ - ".travis.yml"
137
+ - Gemfile
138
+ - LICENSE.txt
139
+ - README.md
140
+ - Rakefile
141
+ - bin/console
142
+ - bin/setup
143
+ - lib/scraper_utils.rb
144
+ - lib/scraper_utils/authority_utils.rb
145
+ - lib/scraper_utils/db_utils.rb
146
+ - lib/scraper_utils/debug_utils.rb
147
+ - lib/scraper_utils/log_utils.rb
148
+ - lib/scraper_utils/mechanize_utils.rb
149
+ - lib/scraper_utils/version.rb
150
+ - scraper_utils.gemspec
151
+ homepage: https://github.com/ianheggie-oaf/scraper_utils
152
+ licenses:
153
+ - MIT
154
+ metadata:
155
+ allowed_push_host: https://rubygems.org
156
+ homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
157
+ source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
158
+ post_install_message:
159
+ rdoc_options: []
160
+ require_paths:
161
+ - lib
162
+ required_ruby_version: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: 2.5.1
167
+ required_rubygems_version: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ requirements: []
173
+ rubyforge_project:
174
+ rubygems_version: 2.7.6.2
175
+ signing_key:
176
+ specification_version: 4
177
+ summary: planningalerts scraper utilities
178
+ test_files: []