scraper_utils 0.8.3 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +42 -0
- data/CHANGELOG.md +14 -5
- data/README.md +11 -1
- data/docs/enhancing_specs.md +48 -14
- data/docs/example_parallel_scraper.rb +124 -0
- data/docs/example_scraper.rb +19 -3
- data/docs/parallel_scrapers.md +95 -0
- data/lib/scraper_utils/db_utils.rb +22 -2
- data/lib/scraper_utils/spec_support.rb +83 -35
- data/lib/scraper_utils/version.rb +1 -1
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-metadata.gz:
-data.tar.gz:
+metadata.gz: e3d985ab4384498ed5c44360db6d6020a039e38ec0480ab3e9942cd0d3267d9e
+data.tar.gz: c61a923aa54f37623f8766a6750ecd97529dcaf6d315d056e04dde77f962cdaa
 SHA512:
-metadata.gz:
-data.tar.gz:
+metadata.gz: 5e99031153c341917fecc73057ce5e6f662cf6f45859990c817c7c77e530aa11162f8d5c778f78c09dbb4d2287e060769880e23f4591f8a30abb816c6dfe28e1
+data.tar.gz: e8d9dcbb019f93b05dc9ca3f2c09cf5f488434befab13b92841d0e9377041eb3e8914efafdd81218596bad158aab24e0304212ef25ed8211cb3c1b3a7c5c2478
data/.editorconfig
ADDED
@@ -0,0 +1,42 @@
+# EditorConfig helps maintain consistent coding styles across different editors and IDEs
+# https://editorconfig.org/
+
+root = true
+
+# Ruby default: 2 spaces for indentation (Ruby Style Guide standard)
+[*]
+end_of_line = lf
+indent_size = 2
+indent_style = space
+insert_final_newline = true
+tab_width = 4 # Tab display width (RubyMine/Vim standard)
+trim_trailing_whitespace = true
+
+[*.bat]
+end_of_line = crlf
+
+# Python uses 4 spaces (PEP 8 standard)
+[*.py]
+indent_size = 4
+
+# Makefiles require tabs
+[{*[Mm]akefile*,*.mak,*.mk,depend}]
+indent_style = tab
+
+# Minified JavaScript files shouldn't be changed
+[**.min.js]
+indent_style = ignore
+insert_final_newline = ignore
+
+# Editor Setup Instructions:
+#
+# Vim: Add to ~/.vimrc:
+# filetype plugin indent on
+# set expandtab
+# autocmd FileType ruby setlocal shiftwidth=2 tabstop=2 softtabstop=2
+#
+# RubyMine: Settings → Editor → Code Style → Ruby → set to 2 spaces
+# (should auto-detect this .editorconfig file)
+#
+# VS Code: Install "EditorConfig for VS Code" extension
+# (will automatically apply these settings)
data/CHANGELOG.md
CHANGED
@@ -14,17 +14,26 @@
 - `ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!` - validates info_urls contain expected content
 - `ScraperUtils::MathsUtils.fibonacci_series` - generates fibonacci sequence up to max value
 - `bot_check_expected` parameter to info_url validation methods for handling reCAPTCHA/Cloudflare protection
+- Experimental Parallel Processing support
+- Uses the parallel gem with subprocesses
+- Added facility to collect records in memory
+- see docs/parallel_scrapers.md and docs/example_parallel_scraper.rb
+- .editorconfig as an example for scrapers
 
 ### Fixed
-- Typo in `geocodable?` method debug output (`has_suburb_stats` → `has_suburb_states`)
+- Typo in `geocodable?` method debug output (`has_suburb_stats` → `has_suburb_states`)
 - Code example in `docs/enhancing_specs.md`
 
 ### Updated
+- `ScraperUtils::SpecSupport.acceptable_description?` - Accept 1 or 2 word descriptors with planning specific terms
 - Code example in `docs/enhancing_specs.md` to reflect new support methods
--
-
--
+- Code examples
+- geocodeable? test is simpler - it requires
+- a street type
+- an uppercase word (assumed to be a suburb) or postcode and
 - a state
+- Support for 1 or 2 word "reasonable" descriptions that use words specific to planning alerts
+- Added extra street types
 
 ### Removed
 - Unsued CycleUtils
@@ -32,7 +41,7 @@
 - Unused RandomizeUtils
 - Unused Scheduling (Fiber and Threads)
 - Unused Compliant mode, delays for Agent (Agent is configured with an agent string)
-- Unused MechanizeActions
+- Unused MechanizeActions
 
 ## 0.8.2 - 2025-05-07
 
data/README.md
CHANGED
@@ -10,6 +10,8 @@ Add to [your scraper's](https://www.planningalerts.org.au/how_to_write_a_scraper
 ```ruby
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
 gem 'scraper_utils'
+# Optional: For parallel processing of multi-authority scrapers
+gem 'parallel'
 ```
 
 For detailed setup and configuration options,
@@ -23,6 +25,14 @@ see {file:docs/getting_started.md Getting Started guide}
 - Supports extra actions required to get to results page
 - {file:docs/mechanize_utilities.md Learn more about Mechanize utilities}
 
+### Parallel Processing
+
+- Process multiple authorities simultaneously for significant speed improvements
+- 3-8x faster execution for multi-authority scrapers
+- Simple migration from sequential processing with minimal code changes
+- {file:docs/parallel_scraping.md Learn more about parallel scraping}
+- {file:docs/example_parallel_scraper.rb Example parallel scraper script}
+
 ### Error Handling & Quality Monitoring
 
 - Record-level error handling with appropriate thresholds
@@ -67,4 +77,4 @@ on [ianheggie-oaf/scraper_utils | GitHub](https://github.com/ianheggie-oaf/scrap
 
 ## License
 
-The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/docs/enhancing_specs.md
CHANGED
@@ -25,6 +25,17 @@ RSpec.describe Scraper do
 ].freeze
 
 describe ".scrape" do
+def fetch_url_with_redirects(url)
+agent = Mechanize.new
+page = agent.get(url)
+if YourScraper::Pages::TermsAndConditions.on_page?(page)
+puts "Agreeing to terms and conditions for #{url}"
+YourScraper::Pages::TermsAndConditions.click_agree(page)
+page = agent.get(url)
+end
+page
+end
+
 def test_scrape(authority)
 ScraperWiki.close_sqlite
 FileUtils.rm_f("data.sqlite")
@@ -55,20 +66,27 @@ RSpec.describe Scraper do
 
 expect(results).to eq expected
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+if results.any?
+ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 70, variation: 3)
+
+ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 55, variation: 3)
+
+global_info_url = Scraper::AUTHORITIES[authority][:info_url]
+# OR
+# global_info_url = results.first['info_url']
+bot_check_expected = AUTHORITIES_WITH_BOT_PROTECTION.include?(authority)
+
+unless ENV['DISABLE_INFO_URL_CHECK']
+if global_info_url
+ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, global_info_url, bot_check_expected: bot_check_expected) do |url|
+fetch_url_with_redirects(url)
+end
+else
+ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 70, variation: 3, bot_check_expected: bot_check_expected) do |url|
+fetch_url_with_redirects(url)
+end
+end
+end
+end
 end
 end
 
@@ -119,6 +137,21 @@ ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, per
 ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: true)
 ```
 
+### Custom URL fetching
+
+For sites requiring special handling (terms agreement, cookies, etc.):
+
+```ruby
+# Custom URL fetching with block
+ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, url) do |url|
+fetch_url_with_redirects(url) # Your custom fetch implementation
+end
+
+ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results) do |url|
+fetch_url_with_redirects(url) # Handle terms agreement, cookies, etc.
+end
+```
+
 ## Bot Protection Handling
 
 The `bot_check_expected` parameter allows validation methods to accept bot protection as valid responses:
@@ -137,3 +170,4 @@ The `bot_check_expected` parameter allows validation methods to accept bot prote
 
 All validation methods accept `percentage` (minimum percentage required) and `variation` (additional tolerance)
 parameters for consistent configuration.
+
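Note on the spec example above: it references an `AUTHORITIES_WITH_BOT_PROTECTION` constant that the diff never defines. A minimal sketch of how such a list might be declared alongside the other frozen constants in the spec (the authority names are placeholders, not taken from the gem):

```ruby
# Hypothetical list of authorities whose info_urls sit behind reCAPTCHA/Cloudflare;
# pass bot_check_expected: true for these so the validators accept a bot-challenge page.
AUTHORITIES_WITH_BOT_PROTECTION = %i[
  example_shire
  sample_city
].freeze
```

Setting `DISABLE_INFO_URL_CHECK` in the environment skips the info_url checks entirely, per the `unless ENV['DISABLE_INFO_URL_CHECK']` guard in the example.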
data/docs/example_parallel_scraper.rb
ADDED
@@ -0,0 +1,124 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+$LOAD_PATH << "./lib"
+
+require "scraper_utils"
+require "parallel"
+require "your_scraper"
+
+# Main Scraper class
+class Scraper
+AUTHORITIES = YourScraper::AUTHORITIES
+
+# Process a single authority and returns an array of:
+# * authority_label,
+# * array of records to save,
+# * an array of arrays of unprocessable_records and their exception
+# * nil or a fatal exception,
+def self.scrape_authority(authority_label, attempt)
+puts "\nCollecting feed data for #{authority_label}, attempt: #{attempt}..."
+
+# Enable in-memory collection mode, which disables saving to file and avoids conflicts
+ScraperUtils::DbUtils.collect_saves!
+unprocessable_record_details = []
+fatal_exception = nil
+
+begin
+ScraperUtils::DataQualityMonitor.start_authority(authority_label)
+YourScraper.scrape(authority_label) do |record|
+begin
+record["authority_label"] = authority_label.to_s
+ScraperUtils::DbUtils.save_record(record)
+rescue ScraperUtils::UnprocessableRecord => e
+# Log bad record but continue processing unless too many have occurred
+ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+unprocessable_record_details << [e, record]
+end
+end
+rescue StandardError => e
+warn "#{authority_label}: ERROR: #{e}"
+warn e.backtrace
+fatal_exception = e
+end
+[authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details, fatal_exception]
+end
+
+# Process authorities in parallel
+def self.scrape_parallel(authorities, attempt, process_count: 4)
+exceptions = {}
+# Saves immediately in main process
+ScraperUtils::DbUtils.save_immediately!
+Parallel.map(authorities, in_processes: process_count) do |authority_label|
+# Runs in sub process
+scrape_authority(authority_label, attempt)
+end.each do |authority_label, saves, unprocessable, fatal_exception|
+# Runs in main process
+status = fatal_exception ? 'FAILED' : 'OK'
+puts "Saving results of #{authority_label}: #{saves.size} records, #{unprocessable.size} unprocessable #{status}"
+
+saves.each do |record|
+ScraperUtils::DbUtils.save_record(record)
+end
+unprocessable.each do |e, record|
+ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
+exceptions[authority_label] = e
+end
+
+if fatal_exception
+puts " Warning: #{authority_label} failed with: #{fatal_exception.message}"
+puts " Saved #{saves.size} records before failure"
+exceptions[authority_label] = fatal_exception
+end
+end
+
+exceptions
+end
+
+def self.selected_authorities
+ScraperUtils::AuthorityUtils.selected_authorities(AUTHORITIES.keys)
+end
+
+def self.run(authorities, process_count: 8)
+puts "Scraping authorities in parallel: #{authorities.join(', ')}"
+puts "Using #{process_count} processes"
+
+start_time = Time.now
+exceptions = scrape_parallel(authorities, 1, process_count: process_count)
+
+ScraperUtils::LogUtils.log_scraping_run(
+start_time,
+1,
+authorities,
+exceptions
+)
+
+unless exceptions.empty?
+puts "\n***************************************************"
+puts "Now retrying authorities which earlier had failures"
+puts exceptions.keys.join(", ").to_s
+puts "***************************************************"
+
+start_time = Time.now
+exceptions = scrape_parallel(exceptions.keys, 2, process_count: process_count)
+
+ScraperUtils::LogUtils.log_scraping_run(
+start_time,
+2,
+authorities,
+exceptions
+)
+end
+
+# Report on results, raising errors for unexpected conditions
+ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
+end
+end
+
+if __FILE__ == $PROGRAM_NAME
+ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
+
+process_count = (ENV['MORPH_PROCESSES'] || Etc.nprocessors * 2).to_i
+
+Scraper.run(Scraper.selected_authorities, process_count: process_count)
+end
data/docs/example_scraper.rb
CHANGED
@@ -1,6 +1,8 @@
 #!/usr/bin/env ruby
 # frozen_string_literal: true
 
+Bundler.require
+
 $LOAD_PATH << "./lib"
 
 require "scraper_utils"
@@ -57,8 +59,9 @@ class Scraper
 unless exceptions.empty?
 puts "\n***************************************************"
 puts "Now retrying authorities which earlier had failures"
-puts exceptions.keys.join(", ")
+puts exceptions.keys.join(", ")
 puts "***************************************************"
+ENV['DEBUG'] ||= '1'
 
 start_time = Time.now
 exceptions = scrape(exceptions.keys, 2)
@@ -79,8 +82,21 @@ end
 if __FILE__ == $PROGRAM_NAME
 # Default to list of authorities we can't or won't fix in code, explain why
 # some: url-for-issue Summary Reason
-# councils
+# councils: url-for-issue Summary Reason
+
+if ENV['MORPH_EXPECT_BAD'].nil?
+default_expect_bad = {
+}
+puts 'Default EXPECT_BAD:', default_expect_bad.to_yaml if default_expect_bad.any?
 
-
+ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(',')
+end
 Scraper.run(Scraper.selected_authorities)
+
+# Dump database for morph-cli
+if File.exist?("tmp/dump-data-sqlite")
+puts "-- dump of data.sqlite --"
+system "sqlite3 data.sqlite .dump"
+puts "-- end of dump --"
+end
 end
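The `default_expect_bad` hash added above is left empty; the `# councils: url-for-issue Summary Reason` comment suggests entries keyed by authority name with a link to the tracking issue and a short reason. A hypothetical filled-in entry (the authority name and URL are invented for illustration) might look like:

```ruby
# Hypothetical example only - key is the authority, value points at the issue and why it is expected to fail
default_expect_bad = {
  "broken_shire" => "https://github.com/example-org/issues/999 Site blocks non-browser requests"
}
```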
data/docs/parallel_scrapers.md
ADDED
@@ -0,0 +1,95 @@
+# Parallel Scraping with ScraperUtils
+
+This guide shows how to parallelize your multi-authority scraper to significantly reduce run times.
+
+## When to Use Parallel Scraping
+
+Use parallel scraping when:
+- You have 10+ authorities taking significant time each
+- Authorities are independent (no shared state)
+- You want to reduce total scraper run time from hours to minutes
+
+## Installation
+
+Add the parallel gem to your Gemfile:
+
+```ruby
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem 'scraper_utils'
+gem 'parallel' # Add this line
+```
+
+## Modified Scraper Implementation
+
+See `example_parallel_scraper.rb` as an example of how to convert your existing scraper to use parallel processing.
+
+## Key Changes from Sequential Version
+
+1. **Added `parallel` gem** to Gemfile
+2. **Split scraping logic** into `scrape_authority` (single authority) and `scrape_parallel` (coordinator)
+3. **Enable collection mode** with `ScraperUtils::DbUtils.collect_saves!` in each subprocess
+4. **Return results** as `[authority_label, saves, unprocessable, exception]` from each subprocess
+5. **Save in main process** to avoid SQLite locking issues
+6. **Preserve error handling**: UnprocessableRecord exceptions logged but don't re-raise
+
+## Configuration Options
+
+### Process Count
+
+Control the number of parallel processes:
+
+```ruby
+# In code
+process_count = (ENV['MORPH_PROCESSES'] || Etc.nprocessors * 2).to_i
+Scraper.run(authorities, process_count: process_count)
+
+# Via environment variable
+export MORPH_PROCESSES=6
+```
+
+Start with 4 processes and adjust based on:
+- Available CPU cores
+- Memory usage
+- Network bandwidth
+- Target site responsiveness
+
+### Environment Variables
+
+All existing environment variables work unchanged:
+- `MORPH_AUTHORITIES` - filter authorities
+- `MORPH_EXPECT_BAD` - expected bad authorities
+- `DEBUG` - debugging output
+- `MORPH_PROCESSES` - number of parallel processes
+
+## Performance Expectations
+
+Typical performance improvements:
+- **4 processes**: 3-4x faster
+- **8 processes**: 6-7x faster (if you have the cores/bandwidth)
+- **Diminishing returns** beyond 8 processes for most scrapers
+
+Example: 20 authorities × 6 minutes each = 2 hours sequential → 30 minutes with 4 processes
+
+## Debugging Parallel Scrapers
+
+1. **Test with 1 process first**: `process_count: 1` to isolate logic issues
+2. **Check individual authorities**: Use `MORPH_AUTHORITIES=problematic_auth`
+3. **Monitor resource usage**: Watch CPU, memory, and network during runs
+4. **Enable debugging**: `DEBUG=1` works in all processes
+
+## Limitations
+
+- **Shared state**: Each process is isolated - no shared variables between authorities
+- **Memory usage**: Each process uses full memory - monitor total usage
+- **Database locking**: Only the main process writes to SQLite (by design)
+- **Error handling**: Exceptions in one process don't affect others
+
+## Migration from Sequential
+
+Your existing scraper logic requires minimal changes:
+1. Extract single-authority logic into separate method
+2. Add `collect_saves!` call at start of each subprocess
+3. Return collected saves instead of direct database writes
+4. Use `Parallel.map` instead of `each` for authorities
+
+The core scraping logic in `YourScraper.scrape` remains completely unchanged.
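The "Key Changes from Sequential Version" list above boils down to one pattern: collect records in each subprocess, return them as plain data, and let the main process do all SQLite writes. A minimal standalone sketch of that flow with the parallel gem (authority names and records are made up; a real scraper would use `ScraperUtils::DbUtils.collect_saves!` and `save_record` as in the example script):

```ruby
require "parallel"

authorities = %i[example_north example_south]

results = Parallel.map(authorities, in_processes: 2) do |authority_label|
  # Runs in a subprocess: gather records in memory and return them as plain data
  records = [{ "authority_label" => authority_label.to_s, "council_reference" => "DA-1" }]
  [authority_label, records]
end

results.each do |authority_label, records|
  # Runs in the main process: the only place that would touch SQLite
  records.each { |record| puts "would save #{record.inspect} for #{authority_label}" }
end
```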
data/lib/scraper_utils/db_utils.rb
CHANGED
@@ -5,6 +5,22 @@ require "scraperwiki"
 module ScraperUtils
 # Utilities for database operations in scrapers
 module DbUtils
+# Enable in-memory collection mode instead of saving to SQLite
+def self.collect_saves!
+@collected_saves = []
+end
+
+# Save to disk rather than collect
+def self.save_immediately!
+@collected_saves = nil
+end
+
+# Get all collected save calls
+# @return [Array<Array>] Array of [primary_key, record] pairs
+def self.collected_saves
+@collected_saves
+end
+
 # Saves a record to the SQLite database with validation and logging
 #
 # @param record [Hash] The record to be saved
@@ -33,8 +49,12 @@ module ScraperUtils
 else
 ["council_reference"]
 end
-
-
+if @collected_saves
+@collected_saves << record
+else
+ScraperWiki.save_sqlite(primary_key, record)
+ScraperUtils::DataQualityMonitor.log_saved_record(record)
+end
 end
 end
 end
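Read together, the new DbUtils methods form a simple toggle: `collect_saves!` switches `save_record` to buffering in memory, `collected_saves` returns the buffer, and `save_immediately!` restores direct SQLite writes. A short sketch of the intended call sequence (not from the gem's docs):

```ruby
require "scraper_utils"

ScraperUtils::DbUtils.collect_saves!              # buffer records instead of writing to SQLite
# ... call ScraperUtils::DbUtils.save_record(record) as usual; records are appended to the buffer ...
buffered = ScraperUtils::DbUtils.collected_saves  # => array of the records passed to save_record
ScraperUtils::DbUtils.save_immediately!           # switch back to writing directly to SQLite
```

Note that as implemented in this hunk the buffer receives the record itself (`@collected_saves << record`), even though the doc comment describes `[primary_key, record]` pairs.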
data/lib/scraper_utils/spec_support.rb
CHANGED
@@ -7,35 +7,61 @@ module ScraperUtils
 # Methods to support specs
 module SpecSupport
 AUSTRALIAN_STATES = %w[ACT NSW NT QLD SA TAS VIC WA].freeze
+
 STREET_TYPE_PATTERNS = [
+/\bArcade\b/i,
 /\bAv(e(nue)?)?\b/i,
-/\bB(oulevard|lvd)\b/i,
+/\bB(oulevard|lvd|vd)\b/i,
 /\b(Circuit|Cct)\b/i,
+/\bCir(cle)?\b/i,
 /\bCl(ose)?\b/i,
 /\bC(our|r)?t\b/i,
-/\bCircle\b/i,
 /\bChase\b/i,
+/\bCorso\b/i,
 /\bCr(es(cent)?)?\b/i,
+/\bCross\b/i,
 /\bDr((ive)?|v)\b/i,
 /\bEnt(rance)?\b/i,
+/\bEsp(lanade)?\b/i,
 /\bGr(ove)?\b/i,
 /\bH(ighwa|w)y\b/i,
-/\
+/\bL(ane?|a)\b/i,
 /\bLoop\b/i,
+/\bM(ews|w)\b/i,
+/\bP(arade|de)\b/i,
 /\bParkway\b/i,
 /\bPl(ace)?\b/i,
 /\bPriv(ate)?\b/i,
-/\
+/\bProm(enade)?\b/i,
+/\bQuay\b/i,
 /\bR(oa)?d\b/i,
+/\bR(idge|dg)\b/i,
 /\bRise\b/i,
+/\bSq(uare)?\b/i,
 /\bSt(reet)?\b/i,
-/\
-/\
-/\bWay\b/i
+/\bT(erra)?ce\b/i,
+/\bWa?y\b/i
 ].freeze
 
 AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
 
+PLANNING_KEYWORDS = [
+# Building types
+'dwelling', 'house', 'unit', 'building', 'structure', 'facility',
+# Modifications
+'addition', 'extension', 'renovation', 'alteration', 'modification',
+'replacement', 'upgrade', 'improvement',
+# Specific structures
+'carport', 'garage', 'shed', 'pool', 'deck', 'patio', 'pergola',
+'verandah', 'balcony', 'fence', 'wall', 'driveway',
+# Development types
+'subdivision', 'demolition', 'construction', 'development',
+# Services/utilities
+'signage', 'telecommunications', 'stormwater', 'water', 'sewer',
+# Approvals/certificates
+'certificate', 'approval', 'consent', 'permit'
+].freeze
+
 def self.fetch_url_with_redirects(url)
 agent = Mechanize.new
 # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
@@ -45,7 +71,7 @@
 def self.authority_label(results, prefix: '', suffix: '')
 return nil if results.nil?
 
-authority_labels = results.map { |record| record['authority_label']}.compact.uniq
+authority_labels = results.map { |record| record['authority_label'] }.compact.uniq
 return nil if authority_labels.empty?
 
 raise "Expected one authority_label, not #{authority_labels.inspect}" if authority_labels.size > 1
@@ -86,22 +112,18 @@
 # Using the pre-compiled patterns
 has_street_type = STREET_TYPE_PATTERNS.any? { |pattern| check_address.match?(pattern) }
 
-has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
-
 uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
 has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
 
 if ENV["DEBUG"]
 missing = []
-unless has_street_type
-missing << "street type / unit / lot"
-end
+missing << "street type" unless has_street_type
 missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
 missing << "state" unless has_state
 puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
 end
 
-
+has_street_type && (has_postcode || has_uppercase_suburb) && has_state
 end
 
 PLACEHOLDERS = [
@@ -142,14 +164,23 @@
 # Check if this looks like a "reasonable" description
 # This is a bit stricter than needed - typically assert >= 75% match
 def self.reasonable_description?(text)
-
+return false if placeholder?(text)
+
+# Long descriptions (3+ words) are assumed reasonable
+return true if text.to_s.split.size >= 3
+
+# Short descriptions must contain at least one planning keyword
+text_lower = text.to_s.downcase
+PLANNING_KEYWORDS.any? { |keyword| text_lower.include?(keyword) }
 end
 
 # Validates that all records use the expected global info_url and it returns 200
 # @param results [Array<Hash>] The results from scraping an authority
 # @param expected_url [String] The expected global info_url for this authority
+# @param bot_check_expected [Boolean] Whether bot protection is acceptable
+# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
 # @raise RuntimeError if records don't use the expected URL or it doesn't return 200
-def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false)
+def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false, &block)
 info_urls = results.map { |record| record["info_url"] }.uniq
 
 unless info_urls.size == 1
@@ -163,11 +194,11 @@
 
 if defined?(VCR)
 VCR.use_cassette("#{authority_label(results, suffix: '_')}one_info_url") do
-page = fetch_url_with_redirects(expected_url)
+page = block_given? ? block.call(expected_url) : fetch_url_with_redirects(expected_url)
 validate_page_response(page, bot_check_expected)
 end
 else
-page = fetch_url_with_redirects(expected_url)
+page = block_given? ? block.call(expected_url) : fetch_url_with_redirects(expected_url)
 validate_page_response(page, bot_check_expected)
 end
 end
@@ -176,14 +207,16 @@
 # @param results [Array<Hash>] The results from scraping an authority
 # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
 # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+# @param bot_check_expected [Boolean] Whether bot protection is acceptable
+# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
 # @raise RuntimeError if insufficient detail checks pass
-def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false)
+def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false, &block)
 if defined?(VCR)
 VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url_details") do
-check_info_url_details(results, percentage, variation, bot_check_expected)
+check_info_url_details(results, percentage, variation, bot_check_expected, &block)
 end
 else
-check_info_url_details(results, percentage, variation, bot_check_expected)
+check_info_url_details(results, percentage, variation, bot_check_expected, &block)
 end
 end
 
@@ -195,7 +228,7 @@
 
 return false unless page.body
 
-body_lower = page.body
+body_lower = page.body&.downcase
 
 # Check for common bot protection indicators
 bot_indicators = [
@@ -228,7 +261,7 @@
 
 private
 
-def self.check_info_url_details(results, percentage, variation, bot_check_expected)
+def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
 count = 0
 failed = 0
 fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
@@ -238,7 +271,7 @@
 info_url = record["info_url"]
 puts "Checking info_url[#{index}]: #{info_url} has the expected reference, address and description..."
 
-page = fetch_url_with_redirects(info_url)
+page = block_given? ? block.call(info_url) : fetch_url_with_redirects(info_url)
 
 if bot_check_expected && bot_protection_detected?(page)
 puts " Bot protection detected - skipping detailed validation"
@@ -252,22 +285,37 @@
 %w[council_reference address description].each do |attribute|
 count += 1
 expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
-expected2 =
-
-
+expected2 = case attribute
+when 'council_reference'
+expected.sub(/\ADA\s*-\s*/, '')
+when 'address'
+expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, '') # Handle Lismore post-code/state swap
+else
+expected
+end
+expected3 = case attribute
+when 'address'
+expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, '')
+else
+expected
+end.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-')
+next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, ' ').gsub(/\s*-\s*/, '-').include?(expected3)
 
 failed += 1
-
+desc2 = expected2 == expected ? '' : " or #{expected2.inspect}"
+desc3 = expected3 == expected ? '' : " or #{expected3.inspect}"
+puts " Missing: #{expected.inspect}#{desc2}#{desc3}"
 puts " IN: #{page_body}" if ENV['DEBUG']
 
-min_required =
+min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
 passed = count - failed
 raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
-
-
+end
+end
 
-
-
+puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
+end
+
+end
+end
 
-end
-end
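Putting the new `PLANNING_KEYWORDS` list and the rewritten `reasonable_description?` together, the behaviour shown in this hunk is: placeholders fail, descriptions of three or more words pass, and one- or two-word descriptions pass only when they contain a planning keyword. A few illustrative calls (assuming none of these strings appear in the `PLACEHOLDERS` list, which is not shown in this diff):

```ruby
require "scraper_utils"

ScraperUtils::SpecSupport.reasonable_description?("Dwelling alterations") # => true  ("dwelling" is a planning keyword)
ScraperUtils::SpecSupport.reasonable_description?("Two storey residence") # => true  (three or more words)
ScraperUtils::SpecSupport.reasonable_description?("See plans")            # => false (short, no planning keyword)
```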
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-version: 0.
+version: 0.9.0
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-07-
+date: 2025-07-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: mechanize
@@ -61,6 +61,7 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
+- ".editorconfig"
 - ".gitignore"
 - ".rspec"
 - ".rubocop.yml"
@@ -82,9 +83,11 @@ files:
 - docs/enhancing_specs.md
 - docs/example_custom_Rakefile
 - docs/example_dot_scraper_validation.yml
+- docs/example_parallel_scraper.rb
 - docs/example_scraper.rb
 - docs/getting_started.md
 - docs/mechanize_utilities.md
+- docs/parallel_scrapers.md
 - docs/testing_custom_scrapers.md
 - exe/validate_scraper_data
 - lib/scraper_utils.rb
@@ -106,7 +109,7 @@ metadata:
 allowed_push_host: https://rubygems.org
 homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
 source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
+documentation_uri: https://rubydoc.info/gems/scraper_utils/0.9.0
 changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
 rubygems_mfa_required: 'true'
 post_install_message: