scraper_utils 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +39 -9
- data/CHANGELOG.md +34 -0
- data/README.md +7 -55
- data/docs/enhancing_specs.md +86 -47
- data/docs/example_custom_Rakefile +38 -0
- data/docs/example_dot_scraper_validation.yml +23 -0
- data/docs/mechanize_utilities.md +0 -3
- data/docs/testing_custom_scrapers.md +74 -0
- data/exe/validate_scraper_data +150 -0
- data/lib/scraper_utils/log_utils.rb +5 -5
- data/lib/scraper_utils/maths_utils.rb +23 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +9 -65
- data/lib/scraper_utils/mechanize_utils.rb +0 -2
- data/lib/scraper_utils/spec_support.rb +189 -6
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +1 -5
- data/scraper_utils.gemspec +1 -0
- metadata +11 -24
- data/docs/example_scrape_with_fibers.rb +0 -31
- data/docs/fibers_and_threads.md +0 -72
- data/docs/interleaving_requests.md +0 -33
- data/docs/parallel_requests.md +0 -138
- data/docs/randomizing_requests.md +0 -38
- data/docs/reducing_server_load.md +0 -63
- data/lib/scraper_utils/cycle_utils.rb +0 -26
- data/lib/scraper_utils/date_range_utils.rb +0 -118
- data/lib/scraper_utils/mechanize_actions.rb +0 -183
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +0 -80
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +0 -151
- data/lib/scraper_utils/randomize_utils.rb +0 -37
- data/lib/scraper_utils/scheduler/constants.rb +0 -12
- data/lib/scraper_utils/scheduler/operation_registry.rb +0 -101
- data/lib/scraper_utils/scheduler/operation_worker.rb +0 -199
- data/lib/scraper_utils/scheduler/process_request.rb +0 -59
- data/lib/scraper_utils/scheduler/thread_request.rb +0 -51
- data/lib/scraper_utils/scheduler/thread_response.rb +0 -59
- data/lib/scraper_utils/scheduler.rb +0 -286
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 858cdc2f2f55b23ea10a09629ff347b2a6833c8efd1f89851168db7ffb46d9f8
+  data.tar.gz: f6d17f7a327dd3698b12e4da7772037e58cb76201d3271def86f195a68e97b03
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0605eaa66aa317a0076d7c87ebdb883e274bd0fd44fd53ab28219a09a963c78decd0a086368e5b0e01748c0c67cd9ef4a84b48274794ad6bb5ea6e9dcdff4745
+  data.tar.gz: 511175ebaed815b6b851f804029b33c8ebbe553623d63556d78bb49f5be1fe4d52e5d9a78f27a75f3423ea90df7aad244acb43415b3990f86f78a9986438e21c
data/.gitignore
CHANGED
@@ -1,9 +1,44 @@
+# Scraper specific example
+
+# Ignore output of scraper
+data.sqlite
+
+# Don't save limited scraper runs
+/spec/cassettes/limited_to_*
+/spec/expected/limited_to_*
+
+# ---------------------------
+# Gem ignores
 *.gem
+/pkg/
+/InstalledFiles
+
+# REMOVE PARAGRAPH IF USING WITH SCRAPERS:
+# for a library or gem, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+Gemfile.lock
+.ruby-version
+.ruby-gemset
+
+# ---------------------------
+# General ignore
+
+# rspec failure tracking
+.rspec_status
+.release
+.bundle
+.profile.d
+bin
+vendor
+tmp
+.basher/bash
+Procfile
+
+# Spec coverage
+/coverage/
+
 *.rbc
 /.config
-/coverage/
-/InstalledFiles
-/pkg/
 /spec/reports/
 /spec/examples.txt
 /test/tmp/
@@ -64,12 +99,6 @@ build-iPhoneSimulator/
 /vendor/bundle
 /lib/bundler/man/
 
-# for a library or gem, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-Gemfile.lock
-.ruby-version
-.ruby-gemset
-
 # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
 .rvmrc
 
@@ -78,3 +107,4 @@ Gemfile.lock
 
 # rspec reports
 /.rspec_status
+
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,39 @@
 # Changelog
 
+## 0.9.0 - 2025-07-01
+
+**Significant cleanup - removed code we ended up not using as none of the councils are actually concerned about server load**
+
+* Refactored example code into simple callable methods
+* Expand test for geocodeable addresses to include comma between postcode and state at the end of the address.
+
+### Added
+- `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` - validates percentage of geocodable addresses
+- `ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!` - validates percentage of reasonable descriptions
+- `ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!` - validates single global info_url usage and availability
+- `ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!` - validates info_urls contain expected content
+- `ScraperUtils::MathsUtils.fibonacci_series` - generates fibonacci sequence up to max value
+- `bot_check_expected` parameter to info_url validation methods for handling reCAPTCHA/Cloudflare protection
+
+### Fixed
+- Typo in `geocodable?` method debug output (`has_suburb_stats` → `has_suburb_states`)
+- Code example in `docs/enhancing_specs.md`
+
+### Updated
+- Code example in `docs/enhancing_specs.md` to reflect new support methods
+- geocodeable? test is simpler - it requires
+  - a street type, unit or lot,
+  - an uppercase word (assumed to be a suburb) or postcode and
+  - a state
+
+### Removed
+- Unused CycleUtils
+- Unused DateRangeUtils
+- Unused RandomizeUtils
+- Unused Scheduling (Fiber and Threads)
+- Unused Compliant mode, delays for Agent (Agent is configured with an agent string)
+- Unused MechanizeActions
+
 ## 0.8.2 - 2025-05-07
 
 * Ignore blank dates supplied when validating rather than complain they are not valid
data/README.md
CHANGED
@@ -3,43 +3,6 @@ ScraperUtils (Ruby)
 
 Utilities to help make planningalerts scrapers, especially multis, easier to develop, run and debug.
 
-For Server Administrators
--------------------------
-
-The ScraperUtils library is designed to be a respectful citizen of the web. If you're a server administrator and notice
-our scraper accessing your systems, here's what you should know:
-
-### We play nice with your servers
-
-Our goal is to access public planning information with minimal impact on your services. The following features are on by
-default:
-
-- **Limit server load**:
-  - We limit the max load we present to your server to less than a half of one of your cpu cores
-  - The more loaded your server is, the longer we wait between requests!
-  - We respect Crawl-delay from robots.txt (see section below), so you can tell us an acceptable rate
-  - Scarper developers can
-    - reduce the max_load we present to your server even further
-    - add random extra delays to give your server a chance to catch up with background tasks
-
-- **Identify themselves**: Our user agent clearly indicates who we are and provides a link to the project repository:
-  `Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)`
-
-### How to Control Our Behavior
-
-Our scraper utilities respect the standard server **robots.txt** control mechanisms (by default).
-To control our access:
-
-- Add a section for our user agent: `User-agent: ScraperUtils`
-- Set a crawl delay, eg: `Crawl-delay: 20`
-- If needed specify disallowed paths: `Disallow: /private/`
-
-For Scraper Developers
-----------------------
-
-We provide utilities to make developing, running and debugging your scraper easier in addition to the base utilities
-mentioned above.
-
 ## Installation & Configuration
 
 Add to [your scraper's](https://www.planningalerts.org.au/how_to_write_a_scraper) Gemfile:
@@ -57,27 +20,9 @@ see {file:docs/getting_started.md Getting Started guide}
 ### Well-Behaved Web Client
 
 - Configure Mechanize agents with sensible defaults
-- Automatic rate limiting based on server response times
-- Supports robots.txt and crawl-delay directives
 - Supports extra actions required to get to results page
 - {file:docs/mechanize_utilities.md Learn more about Mechanize utilities}
 
-### Optimize Server Load
-
-- Intelligent date range selection (reduce server load by up to 60%)
-- Cycle utilities for rotating search parameters
-- {file:docs/reducing_server_load.md Learn more about reducing server load}
-
-### Improve Scraper Efficiency
-
-- Interleaves requests to optimize run time
-- {file:docs/interleaving_requests.md Learn more about interleaving requests}
-- Use {ScraperUtils::Scheduler.execute_request} so Mechanize network requests will be performed by threads in parallel
-- {file:docs/parallel_requests.md Parallel Request} - see Usage section for installation instructions
-- Randomize processing order for more natural request patterns
-- {file:docs/randomizing_requests.md Learn more about randomizing requests} - see Usage section for installation
-  instructions
-
 ### Error Handling & Quality Monitoring
 
 - Record-level error handling with appropriate thresholds
@@ -90,6 +35,13 @@ see {file:docs/getting_started.md Getting Started guide}
 - Simple logging with authority context
 - {file:docs/debugging.md Learn more about debugging}
 
+### Spec Validation
+
+- Validate geocodable addresses and reasonable descriptions
+- Check info URL availability and content
+- Support for bot protection detection
+- {file:docs/enhancing_specs.md Learn more about spec validation}
+
 ## API Documentation
 
 Complete API documentation is available at [scraper_utils | RubyDoc.info](https://rubydoc.info/gems/scraper_utils).
data/docs/enhancing_specs.md
CHANGED
@@ -1,9 +1,13 @@
 # Enhancing specs
 
-ScraperUtils provides
+ScraperUtils provides validation methods to help with checking scraping results:
 
-* `ScraperUtils::SpecSupport.geocodable?`
-* `ScraperUtils::SpecSupport.reasonable_description?`
+* `ScraperUtils::SpecSupport.geocodable?` - Check if an address is likely to be geocodable
+* `ScraperUtils::SpecSupport.reasonable_description?` - Check if a description looks reasonable
+* `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` - Validate percentage of geocodable addresses
+* `ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!` - Validate percentage of reasonable descriptions
+* `ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!` - Validate single global info_url usage and availability
+* `ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!` - Validate info_urls contain expected content
 
 ## Example Code:
 
@@ -14,9 +18,16 @@ require "timecop"
 require_relative "../scraper"
 
 RSpec.describe Scraper do
+  # Authorities that use bot protection (reCAPTCHA, Cloudflare, etc.) on detail pages (info_url)
+  AUTHORITIES_WITH_BOT_PROTECTION = %i[
+    some_authority
+    another_authority
+  ].freeze
+
   describe ".scrape" do
     def test_scrape(authority)
-
+      ScraperWiki.close_sqlite
+      FileUtils.rm_f("data.sqlite")
 
       VCR.use_cassette(authority) do
         date = Date.new(2025, 4, 15)
@@ -44,48 +55,20 @@ RSpec.describe Scraper do
 
       expect(results).to eq expected
 
-      geocodable
-
-
-
-
-
-
-
-
-
-
-
-
-
-      selected
-      end
-      puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
-           "(#{(100.0 * descriptions / results.count).round(1)}%)"
-      expect(descriptions).to be > (0.55 * results.count)
-
-      info_urls = results
-                    .map { |record| record["info_url"] }
-                    .uniq
-                    .count { |text| text.to_s.match(%r{\Ahttps?://}) }
-      puts "Found #{info_urls} out of #{results.count} unique info_urls " \
-           "(#{(100.0 * info_urls / results.count).round(1)}%)"
-      expect(info_urls).to be > (0.7 * results.count) if info_urls != 1
-
-      VCR.use_cassette("#{authority}.info_urls") do
-        results.each do |record|
-          info_url = record["info_url"]
-          puts "Checking info_url #{info_url} #{info_urls > 1 ? ' has expected details' : ''} ..."
-          response = Net::HTTP.get_response(URI(info_url))
-
-          expect(response.code).to eq("200")
-          # If info_url is the same for all records, then it won't have details
-          break if info_urls == 1
-
-          expect(response.body).to include(record["council_reference"])
-          expect(response.body).to include(record["address"])
-          expect(response.body).to include(record["description"])
-        end
+      # Validate addresses are geocodable (70% minimum with 3 records variation)
+      ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 70, variation: 3)
+
+      # Validate descriptions are reasonable (55% minimum with 3 records variation)
+      ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 55, variation: 3)
+
+      # Validate info_urls based on authority configuration
+      global_info_url = Scraper::AUTHORITIES[authority][:info_url]
+      bot_check_expected = AUTHORITIES_WITH_BOT_PROTECTION.include?(authority)
+
+      if global_info_url
+        ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, global_info_url, bot_check_expected: bot_check_expected)
+      else
+        ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: bot_check_expected)
       end
     end
 
@@ -96,5 +79,61 @@ RSpec.describe Scraper do
     end
   end
 end
+```
+
+## Individual validation methods:
+
+### Address validation
+
+```ruby
+# Check if single address is geocodable
+ScraperUtils::SpecSupport.geocodable?('123 Smith Street, Sydney NSW 2000')
+
+# Validate percentage of geocodable addresses
+ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
+```
+
+### Description validation
+
+```ruby
+# Check if single description is reasonable
+ScraperUtils::SpecSupport.reasonable_description?('Construction of new building')
+
+# Validate percentage of reasonable descriptions
+ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
+```
+
+### Info URL validation
+
+```ruby
+# For authorities with global info_url
+ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, 'https://example.com/search')
+
+# For authorities with bot protection
+ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, 'https://example.com/search', bot_check_expected: true)
+
+# For authorities with unique info_urls per record
+ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3)
+
+# For authorities with unique info_urls that may have bot protection
+ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: true)
+```
+
+## Bot Protection Handling
+
+The `bot_check_expected` parameter allows validation methods to accept bot protection as valid responses:
+
+**When bot protection is detected:**
+
+- HTTP status codes: 403 (Forbidden), 429 (Too Many Requests)
+- Content indicators: "recaptcha", "cloudflare", "are you human", "bot detection", "security check", "verify you are
+  human", "access denied", "blocked", "captcha"
+
+**Usage patterns:**
+
+- Set `bot_check_expected: false` (default) - requires 200 responses, fails on bot protection
+- Set `bot_check_expected: true` - accepts bot protection as valid, useful for authorities known to use anti-bot
+  measures
 
-
+All validation methods accept `percentage` (minimum percentage required) and `variation` (additional tolerance)
+parameters for consistent configuration.
data/docs/example_custom_Rakefile
ADDED

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+# Remember to add scraper_utils to Gemfile!
+
+require "fileutils"
+
+desc "Run scraper and validate results"
+task :test do
+  Rake::Task[:scrape].invoke
+  Rake::Task[:validate].invoke
+end
+
+desc "Run the scraper"
+task :scrape do
+  puts "Running scraper..."
+  FileUtils.rm_f("data.sqlite")
+
+  system("bundle exec ruby scraper.rb") || abort("Scraper failed")
+end
+
+desc "Validate scraped data"
+task :validate do
+  puts "Validating scraped data..."
+
+  unless File.exist?("data.sqlite")
+    abort("No data.sqlite found - run scraper first")
+  end
+
+  system("validate_scraper_data") || abort("Validation failed")
+end
+
+desc "Clean up generated files"
+task :clean do
+  FileUtils.rm_f("data.sqlite")
+  puts "Cleaned up data.sqlite"
+end
+
+task default: :test
data/docs/example_dot_scraper_validation.yml
ADDED

@@ -0,0 +1,23 @@
+# Scraper validation configuration
+# Copy this to .scraper_validation.yml and customize as needed
+
+# Database settings
+database: "data.sqlite"
+
+# Address validation (percentage of addresses that should be geocodable)
+geocodable_percentage: 70
+geocodable_variation: 3
+
+# Description validation (percentage of descriptions that should be reasonable)
+description_percentage: 55
+description_variation: 3
+
+# Info URL validation (percentage of detail checks that should pass)
+info_url_percentage: 75
+info_url_variation: 3
+
+# Bot protection (set to true if info URLs use reCAPTCHA/Cloudflare)
+bot_check_expected: false
+
+# Global info URL (leave blank for auto-detection)
+# global_info_url: "https://example.com/search"
data/docs/mechanize_utilities.md
CHANGED
@@ -36,9 +36,6 @@ Note - compliant mode forces max_load to be set to a value no greater than 50.
 ```ruby
 ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
   config.default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
-  config.default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-  config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
-  config.default_max_load = ENV.fetch('MORPH_MAX_LOAD',DEFAULT_MAX_LOAD).to_f # 50.0
   config.default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
   config.default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
   config.default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
data/docs/testing_custom_scrapers.md
ADDED

@@ -0,0 +1,74 @@
+# Testing Custom Scrapers
+
+Custom scrapers often lack proper specs but still need quality validation. This gem provides a simple executable to
+validate your scraped data.
+
+## Setup
+
+Add to your `Gemfile`:
+
+```ruby
+gem "scraper_utils", "~> 0.8.2"
+gem "rake", "~> 13.0"
+```
+
+Copy the example files to your scraper directory:
+
+- `docs/example_custom_Rakefile` → `Rakefile`
+- `docs/example_dot_scraper_validation.yml` → `.scraper_validation.yml`
+
+## Usage
+
+### Command Line
+
+Once you have run the scraper (`bundle exec ruby scraper.rb`) you can manually run the command:
+
+```bash
+# Run with defaults
+validate_scraper_data
+
+# Custom thresholds
+validate_scraper_data --geocodable-percentage 80 --description-percentage 60
+
+# For scrapers with bot protection
+validate_scraper_data --bot-check-expected
+```
+
+### Rake Tasks
+
+```bash
+bundle exec rake          # Run scraper then validate (default)
+bundle exec rake test     # Same as above
+bundle exec rake scrape   # Just run scraper
+bundle exec rake validate # Just validate existing data
+bundle exec rake clean    # Remove data.sqlite
+```
+
+## Configuration
+
+Edit `.scraper_validation.yml` to set validation thresholds:
+
+```yaml
+geocodable_percentage: 70   # Min % of geocodable addresses
+description_percentage: 55  # Min % of reasonable descriptions
+info_url_percentage: 75     # Min % of info URL detail checks
+bot_check_expected: false   # Set true when detail pages have reCAPTCHA/Cloudflare bot checks
+```
+
+## What Gets Validated
+
+- **Addresses**: Checks for proper Australian address format (street type, state, postcode)
+- **Descriptions**: Ensures descriptions aren't placeholders and have 3+ words
+- **Info URLs**: Validates URLs return status 200 and the page contains expected details
+- **Global URLs**: Auto-detects if all records use the same info URL and validates it's accessible
+
+## Integration with CI
+
+Add to your CI pipeline:
+
+```bash
+bundle install
+rake test
+```
+
+The validation will fail with clear error messages if data quality is insufficient, helping catch scraping issues early.
data/exe/validate_scraper_data
ADDED

@@ -0,0 +1,150 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "optparse"
+require "sqlite3"
+require "yaml"
+require "scraper_utils"
+
+# Default validation options
+options = {
+  database: "data.sqlite",
+  geocodable_percentage: 50,
+  geocodable_variation: 3,
+  description_percentage: 50,
+  description_variation: 3,
+  info_url_percentage: 75,
+  info_url_variation: 3,
+  bot_check_expected: false,
+  global_info_url: nil
+}
+
+# Load config file if it exists
+config_file = ".scraper_validation.yml"
+if File.exist?(config_file)
+  begin
+    config = YAML.safe_load(File.read(config_file), symbolize_names: true)
+    options.merge!(config) if config
+    puts "Loaded config from #{config_file}"
+  rescue => e
+    puts "Warning: Could not load #{config_file}: #{e.message}"
+  end
+end
+
+OptionParser.new do |opts|
+  opts.banner = "Usage: validate_scraper_data [options]"
+
+  opts.on("-d", "--database PATH", "SQLite database path (default: data.sqlite)") do |db|
+    options[:database] = db
+  end
+
+  opts.on("-g", "--geocodable-percentage N", Integer, "Min percentage of geocodable addresses (default: 50)") do |n|
+    options[:geocodable_percentage] = n
+  end
+
+  opts.on("-r", "--description-percentage N", Integer, "Min percentage of reasonable descriptions (default: 50)") do |n|
+    options[:description_percentage] = n
+  end
+
+  opts.on("-u", "--info-url-percentage N", Integer, "Min percentage for info URL validation (default: 75)") do |n|
+    options[:info_url_percentage] = n
+  end
+
+  opts.on("-v", "--variation N", Integer, "Variation tolerance for all validations (default: 3)") do |n|
+    options[:geocodable_variation] = n
+    options[:description_variation] = n
+    options[:info_url_variation] = n
+  end
+
+  opts.on("-b", "--bot-check-expected", "Expect bot protection on info URLs") do
+    options[:bot_check_expected] = true
+  end
+
+  opts.on("-i", "--global-info-url URL", "Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
+    options[:global_info_url] = url
+  end
+
+  opts.on("-c", "--config FILE", "Load config from YAML file (default: .scraper_validation.yml)") do |file|
+    config_file = file
+  end
+
+  opts.on("-h", "--help", "Show this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+# Check database exists
+unless File.exist?(options[:database])
+  puts "Error: Database file '#{options[:database]}' not found"
+  exit 1
+end
+
+# Read data from SQLite
+begin
+  db = SQLite3::Database.new(options[:database])
+  db.results_as_hash = true
+  results = db.execute("SELECT * FROM data ORDER BY council_reference")
+  db.close
+rescue SQLite3::Exception => e
+  puts "Error reading database: #{e.message}"
+  exit 1
+end
+
+if results.empty?
+  puts "No data found in database"
+  exit 1
+end
+
+puts "Validating #{results.count} records from #{options[:database]}..."
+
+# Auto-detect global info URL if not specified
+if options[:global_info_url].nil?
+  info_urls = results.map { |record| record["info_url"] }.compact.uniq
+  if info_urls.size == 1
+    options[:global_info_url] = info_urls.first
+    puts "Auto-detected global info_url: #{options[:global_info_url]}"
+  end
+end
+
+puts
+
+begin
+  # Validate addresses are geocodable
+  ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(
+    results,
+    percentage: options[:geocodable_percentage],
+    variation: options[:geocodable_variation]
+  )
+
+  # Validate descriptions are reasonable
+  ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(
+    results,
+    percentage: options[:description_percentage],
+    variation: options[:description_variation]
+  )
+
+  # Validate info URLs
+  if options[:global_info_url]
+    ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(
+      results,
+      options[:global_info_url],
+      bot_check_expected: options[:bot_check_expected]
+    )
+  else
+    ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(
+      results,
+      percentage: options[:info_url_percentage],
+      variation: options[:info_url_variation],
+      bot_check_expected: options[:bot_check_expected]
+    )
+  end
+
+  puts
+  puts "✅ All validations passed!"
+
+rescue RuntimeError => e
+  puts
+  puts "❌ Validation failed: #{e.message}"
+  exit 1
+end