scraper_utils 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/.gitignore +39 -9
  3. data/CHANGELOG.md +34 -0
  4. data/README.md +7 -55
  5. data/docs/enhancing_specs.md +86 -47
  6. data/docs/example_custom_Rakefile +38 -0
  7. data/docs/example_dot_scraper_validation.yml +23 -0
  8. data/docs/mechanize_utilities.md +0 -3
  9. data/docs/testing_custom_scrapers.md +74 -0
  10. data/exe/validate_scraper_data +150 -0
  11. data/lib/scraper_utils/log_utils.rb +5 -5
  12. data/lib/scraper_utils/maths_utils.rb +23 -0
  13. data/lib/scraper_utils/mechanize_utils/agent_config.rb +9 -65
  14. data/lib/scraper_utils/mechanize_utils.rb +0 -2
  15. data/lib/scraper_utils/spec_support.rb +189 -6
  16. data/lib/scraper_utils/version.rb +1 -1
  17. data/lib/scraper_utils.rb +1 -5
  18. data/scraper_utils.gemspec +1 -0
  19. metadata +11 -24
  20. data/docs/example_scrape_with_fibers.rb +0 -31
  21. data/docs/fibers_and_threads.md +0 -72
  22. data/docs/interleaving_requests.md +0 -33
  23. data/docs/parallel_requests.md +0 -138
  24. data/docs/randomizing_requests.md +0 -38
  25. data/docs/reducing_server_load.md +0 -63
  26. data/lib/scraper_utils/cycle_utils.rb +0 -26
  27. data/lib/scraper_utils/date_range_utils.rb +0 -118
  28. data/lib/scraper_utils/mechanize_actions.rb +0 -183
  29. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +0 -80
  30. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +0 -151
  31. data/lib/scraper_utils/randomize_utils.rb +0 -37
  32. data/lib/scraper_utils/scheduler/constants.rb +0 -12
  33. data/lib/scraper_utils/scheduler/operation_registry.rb +0 -101
  34. data/lib/scraper_utils/scheduler/operation_worker.rb +0 -199
  35. data/lib/scraper_utils/scheduler/process_request.rb +0 -59
  36. data/lib/scraper_utils/scheduler/thread_request.rb +0 -51
  37. data/lib/scraper_utils/scheduler/thread_response.rb +0 -59
  38. data/lib/scraper_utils/scheduler.rb +0 -286
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 5103f6e00bdb69c631084a1c6bc939bb7380b0dcd342e21c61eb4b4756482aa9
- data.tar.gz: '068e221220371c3e27379ccccd7f784db108985e02273da92eca9d1d94b90620'
+ metadata.gz: 858cdc2f2f55b23ea10a09629ff347b2a6833c8efd1f89851168db7ffb46d9f8
+ data.tar.gz: f6d17f7a327dd3698b12e4da7772037e58cb76201d3271def86f195a68e97b03
  SHA512:
- metadata.gz: f6b04d20acfe5944f24b1c3a86816d817088d85c2bb208699eee0d88da1f9927d645d84ed3170483843cb8d34f09d06db18982f660b31392853b696a4c3239ed
- data.tar.gz: 0ab144cb6f9b9da5df3476f441b3701e254ddf18b264a181394da7e535a53ec234cfb6f75d2b50a0b0557fdb435b663427356178adee8505c5e2633f341d8676
+ metadata.gz: 0605eaa66aa317a0076d7c87ebdb883e274bd0fd44fd53ab28219a09a963c78decd0a086368e5b0e01748c0c67cd9ef4a84b48274794ad6bb5ea6e9dcdff4745
+ data.tar.gz: 511175ebaed815b6b851f804029b33c8ebbe553623d63556d78bb49f5be1fe4d52e5d9a78f27a75f3423ea90df7aad244acb43415b3990f86f78a9986438e21c
data/.gitignore CHANGED
@@ -1,9 +1,44 @@
+ # Scraper specific example
+
+ # Ignore output of scraper
+ data.sqlite
+
+ # Don't save limited scraper runs
+ /spec/cassettes/limited_to_*
+ /spec/expected/limited_to_*
+
+ # ---------------------------
+ # Gem ignores
  *.gem
+ /pkg/
+ /InstalledFiles
+
+ # REMOVE PARAGRAPH IF USING WITH SCRAPERS:
+ # for a library or gem, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ Gemfile.lock
+ .ruby-version
+ .ruby-gemset
+
+ # ---------------------------
+ # General ignore
+
+ # rspec failure tracking
+ .rspec_status
+ .release
+ .bundle
+ .profile.d
+ bin
+ vendor
+ tmp
+ .basher/bash
+ Procfile
+
+ # Spec coverage
+ /coverage/
+
  *.rbc
  /.config
- /coverage/
- /InstalledFiles
- /pkg/
  /spec/reports/
  /spec/examples.txt
  /test/tmp/
@@ -64,12 +99,6 @@ build-iPhoneSimulator/
  /vendor/bundle
  /lib/bundler/man/

- # for a library or gem, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- Gemfile.lock
- .ruby-version
- .ruby-gemset
-
  # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
  .rvmrc

@@ -78,3 +107,4 @@ Gemfile.lock

  # rspec reports
  /.rspec_status
+
data/CHANGELOG.md CHANGED
@@ -1,5 +1,39 @@
  # Changelog

+ ## 0.9.0 - 2025-07-01
+
+ **Significant cleanup - removed code we ended up not using as none of the councils are actually concerned about server load**
+
+ * Refactored example code into simple callable methods
+ * Expand test for geocodable addresses to accept a comma between postcode and state at the end of the address.
+
+ ### Added
+ - `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` - validates percentage of geocodable addresses
+ - `ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!` - validates percentage of reasonable descriptions
+ - `ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!` - validates single global info_url usage and availability
+ - `ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!` - validates info_urls contain expected content
+ - `ScraperUtils::MathsUtils.fibonacci_series` - generates fibonacci sequence up to max value
+ - `bot_check_expected` parameter to info_url validation methods for handling reCAPTCHA/Cloudflare protection
+
+ ### Fixed
+ - Typo in `geocodable?` method debug output (`has_suburb_stats` → `has_suburb_states`)
+ - Code example in `docs/enhancing_specs.md`
+
+ ### Updated
+ - Code example in `docs/enhancing_specs.md` to reflect new support methods
+ - `geocodable?` test is simpler - it requires
+ - a street type, unit or lot,
+ - an uppercase word (assumed to be a suburb) or postcode and
+ - a state
+
+ ### Removed
+ - Unused CycleUtils
+ - Unused DateRangeUtils
+ - Unused RandomizeUtils
+ - Unused Scheduling (Fiber and Threads)
+ - Unused Compliant mode, delays for Agent (Agent is configured with an agent string)
+ - Unused MechanizeActions
+
  ## 0.8.2 - 2025-05-07

  * Ignore blank dates supplied when validating rather than complain they are not valid
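As a rough illustration of the new `ScraperUtils::MathsUtils.fibonacci_series` helper listed above - the argument and return value are assumed from the changelog description ("generates fibonacci sequence up to max value") and are not confirmed by this diff:

```ruby
require "scraper_utils"

# Hypothetical usage; the exact signature and return value are assumed, not confirmed here.
series = ScraperUtils::MathsUtils.fibonacci_series(20)
puts series.inspect # expected: the Fibonacci numbers up to 20, e.g. [1, 1, 2, 3, 5, 8, 13]
```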
data/README.md CHANGED
@@ -3,43 +3,6 @@ ScraperUtils (Ruby)

  Utilities to help make planningalerts scrapers, especially multis, easier to develop, run and debug.

- For Server Administrators
- -------------------------
-
- The ScraperUtils library is designed to be a respectful citizen of the web. If you're a server administrator and notice
- our scraper accessing your systems, here's what you should know:
-
- ### We play nice with your servers
-
- Our goal is to access public planning information with minimal impact on your services. The following features are on by
- default:
-
- - **Limit server load**:
- - We limit the max load we present to your server to less than a half of one of your cpu cores
- - The more loaded your server is, the longer we wait between requests!
- - We respect Crawl-delay from robots.txt (see section below), so you can tell us an acceptable rate
- - Scarper developers can
- - reduce the max_load we present to your server even further
- - add random extra delays to give your server a chance to catch up with background tasks
-
- - **Identify themselves**: Our user agent clearly indicates who we are and provides a link to the project repository:
- `Mozilla/5.0 (compatible; ScraperUtils/0.2.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)`
-
- ### How to Control Our Behavior
-
- Our scraper utilities respect the standard server **robots.txt** control mechanisms (by default).
- To control our access:
-
- - Add a section for our user agent: `User-agent: ScraperUtils`
- - Set a crawl delay, eg: `Crawl-delay: 20`
- - If needed specify disallowed paths: `Disallow: /private/`
-
- For Scraper Developers
- ----------------------
-
- We provide utilities to make developing, running and debugging your scraper easier in addition to the base utilities
- mentioned above.
-
  ## Installation & Configuration

  Add to [your scraper's](https://www.planningalerts.org.au/how_to_write_a_scraper) Gemfile:
@@ -57,27 +20,9 @@ see {file:docs/getting_started.md Getting Started guide}
  ### Well-Behaved Web Client

  - Configure Mechanize agents with sensible defaults
- - Automatic rate limiting based on server response times
- - Supports robots.txt and crawl-delay directives
  - Supports extra actions required to get to results page
  - {file:docs/mechanize_utilities.md Learn more about Mechanize utilities}

- ### Optimize Server Load
-
- - Intelligent date range selection (reduce server load by up to 60%)
- - Cycle utilities for rotating search parameters
- - {file:docs/reducing_server_load.md Learn more about reducing server load}
-
- ### Improve Scraper Efficiency
-
- - Interleaves requests to optimize run time
- - {file:docs/interleaving_requests.md Learn more about interleaving requests}
- - Use {ScraperUtils::Scheduler.execute_request} so Mechanize network requests will be performed by threads in parallel
- - {file:docs/parallel_requests.md Parallel Request} - see Usage section for installation instructions
- - Randomize processing order for more natural request patterns
- - {file:docs/randomizing_requests.md Learn more about randomizing requests} - see Usage section for installation
- instructions
-
  ### Error Handling & Quality Monitoring

  - Record-level error handling with appropriate thresholds
@@ -90,6 +35,13 @@ see {file:docs/getting_started.md Getting Started guide}
  - Simple logging with authority context
  - {file:docs/debugging.md Learn more about debugging}

+ ### Spec Validation
+
+ - Validate geocodable addresses and reasonable descriptions
+ - Check info URL availability and content
+ - Support for bot protection detection
+ - {file:docs/enhancing_specs.md Learn more about spec validation}
+
  ## API Documentation

  Complete API documentation is available at [scraper_utils | RubyDoc.info](https://rubydoc.info/gems/scraper_utils).
data/docs/enhancing_specs.md CHANGED
@@ -1,9 +1,13 @@
  # Enhancing specs

- ScraperUtils provides two methods to help with checking results
+ ScraperUtils provides validation methods to help with checking scraping results:

- * `ScraperUtils::SpecSupport.geocodable?`
- * `ScraperUtils::SpecSupport.reasonable_description?`
+ * `ScraperUtils::SpecSupport.geocodable?` - Check if an address is likely to be geocodable
+ * `ScraperUtils::SpecSupport.reasonable_description?` - Check if a description looks reasonable
+ * `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` - Validate percentage of geocodable addresses
+ * `ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!` - Validate percentage of reasonable descriptions
+ * `ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!` - Validate single global info_url usage and availability
+ * `ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!` - Validate info_urls contain expected content

  ## Example Code:

@@ -14,9 +18,16 @@ require "timecop"
  require_relative "../scraper"

  RSpec.describe Scraper do
+ # Authorities that use bot protection (reCAPTCHA, Cloudflare, etc.) on detail pages (info_url)
+ AUTHORITIES_WITH_BOT_PROTECTION = %i[
+ some_authority
+ another_authority
+ ].freeze
+
  describe ".scrape" do
  def test_scrape(authority)
- File.delete("./data.sqlite") if File.exist?("./data.sqlite")
+ ScraperWiki.close_sqlite
+ FileUtils.rm_f("data.sqlite")

  VCR.use_cassette(authority) do
  date = Date.new(2025, 4, 15)
@@ -44,48 +55,20 @@ RSpec.describe Scraper do

  expect(results).to eq expected

- geocodable = results
- .map { |record| record["address"] }
- .uniq
- .count { |text| SpecHelper.geocodable? text }
- puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
- "(#{(100.0 * geocodable / results.count).round(1)}%)"
- expect(geocodable).to be > (0.7 * results.count)
-
- descriptions = results
- .map { |record| record["description"] }
- .uniq
- .count do |text|
- selected = SpecHelper.reasonable_description? text
- puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
- selected
- end
- puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
- "(#{(100.0 * descriptions / results.count).round(1)}%)"
- expect(descriptions).to be > (0.55 * results.count)
-
- info_urls = results
- .map { |record| record["info_url"] }
- .uniq
- .count { |text| text.to_s.match(%r{\Ahttps?://}) }
- puts "Found #{info_urls} out of #{results.count} unique info_urls " \
- "(#{(100.0 * info_urls / results.count).round(1)}%)"
- expect(info_urls).to be > (0.7 * results.count) if info_urls != 1
-
- VCR.use_cassette("#{authority}.info_urls") do
- results.each do |record|
- info_url = record["info_url"]
- puts "Checking info_url #{info_url} #{info_urls > 1 ? ' has expected details' : ''} ..."
- response = Net::HTTP.get_response(URI(info_url))
-
- expect(response.code).to eq("200")
- # If info_url is the same for all records, then it won't have details
- break if info_urls == 1
-
- expect(response.body).to include(record["council_reference"])
- expect(response.body).to include(record["address"])
- expect(response.body).to include(record["description"])
- end
+ # Validate addresses are geocodable (70% minimum with 3 records variation)
+ ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 70, variation: 3)
+
+ # Validate descriptions are reasonable (55% minimum with 3 records variation)
+ ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 55, variation: 3)
+
+ # Validate info_urls based on authority configuration
+ global_info_url = Scraper::AUTHORITIES[authority][:info_url]
+ bot_check_expected = AUTHORITIES_WITH_BOT_PROTECTION.include?(authority)
+
+ if global_info_url
+ ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, global_info_url, bot_check_expected: bot_check_expected)
+ else
+ ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: bot_check_expected)
  end
  end

@@ -96,5 +79,61 @@ RSpec.describe Scraper do
  end
  end
  end
+ ```
+
+ ## Individual validation methods:
+
+ ### Address validation
+
+ ```ruby
+ # Check if single address is geocodable
+ ScraperUtils::SpecSupport.geocodable?('123 Smith Street, Sydney NSW 2000')
+
+ # Validate percentage of geocodable addresses
+ ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
+ ```
+
+ ### Description validation
+
+ ```ruby
+ # Check if single description is reasonable
+ ScraperUtils::SpecSupport.reasonable_description?('Construction of new building')
+
+ # Validate percentage of reasonable descriptions
+ ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
+ ```
+
+ ### Info URL validation
+
+ ```ruby
+ # For authorities with global info_url
+ ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, 'https://example.com/search')
+
+ # For authorities with bot protection
+ ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(results, 'https://example.com/search', bot_check_expected: true)
+
+ # For authorities with unique info_urls per record
+ ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3)
+
+ # For authorities with unique info_urls that may have bot protection
+ ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: true)
+ ```
+
+ ## Bot Protection Handling
+
+ The `bot_check_expected` parameter allows validation methods to accept bot protection as valid responses:
+
+ **When bot protection is detected:**
+
+ - HTTP status codes: 403 (Forbidden), 429 (Too Many Requests)
+ - Content indicators: "recaptcha", "cloudflare", "are you human", "bot detection", "security check", "verify you are
+ human", "access denied", "blocked", "captcha"
+
+ **Usage patterns:**
+
+ - Set `bot_check_expected: false` (default) - requires 200 responses, fails on bot protection
+ - Set `bot_check_expected: true` - accepts bot protection as valid, useful for authorities known to use anti-bot
+ measures

- ```
+ All validation methods accept `percentage` (minimum percentage required) and `variation` (additional tolerance)
+ parameters for consistent configuration.
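The detection criteria above can be sketched as follows - this is only an illustration of the described behaviour, not the gem's actual implementation, and the method and constant names here are hypothetical:

```ruby
require "net/http"

# Hypothetical sketch of the bot-protection criteria described above.
BOT_PROTECTION_INDICATORS = [
  "recaptcha", "cloudflare", "are you human", "bot detection",
  "security check", "verify you are human", "access denied", "blocked", "captcha"
].freeze

def looks_like_bot_protection?(response)
  # 403 and 429 responses are treated as bot protection
  return true if %w[403 429].include?(response.code)

  # Otherwise look for the content indicators listed above
  body = response.body.to_s.downcase
  BOT_PROTECTION_INDICATORS.any? { |indicator| body.include?(indicator) }
end

response = Net::HTTP.get_response(URI("https://example.com/search"))
puts "Bot protection detected" if looks_like_bot_protection?(response)
```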
data/docs/example_custom_Rakefile ADDED
@@ -0,0 +1,38 @@
+ # frozen_string_literal: true
+
+ # Remember to add scraper_utils to Gemfile!
+
+ require "fileutils"
+
+ desc "Run scraper and validate results"
+ task :test do
+ Rake::Task[:scrape].invoke
+ Rake::Task[:validate].invoke
+ end
+
+ desc "Run the scraper"
+ task :scrape do
+ puts "Running scraper..."
+ FileUtils.rm_f("data.sqlite")
+
+ system("bundle exec ruby scraper.rb") || abort("Scraper failed")
+ end
+
+ desc "Validate scraped data"
+ task :validate do
+ puts "Validating scraped data..."
+
+ unless File.exist?("data.sqlite")
+ abort("No data.sqlite found - run scraper first")
+ end
+
+ system("validate_scraper_data") || abort("Validation failed")
+ end
+
+ desc "Clean up generated files"
+ task :clean do
+ FileUtils.rm_f("data.sqlite")
+ puts "Cleaned up data.sqlite"
+ end
+
+ task default: :test
data/docs/example_dot_scraper_validation.yml ADDED
@@ -0,0 +1,23 @@
+ # Scraper validation configuration
+ # Copy this to .scraper_validation.yml and customize as needed
+
+ # Database settings
+ database: "data.sqlite"
+
+ # Address validation (percentage of addresses that should be geocodable)
+ geocodable_percentage: 70
+ geocodable_variation: 3
+
+ # Description validation (percentage of descriptions that should be reasonable)
+ description_percentage: 55
+ description_variation: 3
+
+ # Info URL validation (percentage of detail checks that should pass)
+ info_url_percentage: 75
+ info_url_variation: 3
+
+ # Bot protection (set to true if info URLs use reCAPTCHA/Cloudflare)
+ bot_check_expected: false
+
+ # Global info URL (leave blank for auto-detection)
+ # global_info_url: "https://example.com/search"
data/docs/mechanize_utilities.md CHANGED
@@ -36,9 +36,6 @@ Note - compliant mode forces max_load to be set to a value no greater than 50.
  ```ruby
  ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
  config.default_timeout = ENV.fetch('MORPH_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
- config.default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
- config.default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
- config.default_max_load = ENV.fetch('MORPH_MAX_LOAD',DEFAULT_MAX_LOAD).to_f # 50.0
  config.default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
  config.default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
  config.default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
data/docs/testing_custom_scrapers.md ADDED
@@ -0,0 +1,74 @@
+ # Testing Custom Scrapers
+
+ Custom scrapers often lack proper specs but still need quality validation. This gem provides a simple executable to
+ validate your scraped data.
+
+ ## Setup
+
+ Add to your `Gemfile`:
+
+ ```ruby
+ gem "scraper_utils", "~> 0.8.2"
+ gem "rake", "~> 13.0"
+ ```
+
+ Copy the example files to your scraper directory:
+
+ - `docs/example_custom_Rakefile` → `Rakefile`
+ - `docs/example_dot_scraper_validation.yml` → `.scraper_validation.yml`
+
+ ## Usage
+
+ ### Command Line
+
+ Once you have run the scraper (`bundle exec ruby scraper.rb`) you can manually run the command:
+
+ ```bash
+ # Run with defaults
+ validate_scraper_data
+
+ # Custom thresholds
+ validate_scraper_data --geocodable-percentage 80 --description-percentage 60
+
+ # For scrapers with bot protection
+ validate_scraper_data --bot-check-expected
+ ```
+
+ ### Rake Tasks
+
+ ```bash
+ bundle exec rake # Run scraper then validate (default)
+ bundle exec rake test # Same as above
+ bundle exec rake scrape # Just run scraper
+ bundle exec rake validate # Just validate existing data
+ bundle exec rake clean # Remove data.sqlite
+ ```
+
+ ## Configuration
+
+ Edit `.scraper_validation.yml` to set validation thresholds:
+
+ ```yaml
+ geocodable_percentage: 70 # Min % of geocodable addresses
+ description_percentage: 55 # Min % of reasonable descriptions
+ info_url_percentage: 75 # Min % of info URL detail checks
+ bot_check_expected: false # Set true when detail pages have reCAPTCHA/Cloudflare bot checks
+ ```
+
+ ## What Gets Validated
+
+ - **Addresses**: Checks for proper Australian address format (street type, state, postcode)
+ - **Descriptions**: Ensures descriptions aren't placeholders and have 3+ words
+ - **Info URLs**: Validates URLs return status 200 and the page contains expected details
+ - **Global URLs**: Auto-detects if all records use the same info URL and validates it's accessible
+
+ ## Integration with CI
+
+ Add to your CI pipeline:
+
+ ```bash
+ bundle install
+ rake test
+ ```
+
+ The validation will fail with clear error messages if data quality is insufficient, helping catch scraping issues early.
data/exe/validate_scraper_data ADDED
@@ -0,0 +1,150 @@
+ #!/usr/bin/env ruby
+ # frozen_string_literal: true
+
+ require "optparse"
+ require "sqlite3"
+ require "yaml"
+ require "scraper_utils"
+
+ # Default validation options
+ options = {
+ database: "data.sqlite",
+ geocodable_percentage: 50,
+ geocodable_variation: 3,
+ description_percentage: 50,
+ description_variation: 3,
+ info_url_percentage: 75,
+ info_url_variation: 3,
+ bot_check_expected: false,
+ global_info_url: nil
+ }
+
+ # Load config file if it exists
+ config_file = ".scraper_validation.yml"
+ if File.exist?(config_file)
+ begin
+ config = YAML.safe_load(File.read(config_file), symbolize_names: true)
+ options.merge!(config) if config
+ puts "Loaded config from #{config_file}"
+ rescue => e
+ puts "Warning: Could not load #{config_file}: #{e.message}"
+ end
+ end
+
+ OptionParser.new do |opts|
+ opts.banner = "Usage: validate_scraper_data [options]"
+
+ opts.on("-d", "--database PATH", "SQLite database path (default: data.sqlite)") do |db|
+ options[:database] = db
+ end
+
+ opts.on("-g", "--geocodable-percentage N", Integer, "Min percentage of geocodable addresses (default: 50)") do |n|
+ options[:geocodable_percentage] = n
+ end
+
+ opts.on("-r", "--description-percentage N", Integer, "Min percentage of reasonable descriptions (default: 50)") do |n|
+ options[:description_percentage] = n
+ end
+
+ opts.on("-u", "--info-url-percentage N", Integer, "Min percentage for info URL validation (default: 75)") do |n|
+ options[:info_url_percentage] = n
+ end
+
+ opts.on("-v", "--variation N", Integer, "Variation tolerance for all validations (default: 3)") do |n|
+ options[:geocodable_variation] = n
+ options[:description_variation] = n
+ options[:info_url_variation] = n
+ end
+
+ opts.on("-b", "--bot-check-expected", "Expect bot protection on info URLs") do
+ options[:bot_check_expected] = true
+ end
+
+ opts.on("-i", "--global-info-url URL", "Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
+ options[:global_info_url] = url
+ end
+
+ opts.on("-c", "--config FILE", "Load config from YAML file (default: .scraper_validation.yml)") do |file|
+ config_file = file
+ end
+
+ opts.on("-h", "--help", "Show this help") do
+ puts opts
+ exit
+ end
+ end.parse!
+
+ # Check database exists
+ unless File.exist?(options[:database])
+ puts "Error: Database file '#{options[:database]}' not found"
+ exit 1
+ end
+
+ # Read data from SQLite
+ begin
+ db = SQLite3::Database.new(options[:database])
+ db.results_as_hash = true
+ results = db.execute("SELECT * FROM data ORDER BY council_reference")
+ db.close
+ rescue SQLite3::Exception => e
+ puts "Error reading database: #{e.message}"
+ exit 1
+ end
+
+ if results.empty?
+ puts "No data found in database"
+ exit 1
+ end
+
+ puts "Validating #{results.count} records from #{options[:database]}..."
+
+ # Auto-detect global info URL if not specified
+ if options[:global_info_url].nil?
+ info_urls = results.map { |record| record["info_url"] }.compact.uniq
+ if info_urls.size == 1
+ options[:global_info_url] = info_urls.first
+ puts "Auto-detected global info_url: #{options[:global_info_url]}"
+ end
+ end
+
+ puts
+
+ begin
+ # Validate addresses are geocodable
+ ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(
+ results,
+ percentage: options[:geocodable_percentage],
+ variation: options[:geocodable_variation]
+ )
+
+ # Validate descriptions are reasonable
+ ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(
+ results,
+ percentage: options[:description_percentage],
+ variation: options[:description_variation]
+ )
+
+ # Validate info URLs
+ if options[:global_info_url]
+ ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(
+ results,
+ options[:global_info_url],
+ bot_check_expected: options[:bot_check_expected]
+ )
+ else
+ ScraperUtils::SpecSupport.validate_info_urls_have_expected_details!(
+ results,
+ percentage: options[:info_url_percentage],
+ variation: options[:info_url_variation],
+ bot_check_expected: options[:bot_check_expected]
+ )
+ end
+
+ puts
+ puts "✅ All validations passed!"
+
+ rescue RuntimeError => e
+ puts
+ puts "❌ Validation failed: #{e.message}"
+ exit 1
+ end