scraper_utils 0.14.1 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/docs/example_parallel_scraper.rb +16 -17
- data/docs/example_scraper.rb +10 -13
- data/docs/misc_utilities.md +0 -6
- data/exe/validate_scraper_data +13 -8
- data/lib/scraper_utils/data_quality_monitor.rb +9 -4
- data/lib/scraper_utils/db_utils.rb +6 -2
- data/lib/scraper_utils/debug_utils.rb +1 -2
- data/lib/scraper_utils/host_throttler.rb +13 -13
- data/lib/scraper_utils/log_utils.rb +18 -14
- data/lib/scraper_utils/maths_utils.rb +2 -1
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +19 -22
- data/lib/scraper_utils/misc_utils.rb +21 -1
- data/lib/scraper_utils/pa_validation.rb +10 -8
- data/lib/scraper_utils/spec_support.rb +159 -57
- data/lib/scraper_utils/version.rb +1 -1
- data/scraper_utils.gemspec +3 -3
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9a1001f794ef04c587bb726157c66fc637fbb8525bac1c5be93a138e7f0a8266
|
|
4
|
+
data.tar.gz: f92023b5362c6b64ae74d0bf43cf613b02849687a46ec7fbb6b51c4b7ad397dc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 88e952e952d59011018ca4721bde72d49c913beccccf098d62bb4d1313d0ca3bf94678ff27db5ba4cef3a674fefbebd067a5008e5f36a2029f2a9c8ac1689b15
|
|
7
|
+
data.tar.gz: 35601498d9d110d5d365aa7c1fddcfa74a86fde4b93537b44f8e00bb84f664ba455c642256c0032e221b484d986ea39b2d3ab743c94102b10c7bed1c397139d5
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.16.0 - 2026-04-08
|
|
4
|
+
* Use defaults from AgentConfig for `throttle_block`, and allow defaults to be overridden
|
|
5
|
+
|
|
6
|
+
## 0.15.0 - 2026-03-05
|
|
7
|
+
|
|
8
|
+
* Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
|
|
9
|
+
* Fix pre_connect_hook hostname extraction to use `request['Host']` header
|
|
10
|
+
|
|
3
11
|
## 0.14.1 - 2026-03-04
|
|
4
12
|
|
|
5
13
|
* Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
|
|
@@ -27,21 +27,20 @@ class Scraper
|
|
|
27
27
|
begin
|
|
28
28
|
ScraperUtils::DataQualityMonitor.start_authority(authority_label)
|
|
29
29
|
YourScraper.scrape(authority_label) do |record|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
unprocessable_record_details << [e, record]
|
|
37
|
-
end
|
|
30
|
+
record["authority_label"] = authority_label.to_s
|
|
31
|
+
ScraperUtils::DbUtils.save_record(record)
|
|
32
|
+
rescue ScraperUtils::UnprocessableRecord => e
|
|
33
|
+
# Log bad record but continue processing unless too many have occurred
|
|
34
|
+
ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
|
|
35
|
+
unprocessable_record_details << [e, record]
|
|
38
36
|
end
|
|
39
37
|
rescue StandardError => e
|
|
40
38
|
warn "#{authority_label}: ERROR: #{e}"
|
|
41
39
|
warn e.backtrace
|
|
42
40
|
fatal_exception = e
|
|
43
41
|
end
|
|
44
|
-
[authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details,
|
|
42
|
+
[authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details,
|
|
43
|
+
fatal_exception]
|
|
45
44
|
end
|
|
46
45
|
|
|
47
46
|
# Process authorities in parallel
|
|
@@ -54,7 +53,7 @@ class Scraper
|
|
|
54
53
|
scrape_authority(authority_label, attempt)
|
|
55
54
|
end.each do |authority_label, saves, unprocessable, fatal_exception|
|
|
56
55
|
# Runs in main process
|
|
57
|
-
status = fatal_exception ?
|
|
56
|
+
status = fatal_exception ? "FAILED" : "OK"
|
|
58
57
|
puts "Saving results of #{authority_label}: #{saves.size} records, #{unprocessable.size} unprocessable #{status}"
|
|
59
58
|
|
|
60
59
|
saves.each do |record|
|
|
@@ -65,11 +64,11 @@ class Scraper
|
|
|
65
64
|
exceptions[authority_label] = e
|
|
66
65
|
end
|
|
67
66
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
67
|
+
next unless fatal_exception
|
|
68
|
+
|
|
69
|
+
puts " Warning: #{authority_label} failed with: #{fatal_exception.message}"
|
|
70
|
+
puts " Saved #{saves.size} records before failure"
|
|
71
|
+
exceptions[authority_label] = fatal_exception
|
|
73
72
|
end
|
|
74
73
|
|
|
75
74
|
exceptions
|
|
@@ -96,7 +95,7 @@ class Scraper
|
|
|
96
95
|
unless exceptions.empty?
|
|
97
96
|
puts "\n***************************************************"
|
|
98
97
|
puts "Now retrying authorities which earlier had failures"
|
|
99
|
-
puts exceptions.keys.join(", ")
|
|
98
|
+
puts exceptions.keys.join(", ")
|
|
100
99
|
puts "***************************************************"
|
|
101
100
|
|
|
102
101
|
start_time = Time.now
|
|
@@ -118,7 +117,7 @@ end
|
|
|
118
117
|
if __FILE__ == $PROGRAM_NAME
|
|
119
118
|
ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
|
|
120
119
|
|
|
121
|
-
process_count = (ENV[
|
|
120
|
+
process_count = (ENV["MORPH_PROCESSES"] || (Etc.nprocessors * 2)).to_i
|
|
122
121
|
|
|
123
122
|
Scraper.run(Scraper.selected_authorities, process_count: process_count)
|
|
124
123
|
end
|
data/docs/example_scraper.rb
CHANGED
|
@@ -22,13 +22,11 @@ class Scraper
|
|
|
22
22
|
# REPLACE section with:
|
|
23
23
|
ScraperUtils::DataQualityMonitor.start_authority(authority_label)
|
|
24
24
|
YourScraper.scrape(authority_label) do |record|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
exceptions[authority_label] = e
|
|
31
|
-
end
|
|
25
|
+
record["authority_label"] = authority_label.to_s
|
|
26
|
+
ScraperUtils::DbUtils.save_record(record)
|
|
27
|
+
rescue ScraperUtils::UnprocessableRecord => e
|
|
28
|
+
ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
|
|
29
|
+
exceptions[authority_label] = e
|
|
32
30
|
end
|
|
33
31
|
# END OF REPLACE
|
|
34
32
|
rescue StandardError => e
|
|
@@ -61,7 +59,7 @@ class Scraper
|
|
|
61
59
|
puts "Now retrying authorities which earlier had failures"
|
|
62
60
|
puts exceptions.keys.join(", ")
|
|
63
61
|
puts "***************************************************"
|
|
64
|
-
ENV[
|
|
62
|
+
ENV["DEBUG"] ||= "1"
|
|
65
63
|
|
|
66
64
|
start_time = Time.now
|
|
67
65
|
exceptions = scrape(exceptions.keys, 2)
|
|
@@ -85,12 +83,11 @@ if __FILE__ == $PROGRAM_NAME
|
|
|
85
83
|
# some: url-for-issue Summary Reason
|
|
86
84
|
# councils: url-for-issue Summary Reason
|
|
87
85
|
|
|
88
|
-
if ENV[
|
|
89
|
-
default_expect_bad = {
|
|
90
|
-
|
|
91
|
-
puts 'Default EXPECT_BAD:', default_expect_bad.to_yaml if default_expect_bad.any?
|
|
86
|
+
if ENV["MORPH_EXPECT_BAD"].nil?
|
|
87
|
+
default_expect_bad = {}
|
|
88
|
+
puts "Default EXPECT_BAD:", default_expect_bad.to_yaml if default_expect_bad.any?
|
|
92
89
|
|
|
93
|
-
ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(
|
|
90
|
+
ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(",")
|
|
94
91
|
end
|
|
95
92
|
# If the sites have many unusable records - raise defaults
|
|
96
93
|
# ENV['MORPH_UNPROCESSABLE_BASE'] ||= "10"
|
data/docs/misc_utilities.md
CHANGED
|
@@ -18,11 +18,5 @@ The throttle automatically:
|
|
|
18
18
|
- Pauses before next request based on previous timing
|
|
19
19
|
- Caps pause at 120s maximum
|
|
20
20
|
|
|
21
|
-
Override the next pause duration manually if needed:
|
|
22
|
-
|
|
23
|
-
```ruby
|
|
24
|
-
ScraperUtils::MiscUtils.pause_duration = 2.0
|
|
25
|
-
```
|
|
26
|
-
|
|
27
21
|
**Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
|
|
28
22
|
each request is made and thus does not need to be wrapped with the helper.
|
data/exe/validate_scraper_data
CHANGED
|
@@ -26,7 +26,7 @@ if File.exist?(config_file)
|
|
|
26
26
|
config = YAML.safe_load(File.read(config_file), symbolize_names: true)
|
|
27
27
|
options.merge!(config) if config
|
|
28
28
|
puts "Loaded config from #{config_file}"
|
|
29
|
-
rescue => e
|
|
29
|
+
rescue StandardError => e
|
|
30
30
|
puts "Warning: Could not load #{config_file}: #{e.message}"
|
|
31
31
|
end
|
|
32
32
|
end
|
|
@@ -38,19 +38,23 @@ OptionParser.new do |opts|
|
|
|
38
38
|
options[:database] = db
|
|
39
39
|
end
|
|
40
40
|
|
|
41
|
-
opts.on("-g", "--geocodable-percentage N", Integer,
|
|
41
|
+
opts.on("-g", "--geocodable-percentage N", Integer,
|
|
42
|
+
"Min percentage of geocodable addresses (default: 50)") do |n|
|
|
42
43
|
options[:geocodable_percentage] = n
|
|
43
44
|
end
|
|
44
45
|
|
|
45
|
-
opts.on("-r", "--description-percentage N", Integer,
|
|
46
|
+
opts.on("-r", "--description-percentage N", Integer,
|
|
47
|
+
"Min percentage of reasonable descriptions (default: 50)") do |n|
|
|
46
48
|
options[:description_percentage] = n
|
|
47
49
|
end
|
|
48
50
|
|
|
49
|
-
opts.on("-u", "--info-url-percentage N", Integer,
|
|
51
|
+
opts.on("-u", "--info-url-percentage N", Integer,
|
|
52
|
+
"Min percentage for info URL validation (default: 75)") do |n|
|
|
50
53
|
options[:info_url_percentage] = n
|
|
51
54
|
end
|
|
52
55
|
|
|
53
|
-
opts.on("-v", "--variation N", Integer,
|
|
56
|
+
opts.on("-v", "--variation N", Integer,
|
|
57
|
+
"Variation tolerance for all validations (default: 3)") do |n|
|
|
54
58
|
options[:geocodable_variation] = n
|
|
55
59
|
options[:description_variation] = n
|
|
56
60
|
options[:info_url_variation] = n
|
|
@@ -60,11 +64,13 @@ OptionParser.new do |opts|
|
|
|
60
64
|
options[:bot_check_expected] = true
|
|
61
65
|
end
|
|
62
66
|
|
|
63
|
-
opts.on("-i", "--global-info-url URL",
|
|
67
|
+
opts.on("-i", "--global-info-url URL",
|
|
68
|
+
"Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
|
|
64
69
|
options[:global_info_url] = url
|
|
65
70
|
end
|
|
66
71
|
|
|
67
|
-
opts.on("-c", "--config FILE",
|
|
72
|
+
opts.on("-c", "--config FILE",
|
|
73
|
+
"Load config from YAML file (default: .scraper_validation.yml)") do |file|
|
|
68
74
|
config_file = file
|
|
69
75
|
end
|
|
70
76
|
|
|
@@ -142,7 +148,6 @@ begin
|
|
|
142
148
|
|
|
143
149
|
puts
|
|
144
150
|
puts "✅ All validations passed!"
|
|
145
|
-
|
|
146
151
|
rescue RuntimeError => e
|
|
147
152
|
puts
|
|
148
153
|
puts "❌ Validation failed: #{e.message}"
|
|
@@ -30,8 +30,13 @@ module ScraperUtils
|
|
|
30
30
|
# Initial base of 5.01 (override using MORPH_UNPROCESSABLE_BASE)
|
|
31
31
|
# Initial percentage of 10% (override using MORPH_UNPROCESSABLE_PERCENTAGE)
|
|
32
32
|
def self.threshold(authority_label)
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
if @stats&.fetch(
|
|
34
|
+
authority_label, nil
|
|
35
|
+
)
|
|
36
|
+
ENV.fetch("MORPH_UNPROCESSABLE_BASE", 5.01).to_f +
|
|
37
|
+
(@stats[authority_label][:saved].to_i * ENV.fetch("MORPH_UNPROCESSABLE_PERCENTAGE",
|
|
38
|
+
10.0).to_f / 100.0)
|
|
39
|
+
end
|
|
35
40
|
end
|
|
36
41
|
|
|
37
42
|
# Logs an unprocessable record and raises an exception if error threshold is exceeded
|
|
@@ -44,7 +49,7 @@ module ScraperUtils
|
|
|
44
49
|
def self.log_unprocessable_record(exception, record)
|
|
45
50
|
authority_label = extract_authority(record)
|
|
46
51
|
@stats[authority_label][:unprocessed] += 1
|
|
47
|
-
details = if record&.key?(
|
|
52
|
+
details = if record&.key?("council_reference") && record&.key?("address")
|
|
48
53
|
"#{record['council_reference']} - #{record['address']}"
|
|
49
54
|
else
|
|
50
55
|
record.inspect
|
|
@@ -64,7 +69,7 @@ module ScraperUtils
|
|
|
64
69
|
def self.log_saved_record(record)
|
|
65
70
|
authority_label = extract_authority(record)
|
|
66
71
|
@stats[authority_label][:saved] += 1
|
|
67
|
-
ScraperUtils::LogUtils.log "Saving record #{
|
|
72
|
+
ScraperUtils::LogUtils.log "Saving record #{"for #{authority_label}: " unless authority_label&.empty?}#{record['council_reference']} - #{record['address']}"
|
|
68
73
|
end
|
|
69
74
|
end
|
|
70
75
|
end
|
|
@@ -63,12 +63,16 @@ module ScraperUtils
|
|
|
63
63
|
LogUtils.log "Deleting #{deleted_count} applications scraped between #{oldest_date} and #{cutoff_date}"
|
|
64
64
|
ScraperWiki.sqliteexecute("DELETE FROM data WHERE date_scraped < ?", [cutoff_date])
|
|
65
65
|
|
|
66
|
-
|
|
66
|
+
unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
|
|
67
|
+
return
|
|
68
|
+
end
|
|
67
69
|
|
|
68
70
|
LogUtils.log " Running VACUUM to reclaim space..."
|
|
69
71
|
ScraperWiki.sqliteexecute("VACUUM")
|
|
70
72
|
rescue SqliteMagic::NoSuchTable => e
|
|
71
|
-
|
|
73
|
+
if ScraperUtils::DebugUtils.trace?
|
|
74
|
+
ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records"
|
|
75
|
+
end
|
|
72
76
|
end
|
|
73
77
|
end
|
|
74
78
|
end
|
|
@@ -18,7 +18,7 @@ module ScraperUtils
|
|
|
18
18
|
# Checks DEBUG and MORPH_DEBUG env variables
|
|
19
19
|
# @return [Integer] Debug level
|
|
20
20
|
def self.debug_level
|
|
21
|
-
debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR,
|
|
21
|
+
debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, "0"))
|
|
22
22
|
debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
|
|
23
23
|
end
|
|
24
24
|
|
|
@@ -48,7 +48,6 @@ module ScraperUtils
|
|
|
48
48
|
debug?(TRACE_LEVEL)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
-
|
|
52
51
|
# Logs details of an HTTP request when debug mode is enabled
|
|
53
52
|
#
|
|
54
53
|
# @param http_method [String] HTTP method (GET, POST, etc.)
|
|
@@ -21,10 +21,14 @@ module ScraperUtils
|
|
|
21
21
|
@crawl_delay = crawl_delay.to_f
|
|
22
22
|
# Clamp between 10 (delay 9x response) and 100 (no extra delay)
|
|
23
23
|
@max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
|
|
24
|
-
@next_request_at = {}
|
|
24
|
+
@next_request_at = {} # hostname => Time
|
|
25
25
|
@request_started_at = {} # hostname => Time
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
def will_pause_till(hostname)
|
|
29
|
+
@next_request_at[hostname]
|
|
30
|
+
end
|
|
31
|
+
|
|
28
32
|
# Sleep until this host's throttle window has elapsed.
|
|
29
33
|
# Records when the request actually started.
|
|
30
34
|
# @param hostname [String]
|
|
@@ -48,23 +52,19 @@ module ScraperUtils
|
|
|
48
52
|
response_time = Time.now - started
|
|
49
53
|
|
|
50
54
|
delay = @crawl_delay
|
|
51
|
-
if @max_load
|
|
52
|
-
delay += (100.0 - @max_load) * response_time / @max_load
|
|
53
|
-
end
|
|
55
|
+
delay += (100.0 - @max_load) * response_time / @max_load if @max_load
|
|
54
56
|
|
|
55
|
-
if overloaded
|
|
56
|
-
delay = delay + response_time * 2 + 5.0
|
|
57
|
-
end
|
|
57
|
+
delay = delay + (response_time * 2) + 5.0 if overloaded
|
|
58
58
|
|
|
59
59
|
delay = delay.round(3).clamp(0.0, MAX_DELAY)
|
|
60
60
|
@next_request_at[hostname] = Time.now + delay
|
|
61
61
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
62
|
+
return unless DebugUtils.basic?
|
|
63
|
+
|
|
64
|
+
msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
|
|
65
|
+
msg += " OVERLOADED" if overloaded
|
|
66
|
+
msg += ", Will delay #{delay}s before next request"
|
|
67
|
+
LogUtils.log(msg)
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
# Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
|
|
@@ -14,7 +14,7 @@ module ScraperUtils
|
|
|
14
14
|
# @param message [String] the message to log
|
|
15
15
|
# @return [void]
|
|
16
16
|
def self.log(message, authority = nil)
|
|
17
|
-
authority ||= ENV
|
|
17
|
+
authority ||= ENV.fetch("AUTHORITY", nil)
|
|
18
18
|
$stderr.flush
|
|
19
19
|
if authority
|
|
20
20
|
puts "[#{authority}] #{message}"
|
|
@@ -85,7 +85,7 @@ module ScraperUtils
|
|
|
85
85
|
failed
|
|
86
86
|
)
|
|
87
87
|
|
|
88
|
-
DbUtils
|
|
88
|
+
DbUtils.cleanup_old_records
|
|
89
89
|
end
|
|
90
90
|
|
|
91
91
|
# Extracts the first relevant line from backtrace that's from our project
|
|
@@ -104,15 +104,15 @@ module ScraperUtils
|
|
|
104
104
|
format = options[:format] || false
|
|
105
105
|
|
|
106
106
|
# Normalize the root directory path with a trailing slash
|
|
107
|
-
pwd = File.join(pwd,
|
|
107
|
+
pwd = File.join(pwd, "")
|
|
108
108
|
|
|
109
109
|
backtrace.each do |line|
|
|
110
|
-
next if line.include?(
|
|
111
|
-
line.include?(
|
|
112
|
-
line.include?(
|
|
110
|
+
next if line.include?("/gems/") ||
|
|
111
|
+
line.include?("/vendor/") ||
|
|
112
|
+
line.include?("/ruby/")
|
|
113
113
|
|
|
114
114
|
if line.start_with?(pwd)
|
|
115
|
-
relative_path = line.sub(pwd,
|
|
115
|
+
relative_path = line.sub(pwd, "")
|
|
116
116
|
return format ? " [#{relative_path}]" : relative_path
|
|
117
117
|
end
|
|
118
118
|
end
|
|
@@ -138,7 +138,7 @@ module ScraperUtils
|
|
|
138
138
|
puts "\nScraping Summary:"
|
|
139
139
|
summary_format = "%-20s %6s %6s %s"
|
|
140
140
|
|
|
141
|
-
puts format(summary_format,
|
|
141
|
+
puts format(summary_format, "Authority", "OK", "Bad", "Exception")
|
|
142
142
|
puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)
|
|
143
143
|
|
|
144
144
|
authorities.each do |authority|
|
|
@@ -149,7 +149,8 @@ module ScraperUtils
|
|
|
149
149
|
|
|
150
150
|
expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
|
|
151
151
|
exception_msg = if exceptions[authority]
|
|
152
|
-
location =
|
|
152
|
+
location = project_backtrace_line(exceptions[authority].backtrace,
|
|
153
|
+
format: true)
|
|
153
154
|
"#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
|
|
154
155
|
else
|
|
155
156
|
"-"
|
|
@@ -174,12 +175,12 @@ module ScraperUtils
|
|
|
174
175
|
|
|
175
176
|
# Check for authorities with unexpected errors
|
|
176
177
|
unexpected_errors = authorities
|
|
177
|
-
|
|
178
|
-
|
|
178
|
+
.select { |authority| exceptions[authority] }
|
|
179
|
+
.reject { |authority| expect_bad.include?(authority) }
|
|
179
180
|
|
|
180
181
|
if unexpected_errors.any?
|
|
181
182
|
errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
|
|
182
|
-
|
|
183
|
+
"(Add to MORPH_EXPECT_BAD?)"
|
|
183
184
|
unexpected_errors.each do |authority|
|
|
184
185
|
error = exceptions[authority]
|
|
185
186
|
errors << " #{authority}: #{error.class} - #{error}"
|
|
@@ -228,7 +229,8 @@ module ScraperUtils
|
|
|
228
229
|
# Moved to DbUtils
|
|
229
230
|
# :nocov:
|
|
230
231
|
def self.cleanup_old_records(force: false)
|
|
231
|
-
warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.",
|
|
232
|
+
warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.",
|
|
233
|
+
category: :deprecated
|
|
232
234
|
ScraperUtils::DbUtils.cleanup_old_records(force: force)
|
|
233
235
|
end
|
|
234
236
|
# :nocov:
|
|
@@ -239,7 +241,9 @@ module ScraperUtils
|
|
|
239
241
|
|
|
240
242
|
lines = []
|
|
241
243
|
error.backtrace.each do |line|
|
|
242
|
-
|
|
244
|
+
if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
|
|
245
|
+
lines << line
|
|
246
|
+
end
|
|
243
247
|
break if lines.length >= 6
|
|
244
248
|
end
|
|
245
249
|
|
|
@@ -61,12 +61,12 @@ module ScraperUtils
|
|
|
61
61
|
# Reset all configuration options to their default values
|
|
62
62
|
# @return [void]
|
|
63
63
|
def reset_defaults!
|
|
64
|
-
@default_timeout = ENV.fetch(
|
|
65
|
-
@default_disable_ssl_certificate_check = !ENV.fetch(
|
|
66
|
-
@default_australian_proxy = !ENV.fetch(
|
|
67
|
-
@default_user_agent = ENV.fetch(
|
|
68
|
-
@default_crawl_delay = ENV.fetch(
|
|
69
|
-
@default_max_load = ENV.fetch(
|
|
64
|
+
@default_timeout = ENV.fetch("MORPH_CLIENT_TIMEOUT", DEFAULT_TIMEOUT).to_i # 60
|
|
65
|
+
@default_disable_ssl_certificate_check = !ENV.fetch("MORPH_DISABLE_SSL_CHECK", nil).to_s.empty? # false
|
|
66
|
+
@default_australian_proxy = !ENV.fetch("MORPH_USE_PROXY", nil).to_s.empty? # false
|
|
67
|
+
@default_user_agent = ENV.fetch("MORPH_USER_AGENT", nil) # Uses Mechanize user agent
|
|
68
|
+
@default_crawl_delay = ENV.fetch("MORPH_CLIENT_CRAWL_DELAY", DEFAULT_CRAWL_DELAY)
|
|
69
|
+
@default_max_load = ENV.fetch("MORPH_MAX_LOAD", DEFAULT_MAX_LOAD)
|
|
70
70
|
end
|
|
71
71
|
end
|
|
72
72
|
|
|
@@ -113,10 +113,10 @@ module ScraperUtils
|
|
|
113
113
|
@australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
|
|
114
114
|
if @australian_proxy
|
|
115
115
|
uri = begin
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
116
|
+
URI.parse(ScraperUtils.australian_proxy.to_s)
|
|
117
|
+
rescue URI::InvalidURIError => e
|
|
118
|
+
raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
|
|
119
|
+
end
|
|
120
120
|
unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
|
121
121
|
raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
|
|
122
122
|
end
|
|
@@ -177,13 +177,13 @@ module ScraperUtils
|
|
|
177
177
|
end
|
|
178
178
|
|
|
179
179
|
def pre_connect_hook(_agent, request)
|
|
180
|
-
hostname = request.respond_to?(:
|
|
180
|
+
hostname = (request.respond_to?(:[]) && request["Host"]) || "unknown"
|
|
181
181
|
@throttler.before_request(hostname)
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
182
|
+
return unless DebugUtils.verbose?
|
|
183
|
+
|
|
184
|
+
ScraperUtils::LogUtils.log(
|
|
185
|
+
"Pre Connect request: #{request.inspect}"
|
|
186
|
+
)
|
|
187
187
|
end
|
|
188
188
|
|
|
189
189
|
def post_connect_hook(_agent, uri, response, _body)
|
|
@@ -191,7 +191,8 @@ module ScraperUtils
|
|
|
191
191
|
|
|
192
192
|
status = response.respond_to?(:code) ? response.code.to_i : nil
|
|
193
193
|
overloaded = [429, 500, 503].include?(status)
|
|
194
|
-
|
|
194
|
+
hostname = uri.host || "unknown"
|
|
195
|
+
@throttler.after_request(hostname, overloaded: overloaded)
|
|
195
196
|
|
|
196
197
|
if DebugUtils.basic?
|
|
197
198
|
ScraperUtils::LogUtils.log(
|
|
@@ -204,11 +205,7 @@ module ScraperUtils
|
|
|
204
205
|
def error_hook(_agent, error)
|
|
205
206
|
# Best-effort: record the error against whatever host we can find
|
|
206
207
|
# Mechanize errors often carry the URI in the message; fall back to 'unknown'
|
|
207
|
-
hostname = if error.respond_to?(:uri)
|
|
208
|
-
error.uri.host
|
|
209
|
-
else
|
|
210
|
-
'unknown'
|
|
211
|
-
end
|
|
208
|
+
hostname = (error.uri.host if error.respond_to?(:uri)) || "unknown"
|
|
212
209
|
@throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
|
|
213
210
|
end
|
|
214
211
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "host_throttler"
|
|
4
|
+
require_relative "mechanize_utils/agent_config"
|
|
4
5
|
|
|
5
6
|
module ScraperUtils
|
|
6
7
|
# Misc Standalone Utilities
|
|
@@ -8,6 +9,14 @@ module ScraperUtils
|
|
|
8
9
|
THROTTLE_HOSTNAME = "block"
|
|
9
10
|
|
|
10
11
|
class << self
|
|
12
|
+
attr_accessor :default_crawl_delay, :default_max_load
|
|
13
|
+
|
|
14
|
+
def reset_defaults!
|
|
15
|
+
@default_crawl_delay = MechanizeUtils::AgentConfig.default_crawl_delay
|
|
16
|
+
@default_max_load = MechanizeUtils::AgentConfig.default_max_load
|
|
17
|
+
reset_throttler!
|
|
18
|
+
end
|
|
19
|
+
|
|
11
20
|
# Throttle block to be nice to servers we are scraping.
|
|
12
21
|
# Time spent inside the block (parsing, saving) counts toward the delay.
|
|
13
22
|
def throttle_block
|
|
@@ -27,11 +36,22 @@ module ScraperUtils
|
|
|
27
36
|
@throttler = nil
|
|
28
37
|
end
|
|
29
38
|
|
|
39
|
+
def will_pause_till
|
|
40
|
+
throttler.will_pause_till(THROTTLE_HOSTNAME)
|
|
41
|
+
end
|
|
42
|
+
|
|
30
43
|
private
|
|
31
44
|
|
|
32
45
|
def throttler
|
|
33
|
-
@throttler ||= HostThrottler.new
|
|
46
|
+
@throttler ||= HostThrottler.new(
|
|
47
|
+
crawl_delay: default_crawl_delay,
|
|
48
|
+
max_load: default_max_load
|
|
49
|
+
)
|
|
34
50
|
end
|
|
35
51
|
end
|
|
52
|
+
|
|
53
|
+
# Initialise defaults after AgentConfig is loaded
|
|
54
|
+
require_relative "mechanize_utils/agent_config"
|
|
55
|
+
reset_defaults!
|
|
36
56
|
end
|
|
37
57
|
end
|
|
@@ -31,8 +31,6 @@ module ScraperUtils
|
|
|
31
31
|
errors.empty? ? nil : errors
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
private
|
|
35
|
-
|
|
36
34
|
def self.validate_presence(record, errors)
|
|
37
35
|
REQUIRED_FIELDS.each do |field|
|
|
38
36
|
errors << "#{field} can't be blank" if record[field].to_s.strip.empty?
|
|
@@ -47,10 +45,10 @@ module ScraperUtils
|
|
|
47
45
|
begin
|
|
48
46
|
uri = URI.parse(url)
|
|
49
47
|
unless uri.is_a?(URI::HTTP) && uri.host.to_s != ""
|
|
50
|
-
errors << "info_url must be a valid http
|
|
48
|
+
errors << "info_url must be a valid http/https URL with host"
|
|
51
49
|
end
|
|
52
50
|
rescue URI::InvalidURIError
|
|
53
|
-
errors << "info_url must be a valid http
|
|
51
|
+
errors << "info_url must be a valid http/https URL"
|
|
54
52
|
end
|
|
55
53
|
end
|
|
56
54
|
|
|
@@ -58,18 +56,22 @@ module ScraperUtils
|
|
|
58
56
|
today = Date.today
|
|
59
57
|
|
|
60
58
|
date_scraped = parse_date(record["date_scraped"])
|
|
61
|
-
|
|
59
|
+
if record["date_scraped"] && date_scraped.nil?
|
|
60
|
+
errors << "Invalid date format for date_scraped: #{record['date_scraped'].inspect} is not a valid ISO 8601 date"
|
|
61
|
+
end
|
|
62
62
|
|
|
63
63
|
date_received = parse_date(record["date_received"])
|
|
64
64
|
if record["date_received"] && date_received.nil?
|
|
65
|
-
errors << "Invalid date format for date_received: #{record[
|
|
65
|
+
errors << "Invalid date format for date_received: #{record['date_received'].inspect} is not a valid ISO 8601 date"
|
|
66
66
|
elsif date_received && date_received.to_date > today
|
|
67
|
-
errors << "Invalid date for date_received: #{record[
|
|
67
|
+
errors << "Invalid date for date_received: #{record['date_received'].inspect} is in the future"
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
%w[on_notice_from on_notice_to].each do |field|
|
|
71
71
|
val = parse_date(record[field])
|
|
72
|
-
|
|
72
|
+
if record[field] && val.nil?
|
|
73
|
+
errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date"
|
|
74
|
+
end
|
|
73
75
|
end
|
|
74
76
|
end
|
|
75
77
|
|
|
@@ -47,34 +47,43 @@ module ScraperUtils
|
|
|
47
47
|
|
|
48
48
|
PLANNING_KEYWORDS = [
|
|
49
49
|
# Building types
|
|
50
|
-
|
|
50
|
+
"dwelling", "house", "unit", "building", "structure", "facility",
|
|
51
51
|
# Modifications
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
"addition", "extension", "renovation", "alteration", "modification",
|
|
53
|
+
"replacement", "upgrade", "improvement",
|
|
54
54
|
# Specific structures
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
"carport", "garage", "shed", "pool", "deck", "patio", "pergola",
|
|
56
|
+
"verandah", "balcony", "fence", "wall", "driveway",
|
|
57
57
|
# Development types
|
|
58
|
-
|
|
58
|
+
"subdivision", "demolition", "construction", "development",
|
|
59
59
|
# Services/utilities
|
|
60
|
-
|
|
60
|
+
"signage", "telecommunications", "stormwater", "water", "sewer",
|
|
61
61
|
# Approvals/certificates
|
|
62
|
-
|
|
62
|
+
"certificate", "approval", "consent", "permit"
|
|
63
63
|
].freeze
|
|
64
64
|
|
|
65
|
+
def self.fetch_url_head(url)
|
|
66
|
+
agent = Mechanize.new
|
|
67
|
+
# FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
68
|
+
agent.head(url)
|
|
69
|
+
end
|
|
70
|
+
|
|
65
71
|
def self.fetch_url_with_redirects(url)
|
|
66
72
|
agent = Mechanize.new
|
|
67
|
-
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
73
|
+
# FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
68
74
|
agent.get(url)
|
|
69
75
|
end
|
|
70
76
|
|
|
71
|
-
def self.authority_label(results, prefix:
|
|
77
|
+
def self.authority_label(results, prefix: "", suffix: "")
|
|
72
78
|
return nil if results.nil?
|
|
73
79
|
|
|
74
|
-
authority_labels = results.map { |record| record[
|
|
80
|
+
authority_labels = results.map { |record| record["authority_label"] }.compact.uniq
|
|
75
81
|
return nil if authority_labels.empty?
|
|
76
82
|
|
|
77
|
-
|
|
83
|
+
if authority_labels.size > 1
|
|
84
|
+
raise "Expected one authority_label, not #{authority_labels.inspect}"
|
|
85
|
+
end
|
|
86
|
+
|
|
78
87
|
"#{prefix}#{authority_labels.first}#{suffix}"
|
|
79
88
|
end
|
|
80
89
|
|
|
@@ -88,7 +97,8 @@ module ScraperUtils
|
|
|
88
97
|
duplicates = groups.select { |_k, g| g.size > 1 }
|
|
89
98
|
return if duplicates.empty?
|
|
90
99
|
|
|
91
|
-
raise UnprocessableSite,
|
|
100
|
+
raise UnprocessableSite,
|
|
101
|
+
"Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
|
|
92
102
|
end
|
|
93
103
|
|
|
94
104
|
# Validates enough addresses are geocodable
|
|
@@ -98,19 +108,32 @@ module ScraperUtils
|
|
|
98
108
|
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
99
109
|
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
100
110
|
# @raise RuntimeError if insufficient addresses are geocodable
|
|
101
|
-
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
|
|
111
|
+
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
|
|
112
|
+
ignore_case: false, known_suburbs: [])
|
|
102
113
|
return nil if results.empty?
|
|
103
114
|
|
|
104
115
|
geocodable = results
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
116
|
+
.map { |record| record["address"] }
|
|
117
|
+
.uniq
|
|
118
|
+
.count do |text|
|
|
119
|
+
ok = ScraperUtils::SpecSupport.geocodable? text,
|
|
120
|
+
known_suburbs: known_suburbs, ignore_case: ignore_case
|
|
121
|
+
if !ok && DebugUtils.verbose?
|
|
122
|
+
ScraperUtils::LogUtils.log(
|
|
123
|
+
"Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
|
|
124
|
+
)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
ok
|
|
128
|
+
end
|
|
108
129
|
puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
|
|
109
|
-
|
|
110
|
-
expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
|
|
130
|
+
"(#{(100.0 * geocodable / results.count).round(1)}%)"
|
|
131
|
+
expected = [(((percentage.to_f / 100.0) * results.count) - variation), 1].max
|
|
111
132
|
unless geocodable >= expected
|
|
112
|
-
raise UnprocessableSite,
|
|
133
|
+
raise UnprocessableSite,
|
|
134
|
+
"Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
|
|
113
135
|
end
|
|
136
|
+
|
|
114
137
|
geocodable
|
|
115
138
|
end
|
|
116
139
|
|
|
@@ -122,10 +145,13 @@ module ScraperUtils
|
|
|
122
145
|
# @return [Boolean] True if the address appears to be geocodable.
|
|
123
146
|
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
|
|
124
147
|
return false if address.nil? || address.empty?
|
|
148
|
+
|
|
125
149
|
check_address = ignore_case ? address.upcase : address
|
|
126
150
|
|
|
127
151
|
# Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
|
|
128
|
-
has_state = AUSTRALIAN_STATES.any?
|
|
152
|
+
has_state = AUSTRALIAN_STATES.any? do |state|
|
|
153
|
+
check_address.end_with?(" #{state}") || check_address.include?(" #{state} ")
|
|
154
|
+
end
|
|
129
155
|
has_postcode = address.match?(AUSTRALIAN_POSTCODES)
|
|
130
156
|
|
|
131
157
|
# Using the pre-compiled patterns
|
|
@@ -138,9 +164,13 @@ module ScraperUtils
|
|
|
138
164
|
if ENV["DEBUG"]
|
|
139
165
|
missing = []
|
|
140
166
|
missing << "street type" unless has_street_type
|
|
141
|
-
|
|
167
|
+
unless has_postcode || has_uppercase_suburb || has_known_suburb
|
|
168
|
+
missing << "postcode/Uppercase suburb/Known suburb"
|
|
169
|
+
end
|
|
142
170
|
missing << "state" unless has_state
|
|
143
|
-
|
|
171
|
+
if missing.any?
|
|
172
|
+
puts " address: #{address} is not geocodable, missing #{missing.join(', ')}"
|
|
173
|
+
end
|
|
144
174
|
end
|
|
145
175
|
|
|
146
176
|
has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
|
|
@@ -167,17 +197,21 @@ module ScraperUtils
|
|
|
167
197
|
return nil if results.empty?
|
|
168
198
|
|
|
169
199
|
descriptions = results
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
200
|
+
.map { |record| record["description"] }
|
|
201
|
+
.uniq
|
|
202
|
+
.count do |text|
|
|
173
203
|
selected = ScraperUtils::SpecSupport.reasonable_description? text
|
|
174
204
|
puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
|
|
175
205
|
selected
|
|
176
206
|
end
|
|
177
207
|
puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
|
|
178
|
-
|
|
179
|
-
expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
|
|
180
|
-
|
|
208
|
+
"(#{(100.0 * descriptions / results.count).round(1)}%)"
|
|
209
|
+
expected = [((percentage.to_f / 100.0) * results.count) - variation, 1].max
|
|
210
|
+
unless descriptions >= expected
|
|
211
|
+
raise UnprocessableSite,
|
|
212
|
+
"Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}"
|
|
213
|
+
end
|
|
214
|
+
|
|
181
215
|
descriptions
|
|
182
216
|
end
|
|
183
217
|
|
|
@@ -200,7 +234,8 @@ module ScraperUtils
|
|
|
200
234
|
# @param bot_check_expected [Boolean] Whether bot protection is acceptable
|
|
201
235
|
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
202
236
|
# @raise RuntimeError if records don't use the expected URL or it doesn't return 200
|
|
203
|
-
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
|
|
237
|
+
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
|
|
238
|
+
&block)
|
|
204
239
|
info_urls = results.map { |record| record["info_url"] }.uniq
|
|
205
240
|
|
|
206
241
|
unless info_urls.size == 1
|
|
@@ -223,6 +258,22 @@ module ScraperUtils
|
|
|
223
258
|
end
|
|
224
259
|
end
|
|
225
260
|
|
|
261
|
+
# Validates that info_urls are present (respond to HEAD request with 200 to 299 status)
|
|
262
|
+
# @param results [Array<Hash>] The results from scraping an authority
|
|
263
|
+
# @param percentage [Integer] The min percentage of info_url checks expected to pass (default:75)
|
|
264
|
+
# @param variation [Integer] The variation allowed in addition to percentage (default:3)
|
|
265
|
+
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
266
|
+
# @raise RuntimeError if insufficient info_url checks pass
|
|
267
|
+
def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
|
|
268
|
+
if defined?(VCR)
|
|
269
|
+
VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
|
|
270
|
+
check_info_url_is_present(results, percentage, variation, &block)
|
|
271
|
+
end
|
|
272
|
+
else
|
|
273
|
+
check_info_url_is_present(results, percentage, variation, &block)
|
|
274
|
+
end
|
|
275
|
+
end
|
|
276
|
+
|
|
226
277
|
# Validates that info_urls have expected details (unique URLs with content validation)
|
|
227
278
|
# @param results [Array<Hash>] The results from scraping an authority
|
|
228
279
|
# @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
|
|
@@ -230,7 +281,8 @@ module ScraperUtils
|
|
|
230
281
|
# @param bot_check_expected [Boolean] Whether bot protection is acceptable
|
|
231
282
|
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
232
283
|
# @raise RuntimeError if insufficient detail checks pass
|
|
233
|
-
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
|
|
284
|
+
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
|
|
285
|
+
bot_check_expected: false, &block)
|
|
234
286
|
if defined?(VCR)
|
|
235
287
|
VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
|
|
236
288
|
check_info_url_details(results, percentage, variation, bot_check_expected, &block)
|
|
@@ -252,15 +304,15 @@ module ScraperUtils
|
|
|
252
304
|
|
|
253
305
|
# Check for common bot protection indicators
|
|
254
306
|
bot_indicators = [
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
307
|
+
"recaptcha",
|
|
308
|
+
"cloudflare",
|
|
309
|
+
"are you human",
|
|
310
|
+
"bot detection",
|
|
311
|
+
"security check",
|
|
312
|
+
"verify you are human",
|
|
313
|
+
"access denied",
|
|
314
|
+
"blocked",
|
|
315
|
+
"captcha"
|
|
264
316
|
]
|
|
265
317
|
|
|
266
318
|
bot_indicators.any? { |indicator| body_lower.include?(indicator) }
|
|
@@ -276,10 +328,51 @@ module ScraperUtils
|
|
|
276
328
|
return
|
|
277
329
|
end
|
|
278
330
|
|
|
279
|
-
|
|
331
|
+
return if page.code == "200"
|
|
332
|
+
|
|
333
|
+
raise "Expected 200 response from the one expected info_url, got #{page.code}"
|
|
280
334
|
end
|
|
281
335
|
|
|
282
|
-
|
|
336
|
+
def self.check_info_url_is_present(results, percentage, variation, &block)
|
|
337
|
+
count = 0
|
|
338
|
+
failed = 0
|
|
339
|
+
fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
|
|
340
|
+
|
|
341
|
+
fib_indices.each do |index|
|
|
342
|
+
record = results[index]
|
|
343
|
+
info_url = record["info_url"]
|
|
344
|
+
puts "Checking info_url[#{index}]: #{info_url} is present..."
|
|
345
|
+
|
|
346
|
+
begin
|
|
347
|
+
page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
|
|
348
|
+
status = page.code.to_i
|
|
349
|
+
rescue Mechanize::ResponseCodeError => e
|
|
350
|
+
status = e.response_code.to_i
|
|
351
|
+
end
|
|
352
|
+
|
|
353
|
+
if [403, 429].include?(status)
|
|
354
|
+
puts " Bot protection detected - skipping"
|
|
355
|
+
next
|
|
356
|
+
end
|
|
357
|
+
|
|
358
|
+
count += 1
|
|
359
|
+
if status.between?(200, 299)
|
|
360
|
+
puts " OK: #{status}" if ENV["DEBUG"]
|
|
361
|
+
else
|
|
362
|
+
failed += 1
|
|
363
|
+
puts " Failed: #{status}"
|
|
364
|
+
min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
|
|
365
|
+
passed = count - failed
|
|
366
|
+
if passed < min_required
|
|
367
|
+
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
|
|
368
|
+
end
|
|
369
|
+
end
|
|
370
|
+
end
|
|
371
|
+
|
|
372
|
+
return unless count > 0
|
|
373
|
+
|
|
374
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!"
|
|
375
|
+
end
|
|
283
376
|
|
|
284
377
|
def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
|
|
285
378
|
count = 0
|
|
@@ -298,7 +391,10 @@ module ScraperUtils
|
|
|
298
391
|
next
|
|
299
392
|
end
|
|
300
393
|
|
|
301
|
-
|
|
394
|
+
unless page.code == "200"
|
|
395
|
+
raise UnprocessableRecord,
|
|
396
|
+
"Expected 200 response, got #{page.code}"
|
|
397
|
+
end
|
|
302
398
|
|
|
303
399
|
page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
|
|
304
400
|
|
|
@@ -306,34 +402,40 @@ module ScraperUtils
|
|
|
306
402
|
count += 1
|
|
307
403
|
expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
|
|
308
404
|
expected2 = case attribute
|
|
309
|
-
when
|
|
310
|
-
expected.sub(/\ADA\s*-\s*/,
|
|
311
|
-
when
|
|
312
|
-
expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/,
|
|
405
|
+
when "council_reference"
|
|
406
|
+
expected.sub(/\ADA\s*-\s*/, "")
|
|
407
|
+
when "address"
|
|
408
|
+
expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, "") # Handle Lismore post-code/state swap
|
|
313
409
|
else
|
|
314
410
|
expected
|
|
315
411
|
end
|
|
316
412
|
expected3 = case attribute
|
|
317
|
-
when
|
|
318
|
-
expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/,
|
|
413
|
+
when "address"
|
|
414
|
+
expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, "")
|
|
319
415
|
else
|
|
320
416
|
expected
|
|
321
|
-
end.gsub(/\s*,\s*/,
|
|
322
|
-
next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/,
|
|
417
|
+
end.gsub(/\s*,\s*/, " ").gsub(/\s*-\s*/, "-")
|
|
418
|
+
next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, " ").gsub(
|
|
419
|
+
/\s*-\s*/, "-"
|
|
420
|
+
).include?(expected3)
|
|
323
421
|
|
|
324
422
|
failed += 1
|
|
325
|
-
desc2 = expected2 == expected ?
|
|
326
|
-
desc3 = expected3 == expected ?
|
|
423
|
+
desc2 = expected2 == expected ? "" : " or #{expected2.inspect}"
|
|
424
|
+
desc3 = expected3 == expected ? "" : " or #{expected3.inspect}"
|
|
327
425
|
puts " Missing: #{expected.inspect}#{desc2}#{desc3}"
|
|
328
|
-
puts " IN: #{page_body}" if ENV[
|
|
426
|
+
puts " IN: #{page_body}" if ENV["DEBUG"]
|
|
329
427
|
|
|
330
|
-
min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
|
|
428
|
+
min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
|
|
331
429
|
passed = count - failed
|
|
332
|
-
|
|
430
|
+
if passed < min_required
|
|
431
|
+
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
|
|
432
|
+
end
|
|
333
433
|
end
|
|
334
434
|
end
|
|
335
435
|
|
|
336
|
-
|
|
436
|
+
return unless count > 0
|
|
437
|
+
|
|
438
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!"
|
|
337
439
|
end
|
|
338
440
|
end
|
|
339
441
|
end
|
data/scraper_utils.gemspec
CHANGED
|
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
|
|
|
13
13
|
|
|
14
14
|
spec.summary = "planningalerts scraper utilities"
|
|
15
15
|
spec.description = "Utilities to help make planningalerts scrapers, " \
|
|
16
|
-
|
|
16
|
+
"especially multi authority scrapers, easier to develop, run and debug."
|
|
17
17
|
spec.homepage = "https://github.com/ianheggie-oaf/#{spec.name}"
|
|
18
18
|
spec.license = "MIT"
|
|
19
19
|
|
|
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
|
|
|
23
23
|
spec.metadata["homepage_uri"] = spec.homepage
|
|
24
24
|
spec.metadata["source_code_uri"] = spec.homepage
|
|
25
25
|
spec.metadata["documentation_uri"] = "https://rubydoc.info/gems/#{spec.name}/#{ScraperUtils::VERSION}"
|
|
26
|
-
spec.metadata["changelog_uri"] = "#{spec.metadata[
|
|
26
|
+
spec.metadata["changelog_uri"] = "#{spec.metadata['source_code_uri']}/blob/main/CHANGELOG.md"
|
|
27
27
|
else
|
|
28
28
|
raise "RubyGems 2.0 or newer is required to protect against " \
|
|
29
|
-
|
|
29
|
+
"public gem pushes."
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
# Specify which files should be added to the gem when it is released.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.14.1
|
|
4
|
+
version: 0.16.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -114,7 +114,7 @@ metadata:
|
|
|
114
114
|
allowed_push_host: https://rubygems.org
|
|
115
115
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
116
116
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
117
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.14.1
|
|
117
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.16.0
|
|
118
118
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
119
119
|
rubygems_mfa_required: 'true'
|
|
120
120
|
post_install_message:
|