scraper_utils 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/docs/example_parallel_scraper.rb +16 -17
- data/docs/example_scraper.rb +10 -13
- data/exe/validate_scraper_data +13 -8
- data/lib/scraper_utils/data_quality_monitor.rb +9 -4
- data/lib/scraper_utils/db_utils.rb +6 -2
- data/lib/scraper_utils/debug_utils.rb +1 -2
- data/lib/scraper_utils/host_throttler.rb +9 -13
- data/lib/scraper_utils/log_utils.rb +18 -14
- data/lib/scraper_utils/maths_utils.rb +2 -1
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +18 -20
- data/lib/scraper_utils/misc_utils.rb +17 -1
- data/lib/scraper_utils/pa_validation.rb +10 -8
- data/lib/scraper_utils/spec_support.rb +106 -73
- data/lib/scraper_utils/version.rb +1 -1
- data/scraper_utils.gemspec +3 -3
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9a1001f794ef04c587bb726157c66fc637fbb8525bac1c5be93a138e7f0a8266
|
|
4
|
+
data.tar.gz: f92023b5362c6b64ae74d0bf43cf613b02849687a46ec7fbb6b51c4b7ad397dc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 88e952e952d59011018ca4721bde72d49c913beccccf098d62bb4d1313d0ca3bf94678ff27db5ba4cef3a674fefbebd067a5008e5f36a2029f2a9c8ac1689b15
|
|
7
|
+
data.tar.gz: 35601498d9d110d5d365aa7c1fddcfa74a86fde4b93537b44f8e00bb84f664ba455c642256c0032e221b484d986ea39b2d3ab743c94102b10c7bed1c397139d5
|
data/CHANGELOG.md
CHANGED
|
data/docs/example_parallel_scraper.rb
CHANGED
|
@@ -27,21 +27,20 @@ class Scraper
|
|
|
27
27
|
begin
|
|
28
28
|
ScraperUtils::DataQualityMonitor.start_authority(authority_label)
|
|
29
29
|
YourScraper.scrape(authority_label) do |record|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
unprocessable_record_details << [e, record]
|
|
37
|
-
end
|
|
30
|
+
record["authority_label"] = authority_label.to_s
|
|
31
|
+
ScraperUtils::DbUtils.save_record(record)
|
|
32
|
+
rescue ScraperUtils::UnprocessableRecord => e
|
|
33
|
+
# Log bad record but continue processing unless too many have occurred
|
|
34
|
+
ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
|
|
35
|
+
unprocessable_record_details << [e, record]
|
|
38
36
|
end
|
|
39
37
|
rescue StandardError => e
|
|
40
38
|
warn "#{authority_label}: ERROR: #{e}"
|
|
41
39
|
warn e.backtrace
|
|
42
40
|
fatal_exception = e
|
|
43
41
|
end
|
|
44
|
-
[authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details,
|
|
42
|
+
[authority_label, ScraperUtils::DbUtils.collected_saves, unprocessable_record_details,
|
|
43
|
+
fatal_exception]
|
|
45
44
|
end
|
|
46
45
|
|
|
47
46
|
# Process authorities in parallel
|
|
@@ -54,7 +53,7 @@ class Scraper
|
|
|
54
53
|
scrape_authority(authority_label, attempt)
|
|
55
54
|
end.each do |authority_label, saves, unprocessable, fatal_exception|
|
|
56
55
|
# Runs in main process
|
|
57
|
-
status = fatal_exception ?
|
|
56
|
+
status = fatal_exception ? "FAILED" : "OK"
|
|
58
57
|
puts "Saving results of #{authority_label}: #{saves.size} records, #{unprocessable.size} unprocessable #{status}"
|
|
59
58
|
|
|
60
59
|
saves.each do |record|
|
|
@@ -65,11 +64,11 @@ class Scraper
|
|
|
65
64
|
exceptions[authority_label] = e
|
|
66
65
|
end
|
|
67
66
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
67
|
+
next unless fatal_exception
|
|
68
|
+
|
|
69
|
+
puts " Warning: #{authority_label} failed with: #{fatal_exception.message}"
|
|
70
|
+
puts " Saved #{saves.size} records before failure"
|
|
71
|
+
exceptions[authority_label] = fatal_exception
|
|
73
72
|
end
|
|
74
73
|
|
|
75
74
|
exceptions
|
|
@@ -96,7 +95,7 @@ class Scraper
|
|
|
96
95
|
unless exceptions.empty?
|
|
97
96
|
puts "\n***************************************************"
|
|
98
97
|
puts "Now retrying authorities which earlier had failures"
|
|
99
|
-
puts exceptions.keys.join(", ")
|
|
98
|
+
puts exceptions.keys.join(", ")
|
|
100
99
|
puts "***************************************************"
|
|
101
100
|
|
|
102
101
|
start_time = Time.now
|
|
@@ -118,7 +117,7 @@ end
|
|
|
118
117
|
if __FILE__ == $PROGRAM_NAME
|
|
119
118
|
ENV["MORPH_EXPECT_BAD"] ||= "some,councils"
|
|
120
119
|
|
|
121
|
-
process_count = (ENV[
|
|
120
|
+
process_count = (ENV["MORPH_PROCESSES"] || (Etc.nprocessors * 2)).to_i
|
|
122
121
|
|
|
123
122
|
Scraper.run(Scraper.selected_authorities, process_count: process_count)
|
|
124
123
|
end
|
data/docs/example_scraper.rb
CHANGED
|
@@ -22,13 +22,11 @@ class Scraper
|
|
|
22
22
|
# REPLACE section with:
|
|
23
23
|
ScraperUtils::DataQualityMonitor.start_authority(authority_label)
|
|
24
24
|
YourScraper.scrape(authority_label) do |record|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
exceptions[authority_label] = e
|
|
31
|
-
end
|
|
25
|
+
record["authority_label"] = authority_label.to_s
|
|
26
|
+
ScraperUtils::DbUtils.save_record(record)
|
|
27
|
+
rescue ScraperUtils::UnprocessableRecord => e
|
|
28
|
+
ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
|
|
29
|
+
exceptions[authority_label] = e
|
|
32
30
|
end
|
|
33
31
|
# END OF REPLACE
|
|
34
32
|
rescue StandardError => e
|
|
@@ -61,7 +59,7 @@ class Scraper
|
|
|
61
59
|
puts "Now retrying authorities which earlier had failures"
|
|
62
60
|
puts exceptions.keys.join(", ")
|
|
63
61
|
puts "***************************************************"
|
|
64
|
-
ENV[
|
|
62
|
+
ENV["DEBUG"] ||= "1"
|
|
65
63
|
|
|
66
64
|
start_time = Time.now
|
|
67
65
|
exceptions = scrape(exceptions.keys, 2)
|
|
@@ -85,12 +83,11 @@ if __FILE__ == $PROGRAM_NAME
|
|
|
85
83
|
# some: url-for-issue Summary Reason
|
|
86
84
|
# councils: url-for-issue Summary Reason
|
|
87
85
|
|
|
88
|
-
if ENV[
|
|
89
|
-
default_expect_bad = {
|
|
90
|
-
|
|
91
|
-
puts 'Default EXPECT_BAD:', default_expect_bad.to_yaml if default_expect_bad.any?
|
|
86
|
+
if ENV["MORPH_EXPECT_BAD"].nil?
|
|
87
|
+
default_expect_bad = {}
|
|
88
|
+
puts "Default EXPECT_BAD:", default_expect_bad.to_yaml if default_expect_bad.any?
|
|
92
89
|
|
|
93
|
-
ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(
|
|
90
|
+
ENV["MORPH_EXPECT_BAD"] = default_expect_bad.keys.join(",")
|
|
94
91
|
end
|
|
95
92
|
# If the sites have many unusable records - raise defaults
|
|
96
93
|
# ENV['MORPH_UNPROCESSABLE_BASE'] ||= "10"
|
data/exe/validate_scraper_data
CHANGED
|
@@ -26,7 +26,7 @@ if File.exist?(config_file)
|
|
|
26
26
|
config = YAML.safe_load(File.read(config_file), symbolize_names: true)
|
|
27
27
|
options.merge!(config) if config
|
|
28
28
|
puts "Loaded config from #{config_file}"
|
|
29
|
-
rescue => e
|
|
29
|
+
rescue StandardError => e
|
|
30
30
|
puts "Warning: Could not load #{config_file}: #{e.message}"
|
|
31
31
|
end
|
|
32
32
|
end
|
|
@@ -38,19 +38,23 @@ OptionParser.new do |opts|
|
|
|
38
38
|
options[:database] = db
|
|
39
39
|
end
|
|
40
40
|
|
|
41
|
-
opts.on("-g", "--geocodable-percentage N", Integer,
|
|
41
|
+
opts.on("-g", "--geocodable-percentage N", Integer,
|
|
42
|
+
"Min percentage of geocodable addresses (default: 50)") do |n|
|
|
42
43
|
options[:geocodable_percentage] = n
|
|
43
44
|
end
|
|
44
45
|
|
|
45
|
-
opts.on("-r", "--description-percentage N", Integer,
|
|
46
|
+
opts.on("-r", "--description-percentage N", Integer,
|
|
47
|
+
"Min percentage of reasonable descriptions (default: 50)") do |n|
|
|
46
48
|
options[:description_percentage] = n
|
|
47
49
|
end
|
|
48
50
|
|
|
49
|
-
opts.on("-u", "--info-url-percentage N", Integer,
|
|
51
|
+
opts.on("-u", "--info-url-percentage N", Integer,
|
|
52
|
+
"Min percentage for info URL validation (default: 75)") do |n|
|
|
50
53
|
options[:info_url_percentage] = n
|
|
51
54
|
end
|
|
52
55
|
|
|
53
|
-
opts.on("-v", "--variation N", Integer,
|
|
56
|
+
opts.on("-v", "--variation N", Integer,
|
|
57
|
+
"Variation tolerance for all validations (default: 3)") do |n|
|
|
54
58
|
options[:geocodable_variation] = n
|
|
55
59
|
options[:description_variation] = n
|
|
56
60
|
options[:info_url_variation] = n
|
|
@@ -60,11 +64,13 @@ OptionParser.new do |opts|
|
|
|
60
64
|
options[:bot_check_expected] = true
|
|
61
65
|
end
|
|
62
66
|
|
|
63
|
-
opts.on("-i", "--global-info-url URL",
|
|
67
|
+
opts.on("-i", "--global-info-url URL",
|
|
68
|
+
"Validate all records use this global info URL (auto-detected if all URLs are the same)") do |url|
|
|
64
69
|
options[:global_info_url] = url
|
|
65
70
|
end
|
|
66
71
|
|
|
67
|
-
opts.on("-c", "--config FILE",
|
|
72
|
+
opts.on("-c", "--config FILE",
|
|
73
|
+
"Load config from YAML file (default: .scraper_validation.yml)") do |file|
|
|
68
74
|
config_file = file
|
|
69
75
|
end
|
|
70
76
|
|
|
@@ -142,7 +148,6 @@ begin
|
|
|
142
148
|
|
|
143
149
|
puts
|
|
144
150
|
puts "✅ All validations passed!"
|
|
145
|
-
|
|
146
151
|
rescue RuntimeError => e
|
|
147
152
|
puts
|
|
148
153
|
puts "❌ Validation failed: #{e.message}"
|
|
data/lib/scraper_utils/data_quality_monitor.rb
CHANGED
|
@@ -30,8 +30,13 @@ module ScraperUtils
|
|
|
30
30
|
# Initial base of 5.01 (override using MORPH_UNPROCESSABLE_BASE)
|
|
31
31
|
# Initial percentage of 10% (override using MORPH_UNPROCESSABLE_PERCENTAGE)
|
|
32
32
|
def self.threshold(authority_label)
|
|
33
|
-
|
|
34
|
-
|
|
33
|
+
if @stats&.fetch(
|
|
34
|
+
authority_label, nil
|
|
35
|
+
)
|
|
36
|
+
ENV.fetch("MORPH_UNPROCESSABLE_BASE", 5.01).to_f +
|
|
37
|
+
(@stats[authority_label][:saved].to_i * ENV.fetch("MORPH_UNPROCESSABLE_PERCENTAGE",
|
|
38
|
+
10.0).to_f / 100.0)
|
|
39
|
+
end
|
|
35
40
|
end
|
|
36
41
|
|
|
37
42
|
# Logs an unprocessable record and raises an exception if error threshold is exceeded
|
|
@@ -44,7 +49,7 @@ module ScraperUtils
|
|
|
44
49
|
def self.log_unprocessable_record(exception, record)
|
|
45
50
|
authority_label = extract_authority(record)
|
|
46
51
|
@stats[authority_label][:unprocessed] += 1
|
|
47
|
-
details = if record&.key?(
|
|
52
|
+
details = if record&.key?("council_reference") && record&.key?("address")
|
|
48
53
|
"#{record['council_reference']} - #{record['address']}"
|
|
49
54
|
else
|
|
50
55
|
record.inspect
|
|
@@ -64,7 +69,7 @@ module ScraperUtils
|
|
|
64
69
|
def self.log_saved_record(record)
|
|
65
70
|
authority_label = extract_authority(record)
|
|
66
71
|
@stats[authority_label][:saved] += 1
|
|
67
|
-
ScraperUtils::LogUtils.log "Saving record #{
|
|
72
|
+
ScraperUtils::LogUtils.log "Saving record #{"for #{authority_label}: " unless authority_label&.empty?}#{record['council_reference']} - #{record['address']}"
|
|
68
73
|
end
|
|
69
74
|
end
|
|
70
75
|
end
|
|
data/lib/scraper_utils/db_utils.rb
CHANGED
|
@@ -63,12 +63,16 @@ module ScraperUtils
|
|
|
63
63
|
LogUtils.log "Deleting #{deleted_count} applications scraped between #{oldest_date} and #{cutoff_date}"
|
|
64
64
|
ScraperWiki.sqliteexecute("DELETE FROM data WHERE date_scraped < ?", [cutoff_date])
|
|
65
65
|
|
|
66
|
-
|
|
66
|
+
unless rand < 0.03 || (oldest_date && oldest_date < vacuum_cutoff_date) || ENV["VACUUM"] || force
|
|
67
|
+
return
|
|
68
|
+
end
|
|
67
69
|
|
|
68
70
|
LogUtils.log " Running VACUUM to reclaim space..."
|
|
69
71
|
ScraperWiki.sqliteexecute("VACUUM")
|
|
70
72
|
rescue SqliteMagic::NoSuchTable => e
|
|
71
|
-
|
|
73
|
+
if ScraperUtils::DebugUtils.trace?
|
|
74
|
+
ScraperUtils::LogUtils.log "Ignoring: #{e} whilst cleaning old records"
|
|
75
|
+
end
|
|
72
76
|
end
|
|
73
77
|
end
|
|
74
78
|
end
|
|
data/lib/scraper_utils/debug_utils.rb
CHANGED
|
@@ -18,7 +18,7 @@ module ScraperUtils
|
|
|
18
18
|
# Checks DEBUG and MORPH_DEBUG env variables
|
|
19
19
|
# @return [Integer] Debug level
|
|
20
20
|
def self.debug_level
|
|
21
|
-
debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR,
|
|
21
|
+
debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, "0"))
|
|
22
22
|
debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
|
|
23
23
|
end
|
|
24
24
|
|
|
@@ -48,7 +48,6 @@ module ScraperUtils
|
|
|
48
48
|
debug?(TRACE_LEVEL)
|
|
49
49
|
end
|
|
50
50
|
|
|
51
|
-
|
|
52
51
|
# Logs details of an HTTP request when debug mode is enabled
|
|
53
52
|
#
|
|
54
53
|
# @param http_method [String] HTTP http_method (GET, POST, etc.)
|
|
data/lib/scraper_utils/host_throttler.rb
CHANGED
|
@@ -21,7 +21,7 @@ module ScraperUtils
|
|
|
21
21
|
@crawl_delay = crawl_delay.to_f
|
|
22
22
|
# Clamp between 10 (delay 9x response) and 100 (no extra delay)
|
|
23
23
|
@max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
|
|
24
|
-
@next_request_at = {}
|
|
24
|
+
@next_request_at = {} # hostname => Time
|
|
25
25
|
@request_started_at = {} # hostname => Time
|
|
26
26
|
end
|
|
27
27
|
|
|
@@ -52,23 +52,19 @@ module ScraperUtils
|
|
|
52
52
|
response_time = Time.now - started
|
|
53
53
|
|
|
54
54
|
delay = @crawl_delay
|
|
55
|
-
if @max_load
|
|
56
|
-
delay += (100.0 - @max_load) * response_time / @max_load
|
|
57
|
-
end
|
|
55
|
+
delay += (100.0 - @max_load) * response_time / @max_load if @max_load
|
|
58
56
|
|
|
59
|
-
if overloaded
|
|
60
|
-
delay = delay + response_time * 2 + 5.0
|
|
61
|
-
end
|
|
57
|
+
delay = delay + (response_time * 2) + 5.0 if overloaded
|
|
62
58
|
|
|
63
59
|
delay = delay.round(3).clamp(0.0, MAX_DELAY)
|
|
64
60
|
@next_request_at[hostname] = Time.now + delay
|
|
65
61
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
62
|
+
return unless DebugUtils.basic?
|
|
63
|
+
|
|
64
|
+
msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
|
|
65
|
+
msg += " OVERLOADED" if overloaded
|
|
66
|
+
msg += ", Will delay #{delay}s before next request"
|
|
67
|
+
LogUtils.log(msg)
|
|
72
68
|
end
|
|
73
69
|
|
|
74
70
|
# Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
|
|
data/lib/scraper_utils/log_utils.rb
CHANGED
|
@@ -14,7 +14,7 @@ module ScraperUtils
|
|
|
14
14
|
# @param message [String] the message to log
|
|
15
15
|
# @return [void]
|
|
16
16
|
def self.log(message, authority = nil)
|
|
17
|
-
authority ||= ENV
|
|
17
|
+
authority ||= ENV.fetch("AUTHORITY", nil)
|
|
18
18
|
$stderr.flush
|
|
19
19
|
if authority
|
|
20
20
|
puts "[#{authority}] #{message}"
|
|
@@ -85,7 +85,7 @@ module ScraperUtils
|
|
|
85
85
|
failed
|
|
86
86
|
)
|
|
87
87
|
|
|
88
|
-
DbUtils
|
|
88
|
+
DbUtils.cleanup_old_records
|
|
89
89
|
end
|
|
90
90
|
|
|
91
91
|
# Extracts the first relevant line from backtrace that's from our project
|
|
@@ -104,15 +104,15 @@ module ScraperUtils
|
|
|
104
104
|
format = options[:format] || false
|
|
105
105
|
|
|
106
106
|
# Normalize the root directory path with a trailing slash
|
|
107
|
-
pwd = File.join(pwd,
|
|
107
|
+
pwd = File.join(pwd, "")
|
|
108
108
|
|
|
109
109
|
backtrace.each do |line|
|
|
110
|
-
next if line.include?(
|
|
111
|
-
line.include?(
|
|
112
|
-
line.include?(
|
|
110
|
+
next if line.include?("/gems/") ||
|
|
111
|
+
line.include?("/vendor/") ||
|
|
112
|
+
line.include?("/ruby/")
|
|
113
113
|
|
|
114
114
|
if line.start_with?(pwd)
|
|
115
|
-
relative_path = line.sub(pwd,
|
|
115
|
+
relative_path = line.sub(pwd, "")
|
|
116
116
|
return format ? " [#{relative_path}]" : relative_path
|
|
117
117
|
end
|
|
118
118
|
end
|
|
@@ -138,7 +138,7 @@ module ScraperUtils
|
|
|
138
138
|
puts "\nScraping Summary:"
|
|
139
139
|
summary_format = "%-20s %6s %6s %s"
|
|
140
140
|
|
|
141
|
-
puts format(summary_format,
|
|
141
|
+
puts format(summary_format, "Authority", "OK", "Bad", "Exception")
|
|
142
142
|
puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)
|
|
143
143
|
|
|
144
144
|
authorities.each do |authority|
|
|
@@ -149,7 +149,8 @@ module ScraperUtils
|
|
|
149
149
|
|
|
150
150
|
expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
|
|
151
151
|
exception_msg = if exceptions[authority]
|
|
152
|
-
location =
|
|
152
|
+
location = project_backtrace_line(exceptions[authority].backtrace,
|
|
153
|
+
format: true)
|
|
153
154
|
"#{exceptions[authority].class} - #{exceptions[authority]}#{location}"
|
|
154
155
|
else
|
|
155
156
|
"-"
|
|
@@ -174,12 +175,12 @@ module ScraperUtils
|
|
|
174
175
|
|
|
175
176
|
# Check for authorities with unexpected errors
|
|
176
177
|
unexpected_errors = authorities
|
|
177
|
-
|
|
178
|
-
|
|
178
|
+
.select { |authority| exceptions[authority] }
|
|
179
|
+
.reject { |authority| expect_bad.include?(authority) }
|
|
179
180
|
|
|
180
181
|
if unexpected_errors.any?
|
|
181
182
|
errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
|
|
182
|
-
|
|
183
|
+
"(Add to MORPH_EXPECT_BAD?)"
|
|
183
184
|
unexpected_errors.each do |authority|
|
|
184
185
|
error = exceptions[authority]
|
|
185
186
|
errors << " #{authority}: #{error.class} - #{error}"
|
|
@@ -228,7 +229,8 @@ module ScraperUtils
|
|
|
228
229
|
# Moved to DbUtils
|
|
229
230
|
# :nocov:
|
|
230
231
|
def self.cleanup_old_records(force: false)
|
|
231
|
-
warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.",
|
|
232
|
+
warn "`#{self.class}##{__method__}` is deprecated and will be removed in a future release, use `ScraperUtils::DbUtils.cleanup_old_records` instead.",
|
|
233
|
+
category: :deprecated
|
|
232
234
|
ScraperUtils::DbUtils.cleanup_old_records(force: force)
|
|
233
235
|
end
|
|
234
236
|
# :nocov:
|
|
@@ -239,7 +241,9 @@ module ScraperUtils
|
|
|
239
241
|
|
|
240
242
|
lines = []
|
|
241
243
|
error.backtrace.each do |line|
|
|
242
|
-
|
|
244
|
+
if lines.length < 2 || !(line.include?("/vendor/") || line.include?("/gems/") || line.include?("/ruby/"))
|
|
245
|
+
lines << line
|
|
246
|
+
end
|
|
243
247
|
break if lines.length >= 6
|
|
244
248
|
end
|
|
245
249
|
|
|
data/lib/scraper_utils/mechanize_utils/agent_config.rb
CHANGED
|
@@ -61,12 +61,12 @@ module ScraperUtils
|
|
|
61
61
|
# Reset all configuration options to their default values
|
|
62
62
|
# @return [void]
|
|
63
63
|
def reset_defaults!
|
|
64
|
-
@default_timeout = ENV.fetch(
|
|
65
|
-
@default_disable_ssl_certificate_check = !ENV.fetch(
|
|
66
|
-
@default_australian_proxy = !ENV.fetch(
|
|
67
|
-
@default_user_agent = ENV.fetch(
|
|
68
|
-
@default_crawl_delay = ENV.fetch(
|
|
69
|
-
@default_max_load = ENV.fetch(
|
|
64
|
+
@default_timeout = ENV.fetch("MORPH_CLIENT_TIMEOUT", DEFAULT_TIMEOUT).to_i # 60
|
|
65
|
+
@default_disable_ssl_certificate_check = !ENV.fetch("MORPH_DISABLE_SSL_CHECK", nil).to_s.empty? # false
|
|
66
|
+
@default_australian_proxy = !ENV.fetch("MORPH_USE_PROXY", nil).to_s.empty? # false
|
|
67
|
+
@default_user_agent = ENV.fetch("MORPH_USER_AGENT", nil) # Uses Mechanize user agent
|
|
68
|
+
@default_crawl_delay = ENV.fetch("MORPH_CLIENT_CRAWL_DELAY", DEFAULT_CRAWL_DELAY)
|
|
69
|
+
@default_max_load = ENV.fetch("MORPH_MAX_LOAD", DEFAULT_MAX_LOAD)
|
|
70
70
|
end
|
|
71
71
|
end
|
|
72
72
|
|
|
@@ -113,10 +113,10 @@ module ScraperUtils
|
|
|
113
113
|
@australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
|
|
114
114
|
if @australian_proxy
|
|
115
115
|
uri = begin
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
116
|
+
URI.parse(ScraperUtils.australian_proxy.to_s)
|
|
117
|
+
rescue URI::InvalidURIError => e
|
|
118
|
+
raise URI::InvalidURIError, "Invalid proxy URL format: #{e}"
|
|
119
|
+
end
|
|
120
120
|
unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
|
|
121
121
|
raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
|
|
122
122
|
end
|
|
@@ -177,13 +177,13 @@ module ScraperUtils
|
|
|
177
177
|
end
|
|
178
178
|
|
|
179
179
|
def pre_connect_hook(_agent, request)
|
|
180
|
-
hostname = (request.respond_to?(:[]) && request[
|
|
180
|
+
hostname = (request.respond_to?(:[]) && request["Host"]) || "unknown"
|
|
181
181
|
@throttler.before_request(hostname)
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
182
|
+
return unless DebugUtils.verbose?
|
|
183
|
+
|
|
184
|
+
ScraperUtils::LogUtils.log(
|
|
185
|
+
"Pre Connect request: #{request.inspect}"
|
|
186
|
+
)
|
|
187
187
|
end
|
|
188
188
|
|
|
189
189
|
def post_connect_hook(_agent, uri, response, _body)
|
|
@@ -191,7 +191,7 @@ module ScraperUtils
|
|
|
191
191
|
|
|
192
192
|
status = response.respond_to?(:code) ? response.code.to_i : nil
|
|
193
193
|
overloaded = [429, 500, 503].include?(status)
|
|
194
|
-
hostname = uri.host ||
|
|
194
|
+
hostname = uri.host || "unknown"
|
|
195
195
|
@throttler.after_request(hostname, overloaded: overloaded)
|
|
196
196
|
|
|
197
197
|
if DebugUtils.basic?
|
|
@@ -205,9 +205,7 @@ module ScraperUtils
|
|
|
205
205
|
def error_hook(_agent, error)
|
|
206
206
|
# Best-effort: record the error against whatever host we can find
|
|
207
207
|
# Mechanize errors often carry the URI in the message; fall back to 'unknown'
|
|
208
|
-
hostname = if error.respond_to?(:uri)
|
|
209
|
-
error.uri.host
|
|
210
|
-
end || 'unknown'
|
|
208
|
+
hostname = (error.uri.host if error.respond_to?(:uri)) || "unknown"
|
|
211
209
|
@throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
|
|
212
210
|
end
|
|
213
211
|
|
|
data/lib/scraper_utils/misc_utils.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative "host_throttler"
|
|
4
|
+
require_relative "mechanize_utils/agent_config"
|
|
4
5
|
|
|
5
6
|
module ScraperUtils
|
|
6
7
|
# Misc Standalone Utilities
|
|
@@ -8,6 +9,14 @@ module ScraperUtils
|
|
|
8
9
|
THROTTLE_HOSTNAME = "block"
|
|
9
10
|
|
|
10
11
|
class << self
|
|
12
|
+
attr_accessor :default_crawl_delay, :default_max_load
|
|
13
|
+
|
|
14
|
+
def reset_defaults!
|
|
15
|
+
@default_crawl_delay = MechanizeUtils::AgentConfig.default_crawl_delay
|
|
16
|
+
@default_max_load = MechanizeUtils::AgentConfig.default_max_load
|
|
17
|
+
reset_throttler!
|
|
18
|
+
end
|
|
19
|
+
|
|
11
20
|
# Throttle block to be nice to servers we are scraping.
|
|
12
21
|
# Time spent inside the block (parsing, saving) counts toward the delay.
|
|
13
22
|
def throttle_block
|
|
@@ -34,8 +43,15 @@ module ScraperUtils
|
|
|
34
43
|
private
|
|
35
44
|
|
|
36
45
|
def throttler
|
|
37
|
-
@throttler ||= HostThrottler.new
|
|
46
|
+
@throttler ||= HostThrottler.new(
|
|
47
|
+
crawl_delay: default_crawl_delay,
|
|
48
|
+
max_load: default_max_load
|
|
49
|
+
)
|
|
38
50
|
end
|
|
39
51
|
end
|
|
52
|
+
|
|
53
|
+
# Initialise defaults after AgentConfig is loaded
|
|
54
|
+
require_relative "mechanize_utils/agent_config"
|
|
55
|
+
reset_defaults!
|
|
40
56
|
end
|
|
41
57
|
end
|
|
data/lib/scraper_utils/pa_validation.rb
CHANGED
|
@@ -31,8 +31,6 @@ module ScraperUtils
|
|
|
31
31
|
errors.empty? ? nil : errors
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
private
|
|
35
|
-
|
|
36
34
|
def self.validate_presence(record, errors)
|
|
37
35
|
REQUIRED_FIELDS.each do |field|
|
|
38
36
|
errors << "#{field} can't be blank" if record[field].to_s.strip.empty?
|
|
@@ -47,10 +45,10 @@ module ScraperUtils
|
|
|
47
45
|
begin
|
|
48
46
|
uri = URI.parse(url)
|
|
49
47
|
unless uri.is_a?(URI::HTTP) && uri.host.to_s != ""
|
|
50
|
-
errors << "info_url must be a valid http
|
|
48
|
+
errors << "info_url must be a valid http/https URL with host"
|
|
51
49
|
end
|
|
52
50
|
rescue URI::InvalidURIError
|
|
53
|
-
errors << "info_url must be a valid http
|
|
51
|
+
errors << "info_url must be a valid http/https URL"
|
|
54
52
|
end
|
|
55
53
|
end
|
|
56
54
|
|
|
@@ -58,18 +56,22 @@ module ScraperUtils
|
|
|
58
56
|
today = Date.today
|
|
59
57
|
|
|
60
58
|
date_scraped = parse_date(record["date_scraped"])
|
|
61
|
-
|
|
59
|
+
if record["date_scraped"] && date_scraped.nil?
|
|
60
|
+
errors << "Invalid date format for date_scraped: #{record['date_scraped'].inspect} is not a valid ISO 8601 date"
|
|
61
|
+
end
|
|
62
62
|
|
|
63
63
|
date_received = parse_date(record["date_received"])
|
|
64
64
|
if record["date_received"] && date_received.nil?
|
|
65
|
-
errors << "Invalid date format for date_received: #{record[
|
|
65
|
+
errors << "Invalid date format for date_received: #{record['date_received'].inspect} is not a valid ISO 8601 date"
|
|
66
66
|
elsif date_received && date_received.to_date > today
|
|
67
|
-
errors << "Invalid date for date_received: #{record[
|
|
67
|
+
errors << "Invalid date for date_received: #{record['date_received'].inspect} is in the future"
|
|
68
68
|
end
|
|
69
69
|
|
|
70
70
|
%w[on_notice_from on_notice_to].each do |field|
|
|
71
71
|
val = parse_date(record[field])
|
|
72
|
-
|
|
72
|
+
if record[field] && val.nil?
|
|
73
|
+
errors << "Invalid date format for #{field}: #{record[field].inspect} is not a valid ISO 8601 date"
|
|
74
|
+
end
|
|
73
75
|
end
|
|
74
76
|
end
|
|
75
77
|
|
|
data/lib/scraper_utils/spec_support.rb
CHANGED
|
@@ -47,41 +47,43 @@ module ScraperUtils
|
|
|
47
47
|
|
|
48
48
|
PLANNING_KEYWORDS = [
|
|
49
49
|
# Building types
|
|
50
|
-
|
|
50
|
+
"dwelling", "house", "unit", "building", "structure", "facility",
|
|
51
51
|
# Modifications
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
"addition", "extension", "renovation", "alteration", "modification",
|
|
53
|
+
"replacement", "upgrade", "improvement",
|
|
54
54
|
# Specific structures
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
"carport", "garage", "shed", "pool", "deck", "patio", "pergola",
|
|
56
|
+
"verandah", "balcony", "fence", "wall", "driveway",
|
|
57
57
|
# Development types
|
|
58
|
-
|
|
58
|
+
"subdivision", "demolition", "construction", "development",
|
|
59
59
|
# Services/utilities
|
|
60
|
-
|
|
60
|
+
"signage", "telecommunications", "stormwater", "water", "sewer",
|
|
61
61
|
# Approvals/certificates
|
|
62
|
-
|
|
62
|
+
"certificate", "approval", "consent", "permit"
|
|
63
63
|
].freeze
|
|
64
64
|
|
|
65
|
-
|
|
66
65
|
def self.fetch_url_head(url)
|
|
67
66
|
agent = Mechanize.new
|
|
68
|
-
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
67
|
+
# FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
69
68
|
agent.head(url)
|
|
70
69
|
end
|
|
71
70
|
|
|
72
71
|
def self.fetch_url_with_redirects(url)
|
|
73
72
|
agent = Mechanize.new
|
|
74
|
-
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
73
|
+
# FIXME: - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
75
74
|
agent.get(url)
|
|
76
75
|
end
|
|
77
76
|
|
|
78
|
-
def self.authority_label(results, prefix:
|
|
77
|
+
def self.authority_label(results, prefix: "", suffix: "")
|
|
79
78
|
return nil if results.nil?
|
|
80
79
|
|
|
81
|
-
authority_labels = results.map { |record| record[
|
|
80
|
+
authority_labels = results.map { |record| record["authority_label"] }.compact.uniq
|
|
82
81
|
return nil if authority_labels.empty?
|
|
83
82
|
|
|
84
|
-
|
|
83
|
+
if authority_labels.size > 1
|
|
84
|
+
raise "Expected one authority_label, not #{authority_labels.inspect}"
|
|
85
|
+
end
|
|
86
|
+
|
|
85
87
|
"#{prefix}#{authority_labels.first}#{suffix}"
|
|
86
88
|
end
|
|
87
89
|
|
|
@@ -95,7 +97,8 @@ module ScraperUtils
|
|
|
95
97
|
duplicates = groups.select { |_k, g| g.size > 1 }
|
|
96
98
|
return if duplicates.empty?
|
|
97
99
|
|
|
98
|
-
raise UnprocessableSite,
|
|
100
|
+
raise UnprocessableSite,
|
|
101
|
+
"Duplicate authority labels: #{duplicates.keys.map(&:inspect).join(', ')}"
|
|
99
102
|
end
|
|
100
103
|
|
|
101
104
|
# Validates enough addresses are geocodable
|
|
@@ -105,28 +108,32 @@ module ScraperUtils
|
|
|
105
108
|
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
106
109
|
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
107
110
|
# @raise RuntimeError if insufficient addresses are geocodable
|
|
108
|
-
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
|
|
111
|
+
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3,
|
|
112
|
+
ignore_case: false, known_suburbs: [])
|
|
109
113
|
return nil if results.empty?
|
|
110
114
|
|
|
111
115
|
geocodable = results
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
116
|
+
.map { |record| record["address"] }
|
|
117
|
+
.uniq
|
|
118
|
+
.count do |text|
|
|
119
|
+
ok = ScraperUtils::SpecSupport.geocodable? text,
|
|
120
|
+
known_suburbs: known_suburbs, ignore_case: ignore_case
|
|
121
|
+
if !ok && DebugUtils.verbose?
|
|
122
|
+
ScraperUtils::LogUtils.log(
|
|
123
|
+
"Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
|
|
124
|
+
)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
ok
|
|
128
|
+
end
|
|
124
129
|
puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
|
|
125
|
-
|
|
126
|
-
expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
|
|
130
|
+
"(#{(100.0 * geocodable / results.count).round(1)}%)"
|
|
131
|
+
expected = [(((percentage.to_f / 100.0) * results.count) - variation), 1].max
|
|
127
132
|
unless geocodable >= expected
|
|
128
|
-
raise UnprocessableSite,
|
|
133
|
+
raise UnprocessableSite,
|
|
134
|
+
"Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}"
|
|
129
135
|
end
|
|
136
|
+
|
|
130
137
|
geocodable
|
|
131
138
|
end
|
|
132
139
|
|
|
@@ -138,10 +145,13 @@ module ScraperUtils
|
|
|
138
145
|
# @return [Boolean] True if the address appears to be geocodable.
|
|
139
146
|
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
|
|
140
147
|
return false if address.nil? || address.empty?
|
|
148
|
+
|
|
141
149
|
check_address = ignore_case ? address.upcase : address
|
|
142
150
|
|
|
143
151
|
# Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
|
|
144
|
-
has_state = AUSTRALIAN_STATES.any?
|
|
152
|
+
has_state = AUSTRALIAN_STATES.any? do |state|
|
|
153
|
+
check_address.end_with?(" #{state}") || check_address.include?(" #{state} ")
|
|
154
|
+
end
|
|
145
155
|
has_postcode = address.match?(AUSTRALIAN_POSTCODES)
|
|
146
156
|
|
|
147
157
|
# Using the pre-compiled patterns
|
|
@@ -154,9 +164,13 @@ module ScraperUtils
|
|
|
154
164
|
if ENV["DEBUG"]
|
|
155
165
|
missing = []
|
|
156
166
|
missing << "street type" unless has_street_type
|
|
157
|
-
|
|
167
|
+
unless has_postcode || has_uppercase_suburb || has_known_suburb
|
|
168
|
+
missing << "postcode/Uppercase suburb/Known suburb"
|
|
169
|
+
end
|
|
158
170
|
missing << "state" unless has_state
|
|
159
|
-
|
|
171
|
+
if missing.any?
|
|
172
|
+
puts " address: #{address} is not geocodable, missing #{missing.join(', ')}"
|
|
173
|
+
end
|
|
160
174
|
end
|
|
161
175
|
|
|
162
176
|
has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
|
|
@@ -183,17 +197,21 @@ module ScraperUtils
|
|
|
183
197
|
return nil if results.empty?
|
|
184
198
|
|
|
185
199
|
descriptions = results
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
200
|
+
.map { |record| record["description"] }
|
|
201
|
+
.uniq
|
|
202
|
+
.count do |text|
|
|
189
203
|
selected = ScraperUtils::SpecSupport.reasonable_description? text
|
|
190
204
|
puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
|
|
191
205
|
selected
|
|
192
206
|
end
|
|
193
207
|
puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
|
|
194
|
-
|
|
195
|
-
expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
|
|
196
|
-
|
|
208
|
+
"(#{(100.0 * descriptions / results.count).round(1)}%)"
|
|
209
|
+
expected = [((percentage.to_f / 100.0) * results.count) - variation, 1].max
|
|
210
|
+
unless descriptions >= expected
|
|
211
|
+
raise UnprocessableSite,
|
|
212
|
+
"Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}"
|
|
213
|
+
end
|
|
214
|
+
|
|
197
215
|
descriptions
|
|
198
216
|
end
|
|
199
217
|
|
|
@@ -216,7 +234,8 @@ module ScraperUtils
|
|
|
216
234
|
# @param bot_check_expected [Boolean] Whether bot protection is acceptable
|
|
217
235
|
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
218
236
|
# @raise RuntimeError if records don't use the expected URL or it doesn't return 200
|
|
219
|
-
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
|
|
237
|
+
def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false,
|
|
238
|
+
&block)
|
|
220
239
|
info_urls = results.map { |record| record["info_url"] }.uniq
|
|
221
240
|
|
|
222
241
|
unless info_urls.size == 1
|
|
@@ -262,7 +281,8 @@ module ScraperUtils
|
|
|
262
281
|
# @param bot_check_expected [Boolean] Whether bot protection is acceptable
|
|
263
282
|
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
264
283
|
# @raise RuntimeError if insufficient detail checks pass
|
|
265
|
-
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
|
|
284
|
+
def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3,
|
|
285
|
+
bot_check_expected: false, &block)
|
|
266
286
|
if defined?(VCR)
|
|
267
287
|
VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
|
|
268
288
|
check_info_url_details(results, percentage, variation, bot_check_expected, &block)
|
|
@@ -284,15 +304,15 @@ module ScraperUtils
|
|
|
284
304
|
|
|
285
305
|
# Check for common bot protection indicators
|
|
286
306
|
bot_indicators = [
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
307
|
+
"recaptcha",
|
|
308
|
+
"cloudflare",
|
|
309
|
+
"are you human",
|
|
310
|
+
"bot detection",
|
|
311
|
+
"security check",
|
|
312
|
+
"verify you are human",
|
|
313
|
+
"access denied",
|
|
314
|
+
"blocked",
|
|
315
|
+
"captcha"
|
|
296
316
|
]
|
|
297
317
|
|
|
298
318
|
bot_indicators.any? { |indicator| body_lower.include?(indicator) }
|
|
@@ -308,10 +328,10 @@ module ScraperUtils
|
|
|
308
328
|
return
|
|
309
329
|
end
|
|
310
330
|
|
|
311
|
-
|
|
312
|
-
end
|
|
331
|
+
return if page.code == "200"
|
|
313
332
|
|
|
314
|
-
|
|
333
|
+
raise "Expected 200 response from the one expected info_url, got #{page.code}"
|
|
334
|
+
end
|
|
315
335
|
|
|
316
336
|
def self.check_info_url_is_present(results, percentage, variation, &block)
|
|
317
337
|
count = 0
|
|
@@ -337,17 +357,21 @@ module ScraperUtils
|
|
|
337
357
|
|
|
338
358
|
count += 1
|
|
339
359
|
if status.between?(200, 299)
|
|
340
|
-
puts " OK: #{status}" if ENV[
|
|
360
|
+
puts " OK: #{status}" if ENV["DEBUG"]
|
|
341
361
|
else
|
|
342
362
|
failed += 1
|
|
343
363
|
puts " Failed: #{status}"
|
|
344
|
-
min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
|
|
364
|
+
min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
|
|
345
365
|
passed = count - failed
|
|
346
|
-
|
|
366
|
+
if passed < min_required
|
|
367
|
+
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
|
|
368
|
+
end
|
|
347
369
|
end
|
|
348
370
|
end
|
|
349
371
|
|
|
350
|
-
|
|
372
|
+
return unless count > 0
|
|
373
|
+
|
|
374
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!"
|
|
351
375
|
end
|
|
352
376
|
|
|
353
377
|
def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
|
|
@@ -367,7 +391,10 @@ module ScraperUtils
|
|
|
367
391
|
next
|
|
368
392
|
end
|
|
369
393
|
|
|
370
|
-
|
|
394
|
+
unless page.code == "200"
|
|
395
|
+
raise UnprocessableRecord,
|
|
396
|
+
"Expected 200 response, got #{page.code}"
|
|
397
|
+
end
|
|
371
398
|
|
|
372
399
|
page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
|
|
373
400
|
|
|
@@ -375,34 +402,40 @@ module ScraperUtils
|
|
|
375
402
|
count += 1
|
|
376
403
|
expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
|
|
377
404
|
expected2 = case attribute
|
|
378
|
-
when
|
|
379
|
-
expected.sub(/\ADA\s*-\s*/,
|
|
380
|
-
when
|
|
381
|
-
expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/,
|
|
405
|
+
when "council_reference"
|
|
406
|
+
expected.sub(/\ADA\s*-\s*/, "")
|
|
407
|
+
when "address"
|
|
408
|
+
expected.sub(/(\S+)\s+(\S+)\z/, '\2 \1').sub(/,\s*\z/, "") # Handle Lismore post-code/state swap
|
|
382
409
|
else
|
|
383
410
|
expected
|
|
384
411
|
end
|
|
385
412
|
expected3 = case attribute
|
|
386
|
-
when
|
|
387
|
-
expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/,
|
|
413
|
+
when "address"
|
|
414
|
+
expected.sub(/\s*,?\s+(VIC|NSW|QLD|SA|TAS|WA|ACT|NT)\z/, "")
|
|
388
415
|
else
|
|
389
416
|
expected
|
|
390
|
-
end.gsub(/\s*,\s*/,
|
|
391
|
-
next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/,
|
|
417
|
+
end.gsub(/\s*,\s*/, " ").gsub(/\s*-\s*/, "-")
|
|
418
|
+
next if page_body.include?(expected) || page_body.include?(expected2) || page_body.gsub(/\s*,\s*/, " ").gsub(
|
|
419
|
+
/\s*-\s*/, "-"
|
|
420
|
+
).include?(expected3)
|
|
392
421
|
|
|
393
422
|
failed += 1
|
|
394
|
-
desc2 = expected2 == expected ?
|
|
395
|
-
desc3 = expected3 == expected ?
|
|
423
|
+
desc2 = expected2 == expected ? "" : " or #{expected2.inspect}"
|
|
424
|
+
desc3 = expected3 == expected ? "" : " or #{expected3.inspect}"
|
|
396
425
|
puts " Missing: #{expected.inspect}#{desc2}#{desc3}"
|
|
397
|
-
puts " IN: #{page_body}" if ENV[
|
|
426
|
+
puts " IN: #{page_body}" if ENV["DEBUG"]
|
|
398
427
|
|
|
399
|
-
min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
|
|
428
|
+
min_required = (((percentage.to_f / 100.0) * count) - variation).round(0)
|
|
400
429
|
passed = count - failed
|
|
401
|
-
|
|
430
|
+
if passed < min_required
|
|
431
|
+
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})"
|
|
432
|
+
end
|
|
402
433
|
end
|
|
403
434
|
end
|
|
404
435
|
|
|
405
|
-
|
|
436
|
+
return unless count > 0
|
|
437
|
+
|
|
438
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!"
|
|
406
439
|
end
|
|
407
440
|
end
|
|
408
441
|
end
|
data/scraper_utils.gemspec
CHANGED
|
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
|
|
|
13
13
|
|
|
14
14
|
spec.summary = "planningalerts scraper utilities"
|
|
15
15
|
spec.description = "Utilities to help make planningalerts scrapers, " \
|
|
16
|
-
|
|
16
|
+
"especially multi authority scrapers, easier to develop, run and debug."
|
|
17
17
|
spec.homepage = "https://github.com/ianheggie-oaf/#{spec.name}"
|
|
18
18
|
spec.license = "MIT"
|
|
19
19
|
|
|
@@ -23,10 +23,10 @@ Gem::Specification.new do |spec|
|
|
|
23
23
|
spec.metadata["homepage_uri"] = spec.homepage
|
|
24
24
|
spec.metadata["source_code_uri"] = spec.homepage
|
|
25
25
|
spec.metadata["documentation_uri"] = "https://rubydoc.info/gems/#{spec.name}/#{ScraperUtils::VERSION}"
|
|
26
|
-
spec.metadata["changelog_uri"] = "#{spec.metadata[
|
|
26
|
+
spec.metadata["changelog_uri"] = "#{spec.metadata['source_code_uri']}/blob/main/CHANGELOG.md"
|
|
27
27
|
else
|
|
28
28
|
raise "RubyGems 2.0 or newer is required to protect against " \
|
|
29
|
-
|
|
29
|
+
"public gem pushes."
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
# Specify which files should be added to the gem when it is released.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.16.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -114,7 +114,7 @@ metadata:
|
|
|
114
114
|
allowed_push_host: https://rubygems.org
|
|
115
115
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
116
116
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
117
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
|
|
117
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.16.0
|
|
118
118
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
119
119
|
rubygems_mfa_required: 'true'
|
|
120
120
|
post_install_message:
|