scraper_utils 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +39 -9
- data/CHANGELOG.md +34 -0
- data/README.md +7 -55
- data/docs/enhancing_specs.md +86 -47
- data/docs/example_custom_Rakefile +38 -0
- data/docs/example_dot_scraper_validation.yml +23 -0
- data/docs/mechanize_utilities.md +0 -3
- data/docs/testing_custom_scrapers.md +74 -0
- data/exe/validate_scraper_data +150 -0
- data/lib/scraper_utils/log_utils.rb +5 -5
- data/lib/scraper_utils/maths_utils.rb +23 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +9 -65
- data/lib/scraper_utils/mechanize_utils.rb +0 -2
- data/lib/scraper_utils/spec_support.rb +189 -6
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +1 -5
- data/scraper_utils.gemspec +1 -0
- metadata +11 -24
- data/docs/example_scrape_with_fibers.rb +0 -31
- data/docs/fibers_and_threads.md +0 -72
- data/docs/interleaving_requests.md +0 -33
- data/docs/parallel_requests.md +0 -138
- data/docs/randomizing_requests.md +0 -38
- data/docs/reducing_server_load.md +0 -63
- data/lib/scraper_utils/cycle_utils.rb +0 -26
- data/lib/scraper_utils/date_range_utils.rb +0 -118
- data/lib/scraper_utils/mechanize_actions.rb +0 -183
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +0 -80
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +0 -151
- data/lib/scraper_utils/randomize_utils.rb +0 -37
- data/lib/scraper_utils/scheduler/constants.rb +0 -12
- data/lib/scraper_utils/scheduler/operation_registry.rb +0 -101
- data/lib/scraper_utils/scheduler/operation_worker.rb +0 -199
- data/lib/scraper_utils/scheduler/process_request.rb +0 -59
- data/lib/scraper_utils/scheduler/thread_request.rb +0 -51
- data/lib/scraper_utils/scheduler/thread_response.rb +0 -59
- data/lib/scraper_utils/scheduler.rb +0 -286
data/lib/scraper_utils/log_utils.rb
CHANGED
@@ -9,12 +9,12 @@ module ScraperUtils
     LOG_TABLE = "scrape_log"
     LOG_RETENTION_DAYS = 30
 
-    # Logs a message, automatically prefixing with authority name if in a
+    # Logs a message, automatically prefixing with authority name if in a sub process
     #
     # @param message [String] the message to log
     # @return [void]
     def self.log(message, authority = nil)
-      authority ||=
+      authority ||= ENV['AUTHORITY']
      $stderr.flush
      if authority
        puts "[#{authority}] #{message}"
@@ -174,12 +174,12 @@ module ScraperUtils
 
      # Check for authorities with unexpected errors
      unexpected_errors = authorities
-
-
+                         .select { |authority| exceptions[authority] }
+                         .reject { |authority| expect_bad.include?(authority) }
 
      if unexpected_errors.any?
        errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-
+                 "(Add to MORPH_EXPECT_BAD?)"
        unexpected_errors.each do |authority|
          error = exceptions[authority]
          errors << "  #{authority}: #{error.class} - #{error}"
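The log method now falls back to the `AUTHORITY` environment variable, which a parent process can set when it runs one authority per sub process. A minimal sketch of the resulting behaviour (the authority names are hypothetical):

```ruby
require "scraper_utils"

# Hypothetical authority name, as a parent process might set for a sub process
ENV["AUTHORITY"] = "example_council"

ScraperUtils::LogUtils.log("Saved 12 records")
# prints: [example_council] Saved 12 records

# An explicit argument still wins over the environment variable
ScraperUtils::LogUtils.log("Saved 12 records", "other_council")
# prints: [other_council] Saved 12 records
```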
data/lib/scraper_utils/maths_utils.rb
ADDED
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+require "scraperwiki"
+
+module ScraperUtils
+  # Misc Maths Utilities
+  module MathsUtils
+    # Generate a fibonacci series
+    # @param max [Integer] The max the sequence goes up to
+    # @return [Array<Integer>] The fibonacci numbers up to max
+    def self.fibonacci_series(max)
+      result = []
+      # Start with the basic Fibonacci sequence
+      last_fib, this_fib = 1, 0
+      while this_fib <= max
+        result << this_fib
+        yield this_fib if block_given?
+        last_fib, this_fib = this_fib, this_fib + last_fib
+      end
+      result
+    end
+  end
+end
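The series starts at 0 and stops once the next value would exceed `max`; each value is also yielded when a block is given. A quick usage sketch based on the code above:

```ruby
require "scraper_utils/maths_utils"

ScraperUtils::MathsUtils.fibonacci_series(20)
# => [0, 1, 1, 2, 3, 5, 8, 13]

# The block form yields each number as it is generated
ScraperUtils::MathsUtils.fibonacci_series(5) { |n| print n, " " }
# prints: 0 1 1 2 3 5
```

Note the duplicate 1 at the start of the series, which is why the new `check_info_url_details` below calls `.uniq` on the result before using it as a set of sample indices.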
data/lib/scraper_utils/mechanize_utils/agent_config.rb
CHANGED
@@ -10,8 +10,7 @@ module ScraperUtils
     #
     # @example Setting global defaults
     #   ScraperUtils::MechanizeUtils::AgentConfig.configure do |config|
-    #     config.default_timeout =
-    #     config.default_random_delay = 5
+    #     config.default_timeout = 500
     #   end
     #
     # @example Creating an instance with defaults
@@ -24,24 +23,12 @@ module ScraperUtils
     #   )
     class AgentConfig
       DEFAULT_TIMEOUT = 60
-      DEFAULT_RANDOM_DELAY = 0
-      DEFAULT_MAX_LOAD = 50.0
-      MAX_LOAD_CAP = 80.0
 
       # Class-level defaults that can be modified
       class << self
         # @return [Integer] Default timeout in seconds for agent connections
         attr_accessor :default_timeout
 
-        # @return [Boolean] Default setting for compliance with headers and robots.txt
-        attr_accessor :default_compliant_mode
-
-        # @return [Integer, nil] Default average random delay in seconds
-        attr_accessor :default_random_delay
-
-        # @return [Float, nil] Default maximum server load percentage (nil = no response delay)
-        attr_accessor :default_max_load
-
         # @return [Boolean] Default setting for SSL certificate verification
         attr_accessor :default_disable_ssl_certificate_check
 
@@ -55,9 +42,7 @@ module ScraperUtils
      # @yield [self] Yields self for configuration
      # @example
      #   AgentConfig.configure do |config|
-     #     config.default_timeout =
-     #     config.default_random_delay = 5
-     #     config.default_max_load = 15
+     #     config.default_timeout = 300
      #   end
      # @return [void]
      def configure
@@ -68,9 +53,6 @@ module ScraperUtils
      # @return [void]
      def reset_defaults!
        @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
-        @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-        @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
-        @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 50.0
        @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
        @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
        @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
@@ -89,10 +71,6 @@ module ScraperUtils
 
      # Creates Mechanize agent configuration with sensible defaults overridable via configure
      # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
-      # @param compliant_mode [Boolean, nil] Comply with headers and robots.txt (default: true)
-      # @param random_delay [Integer, nil] Average random delay in seconds (default: 3)
-      # @param max_load [Float, nil] Maximum server load percentage (nil = no delay, default: 20%)
-      #   When compliant_mode is true, max_load is capped at 33%
      # @param disable_ssl_certificate_check [Boolean, nil] Skip SSL verification (default: false)
      # @param australian_proxy [Boolean, nil] Use proxy if available (default: false)
      # @param user_agent [String, nil] Configure Mechanize user agent
@@ -104,10 +82,6 @@ module ScraperUtils
                     australian_proxy: nil,
                     user_agent: nil)
        @timeout = timeout.nil? ? self.class.default_timeout : timeout
-        @compliant_mode = compliant_mode.nil? ? self.class.default_compliant_mode : compliant_mode
-        @random_delay = random_delay.nil? ? self.class.default_random_delay : random_delay
-        @max_load = max_load.nil? ? self.class.default_max_load : max_load
-        @max_load = [@max_load || DEFAULT_MAX_LOAD, MAX_LOAD_CAP].min if @compliant_mode
        @user_agent = user_agent.nil? ? self.class.default_user_agent : user_agent
 
        @disable_ssl_certificate_check = if disable_ssl_certificate_check.nil?
@@ -144,13 +118,9 @@ module ScraperUtils
 
        today = Date.today.strftime("%Y-%m-%d")
        @user_agent = ENV.fetch("MORPH_USER_AGENT", nil)&.sub("TODAY", today)
-
-
-          @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
-        end
+        version = ScraperUtils::VERSION
+        @user_agent ||= "Mozilla/5.0 (compatible; ScraperUtils/#{version} #{today}; +https://github.com/ianheggie-oaf/scraper_utils)"
 
-        @robots_checker = RobotsChecker.new(@user_agent) if @user_agent
-        @adaptive_delay = AdaptiveDelay.new(max_load: @max_load) if @max_load
        display_options
      end
 
@@ -164,13 +134,11 @@ module ScraperUtils
          agent.open_timeout = @timeout
          agent.read_timeout = @timeout
        end
-
-
-
-
-
-        agent.request_headers["Upgrade-Insecure-Requests"] = "1"
-      end
+        agent.user_agent = user_agent
+        agent.request_headers ||= {}
+        agent.request_headers["Accept"] =
+          "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+        agent.request_headers["Upgrade-Insecure-Requests"] = "1"
        if @australian_proxy
          agent.agent.set_proxy(ScraperUtils.australian_proxy)
          agent.request_headers["Accept-Language"] = "en-AU,en-US;q=0.9,en;q=0.8"
@@ -191,9 +159,6 @@ module ScraperUtils
                        else
                          "australian_proxy=#{@australian_proxy.inspect}"
                        end
-        display_args << "compliant_mode" if @compliant_mode
-        display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
-        display_args << "max_load=#{@max_load}%" if @max_load
        display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
        display_args << "default args" if display_args.empty?
        ScraperUtils::LogUtils.log(
@@ -220,27 +185,6 @@ module ScraperUtils
            "after #{response_time} seconds"
          )
        end
-
-        if @robots_checker&.disallowed?(uri)
-          raise ScraperUtils::UnprocessableSite,
-                "URL is disallowed by robots.txt specific rules: #{uri}"
-        end
-
-        @delay_till = nil
-        @delay = @robots_checker&.crawl_delay&.round(3)
-        debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
-        unless @delay&.positive?
-          delays = {
-            max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
-            random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
-          }
-          @delay = [delays[:max_load], delays[:random]].compact.sum
-          debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
-        end
-        if @delay&.positive?
-          @delay_till = Time.now + @delay
-          ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
-        end
        response
      end
 
data/lib/scraper_utils/mechanize_utils.rb
CHANGED
@@ -3,9 +3,7 @@
 require "mechanize"
 require "ipaddr"
 
-require_relative "mechanize_utils/adaptive_delay"
 require_relative "mechanize_utils/agent_config"
-require_relative "mechanize_utils/robots_checker"
 
 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
data/lib/scraper_utils/spec_support.rb
CHANGED
@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 require "scraperwiki"
+require "cgi"
 
 module ScraperUtils
   # Methods to support specs
@@ -35,6 +36,41 @@ module ScraperUtils
 
    AUSTRALIAN_POSTCODES = /\b\d{4}\b/.freeze
 
+    def self.fetch_url_with_redirects(url)
+      agent = Mechanize.new
+      # FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
+      agent.get(url)
+    end
+
+    def self.authority_label(results, prefix: '', suffix: '')
+      return nil if results.nil?
+
+      authority_labels = results.map { |record| record['authority_label'] }.compact.uniq
+      return nil if authority_labels.empty?
+
+      raise "Expected one authority_label, not #{authority_labels.inspect}" if authority_labels.size > 1
+      "#{prefix}#{authority_labels.first}#{suffix}"
+    end
+
+    # Validates enough addresses are geocodable
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
+    # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+    # @raise RuntimeError if insufficient addresses are geocodable
+    def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
+      return nil if results.empty?
+
+      geocodable = results
+                   .map { |record| record["address"] }
+                   .uniq
+                   .count { |text| ScraperUtils::SpecSupport.geocodable? text }
+      puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
+           "(#{(100.0 * geocodable / results.count).round(1)}%)"
+      expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
+      raise "Expected at least #{expected} (#{percentage}% - #{variation}) geocodable addresses, got #{geocodable}" unless geocodable >= expected
+      geocodable
+    end
+
    # Check if an address is likely to be geocodable by analyzing its format.
    # This is a bit stricter than needed - typically assert >= 75% match
    # @param address [String] The address to check
@@ -43,7 +79,7 @@ module ScraperUtils
      return false if address.nil? || address.empty?
      check_address = ignore_case ? address.upcase : address
 
-      # Basic structure check - must have a street
+      # Basic structure check - must have a street type or unit/lot, uppercase suburb or postcode, state
      has_state = AUSTRALIAN_STATES.any? { |state| check_address.end_with?(" #{state}") || check_address.include?(" #{state} ") }
      has_postcode = address.match?(AUSTRALIAN_POSTCODES)
 
@@ -52,20 +88,20 @@ module ScraperUtils
 
      has_unit_or_lot = address.match?(/\b(Unit|Lot:?)\s+\d+/i)
 
-
+      uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
+      has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
 
      if ENV["DEBUG"]
        missing = []
        unless has_street_type || has_unit_or_lot
          missing << "street type / unit / lot"
        end
+        missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
        missing << "state" unless has_state
-        missing << "postcode" unless has_postcode
-        missing << "suburb state" unless has_suburb_stats
        puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
      end
 
-      (has_street_type || has_unit_or_lot) &&
+      (has_street_type || has_unit_or_lot) && (has_postcode || has_uppercase_suburb) && has_state
    end
 
    PLACEHOLDERS = [
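Under the revised check an address needs a street type (or unit/lot), a postcode or an uppercase non-state suburb word, and a state. Illustrative, made-up addresses (the street-type regexp itself is defined earlier in the file and not shown in this hunk):

```ruby
# Passes: unit number, uppercase suburb plus postcode, and state
ScraperUtils::SpecSupport.geocodable?("Unit 2 10 Example St SUNBURY VIC 3429")
# => true

# Fails: no postcode and no uppercase word other than the state abbreviation
ScraperUtils::SpecSupport.geocodable?("Lot 5 example road VIC")
# => false
```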
@@ -80,11 +116,158 @@ module ScraperUtils
      PLACEHOLDERS.any? { |placeholder| text.to_s.match?(placeholder) }
    end
 
+    # Validates enough descriptions are reasonable
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of descriptions expected to be reasonable (default:50)
+    # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+    # @raise RuntimeError if insufficient descriptions are reasonable
+    def self.validate_descriptions_are_reasonable!(results, percentage: 50, variation: 3)
+      return nil if results.empty?
+
+      descriptions = results
+                     .map { |record| record["description"] }
+                     .uniq
+                     .count do |text|
+        selected = ScraperUtils::SpecSupport.reasonable_description? text
+        puts " description: #{text} is not reasonable" if ENV["DEBUG"] && !selected
+        selected
+      end
+      puts "Found #{descriptions} out of #{results.count} unique reasonable descriptions " \
+           "(#{(100.0 * descriptions / results.count).round(1)}%)"
+      expected = [(percentage.to_f / 100.0) * results.count - variation, 1].max
+      raise "Expected at least #{expected} (#{percentage}% - #{variation}) reasonable descriptions, got #{descriptions}" unless descriptions >= expected
+      descriptions
+    end
+
    # Check if this looks like a "reasonable" description
    # This is a bit stricter than needed - typically assert >= 75% match
    def self.reasonable_description?(text)
      !placeholder?(text) && text.to_s.split.size >= 3
    end
+
+    # Validates that all records use the expected global info_url and it returns 200
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param expected_url [String] The expected global info_url for this authority
+    # @raise RuntimeError if records don't use the expected URL or it doesn't return 200
+    def self.validate_uses_one_valid_info_url!(results, expected_url, bot_check_expected: false)
+      info_urls = results.map { |record| record["info_url"] }.uniq
+
+      unless info_urls.size == 1
+        raise "Expected all records to use one info_url '#{expected_url}', found: #{info_urls.size}"
+      end
+      unless info_urls.first == expected_url
+        raise "Expected all records to use global info_url '#{expected_url}', found: #{info_urls.first}"
+      end
+
+      puts "Checking the one expected info_url returns 200: #{expected_url}"
+
+      if defined?(VCR)
+        VCR.use_cassette("#{authority_label(results, suffix: '_')}one_info_url") do
+          page = fetch_url_with_redirects(expected_url)
+          validate_page_response(page, bot_check_expected)
+        end
+      else
+        page = fetch_url_with_redirects(expected_url)
+        validate_page_response(page, bot_check_expected)
+      end
+    end
+
+    # Validates that info_urls have expected details (unique URLs with content validation)
+    # @param results [Array<Hash>] The results from scraping an authority
+    # @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
+    # @param variation [Integer] The variation allowed in addition to percentage (default:3)
+    # @raise RuntimeError if insufficient detail checks pass
+    def self.validate_info_urls_have_expected_details!(results, percentage: 75, variation: 3, bot_check_expected: false)
+      if defined?(VCR)
+        VCR.use_cassette("#{authority_label(results, suffix: '_')}info_url_details") do
+          check_info_url_details(results, percentage, variation, bot_check_expected)
+        end
+      else
+        check_info_url_details(results, percentage, variation, bot_check_expected)
+      end
+    end
+
+    # Check if the page response indicates bot protection
+    # @param page [Mechanize::Page] The page response to check
+    # @return [Boolean] True if bot protection is detected
+    def self.bot_protection_detected?(page)
+      return true if %w[403 429].include?(page.code)
+
+      return false unless page.body
+
+      body_lower = page.body.downcase
+
+      # Check for common bot protection indicators
+      bot_indicators = [
+        'recaptcha',
+        'cloudflare',
+        'are you human',
+        'bot detection',
+        'security check',
+        'verify you are human',
+        'access denied',
+        'blocked',
+        'captcha'
+      ]
+
+      bot_indicators.any? { |indicator| body_lower.include?(indicator) }
+    end
+
+    # Validate page response, accounting for bot protection
+    # @param page [Mechanize::Page] The page response to validate
+    # @param bot_check_expected [Boolean] Whether bot protection is acceptable
+    # @raise RuntimeError if page response is invalid and bot protection not expected
+    def self.validate_page_response(page, bot_check_expected)
+      if bot_check_expected && bot_protection_detected?(page)
+        puts " Bot protection detected - accepting as valid response"
+        return
+      end
+
+      raise "Expected 200 response from the one expected info_url, got #{page.code}" unless page.code == "200"
+    end
+
+    private
+
+    def self.check_info_url_details(results, percentage, variation, bot_check_expected)
+      count = 0
+      failed = 0
+      fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
+
+      fib_indices.each do |index|
+        record = results[index]
+        info_url = record["info_url"]
+        puts "Checking info_url[#{index}]: #{info_url} has the expected reference, address and description..."
+
+        page = fetch_url_with_redirects(info_url)
+
+        if bot_check_expected && bot_protection_detected?(page)
+          puts " Bot protection detected - skipping detailed validation"
+          next
+        end
+
+        raise "Expected 200 response, got #{page.code}" unless page.code == "200"
+
+        page_body = page.body.dup.force_encoding("UTF-8").gsub(/\s\s+/, " ")
+
+        %w[council_reference address description].each do |attribute|
+          count += 1
+          expected = CGI.escapeHTML(record[attribute]).gsub(/\s\s+/, " ")
+          expected2 = expected.gsub(/(\S+)\s+(\S+)\z/, '\2 \1') # Handle Lismore post-code/state swap
+
+          next if page_body.include?(expected) || page_body.include?(expected2)
+
+          failed += 1
+          puts "  Missing: #{expected}"
+          puts "    IN: #{page_body}" if ENV['DEBUG']
+
+          min_required = [((percentage.to_f / 100.0) * count - variation), 1].max
+          passed = count - failed
+          raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
+        end
+      end
+
+      puts "#{(100.0 * (count - failed) / count).round(1)}% detail checks passed (#{failed}/#{count} failed)!" if count > 0
+    end
+
  end
 end
-
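Taken together, these helpers let a scraper's specs assert aggregate data quality instead of exact records. A hypothetical RSpec sketch of how they might be called (the authority, expected URL and thresholds below are placeholders, not part of this gem):

```ruby
# spec/scraper_spec.rb - illustrative only
require "scraper_utils"
require "scraperwiki"

RSpec.describe "scraper results" do
  let(:results) { ScraperWiki.select("* from data") }

  it "produces geocodable addresses and reasonable descriptions" do
    ScraperUtils::SpecSupport.validate_addresses_are_geocodable!(results, percentage: 70, variation: 3)
    ScraperUtils::SpecSupport.validate_descriptions_are_reasonable!(results)
  end

  it "links to one valid info_url" do
    ScraperUtils::SpecSupport.validate_uses_one_valid_info_url!(
      results, "https://example.gov.au/applications", bot_check_expected: true
    )
  end
end
```

When VCR is loaded, the URL checks record and replay cassettes automatically (named via `authority_label`), so repeated spec runs do not hammer the council's site.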
data/lib/scraper_utils.rb
CHANGED
@@ -4,18 +4,14 @@ require "scraper_utils/version"
 
 # Public Apis (responsible for requiring their own dependencies)
 require "scraper_utils/authority_utils"
-require "scraper_utils/cycle_utils"
 require "scraper_utils/data_quality_monitor"
-require "scraper_utils/date_range_utils"
 require "scraper_utils/db_utils"
 require "scraper_utils/debug_utils"
 require "scraper_utils/log_utils"
-require "scraper_utils/
-require "scraper_utils/scheduler"
+require "scraper_utils/maths_utils"
 require "scraper_utils/spec_support"
 
 # Mechanize utilities
-require "scraper_utils/mechanize_actions"
 require "scraper_utils/mechanize_utils"
 
 # Utilities for planningalerts scrapers
data/scraper_utils.gemspec
CHANGED
@@ -41,5 +41,6 @@ Gem::Specification.new do |spec|
   spec.add_dependency "mechanize"
   spec.add_dependency "nokogiri"
   spec.add_dependency "sqlite3"
+  # Do NOT add - it depends on a non-default branch spec.add_dependency "scraperwiki"
   spec.metadata["rubygems_mfa_required"] = "true"
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scraper_utils
 version: !ruby/object:Gem::Version
-  version: 0.8.
+  version: 0.8.3
 platform: ruby
 authors:
 - Ian Heggie
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-07-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -56,7 +56,8 @@ description: Utilities to help make planningalerts scrapers, especially multi au
   scrapers, easier to develop, run and debug.
 email:
 - ian@heggie.biz
-executables:
+executables:
+- validate_scraper_data
 extensions: []
 extra_rdoc_files: []
 files:
@@ -79,36 +80,22 @@ files:
 - bin/setup
 - docs/debugging.md
 - docs/enhancing_specs.md
-- docs/
+- docs/example_custom_Rakefile
+- docs/example_dot_scraper_validation.yml
 - docs/example_scraper.rb
-- docs/fibers_and_threads.md
 - docs/getting_started.md
-- docs/interleaving_requests.md
 - docs/mechanize_utilities.md
-- docs/
-
-- docs/reducing_server_load.md
+- docs/testing_custom_scrapers.md
+- exe/validate_scraper_data
 - lib/scraper_utils.rb
 - lib/scraper_utils/authority_utils.rb
-- lib/scraper_utils/cycle_utils.rb
 - lib/scraper_utils/data_quality_monitor.rb
-- lib/scraper_utils/date_range_utils.rb
 - lib/scraper_utils/db_utils.rb
 - lib/scraper_utils/debug_utils.rb
 - lib/scraper_utils/log_utils.rb
-- lib/scraper_utils/
+- lib/scraper_utils/maths_utils.rb
 - lib/scraper_utils/mechanize_utils.rb
-- lib/scraper_utils/mechanize_utils/adaptive_delay.rb
 - lib/scraper_utils/mechanize_utils/agent_config.rb
-- lib/scraper_utils/mechanize_utils/robots_checker.rb
-- lib/scraper_utils/randomize_utils.rb
-- lib/scraper_utils/scheduler.rb
-- lib/scraper_utils/scheduler/constants.rb
-- lib/scraper_utils/scheduler/operation_registry.rb
-- lib/scraper_utils/scheduler/operation_worker.rb
-- lib/scraper_utils/scheduler/process_request.rb
-- lib/scraper_utils/scheduler/thread_request.rb
-- lib/scraper_utils/scheduler/thread_response.rb
 - lib/scraper_utils/spec_support.rb
 - lib/scraper_utils/version.rb
 - scraper_utils.gemspec
@@ -119,7 +106,7 @@ metadata:
   allowed_push_host: https://rubygems.org
   homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
   source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.
+  documentation_uri: https://rubydoc.info/gems/scraper_utils/0.8.3
   changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
   rubygems_mfa_required: 'true'
 post_install_message:
@@ -137,7 +124,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.4.
+rubygems_version: 3.4.10
 signing_key:
 specification_version: 4
 summary: planningalerts scraper utilities
data/docs/example_scrape_with_fibers.rb
DELETED
@@ -1,31 +0,0 @@
-# frozen_string_literal: true
-
-# Example scrape method updated to use ScraperUtils::FibreScheduler
-
-def scrape(authorities, attempt)
-  ScraperUtils::Scheduler.reset!
-  exceptions = {}
-  authorities.each do |authority_label|
-    ScraperUtils::Scheduler.register_operation(authority_label) do
-      ScraperUtils::LogUtils.log(
-        "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
-      )
-      ScraperUtils::DataQualityMonitor.start_authority(authority_label)
-      YourScraper.scrape(authority_label) do |record|
-        record["authority_label"] = authority_label.to_s
-        ScraperUtils::DbUtils.save_record(record)
-      rescue ScraperUtils::UnprocessableRecord => e
-        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
-        exceptions[authority_label] = e
-        # Continues processing other records
-      end
-    rescue StandardError => e
-      warn "#{authority_label}: ERROR: #{e}"
-      warn e.backtrace || "No backtrace available"
-      exceptions[authority_label] = e
-    end
-    # end of register_operation block
-  end
-  ScraperUtils::Scheduler.run_operations
-  exceptions
-end
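With the Scheduler removed in 0.8.3, a scraper's `scrape` method reduces to a plain sequential loop. The following sketch is an assumption based on the deleted example above, not code shipped in this release; `YourScraper` remains a placeholder:

```ruby
# frozen_string_literal: true

# Assumed sequential replacement for the deleted fiber-based example
def scrape(authorities, attempt)
  exceptions = {}
  authorities.each do |authority_label|
    ScraperUtils::LogUtils.log(
      "Collecting feed data for #{authority_label}, attempt: #{attempt}..."
    )
    ScraperUtils::DataQualityMonitor.start_authority(authority_label)
    YourScraper.scrape(authority_label) do |record|
      record["authority_label"] = authority_label.to_s
      ScraperUtils::DbUtils.save_record(record)
    rescue ScraperUtils::UnprocessableRecord => e
      ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
      exceptions[authority_label] = e
      # Continue processing the remaining records
    end
  rescue StandardError => e
    warn "#{authority_label}: ERROR: #{e}"
    warn e.backtrace || "No backtrace available"
    exceptions[authority_label] = e
  end
  exceptions
end
```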