scraper_utils 0.13.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/docs/misc_utilities.md +0 -6
- data/lib/scraper_utils/host_throttler.rb +86 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +25 -21
- data/lib/scraper_utils/misc_utils.rb +29 -12
- data/lib/scraper_utils/spec_support.rb +79 -5
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +2 -1
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 917ac18062a2b514b864ec39593a508c27cce14bd7c32fa71f13daed2ff442c1
|
|
4
|
+
data.tar.gz: 4f9652b9eab73158f2843730214b7e0b57a7ec854854f7be91080e06d8ec86e3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5b99f780772f265aea38cb8c09bf88c1c58a933642a4e42bd0bd424f4a51681fd596a64a84b939bb21f9a681c2b6ce832e0a32f7f4da25fc12ce1bd8fe73d2d5
|
|
7
|
+
data.tar.gz: 820d683532470049469a2926f946e58a64fbc7f24978e83593e6b8a28d656c0d544397ef35f8c39c232c4c91fc69f435a28a46cf094a6238b21a9d0b8fa57b33
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.15.0 - 2026-03-05
|
|
4
|
+
|
|
5
|
+
* Add `validate_info_urls_are_present!` to check info_urls respond with 2xx status using HEAD requests
|
|
6
|
+
* Fix pre_connect_hook hostname extraction to use `request['Host']` header
|
|
7
|
+
|
|
8
|
+
## 0.14.1 - 2026-03-04
|
|
9
|
+
|
|
10
|
+
* Can pass `known_suburbs: ['Suburb', ...]` to `ScraperUtils::SpecSupport.validate_addresses_are_geocodable!` and
|
|
11
|
+
`ScraperUtils::SpecSupport.geocodable?` to validate addresses that don't have postcodes nor capitalised suburb names
|
|
12
|
+
* Can pass ignore_case: true to relax the requirement for either postcode or uppercase suburb when you don't want to
|
|
13
|
+
pass `known_suburbs`.
|
|
14
|
+
* Move Throttling to HostThrottler
|
|
15
|
+
|
|
3
16
|
## 0.13.1 - 2026-02-21
|
|
4
17
|
|
|
5
18
|
* Added PaValidation that validates based
|
data/docs/misc_utilities.md
CHANGED
|
@@ -18,11 +18,5 @@ The throttle automatically:
|
|
|
18
18
|
- Pauses before next request based on previous timing
|
|
19
19
|
- Caps pause at 120s maximum
|
|
20
20
|
|
|
21
|
-
Override the next pause duration manually if needed:
|
|
22
|
-
|
|
23
|
-
```ruby
|
|
24
|
-
ScraperUtils::MiscUtils.pause_duration = 2.0
|
|
25
|
-
```
|
|
26
|
-
|
|
27
21
|
**Note:** the agent returned by `ScraperUtils::MechanizeUtils.mechanize_agent` automatically applies throttling when
|
|
28
22
|
each request is made and thus does not need to be wrapped with the helper.
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module ScraperUtils
|
|
4
|
+
# Tracks per-host next-allowed-request time so that time spent parsing
|
|
5
|
+
# and saving records counts toward the crawl delay rather than being
|
|
6
|
+
# added on top of it.
|
|
7
|
+
#
|
|
8
|
+
# Usage:
|
|
9
|
+
# throttler = HostThrottler.new(crawl_delay: 1.0, max_load: 50.0)
|
|
10
|
+
# throttler.before_request(hostname) # sleep until ready
|
|
11
|
+
# # ... make request ...
|
|
12
|
+
# throttler.after_request(hostname) # record timing, schedule next slot
|
|
13
|
+
# throttler.after_request(hostname, overloaded: true) # normal delay + 2x response time + 5s
|
|
14
|
+
class HostThrottler
|
|
15
|
+
MAX_DELAY = 120.0
|
|
16
|
+
|
|
17
|
+
# @param crawl_delay [Float] minimum seconds between requests per host
|
|
18
|
+
# @param max_load [Float] target server load percentage (10..100);
|
|
19
|
+
# 50 means response_time == pause_time
|
|
20
|
+
def initialize(crawl_delay: 0.0, max_load: nil)
|
|
21
|
+
@crawl_delay = crawl_delay.to_f
|
|
22
|
+
# Clamp between 10 (delay 9x response) and 100 (no extra delay)
|
|
23
|
+
@max_load = max_load ? max_load.to_f.clamp(10.0, 100.0) : nil
|
|
24
|
+
@next_request_at = {} # hostname => Time
|
|
25
|
+
@request_started_at = {} # hostname => Time
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def will_pause_till(hostname)
|
|
29
|
+
@next_request_at[hostname]
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Sleep until this host's throttle window has elapsed.
|
|
33
|
+
# Records when the request actually started.
|
|
34
|
+
# @param hostname [String]
|
|
35
|
+
# @return [void]
|
|
36
|
+
def before_request(hostname)
|
|
37
|
+
target = @next_request_at[hostname]
|
|
38
|
+
if target
|
|
39
|
+
remaining = target - Time.now
|
|
40
|
+
sleep(remaining) if remaining > 0
|
|
41
|
+
end
|
|
42
|
+
@request_started_at[hostname] = Time.now
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Calculate and store the next allowed request time for this host.
|
|
46
|
+
# @param hostname [String]
|
|
47
|
+
# @param overloaded [Boolean] true when the server signalled overload
|
|
48
|
+
(HTTP 429/500/503); adds twice the response time plus 5 seconds to the normal delay.
|
|
49
|
+
# @return [void]
|
|
50
|
+
def after_request(hostname, overloaded: false)
|
|
51
|
+
started = @request_started_at[hostname] || Time.now
|
|
52
|
+
response_time = Time.now - started
|
|
53
|
+
|
|
54
|
+
delay = @crawl_delay
|
|
55
|
+
if @max_load
|
|
56
|
+
delay += (100.0 - @max_load) * response_time / @max_load
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
if overloaded
|
|
60
|
+
delay = delay + response_time * 2 + 5.0
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
delay = delay.round(3).clamp(0.0, MAX_DELAY)
|
|
64
|
+
@next_request_at[hostname] = Time.now + delay
|
|
65
|
+
|
|
66
|
+
if DebugUtils.basic?
|
|
67
|
+
msg = "HostThrottler: #{hostname} response=#{response_time.round(3)}s"
|
|
68
|
+
msg += " OVERLOADED" if overloaded
|
|
69
|
+
msg += ", Will delay #{delay}s before next request"
|
|
70
|
+
LogUtils.log(msg)
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
# Duck-type check for HTTP overload errors across Mechanize, HTTParty, etc.
|
|
75
|
+
# @param error [Exception]
|
|
76
|
+
# @return [Boolean]
|
|
77
|
+
def self.overload_error?(error)
|
|
78
|
+
code = if error.respond_to?(:response) && error.response.respond_to?(:code)
|
|
79
|
+
error.response.code.to_i # HTTParty style
|
|
80
|
+
elsif error.respond_to?(:response_code)
|
|
81
|
+
error.response_code.to_i # Mechanize style
|
|
82
|
+
end
|
|
83
|
+
[429, 500, 503].include?(code)
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "mechanize"
|
|
4
4
|
require "ipaddr"
|
|
5
|
+
require_relative "../host_throttler"
|
|
5
6
|
|
|
6
7
|
module ScraperUtils
|
|
7
8
|
module MechanizeUtils
|
|
@@ -76,8 +77,7 @@ module ScraperUtils
|
|
|
76
77
|
attr_reader :user_agent
|
|
77
78
|
|
|
78
79
|
# Give access for testing
|
|
79
|
-
|
|
80
|
-
attr_reader :max_load, :crawl_delay
|
|
80
|
+
attr_reader :max_load, :crawl_delay, :throttler
|
|
81
81
|
|
|
82
82
|
# Creates Mechanize agent configuration with sensible defaults overridable via configure
|
|
83
83
|
# @param timeout [Integer, nil] Timeout for agent connections (default: 60)
|
|
@@ -107,6 +107,7 @@ module ScraperUtils
|
|
|
107
107
|
@crawl_delay = crawl_delay.nil? ? self.class.default_crawl_delay : crawl_delay.to_f
|
|
108
108
|
# Clamp between 10 (delay 9 x response) and 100 (no delay)
|
|
109
109
|
@max_load = (max_load.nil? ? self.class.default_max_load : max_load).to_f.clamp(10.0, 100.0)
|
|
110
|
+
@throttler = HostThrottler.new(crawl_delay: @crawl_delay, max_load: @max_load)
|
|
110
111
|
|
|
111
112
|
# Validate proxy URL format if proxy will be used
|
|
112
113
|
@australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
|
|
@@ -155,6 +156,7 @@ module ScraperUtils
|
|
|
155
156
|
|
|
156
157
|
agent.pre_connect_hooks << method(:pre_connect_hook)
|
|
157
158
|
agent.post_connect_hooks << method(:post_connect_hook)
|
|
159
|
+
agent.error_hooks << method(:error_hook) if agent.respond_to?(:error_hooks)
|
|
158
160
|
end
|
|
159
161
|
|
|
160
162
|
private
|
|
@@ -175,38 +177,40 @@ module ScraperUtils
|
|
|
175
177
|
end
|
|
176
178
|
|
|
177
179
|
def pre_connect_hook(_agent, request)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
180
|
+
hostname = (request.respond_to?(:[]) && request['Host']) || 'unknown'
|
|
181
|
+
@throttler.before_request(hostname)
|
|
182
|
+
if DebugUtils.verbose?
|
|
183
|
+
ScraperUtils::LogUtils.log(
|
|
184
|
+
"Pre Connect request: #{request.inspect}"
|
|
185
|
+
)
|
|
186
|
+
end
|
|
184
187
|
end
|
|
185
188
|
|
|
186
189
|
def post_connect_hook(_agent, uri, response, _body)
|
|
187
190
|
raise ArgumentError, "URI must be present in post-connect hook" unless uri
|
|
188
191
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
response_delay += response_time
|
|
194
|
-
if @max_load && @max_load >= 1
|
|
195
|
-
response_delay += (100.0 - @max_load) * response_time / @max_load
|
|
196
|
-
end
|
|
197
|
-
response_delay = response_delay.round(3)
|
|
198
|
-
end
|
|
192
|
+
status = response.respond_to?(:code) ? response.code.to_i : nil
|
|
193
|
+
overloaded = [429, 500, 503].include?(status)
|
|
194
|
+
hostname = uri.host || 'unknown'
|
|
195
|
+
@throttler.after_request(hostname, overloaded: overloaded)
|
|
199
196
|
|
|
200
197
|
if DebugUtils.basic?
|
|
201
198
|
ScraperUtils::LogUtils.log(
|
|
202
|
-
"Post Connect uri: #{uri.inspect}, response: #{response.inspect}
|
|
203
|
-
"after #{response_time} seconds#{response_delay > 0.0 ? ", pausing for #{response_delay} seconds" : ""}"
|
|
199
|
+
"Post Connect uri: #{uri.inspect}, response: #{response.inspect}"
|
|
204
200
|
)
|
|
205
201
|
end
|
|
206
|
-
sleep(response_delay) if response_delay > 0.0
|
|
207
202
|
response
|
|
208
203
|
end
|
|
209
204
|
|
|
205
|
+
def error_hook(_agent, error)
|
|
206
|
+
# Best-effort: record the error against whatever host we can find
|
|
207
|
+
# Mechanize errors often carry the URI in the message; fall back to 'unknown'
|
|
208
|
+
hostname = if error.respond_to?(:uri)
|
|
209
|
+
error.uri.host
|
|
210
|
+
end || 'unknown'
|
|
211
|
+
@throttler.after_request(hostname, overloaded: HostThrottler.overload_error?(error))
|
|
212
|
+
end
|
|
213
|
+
|
|
210
214
|
def verify_proxy_works(agent)
|
|
211
215
|
$stderr.flush
|
|
212
216
|
$stdout.flush
|
|
@@ -1,23 +1,40 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require_relative "host_throttler"
|
|
4
|
+
|
|
3
5
|
module ScraperUtils
|
|
4
6
|
# Misc Standalone Utilities
|
|
5
7
|
module MiscUtils
|
|
6
|
-
|
|
8
|
+
THROTTLE_HOSTNAME = "block"
|
|
7
9
|
|
|
8
10
|
class << self
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
11
|
+
# Throttle block to be nice to servers we are scraping.
|
|
12
|
+
# Time spent inside the block (parsing, saving) counts toward the delay.
|
|
13
|
+
def throttle_block
|
|
14
|
+
throttler.before_request(THROTTLE_HOSTNAME)
|
|
15
|
+
begin
|
|
16
|
+
result = yield
|
|
17
|
+
throttler.after_request(THROTTLE_HOSTNAME)
|
|
18
|
+
result
|
|
19
|
+
rescue StandardError => e
|
|
20
|
+
throttler.after_request(THROTTLE_HOSTNAME, overloaded: HostThrottler.overload_error?(e))
|
|
21
|
+
raise
|
|
16
22
|
end
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
# Reset the internal throttler (useful in tests)
|
|
26
|
+
def reset_throttler!
|
|
27
|
+
@throttler = nil
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def will_pause_till
|
|
31
|
+
throttler.will_pause_till(THROTTLE_HOSTNAME)
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
private
|
|
35
|
+
|
|
36
|
+
def throttler
|
|
37
|
+
@throttler ||= HostThrottler.new
|
|
21
38
|
end
|
|
22
39
|
end
|
|
23
40
|
end
|
|
@@ -62,6 +62,13 @@ module ScraperUtils
|
|
|
62
62
|
'certificate', 'approval', 'consent', 'permit'
|
|
63
63
|
].freeze
|
|
64
64
|
|
|
65
|
+
|
|
66
|
+
def self.fetch_url_head(url)
|
|
67
|
+
agent = Mechanize.new
|
|
68
|
+
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
69
|
+
agent.head(url)
|
|
70
|
+
end
|
|
71
|
+
|
|
65
72
|
def self.fetch_url_with_redirects(url)
|
|
66
73
|
agent = Mechanize.new
|
|
67
74
|
# FIXME - Allow injection of a check to agree to terms if needed to set a cookie and reget the url
|
|
@@ -95,14 +102,25 @@ module ScraperUtils
|
|
|
95
102
|
# @param results [Array<Hash>] The results from scraping an authority
|
|
96
103
|
# @param percentage [Integer] The min percentage of addresses expected to be geocodable (default:50)
|
|
97
104
|
# @param variation [Integer] The variation allowed in addition to percentage (default:3)
|
|
105
|
+
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
106
|
+
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
98
107
|
# @raise RuntimeError if insufficient addresses are geocodable
|
|
99
|
-
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3)
|
|
108
|
+
def self.validate_addresses_are_geocodable!(results, percentage: 50, variation: 3, ignore_case: false, known_suburbs: [])
|
|
100
109
|
return nil if results.empty?
|
|
101
110
|
|
|
102
111
|
geocodable = results
|
|
103
112
|
.map { |record| record["address"] }
|
|
104
113
|
.uniq
|
|
105
|
-
.count
|
|
114
|
+
.count do |text|
|
|
115
|
+
ok = ScraperUtils::SpecSupport.geocodable? text, known_suburbs: known_suburbs, ignore_case: ignore_case
|
|
116
|
+
if !ok && DebugUtils.verbose?
|
|
117
|
+
ScraperUtils::LogUtils.log(
|
|
118
|
+
"Address: #{text.inspect} is not geocodeable with #{known_suburbs&.size} know suburbs, ignore_case: #{ignore_case.inspect}"
|
|
119
|
+
)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
ok
|
|
123
|
+
end
|
|
106
124
|
puts "Found #{geocodable} out of #{results.count} unique geocodable addresses " \
|
|
107
125
|
"(#{(100.0 * geocodable / results.count).round(1)}%)"
|
|
108
126
|
expected = [((percentage.to_f / 100.0) * results.count - variation), 1].max
|
|
@@ -115,8 +133,10 @@ module ScraperUtils
|
|
|
115
133
|
# Check if an address is likely to be geocodable by analyzing its format.
|
|
116
134
|
# This is a bit stricter than needed - typically assert >= 75% match
|
|
117
135
|
# @param address [String] The address to check
|
|
136
|
+
# @param ignore_case [Boolean] Ignores case which relaxes suburb check
|
|
137
|
+
# @param known_suburbs [Array<String>] Known suburbs to detect in address when there is no postcode and no uppercase suburb
|
|
118
138
|
# @return [Boolean] True if the address appears to be geocodable.
|
|
119
|
-
def self.geocodable?(address, ignore_case: false)
|
|
139
|
+
def self.geocodable?(address, ignore_case: false, known_suburbs: [])
|
|
120
140
|
return false if address.nil? || address.empty?
|
|
121
141
|
check_address = ignore_case ? address.upcase : address
|
|
122
142
|
|
|
@@ -129,16 +149,17 @@ module ScraperUtils
|
|
|
129
149
|
|
|
130
150
|
uppercase_words = address.scan(/\b[A-Z]{2,}\b/)
|
|
131
151
|
has_uppercase_suburb = uppercase_words.any? { |word| !AUSTRALIAN_STATES.include?(word) }
|
|
152
|
+
has_known_suburb = known_suburbs.any? { |suburb| address.include?(suburb) }
|
|
132
153
|
|
|
133
154
|
if ENV["DEBUG"]
|
|
134
155
|
missing = []
|
|
135
156
|
missing << "street type" unless has_street_type
|
|
136
|
-
missing << "postcode/Uppercase suburb" unless has_postcode || has_uppercase_suburb
|
|
157
|
+
missing << "postcode/Uppercase suburb/Known suburb" unless has_postcode || has_uppercase_suburb || has_known_suburb
|
|
137
158
|
missing << "state" unless has_state
|
|
138
159
|
puts " address: #{address} is not geocodable, missing #{missing.join(', ')}" if missing.any?
|
|
139
160
|
end
|
|
140
161
|
|
|
141
|
-
has_street_type && (has_postcode || has_uppercase_suburb) && has_state
|
|
162
|
+
has_street_type && (has_postcode || has_uppercase_suburb || has_known_suburb) && has_state
|
|
142
163
|
end
|
|
143
164
|
|
|
144
165
|
PLACEHOLDERS = [
|
|
@@ -218,6 +239,22 @@ module ScraperUtils
|
|
|
218
239
|
end
|
|
219
240
|
end
|
|
220
241
|
|
|
242
|
+
# Validates that info_urls are present (respond to HEAD request with 200 to 299 status)
|
|
243
|
+
# @param results [Array<Hash>] The results from scraping an authority
|
|
244
|
+
# @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
|
|
245
|
+
# @param variation [Integer] The variation allowed in addition to percentage (default:3)
|
|
246
|
+
# @yield [String] Optional block to customize URL fetching (e.g., handle terms agreement)
|
|
247
|
+
# @raise RuntimeError if insufficient detail checks pass
|
|
248
|
+
def self.validate_info_urls_are_present!(results, percentage: 75, variation: 3, &block)
|
|
249
|
+
if defined?(VCR)
|
|
250
|
+
VCR.use_cassette("#{authority_label(results, suffix: '_')}info_urls") do
|
|
251
|
+
check_info_url_is_present(results, percentage, variation, &block)
|
|
252
|
+
end
|
|
253
|
+
else
|
|
254
|
+
check_info_url_is_present(results, percentage, variation, &block)
|
|
255
|
+
end
|
|
256
|
+
end
|
|
257
|
+
|
|
221
258
|
# Validates that info_urls have expected details (unique URLs with content validation)
|
|
222
259
|
# @param results [Array<Hash>] The results from scraping an authority
|
|
223
260
|
# @param percentage [Integer] The min percentage of detail checks expected to pass (default:75)
|
|
@@ -276,6 +313,43 @@ module ScraperUtils
|
|
|
276
313
|
|
|
277
314
|
private
|
|
278
315
|
|
|
316
|
+
def self.check_info_url_is_present(results, percentage, variation, &block)
|
|
317
|
+
count = 0
|
|
318
|
+
failed = 0
|
|
319
|
+
fib_indices = ScraperUtils::MathsUtils.fibonacci_series(results.size - 1).uniq
|
|
320
|
+
|
|
321
|
+
fib_indices.each do |index|
|
|
322
|
+
record = results[index]
|
|
323
|
+
info_url = record["info_url"]
|
|
324
|
+
puts "Checking info_url[#{index}]: #{info_url} is present..."
|
|
325
|
+
|
|
326
|
+
begin
|
|
327
|
+
page = block_given? ? block.call(info_url) : fetch_url_head(info_url)
|
|
328
|
+
status = page.code.to_i
|
|
329
|
+
rescue Mechanize::ResponseCodeError => e
|
|
330
|
+
status = e.response_code.to_i
|
|
331
|
+
end
|
|
332
|
+
|
|
333
|
+
if [403, 429].include?(status)
|
|
334
|
+
puts " Bot protection detected - skipping"
|
|
335
|
+
next
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
count += 1
|
|
339
|
+
if status.between?(200, 299)
|
|
340
|
+
puts " OK: #{status}" if ENV['DEBUG']
|
|
341
|
+
else
|
|
342
|
+
failed += 1
|
|
343
|
+
puts " Failed: #{status}"
|
|
344
|
+
min_required = ((percentage.to_f / 100.0) * count - variation).round(0)
|
|
345
|
+
passed = count - failed
|
|
346
|
+
raise "Too many failures: #{passed}/#{count} passed (min required: #{min_required})" if passed < min_required
|
|
347
|
+
end
|
|
348
|
+
end
|
|
349
|
+
|
|
350
|
+
puts "#{(100.0 * (count - failed) / count).round(1)}% info_url checks passed (#{failed}/#{count} failed)!" if count > 0
|
|
351
|
+
end
|
|
352
|
+
|
|
279
353
|
def self.check_info_url_details(results, percentage, variation, bot_check_expected, &block)
|
|
280
354
|
count = 0
|
|
281
355
|
failed = 0
|
data/lib/scraper_utils.rb
CHANGED
|
@@ -5,12 +5,13 @@ require "scraper_utils/version"
|
|
|
5
5
|
# Public Apis (responsible for requiring their own dependencies)
|
|
6
6
|
require "scraper_utils/authority_utils"
|
|
7
7
|
require "scraper_utils/data_quality_monitor"
|
|
8
|
-
require "scraper_utils/pa_validation"
|
|
9
8
|
require "scraper_utils/db_utils"
|
|
10
9
|
require "scraper_utils/debug_utils"
|
|
10
|
+
require "scraper_utils/host_throttler"
|
|
11
11
|
require "scraper_utils/log_utils"
|
|
12
12
|
require "scraper_utils/maths_utils"
|
|
13
13
|
require "scraper_utils/misc_utils"
|
|
14
|
+
require "scraper_utils/pa_validation"
|
|
14
15
|
require "scraper_utils/spec_support"
|
|
15
16
|
|
|
16
17
|
# Mechanize utilities
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.15.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ian Heggie
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -97,6 +97,7 @@ files:
|
|
|
97
97
|
- lib/scraper_utils/data_quality_monitor.rb
|
|
98
98
|
- lib/scraper_utils/db_utils.rb
|
|
99
99
|
- lib/scraper_utils/debug_utils.rb
|
|
100
|
+
- lib/scraper_utils/host_throttler.rb
|
|
100
101
|
- lib/scraper_utils/log_utils.rb
|
|
101
102
|
- lib/scraper_utils/maths_utils.rb
|
|
102
103
|
- lib/scraper_utils/mechanize_utils.rb
|
|
@@ -113,7 +114,7 @@ metadata:
|
|
|
113
114
|
allowed_push_host: https://rubygems.org
|
|
114
115
|
homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
115
116
|
source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
|
|
116
|
-
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.
|
|
117
|
+
documentation_uri: https://rubydoc.info/gems/scraper_utils/0.15.0
|
|
117
118
|
changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
|
|
118
119
|
rubygems_mfa_required: 'true'
|
|
119
120
|
post_install_message:
|