scraper_utils 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -8
- data/CHANGELOG.md +5 -0
- data/GUIDELINES.md +75 -0
- data/Gemfile +1 -1
- data/IMPLEMENTATION.md +33 -0
- data/README.md +226 -131
- data/SPECS.md +25 -0
- data/bin/console +1 -0
- data/bin/setup +2 -1
- data/lib/scraper_utils/adaptive_delay.rb +65 -0
- data/lib/scraper_utils/authority_utils.rb +2 -2
- data/lib/scraper_utils/data_quality_monitor.rb +53 -0
- data/lib/scraper_utils/db_utils.rb +2 -1
- data/lib/scraper_utils/debug_utils.rb +13 -20
- data/lib/scraper_utils/fiber_scheduler.rb +206 -0
- data/lib/scraper_utils/log_utils.rb +57 -26
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +255 -0
- data/lib/scraper_utils/mechanize_utils.rb +23 -29
- data/lib/scraper_utils/robots_checker.rb +144 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -0
- data/scraper_utils.gemspec +3 -8
- metadata +13 -74
data/lib/scraper_utils/adaptive_delay.rb
@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+require "uri"
+
+# Adapts delays between requests based on server response times.
+# Target delay is proportional to response time based on max_load setting.
+# Uses an exponential moving average to smooth variations in response times.
+class AdaptiveDelay
+  DEFAULT_MIN_DELAY = 0.0
+  DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+
+  attr_reader :min_delay, :max_delay, :max_load
+
+  # Creates a new adaptive delay calculator
+  #
+  # @param min_delay [Float] Minimum delay between requests in seconds
+  # @param max_delay [Float] Maximum delay between requests in seconds
+  # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+  #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+  def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+    @delays = {} # domain -> last delay used
+    @min_delay = min_delay.to_f
+    @max_delay = max_delay.to_f
+    @max_load = max_load.to_f.clamp(1.0, 99.0)
+    @response_multiplier = (100.0 - @max_load) / @max_load
+
+    if ENV["DEBUG"]
+      ScraperUtils::FiberScheduler.log "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds"
+      ScraperUtils::FiberScheduler.log "Using max_load of #{@max_load}% (response time multiplier: #{@response_multiplier.round(2)}x)"
+    end
+  end
+
+  # @param uri [URI::Generic, String] The URL to extract the domain from
+  # @return [String] The domain in the format "scheme://host"
+  def domain(uri)
+    uri = URI(uri) unless uri.is_a?(URI)
+    "#{uri.scheme}://#{uri.host}".downcase
+  end
+
+  # @param uri [URI::Generic, String] URL to get delay for
+  # @return [Float] Current delay for the domain, or min_delay if no delay set
+  def delay(uri)
+    @delays[domain(uri)] || @min_delay
+  end
+
+  # @param uri [URI::Generic, String] URL the response came from
+  # @param response_time [Float] Time in seconds the server took to respond
+  # @return [Float] The calculated delay to use with the next request
+  def next_delay(uri, response_time)
+    uris_domain = domain(uri)
+    target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+    current_delay = @delays[uris_domain] || target_delay
+    delay = ((9.0 * current_delay) + target_delay) / 10.0
+    delay = delay.clamp(@min_delay, @max_delay)
+
+    if ENV["DEBUG"]
+      ScraperUtils::FiberScheduler.log "Adaptive delay for #{uris_domain} updated to " \
+                                       "#{delay.round(2)}s (target: #{@response_multiplier.round(1)}x " \
+                                       "response_time of #{response_time.round(2)}s)"
+    end
+
+    @delays[uris_domain] = delay
+    delay
+  end
+end
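For context, a minimal usage sketch of the AdaptiveDelay class added above. The URL, the Mechanize agent, and the `require "scraper_utils"` entry point are illustrative assumptions rather than part of this diff.

    require "mechanize"
    require "scraper_utils"

    # Aim to place at most ~20% load on the server (delay ≈ 4x response time)
    delayer = AdaptiveDelay.new(max_load: 20.0)
    agent = Mechanize.new

    url = "https://example.com/planning/applications" # illustrative URL
    started = Time.now
    agent.get(url)
    response_time = Time.now - started

    # Smoothed, load-proportional pause before the next request to this domain
    sleep delayer.next_delay(url, response_time)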
data/lib/scraper_utils/authority_utils.rb
@@ -3,13 +3,13 @@
 module ScraperUtils
   # Utilities for managing and selecting authorities
   module AuthorityUtils
+    AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
+
     # Selects authorities based on environment variable or returns all authorities
     #
     # @param all_authorities [Array<Symbol>] Full list of available authorities
     # @return [Array<Symbol>] Selected subset of authorities or all authorities
     # @raise [ScraperUtils::Error] If invalid authorities are specified in MORPH_AUTHORITIES
-    AUTHORITIES_ENV_VAR = "MORPH_AUTHORITIES"
-
     def self.selected_authorities(all_authorities)
       if ENV[AUTHORITIES_ENV_VAR]
         authorities = ENV[AUTHORITIES_ENV_VAR].split(",").map(&:strip).map(&:to_sym)
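A small sketch of driving the relocated AUTHORITIES_ENV_VAR constant and selected_authorities from the environment; the authority names below are placeholders.

    require "scraper_utils"

    ALL_AUTHORITIES = %i[authority_a authority_b authority_c].freeze # placeholder list

    # With MORPH_AUTHORITIES unset, all authorities are returned;
    # MORPH_AUTHORITIES="authority_a,authority_b" selects a subset, and an
    # unknown name raises ScraperUtils::Error.
    authorities = ScraperUtils::AuthorityUtils.selected_authorities(ALL_AUTHORITIES)
    puts authorities.inspect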
data/lib/scraper_utils/data_quality_monitor.rb
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # Monitors data quality during scraping by tracking successful vs failed record processing
+  # Automatically triggers an exception if the error rate exceeds a threshold
+  class DataQualityMonitor
+    # Get the statistics for all authorities
+    # @return [Hash, nil] Hash of statistics per authority or nil if none started
+    def self.stats
+      @stats
+    end
+
+    # Notes the start of processing an authority and clears any previous stats
+    #
+    # @param authority_label [Symbol] The authority we are processing
+    # @return [void]
+    def self.start_authority(authority_label)
+      @stats ||= {}
+      @authority_label = authority_label
+      @stats[@authority_label] = { saved: 0, unprocessed: 0}
+    end
+
+    def self.threshold
+      5.01 + @stats[@authority_label][:saved] * 0.1 if @stats&.fetch(@authority_label, nil)
+    end
+
+    # Logs an unprocessable record and raises an exception if error threshold is exceeded
+    # The threshold is 5 + 10% of saved records
+    #
+    # @param e [Exception] The exception that caused the record to be unprocessable
+    # @param record [Hash, nil] The record that couldn't be processed
+    # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
+    # @return [void]
+    def self.log_unprocessable_record(e, record)
+      start_authority(:"") unless @stats
+      @stats[@authority_label][:unprocessed] += 1
+      ScraperUtils::FiberScheduler.log "Erroneous record #{@authority_label} - #{record&.fetch('address', nil) || record.inspect}: #{e}"
+      if @stats[@authority_label][:unprocessed] > threshold
+        raise ScraperUtils::UnprocessableSite, "Too many unprocessable_records for #{@authority_label}: #{@stats[@authority_label].inspect} - aborting processing of site!"
+      end
+    end
+
+    # Logs a successfully saved record
+    #
+    # @param record [Hash] The record that was saved
+    # @return [void]
+    def self.log_saved_record(record)
+      start_authority(:"") unless @stats
+      @stats[@authority_label][:saved] += 1
+      ScraperUtils::FiberScheduler.log "Saving record #{@authority_label} - #{record['address']}"
+    end
+  end
+end
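A minimal sketch of the call pattern this class expects, based on the methods above; the records array and the save step are illustrative.

    require "scraper_utils"

    records = [{ "address" => "1 Example St" }, { "address" => "2 Sample Rd" }] # illustrative data

    ScraperUtils::DataQualityMonitor.start_authority(:example_authority)

    records.each do |record|
      begin
        # ... validate and save the record here ...
        ScraperUtils::DataQualityMonitor.log_saved_record(record)
      rescue StandardError => e
        # Counts the failure and raises ScraperUtils::UnprocessableSite once
        # failures exceed the threshold of 5 + 10% of saved records
        ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
      end
    end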
data/lib/scraper_utils/db_utils.rb
@@ -34,8 +34,9 @@ module ScraperUtils
                       ["council_reference"]
                     end
 
-
+
       ScraperWiki.save_sqlite(primary_key, record)
+      ScraperUtils::DataQualityMonitor.log_saved_record(record)
     end
   end
 end
data/lib/scraper_utils/debug_utils.rb
@@ -16,19 +16,11 @@ module ScraperUtils
     def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
       return unless ScraperUtils.debug?
 
-      puts
-
-
-
-
-      if headers
-        puts "Headers:"
-        puts JSON.pretty_generate(headers)
-      end
-      return unless body
-
-      puts "Body:"
-      puts JSON.pretty_generate(body)
+      puts
+      ScraperUtils::FiberScheduler.log "🔍 #{method.upcase} #{url}"
+      puts "Parameters:", JSON.pretty_generate(parameters) if parameters
+      puts "Headers:", JSON.pretty_generate(headers) if headers
+      puts "Body:", JSON.pretty_generate(body) if body
     end
 
     # Logs details of a web page when debug mode is enabled
@@ -39,15 +31,15 @@ module ScraperUtils
     def self.debug_page(page, message)
       return unless ScraperUtils.debug?
 
-      puts
-
+      puts
+      ScraperUtils::FiberScheduler.log "🔍 DEBUG: #{message}"
       puts "Current URL: #{page.uri}"
       puts "Page title: #{page.at('title').text.strip}" if page.at("title")
       puts "",
-           "Page content:"
-
-
-
+           "Page content:",
+           "-" * 40,
+           page.body,
+           "-" * 40
     end
 
     # Logs details about a specific page selector when debug mode is enabled
@@ -59,7 +51,8 @@ module ScraperUtils
     def self.debug_selector(page, selector, message)
       return unless ScraperUtils.debug?
 
-      puts
+      puts
+      ScraperUtils::FiberScheduler.log "🔍 DEBUG: #{message}"
       puts "Looking for selector: #{selector}"
       element = page.at(selector)
       if element
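For reference, a sketch of calling the debug helpers after this change, assuming ScraperUtils.debug? is switched on via the DEBUG environment variable (the request details are illustrative). Inside a registered fiber, the FiberScheduler.log lines are prefixed with the authority name.

    require "scraper_utils"

    ENV["DEBUG"] = "1" # assumption: enables ScraperUtils.debug?

    ScraperUtils::DebugUtils.debug_request(
      "POST",
      "https://example.com/search", # illustrative URL
      parameters: { "q" => "planning" },
      headers: { "Accept" => "application/json" }
    )

    # With a fetched Mechanize page in hand:
    # ScraperUtils::DebugUtils.debug_page(page, "After search submit")
    # ScraperUtils::DebugUtils.debug_selector(page, "table.results", "Checking results table")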
data/lib/scraper_utils/fiber_scheduler.rb
@@ -0,0 +1,206 @@
+# frozen_string_literal: true
+
+require 'fiber'
+
+module ScraperUtils
+  # A utility module for interleaving multiple scraping operations
+  # using fibers during connection delay periods. This allows efficient
+  # use of wait time by switching between operations.
+  module FiberScheduler
+    # @return [Array<Fiber>] List of active fibers managed by the scheduler
+    def self.registry
+      @registry ||= []
+    end
+
+    # Checks if the current code is running within a registered fiber
+    #
+    # @return [Boolean] true if running in a registered fiber, false otherwise
+    def self.in_fiber?
+      !Fiber.current.nil? && registry.include?(Fiber.current)
+    end
+
+    # Gets the authority associated with the current fiber
+    #
+    # @return [String, nil] the authority name or nil if not in a fiber
+    def self.current_authority
+      return nil unless in_fiber?
+      Fiber.current.instance_variable_get(:@authority)
+    end
+
+    # Logs a message, automatically prefixing with authority name if in a fiber
+    #
+    # @param message [String] the message to log
+    # @return [void]
+    def self.log(message)
+      authority = current_authority
+      if authority
+        puts "[#{authority}] #{message}"
+      else
+        puts message
+      end
+    end
+
+    # Returns a hash of exceptions encountered during processing, indexed by authority
+    #
+    # @return [Hash{Symbol => Exception}] exceptions by authority
+    def self.exceptions
+      @exceptions ||= {}
+    end
+
+    # Returns a hash of values which will be the values yielded along the way then the block value when it completes
+    #
+    # @return [Hash{Symbol => Any}] values by authority
+    def self.values
+      @values ||= {}
+    end
+
+    # Checks if fiber scheduling is currently enabled
+    #
+    # @return [Boolean] true if enabled, false otherwise
+    def self.enabled?
+      @enabled ||= false
+    end
+
+    # Enables fiber scheduling
+    #
+    # @return [void]
+    def self.enable!
+      reset! unless enabled?
+      @enabled = true
+    end
+
+    # Disables fiber scheduling
+    #
+    # @return [void]
+    def self.disable!
+      @enabled = false
+    end
+
+    # Resets the scheduler state, and disables the scheduler. Use this before retrying failed authorities.
+    #
+    # @return [void]
+    def self.reset!
+      @registry = []
+      @exceptions = {}
+      @values = {}
+      @enabled = false
+      @delay_requested = 0.0
+      @time_slept = 0.0
+      @resume_count = 0
+      @initial_resume_at = Time.now - 60.0 # one minute ago
+    end
+
+    # Registers a block to scrape for a specific authority
+    #
+    # @param authority [String] the name of the authority being processed
+    # @yield to the block containing the scraping operation to be run in the fiber
+    # @return [Fiber] the created fiber that calls the block. With @authority and @resume_at instance variables
+    def self.register_operation(authority, &block)
+      # Automatically enable fiber scheduling when operations are registered
+      enable!
+
+      fiber = Fiber.new do
+        begin
+          values[authority] = block.call
+        rescue StandardError => e
+          # Store exception against the authority
+          exceptions[authority] = e
+        ensure
+          # Remove itself when done regardless of success/failure
+          registry.delete(Fiber.current)
+        end
+      end
+
+      # Start fibres in registration order
+      @initial_resume_at += 0.1
+      fiber.instance_variable_set(:@resume_at, @initial_resume_at)
+      fiber.instance_variable_set(:@authority, authority)
+      registry << fiber
+
+      puts "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving" if ENV['DEBUG']
+      # Important: Don't immediately resume the fiber here
+      # Let the caller decide when to start or coordinate fibers
+      fiber
+    end
+
+    # Run all registered fibers until completion
+    #
+    # @return [Hash] Exceptions that occurred during execution
+    def self.run_all
+      count = registry.size
+      while (fiber = find_earliest_fiber)
+        if fiber.alive?
+          authority = fiber.instance_variable_get(:@authority) rescue nil
+          @resume_count ||= 0
+          @resume_count += 1
+          values[authority] = fiber.resume
+        else
+          puts "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
+          registry.delete(fiber)
+        end
+      end
+
+      percent_slept = (100.0 * @time_slept / @delay_requested).round(1) if @time_slept&.positive? && @delay_requested&.positive?
+      puts "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, sleeping " \
+           "#{percent_slept}% (#{@time_slept&.round(1)}) of the #{@delay_requested&.round(1)} seconds requested."
+
+      exceptions
+    end
+
+    # Delays the current fiber and potentially runs another one
+    # Falls back to regular sleep if fiber scheduling is not enabled
+    #
+    # @param seconds [Numeric] the number of seconds to delay
+    # @return [Integer] return from sleep operation or 0
+    def self.delay(seconds)
+      seconds = 0.0 unless seconds&.positive?
+      @delay_requested ||= 0.0
+      @delay_requested += seconds
+
+      current_fiber = Fiber.current
+
+      if !enabled? || !current_fiber || registry.size <= 1
+        @time_slept ||= 0.0
+        @time_slept += seconds
+        return sleep(seconds)
+      end
+
+      resume_at = Time.now + seconds
+
+      # Used to compare when other fibers need to be resumed
+      current_fiber.instance_variable_set(:@resume_at, resume_at)
+
+      # Yield control back to the scheduler so another fiber can run
+      Fiber.yield
+
+      # When we get control back, check if we need to sleep more
+      remaining = resume_at - Time.now
+      if remaining.positive?
+        @time_slept ||= 0.0
+        @time_slept += remaining
+        sleep(remaining)
+      end || 0
+    end
+
+    # Finds the fiber with the earliest wake-up time
+    #
+    # @return [Fiber, nil] the fiber with the earliest wake-up time or nil if none found
+    def self.find_earliest_fiber
+      earliest_time = nil
+      earliest_fiber = nil
+
+      registry.each do |fiber|
+        resume_at = fiber.instance_variable_get(:@resume_at)
+        if earliest_time.nil? || resume_at < earliest_time
+          earliest_time = resume_at
+          earliest_fiber = fiber
+        end
+      end
+
+      earliest_fiber
+    end
+
+    # Mark methods as private
+    private_class_method :find_earliest_fiber
+  end
+end
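A minimal sketch of how the scheduler is driven, based on the API above; the two authorities and their work blocks are placeholders. The calls to delay inside each block are what let the scheduler interleave operations while one authority waits.

    require "scraper_utils"

    %i[authority_a authority_b].each do |authority| # placeholder authorities
      ScraperUtils::FiberScheduler.register_operation(authority) do
        3.times do |i|
          ScraperUtils::FiberScheduler.log "request #{i + 1}"
          # ... fetch and process a page here ...
          # Yields to another registered fiber instead of blocking the whole process
          ScraperUtils::FiberScheduler.delay(2.0)
        end
      end
    end

    exceptions = ScraperUtils::FiberScheduler.run_all
    exceptions.each { |authority, error| warn "#{authority}: #{error.message}" }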
data/lib/scraper_utils/log_utils.rb
@@ -13,13 +13,10 @@ module ScraperUtils
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param
-    #
-    # - :unprocessable_records [Integer] Optional Number of unprocessable record like regions
-    # - :error [Exception, nil] Any exception that occurred during scraping
-    # - :proxy_used [Boolean] Whether a proxy was used
+    # @param exceptions [Hash > Exception] Any exception that occurred during scraping
+    #   DataQualityMonitor.stats is checked for :saved and :unprocessed entries
     # @return [void]
-    def self.log_scraping_run(start_time, attempt, authorities,
+    def self.log_scraping_run(start_time, attempt, authorities, exceptions)
       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
       raise ArgumentError, "Authorities must be a non-empty array" if authorities.empty?
 
@@ -31,10 +28,11 @@ module ScraperUtils
       interrupted = []
 
       authorities.each do |authority_label|
-
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority_label, nil) || {}
 
-
-
+        exception = exceptions[authority_label]
+        status = if stats[:saved]&.positive?
+                   exception ? :interrupted : :successful
                  else
                    :failed
                  end
@@ -51,13 +49,12 @@ module ScraperUtils
           "run_at" => start_time.iso8601,
           "attempt" => attempt,
           "authority_label" => authority_label.to_s,
-          "
-          "unprocessable_records" =>
-          "used_proxy" => result[:proxy_used] ? 1 : 0,
+          "records_saved" => stats[:saved] || 0,
+          "unprocessable_records" => stats[:unprocessed] || 0,
           "status" => status.to_s,
-          "error_message" =>
-          "error_class" =>
-          "error_backtrace" => extract_meaningful_backtrace(
+          "error_message" => exception&.message,
+          "error_class" => exception&.class&.to_s,
+          "error_backtrace" => extract_meaningful_backtrace(exception)
         }
 
        save_log_record(record)
@@ -76,33 +73,66 @@ module ScraperUtils
      cleanup_old_records
    end
 
-
-
-
-
+    # Report on the results
+    # @param authorities [Array<Symbol>] List of authorities attempted to scrape
+    # @param exceptions [Hash > Exception] Any exception that occurred during scraping
+    #   DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @return [void]
+    def self.report_on_results(authorities, exceptions)
+      expect_bad = ENV["MORPH_EXPECT_BAD"]&.split(",")&.map(&:strip)&.map(&:to_sym) || []
+
+      puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}" if expect_bad.any?
+
+      # Print summary table
+      puts "\nScraping Summary:"
+      summary_format = "%-20s %6s %6s %s"
+
+      puts summary_format % %w[Authority OK Bad Exception]
+      puts summary_format % ['-' * 20, '-' * 6, '-' * 6, '-' * 50]
+
+      authorities.each do |authority|
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+
+        ok_records = stats[:saved] || 0
+        bad_records = stats[:unprocessed] || 0
+
+        expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
+        exception_msg = if exceptions[authority]
+                          "#{exceptions[authority].class} - #{exceptions[authority].message}"
+                        else
+                          "-"
+                        end
+        puts summary_format % [
+          authority.to_s,
+          ok_records,
+          bad_records,
+          "#{expect_bad_prefix}#{exception_msg}".slice(0, 70)
+        ]
+      end
+      puts
 
       errors = []
 
       # Check for authorities that were expected to be bad but are now working
       unexpected_working = expect_bad.select do |authority|
-
-
+        stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
+        stats[:saved]&.positive? && !exceptions[authority]
       end
 
       if unexpected_working.any?
-        errors << "WARNING: Remove #{unexpected_working.join(',')} from
+        errors << "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
       end
 
       # Check for authorities with unexpected errors
       unexpected_errors = authorities
-                          .select { |authority|
+                          .select { |authority| exceptions[authority] }
                           .reject { |authority| expect_bad.include?(authority) }
 
       if unexpected_errors.any?
         errors << "ERROR: Unexpected errors in: #{unexpected_errors.join(',')} " \
-
+                  "(Add to MORPH_EXPECT_BAD?)"
         unexpected_errors.each do |authority|
-          error =
+          error = exceptions[authority]
          errors << " #{authority}: #{error.class} - #{error.message}"
        end
      end
@@ -134,7 +164,8 @@ module ScraperUtils
         "interrupted" => interrupted.join(","),
         "successful_count" => successful.size,
         "interrupted_count" => interrupted.size,
-        "failed_count" => failed.size
+        "failed_count" => failed.size,
+        "public_ip" => ScraperUtils::MechanizeUtils.public_ip
       }
 
       ScraperWiki.save_sqlite(
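To show the revised log_utils signatures in context, a sketch of a scraper's top-level reporting after a run; the start time, attempt number, and authority list are placeholders, and the exceptions hash would normally come from FiberScheduler.run_all.

    require "scraper_utils"

    run_start = Time.now
    authorities = %i[authority_a authority_b] # placeholder list
    exceptions = ScraperUtils::FiberScheduler.exceptions

    # Writes one log record per authority, pulling :saved / :unprocessed counts
    # from ScraperUtils::DataQualityMonitor.stats
    ScraperUtils::LogUtils.log_scraping_run(run_start, 1, authorities, exceptions)

    # Prints the summary table and compares results against MORPH_EXPECT_BAD
    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)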