scraper_utils 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +4 -0
- data/CHANGELOG.md +10 -1
- data/Gemfile +5 -2
- data/README.md +103 -149
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +55 -50
- data/lib/scraper_utils/data_quality_monitor.rb +28 -17
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +0 -2
- data/lib/scraper_utils/debug_utils.rb +53 -6
- data/lib/scraper_utils/fiber_scheduler.rb +45 -22
- data/lib/scraper_utils/log_utils.rb +19 -17
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +67 -46
- data/lib/scraper_utils/mechanize_utils.rb +12 -4
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +9 -4
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -10
- metadata +6 -2
data/lib/scraper_utils/data_quality_monitor.rb

@@ -6,8 +6,8 @@ module ScraperUtils
   class DataQualityMonitor
     # Get the statistics for all authorities
     # @return [Hash, nil] Hash of statistics per authority or nil if none started
-
-
+    class << self
+      attr_reader :stats
     end

     # Notes the start of processing an authority and clears any previous stats
@@ -16,28 +16,39 @@ module ScraperUtils
     # @return [void]
     def self.start_authority(authority_label)
       @stats ||= {}
-      @authority_label =
-      @stats[@authority_label] = { saved: 0, unprocessed: 0}
+      @stats[authority_label] = { saved: 0, unprocessed: 0 }
     end

-
-
+    # Extracts authority label and ensures stats are setup for record
+    def self.extract_authority(record)
+      authority_label = (record&.key?("authority_label") ? record["authority_label"] : "").to_sym
+      @stats ||= {}
+      @stats[authority_label] ||= { saved: 0, unprocessed: 0 }
+      authority_label
+    end
+
+    def self.threshold(authority_label)
+      5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
     end

     # Logs an unprocessable record and raises an exception if error threshold is exceeded
     # The threshold is 5 + 10% of saved records
     #
-    # @param
+    # @param exception [Exception] The exception that caused the record to be unprocessable
     # @param record [Hash, nil] The record that couldn't be processed
     # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
     # @return [void]
-    def self.log_unprocessable_record(
-
-      @stats[
-      ScraperUtils::FiberScheduler.log "Erroneous record #{
-
-
-
+    def self.log_unprocessable_record(exception, record)
+      authority_label = extract_authority(record)
+      @stats[authority_label][:unprocessed] += 1
+      ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+        'address', nil
+      ) || record.inspect}: #{exception}"
+      return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
+
+      raise ScraperUtils::UnprocessableSite,
+            "Too many unprocessable_records for #{authority_label}: " \
+            "#{@stats[authority_label].inspect} - aborting processing of site!"
     end

     # Logs a successfully saved record
@@ -45,9 +56,9 @@ module ScraperUtils
     # @param record [Hash] The record that was saved
     # @return [void]
     def self.log_saved_record(record)
-
-      @stats[
-      ScraperUtils::FiberScheduler.log "Saving record #{
+      authority_label = extract_authority(record)
+      @stats[authority_label][:saved] += 1
+      ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
     end
   end
 end
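For orientation, this is roughly how the monitor hooks are driven from a scraper's save loop (a sketch only; the record hash shape and the rescue granularity are assumptions, not taken from the gem's docs):

    require "scraper_utils"

    record = { "authority_label" => "example_authority", "address" => "1 Example St" }
    begin
      # ... validate and save the record ...
      ScraperUtils::DataQualityMonitor.log_saved_record(record)
    rescue StandardError => e
      # Counts the failure and raises ScraperUtils::UnprocessableSite once
      # unprocessed records exceed roughly 5 + 10% of the saved count
      ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
    end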
data/lib/scraper_utils/date_range_utils.rb (new file)

@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  class DateRangeUtils
+    MERGE_ADJACENT_RANGES = true
+    PERIODS = [2, 3, 5, 8].freeze
+
+    class << self
+      # @return [Integer] Default number of days to cover
+      attr_accessor :default_days
+
+      # @return [Integer] Default days to always include in ranges
+      attr_accessor :default_everytime
+
+      # @return [Integer, nil] Default max days between any one date being in a range
+      attr_accessor :default_max_period
+
+      # Configure default settings for all DateRangeUtils instances
+      # @yield [self] Yields self for configuration
+      # @example
+      #   AgentConfig.configure do |config|
+      #     config.default_everytime = 3
+      #     config.default_days = 35
+      #     config.default_max_period = 5
+      #   end
+      # @return [void]
+      def configure
+        yield self if block_given?
+      end
+
+      # Reset all configuration options to their default values
+      # @return [void]
+      def reset_defaults!
+        @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
+        @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
+        @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
+      end
+    end
+
+    # Set defaults on load
+    reset_defaults!
+
+    attr_reader :max_period_used
+    attr_reader :extended_max_period
+
+    # Generates one or more date ranges to check the most recent daily through to checking each max_period
+    # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
+    # @param days [Integer, nil] create ranges that cover the last `days` dates
+    # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
+    # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+    # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
+    # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
+    #
+    # Uses a Fibonacci sequence to create a natural progression of check frequencies.
+    # Newer data is checked more frequently, with periods between checks growing
+    # according to the Fibonacci sequence (2, 3, 5, 8, 13...) until reaching max_period.
+    # This creates an efficient schedule that mimics natural information decay patterns.
+    def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
+      _calculate_date_ranges(
+        Integer(days || self.class.default_days),
+        Integer(everytime || self.class.default_everytime),
+        Integer(max_period || self.class.default_max_period),
+        today || Time.now(in: '+09:30').to_date
+      )
+    end
+
+    private
+
+    def _calculate_date_ranges(days, everytime, max_period, today)
+      @max_period_used = 1
+      to_date = today
+      valid_periods = PERIODS.select { |p| p <= max_period }
+      if !max_period.positive? || !days.positive?
+        return []
+      elsif valid_periods.empty? || everytime >= days
+        # cover everything everytime
+        return [[today + 1 - days, today, "everything"]]
+      end
+
+      max_period = valid_periods.max
+
+      run_number = today.to_date.jd
+      ranges = []
+      if everytime.positive?
+        ranges << [to_date + 1 - everytime, to_date, "everytime"]
+        days -= everytime
+        to_date -= everytime
+      end
+
+      periods = valid_periods.dup
+      loop do
+        period = periods.shift
+        break if period.nil? || period >= max_period || !days.positive?
+
+        if DebugUtils.trace?
+          FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
+        end
+        period.times do |index|
+          break unless days.positive?
+
+          this_period = [days, period].min
+          break if this_period <= 0
+
+          earliest_from = to_date - days
+          # we are working from the oldest back towards today
+          if run_number % period == index
+            from = to_date - index - (this_period - 1)
+            from = earliest_from if from < earliest_from
+            to = [today, to_date - index].min
+            break if from > to
+
+            @max_period_used = [this_period, @max_period_used].max
+            if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+              # extend adjacent range
+              ranges.last[0] = [from, ranges.last[0]].min
+              ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
+            else
+              to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+              ranges << [from, to, "#{period}\##{index}"]
+            end
+          end
+          days -= this_period
+          to_date -= this_period
+        end
+      end
+      # remainder of range at max_period, whatever that is
+      if days.positive? && ScraperUtils::DebugUtils.trace?
+        FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
+      end
+      index = -1
+      while days.positive?
+        index += 1
+        this_period = [days, max_period].min
+        break if this_period <= 0
+
+        earliest_from = to_date - days
+        if (run_number % max_period) == (index % max_period)
+          from = to_date - index - (this_period - 1)
+          from = earliest_from if from < earliest_from
+          to = to_date - index
+          break if from > to
+
+          @max_period_used = [this_period, @max_period_used].max
+          if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+            # extend adjacent range
+            ranges.last[0] = [from, ranges.last[0]].min
+            ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
+          else
+            to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+            ranges << [from, to, "#{this_period}\##{index}"]
+          end
+        end
+        days -= this_period
+        to_date -= this_period
+      end
+      ranges.reverse
+    end
+  end
+end
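As a usage sketch (argument values are illustrative; `calculate_date_ranges` is defined as an instance method, so the utility is instantiated first):

    require "scraper_utils"

    ranges = ScraperUtils::DateRangeUtils.new.calculate_date_ranges(
      days: 30, everytime: 2, max_period: 5
    )
    ranges.each do |from_date, to_date, comment|
      # query the site for applications lodged in each returned range
      puts "Searching #{from_date} .. #{to_date} (#{comment})"
    end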
data/lib/scraper_utils/debug_utils.rb

@@ -5,6 +5,50 @@ require "json"
 module ScraperUtils
   # Utilities for debugging web scraping processes
   module DebugUtils
+    DEBUG_ENV_VAR = "DEBUG"
+    MORPH_DEBUG_ENV_VAR = "MORPH_DEBUG"
+
+    # Debug level constants
+    DISABLED_LEVEL = 0
+    BASIC_LEVEL = 1
+    VERBOSE_LEVEL = 2
+    TRACE_LEVEL = 3
+
+    # Get current debug level (0 = disabled, 1 = basic, 2 = verbose, 3 = trace)
+    # Checks DEBUG and MORPH_DEBUG env variables
+    # @return [Integer] Debug level
+    def self.debug_level
+      debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
+      debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
+    end
+
+    # Check if debug is enabled at specified level or higher
+    #
+    # @param level [Integer] Minimum debug level to check for
+    # @return [Boolean] true if debugging at specified level is enabled
+    def self.debug?(level = BASIC_LEVEL)
+      debug_level >= level
+    end
+
+    # Check if basic debug output or higher is enabled
+    # @return [Boolean] true if debugging is enabled
+    def self.basic?
+      debug?(BASIC_LEVEL)
+    end
+
+    # Check if verbose debug output or higher is enabled
+    # @return [Boolean] true if verbose debugging is enabled
+    def self.verbose?
+      debug?(VERBOSE_LEVEL)
+    end
+
+    # Check if debug tracing or higher is enabled
+    # @return [Boolean] true if debugging is enabled at trace level
+    def self.trace?
+      debug?(TRACE_LEVEL)
+    end
+
+
     # Logs details of an HTTP request when debug mode is enabled
     #
     # @param method [String] HTTP method (GET, POST, etc.)
@@ -14,13 +58,14 @@ module ScraperUtils
     # @param body [Hash, nil] Optional request body
     # @return [void]
     def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
-      return unless
+      return unless basic?

       puts
-
+      FiberScheduler.log "🔍 #{method.upcase} #{url}"
       puts "Parameters:", JSON.pretty_generate(parameters) if parameters
       puts "Headers:", JSON.pretty_generate(headers) if headers
       puts "Body:", JSON.pretty_generate(body) if body
+      $stdout.flush
     end

     # Logs details of a web page when debug mode is enabled
@@ -29,10 +74,10 @@ module ScraperUtils
     # @param message [String] Context or description for the debug output
     # @return [void]
     def self.debug_page(page, message)
-      return unless
+      return unless trace?

       puts
-
+      FiberScheduler.log "🔍 DEBUG: #{message}"
       puts "Current URL: #{page.uri}"
       puts "Page title: #{page.at('title').text.strip}" if page.at("title")
       puts "",
@@ -40,6 +85,7 @@ module ScraperUtils
            "-" * 40,
            page.body,
            "-" * 40
+      $stdout.flush
     end

     # Logs details about a specific page selector when debug mode is enabled
@@ -49,10 +95,10 @@ module ScraperUtils
     # @param message [String] Context or description for the debug output
     # @return [void]
     def self.debug_selector(page, selector, message)
-      return unless
+      return unless trace?

       puts
-
+      FiberScheduler.log "🔍 DEBUG: #{message}"
       puts "Looking for selector: #{selector}"
       element = page.at(selector)
       if element
@@ -64,6 +110,7 @@ module ScraperUtils
         puts page.body
         puts "-" * 40
       end
+      $stdout.flush
     end
   end
 end
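In practice the level helpers map directly onto the DEBUG / MORPH_DEBUG environment variables; a quick sketch of the mapping:

    # 0 = disabled, 1 = basic, 2 = verbose, 3 = trace
    ENV["DEBUG"] = "2"
    ScraperUtils::DebugUtils.debug_level # => 2
    ScraperUtils::DebugUtils.basic?      # => true
    ScraperUtils::DebugUtils.verbose?    # => true
    ScraperUtils::DebugUtils.trace?      # => false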
data/lib/scraper_utils/fiber_scheduler.rb

@@ -1,6 +1,6 @@
 # frozen_string_literal: true

-require
+require "fiber"

 module ScraperUtils
   # A utility module for interleaving multiple scraping operations
@@ -24,6 +24,7 @@ module ScraperUtils
     # @return [String, nil] the authority name or nil if not in a fiber
     def self.current_authority
       return nil unless in_fiber?
+
       Fiber.current.instance_variable_get(:@authority)
     end

@@ -33,11 +34,13 @@ module ScraperUtils
     # @return [void]
     def self.log(message)
       authority = current_authority
+      $stderr.flush
       if authority
         puts "[#{authority}] #{message}"
       else
         puts message
       end
+      $stdout.flush
     end

     # Returns a hash of exceptions encountered during processing, indexed by authority
@@ -47,7 +50,7 @@ module ScraperUtils
       @exceptions ||= {}
     end

-    # Returns a hash of
+    # Returns a hash of the yielded / block values
     #
     # @return [Hash{Symbol => Any}] values by authority
     def self.values
@@ -76,7 +79,7 @@ module ScraperUtils
       @enabled = false
     end

-    # Resets the scheduler state, and disables
+    # Resets the scheduler state, and disables. Use before retrying failed authorities.
     #
     # @return [void]
     def self.reset!
@@ -94,21 +97,19 @@ module ScraperUtils
     #
     # @param authority [String] the name of the authority being processed
     # @yield to the block containing the scraping operation to be run in the fiber
-    # @return [Fiber]
+    # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
     def self.register_operation(authority, &block)
       # Automatically enable fiber scheduling when operations are registered
       enable!

       fiber = Fiber.new do
-
-
-
-
-
-
-
-        registry.delete(Fiber.current)
-      end
+        values[authority] = block.call
+      rescue StandardError => e
+        # Store exception against the authority
+        exceptions[authority] = e
+      ensure
+        # Remove itself when done regardless of success/failure
+        registry.delete(Fiber.current)
       end

       # Start fibres in registration order
@@ -117,9 +118,11 @@ module ScraperUtils
       fiber.instance_variable_set(:@authority, authority)
       registry << fiber

-
-
-
+      if DebugUtils.basic?
+        FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
+      end
+      # Process immediately when testing
+      fiber.resume if ScraperUtils::RandomizeUtils.sequential?
       fiber
     end

@@ -130,19 +133,28 @@ module ScraperUtils
       count = registry.size
       while (fiber = find_earliest_fiber)
         if fiber.alive?
-          authority =
+          authority = begin
+            fiber.instance_variable_get(:@authority)
+          rescue StandardError
+            nil
+          end
           @resume_count ||= 0
           @resume_count += 1
           values[authority] = fiber.resume
         else
-
+          FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
           registry.delete(fiber)
         end
       end

-
-
-
+      if @time_slept&.positive? && @delay_requested&.positive?
+        percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
+      end
+      puts
+      FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
+                         "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
+                         "#{@delay_requested&.round(1)} seconds requested."
+      puts

       exceptions
     end
@@ -162,10 +174,20 @@ module ScraperUtils
       if !enabled? || !current_fiber || registry.size <= 1
         @time_slept ||= 0.0
         @time_slept += seconds
+        log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
         return sleep(seconds)
       end

-
+      now = Time.now
+      resume_at = now + seconds
+
+      # Don't resume at the same time as someone else,
+      # FIFO queue if seconds == 0
+      @other_resumes ||= []
+      @other_resumes = @other_resumes.delete_if { |t| t < now }
+      while @other_resumes.include?(resume_at) && resume_at
+        resume_at += 0.01
+      end

       # Used to compare when other fibers need to be resumed
       current_fiber.instance_variable_set(:@resume_at, resume_at)
@@ -178,6 +200,7 @@ module ScraperUtils
       if remaining.positive?
         @time_slept ||= 0.0
         @time_slept += remaining
+        log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
         sleep(remaining)
       end || 0
     end
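Taken together, registration and the delay call interleave authorities roughly like this (a sketch: the authority names and scrape body are placeholders, and `run_all` as the name of the driving method is an assumption, since the method header is not visible in these hunks — only its body, which returns the exceptions hash):

    require "scraper_utils"

    %i[authority_a authority_b].each do |authority|
      ScraperUtils::FiberScheduler.register_operation(authority) do
        # scrape this authority, calling delay between requests so other fibers get a turn
        ScraperUtils::FiberScheduler.delay(2)
        # ... fetch and save records ...
      end
    end

    exceptions = ScraperUtils::FiberScheduler.run_all
    exceptions.each { |authority, e| warn "#{authority}: #{e.message}" }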
data/lib/scraper_utils/log_utils.rb

@@ -13,8 +13,8 @@
     # @param start_time [Time] When this scraping attempt was started
     # @param attempt [Integer] 1 for first run, 2 for first retry, 3 for last retry (without proxy)
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param exceptions [Hash
-    # DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    # `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
     def self.log_scraping_run(start_time, attempt, authorities, exceptions)
       raise ArgumentError, "Invalid start time" unless start_time.is_a?(Time)
@@ -75,39 +75,39 @@ module ScraperUtils

     # Report on the results
     # @param authorities [Array<Symbol>] List of authorities attempted to scrape
-    # @param exceptions [Hash
-    # DataQualityMonitor.stats is checked for :saved and :unprocessed entries
+    # @param exceptions [Hash{Symbol => Exception}] Any exceptions that occurred during scraping
+    # `DataQualityMonitor.stats` is checked for :saved and :unprocessed entries
     # @return [void]
     def self.report_on_results(authorities, exceptions)
-
+      if ENV["MORPH_EXPECT_BAD"]
+        expect_bad = ENV["MORPH_EXPECT_BAD"].split(",").map(&:strip).map(&:to_sym)
+      end
+      expect_bad ||= []

-
+      $stderr.flush
+      puts "MORPH_EXPECT_BAD=#{ENV.fetch('MORPH_EXPECT_BAD', nil)}"

       # Print summary table
       puts "\nScraping Summary:"
       summary_format = "%-20s %6s %6s %s"

-      puts summary_format
-      puts summary_format
+      puts format(summary_format, 'Authority', 'OK', 'Bad', 'Exception')
+      puts format(summary_format, "-" * 20, "-" * 6, "-" * 6, "-" * 50)

       authorities.each do |authority|
         stats = ScraperUtils::DataQualityMonitor.stats&.fetch(authority, {}) || {}
-
+
         ok_records = stats[:saved] || 0
         bad_records = stats[:unprocessed] || 0
-
+
         expect_bad_prefix = expect_bad.include?(authority) ? "[EXPECT BAD] " : ""
         exception_msg = if exceptions[authority]
                           "#{exceptions[authority].class} - #{exceptions[authority].message}"
                         else
                           "-"
                         end
-        puts summary_format
-
-          ok_records,
-          bad_records,
-          "#{expect_bad_prefix}#{exception_msg}".slice(0, 70)
-        ]
+        puts format(summary_format, authority.to_s, ok_records, bad_records,
+                    "#{expect_bad_prefix}#{exception_msg}".slice(0, 70))
       end
       puts

@@ -120,7 +120,8 @@ module ScraperUtils
       end

       if unexpected_working.any?
-        errors <<
+        errors <<
+          "WARNING: Remove #{unexpected_working.join(',')} from MORPH_EXPECT_BAD as it now works!"
       end

       # Check for authorities with unexpected errors
@@ -137,6 +138,7 @@ module ScraperUtils
         end
       end

+      $stdout.flush
       if errors.any?
         errors << "See earlier output for details"
         raise errors.join("\n")
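For context, the expected-bad handling is driven by the MORPH_EXPECT_BAD environment variable; a sketch of a call site (the authority names are illustrative, and the exceptions hash is whatever the scraper already collected, e.g. from the fiber scheduler):

    # Authorities known to be broken are listed so the run can still pass
    ENV["MORPH_EXPECT_BAD"] = "broken_shire,flaky_city"

    authorities = %i[example_city broken_shire flaky_city]
    ScraperUtils::LogUtils.report_on_results(authorities, exceptions)
    # Prints the summary table, then raises if an unlisted authority failed,
    # or if a listed authority unexpectedly worked (so the list can be trimmed)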