scraper_utils 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -0
- data/.rubocop.yml +4 -0
- data/CHANGELOG.md +22 -1
- data/Gemfile +5 -2
- data/README.md +128 -149
- data/docs/example_scrape_with_fibers.rb +31 -0
- data/docs/example_scraper.rb +93 -0
- data/lib/scraper_utils/adaptive_delay.rb +55 -50
- data/lib/scraper_utils/cycle_utils.rb +25 -0
- data/lib/scraper_utils/data_quality_monitor.rb +28 -17
- data/lib/scraper_utils/date_range_utils.rb +159 -0
- data/lib/scraper_utils/db_utils.rb +0 -2
- data/lib/scraper_utils/debug_utils.rb +53 -6
- data/lib/scraper_utils/fiber_scheduler.rb +45 -22
- data/lib/scraper_utils/log_utils.rb +19 -17
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +67 -46
- data/lib/scraper_utils/mechanize_utils.rb +12 -4
- data/lib/scraper_utils/randomize_utils.rb +34 -0
- data/lib/scraper_utils/robots_checker.rb +9 -4
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +3 -10
- metadata +7 -2
data/lib/scraper_utils/adaptive_delay.rb:

```diff
@@ -2,64 +2,69 @@
 
 require "uri"
 
-
-#
-#
-
-
-
+module ScraperUtils
+  # Adapts delays between requests based on server response times.
+  # Target delay is proportional to response time based on max_load setting.
+  # Uses an exponential moving average to smooth variations in response times.
+  class AdaptiveDelay
+    DEFAULT_MIN_DELAY = 0.0
+    DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
 
-
+    attr_reader :min_delay, :max_delay, :max_load
 
-
-
-
-
-
-
-
-
-
-
-
-
+    # Creates a new adaptive delay calculator
+    #
+    # @param min_delay [Float] Minimum delay between requests in seconds
+    # @param max_delay [Float] Maximum delay between requests in seconds
+    # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+    #   Lower values are more conservative (e.g., 20% = 4x response time delay)
+    def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+      @delays = {} # domain -> last delay used
+      @min_delay = min_delay.to_f
+      @max_delay = max_delay.to_f
+      @max_load = max_load.to_f.clamp(1.0, 99.0)
+      @response_multiplier = (100.0 - @max_load) / @max_load
 
-
-
-      ScraperUtils::FiberScheduler.log
+      return unless DebugUtils.basic?
+
+      ScraperUtils::FiberScheduler.log(
+        "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+        "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+      )
     end
-    end
 
-
-
-
-
-
-
+    # @param uri [URI::Generic, String] The URL to extract the domain from
+    # @return [String] The domain in the format "scheme://host"
+    def domain(uri)
+      uri = URI(uri) unless uri.is_a?(URI)
+      "#{uri.scheme}://#{uri.host}".downcase
+    end
 
-
-
-
-
-
+    # @param uri [URI::Generic, String] URL to get delay for
+    # @return [Float] Current delay for the domain, or min_delay if no delay set
+    def delay(uri)
+      @delays[domain(uri)] || @min_delay
+    end
 
-
-
-
-
-
-
-
-
-
+    # @param uri [URI::Generic, String] URL the response came from
+    # @param response_time [Float] Time in seconds the server took to respond
+    # @return [Float] The calculated delay to use with the next request
+    def next_delay(uri, response_time)
+      uris_domain = domain(uri)
+      target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+      current_delay = @delays[uris_domain] || target_delay
+      delay = ((9.0 * current_delay) + target_delay) / 10.0
+      delay = delay.clamp(@min_delay, @max_delay)
 
-
-
-
-
-
+      if DebugUtils.basic?
+        ScraperUtils::FiberScheduler.log(
+          "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+          "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+        )
+      end
 
-
-
+      @delays[uris_domain] = delay
+      delay
+    end
   end
 end
```
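For orientation, here is a rough usage sketch based only on the constructor and methods visible in this hunk; the URL, loop and response time are illustrative, not taken from the gem's docs. With `max_load: 20.0` the response multiplier is (100 - 20) / 20 = 4.0, so a steady 0.5 second response time pulls the per-domain delay towards roughly 2 seconds via the 9:1 moving average.

```ruby
require "scraper_utils"

delay_calc = ScraperUtils::AdaptiveDelay.new(min_delay: 0.5, max_load: 20.0)
url = "https://example.com/planning/applications" # hypothetical target

5.times do
  sleep(delay_calc.delay(url))              # current delay for this domain (min_delay at first)
  response_time = 0.5                       # would normally be measured around the HTTP request
  delay_calc.next_delay(url, response_time) # updates the per-domain moving average
end

delay_calc.delay(url) # ~2.0 (4.0 x 0.5s), clamped between min_delay and max_delay
```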
data/lib/scraper_utils/cycle_utils.rb (new file):

```diff
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  # Provides utilities for cycling through a range of options day by day
+  module CycleUtils
+    # Returns position in cycle from zero onwards
+    # @param cycle [Integer] Cycle size (2 onwards)
+    # @param date [Date, nil] Optional date to use instead of today
+    # @return [Integer] position in cycle progressing from zero to cycle-1 and then repeating day by day
+    # Can override using CYCLE_POSITION ENV variable
+    def self.position(cycle, date: nil)
+      day = ENV.fetch('CYCLE_POSITION', (date || Date.today).jd).to_i
+      day % cycle
+    end
+
+    # Returns one value per day, cycling through all possible values in order
+    # @param values [Array] Values to cycle through
+    # @param date [Date, nil] Optional date to use instead of today to calculate position
+    # @return value from array
+    # Can override using CYCLE_POSITION ENV variable
+    def self.pick(values, date: nil)
+      values[position(values.size, date: date)]
+    end
+  end
+end
```
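A minimal sketch of the new CycleUtils API; the example values and dates below are invented for illustration.

```ruby
require "date"
require "scraper_utils"

search_periods = [2, 7, 30] # e.g. "days ago" windows to rotate through, one per day

# pick returns the same value for a whole day, then the next value the following day
puts ScraperUtils::CycleUtils.pick(search_periods)
puts ScraperUtils::CycleUtils.pick(search_periods, date: Date.new(2025, 3, 3))

# position is the underlying Julian-day-number modulo, so consecutive dates step through the cycle
puts ScraperUtils::CycleUtils.position(3, date: Date.new(2025, 3, 3)) # => 0, 1 or 2
# Setting the CYCLE_POSITION env variable overrides the date-based position
```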
data/lib/scraper_utils/data_quality_monitor.rb:

```diff
@@ -6,8 +6,8 @@ module ScraperUtils
   class DataQualityMonitor
     # Get the statistics for all authorities
     # @return [Hash, nil] Hash of statistics per authority or nil if none started
-
-
+    class << self
+      attr_reader :stats
     end
 
     # Notes the start of processing an authority and clears any previous stats
@@ -16,28 +16,39 @@ module ScraperUtils
     # @return [void]
     def self.start_authority(authority_label)
       @stats ||= {}
-      @authority_label =
-      @stats[@authority_label] = { saved: 0, unprocessed: 0}
+      @stats[authority_label] = { saved: 0, unprocessed: 0 }
     end
 
-
-
+    # Extracts authority label and ensures stats are setup for record
+    def self.extract_authority(record)
+      authority_label = (record&.key?("authority_label") ? record["authority_label"] : "").to_sym
+      @stats ||= {}
+      @stats[authority_label] ||= { saved: 0, unprocessed: 0 }
+      authority_label
+    end
+
+    def self.threshold(authority_label)
+      5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
     end
 
     # Logs an unprocessable record and raises an exception if error threshold is exceeded
     # The threshold is 5 + 10% of saved records
     #
-    # @param
+    # @param exception [Exception] The exception that caused the record to be unprocessable
     # @param record [Hash, nil] The record that couldn't be processed
     # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
     # @return [void]
-    def self.log_unprocessable_record(
-
-      @stats[
-      ScraperUtils::FiberScheduler.log "Erroneous record #{
-
-
-
+    def self.log_unprocessable_record(exception, record)
+      authority_label = extract_authority(record)
+      @stats[authority_label][:unprocessed] += 1
+      ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+        'address', nil
+      ) || record.inspect}: #{exception}"
+      return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
+
+      raise ScraperUtils::UnprocessableSite,
+            "Too many unprocessable_records for #{authority_label}: " \
+            "#{@stats[authority_label].inspect} - aborting processing of site!"
     end
 
     # Logs a successfully saved record
@@ -45,9 +56,9 @@ module ScraperUtils
     # @param record [Hash] The record that was saved
     # @return [void]
     def self.log_saved_record(record)
-
-      @stats[
-      ScraperUtils::FiberScheduler.log "Saving record #{
+      authority_label = extract_authority(record)
+      @stats[authority_label][:saved] += 1
+      ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
     end
   end
 end
```
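A sketch of how the reworked monitor might be driven from a scrape loop; the record fields and the rescue structure are assumptions for illustration, not taken from the gem's docs.

```ruby
records = [{ "authority_label" => "example_council", "address" => "1 Example St" }]

records.each do |record|
  begin
    # ... validate / save the record here ...
    ScraperUtils::DataQualityMonitor.log_saved_record(record)
  rescue StandardError => e
    # Raises ScraperUtils::UnprocessableSite once unprocessed exceeds 5.01 + 10% of saved
    ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
  end
end

ScraperUtils::DataQualityMonitor.stats
# => { example_council: { saved: 1, unprocessed: 0 } }
```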
data/lib/scraper_utils/date_range_utils.rb (new file):

```diff
@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  class DateRangeUtils
+    MERGE_ADJACENT_RANGES = true
+    PERIODS = [2, 3, 5, 8].freeze
+
+    class << self
+      # @return [Integer] Default number of days to cover
+      attr_accessor :default_days
+
+      # @return [Integer] Default days to always include in ranges
+      attr_accessor :default_everytime
+
+      # @return [Integer, nil] Default max days between any one date being in a range
+      attr_accessor :default_max_period
+
+      # Configure default settings for all DateRangeUtils instances
+      # @yield [self] Yields self for configuration
+      # @example
+      #   AgentConfig.configure do |config|
+      #     config.default_everytime = 3
+      #     config.default_days = 35
+      #     config.default_max_period = 5
+      #   end
+      # @return [void]
+      def configure
+        yield self if block_given?
+      end
+
+      # Reset all configuration options to their default values
+      # @return [void]
+      def reset_defaults!
+        @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
+        @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
+        @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
+      end
+    end
+
+    # Set defaults on load
+    reset_defaults!
+
+    attr_reader :max_period_used
+    attr_reader :extended_max_period
+
+    # Generates one or more date ranges to check the most recent daily through to checking each max_period
+    # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
+    # @param days [Integer, nil] create ranges that cover the last `days` dates
+    # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
+    # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+    # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
+    # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
+    #
+    # Uses a Fibonacci sequence to create a natural progression of check frequencies.
+    # Newer data is checked more frequently, with periods between checks growing
+    # according to the Fibonacci sequence (2, 3, 5, 8, 13...) until reaching max_period.
+    # This creates an efficient schedule that mimics natural information decay patterns.
+    def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
+      _calculate_date_ranges(
+        Integer(days || self.class.default_days),
+        Integer(everytime || self.class.default_everytime),
+        Integer(max_period || self.class.default_max_period),
+        today || Time.now(in: '+09:30').to_date
+      )
+    end
+
+    private
+
+    def _calculate_date_ranges(days, everytime, max_period, today)
+      @max_period_used = 1
+      to_date = today
+      valid_periods = PERIODS.select { |p| p <= max_period }
+      if !max_period.positive? || !days.positive?
+        return []
+      elsif valid_periods.empty? || everytime >= days
+        # cover everything everytime
+        return [[today + 1 - days, today, "everything"]]
+      end
+
+      max_period = valid_periods.max
+
+      run_number = today.to_date.jd
+      ranges = []
+      if everytime.positive?
+        ranges << [to_date + 1 - everytime, to_date, "everytime"]
+        days -= everytime
+        to_date -= everytime
+      end
+
+      periods = valid_periods.dup
+      loop do
+        period = periods.shift
+        break if period.nil? || period >= max_period || !days.positive?
+
+        if DebugUtils.trace?
+          FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
+        end
+        period.times do |index|
+          break unless days.positive?
+
+          this_period = [days, period].min
+          break if this_period <= 0
+
+          earliest_from = to_date - days
+          # we are working from the oldest back towards today
+          if run_number % period == index
+            from = to_date - index - (this_period - 1)
+            from = earliest_from if from < earliest_from
+            to = [today, to_date - index].min
+            break if from > to
+
+            @max_period_used = [this_period, @max_period_used].max
+            if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+              # extend adjacent range
+              ranges.last[0] = [from, ranges.last[0]].min
+              ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
+            else
+              to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+              ranges << [from, to, "#{period}\##{index}"]
+            end
+          end
+          days -= this_period
+          to_date -= this_period
+        end
+      end
+      # remainder of range at max_period, whatever that is
+      if days.positive? && ScraperUtils::DebugUtils.trace?
+        FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
+      end
+      index = -1
+      while days.positive?
+        index += 1
+        this_period = [days, max_period].min
+        break if this_period <= 0
+
+        earliest_from = to_date - days
+        if (run_number % max_period) == (index % max_period)
+          from = to_date - index - (this_period - 1)
+          from = earliest_from if from < earliest_from
+          to = to_date - index
+          break if from > to
+
+          @max_period_used = [this_period, @max_period_used].max
+          if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+            # extend adjacent range
+            ranges.last[0] = [from, ranges.last[0]].min
+            ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
+          else
+            to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+            ranges << [from, to, "#{this_period}\##{index}"]
+          end
+        end
+        days -= this_period
+        to_date -= this_period
+      end
+      ranges.reverse
+    end
+  end
+end
```
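A sketch of calling the new class; the argument values are arbitrary and the output is only indicative of the shape of the result.

```ruby
utils = ScraperUtils::DateRangeUtils.new
ranges = utils.calculate_date_ranges(days: 30, everytime: 4, max_period: 8)

# Array of [from_date, to_date, comment] covering the last `days` dates: the newest
# `everytime` days are always present, older dates are revisited on the 2/3/5/8-day
# Fibonacci cadence, and adjacent ranges are merged.
ranges.each do |from_date, to_date, comment|
  puts "search #{from_date} to #{to_date} (#{comment})"
end
```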
@@ -5,6 +5,50 @@ require "json"
|
|
5
5
|
module ScraperUtils
|
6
6
|
# Utilities for debugging web scraping processes
|
7
7
|
module DebugUtils
|
8
|
+
DEBUG_ENV_VAR = "DEBUG"
|
9
|
+
MORPH_DEBUG_ENV_VAR = "MORPH_DEBUG"
|
10
|
+
|
11
|
+
# Debug level constants
|
12
|
+
DISABLED_LEVEL = 0
|
13
|
+
BASIC_LEVEL = 1
|
14
|
+
VERBOSE_LEVEL = 2
|
15
|
+
TRACE_LEVEL = 3
|
16
|
+
|
17
|
+
# Get current debug level (0 = disabled, 1 = basic, 2 = verbose, 3 = trace)
|
18
|
+
# Checks DEBUG and MORPH_DEBUG env variables
|
19
|
+
# @return [Integer] Debug level
|
20
|
+
def self.debug_level
|
21
|
+
debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
|
22
|
+
debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
|
23
|
+
end
|
24
|
+
|
25
|
+
# Check if debug is enabled at specified level or higher
|
26
|
+
#
|
27
|
+
# @param level [Integer] Minimum debug level to check for
|
28
|
+
# @return [Boolean] true if debugging at specified level is enabled
|
29
|
+
def self.debug?(level = BASIC_LEVEL)
|
30
|
+
debug_level >= level
|
31
|
+
end
|
32
|
+
|
33
|
+
# Check if basic debug output or higher is enabled
|
34
|
+
# @return [Boolean] true if debugging is enabled
|
35
|
+
def self.basic?
|
36
|
+
debug?(BASIC_LEVEL)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Check if verbose debug output or higher is enabled
|
40
|
+
# @return [Boolean] true if verbose debugging is enabled
|
41
|
+
def self.verbose?
|
42
|
+
debug?(VERBOSE_LEVEL)
|
43
|
+
end
|
44
|
+
|
45
|
+
# Check if debug tracing or higher is enabled
|
46
|
+
# @return [Boolean] true if debugging is enabled at trace level
|
47
|
+
def self.trace?
|
48
|
+
debug?(TRACE_LEVEL)
|
49
|
+
end
|
50
|
+
|
51
|
+
|
8
52
|
# Logs details of an HTTP request when debug mode is enabled
|
9
53
|
#
|
10
54
|
# @param method [String] HTTP method (GET, POST, etc.)
|
@@ -14,13 +58,14 @@ module ScraperUtils
|
|
14
58
|
# @param body [Hash, nil] Optional request body
|
15
59
|
# @return [void]
|
16
60
|
def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
|
17
|
-
return unless
|
61
|
+
return unless basic?
|
18
62
|
|
19
63
|
puts
|
20
|
-
|
64
|
+
FiberScheduler.log "🔍 #{method.upcase} #{url}"
|
21
65
|
puts "Parameters:", JSON.pretty_generate(parameters) if parameters
|
22
66
|
puts "Headers:", JSON.pretty_generate(headers) if headers
|
23
67
|
puts "Body:", JSON.pretty_generate(body) if body
|
68
|
+
$stdout.flush
|
24
69
|
end
|
25
70
|
|
26
71
|
# Logs details of a web page when debug mode is enabled
|
@@ -29,10 +74,10 @@ module ScraperUtils
|
|
29
74
|
# @param message [String] Context or description for the debug output
|
30
75
|
# @return [void]
|
31
76
|
def self.debug_page(page, message)
|
32
|
-
return unless
|
77
|
+
return unless trace?
|
33
78
|
|
34
79
|
puts
|
35
|
-
|
80
|
+
FiberScheduler.log "🔍 DEBUG: #{message}"
|
36
81
|
puts "Current URL: #{page.uri}"
|
37
82
|
puts "Page title: #{page.at('title').text.strip}" if page.at("title")
|
38
83
|
puts "",
|
@@ -40,6 +85,7 @@ module ScraperUtils
|
|
40
85
|
"-" * 40,
|
41
86
|
page.body,
|
42
87
|
"-" * 40
|
88
|
+
$stdout.flush
|
43
89
|
end
|
44
90
|
|
45
91
|
# Logs details about a specific page selector when debug mode is enabled
|
@@ -49,10 +95,10 @@ module ScraperUtils
|
|
49
95
|
# @param message [String] Context or description for the debug output
|
50
96
|
# @return [void]
|
51
97
|
def self.debug_selector(page, selector, message)
|
52
|
-
return unless
|
98
|
+
return unless trace?
|
53
99
|
|
54
100
|
puts
|
55
|
-
|
101
|
+
FiberScheduler.log "🔍 DEBUG: #{message}"
|
56
102
|
puts "Looking for selector: #{selector}"
|
57
103
|
element = page.at(selector)
|
58
104
|
if element
|
@@ -64,6 +110,7 @@ module ScraperUtils
|
|
64
110
|
puts page.body
|
65
111
|
puts "-" * 40
|
66
112
|
end
|
113
|
+
$stdout.flush
|
67
114
|
end
|
68
115
|
end
|
69
116
|
end
|
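The new level helpers are driven entirely by the DEBUG / MORPH_DEBUG env variables shown above; the rest of this sketch is illustrative.

```ruby
ENV["DEBUG"] = "2" # 0 = disabled, 1 = basic, 2 = verbose, 3 = trace (MORPH_DEBUG also works)

ScraperUtils::DebugUtils.debug_level # => 2
ScraperUtils::DebugUtils.basic?      # => true
ScraperUtils::DebugUtils.verbose?    # => true
ScraperUtils::DebugUtils.trace?      # => false, so debug_page / debug_selector stay quiet

# debug_request only logs when basic? is true
ScraperUtils::DebugUtils.debug_request("GET", "https://example.com/search", parameters: { page: 1 })
```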
data/lib/scraper_utils/fiber_scheduler.rb:

```diff
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require
+require "fiber"
 
 module ScraperUtils
   # A utility module for interleaving multiple scraping operations
@@ -24,6 +24,7 @@ module ScraperUtils
     # @return [String, nil] the authority name or nil if not in a fiber
     def self.current_authority
       return nil unless in_fiber?
+
       Fiber.current.instance_variable_get(:@authority)
     end
 
@@ -33,11 +34,13 @@ module ScraperUtils
     # @return [void]
     def self.log(message)
       authority = current_authority
+      $stderr.flush
       if authority
         puts "[#{authority}] #{message}"
       else
         puts message
       end
+      $stdout.flush
     end
 
     # Returns a hash of exceptions encountered during processing, indexed by authority
@@ -47,7 +50,7 @@ module ScraperUtils
       @exceptions ||= {}
     end
 
-    # Returns a hash of
+    # Returns a hash of the yielded / block values
     #
     # @return [Hash{Symbol => Any}] values by authority
     def self.values
@@ -76,7 +79,7 @@ module ScraperUtils
       @enabled = false
     end
 
-    # Resets the scheduler state, and disables
+    # Resets the scheduler state, and disables. Use before retrying failed authorities.
     #
     # @return [void]
     def self.reset!
@@ -94,21 +97,19 @@ module ScraperUtils
     #
     # @param authority [String] the name of the authority being processed
     # @yield to the block containing the scraping operation to be run in the fiber
-    # @return [Fiber]
+    # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
     def self.register_operation(authority, &block)
       # Automatically enable fiber scheduling when operations are registered
       enable!
 
       fiber = Fiber.new do
-
-
-
-
-
-
-
-        registry.delete(Fiber.current)
-      end
+        values[authority] = block.call
+      rescue StandardError => e
+        # Store exception against the authority
+        exceptions[authority] = e
+      ensure
+        # Remove itself when done regardless of success/failure
+        registry.delete(Fiber.current)
       end
 
       # Start fibres in registration order
@@ -117,9 +118,11 @@ module ScraperUtils
       fiber.instance_variable_set(:@authority, authority)
       registry << fiber
 
-
-
-
+      if DebugUtils.basic?
+        FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
+      end
+      # Process immediately when testing
+      fiber.resume if ScraperUtils::RandomizeUtils.sequential?
       fiber
     end
 
@@ -130,19 +133,28 @@ module ScraperUtils
       count = registry.size
       while (fiber = find_earliest_fiber)
         if fiber.alive?
-          authority =
+          authority = begin
+            fiber.instance_variable_get(:@authority)
+          rescue StandardError
+            nil
+          end
           @resume_count ||= 0
           @resume_count += 1
           values[authority] = fiber.resume
         else
-
+          FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
           registry.delete(fiber)
         end
       end
 
-
-
-
+      if @time_slept&.positive? && @delay_requested&.positive?
+        percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
+      end
+      puts
+      FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
+                         "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
+                         "#{@delay_requested&.round(1)} seconds requested."
+      puts
 
       exceptions
     end
@@ -162,10 +174,20 @@ module ScraperUtils
       if !enabled? || !current_fiber || registry.size <= 1
         @time_slept ||= 0.0
         @time_slept += seconds
+        log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
         return sleep(seconds)
       end
 
-
+      now = Time.now
+      resume_at = now + seconds
+
+      # Don't resume at the same time as someone else,
+      # FIFO queue if seconds == 0
+      @other_resumes ||= []
+      @other_resumes = @other_resumes.delete_if { |t| t < now }
+      while @other_resumes.include?(resume_at) && resume_at
+        resume_at += 0.01
+      end
 
       # Used to compare when other fibers need to be resumed
       current_fiber.instance_variable_set(:@resume_at, resume_at)
@@ -178,6 +200,7 @@ module ScraperUtils
       if remaining.positive?
         @time_slept ||= 0.0
         @time_slept += remaining
+        log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
         sleep(remaining)
       end || 0
     end
```
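Finally, a sketch of how the interleaving is intended to be used (data/docs/example_scrape_with_fibers.rb in this release has a fuller version). The authority names and block bodies are placeholders, and the `run_operations` / `delay` method names are inferred from the log messages and hunk context above rather than shown directly in this diff.

```ruby
%w[authority_a authority_b].each do |authority|
  ScraperUtils::FiberScheduler.register_operation(authority) do
    # ... scrape one authority, calling ScraperUtils::FiberScheduler.delay(seconds)
    # between requests so other registered fibers get resumed while this one waits ...
    :done
  end
end

exceptions = ScraperUtils::FiberScheduler.run_operations # assumed name for the run loop above
ScraperUtils::FiberScheduler.values # => block return values keyed by authority
exceptions.each { |authority, e| puts "#{authority} failed: #{e}" }
```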