scraper_utils 0.2.0 → 0.4.0

@@ -2,64 +2,69 @@
 
  require "uri"
 
- # Adapts delays between requests based on server response times.
- # Target delay is proportional to response time based on max_load setting.
- # Uses an exponential moving average to smooth variations in response times.
- class AdaptiveDelay
- DEFAULT_MIN_DELAY = 0.0
- DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
+ module ScraperUtils
+ # Adapts delays between requests based on server response times.
+ # Target delay is proportional to response time based on max_load setting.
+ # Uses an exponential moving average to smooth variations in response times.
+ class AdaptiveDelay
+ DEFAULT_MIN_DELAY = 0.0
+ DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
 
- attr_reader :min_delay, :max_delay, :max_load
+ attr_reader :min_delay, :max_delay, :max_load
 
- # Creates a new adaptive delay calculator
- #
- # @param min_delay [Float] Minimum delay between requests in seconds
- # @param max_delay [Float] Maximum delay between requests in seconds
- # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
- # Lower values are more conservative (e.g., 20% = 4x response time delay)
- def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
- @delays = {} # domain -> last delay used
- @min_delay = min_delay.to_f
- @max_delay = max_delay.to_f
- @max_load = max_load.to_f.clamp(1.0, 99.0)
- @response_multiplier = (100.0 - @max_load) / @max_load
+ # Creates a new adaptive delay calculator
+ #
+ # @param min_delay [Float] Minimum delay between requests in seconds
+ # @param max_delay [Float] Maximum delay between requests in seconds
+ # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
+ # Lower values are more conservative (e.g., 20% = 4x response time delay)
+ def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
+ @delays = {} # domain -> last delay used
+ @min_delay = min_delay.to_f
+ @max_delay = max_delay.to_f
+ @max_load = max_load.to_f.clamp(1.0, 99.0)
+ @response_multiplier = (100.0 - @max_load) / @max_load
 
- if ENV["DEBUG"]
- ScraperUtils::FiberScheduler.log "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds"
- ScraperUtils::FiberScheduler.log "Using max_load of #{@max_load}% (response time multiplier: #{@response_multiplier.round(2)}x)"
+ return unless DebugUtils.basic?
+
+ ScraperUtils::FiberScheduler.log(
+ "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
+ "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
+ )
  end
- end
 
- # @param uri [URI::Generic, String] The URL to extract the domain from
- # @return [String] The domain in the format "scheme://host"
- def domain(uri)
- uri = URI(uri) unless uri.is_a?(URI)
- "#{uri.scheme}://#{uri.host}".downcase
- end
+ # @param uri [URI::Generic, String] The URL to extract the domain from
+ # @return [String] The domain in the format "scheme://host"
+ def domain(uri)
+ uri = URI(uri) unless uri.is_a?(URI)
+ "#{uri.scheme}://#{uri.host}".downcase
+ end
 
- # @param uri [URI::Generic, String] URL to get delay for
- # @return [Float] Current delay for the domain, or min_delay if no delay set
- def delay(uri)
- @delays[domain(uri)] || @min_delay
- end
+ # @param uri [URI::Generic, String] URL to get delay for
+ # @return [Float] Current delay for the domain, or min_delay if no delay set
+ def delay(uri)
+ @delays[domain(uri)] || @min_delay
+ end
 
- # @param uri [URI::Generic, String] URL the response came from
- # @param response_time [Float] Time in seconds the server took to respond
- # @return [Float] The calculated delay to use with the next request
- def next_delay(uri, response_time)
- uris_domain = domain(uri)
- target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
- current_delay = @delays[uris_domain] || target_delay
- delay = ((9.0 * current_delay) + target_delay) / 10.0
- delay = delay.clamp(@min_delay, @max_delay)
+ # @param uri [URI::Generic, String] URL the response came from
+ # @param response_time [Float] Time in seconds the server took to respond
+ # @return [Float] The calculated delay to use with the next request
+ def next_delay(uri, response_time)
+ uris_domain = domain(uri)
+ target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
+ current_delay = @delays[uris_domain] || target_delay
+ delay = ((9.0 * current_delay) + target_delay) / 10.0
+ delay = delay.clamp(@min_delay, @max_delay)
 
- if ENV["DEBUG"]
- ScraperUtils::FiberScheduler.log "Adaptive delay for #{uris_domain} updated to " \
- "#{delay.round(2)}s (target: #{@response_multiplier.round(1)}x " \
- "response_time of #{response_time.round(2)}s)"
- end
+ if DebugUtils.basic?
+ ScraperUtils::FiberScheduler.log(
+ "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
+ "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
+ )
+ end
 
- @delays[uris_domain] = delay
- delay
+ @delays[uris_domain] = delay
+ delay
+ end
  end
  end
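
For illustration, a minimal sketch of driving the now-namespaced ScraperUtils::AdaptiveDelay. The URL, the 0.5 s response time and the keyword values are assumptions for this sketch, not taken from the diff:

require "scraper_utils"

calculator = ScraperUtils::AdaptiveDelay.new(min_delay: 0.1, max_delay: 15.0, max_load: 20.0)

# max_load 20% gives a response multiplier of (100 - 20) / 20 = 4.0, so a 0.5s
# response targets a 2.0s delay, smoothed by (9 * current_delay + target_delay) / 10
# and clamped to the min_delay..max_delay range.
pause = calculator.next_delay("https://example.com/planning", 0.5)
sleep(pause)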
@@ -0,0 +1,25 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+ # Provides utilities for cycling through a range of options day by day
+ module CycleUtils
+ # Returns position in cycle from zero onwards
+ # @param cycle [Integer] Cycle size (2 onwards)
+ # @param date [Date, nil] Optional date to use instead of today
+ # @return [Integer] position in cycle progressing from zero to cycle-1 and then repeating day by day
+ # Can override using CYCLE_POSITION ENV variable
+ def self.position(cycle, date: nil)
+ day = ENV.fetch('CYCLE_POSITION', (date || Date.today).jd).to_i
+ day % cycle
+ end
+
+ # Returns one value per day, cycling through all possible values in order
+ # @param values [Array] Values to cycle through
+ # @param date [Date, nil] Optional date to use instead of today to calculate position
+ # @return value from array
+ # Can override using CYCLE_POSITION ENV variable
+ def self.pick(values, date: nil)
+ values[position(values.size, date: date)]
+ end
+ end
+ end
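
A brief sketch of the intended use of the new CycleUtils; the value list here is invented for illustration:

require "date"

# position advances with the Julian day number, so pick returns the next entry
# in the array each day, wrapping back to the start after values.size days.
ScraperUtils::CycleUtils.pick(%w[source_a source_b source_c], date: Date.today)

# CYCLE_POSITION overrides the date-derived position, which is handy in tests:
# ENV["CYCLE_POSITION"] = "2" would select the third value regardless of the date.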
@@ -6,8 +6,8 @@ module ScraperUtils
  class DataQualityMonitor
  # Get the statistics for all authorities
  # @return [Hash, nil] Hash of statistics per authority or nil if none started
- def self.stats
- @stats
+ class << self
+ attr_reader :stats
  end
 
  # Notes the start of processing an authority and clears any previous stats
@@ -16,28 +16,39 @@ module ScraperUtils
  # @return [void]
  def self.start_authority(authority_label)
  @stats ||= {}
- @authority_label = authority_label
- @stats[@authority_label] = { saved: 0, unprocessed: 0}
+ @stats[authority_label] = { saved: 0, unprocessed: 0 }
  end
 
- def self.threshold
- 5.01 + @stats[@authority_label][:saved] * 0.1 if @stats&.fetch(@authority_label, nil)
+ # Extracts authority label and ensures stats are setup for record
+ def self.extract_authority(record)
+ authority_label = (record&.key?("authority_label") ? record["authority_label"] : "").to_sym
+ @stats ||= {}
+ @stats[authority_label] ||= { saved: 0, unprocessed: 0 }
+ authority_label
+ end
+
+ def self.threshold(authority_label)
+ 5.01 + (@stats[authority_label][:saved] * 0.1) if @stats&.fetch(authority_label, nil)
  end
 
  # Logs an unprocessable record and raises an exception if error threshold is exceeded
  # The threshold is 5 + 10% of saved records
  #
- # @param e [Exception] The exception that caused the record to be unprocessable
+ # @param exception [Exception] The exception that caused the record to be unprocessable
  # @param record [Hash, nil] The record that couldn't be processed
  # @raise [ScraperUtils::UnprocessableSite] When too many records are unprocessable
  # @return [void]
- def self.log_unprocessable_record(e, record)
- start_authority(:"") unless @stats
- @stats[@authority_label][:unprocessed] += 1
- ScraperUtils::FiberScheduler.log "Erroneous record #{@authority_label} - #{record&.fetch('address', nil) || record.inspect}: #{e}"
- if @stats[@authority_label][:unprocessed] > threshold
- raise ScraperUtils::UnprocessableSite, "Too many unprocessable_records for #{@authority_label}: #{@stats[@authority_label].inspect} - aborting processing of site!"
- end
+ def self.log_unprocessable_record(exception, record)
+ authority_label = extract_authority(record)
+ @stats[authority_label][:unprocessed] += 1
+ ScraperUtils::FiberScheduler.log "Erroneous record #{authority_label} - #{record&.fetch(
+ 'address', nil
+ ) || record.inspect}: #{exception}"
+ return unless @stats[authority_label][:unprocessed] > threshold(authority_label)
+
+ raise ScraperUtils::UnprocessableSite,
+ "Too many unprocessable_records for #{authority_label}: " \
+ "#{@stats[authority_label].inspect} - aborting processing of site!"
  end
 
  # Logs a successfully saved record
@@ -45,9 +56,9 @@ module ScraperUtils
  # @param record [Hash] The record that was saved
  # @return [void]
  def self.log_saved_record(record)
- start_authority(:"") unless @stats
- @stats[@authority_label][:saved] += 1
- ScraperUtils::FiberScheduler.log "Saving record #{@authority_label} - #{record['address']}"
+ authority_label = extract_authority(record)
+ @stats[authority_label][:saved] += 1
+ ScraperUtils::FiberScheduler.log "Saving record #{authority_label} - #{record['address']}"
  end
  end
  end
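
A hedged sketch of how the reworked per-record API might be driven from a scraper loop; the record contents and the rescued error are invented for illustration:

record = { "authority_label" => "example_council", "address" => "1 Example St" }

begin
  # ... process and save the record ...
  ScraperUtils::DataQualityMonitor.log_saved_record(record)
rescue StandardError => e
  # Counts the failure against the record's own authority_label and raises
  # ScraperUtils::UnprocessableSite once failures exceed 5 + 10% of saved records.
  ScraperUtils::DataQualityMonitor.log_unprocessable_record(e, record)
end

ScraperUtils::DataQualityMonitor.stats
# => { example_council: { saved: ..., unprocessed: ... } }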
@@ -0,0 +1,159 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+ class DateRangeUtils
+ MERGE_ADJACENT_RANGES = true
+ PERIODS = [2, 3, 5, 8].freeze
+
+ class << self
+ # @return [Integer] Default number of days to cover
+ attr_accessor :default_days
+
+ # @return [Integer] Default days to always include in ranges
+ attr_accessor :default_everytime
+
+ # @return [Integer, nil] Default max days between any one date being in a range
+ attr_accessor :default_max_period
+
+ # Configure default settings for all DateRangeUtils instances
+ # @yield [self] Yields self for configuration
+ # @example
+ # AgentConfig.configure do |config|
+ # config.default_everytime = 3
+ # config.default_days = 35
+ # config.default_max_period = 5
+ # end
+ # @return [void]
+ def configure
+ yield self if block_given?
+ end
+
+ # Reset all configuration options to their default values
+ # @return [void]
+ def reset_defaults!
+ @default_days = ENV.fetch('MORPH_DAYS', 33).to_i # 33
+ @default_everytime = ENV.fetch('MORPH_EVERYTIME', 4).to_i # 4
+ @default_max_period = ENV.fetch('MORPH_MAX_PERIOD', 3).to_i # 3
+ end
+ end
+
+ # Set defaults on load
+ reset_defaults!
+
+ attr_reader :max_period_used
+ attr_reader :extended_max_period
+
+ # Generates one or more date ranges to check the most recent daily through to checking each max_period
+ # There is a graduated schedule from the latest `everytime` days through to the oldest of `days` dates which is checked each `max_period` days.
+ # @param days [Integer, nil] create ranges that cover the last `days` dates
+ # @param everytime [Integer, nil] Always include the latest `everytime` out of `days` dates
+ # @param max_period [Integer, nil] the last `days` dates must be checked at least every `max_period` days
+ # @param today [Date, nil] overrides the default determination of today at UTC+09:30 (middle of Australia)
+ # @return [Array{[Date, Date, String]}] being from_date, to_date and a comment
+ #
+ # Uses a Fibonacci sequence to create a natural progression of check frequencies.
+ # Newer data is checked more frequently, with periods between checks growing
+ # according to the Fibonacci sequence (2, 3, 5, 8, 13...) until reaching max_period.
+ # This creates an efficient schedule that mimics natural information decay patterns.
+ def calculate_date_ranges(days: nil, everytime: nil, max_period: nil, today: nil)
+ _calculate_date_ranges(
+ Integer(days || self.class.default_days),
+ Integer(everytime || self.class.default_everytime),
+ Integer(max_period || self.class.default_max_period),
+ today || Time.now(in: '+09:30').to_date
+ )
+ end
+
+ private
+
+ def _calculate_date_ranges(days, everytime, max_period, today)
+ @max_period_used = 1
+ to_date = today
+ valid_periods = PERIODS.select { |p| p <= max_period }
+ if !max_period.positive? || !days.positive?
+ return []
+ elsif valid_periods.empty? || everytime >= days
+ # cover everything everytime
+ return [[today + 1 - days, today, "everything"]]
+ end
+
+ max_period = valid_periods.max
+
+ run_number = today.to_date.jd
+ ranges = []
+ if everytime.positive?
+ ranges << [to_date + 1 - everytime, to_date, "everytime"]
+ days -= everytime
+ to_date -= everytime
+ end
+
+ periods = valid_periods.dup
+ loop do
+ period = periods.shift
+ break if period.nil? || period >= max_period || !days.positive?
+
+ if DebugUtils.trace?
+ FiberScheduler.log "DEBUG: #{period} day periods started #{(today - to_date).to_i} days in."
+ end
+ period.times do |index|
+ break unless days.positive?
+
+ this_period = [days, period].min
+ break if this_period <= 0
+
+ earliest_from = to_date - days
+ # we are working from the oldest back towards today
+ if run_number % period == index
+ from = to_date - index - (this_period - 1)
+ from = earliest_from if from < earliest_from
+ to = [today, to_date - index].min
+ break if from > to
+
+ @max_period_used = [this_period, @max_period_used].max
+ if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+ # extend adjacent range
+ ranges.last[0] = [from, ranges.last[0]].min
+ ranges.last[2] = "#{period}\##{index},#{ranges.last[2]}"
+ else
+ to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+ ranges << [from, to, "#{period}\##{index}"]
+ end
+ end
+ days -= this_period
+ to_date -= this_period
+ end
+ end
+ # remainder of range at max_period, whatever that is
+ if days.positive? && ScraperUtils::DebugUtils.trace?
+ FiberScheduler.log "DEBUG: #{max_period} day periods started #{(today - to_date).to_i} days in."
+ end
+ index = -1
+ while days.positive?
+ index += 1
+ this_period = [days, max_period].min
+ break if this_period <= 0
+
+ earliest_from = to_date - days
+ if (run_number % max_period) == (index % max_period)
+ from = to_date - index - (this_period - 1)
+ from = earliest_from if from < earliest_from
+ to = to_date - index
+ break if from > to
+
+ @max_period_used = [this_period, @max_period_used].max
+ if ranges.any? && ranges.last[0] <= to + 1 && MERGE_ADJACENT_RANGES
+ # extend adjacent range
+ ranges.last[0] = [from, ranges.last[0]].min
+ ranges.last[2] = "#{this_period}\##{index},#{ranges.last[2]}"
+ else
+ to = ranges.last[0] - 1 if ranges.any? && to >= ranges.last[0]
+ ranges << [from, to, "#{this_period}\##{index}"]
+ end
+ end
+ days -= this_period
+ to_date -= this_period
+ end
+ ranges.reverse
+ end
+ end
+ end
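
A sketch of calling the new DateRangeUtils. The configuration values mirror the @example block in the source; the use of Date.today is an assumption for illustration:

require "date"

ScraperUtils::DateRangeUtils.configure do |config|
  config.default_everytime = 3
  config.default_days = 35
  config.default_max_period = 5
end

utils = ScraperUtils::DateRangeUtils.new
# Returns an array of [from_date, to_date, comment] triples: the newest
# `everytime` days are always covered, older days only every 2, 3, 5... days.
ranges = utils.calculate_date_ranges(today: Date.today)
ranges.each do |from, to, comment|
  puts "#{from}..#{to} (#{comment})"
end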
@@ -33,8 +33,6 @@ module ScraperUtils
  else
  ["council_reference"]
  end
-
-
  ScraperWiki.save_sqlite(primary_key, record)
  ScraperUtils::DataQualityMonitor.log_saved_record(record)
  end
@@ -5,6 +5,50 @@ require "json"
  module ScraperUtils
  # Utilities for debugging web scraping processes
  module DebugUtils
+ DEBUG_ENV_VAR = "DEBUG"
+ MORPH_DEBUG_ENV_VAR = "MORPH_DEBUG"
+
+ # Debug level constants
+ DISABLED_LEVEL = 0
+ BASIC_LEVEL = 1
+ VERBOSE_LEVEL = 2
+ TRACE_LEVEL = 3
+
+ # Get current debug level (0 = disabled, 1 = basic, 2 = verbose, 3 = trace)
+ # Checks DEBUG and MORPH_DEBUG env variables
+ # @return [Integer] Debug level
+ def self.debug_level
+ debug = ENV.fetch(DEBUG_ENV_VAR, ENV.fetch(MORPH_DEBUG_ENV_VAR, '0'))
+ debug =~ /^\d/ ? debug.to_i : BASIC_LEVEL
+ end
+
+ # Check if debug is enabled at specified level or higher
+ #
+ # @param level [Integer] Minimum debug level to check for
+ # @return [Boolean] true if debugging at specified level is enabled
+ def self.debug?(level = BASIC_LEVEL)
+ debug_level >= level
+ end
+
+ # Check if basic debug output or higher is enabled
+ # @return [Boolean] true if debugging is enabled
+ def self.basic?
+ debug?(BASIC_LEVEL)
+ end
+
+ # Check if verbose debug output or higher is enabled
+ # @return [Boolean] true if verbose debugging is enabled
+ def self.verbose?
+ debug?(VERBOSE_LEVEL)
+ end
+
+ # Check if debug tracing or higher is enabled
+ # @return [Boolean] true if debugging is enabled at trace level
+ def self.trace?
+ debug?(TRACE_LEVEL)
+ end
+
+
  # Logs details of an HTTP request when debug mode is enabled
  #
  # @param method [String] HTTP method (GET, POST, etc.)
@@ -14,13 +58,14 @@ module ScraperUtils
  # @param body [Hash, nil] Optional request body
  # @return [void]
  def self.debug_request(method, url, parameters: nil, headers: nil, body: nil)
- return unless ScraperUtils.debug?
+ return unless basic?
 
  puts
- ScraperUtils::FiberScheduler.log "🔍 #{method.upcase} #{url}"
+ FiberScheduler.log "🔍 #{method.upcase} #{url}"
  puts "Parameters:", JSON.pretty_generate(parameters) if parameters
  puts "Headers:", JSON.pretty_generate(headers) if headers
  puts "Body:", JSON.pretty_generate(body) if body
+ $stdout.flush
  end
 
  # Logs details of a web page when debug mode is enabled
@@ -29,10 +74,10 @@ module ScraperUtils
  # @param message [String] Context or description for the debug output
  # @return [void]
  def self.debug_page(page, message)
- return unless ScraperUtils.debug?
+ return unless trace?
 
  puts
- ScraperUtils::FiberScheduler.log "🔍 DEBUG: #{message}"
+ FiberScheduler.log "🔍 DEBUG: #{message}"
  puts "Current URL: #{page.uri}"
  puts "Page title: #{page.at('title').text.strip}" if page.at("title")
  puts "",
@@ -40,6 +85,7 @@ module ScraperUtils
  "-" * 40,
  page.body,
  "-" * 40
+ $stdout.flush
  end
 
  # Logs details about a specific page selector when debug mode is enabled
@@ -49,10 +95,10 @@ module ScraperUtils
  # @param message [String] Context or description for the debug output
  # @return [void]
  def self.debug_selector(page, selector, message)
- return unless ScraperUtils.debug?
+ return unless trace?
 
  puts
- ScraperUtils::FiberScheduler.log "🔍 DEBUG: #{message}"
+ FiberScheduler.log "🔍 DEBUG: #{message}"
  puts "Looking for selector: #{selector}"
  element = page.at(selector)
  if element
@@ -64,6 +110,7 @@ module ScraperUtils
  puts page.body
  puts "-" * 40
  end
+ $stdout.flush
  end
  end
  end
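
A sketch of the new level-based debug checks; the environment value is an example:

# DEBUG (or MORPH_DEBUG) selects the level: 0 disabled, 1 basic, 2 verbose, 3 trace.
ENV["DEBUG"] = "2"

ScraperUtils::DebugUtils.debug_level # => 2
ScraperUtils::DebugUtils.basic?      # => true
ScraperUtils::DebugUtils.verbose?    # => true
ScraperUtils::DebugUtils.trace?      # => false

# debug_request logs at basic level or above; debug_page and debug_selector
# now only log at trace level.
ScraperUtils::DebugUtils.debug_request("GET", "https://example.com", parameters: { page: 1 })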
@@ -1,6 +1,6 @@
  # frozen_string_literal: true
 
- require 'fiber'
+ require "fiber"
 
  module ScraperUtils
  # A utility module for interleaving multiple scraping operations
@@ -24,6 +24,7 @@ module ScraperUtils
  # @return [String, nil] the authority name or nil if not in a fiber
  def self.current_authority
  return nil unless in_fiber?
+
  Fiber.current.instance_variable_get(:@authority)
  end
 
@@ -33,11 +34,13 @@ module ScraperUtils
  # @return [void]
  def self.log(message)
  authority = current_authority
+ $stderr.flush
  if authority
  puts "[#{authority}] #{message}"
  else
  puts message
  end
+ $stdout.flush
  end
 
  # Returns a hash of exceptions encountered during processing, indexed by authority
@@ -47,7 +50,7 @@ module ScraperUtils
  @exceptions ||= {}
  end
 
- # Returns a hash of values which will be the values yielded along the way then the block value when it completes
+ # Returns a hash of the yielded / block values
  #
  # @return [Hash{Symbol => Any}] values by authority
  def self.values
@@ -76,7 +79,7 @@ module ScraperUtils
  @enabled = false
  end
 
- # Resets the scheduler state, and disables the scheduler. Use this before retrying failed authorities.
+ # Resets the scheduler state, and disables. Use before retrying failed authorities.
  #
  # @return [void]
  def self.reset!
@@ -94,21 +97,19 @@ module ScraperUtils
  #
  # @param authority [String] the name of the authority being processed
  # @yield to the block containing the scraping operation to be run in the fiber
- # @return [Fiber] the created fiber that calls the block. With @authority and @resume_at instance variables
+ # @return [Fiber] a fiber that calls the block. With @authority and @resume_at instance vars
  def self.register_operation(authority, &block)
  # Automatically enable fiber scheduling when operations are registered
  enable!
 
  fiber = Fiber.new do
- begin
- values[authority] = block.call
- rescue StandardError => e
- # Store exception against the authority
- exceptions[authority] = e
- ensure
- # Remove itself when done regardless of success/failure
- registry.delete(Fiber.current)
- end
+ values[authority] = block.call
+ rescue StandardError => e
+ # Store exception against the authority
+ exceptions[authority] = e
+ ensure
+ # Remove itself when done regardless of success/failure
+ registry.delete(Fiber.current)
  end
 
  # Start fibres in registration order
@@ -117,9 +118,11 @@ module ScraperUtils
  fiber.instance_variable_set(:@authority, authority)
  registry << fiber
 
- puts "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving" if ENV['DEBUG']
- # Important: Don't immediately resume the fiber here
- # Let the caller decide when to start or coordinate fibers
+ if DebugUtils.basic?
+ FiberScheduler.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
+ end
+ # Process immediately when testing
+ fiber.resume if ScraperUtils::RandomizeUtils.sequential?
  fiber
  end
 
@@ -130,19 +133,28 @@ module ScraperUtils
  count = registry.size
  while (fiber = find_earliest_fiber)
  if fiber.alive?
- authority = fiber.instance_variable_get(:@authority) rescue nil
+ authority = begin
+ fiber.instance_variable_get(:@authority)
+ rescue StandardError
+ nil
+ end
  @resume_count ||= 0
  @resume_count += 1
  values[authority] = fiber.resume
  else
- puts "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
+ FiberScheduler.log "WARNING: fiber is dead but did not remove itself from registry! #{fiber.object_id}"
  registry.delete(fiber)
  end
  end
 
- percent_slept = (100.0 * @time_slept / @delay_requested).round(1) if @time_slept&.positive? && @delay_requested&.positive?
- puts "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, sleeping " \
- "#{percent_slept}% (#{@time_slept&.round(1)}) of the #{@delay_requested&.round(1)} seconds requested."
+ if @time_slept&.positive? && @delay_requested&.positive?
+ percent_slept = (100.0 * @time_slept / @delay_requested).round(1)
+ end
+ puts
+ FiberScheduler.log "FiberScheduler processed #{@resume_count} calls to delay for #{count} registrations, " \
+ "sleeping #{percent_slept}% (#{@time_slept&.round(1)}) of the " \
+ "#{@delay_requested&.round(1)} seconds requested."
+ puts
 
  exceptions
  end
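
A hedged sketch of registering operations and running them interleaved. The authority names and scrape_authority helper are placeholders, and the name run_all for the coordinating method whose body appears in the hunk above is assumed, since this diff does not show its signature:

%w[authority_a authority_b].each do |authority|
  ScraperUtils::FiberScheduler.register_operation(authority) do
    # your scraping code, calling FiberScheduler.delay between requests
    scrape_authority(authority)
  end
end

# Resumes whichever fiber is due next and returns exceptions keyed by authority.
exceptions = ScraperUtils::FiberScheduler.run_all
exceptions.each { |authority, e| puts "#{authority} failed: #{e}" }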
@@ -162,10 +174,20 @@ module ScraperUtils
  if !enabled? || !current_fiber || registry.size <= 1
  @time_slept ||= 0.0
  @time_slept += seconds
+ log("Sleeping #{seconds.round(3)} seconds") if DebugUtils.basic?
  return sleep(seconds)
  end
 
- resume_at = Time.now + seconds
+ now = Time.now
+ resume_at = now + seconds
+
+ # Don't resume at the same time as someone else,
+ # FIFO queue if seconds == 0
+ @other_resumes ||= []
+ @other_resumes = @other_resumes.delete_if { |t| t < now }
+ while @other_resumes.include?(resume_at) && resume_at
+ resume_at += 0.01
+ end
 
  # Used to compare when other fibers need to be resumed
  current_fiber.instance_variable_set(:@resume_at, resume_at)
@@ -178,6 +200,7 @@ module ScraperUtils
  if remaining.positive?
  @time_slept ||= 0.0
  @time_slept += remaining
+ log("Sleeping remaining #{remaining.round(3)} seconds") if DebugUtils.basic?
  sleep(remaining)
  end || 0
  end
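
And a sketch of the cooperative delay call those last hunks implement. The Mechanize agent and the AdaptiveDelay wiring are illustrative assumptions rather than anything prescribed by the diff:

require "mechanize"

agent = Mechanize.new
calculator = ScraperUtils::AdaptiveDelay.new

before = Time.now
page = agent.get("https://example.com/search")
response_time = Time.now - before

# Inside a fiber registered with FiberScheduler this yields to other fibers
# until @resume_at is reached; outside the scheduler it simply sleeps.
ScraperUtils::FiberScheduler.delay(calculator.next_delay(page.uri, response_time))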