scraper_utils 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/.yardopts +5 -0
  3. data/CHANGELOG.md +7 -0
  4. data/GUIDELINES.md +2 -1
  5. data/Gemfile +1 -0
  6. data/IMPLEMENTATION.md +40 -0
  7. data/README.md +29 -23
  8. data/SPECS.md +13 -1
  9. data/bin/rspec +27 -0
  10. data/docs/example_scrape_with_fibers.rb +4 -4
  11. data/docs/fibers_and_threads.md +72 -0
  12. data/docs/getting_started.md +6 -6
  13. data/docs/interleaving_requests.md +7 -7
  14. data/docs/parallel_requests.md +138 -0
  15. data/docs/randomizing_requests.md +12 -8
  16. data/docs/reducing_server_load.md +6 -6
  17. data/lib/scraper_utils/data_quality_monitor.rb +2 -3
  18. data/lib/scraper_utils/date_range_utils.rb +37 -78
  19. data/lib/scraper_utils/debug_utils.rb +5 -5
  20. data/lib/scraper_utils/log_utils.rb +15 -0
  21. data/lib/scraper_utils/mechanize_actions.rb +37 -8
  22. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +79 -0
  23. data/lib/scraper_utils/mechanize_utils/agent_config.rb +31 -30
  24. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
  25. data/lib/scraper_utils/mechanize_utils.rb +8 -5
  26. data/lib/scraper_utils/randomize_utils.rb +22 -19
  27. data/lib/scraper_utils/scheduler/constants.rb +12 -0
  28. data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
  29. data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
  30. data/lib/scraper_utils/scheduler/process_request.rb +59 -0
  31. data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
  32. data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
  33. data/lib/scraper_utils/scheduler.rb +286 -0
  34. data/lib/scraper_utils/version.rb +1 -1
  35. data/lib/scraper_utils.rb +11 -14
  36. metadata +16 -6
  37. data/lib/scraper_utils/adaptive_delay.rb +0 -70
  38. data/lib/scraper_utils/fiber_scheduler.rb +0 -229
  39. data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/scheduler/thread_response.rb ADDED
@@ -0,0 +1,59 @@
+ # frozen_string_literal: true
+
+ module ScraperUtils
+   module Scheduler
+     # Encapsulates a response from an asynchronous command execution
+     class ThreadResponse
+       # @return [Symbol] The authority from the original command
+       attr_reader :authority
+
+       # @return [Object, nil] The result of the command
+       attr_reader :result
+
+       # @return [Exception, nil] Any error that occurred during execution
+       attr_reader :error
+
+       # @return [Float] The time taken to execute the command in seconds
+       attr_reader :time_taken
+
+       # @return [Time, nil] Optionally delay the next process
+       attr_accessor :delay_till
+
+       # Initialize a new async response
+       #
+       # @param authority [Symbol] The authority from the original command
+       # @param result [Object, nil] The result of the command
+       # @param error [Exception, nil] Any error that occurred during execution
+       # @param time_taken [Float] The time taken to execute the command in seconds
+       def initialize(authority, result, error, time_taken)
+         @authority = authority
+         @result = result
+         @error = error
+         @time_taken = time_taken
+         @delay_till = nil
+       end
+
+       # Check if the command execution was successful
+       #
+       # @return [Boolean] true if successful, false otherwise
+       def success?
+         @error.nil?
+       end
+
+       # Return result or raise error
+       # @return [Object] Result of request
+       def result!
+         return @result if success?
+         raise @error
+       end
+
+       # Provide a readable inspection of the response
+       # @return [String] Readable representation
+       def inspect
+         status = success? ? "success" : "FAILED"
+         error_info = success? ? "" : " - #{error.class}: #{error.message}"
+         "#<#{self.class} authority=#{authority} #{status}#{error_info} time=#{time_taken}>"
+       end
+     end
+   end
+ end
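
The value object above is consumed via `success?` / `result!`. A minimal sketch of that behaviour (illustrative only, using an invented authority and error; not code from the gem):

```ruby
require "scraper_utils"

# Hypothetical response recording a failed request that took 2.5s before erroring
response = ScraperUtils::Scheduler::ThreadResponse.new(
  :example_authority, nil, StandardError.new("read timed out"), 2.5
)

response.success? # => false, because an error was stored
response.result!  # re-raises the stored StandardError rather than returning nil
```
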
data/lib/scraper_utils/scheduler.rb ADDED
@@ -0,0 +1,286 @@
+ # frozen_string_literal: true
+
+ require "fiber"
+
+ require_relative "scheduler/constants"
+ require_relative "scheduler/operation_registry"
+ require_relative "scheduler/operation_worker"
+
+ # value objects
+ require_relative "scheduler/process_request"
+ require_relative "scheduler/thread_request"
+
+ module ScraperUtils
+   # A utility module to coordinate the scheduling of work,
+   # * interleaving multiple operations (scraping of an authority's site)
+   #   uses Fibers (cooperative concurrency) so your code and the libraries you call don't have to be thread safe
+   # * Performing mechanize Network I/O in parallel using Threads
+   #
+   # Process flow
+   # 0. operation_workers start with response = true as the first resume passes args to block and response is ignored
+   # 1. resumes fiber of operation_worker with the last response when `Time.now` >= resume_at
+   # 2. worker fiber calls {Scheduler.execute_request}
+   #    a. sets resume_at based on calculated delay and waiting_for_response
+   #    b. pushes request onto local request queue if parallel, otherwise
+   #       executes request immediately in fiber and passes response to save_thread_response
+   #    c. fiber yields true to main fiber to indicate it wants to continue after resume_at / response arrives
+   # 3. one thread for each fiber (if parallel), thread:
+   #    a. pops request
+   #    b. executes request
+   #    c. pushes response onto global response queue (includes response_time)
+   # 4. main fiber - schedule_all loop
+   #    a. pops any responses and calls save_thread_response on operation_worker
+   #    c. resumes(true) operation_worker (fiber) when `Time.now` >= resume_at and not waiting_for_response
+   # 5. When worker fiber is finished it returns false to indicate it is finished
+   #    OR when shutdown is called resume(false) is called to indicate worker fiber should not continue
+   #
+   # save_thread_response:
+   # * Updates running average and calculates next_resume_at
+   #
+   # fiber aborts processing if 2nd argument is true
+   # fiber returns nil when finished
+   #
+   # Workers:
+   # * Push process requests onto individual request queues for their thread to process, and yield(true) to scheduler
+   #
+   # when enough
+   #
+   # Thread safe Implementation:
+   # * Uses fibers for each authority with its own mechanize agent so operations don't need to be thread safe
+   # * Only Mechanize requests are run in threads in parallel whilst they wait for network response
+   # * Uses message passing (using Queues) to avoid having to share state between threads.
+   # * Execute request does not return till the response has been received from the thread,
+   #   so the fiber's mechanize agent that is shared with the thread isn't used in multiple threads at once
+   # * Only one execute request per authority fiber can be in the thread request queue at any one time
+   module Scheduler
+     # @!group Main fiber / thread API
+     # These methods should only be called from the main (initial) fiber
+
+     class << self
+       # Controls if network I/O requests will be processed in parallel using threads
+       #
+       # @return [Boolean] true if processing network I/O in parallel using threads, otherwise false
+       # @note Defaults to true unless the MORPH_DISABLE_THREADS ENV variable is set
+       attr_accessor :threaded
+
+       # @return (see #threaded)
+       alias threaded? threaded
+
+       # Controls the maximum number of concurrent workers (fibers and threads) processing authorities
+       #
+       # @return [Integer] max concurrent workers using fibers and threads, defaults to MORPH_MAX_WORKERS env variable or 50
+       attr_accessor :max_workers
+
+       # @return [Hash{Symbol => Exception}] exceptions by authority
+       attr_reader :exceptions
+
+       # Returns the run_operations timeout
+       # On timeout a message will be output and the ruby program will exit with exit code 124.
+       #
+       # @return [Integer] Overall process timeout in seconds (default MORPH_RUN_TIMEOUT ENV value or 6 hours)
+       attr_accessor :run_timeout
+
+       # Private accessors for internal use
+
+       private
+
+       attr_reader :initial_resume_at, :operation_registry, :reset, :response_queue, :totals
+
+     end
+
+     # Resets the scheduler state. Use before retrying failed authorities.
+     def self.reset!
+       @operation_registry&.shutdown
+       @operation_registry = nil
+       @response_queue.close if @response_queue
+       @threaded = ENV["MORPH_DISABLE_THREADS"].to_s.empty?
+       @max_workers = [1, ENV.fetch('MORPH_MAX_WORKERS', Constants::DEFAULT_MAX_WORKERS).to_i].max
+       @exceptions = {}
+       @totals = Hash.new { 0 }
+       @initial_resume_at = Time.now
+       @response_queue = Thread::Queue.new if self.threaded?
+       @operation_registry = OperationRegistry.new
+       @reset = true
+       @run_timeout = ENV.fetch('MORPH_RUN_TIMEOUT', Constants::DEFAULT_TIMEOUT).to_i
+       nil
+     end
+
+     # reset on class load
+     reset!
+
+     # Registers a block to scrape for a specific authority
+     #
+     # Block yields(:delay) when operation.resume_at is in the future, and returns :finished when finished
+     # @param authority [Symbol] the name of the authority being processed
+     # @yield to the block containing the scraping operation to be run in the fiber
+     def self.register_operation(authority, &block)
+       fiber = Fiber.new do |continue|
+         begin
+           raise "Terminated fiber for #{authority} before block run" unless continue
+
+           block.call
+         rescue StandardError => e
+           # Store exception against the authority
+           exceptions[authority] = e
+         ensure
+           # Clean up when done regardless of success/failure
+           operation_registry&.deregister
+         end
+         # no further requests
+         nil
+       end
+
+       operation = operation_registry&.register(fiber, authority)
+
+       if DebugUtils.basic?
+         LogUtils.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
+       end
+       if operation_registry&.size >= @max_workers
+         LogUtils.log "Running batch of #{operation_registry&.size} operations immediately"
+         run_operations
+       end
+       # return operation for ease of testing
+       operation
+     end
+
+     # Run all registered operations until completion
+     #
+     # @return [Hash] Exceptions that occurred during execution
+     def self.run_operations
+       monitor_run_time = Thread.new do
+         sleep run_timeout
+         desc = "#{(run_timeout / 3600.0).round(1)} hours"
+         desc = "#{(run_timeout / 60.0).round(1)} minutes" if run_timeout < 100 * 60
+         desc = "#{run_timeout} seconds" if run_timeout < 100
+         LogUtils.log "ERROR: Script exceeded maximum allowed runtime of #{desc}!\n" \
+                      "Forcibly terminating process!"
+         Process.exit!(124)
+       end
+       count = operation_registry&.size
+
+       # Main scheduling loop - process till there is nothing left to do
+       until @operation_registry.empty?
+         save_thread_responses
+         resume_next_operation
+       end
+
+       report_summary(count)
+
+       exceptions
+     ensure
+       # Kill the monitoring thread if we finish normally
+       monitor_run_time.kill if monitor_run_time.alive?
+       monitor_run_time.join(2)
+     end
+
+     # ===========================================================
+     # @!group Fiber API
+     # These methods should be called from the worker's own fiber but can be called from the main fiber
+
+     # Execute Mechanize network request in parallel using the fiber's thread
+     # This allows multiple network I/O requests to be waiting for a response in parallel
+     # whilst responses that have arrived can be processed by their fibers.
+     #
+     # @example Replace this code in your scraper
+     #   page = agent.get(url_period(url, period, webguest))
+     #
+     # @example With this code
+     #   page = ScraperUtils::Scheduler.execute_request(agent, :get, [url_period(url, period, webguest)])
+     #
+     # @param client [MechanizeClient] client to be used to process request
+     # @param method_name [Symbol] method to be called on client
+     # @param args [Array] Arguments to be used with method call
+     # @return [Object] response from method call on client
+     def self.execute_request(client, method_name, args)
+       operation = current_operation
+       # execute immediately if not in a worker fiber
+       return client.send(method_name, args) unless operation
+
+       request = Scheduler::ProcessRequest.new(operation.authority, client, method_name, args)
+       log "Submitting request #{request.inspect}" if DebugUtils.basic?
+       response = operation.submit_request(request)
+       unless response.is_a?(ThreadResponse)
+         raise "Expected ThreadResponse, got: #{response.inspect}"
+       end
+       response.result!
+     end
+
+     # Gets the authority associated with the current fiber or thread
+     #
+     # @return [Symbol, nil] the authority name or nil if not in a fiber
+     def self.current_authority
+       current_operation&.authority
+     end
+
+     # @!endgroup
+     # ===========================================================
+
+     private
+
+     # Save results from threads in operation state so more operation fibers can be resumed
+     def self.save_thread_responses
+       while (thread_response = get_response)
+         operation = @operation_registry&.find(thread_response.authority)
+         operation&.save_thread_response(thread_response)
+         LogUtils.log "WARNING: orphaned thread response ignored: #{thread_response.inspect}", thread_response.authority
+       end
+     end
+
+     # Resume next operation or sleep POLL_PERIOD if none are ready
+     def self.resume_next_operation
+       delay = Constants::POLL_PERIOD
+       # Find the operation that is ready to run with the earliest resume_at
+       can_resume_operations = @operation_registry&.can_resume
+       operation = can_resume_operations&.first
+
+       if !operation
+         # All the fibers must be waiting for responses, so sleep a bit to allow the responses to arrive
+         @operation_registry&.cleanup_zombies
+         sleep(delay)
+         @totals[:wait_response] += delay
+       else
+         delay = [(operation.resume_at - Time.now).to_f, delay].min
+         if delay.positive?
+           # Wait a bit for a fiber to be ready to run
+           sleep(delay)
+           waiting_for_delay = delay * can_resume_operations&.size.to_f / (@operation_registry&.size || 1)
+           @totals[:wait_delay] += waiting_for_delay
+           @totals[:wait_response] += delay - waiting_for_delay
+         else
+           @totals[:resume_count] += 1
+           # resume fiber with response to last request that is ready to be resumed now
+           operation.resume
+         end
+         operation
+       end
+     end
+
+     # Return the next response, returns nil if queue is empty
+     #
+     # @return [ThreadResponse, nil] Result of request execution
+     def self.get_response(non_block = true)
+       return nil if non_block && @response_queue.empty?
+
+       @response_queue.pop(non_block)
+     end
+
+     def self.current_operation
+       @operation_registry&.find
+     end
+
+     def self.report_summary(count)
+       wait_delay_percent = 0
+       wait_response_percent = 0
+       delay_requested = [@totals[:wait_delay], @totals[:wait_response]].sum
+       if delay_requested.positive?
+         wait_delay_percent = (100.0 * @totals[:wait_delay] / delay_requested).round(1)
+         wait_response_percent = (100.0 * @totals[:wait_response] / delay_requested).round(1)
+       end
+       puts
+       LogUtils.log "Scheduler processed #{@totals[:resume_count]} calls for #{count} registrations, " \
+                    "with #{wait_delay_percent}% of #{delay_requested.round(1)} seconds spent keeping under max_load, " \
+                    "and #{wait_response_percent}% waiting for network I/O requests."
+       puts
+     end
+   end
+ end
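
Putting the public API above together, here is a condensed sketch of how a scraper might drive the Scheduler (authorities and URL are invented; based only on `register_operation`, `run_operations` and the `execute_request` example in the comments above, not taken from the gem's own docs):

```ruby
require "mechanize"
require "scraper_utils"

authorities = %i[example_city example_shire] # invented names

authorities.each do |authority|
  ScraperUtils::Scheduler.register_operation(authority) do
    agent = Mechanize.new
    # Network I/O goes through execute_request so it can wait in the worker's thread
    page = ScraperUtils::Scheduler.execute_request(
      agent, :get, ["https://example.com/#{authority}/applications"]
    )
    # ... parse page and save records here ...
  end
end

# Runs the interleaved fibers until all are finished and returns exceptions by authority
ScraperUtils::Scheduler.run_operations.each do |authority, e|
  puts "#{authority} failed: #{e.message}"
end
```
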
data/lib/scraper_utils/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module ScraperUtils
-   VERSION = "0.5.1"
+   VERSION = "0.6.0"
  end
data/lib/scraper_utils.rb CHANGED
@@ -1,20 +1,21 @@
  # frozen_string_literal: true

- require "scraper_utils/adaptive_delay"
+ require "scraper_utils/version"
+
+ # Public Apis (responsible for requiring their own dependencies)
  require "scraper_utils/authority_utils"
  require "scraper_utils/cycle_utils"
  require "scraper_utils/data_quality_monitor"
  require "scraper_utils/date_range_utils"
  require "scraper_utils/db_utils"
  require "scraper_utils/debug_utils"
- require "scraper_utils/fiber_scheduler"
  require "scraper_utils/log_utils"
+ require "scraper_utils/randomize_utils"
+ require "scraper_utils/scheduler"
+
+ # Mechanize utilities
  require "scraper_utils/mechanize_actions"
- require "scraper_utils/mechanize_utils/agent_config"
  require "scraper_utils/mechanize_utils"
- require "scraper_utils/randomize_utils"
- require "scraper_utils/robots_checker"
- require "scraper_utils/version"

  # Utilities for planningalerts scrapers
  module ScraperUtils
@@ -22,17 +23,13 @@ module ScraperUtils
    AUSTRALIAN_PROXY_ENV_VAR = "MORPH_AUSTRALIAN_PROXY"

    # Fatal Error
-   class Error < StandardError
-   end
+   class Error < StandardError; end

    # Fatal error with the site - retrying won't help
-   class UnprocessableSite < Error
-   end
+   class UnprocessableSite < Error; end

-   # Content validation errors that should not be retried for that record,
-   # but other records may be processable
-   class UnprocessableRecord < Error
-   end
+   # Fatal Error for a record - other records may be processable
+   class UnprocessableRecord < Error; end

    def self.australian_proxy
      ap = ENV[AUSTRALIAN_PROXY_ENV_VAR].to_s
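
The reworded comments above draw a line between record-level and site-level failures. A minimal sketch of what that implies for a scraping loop (illustrative only; `records` and `save_record` are invented placeholders, not gem methods):

```ruby
records.each do |record|
  save_record(record) # hypothetical per-record processing
rescue ScraperUtils::UnprocessableRecord => e
  # Fatal for this record only - log it and carry on with the rest
  puts "Skipping record: #{e.message}"
end
# ScraperUtils::UnprocessableSite and ScraperUtils::Error are left to propagate,
# since retrying the same site won't help.
```
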
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scraper_utils
  version: !ruby/object:Gem::Version
-   version: 0.5.1
+   version: 0.6.0
  platform: ruby
  authors:
  - Ian Heggie
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2025-03-05 00:00:00.000000000 Z
+ date: 2025-03-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: mechanize
@@ -64,6 +64,7 @@ files:
  - ".rspec"
  - ".rubocop.yml"
  - ".travis.yml"
+ - ".yardopts"
  - CHANGELOG.md
  - GUIDELINES.md
  - Gemfile
@@ -73,30 +74,39 @@ files:
  - Rakefile
  - SPECS.md
  - bin/console
+ - bin/rspec
  - bin/setup
  - docs/debugging.md
  - docs/example_scrape_with_fibers.rb
  - docs/example_scraper.rb
+ - docs/fibers_and_threads.md
  - docs/getting_started.md
  - docs/interleaving_requests.md
  - docs/mechanize_utilities.md
+ - docs/parallel_requests.md
  - docs/randomizing_requests.md
  - docs/reducing_server_load.md
  - lib/scraper_utils.rb
- - lib/scraper_utils/adaptive_delay.rb
  - lib/scraper_utils/authority_utils.rb
  - lib/scraper_utils/cycle_utils.rb
  - lib/scraper_utils/data_quality_monitor.rb
  - lib/scraper_utils/date_range_utils.rb
  - lib/scraper_utils/db_utils.rb
  - lib/scraper_utils/debug_utils.rb
- - lib/scraper_utils/fiber_scheduler.rb
  - lib/scraper_utils/log_utils.rb
  - lib/scraper_utils/mechanize_actions.rb
  - lib/scraper_utils/mechanize_utils.rb
+ - lib/scraper_utils/mechanize_utils/adaptive_delay.rb
  - lib/scraper_utils/mechanize_utils/agent_config.rb
+ - lib/scraper_utils/mechanize_utils/robots_checker.rb
  - lib/scraper_utils/randomize_utils.rb
- - lib/scraper_utils/robots_checker.rb
+ - lib/scraper_utils/scheduler.rb
+ - lib/scraper_utils/scheduler/constants.rb
+ - lib/scraper_utils/scheduler/operation_registry.rb
+ - lib/scraper_utils/scheduler/operation_worker.rb
+ - lib/scraper_utils/scheduler/process_request.rb
+ - lib/scraper_utils/scheduler/thread_request.rb
+ - lib/scraper_utils/scheduler/thread_response.rb
  - lib/scraper_utils/version.rb
  - scraper_utils.gemspec
  homepage: https://github.com/ianheggie-oaf/scraper_utils
@@ -106,7 +116,7 @@ metadata:
    allowed_push_host: https://rubygems.org
    homepage_uri: https://github.com/ianheggie-oaf/scraper_utils
    source_code_uri: https://github.com/ianheggie-oaf/scraper_utils
-   documentation_uri: https://rubydoc.info/gems/scraper_utils/0.5.1
+   documentation_uri: https://rubydoc.info/gems/scraper_utils/0.6.0
    changelog_uri: https://github.com/ianheggie-oaf/scraper_utils/blob/main/CHANGELOG.md
    rubygems_mfa_required: 'true'
  post_install_message:
data/lib/scraper_utils/adaptive_delay.rb DELETED
@@ -1,70 +0,0 @@
- # frozen_string_literal: true
-
- require "uri"
-
- module ScraperUtils
-   # Adapts delays between requests based on server response times.
-   # Target delay is proportional to response time based on max_load setting.
-   # Uses an exponential moving average to smooth variations in response times.
-   class AdaptiveDelay
-     DEFAULT_MIN_DELAY = 0.0
-     DEFAULT_MAX_DELAY = 30.0 # Presumed default timeout for Mechanize
-
-     attr_reader :min_delay, :max_delay, :max_load
-
-     # Creates a new adaptive delay calculator
-     #
-     # @param min_delay [Float] Minimum delay between requests in seconds
-     # @param max_delay [Float] Maximum delay between requests in seconds
-     # @param max_load [Float] Maximum load percentage (1-99) we aim to place on the server
-     #   Lower values are more conservative (e.g., 20% = 4x response time delay)
-     def initialize(min_delay: DEFAULT_MIN_DELAY, max_delay: DEFAULT_MAX_DELAY, max_load: 20.0)
-       @delays = {} # domain -> last delay used
-       @min_delay = min_delay.to_f
-       @max_delay = max_delay.to_f
-       @max_load = max_load.to_f.clamp(1.0, 99.0)
-       @response_multiplier = (100.0 - @max_load) / @max_load
-
-       return unless DebugUtils.basic?
-
-       ScraperUtils::FiberScheduler.log(
-         "AdaptiveDelay initialized with delays between #{@min_delay} and #{@max_delay} seconds, " \
-         "Max_load #{@max_load}% thus response multiplier: #{@response_multiplier.round(2)}x"
-       )
-     end
-
-     # @param uri [URI::Generic, String] The URL to extract the domain from
-     # @return [String] The domain in the format "scheme://host"
-     def domain(uri)
-       uri = URI(uri) unless uri.is_a?(URI)
-       "#{uri.scheme}://#{uri.host}".downcase
-     end
-
-     # @param uri [URI::Generic, String] URL to get delay for
-     # @return [Float] Current delay for the domain, or min_delay if no delay set
-     def delay(uri)
-       @delays[domain(uri)] || @min_delay
-     end
-
-     # @param uri [URI::Generic, String] URL the response came from
-     # @param response_time [Float] Time in seconds the server took to respond
-     # @return [Float] The calculated delay to use with the next request
-     def next_delay(uri, response_time)
-       uris_domain = domain(uri)
-       target_delay = (response_time * @response_multiplier).clamp(0.0, @max_delay)
-       current_delay = @delays[uris_domain] || target_delay
-       delay = ((9.0 * current_delay) + target_delay) / 10.0
-       delay = delay.clamp(@min_delay, @max_delay)
-
-       if DebugUtils.basic?
-         ScraperUtils::FiberScheduler.log(
-           "Adaptive delay for #{uris_domain} updated to #{delay.round(2)}s (target: " \
-           "#{@response_multiplier.round(1)}x response_time of #{response_time.round(2)}s)"
-         )
-       end
-
-       @delays[uris_domain] = delay
-       delay
-     end
-   end
- end
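
For reference, the arithmetic the removed class implemented (the logic appears to have been relocated to lib/scraper_utils/mechanize_utils/adaptive_delay.rb per the file list above): with the default max_load of 20% the response multiplier is (100 - 20) / 20 = 4, and each new delay is a 90/10 exponential moving average of the previous delay and that target. A worked example with invented numbers:

```ruby
max_load            = 20.0
response_multiplier = (100.0 - max_load) / max_load           # => 4.0

response_time = 0.5                                           # server took 0.5s to respond
target_delay  = response_time * response_multiplier           # => 2.0s target delay
current_delay = 1.0                                           # previous smoothed delay
new_delay     = ((9.0 * current_delay) + target_delay) / 10.0
# => 1.1s - the delay creeps towards the 2.0s target over successive responses
```
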