scraper_utils 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +5 -0
  3. data/CHANGELOG.md +19 -0
  4. data/GUIDELINES.md +2 -1
  5. data/Gemfile +1 -0
  6. data/IMPLEMENTATION.md +39 -0
  7. data/README.md +29 -23
  8. data/SPECS.md +13 -1
  9. data/bin/rspec +27 -0
  10. data/docs/enhancing_specs.md +100 -0
  11. data/docs/example_scrape_with_fibers.rb +4 -4
  12. data/docs/fibers_and_threads.md +72 -0
  13. data/docs/getting_started.md +6 -6
  14. data/docs/interleaving_requests.md +9 -8
  15. data/docs/mechanize_utilities.md +4 -4
  16. data/docs/parallel_requests.md +138 -0
  17. data/docs/randomizing_requests.md +12 -8
  18. data/docs/reducing_server_load.md +6 -6
  19. data/lib/scraper_utils/data_quality_monitor.rb +2 -3
  20. data/lib/scraper_utils/date_range_utils.rb +37 -78
  21. data/lib/scraper_utils/debug_utils.rb +5 -5
  22. data/lib/scraper_utils/log_utils.rb +15 -0
  23. data/lib/scraper_utils/mechanize_actions.rb +37 -8
  24. data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +80 -0
  25. data/lib/scraper_utils/mechanize_utils/agent_config.rb +35 -34
  26. data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
  27. data/lib/scraper_utils/mechanize_utils.rb +8 -5
  28. data/lib/scraper_utils/randomize_utils.rb +22 -19
  29. data/lib/scraper_utils/scheduler/constants.rb +12 -0
  30. data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
  31. data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
  32. data/lib/scraper_utils/scheduler/process_request.rb +59 -0
  33. data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
  34. data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
  35. data/lib/scraper_utils/scheduler.rb +286 -0
  36. data/lib/scraper_utils/spec_support.rb +67 -0
  37. data/lib/scraper_utils/version.rb +1 -1
  38. data/lib/scraper_utils.rb +12 -14
  39. metadata +18 -6
  40. data/lib/scraper_utils/adaptive_delay.rb +0 -70
  41. data/lib/scraper_utils/fiber_scheduler.rb +0 -229
  42. data/lib/scraper_utils/robots_checker.rb +0 -149
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "constants"
4
+ require_relative 'process_request'
5
+
6
+ module ScraperUtils
7
+ module Scheduler
8
+ # Handles the processing of a registered operation and associated fiber and thread state
9
class OperationWorker
  # Raised when the worker is resumed, or asked to queue a request, while it is in the wrong state
  class NotReadyError < RuntimeError; end

  # @return [Fiber] The fiber
  attr_reader :fiber

  # @return [Symbol] The authority name associated with this fiber
  attr_reader :authority

  # @return [Time, nil] When the fiber should be delayed till / ready to resume at
  attr_accessor :resume_at

  # @return [ThreadResponse, true, nil] The response to be passed on the next resume
  #   (+true+ before the first resume, +nil+ whilst waiting or once cleared)
  attr_accessor :response

  # @return [Boolean] Waiting for a response
  attr_reader :waiting_for_response

  # @return [Thread, nil] Thread used (only created when a response queue is supplied)
  attr_reader :thread

  # @return [Thread::Queue, nil] The request queue for the thread
  attr_reader :request_queue

  # Hand out the initial resume time for a new worker.
  # Each value is at least 1ms later than the previous one and never earlier than now.
  #
  # @return [Time]
  def self.next_resume_at
    @next_resume_at = [@next_resume_at, Time.now - 0.001].compact.max + 0.001
  end

  # Fiber has not finished running
  def alive?
    fiber.alive?
  end

  # Worker has the necessary state to be resumed
  def can_resume?
    !@response.nil? && !@resume_at.nil? && alive?
  end

  # Save thread response from main or worker fiber
  #
  # @param response [ThreadResponse] response produced by executing the pending request
  # @return [ThreadResponse] the response passed in
  # @raise [RuntimeError] if the worker was not waiting for a response
  def save_thread_response(response)
    raise "#{authority} Wasn't waiting for response! Got: #{response.inspect}" unless @waiting_for_response

    @response = response
    @waiting_for_response = false
    # Never schedule in the past, but honour any later delay requested by the response
    @resume_at = [response&.delay_till, Time.now].compact.max
    if DebugUtils.basic?
      # This class defines no `log` helper, so log through LogUtils
      LogUtils.log "Received #{response&.class&.name || 'nil response'} from thread for fiber #{authority} in #{response&.time_taken&.round(3)}s"
    end
    response
  end

  # close resources from worker fiber
  # Called by worker fiber just before it exits
  def close
    validate_fiber(main: false)
    # Signal thread to finish processing, then wait for it
    @request_queue&.close
    @thread&.join(60)
    # drop references for GC
    @request_queue = nil
    @thread = nil
    # make can_resume? false
    clear_resume_state
  end

  # ===================================================
  # @! Main Fiber API

  # Initialize a new Worker Fiber and Thread, called from the main Fiber
  #
  # The Thread executes ThreadRequest objects from the request_queue and pushes
  # responses to the global response_queue.
  #
  # @param fiber [Fiber] Fiber to process authority block
  # @param authority [Symbol] Authority label
  # @param response_queue [Thread::Queue, nil] Queue for thread responses if enabled
  # @raise [ArgumentError] if fiber or authority is missing, or when not called from the main fiber
  def initialize(fiber, authority, response_queue)
    raise(ArgumentError, "Fiber and Authority must be provided") unless fiber && authority

    validate_fiber(main: true)

    @fiber = fiber
    @authority = authority
    @response_queue = response_queue
    @fiber.instance_variable_set(:@operation_worker, self)
    if response_queue
      @request_queue = Thread::Queue.new
      @thread = Thread.new do
        Thread.current[:current_authority] = authority
        # pop returns nil once the queue has been closed and drained, which ends the loop
        while (request = @request_queue&.pop)
          @response_queue.push request.execute
        end
      end
    end
    @resume_at = self.class.next_resume_at
    @waiting_for_response = false
    # First resume response is ignored
    @response = true
  end

  # Resume the operation fiber from the main fiber, passing it the saved response
  #
  # The worker fiber queues its own request inside {#submit_request} before it yields,
  # so the yielded value must not be submitted again here: doing so raised NotReadyError
  # on every threaded request because the worker is already waiting for its response.
  #
  # @return [Object, nil] value yielded by the fiber (true whilst it waits for a response),
  #   or the fiber's final value once it has finished
  # @raise [ClosedQueueError] if the fiber has already finished
  # @raise [NotReadyError] if there is no response to resume with
  def resume
    raise ClosedQueueError unless alive?
    raise NotReadyError, "Cannot resume #{authority} without response!" unless @response

    validate_fiber(main: true)

    @fiber.resume(@response)
  end

  # Shutdown worker called from main fiber
  def shutdown
    validate_fiber(main: true)

    clear_resume_state
    if @fiber&.alive?
      # Trigger fiber to raise an error and thus call deregister
      @fiber.resume(nil)
    end
  end

  # ===================================================
  # @! Worker Fiber API

  # Queue a thread request to be executed from worker fiber
  # otherwise locally if parallel processing is disabled
  #
  # Process flow if parallel enabled:
  # 1. This method:
  #    a. pushes request onto local @request_queue
  #    b. calls Fiber.yield(true) so Scheduler can run other fibers
  # 2. Meanwhile, this fiber's thread:
  #    a. pops request off queue
  #    b. processes request
  #    c. pushes response to global response queue
  # 3. Meanwhile, Scheduler on Main fiber:
  #    a. pops response from response queue as they arrive
  #    b. calls {#save_thread_response} on associated worker to save each response
  #    c. calls {#resume} on worker when it is its turn (based on resume_at) and it can_resume (has @response)
  #
  # If parallel processing is not enabled, then the processing occurs in the worker's fiber
  #
  # @param request [ThreadRequest] The request to be processed in thread
  # @return [ThreadResponse] the response to the request
  # @raise [NotReadyError] if a previous request is still awaiting its response
  # @raise [ArgumentError] if request is not a ThreadRequest, or when not called from the worker fiber
  # @raise [RuntimeError] if the fiber is resumed without a response (termination requested)
  def submit_request(request)
    raise NotReadyError, "Cannot make a second request before the first has responded!" if @waiting_for_response
    raise ArgumentError, "Must be passed a valid ThreadRequest! Got: #{request.inspect}" unless request.is_a? ThreadRequest

    validate_fiber(main: false)

    @response = nil
    @waiting_for_response = true
    if @request_queue
      @request_queue.push request
      response = Fiber.yield true
      raise "Terminated fiber for #{authority} as requested" unless response
    else
      response = save_thread_response request.execute
    end
    response
  end

  private

  # Ensure the caller is running on the expected fiber
  #
  # @param main [Boolean] true to require the main fiber, false to require this worker's fiber
  # @raise [ArgumentError] when running on any other fiber
  def validate_fiber(main: false)
    required_fiber = main ? Constants::MAIN_FIBER : @fiber
    current_id = Fiber.current.object_id
    return if current_id == required_fiber.object_id

    desc = main ? 'main' : 'worker'
    we_are = if current_id == Constants::MAIN_FIBER.object_id
               'main'
             elsif current_id == @fiber.object_id
               'worker'
             else
               'other'
             end
    raise ArgumentError,
          "Must be run within the #{desc} not #{we_are} fiber!"
  end

  # Clear resume state so the operation won't be resumed
  def clear_resume_state
    @resume_at = nil
    @response = nil
    @waiting_for_response = false
  end
end
198
+ end
199
+ end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "thread_request"
4
+
5
+ module ScraperUtils
6
+ module Scheduler
7
+ # Encapsulates a request to be executed (usually asynchronously) by the ThreadPool
8
class ProcessRequest < ThreadRequest
  # @return [Object] receiver of the method call
  attr_reader :subject

  # @return [Symbol] name of the method invoked on the subject
  attr_reader :method_name

  # @return [Array] positional arguments passed to the method
  attr_reader :args

  # Build a request that calls +method_name+ on +subject+ with +args+
  #
  # @param authority [Symbol, nil] Authority for correlating requests and responses
  #   nil is used when threads are disabled to process locally without duplicating code
  # @param subject [Object] receiver of the method call
  # @param method_name [Symbol] method to call on the subject
  # @param args [Array] arguments for the call
  # @raise [ArgumentError] If any required parameter is missing or invalid
  def initialize(authority, subject, method_name, args)
    super(authority)
    @subject = subject
    @method_name = method_name
    @args = args

    validate!
  end

  # Perform the call and wrap its outcome in a response.
  # Whatever the subject holds in its @delay_till instance variable is copied onto the response.
  #
  # @return [ThreadResponse] The result of the request
  def execute
    execute_block { subject.send(method_name, *args) }.tap do |response|
      response.delay_till = subject.instance_variable_get(:@delay_till)
    end
  end

  private

  # Check the constructor arguments
  #
  # @raise [ArgumentError] If any parameter is missing or invalid
  def validate!
    raise ArgumentError, "Subject must be provided" unless @subject
    raise ArgumentError, "Method name must be provided" unless @method_name
    raise ArgumentError, "Args must be an array" unless @args.is_a?(Array)
    # @subject is known to be truthy here, so a plain call is equivalent to safe navigation
    return if @subject.respond_to?(@method_name)

    raise ArgumentError, "Subject must respond to method"
  end
end
58
+ end
59
+ end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "thread_response"
4
+
5
+ module ScraperUtils
6
+ module Scheduler
7
+ # Encapsulates a request that is pushed to the fiber's request queue to be executed by the Fiber's Thread
8
+ # The response is returned via the Scheduler's response queue
9
+ # @see {ProcessRequest}
10
class ThreadRequest
  # @return [Symbol, nil] Authority for correlating requests and responses
  attr_reader :authority

  # Initialize a new thread request
  #
  # @param authority [Symbol, nil] Authority for correlating requests and responses
  def initialize(authority)
    @authority = authority
  end

  # Execute a request and return ThreadResponse - use helper method {#execute_block}
  #
  # @return [ThreadResponse]
  # @raise [NotImplementedError] always; subclasses must override
  def execute
    raise NotImplementedError, "Implement in subclass"
  end

  # Execute a request by calling the block, capturing either its value or the
  # StandardError it raised, together with how long the block ran.
  #
  # The duration is measured with the monotonic clock so that wall clock
  # adjustments cannot produce negative or inflated timings.
  #
  # @yield the work to perform
  # @return [ThreadResponse] The result of the request
  def execute_block
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    result = nil
    error = nil
    begin
      result = yield
    rescue StandardError => e
      error = e
    end
    elapsed_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
    ThreadResponse.new(authority, result, error, elapsed_time)
  end
end
50
+ end
51
+ end
@@ -0,0 +1,59 @@
1
+ # frozen_string_literal: true
2
+
3
+ module ScraperUtils
4
+ module Scheduler
5
+ # Encapsulates a response from an asynchronous command execution
6
class ThreadResponse
  # @return [Symbol] authority of the originating request
  # @return [Object, nil] value produced by the request
  # @return [Exception, nil] error raised whilst executing the request
  # @return [Float] seconds taken to execute the request
  attr_reader :authority, :result, :error, :time_taken

  # @return [Time, nil] Optionally delay the next process
  attr_accessor :delay_till

  # Build a response; delay_till starts out unset
  #
  # @param authority [Symbol] authority of the originating request
  # @param result [Object, nil] value produced by the request
  # @param error [Exception, nil] error raised whilst executing the request
  # @param time_taken [Float] seconds taken to execute the request
  def initialize(authority, result, error, time_taken)
    @delay_till = nil
    @authority, @result, @error, @time_taken = authority, result, error, time_taken
  end

  # Whether the request completed without an error
  #
  # @return [Boolean] true if successful, false otherwise
  def success?
    @error.nil?
  end

  # Return the result, re-raising the captured error when there is one
  #
  # @return [Object] Result of request
  def result!
    raise @error unless success?

    @result
  end

  # Short human readable summary used in logs and error messages
  #
  # @return [String] Readable representation
  def inspect
    outcome = if success?
                "success"
              else
                "FAILED - #{error.class}: #{error.message}"
              end
    "#<#{self.class} authority=#{authority} #{outcome} time=#{time_taken}>"
  end
end
58
+ end
59
+ end
@@ -0,0 +1,286 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fiber"
4
+
5
+ require_relative "scheduler/constants"
6
+ require_relative "scheduler/operation_registry"
7
+ require_relative "scheduler/operation_worker"
8
+
9
+ # value objects
10
+ require_relative "scheduler/process_request"
11
+ require_relative "scheduler/thread_request"
12
+
13
+ module ScraperUtils
14
+ # A utility module to coordinate the scheduling of work,
15
+ # * interleaving multiple operations (scraping of an authorities site)
16
+ # uses Fibers (cooperative concurrency) so your code and the libraries you call don't have to be thread safe
17
+ # * Performing mechanize Network I/O in parallel using Threads
18
+ #
19
+ # Process flow
20
+ # 0. operation_workers start with response = true as the first resume passes args to block and response is ignored
21
+ # 1. resumes fiber of operation_worker with the last response when `Time.now` >= resume_at
22
+ # 2. worker fiber calls {Scheduler.execute_request}
23
+ # a. sets resume_at based on calculated delay and waiting_for_response
24
+ # b. pushes request onto local request queue if parallel, otherwise
25
+ # executes request immediately in fiber and passes response to save_thread_response
26
+ # c. fiber yields true to main fiber to indicate it wants to continue after resume_at / response arrives
27
+ # 3. one thread for each fiber (if parallel), thread:
28
+ # a. pops request
29
+ # b. executes request
30
+ # c. pushes response onto global response queue (includes response_time)
31
+ # 4. main fiber - schedule_all loop
32
+ # a. pops any responses and calls save_thread_response on operation_worker
33
+ # b. resumes(true) operation_worker (fiber) when `Time.now` >= resume_at and not waiting_for_response
34
+ # 5. When worker fiber is finished it returns false to indicate it is finished
35
+ # OR when shutdown is called resume(false) is called to indicate worker fiber should not continue
36
+ #
37
+ # save_thread_response:
38
+ # * Updates running average and calculates next_resume_at
39
+ #
40
+ # fiber aborts processing if 2nd argument is true
41
+ # fiber returns nil when finished
42
+ #
43
+ # Workers:
44
+ # * Push process requests onto individual request queues for their thread to process, and yield(true) to scheduler
45
+ #
46
+ # when enough
47
+ #
48
+ # Thread safe Implementation:
49
+ # * Uses fibers for each authority with its own mechanize agent so operations don't need to be thread safe
50
+ # * Only Mechanize requests are run in threads in parallel whilst they wait for network response
51
+ # * Uses message passing (using Queue's) to avoid having to share state between threads.
52
+ # * Execute request does not return till the response has been received from the thread,
53
+ # so the fiber's mechanize agent that is shared with the thread isn't used in multiple threads at once
54
+ # * Only one execute request per authority fiber can be in the thread request queue at any one time
55
+ module Scheduler
56
+ # @!group Main fiber / thread Api
57
+ # These Methods should only be called from main (initial) fiber
58
+
59
class << self
  # Whether network I/O requests are processed in parallel using threads
  #
  # @return [Boolean] true if processing network I/O in parallel using threads, otherwise false
  # @note Defaults to true unless the MORPH_DISABLE_THREADS ENV variable is set
  attr_accessor :threaded

  # Predicate form of {#threaded}
  # @return (see #threaded)
  alias_method :threaded?, :threaded

  # Upper limit on concurrently registered workers
  #
  # @return [Integer] max concurrent workers using fibers and threads, defaults to MORPH_MAX_WORKERS env variable or 50
  attr_accessor :max_workers

  # Overall timeout applied by run_operations.
  # On timeout a message will be output and the ruby program will exit with exit code 124.
  #
  # @return [Integer] Overall process timeout in seconds (default MORPH_RUN_TIMEOUT ENV value or 6 hours)
  attr_accessor :run_timeout

  # @return [Hash{Symbol => Exception}] exceptions by authority
  attr_reader :exceptions

  private

  # Readers below are for internal use only
  attr_reader :initial_resume_at
  attr_reader :operation_registry
  attr_reader :reset
  attr_reader :response_queue
  attr_reader :totals
end
90
+
91
+ # Resets the scheduler state. Use before retrying failed authorities.
92
def self.reset!
  # Shut down any previously registered operations and release the old response queue
  @operation_registry&.shutdown
  @operation_registry = nil
  @response_queue&.close

  # Re-read configuration from the environment
  @threaded = ENV["MORPH_DISABLE_THREADS"].to_s.empty?
  @max_workers = [1, ENV.fetch('MORPH_MAX_WORKERS', Constants::DEFAULT_MAX_WORKERS).to_i].max
  @run_timeout = ENV.fetch('MORPH_RUN_TIMEOUT', Constants::DEFAULT_TIMEOUT).to_i

  @exceptions = {}
  # Unknown counters read as 0 without being stored; `+=` stores the updated value
  @totals = Hash.new { 0 }
  @initial_resume_at = Time.now
  # Always replace the queue: when threads are disabled it must become nil rather than
  # remain the closed queue from the previous run
  @response_queue = @threaded ? Thread::Queue.new : nil
  @operation_registry = OperationRegistry.new
  @reset = true
  nil
end
107
+
108
+ # reset on class load
109
+ reset!
110
+
111
+ # Registers a block to scrape for a specific authority
112
+ #
113
+ # Block yields(:delay) when operation.resume_at is in the future, and returns :finished when finished
114
+ # @param authority [Symbol] the name of the authority being processed
115
+ # @yield to the block containing the scraping operation to be run in the fiber
116
def self.register_operation(authority, &block)
  # The fiber body receives the value passed to its first resume: a falsy value means
  # the operation was shut down before it ever ran.
  fiber = Fiber.new do |continue|
    begin
      raise "Terminated fiber for #{authority} before block run" unless continue

      block.call
    rescue StandardError => e
      # Store exception against the authority
      exceptions[authority] = e
    ensure
      # Clean up when done regardless of success/failure
      operation_registry&.deregister
    end
    # no further requests
    nil
  end

  operation = operation_registry&.register(fiber, authority)

  if DebugUtils.basic?
    LogUtils.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving"
  end

  # Parenthesised so a missing registry counts as zero registrations instead of
  # raising NoMethodError when nil is compared with max_workers
  registered = (operation_registry&.size).to_i
  if registered >= @max_workers
    LogUtils.log "Running batch of #{registered} operations immediately"
    run_operations
  end
  # return operation for ease of testing
  operation
end
145
+
146
+ # Run all registered operations until completion
147
+ #
148
+ # @return [Hash] Exceptions that occurred during execution
149
def self.run_operations
  # Watchdog: once run_timeout seconds have passed, report and hard-exit with code 124
  monitor_run_time = Thread.new do
    sleep run_timeout
    desc = if run_timeout < 100
             "#{run_timeout} seconds"
           elsif run_timeout < 100 * 60
             "#{(run_timeout / 60.0).round(1)} minutes"
           else
             "#{(run_timeout / 3600.0).round(1)} hours"
           end
    LogUtils.log "ERROR: Script exceeded maximum allowed runtime of #{desc}!\n" \
                 "Forcibly terminating process!"
    Process.exit!(124)
  end
  # Remember how many operations were registered before any of them finish
  count = operation_registry&.size

  # Alternate between collecting thread responses and resuming whichever operation is due
  # until every operation has deregistered itself
  loop do
    break if @operation_registry.empty?

    save_thread_responses
    resume_next_operation
  end

  report_summary(count)

  exceptions
ensure
  # The watchdog is no longer needed however the run ended
  monitor_run_time.kill if monitor_run_time.alive?
  monitor_run_time.join(2)
end
175
+
176
+ # ===========================================================
177
+ # @!group Fiber Api
178
+ # These Methods should be called from the worker's own fiber but can be called from the main fiber
179
+
180
+ # Execute Mechanize network request in parallel using the fiber's thread
181
+ # This allows multiple network I/O requests to be waiting for a response in parallel
182
+ # whilst responses that have arrived can be processed by their fibers.
183
+ #
184
+ # @example Replace this code in your scraper
185
+ # page = agent.get(url_period(url, period, webguest))
186
+ #
187
+ # @example With this code
188
+ # page = ScraperUtils::Scheduler.execute_request(agent, :get, [url_period(url, period, webguest)])
189
+ #
190
+ # @param client [MechanizeClient] client to be used to process request
191
+ # @param method_name [Symbol] method to be called on client
192
+ # @param args [Array] Arguments to be used with method call
193
+ # @return [Object] response from method call on client
194
def self.execute_request(client, method_name, args)
  operation = current_operation
  # Execute immediately if not in a worker fiber. The args array is splatted so the
  # direct call matches what ProcessRequest#execute does for the same arguments.
  return client.send(method_name, *args) unless operation

  request = Scheduler::ProcessRequest.new(operation.authority, client, method_name, args)
  # Scheduler defines no `log` helper, so log through LogUtils
  LogUtils.log "Submitting request #{request.inspect}" if DebugUtils.basic?
  response = operation.submit_request(request)
  unless response.is_a?(ThreadResponse)
    raise "Expected ThreadResponse, got: #{response.inspect}"
  end

  # Returns the result, or re-raises the error captured while executing the request
  response.result!
end
207
+
208
+ # Gets the authority associated with the current fiber or thread
209
+ #
210
+ # @return [Symbol, nil] the authority name or nil if not in a fiber
211
def self.current_authority
  # Nothing to report when no operation is found for the current fiber / thread
  operation = current_operation
  operation.nil? ? nil : operation.authority
end
214
+
215
+ # @!endgroup
216
+ # ===========================================================
217
+
218
+ private
219
+
220
+ # Save results from threads in operation state so more operation fibers can be resumed
221
def self.save_thread_responses
  # Drain every response currently available (get_response returns nil when there are none)
  while (thread_response = get_response)
    operation = @operation_registry&.find(thread_response.authority)
    if operation
      operation.save_thread_response(thread_response)
    else
      # Only a response with no matching operation is orphaned; previously this
      # warning was logged for every response, including the ones just delivered
      LogUtils.log "WARNING: orphaned thread response ignored: #{thread_response.inspect}", thread_response.authority
    end
  end
end
228
+
229
+ # Resume next operation or sleep POLL_PERIOD if none are ready
230
def self.resume_next_operation
  poll_period = Constants::POLL_PERIOD
  # The first entry of the registry's can_resume list is the next candidate
  ready = @operation_registry&.can_resume
  operation = ready&.first

  if operation
    # Never sleep longer than one poll period, even if the operation is due much later
    wait = [(operation.resume_at - Time.now).to_f, poll_period].min
    if wait.positive?
      # Not due yet - wait a bit for it to become ready
      sleep(wait)
      # Split the wait between the two totals in proportion to the share of resumable operations
      delay_share = wait * ready&.size.to_f / (@operation_registry&.size || 1)
      @totals[:wait_delay] += delay_share
      @totals[:wait_response] += wait - delay_share
    else
      @totals[:resume_count] += 1
      # Due now: resume the fiber with the response to its last request
      operation.resume
    end
    operation
  else
    # Nothing can be resumed, so sleep a bit to allow thread responses to arrive
    @operation_registry&.cleanup_zombies
    sleep(poll_period)
    @totals[:wait_response] += poll_period
  end
end
257
+
258
+ # Return the next response, returns nil if queue is empty
259
+ #
260
+ # @return [ThreadResponse, nil] Result of request execution
261
def self.get_response(non_block = true)
  queue = @response_queue
  # No queue exists when responses are not being collected from threads
  return nil if queue.nil?
  # Check for emptiness first so a non-blocking pop is not attempted on an empty queue
  return nil if non_block && queue.empty?

  queue.pop(non_block)
end
266
+
267
def self.current_operation
  # Delegates the lookup to the registry (called without arguments); nil when there is no registry
  registry = @operation_registry
  registry.nil? ? nil : registry.find
end
270
+
271
def self.report_summary(count)
  delay_requested = [@totals[:wait_delay], @totals[:wait_response]].sum
  # Percentages stay at 0 when no waiting was recorded, which also avoids dividing by zero
  percent_of = lambda do |seconds|
    delay_requested.positive? ? (100.0 * seconds / delay_requested).round(1) : 0
  end
  wait_delay_percent = percent_of.call(@totals[:wait_delay])
  wait_response_percent = percent_of.call(@totals[:wait_response])

  # Blank lines before and after set the summary apart in the output
  puts
  LogUtils.log "Scheduler processed #{@totals[:resume_count]} calls for #{count} registrations, " \
               "with #{wait_delay_percent}% of #{delay_requested.round(1)} seconds spent keeping under max_load, " \
               "and #{wait_response_percent}% waiting for network I/O requests."
  puts
end
285
+ end
286
+ end