scraper_utils 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +19 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +39 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/enhancing_specs.md +100 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +9 -8
- data/docs/mechanize_utilities.md +4 -4
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +80 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +35 -34
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/spec_support.rb +67 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +12 -14
- metadata +18 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
@@ -0,0 +1,199 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "constants"
|
4
|
+
require_relative 'process_request'
|
5
|
+
|
6
|
+
module ScraperUtils
|
7
|
+
module Scheduler
|
8
|
+
# Handles the processing of a registered operation and associated fiber and thread state
|
9
|
+
class OperationWorker
|
10
|
+
|
11
|
+
# Raised when a worker is asked to proceed before it is ready: resuming without a
# saved response, or submitting a second request while one is still outstanding.
class NotReadyError < RuntimeError; end
|
12
|
+
|
13
|
+
# @return [Fiber] The fiber
|
14
|
+
attr_reader :fiber
|
15
|
+
|
16
|
+
# @return [Symbol] The authority name associated with this fiber
|
17
|
+
attr_reader :authority
|
18
|
+
|
19
|
+
# @return [Time] When the fiber should be delayed till / ready to resume at
|
20
|
+
attr_accessor :resume_at
|
21
|
+
|
22
|
+
# @return [ThreadResponse, nil] The response to be passed on the next resume
|
23
|
+
attr_accessor :response
|
24
|
+
|
25
|
+
# @return [Boolean] Waiting for a response
|
26
|
+
attr_reader :waiting_for_response
|
27
|
+
|
28
|
+
# @return [Thread] Thread used
|
29
|
+
attr_reader :thread
|
30
|
+
|
31
|
+
# @return [Thread::Queue] The request queue for the thread
|
32
|
+
attr_reader :request_queue
|
33
|
+
|
34
|
+
# Hands out staggered resume times: each value is 1ms later than the previous one
# handed out, and never earlier than (approximately) the current time.
#
# @return [Time] the next resume time to assign to a newly created worker
def self.next_resume_at
  earliest = Time.now - 0.001
  previous = @next_resume_at
  base = previous.nil? || earliest > previous ? earliest : previous
  @next_resume_at = base + 0.001
end
|
37
|
+
|
38
|
+
# Fiber has not finished running
|
39
|
+
# @return [Boolean] true while the worker's fiber has not finished running
def alive?
  @fiber.alive?
end
|
42
|
+
|
43
|
+
# Worker has the necessary state to be resumed
|
44
|
+
# A worker can be resumed only when it holds a response to pass on, has a resume
# time set, and its fiber is still alive.
#
# @return [Boolean]
def can_resume?
  return false if @response.nil? || @resume_at.nil?

  alive?
end
|
47
|
+
|
48
|
+
# Save thread response from main or worker fiber
|
49
|
+
# Stores the response to the outstanding request so the worker becomes resumable.
#
# @param response [ThreadResponse, nil] response to hand to the fiber on its next resume
# @return [ThreadResponse, nil] the response that was passed in
# @raise [RuntimeError] if the worker was not waiting for a response
def save_thread_response(response)
  raise "#{authority} Wasn't waiting for response! Got: #{response.inspect}" unless @waiting_for_response

  @response = response
  @waiting_for_response = false
  # Honour a delay requested by the response, but never schedule in the past
  @resume_at = [response&.delay_till, Time.now].compact.max
  if DebugUtils.basic?
    # This class defines no `log` method, so log through LogUtils (as the Scheduler does)
    LogUtils.log "Received #{response&.class&.name || 'nil response'} from thread for fiber #{authority} in #{response&.time_taken&.round(3)}s"
  end
  response
end
|
59
|
+
|
60
|
+
# close resources from worker fiber
|
61
|
+
# Called by worker fiber just before it exits
|
62
|
+
# Releases the worker's thread resources; must run on the worker fiber.
# Closing the request queue makes the thread's blocking pop return nil so its loop ends;
# the thread is then given up to 60 seconds to finish.
def close
  validate_fiber(main: false)
  queue = @request_queue
  worker_thread = @thread
  queue&.close
  worker_thread&.join(60)
  # Drop references so the queue and thread can be garbage collected
  @request_queue = @thread = nil
  # Ensures can_resume? is false from here on
  clear_resume_state
end
|
73
|
+
|
74
|
+
# ===================================================
|
75
|
+
# @! Main Fiber API
|
76
|
+
|
77
|
+
# Initialize a new Worker Fiber and Thread, called from the main Fiber
|
78
|
+
#
|
79
|
+
# The Thread executes ThreadRequest objects from the request_queue and pushes
|
80
|
+
# responses to the global response_queue.
|
81
|
+
#
|
82
|
+
# @param fiber [Fiber] Fiber to process authority block
|
83
|
+
# @param authority [Symbol] Authority label
|
84
|
+
# @param response_queue [Thread::Queue, nil] Queue for thread responses if enabled
|
85
|
+
# Builds the worker for one registered operation; must be called on the main fiber.
# When a response queue is supplied, a dedicated thread is started that executes each
# request popped from this worker's request queue and pushes the result onto that queue.
#
# @param fiber [Fiber] Fiber that runs the authority's block
# @param authority [Symbol] Authority label
# @param response_queue [Thread::Queue, nil] shared queue for thread responses; nil means no thread is started
# @raise [ArgumentError] if fiber or authority is missing, or when not called on the main fiber
def initialize(fiber, authority, response_queue)
  unless fiber && authority
    raise(ArgumentError, "Fiber and Authority must be provided")
  end
  validate_fiber(main: true)

  @fiber = fiber
  @authority = authority
  @response_queue = response_queue
  # Tag the fiber with its worker
  @fiber.instance_variable_set(:@operation_worker, self)

  @resume_at = self.class.next_resume_at
  @waiting_for_response = false
  # The response passed on the very first resume is ignored, so any truthy value will do
  @response = true
  return unless response_queue

  @request_queue = Thread::Queue.new
  @thread = Thread.new do
    Thread.current[:current_authority] = authority
    # pop returns nil once the queue has been closed (or dropped), which ends the loop
    while (pending = @request_queue&.pop)
      @response_queue.push pending.execute
    end
  end
end
|
107
|
+
|
108
|
+
# Resume an operation fiber and queue request if there is any from main fiber
|
109
|
+
#
|
110
|
+
# @return [ThreadRequest, nil] request returned by resume or nil if finished
|
111
|
+
# Resumes the worker fiber from the main fiber, passing in the saved response.
#
# The worker fiber queues its own request inside {#submit_request} before it yields, so
# the value handed back here is only a "wants to continue" marker. It must not be passed
# to {#submit_request} again: that method is worker-fiber only and would always raise
# when called from the main fiber.
#
# @return [Object, nil] value yielded by the fiber (true while it is waiting on a request),
#   or the fiber's final value once its block has finished
# @raise [ClosedQueueError] if the fiber is no longer alive
# @raise [NotReadyError] if there is no response to pass to the fiber
def resume
  raise ClosedQueueError unless alive?
  raise NotReadyError, "Cannot resume #{authority} without response!" unless @response
  validate_fiber(main: true)

  @fiber.resume(@response)
end
|
121
|
+
|
122
|
+
# Shutdown worker called from main fiber
|
123
|
+
# Shuts the worker down from the main fiber: clears the resume state and, if the fiber
# is still alive, resumes it with nil so it raises and runs its own cleanup.
#
# @return [Object, nil] whatever the final fiber resume returns, or nil if it was not alive
def shutdown
  validate_fiber(main: true)

  clear_resume_state
  worker_fiber = @fiber
  return unless worker_fiber&.alive?

  # A nil response tells the fiber to terminate (see #submit_request)
  worker_fiber.resume(nil)
end
|
132
|
+
|
133
|
+
# ===================================================
|
134
|
+
# @! Worker Fiber API
|
135
|
+
|
136
|
+
# Queue a thread request to be executed from worker fiber
|
137
|
+
# otherwise locally if parallel processing is disabled
|
138
|
+
#
|
139
|
+
# Process flow if parallel enabled:
|
140
|
+
# 1. This method:
|
141
|
+
# a. pushes request onto local @request_queue
|
142
|
+
# b. calls Fiber.yield(true) so Scheduler can run other fibers
|
143
|
+
# 2. Meanwhile, this fiber's thread:
|
144
|
+
# a. pops request off queue
|
145
|
+
# b. processes request
|
146
|
+
# c. pushes response to global response queue
|
147
|
+
# 3. Meanwhile, Scheduler on Main fiber:
|
148
|
+
# a. pops response from response queue as they arrive
|
149
|
+
# * calls {#save_thread_response} on associated worker to save each response
|
150
|
+
# c. calls {#resume} on worker when it is its turn (based on resume_at) and it can_resume? (has @response)
|
151
|
+
#
|
152
|
+
# If parallel processing is not enabled, then the processing occurs in the worker's fiber
|
153
|
+
#
|
154
|
+
# @param request [ThreadRequest] The request to be processed in thread
|
155
|
+
# Submits a request from the worker fiber and returns its response.
# With a request queue (parallel mode) the request is queued for the worker's thread and the
# fiber yields true until it is resumed with the response; otherwise the request is
# executed immediately in the calling fiber.
#
# @param request [ThreadRequest] The request to be processed
# @return [ThreadResponse] the response to the request
# @raise [NotReadyError] if a previous request is still awaiting its response
# @raise [ArgumentError] if request is not a ThreadRequest, or when not on the worker fiber
# @raise [RuntimeError] if the fiber is resumed with a falsy response (termination requested)
def submit_request(request)
  if @waiting_for_response
    raise NotReadyError, "Cannot make a second request before the first has responded!"
  end
  unless request.is_a? ThreadRequest
    raise ArgumentError, "Must be passed a valid ThreadRequest! Got: #{request.inspect}"
  end
  validate_fiber(main: false)

  @response = nil
  @waiting_for_response = true
  # No queue means no thread: run the request right here and record its response
  return save_thread_response(request.execute) unless @request_queue

  @request_queue.push request
  reply = Fiber.yield true
  raise "Terminated fiber for #{authority} as requested" unless reply

  reply
end
|
171
|
+
|
172
|
+
private
|
173
|
+
|
174
|
+
# Guards against calling an API from the wrong fiber.
#
# @param main [Boolean] true to require the main fiber, false to require this worker's fiber
# @return [nil] when running on the required fiber
# @raise [ArgumentError] naming the required fiber and the one actually running
def validate_fiber(main: false)
  expected = main ? Constants::MAIN_FIBER : @fiber
  running_id = Fiber.current.object_id
  return if running_id == expected.object_id

  actual = case running_id
           when Constants::MAIN_FIBER.object_id then 'main'
           when @fiber.object_id then 'worker'
           else 'other'
           end
  wanted = main ? 'main' : 'worker'
  raise ArgumentError,
        "Must be run within the #{wanted} not #{actual} fiber!"
end
|
190
|
+
|
191
|
+
# Clear resume state so the operation won't be resumed
|
192
|
+
# Wipes everything that makes the worker resumable, so can_resume? becomes false.
def clear_resume_state
  @resume_at = @response = nil
  @waiting_for_response = false
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "thread_request"
|
4
|
+
|
5
|
+
module ScraperUtils
|
6
|
+
module Scheduler
|
7
|
+
# Encapsulates a request to be executed (usually asynchronously by the fiber's thread)
|
8
|
+
class ProcessRequest < ThreadRequest
  # @return [Object] The object the method is called on
  attr_reader :subject

  # @return [Symbol] The name of the method called on the subject
  attr_reader :method_name

  # @return [Array] The arguments passed to the method
  attr_reader :args

  # Builds a request that calls a method on a subject with the given arguments
  #
  # @param authority [Symbol, nil] Authority for correlating requests and responses
  #   nil is used when threads are disabled to process locally without duplicating code
  # @param subject [Object] The object to call the method on
  # @param method_name [Symbol] The method to call on the subject
  # @param args [Array] The arguments to pass to the method
  # @raise [ArgumentError] If any required parameter is missing or invalid
  def initialize(authority, subject, method_name, args)
    super(authority)
    @subject = subject
    @method_name = method_name
    @args = args

    validate!
  end

  # Calls the method on the subject and wraps the outcome in a response.
  # The subject's @delay_till instance variable (nil when unset) is copied onto the response.
  #
  # @return [ThreadResponse] The result of the request
  def execute
    response = execute_block { subject.send(method_name, *args) }
    response.delay_till = subject.instance_variable_get(:@delay_till)
    response
  end

  private

  # Checks that every required parameter is present and usable
  #
  # @raise [ArgumentError] describing the first parameter found to be missing or invalid
  def validate!
    problem =
      if !@subject
        "Subject must be provided"
      elsif !@method_name
        "Method name must be provided"
      elsif !@args.is_a?(Array)
        "Args must be an array"
      elsif !@subject&.respond_to?(@method_name)
        "Subject must respond to method"
      end
    raise ArgumentError, problem if problem
  end
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "thread_response"
|
4
|
+
|
5
|
+
module ScraperUtils
|
6
|
+
module Scheduler
|
7
|
+
# Encapsulates a request that is pushed to the fiber's request queue to be executed by the Fiber's Thread
|
8
|
+
# The response is returned via the Scheduler's response queue
|
9
|
+
# @see {ProcessRequest}
|
10
|
+
class ThreadRequest
  # @return [Symbol, nil] Authority used to correlate requests and responses
  attr_reader :authority

  # Builds the base request
  #
  # @param authority [Symbol, nil] Authority for correlating requests and responses
  def initialize(authority)
    @authority = authority
  end

  # Executes the request and returns a ThreadResponse; subclasses implement this,
  # typically via the helper {#execute_block}
  #
  # @raise [NotImplementedError] always, in this base class
  def execute
    raise NotImplementedError, "Implement in subclass"
  end

  # Runs the given block, timing it and capturing either its value or the
  # StandardError it raised
  #
  # @yield the work to perform
  # @return [ThreadResponse] response holding the result or the error, plus the elapsed seconds
  def execute_block
    started_at = Time.now
    outcome = nil
    failure = nil
    begin
      outcome = yield
    rescue => e
      failure = e
    end
    ThreadResponse.new(authority, outcome, failure, Time.now - started_at)
  end
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ScraperUtils
|
4
|
+
module Scheduler
|
5
|
+
# Encapsulates a response from an asynchronous command execution
|
6
|
+
class ThreadResponse
  # @return [Symbol] The authority from the original request
  attr_reader :authority

  # @return [Object, nil] The value produced by the request
  attr_reader :result

  # @return [Exception, nil] The error raised during execution, if any
  attr_reader :error

  # @return [Float] Seconds taken to execute the request
  attr_reader :time_taken

  # @return [Time, nil] Optional time before which the next request should not run
  attr_accessor :delay_till

  # Builds a response
  #
  # @param authority [Symbol] The authority from the original request
  # @param result [Object, nil] The value produced by the request
  # @param error [Exception, nil] The error raised during execution, if any
  # @param time_taken [Float] Seconds taken to execute the request
  def initialize(authority, result, error, time_taken)
    @authority = authority
    @result = result
    @error = error
    @time_taken = time_taken
    @delay_till = nil
  end

  # @return [Boolean] true when no error was recorded
  def success?
    @error.nil?
  end

  # Returns the result, or re-raises the recorded error
  #
  # @return [Object] Result of the request
  # @raise [Exception] the recorded error, when the request failed
  def result!
    raise @error unless success?

    @result
  end

  # @return [String] Readable one-line representation, including the error for failures
  def inspect
    outcome = if success?
                "success"
              else
                "FAILED - #{error.class}: #{error.message}"
              end
    "#<#{self.class} authority=#{authority} #{outcome} time=#{time_taken}>"
  end
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,286 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "fiber"
|
4
|
+
|
5
|
+
require_relative "scheduler/constants"
|
6
|
+
require_relative "scheduler/operation_registry"
|
7
|
+
require_relative "scheduler/operation_worker"
|
8
|
+
|
9
|
+
# value objects
|
10
|
+
require_relative "scheduler/process_request"
|
11
|
+
require_relative "scheduler/thread_request"
|
12
|
+
|
13
|
+
module ScraperUtils
|
14
|
+
# A utility module to coordinate the scheduling of work,
|
15
|
+
# * interleaving multiple operations (scraping of an authorities site)
|
16
|
+
# uses Fibers (cooperative concurrency) so your code and the libraries you call don't have to be thread safe
|
17
|
+
# * Performing mechanize Network I/O in parallel using Threads
|
18
|
+
#
|
19
|
+
# Process flow
|
20
|
+
# 0. operation_workers start with response = true as the first resume passes args to block and response is ignored
|
21
|
+
# 1. resumes fiber of operation_worker with the last response when `Time.now` >= resume_at
|
22
|
+
# 2. worker fiber calls {Scheduler.execute_request}
|
23
|
+
# a. sets resume_at based on calculated delay and waiting_for_response
|
24
|
+
# b. pushes request onto local request queue if parallel, otherwise
|
25
|
+
# executes request immediately in fiber and passes response to save_thread_response
|
26
|
+
# c. fiber yields true to main fiber to indicate it wants to continue after resume_at / response arrives
|
27
|
+
# 3. one thread for each fiber (if parallel), thread:
|
28
|
+
# a. pops request
|
29
|
+
# b. executes request
|
30
|
+
# c. pushes response onto global response queue (includes response_time)
|
31
|
+
# 4. main fiber - schedule_all loop
|
32
|
+
# a. pops any responses and calls save_thread_response on operation_worker
|
33
|
+
# c. resumes(true) operation_worker (fiber) when `Time.now` >= resume_at and not waiting_for_response
|
34
|
+
# 5. When worker fiber is finished it returns false to indicate it is finished
|
35
|
+
# OR when shutdown is called resume(false) is called to indicate worker fiber should not continue
|
36
|
+
#
|
37
|
+
# save_thread_response:
|
38
|
+
# * Updates running average and calculates next_resume_at
|
39
|
+
#
|
40
|
+
# fiber aborts processing if 2nd argument is true
|
41
|
+
# fiber returns nil when finished
|
42
|
+
#
|
43
|
+
# Workers:
|
44
|
+
# * Push process requests onto individual request queues for their thread to process, and yield(true) to scheduler
|
45
|
+
#
|
46
|
+
# when enough
|
47
|
+
#
|
48
|
+
# Thread safe Implementation:
|
49
|
+
# * Uses fibers for each authority with its own mechanize agent so operations don't need to be thread safe
|
50
|
+
# * Only Mechanize requests are run in threads in parallel whilst they wait for network response
|
51
|
+
# * Uses message passing (using Queue's) to avoid having to share state between threads.
|
52
|
+
# * Execute request does not return till the response has been received from the thread,
|
53
|
+
# so the fiber's mechanize agent that is shared with the thread isn't used in multiple threads at once
|
54
|
+
# * Only one execute request per authority fiber can be in the thread request queue at any one time
|
55
|
+
module Scheduler
|
56
|
+
# @!group Main fiber / thread Api
|
57
|
+
# These Methods should only be called from main (initial) fiber
|
58
|
+
|
59
|
+
class << self
  # Controls if network I/O requests will be processed in parallel using threads
  #
  # @return [Boolean] true if processing network I/O in parallel using threads, otherwise false
  # @note reset! sets this to true unless the MORPH_DISABLE_THREADS ENV variable is set
  attr_accessor :threaded

  # @return (see #threaded)
  alias threaded? threaded

  # Controls how many operations may be registered before a batch is run immediately
  # (see register_operation)
  #
  # @return [Integer] max concurrent workers using fibers and threads; reset! takes it from the
  #   MORPH_MAX_WORKERS ENV variable, falling back to Constants::DEFAULT_MAX_WORKERS, minimum 1
  attr_accessor :max_workers

  # @return [Hash{Symbol => Exception}] exceptions by authority
  attr_reader :exceptions

  # Returns the run_operations timeout
  # On timeout a message will be output and the ruby program will exit with exit code 124.
  #
  # @return [Integer] Overall process timeout in seconds; reset! takes it from the
  #   MORPH_RUN_TIMEOUT ENV variable, falling back to Constants::DEFAULT_TIMEOUT
  attr_accessor :run_timeout

  # Private accessors for internal use

  private

  attr_reader :initial_resume_at, :operation_registry, :reset, :response_queue, :totals

end
|
90
|
+
|
91
|
+
# Resets the scheduler state. Use before retrying failed authorities.
|
92
|
+
# Resets the scheduler state. Use before retrying failed authorities.
# Shuts down any existing registry, closes the previous response queue and re-reads the
# MORPH_DISABLE_THREADS, MORPH_MAX_WORKERS and MORPH_RUN_TIMEOUT ENV variables.
#
# @return [nil]
def self.reset!
  @operation_registry&.shutdown
  @operation_registry = nil
  @response_queue&.close
  @threaded = ENV["MORPH_DISABLE_THREADS"].to_s.empty?
  @max_workers = [1, ENV.fetch('MORPH_MAX_WORKERS', Constants::DEFAULT_MAX_WORKERS).to_i].max
  @exceptions = {}
  @totals = Hash.new { 0 }
  @initial_resume_at = Time.now
  # When threads are disabled, drop the previous (now closed) queue rather than leaving a
  # closed queue behind for later use
  @response_queue = @threaded ? Thread::Queue.new : nil
  @operation_registry = OperationRegistry.new
  @reset = true
  @run_timeout = ENV.fetch('MORPH_RUN_TIMEOUT', Constants::DEFAULT_TIMEOUT).to_i
  nil
end
|
107
|
+
|
108
|
+
# reset on class load
|
109
|
+
reset!
|
110
|
+
|
111
|
+
# Registers a block to scrape for a specific authority
|
112
|
+
#
|
113
|
+
# Block yields(:delay) when operation.resume_at is in the future, and returns :finished when finished
|
114
|
+
# @param authority [Symbol] the name of the authority being processed
|
115
|
+
# @yield to the block containing the scraping operation to be run in the fiber
|
116
|
+
# Wraps the block in a fiber and registers it for the authority.
# Inside the fiber, any StandardError is stored in exceptions under the authority and the
# operation is always deregistered afterwards. Once the number of registered operations
# reaches max_workers the batch is run immediately.
#
# @param authority [Symbol] the name of the authority being processed
# @yield the scraping operation to be run in the fiber
# @return the value returned by the registry's register call (returned for ease of testing)
def self.register_operation(authority, &block)
  fiber = Fiber.new do |proceed|
    begin
      # A falsy first resume means the fiber is being shut down before it started
      raise "Terminated fiber for #{authority} before block run" unless proceed

      block.call
    rescue StandardError => error
      exceptions[authority] = error
    ensure
      operation_registry&.deregister
    end
    # nil signals there are no further requests
    nil
  end

  operation = operation_registry&.register(fiber, authority)
  LogUtils.log "Registered #{authority} operation with fiber: #{fiber.object_id} for interleaving" if DebugUtils.basic?

  registered = operation_registry&.size
  if registered >= @max_workers
    LogUtils.log "Running batch of #{registered} operations immediately"
    run_operations
  end
  operation
end
|
145
|
+
|
146
|
+
# Run all registered operations until completion
|
147
|
+
#
|
148
|
+
# @return [Hash] Exceptions that occurred during execution
|
149
|
+
# Runs the scheduling loop until the registry is empty, then reports a summary.
# A watchdog thread force-exits the process with status 124 if run_timeout seconds pass
# first; it is killed once the loop finishes (or raises).
#
# @return [Hash] Exceptions that occurred during execution
def self.run_operations
  monitor_run_time = Thread.new do
    sleep run_timeout
    limit = run_timeout
    # Describe the limit in the most readable unit
    desc =
      if limit < 100
        "#{limit} seconds"
      elsif limit < 100 * 60
        "#{(limit / 60.0).round(1)} minutes"
      else
        "#{(limit / 3600.0).round(1)} hours"
      end
    LogUtils.log "ERROR: Script exceeded maximum allowed runtime of #{desc}!\n" \
                 "Forcibly terminating process!"
    Process.exit!(124)
  end
  registrations = operation_registry&.size

  # Main scheduling loop - process till there is nothing left to do
  until @operation_registry.empty?
    save_thread_responses
    resume_next_operation
  end

  report_summary(registrations)

  exceptions
ensure
  # Stop the watchdog now that we are done
  monitor_run_time.kill if monitor_run_time.alive?
  monitor_run_time.join(2)
end
|
175
|
+
|
176
|
+
# ===========================================================
|
177
|
+
# @!group Fiber Api
|
178
|
+
# These Methods should be called from the worker's own fiber but can be called from the main fiber
|
179
|
+
|
180
|
+
# Execute Mechanize network request in parallel using the fiber's thread
|
181
|
+
# This allows multiple network I/O requests to be waiting for a response in parallel
|
182
|
+
# whilst responses that have arrived can be processed by their fibers.
|
183
|
+
#
|
184
|
+
# @example Replace this code in your scraper
|
185
|
+
# page = agent.get(url_period(url, period, webguest))
|
186
|
+
#
|
187
|
+
# @example With this code
|
188
|
+
# page = ScraperUtils::Scheduler.execute_request(agent, :get, [url_period(url, period, webguest)])
|
189
|
+
#
|
190
|
+
# @param client [MechanizeClient] client to be used to process request
|
191
|
+
# @param method_name [Symbol] method to be called on client
|
192
|
+
# @param args [Array] Arguments to be used with method call
|
193
|
+
# @return [Object] response from method call on client
|
194
|
+
# Calls method_name on client with args, via the current worker when there is one.
# Outside a worker the call is made directly and its value returned; inside a worker the
# call is wrapped in a ProcessRequest and submitted through the worker, and the response's
# result is returned (or its error raised).
#
# @param client [Object] object the method is called on (e.g. a Mechanize agent)
# @param method_name [Symbol] method to be called on client
# @param args [Array] Arguments to be used with method call
# @return [Object] response from method call on client
# @raise [RuntimeError] if the worker returns something other than a ThreadResponse
def self.execute_request(client, method_name, args)
  operation = current_operation
  # execute immediately if not in a worker fiber; splat args exactly as
  # ProcessRequest#execute does so both paths call the method the same way
  return client.send(method_name, *args) unless operation

  request = Scheduler::ProcessRequest.new(operation.authority, client, method_name, args)
  # No `log` method is defined on this module, so log through LogUtils
  LogUtils.log "Submitting request #{request.inspect}" if DebugUtils.basic?
  response = operation.submit_request(request)
  unless response.is_a?(ThreadResponse)
    raise "Expected ThreadResponse, got: #{response.inspect}"
  end
  response.result!
end
|
207
|
+
|
208
|
+
# Gets the authority associated with the current fiber or thread
|
209
|
+
#
|
210
|
+
# @return [Symbol, nil] the authority name or nil if not in a fiber
|
211
|
+
# @return [Symbol, nil] authority of the current operation, or nil when there is none
def self.current_authority
  operation = current_operation
  operation.nil? ? nil : operation.authority
end
|
214
|
+
|
215
|
+
# @!endgroup
|
216
|
+
# ===========================================================
|
217
|
+
|
218
|
+
private
|
219
|
+
|
220
|
+
# Save results from threads in operation state so more operation fibers can be resumed
|
221
|
+
# Drains the available thread responses, handing each to the operation registered for its
# authority so that operation can be resumed. A response whose authority has no registered
# operation is logged as orphaned and dropped.
def self.save_thread_responses
  while (thread_response = get_response)
    operation = @operation_registry&.find(thread_response.authority)
    if operation
      operation.save_thread_response(thread_response)
    else
      # Only warn when nothing claimed the response (previously logged for every response)
      LogUtils.log "WARNING: orphaned thread response ignored: #{thread_response.inspect}", thread_response.authority
    end
  end
end
|
228
|
+
|
229
|
+
# Resume next operation or sleep POLL_PERIOD if none are ready
|
230
|
+
# Performs one scheduling step using the first operation the registry reports as resumable.
# * none resumable: cleans up zombies and sleeps one poll period (counted as waiting for responses)
# * resumable but not yet due: sleeps until it is due, capped at one poll period, and splits
#   that time between the wait_delay and wait_response totals
# * due now: resumes it
#
# @return [Object] the operation considered, or the updated wait_response total when none was resumable
def self.resume_next_operation
  poll_period = Constants::POLL_PERIOD
  ready = @operation_registry&.can_resume
  operation = ready&.first

  unless operation
    # All the fibers must be waiting for responses, so sleep a bit to allow the responses to arrive
    @operation_registry&.cleanup_zombies
    sleep(poll_period)
    return @totals[:wait_response] += poll_period
  end

  wait = [(operation.resume_at - Time.now).to_f, poll_period].min
  if wait.positive?
    # Wait a bit for a fiber to be ready to run
    sleep(wait)
    # Attribute the sleep to "delay" in proportion to the share of operations that were resumable
    delay_share = wait * ready&.size.to_f / (@operation_registry&.size || 1)
    @totals[:wait_delay] += delay_share
    @totals[:wait_response] += wait - delay_share
  else
    @totals[:resume_count] += 1
    # resume fiber with response to last request that is ready to be resumed now
    operation.resume
  end
  operation
end
|
257
|
+
|
258
|
+
# Return the next response, returns nil if queue is empty
|
259
|
+
#
|
260
|
+
# @return [ThreadResponse, nil] Result of request execution
|
261
|
+
# Pops the next thread response, if any.
#
# @param non_block [Boolean] when true (default) return nil instead of waiting on an empty queue
# @return [ThreadResponse, nil] next response, or nil when there is no queue or nothing is waiting
def self.get_response(non_block = true)
  queue = @response_queue
  return nil if queue.nil?
  return nil if non_block && queue.empty?

  queue.pop(non_block)
end
|
266
|
+
|
267
|
+
# Asks the registry for the current operation by calling find with no arguments
# (presumably the worker belonging to the current fiber - confirm in OperationRegistry).
#
# @return [OperationWorker, nil] nil when there is no registry
def self.current_operation
  registry = @operation_registry
  registry.nil? ? nil : registry.find
end
|
270
|
+
|
271
|
+
# Logs a one-line summary of the run: number of resumes, number of registrations and how
# the accumulated waiting time was split between deliberate delays and waiting for responses.
# The log line is surrounded by blank lines on stdout.
#
# @param count [Integer] number of operations that were registered for the run
def self.report_summary(count)
  waits = [@totals[:wait_delay], @totals[:wait_response]]
  delay_requested = waits.sum
  # Percentages stay at 0 when nothing was waited for, avoiding a division by zero
  wait_delay_percent, wait_response_percent =
    if delay_requested.positive?
      waits.map { |waited| (100.0 * waited / delay_requested).round(1) }
    else
      [0, 0]
    end
  summary = "Scheduler processed #{@totals[:resume_count]} calls for #{count} registrations, " \
            "with #{wait_delay_percent}% of #{delay_requested.round(1)} seconds spent keeping under max_load, " \
            "and #{wait_response_percent}% waiting for network I/O requests."
  puts
  LogUtils.log summary
  puts
end
|
285
|
+
end
|
286
|
+
end
|