scraper_utils 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +5 -0
- data/CHANGELOG.md +19 -0
- data/GUIDELINES.md +2 -1
- data/Gemfile +1 -0
- data/IMPLEMENTATION.md +39 -0
- data/README.md +29 -23
- data/SPECS.md +13 -1
- data/bin/rspec +27 -0
- data/docs/enhancing_specs.md +100 -0
- data/docs/example_scrape_with_fibers.rb +4 -4
- data/docs/fibers_and_threads.md +72 -0
- data/docs/getting_started.md +6 -6
- data/docs/interleaving_requests.md +9 -8
- data/docs/mechanize_utilities.md +4 -4
- data/docs/parallel_requests.md +138 -0
- data/docs/randomizing_requests.md +12 -8
- data/docs/reducing_server_load.md +6 -6
- data/lib/scraper_utils/data_quality_monitor.rb +2 -3
- data/lib/scraper_utils/date_range_utils.rb +37 -78
- data/lib/scraper_utils/debug_utils.rb +5 -5
- data/lib/scraper_utils/log_utils.rb +15 -0
- data/lib/scraper_utils/mechanize_actions.rb +37 -8
- data/lib/scraper_utils/mechanize_utils/adaptive_delay.rb +80 -0
- data/lib/scraper_utils/mechanize_utils/agent_config.rb +35 -34
- data/lib/scraper_utils/mechanize_utils/robots_checker.rb +151 -0
- data/lib/scraper_utils/mechanize_utils.rb +8 -5
- data/lib/scraper_utils/randomize_utils.rb +22 -19
- data/lib/scraper_utils/scheduler/constants.rb +12 -0
- data/lib/scraper_utils/scheduler/operation_registry.rb +101 -0
- data/lib/scraper_utils/scheduler/operation_worker.rb +199 -0
- data/lib/scraper_utils/scheduler/process_request.rb +59 -0
- data/lib/scraper_utils/scheduler/thread_request.rb +51 -0
- data/lib/scraper_utils/scheduler/thread_response.rb +59 -0
- data/lib/scraper_utils/scheduler.rb +286 -0
- data/lib/scraper_utils/spec_support.rb +67 -0
- data/lib/scraper_utils/version.rb +1 -1
- data/lib/scraper_utils.rb +12 -14
- metadata +18 -6
- data/lib/scraper_utils/adaptive_delay.rb +0 -70
- data/lib/scraper_utils/fiber_scheduler.rb +0 -229
- data/lib/scraper_utils/robots_checker.rb +0 -149
data/lib/scraper_utils/mechanize_utils/agent_config.rb:

```diff
@@ -24,9 +24,9 @@ module ScraperUtils
   # )
   class AgentConfig
     DEFAULT_TIMEOUT = 60
-    DEFAULT_RANDOM_DELAY =
-    DEFAULT_MAX_LOAD =
-    MAX_LOAD_CAP =
+    DEFAULT_RANDOM_DELAY = 0
+    DEFAULT_MAX_LOAD = 50.0
+    MAX_LOAD_CAP = 80.0
 
     # Class-level defaults that can be modified
     class << self
@@ -67,10 +67,10 @@ module ScraperUtils
       # Reset all configuration options to their default values
       # @return [void]
       def reset_defaults!
-        @default_timeout = ENV.fetch('
+        @default_timeout = ENV.fetch('MORPH_CLIENT_TIMEOUT', DEFAULT_TIMEOUT).to_i # 60
         @default_compliant_mode = ENV.fetch('MORPH_NOT_COMPLIANT', nil).to_s.empty? # true
-        @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i #
-        @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f #
+        @default_random_delay = ENV.fetch('MORPH_RANDOM_DELAY', DEFAULT_RANDOM_DELAY).to_i # 0
+        @default_max_load = ENV.fetch('MORPH_MAX_LOAD', DEFAULT_MAX_LOAD).to_f # 50.0
         @default_disable_ssl_certificate_check = !ENV.fetch('MORPH_DISABLE_SSL_CHECK', nil).to_s.empty? # false
         @default_australian_proxy = !ENV.fetch('MORPH_USE_PROXY', nil).to_s.empty? # false
         @default_user_agent = ENV.fetch('MORPH_USER_AGENT', nil) # Uses Mechanize user agent
```
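The class-level defaults above are read from Morph-style environment variables when the class loads. A minimal sketch of overriding them and re-reading (the values are invented; only the MORPH_* names and defaults come from the diff):

```ruby
require "scraper_utils"

# Invented values; only the MORPH_* names and their defaults come from the diff above.
ENV["MORPH_CLIENT_TIMEOUT"] = "30" # instead of DEFAULT_TIMEOUT (60)
ENV["MORPH_RANDOM_DELAY"]   = "5"  # instead of DEFAULT_RANDOM_DELAY (0)
ENV["MORPH_MAX_LOAD"]       = "20" # instead of DEFAULT_MAX_LOAD (50.0)

# Re-read the defaults after changing ENV
ScraperUtils::MechanizeUtils::AgentConfig.reset_defaults!
```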
data/lib/scraper_utils/mechanize_utils/agent_config.rb (continued):

```diff
@@ -85,7 +85,7 @@ module ScraperUtils
 
     # Give access for testing
 
-    attr_reader :max_load, :
+    attr_reader :max_load, :random_range
 
     # Creates Mechanize agent configuration with sensible defaults overridable via configure
     # @param timeout [Integer, nil] Timeout for agent connections (default: 60)
@@ -125,21 +125,21 @@ module ScraperUtils
       @australian_proxy &&= !ScraperUtils.australian_proxy.to_s.empty?
       if @australian_proxy
         uri = begin
-
-
-
-
+          URI.parse(ScraperUtils.australian_proxy.to_s)
+        rescue URI::InvalidURIError => e
+          raise URI::InvalidURIError, "Invalid proxy URL format: #{e.message}"
+        end
         unless uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
           raise URI::InvalidURIError, "Proxy URL must start with http:// or https://"
         end
-        unless uri.host && uri.port
+        unless !uri.host.to_s.empty? && uri.port&.positive?
           raise URI::InvalidURIError, "Proxy URL must include host and port"
         end
       end
 
-      if @random_delay
-
-        @
+      if @random_delay&.positive?
+        min_random = Math.sqrt(@random_delay * 3.0 / 13.0)
+        @random_range = min_random.round(3)..(3 * min_random).round(3)
       end
 
       today = Date.today.strftime("%Y-%m-%d")
```
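The single @random_delay value is replaced by a @random_range chosen so that the squared uniform sample used later averages back to the configured delay. A standalone sanity check (not gem code; random_delay = 5 is an invented example value):

```ruby
# For x uniform on min..3*min, E[x**2] = 13 * min**2 / 3, so picking
# min = sqrt(random_delay * 3.0 / 13.0) makes the squared sample average random_delay.
random_delay = 5
min_random = Math.sqrt(random_delay * 3.0 / 13.0)
random_range = min_random.round(3)..(3 * min_random).round(3)

samples = Array.new(100_000) { rand(random_range)**2 }
puts format("mean delay %.2f s (target %d s)", samples.sum / samples.size, random_delay)
```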
data/lib/scraper_utils/mechanize_utils/agent_config.rb (continued):

```diff
@@ -177,7 +177,6 @@ module ScraperUtils
         verify_proxy_works(agent)
       end
 
-      @connection_started_at = nil
       agent.pre_connect_hooks << method(:pre_connect_hook)
       agent.post_connect_hooks << method(:post_connect_hook)
     end
@@ -193,11 +192,11 @@ module ScraperUtils
         "australian_proxy=#{@australian_proxy.inspect}"
       end
       display_args << "compliant_mode" if @compliant_mode
-      display_args << "random_delay=#{@random_delay}" if @random_delay
+      display_args << "random_delay=#{@random_delay}" if @random_delay&.positive?
       display_args << "max_load=#{@max_load}%" if @max_load
       display_args << "disable_ssl_certificate_check" if @disable_ssl_certificate_check
       display_args << "default args" if display_args.empty?
-      ScraperUtils::
+      ScraperUtils::LogUtils.log(
        "Configuring Mechanize agent with #{display_args.join(', ')}"
       )
     end
@@ -206,7 +205,7 @@ module ScraperUtils
       @connection_started_at = Time.now
       return unless DebugUtils.verbose?
 
-      ScraperUtils::
+      ScraperUtils::LogUtils.log(
        "Pre Connect request: #{request.inspect} at #{@connection_started_at}"
       )
     end
@@ -216,9 +215,9 @@ module ScraperUtils
 
       response_time = Time.now - @connection_started_at
       if DebugUtils.basic?
-        ScraperUtils::
+        ScraperUtils::LogUtils.log(
          "Post Connect uri: #{uri.inspect}, response: #{response.inspect} " \
-
+          "after #{response_time} seconds"
         )
       end
 
@@ -227,33 +226,35 @@ module ScraperUtils
         "URL is disallowed by robots.txt specific rules: #{uri}"
       end
 
-
-
-
-
-
-
+      @delay_till = nil
+      @delay = @robots_checker&.crawl_delay&.round(3)
+      debug_msg = "Delaying robots.txt: crawl_delay #{@delay} seconds"
+      unless @delay&.positive?
+        delays = {
+          max_load: @adaptive_delay&.next_delay(uri, response_time)&.round(3),
+          random: (@random_range ? (rand(@random_range) ** 2).round(3) : nil)
+        }
+        @delay = [delays[:max_load], delays[:random]].compact.sum
+        debug_msg = "Delaying #{@delay} seconds, sum of: #{delays.inspect}"
+      end
       if @delay&.positive?
-
-        ScraperUtils::
-        $stdout.flush
-        ScraperUtils::FiberScheduler.delay(@delay)
+        @delay_till = Time.now + @delay
+        ScraperUtils::LogUtils.log(debug_msg) if ScraperUtils::DebugUtils.basic?
       end
-
       response
     end
 
     def verify_proxy_works(agent)
       $stderr.flush
       $stdout.flush
-
+      LogUtils.log "Checking proxy works..."
       my_ip = MechanizeUtils.public_ip(agent)
       begin
         IPAddr.new(my_ip)
       rescue IPAddr::InvalidAddressError => e
         raise "Invalid public IP address returned by proxy check: #{my_ip.inspect}: #{e}"
       end
-      ScraperUtils::
+      ScraperUtils::LogUtils.log "Proxy is using IP address: #{my_ip.inspect}"
       my_headers = MechanizeUtils.public_headers(agent)
       begin
         # Check response is JSON just to be safe!
```
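Putting the post_connect_hook change together: a positive robots.txt crawl delay wins outright, otherwise the adaptive (max_load) and random components are summed, and the hook now records @delay_till rather than sleeping via the removed FiberScheduler. A rough sketch with invented numbers, mirroring the branch structure rather than calling the gem:

```ruby
# Invented numbers; mirrors the logic added above rather than calling the gem.
crawl_delay = nil                              # robots.txt Crawl-delay (wins when positive)
adaptive    = 1.2                              # e.g. @adaptive_delay.next_delay(uri, response_time)
random      = (rand(1.074..3.222)**2).round(3) # squared sample from @random_range (random_delay = 5)

delay = if crawl_delay&.positive?
          crawl_delay
        else
          [adaptive, random].compact.sum
        end
delay_till = Time.now + delay # stored as @delay_till; the hook no longer sleeps itself
puts "delay=#{delay}s, until #{delay_till}"
```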
data/lib/scraper_utils/mechanize_utils/robots_checker.rb (new file):

```diff
@@ -0,0 +1,151 @@
+# frozen_string_literal: true
+
+module ScraperUtils
+  module MechanizeUtils
+    # robots.txt checker with deliberately simplistic rules
+    class RobotsChecker
+      # @return [String] Lowercased user_agent for matching
+      attr_reader :user_agent
+
+      # Initialize with full user agent string like:
+      # "Mozilla/5.0 (compatible; ScraperUtils/0.1.0 2025-02-22; +https://github.com/ianheggie-oaf/scraper_utils)"
+      # Extracts the bot name (e.g. "ScraperUtils") to check against robots.txt
+      # Checks for
+      # * Disallow for User-agent: bot_name and
+      # * Crawl-delay from either User-agent: bot name or * (default)
+      def initialize(user_agent)
+        @user_agent = extract_user_agent(user_agent).downcase
+        if DebugUtils.basic?
+          ScraperUtils::LogUtils.log(
+            "Checking robots.txt for user agent prefix: #{@user_agent} (case insensitive)"
+          )
+        end
+        @rules = {} # domain -> {rules: [], delay: int}
+        @delay = nil # Delay from last robots.txt check
+      end
+
+      # Check if a URL is disallowed based on robots.txt rules specific to our user agent
+      # @param url [String] The full URL to check
+      # @return [Boolean] true if specifically blocked for our user agent, otherwise false
+      def disallowed?(url)
+        return false unless url
+
+        uri = URI(url)
+        domain = "#{uri.scheme}://#{uri.host}"
+        path = uri.path || "/"
+
+        # Get or fetch robots.txt rules
+        rules = get_rules(domain)
+        return false unless rules # If we can't get robots.txt, assume allowed
+
+        # Store any delay found for this domain
+        @delay = rules[:our_delay]
+
+        # Check rules specific to our user agent
+        matches_any_rule?(path, rules[:our_rules])
+      end
+
+      # Returns the crawl delay (if any) that applied to the last URL checked
+      # Should be called after disallowed? to get relevant delay
+      # @return [Integer, nil] The delay in seconds, or nil if no delay specified
+      def crawl_delay
+        @delay
+      end
+
+      private
+
+      def extract_user_agent(user_agent)
+        if user_agent =~ /^(.*compatible;\s*)?([-_a-z0-9]+)/i
+          user_agent = ::Regexp.last_match(2)&.strip
+        end
+        user_agent&.strip
+      end
+
+      def matches_any_rule?(path, rules)
+        rules&.any? { |rule| path.start_with?(rule) }
+      end
+
+      def get_rules(domain)
+        return @rules[domain] if @rules.key?(domain)
+
+        begin
+          response = Net::HTTP.get_response(URI("#{domain}/robots.txt"))
+          return nil unless response.code.start_with?("2") # 2xx response
+
+          rules = parse_robots_txt(response.body)
+          @rules[domain] = rules
+          rules
+        rescue StandardError => e
+          if DebugUtils.basic?
+            ScraperUtils::LogUtils.log(
+              "WARNING: Failed to fetch robots.txt for #{domain}: #{e.message}"
+            )
+          end
+          nil
+        end
+      end
+
+      # Parse robots.txt content into structured rules
+      # Only collects rules for our specific user agent and generic crawl-delay
+      # @param content [String] The robots.txt content
+      # @return [Hash] Hash containing :our_rules and :our_delay
+      def parse_robots_txt(content)
+        sections = [] # Array of {agent:, rules:[], delay:} hashes
+        current_section = nil
+
+        content.each_line do |line|
+          line = line.strip.downcase
+          next if line.empty? || line.start_with?("#")
+
+          if line.start_with?("user-agent:")
+            agent = line.split(":", 2).last.strip
+            # Check if this is a continuation of the previous section
+            if current_section && current_section[:rules].empty? && current_section[:delay].nil?
+              current_section[:agents] << agent
+            else
+              current_section = { agents: [agent], rules: [], delay: nil }
+              sections << current_section
+            end
+            next
+          end
+
+          next unless current_section # Skip rules before first user-agent
+
+          if line.start_with?("disallow:")
+            path = line.split(":", 2).last.strip
+            current_section[:rules] << path unless path.empty?
+          elsif line.start_with?("crawl-delay:")
+            delay = line.split(":", 2).last.strip.to_i
+            current_section[:delay] = delay if delay.positive?
+          end
+        end
+
+        # Sort sections by most specific agent match first
+        matched_section = sections.find do |section|
+          section[:agents].any? do |agent|
+            # Our user agent starts with the agent from robots.txt
+            @user_agent.start_with?(agent) ||
+              # Or the agent from robots.txt starts with our user agent
+              # (handles ScraperUtils matching ScraperUtils/1.0)
+              agent.start_with?(@user_agent)
+          end
+        end
+
+        # Use matched section or fall back to wildcard
+        if matched_section
+          {
+            our_rules: matched_section[:rules],
+            our_delay: matched_section[:delay]
+          }
+        else
+          # Find default section
+          default_section = sections.find { |s| s[:agents].include?("*") }
+          {
+            our_rules: [],
+            our_delay: default_section&.dig(:delay)
+          }
+        end
+      end
+    end
+  end
+end
```
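A short usage sketch for the new RobotsChecker (the user agent string and URL are invented; it assumes the gem is loaded):

```ruby
require "scraper_utils"

checker = ScraperUtils::MechanizeUtils::RobotsChecker.new(
  "Mozilla/5.0 (compatible; ScraperUtils/0.7.0; +https://github.com/ianheggie-oaf/scraper_utils)"
)

url = "https://example.com/planning/applications" # invented URL
if checker.disallowed?(url)
  puts "Skipping #{url} - disallowed by robots.txt for our user agent"
else
  puts "Allowed; crawl_delay: #{checker.crawl_delay.inspect} seconds"
end
```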
data/lib/scraper_utils/mechanize_utils.rb:

```diff
@@ -2,7 +2,10 @@
 
 require "mechanize"
 require "ipaddr"
-
+
+require_relative "mechanize_utils/adaptive_delay"
+require_relative "mechanize_utils/agent_config"
+require_relative "mechanize_utils/robots_checker"
 
 module ScraperUtils
   # Utilities for configuring and using Mechanize for web scraping
@@ -43,8 +46,8 @@ module ScraperUtils
 
     # Retrieves and logs the public IP address
     #
-    # @param agent [Mechanize, nil] Mechanize agent to use for IP
-    # @param force [Boolean] Force a new IP
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP find or nil when clearing cache
+    # @param force [Boolean] Force a new IP find, by clearing cache first
     # @return [String, nil] The public IP address
     def self.public_ip(agent = nil, force: false)
       @public_ip = nil if force
@@ -57,8 +60,8 @@ module ScraperUtils
 
     # Retrieves and logs the headers that make it through the proxy
    #
-    # @param agent [Mechanize, nil] Mechanize agent to use for IP
-    # @param force [Boolean] Force a new IP
+    # @param agent [Mechanize, nil] Mechanize agent to use for IP find or nil when clearing cache
+    # @param force [Boolean] Force a new IP find, by clearing cache first
     # @return [String, nil] The list of headers in json format
     def self.public_headers(agent = nil, force: false)
       @public_headers = nil if force
```
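Illustrative calls for the two helpers whose docs changed above (a sketch only; it assumes a plain Mechanize agent is acceptable and the gem is loaded):

```ruby
require "mechanize"
require "scraper_utils"

agent = Mechanize.new
puts ScraperUtils::MechanizeUtils.public_ip(agent)              # fetched once, then cached
puts ScraperUtils::MechanizeUtils.public_ip(agent, force: true) # clear the cache and fetch again
puts ScraperUtils::MechanizeUtils.public_headers(agent)         # headers seen upstream, as JSON
```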
data/lib/scraper_utils/randomize_utils.rb:

```diff
@@ -4,31 +4,34 @@ module ScraperUtils
   # Provides utilities for randomizing processing order in scrapers,
   # particularly helpful for distributing load and avoiding predictable patterns
   module RandomizeUtils
-
-
-
-
-
-
-      return collection.to_a if sequential?
+    class << self
+      # Controls if processing order can be randomized
+      #
+      # @return [Boolean] true if all processing is done sequentially, otherwise false
+      # @note Defaults to true unless the MORPH_DISABLE_RANDOM ENV variable is set
+      attr_accessor :random
 
-
+      # Reports if processing order will be randomized
+      #
+      # @return (see #random)
+      alias random? random
     end
 
-
-
-    # @return [Boolean] true when in test mode or MORPH_PROCESS_SEQUENTIALLY is set
-    def self.sequential?
-      @sequential = !ENV["MORPH_PROCESS_SEQUENTIALLY"].to_s.empty? if @sequential.nil?
-      @sequential || false
+    def self.reset!
+      @random = ENV["MORPH_DISABLE_RANDOM"].to_s.empty?
     end
 
-    #
+    # reset on class load
+    reset!
+
+    # Returns a randomized version of the input collection unless `.sequential?` is true.
     #
-    # @param
-    # @return [
-    def self.
-
+    # @param collection [Array, Enumerable] Collection of items
+    # @return [Array] Randomized unless {.sequential?} is true, otherwise original order
+    def self.randomize_order(collection)
+      return collection.to_a.shuffle if random?
+
+      collection.to_a
     end
   end
 end
```
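Typical use of the reworked RandomizeUtils, based only on the methods shown above (the authority list is invented):

```ruby
require "scraper_utils"

authorities = %i[ballina bega_valley broken_hill] # invented list
ScraperUtils::RandomizeUtils.randomize_order(authorities).each do |authority|
  # ... scrape authority ...
end

# Keep the original order (e.g. in specs) either via ENV before load:
#   MORPH_DISABLE_RANDOM=1
# or directly via the accessor shown above:
ScraperUtils::RandomizeUtils.random = false
```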
data/lib/scraper_utils/scheduler/operation_registry.rb (new file):

```diff
@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+
+require "fiber"
+
+require_relative "operation_worker"
+
+module ScraperUtils
+  module Scheduler
+    # Registry of all active OperationWorkers registered to be processed
+    class OperationRegistry
+
+      def initialize
+        @operations = {}
+        @fiber_ids = {}
+      end
+
+      def register(fiber, authority)
+        authority = authority.to_sym
+        operation = OperationWorker.new(fiber, authority, @response_queue)
+        @operations[authority] = operation
+        @fiber_ids[operation.fiber.object_id] = operation
+      end
+
+      # Remove yourself from registry, called from fiber
+      def deregister
+        operation = find
+        return unless operation
+
+        operation.close
+        # Remove operation from registry since shutdown has done all it can to shut down the thread and fiber
+        @operations.delete(operation.authority)
+        @fiber_ids.delete(operation.fiber.object_id)
+      end
+
+      def current_authority
+        find(Fiber.current.object_id)&.authority
+      end
+
+      # Find OperationWorker
+      # @param key [Integer, String, nil] Fiber's object_id or authority (default current fiber's object_id)
+      # @return [OperationWorker, nil] Returns worker or nil if not found
+      def find(key = nil)
+        key ||= Fiber.current.object_id
+        if key.is_a?(Symbol)
+          @operations[key]
+        elsif key.is_a?(Integer)
+          @fiber_ids[key]
+        end
+      end
+
+      # Removes operations
+      def shutdown
+        operations.each do |_key, operation|
+          operation.shutdown
+        end
+      end
+
+      # Returns true if there are no registered operations
+      def empty?
+        @operations.empty?
+      end
+
+      # Returns number of registered operations
+      def size
+        @operations.size
+      end
+
+      # Find operations that can be resumed in resume_at order (may include future resume_at)
+      #
+      # @return [Array{OperationWorker}] Operations that are alive and have a response to use with resume
+      def can_resume
+        @operations
+          .values
+          .select { |op| op.can_resume? }
+          .sort_by(&:resume_at)
+      end
+
+      # Cleanup dead fibers that haven't removed themselves so we don't loop forever
+      def cleanup_zombies
+        dead_operations = @operations.values.reject(&:alive?)
+
+        dead_operations.each do |operation|
+          LogUtils.log "WARNING: removing dead operation for #{operation.authority} - it should have cleaned up after itself!"
+          operation.shutdown
+          @operations.delete(operation.authority)
+          @fiber_ids.delete(operation.fiber.object_id)
+        end
+      end
+
+      # Save the thread response into the thread and mark that it can continue
+      def process_thread_response(response)
+        operation = find(response.authority)
+        operation&.save_thread_response response
+      end
+
+      private
+
+      attr_accessor :operations
+    end
+  end
+end
```
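The registry keeps two indexes, by authority symbol and by the fiber's object_id, and find dispatches on the key class. A standalone sketch of that pattern (not the gem's code; names invented):

```ruby
require "fiber"

# Two indexes over the same records, mirroring OperationRegistry's @operations / @fiber_ids.
operations = {} # authority (Symbol)        => record
fiber_ids  = {} # fiber.object_id (Integer) => record

fiber = Fiber.new { :done }
record = { authority: :example_authority, fiber: fiber }
operations[record[:authority]] = record
fiber_ids[record[:fiber].object_id] = record

# find(key)-style lookup: Symbols hit the authority index, Integers the fiber index,
# and the default key is the calling fiber's object_id.
find = lambda do |key = Fiber.current.object_id|
  key.is_a?(Symbol) ? operations[key] : fiber_ids[key]
end

p find.call(:example_authority)[:fiber].equal?(fiber) # => true
p find.call(fiber.object_id)[:authority]              # => :example_authority
```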