chore-core 1.8.2 → 4.0.0
- checksums.yaml +4 -4
- data/LICENSE.txt +1 -1
- data/README.md +173 -150
- data/chore-core.gemspec +3 -3
- data/lib/chore.rb +31 -5
- data/lib/chore/cli.rb +22 -4
- data/lib/chore/configuration.rb +1 -1
- data/lib/chore/consumer.rb +54 -12
- data/lib/chore/fetcher.rb +12 -7
- data/lib/chore/hooks.rb +2 -1
- data/lib/chore/job.rb +19 -0
- data/lib/chore/manager.rb +18 -2
- data/lib/chore/publisher.rb +18 -2
- data/lib/chore/queues/filesystem/consumer.rb +126 -64
- data/lib/chore/queues/filesystem/filesystem_queue.rb +19 -0
- data/lib/chore/queues/filesystem/publisher.rb +13 -19
- data/lib/chore/queues/sqs.rb +22 -13
- data/lib/chore/queues/sqs/consumer.rb +64 -51
- data/lib/chore/queues/sqs/publisher.rb +26 -17
- data/lib/chore/strategies/consumer/batcher.rb +14 -15
- data/lib/chore/strategies/consumer/single_consumer_strategy.rb +5 -5
- data/lib/chore/strategies/consumer/threaded_consumer_strategy.rb +9 -7
- data/lib/chore/strategies/consumer/throttled_consumer_strategy.rb +120 -0
- data/lib/chore/strategies/worker/forked_worker_strategy.rb +5 -6
- data/lib/chore/strategies/worker/helpers/ipc.rb +87 -0
- data/lib/chore/strategies/worker/helpers/preforked_worker.rb +163 -0
- data/lib/chore/strategies/worker/helpers/work_distributor.rb +65 -0
- data/lib/chore/strategies/worker/helpers/worker_info.rb +13 -0
- data/lib/chore/strategies/worker/helpers/worker_killer.rb +40 -0
- data/lib/chore/strategies/worker/helpers/worker_manager.rb +183 -0
- data/lib/chore/strategies/worker/preforked_worker_strategy.rb +150 -0
- data/lib/chore/strategies/worker/single_worker_strategy.rb +35 -13
- data/lib/chore/unit_of_work.rb +10 -1
- data/lib/chore/util.rb +5 -1
- data/lib/chore/version.rb +3 -3
- data/lib/chore/worker.rb +32 -3
- data/spec/chore/cli_spec.rb +2 -2
- data/spec/chore/consumer_spec.rb +1 -5
- data/spec/chore/duplicate_detector_spec.rb +17 -5
- data/spec/chore/fetcher_spec.rb +0 -11
- data/spec/chore/manager_spec.rb +7 -0
- data/spec/chore/queues/filesystem/filesystem_consumer_spec.rb +74 -16
- data/spec/chore/queues/sqs/consumer_spec.rb +117 -78
- data/spec/chore/queues/sqs/publisher_spec.rb +49 -60
- data/spec/chore/queues/sqs_spec.rb +32 -41
- data/spec/chore/strategies/consumer/batcher_spec.rb +50 -0
- data/spec/chore/strategies/consumer/single_consumer_strategy_spec.rb +3 -3
- data/spec/chore/strategies/consumer/threaded_consumer_strategy_spec.rb +7 -6
- data/spec/chore/strategies/consumer/throttled_consumer_strategy_spec.rb +165 -0
- data/spec/chore/strategies/worker/forked_worker_strategy_spec.rb +17 -2
- data/spec/chore/strategies/worker/helpers/ipc_spec.rb +127 -0
- data/spec/chore/strategies/worker/helpers/preforked_worker_spec.rb +236 -0
- data/spec/chore/strategies/worker/helpers/work_distributor_spec.rb +131 -0
- data/spec/chore/strategies/worker/helpers/worker_info_spec.rb +14 -0
- data/spec/chore/strategies/worker/helpers/worker_killer_spec.rb +97 -0
- data/spec/chore/strategies/worker/helpers/worker_manager_spec.rb +304 -0
- data/spec/chore/strategies/worker/preforked_worker_strategy_spec.rb +183 -0
- data/spec/chore/strategies/worker/single_worker_strategy_spec.rb +25 -0
- data/spec/chore/worker_spec.rb +82 -14
- data/spec/spec_helper.rb +1 -1
- data/spec/support/queues/sqs/fake_objects.rb +18 -0
- metadata +39 -15
data/lib/chore/strategies/worker/helpers/preforked_worker.rb
@@ -0,0 +1,163 @@
+require 'chore/signal'
+require 'socket'
+require 'timeout'
+require 'chore/strategies/worker/helpers/ipc'
+
+module Chore
+  module Strategy
+    class PreforkedWorker #:nodoc:
+      include Util
+      include Ipc
+
+      def initialize(_opts = {})
+        Chore.logger.info "PFW: #{Process.pid} initializing"
+        @manager_pid = Process.ppid
+        @consumer_cache = {}
+        @running = true
+        post_fork_setup
+      end
+
+      def start_worker(master_socket)
+        Chore.logger.info 'PFW: Worker starting'
+        raise 'PFW: Did not get master_socket' unless master_socket
+        connection = connect_to_master(master_socket)
+        worker(connection)
+      rescue => e
+        Chore.logger.error "PFW: Shutting down #{e.message} #{e.backtrace}"
+        raise e
+      end
+
+      private
+
+      def worker(connection)
+        worker_killer = WorkerKiller.new
+        while running?
+          # Select on the connection to the master and the self pipe
+          readables, _, ex = select_sockets(connection, nil, Chore.config.shutdown_timeout)
+
+          if readables.nil? # timeout
+            next
+          end
+
+          read_socket = readables.first
+
+          # Get the work from the connection to the master
+          work = read_msg(read_socket)
+
+          # When the master (manager process) dies, the sockets are set to
+          # readable, but there is no data in the socket. In this case we check
+          # to see if the manager is actually dead, and in that case, we exit.
+          if work.nil? && is_orphan?
+            Chore.logger.info "PFW: Manager no longer alive; Shutting down"
+            break
+          end
+
+          unless work.nil?
+            # Do the work
+            process_work(work)
+
+            worker_killer.check_requests
+            worker_killer.check_memory
+
+            # Alert the master that this worker is ready to receive more work
+            signal_ready(read_socket)
+          end
+        end
+      rescue Errno::ECONNRESET, Errno::EPIPE
+        Chore.logger.info "PFW: Worker-#{Process.pid} lost connection to master, shutting down"
+      ensure
+        Chore.logger.info "PFW: Worker process terminating"
+        exit(true)
+      end
+
+      # Method wrapper around @running makes it easier to write specs
+      def running?
+        @running
+      end
+
+      # Connects to the master socket, sends its PID, sends a ready-for-work
+      # message, and returns the connection
+      def connect_to_master(master_socket)
+        Chore.logger.info 'PFW: connect protocol started'
+        child_connection(master_socket).tap do |conn|
+          send_msg(conn, Process.pid)
+          signal_ready(conn)
+          Chore.logger.info 'PFW: connect protocol completed'
+        end
+      end
+
+      def post_fork_setup
+        # Immediately swap out the process name so that it doesn't look like
+        # the master process
+        procline("#{Chore.config.worker_procline}:Started:#{Time.now}")
+
+        # We need to reset the logger after fork. This fixes a longstanding bug
+        # where workers would hang around and never die
+        Chore.logger = nil
+
+        config = Chore.config
+        # When we fork, the consumers/publishers need their connections reset.
+        # The specifics of this are queue dependent, and may result in a noop.
+        config.consumer.reset_connection!
+        # It is possible for this to be nil due to configuration woes with chore
+        config.publisher.reset_connection! if Chore.config.publisher
+
+        # Ensure that all signals are handled before we hand off a hook to the
+        # application.
+        trap_signals
+
+        Chore.run_hooks_for(:after_fork, self)
+      end
+
+      def process_work(work)
+        work = [work] unless work.is_a?(Array)
+        work.each do |item|
+          item.consumer = consumer(item.queue_name)
+          begin
+            Timeout.timeout(item.queue_timeout) do
+              worker = Worker.new(item)
+              worker.start
+            end
+          rescue Timeout::Error => ex
+            Chore.logger.info "PFW: Worker #{Process.pid} timed out"
+            Chore.logger.info "PFW: Worker time out set at #{item.queue_timeout} seconds"
+            raise ex
+          end
+        end
+      end
+
+      # We need to reuse Consumer objects because it takes 500ms to recreate
+      # each one.
+      def consumer(queue)
+        unless @consumer_cache.key?(queue)
+          raise Chore::TerribleMistake if @consumer_cache.size >= Chore.config.queues.size
+          @consumer_cache[queue] = Chore.config.consumer.new(queue)
+        end
+        @consumer_cache[queue]
+      end
+
+      def trap_signals
+        Signal.reset
+
+        [:INT, :QUIT, :TERM].each do |signal|
+          Signal.trap(signal) do
+            Chore.logger.info "PFW: received signal: #{signal}"
+            @running = false
+            sleep(Chore.config.shutdown_timeout)
+            Chore.logger.info "PFW: Worker process terminating"
+            exit(true)
+          end
+        end
+
+        Signal.trap(:USR1) do
+          Chore.reopen_logs
+          Chore.logger.info "PFW: Worker process reopened log"
+        end
+      end
+
+      def is_orphan?
+        Process.ppid != @manager_pid
+      end
+    end
+  end
+end
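The worker fires the :after_fork hook as the very last step of post_fork_setup, after connections are reset and signals are trapped. A minimal sketch of an application-side hook follows; Chore.add_hook as the registration API and the ActiveRecord reconnect are assumptions for illustration, not part of this diff.

# Hypothetical hook registration; `Chore.add_hook` and the ActiveRecord
# reconnect are assumptions. Because :after_fork runs last in
# post_fork_setup above, this is the place to rebuild any state that must
# not be shared across fork (database connections, random seeds, etc.).
Chore.add_hook(:after_fork) do |worker|
  ActiveRecord::Base.establish_connection if defined?(ActiveRecord)
  Chore.logger.info "app: rebuilt connections in worker #{Process.pid}"
end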
data/lib/chore/strategies/worker/helpers/work_distributor.rb
@@ -0,0 +1,65 @@
+require 'chore/strategies/worker/helpers/ipc'
+
+module Chore
+  module Strategy
+    class WorkDistributor #:nodoc:
+      class << self
+        include Ipc
+
+        def fetch_and_assign_jobs(workers, manager)
+          jobs = manager.fetch_work(workers.size)
+          raise "DW: jobs needs to be a list got #{jobs.class}" unless jobs.is_a?(Array)
+          if jobs.empty?
+            # This condition is due to the internal consumer queue being empty.
+            # Assuming that the consumer has to fetch from an external queue,
+            # if we returned here immediately, we would create a tight loop
+            # that uses up a lot of the CPU's time. In order to prevent that,
+            # we sleep to wait for the consumer queue to be populated.
+            sleep(0.1)
+            return
+          end
+          jobs_to_return = assign_jobs(jobs, workers)
+          manager.return_work(jobs_to_return)
+        end
+
+        private
+
+        def assign_jobs(jobs, workers)
+          raise 'DW: assign_jobs got 0 workers' if workers.empty?
+          jobs_to_return = []
+          jobs.each_with_index do |job, i|
+            raise 'DW: More Jobs than Sockets' if workers[i].nil?
+            unless push_job_to_worker(job, workers[i])
+              jobs_to_return << job
+            end
+          end
+
+          jobs_to_return
+        end
+
+        def push_job_to_worker(job, worker)
+          Chore.run_hooks_for(:before_send_to_worker, job)
+          clear_ready(worker.socket)
+          send_msg(worker.socket, job)
+          true
+        rescue => e
+          Chore.logger.error "DW: Could not assign job #{job.inspect} (worker: #{worker.pid})\nException #{e.message} #{e.backtrace * "\n"}"
+
+          # We generally shouldn't get into this situation since we've already
+          # tested that we can read/write to the Worker's socket. However,
+          # the Worker could still fail between that check and pushing the
+          # job, so we need to allow the work to be re-assigned to handle that
+          # edge case.
+          false
+        end
+
+        private
+
+        # Used for unit tests
+        def sleep(n)
+          Kernel.sleep(n)
+        end
+      end
+    end
+  end
+end
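fetch_and_assign_jobs captures the distributor's whole contract: manager.fetch_work(n) must return an Array of at most n jobs, and any job that cannot be pushed onto a worker's socket is handed back through manager.return_work for re-assignment. An illustrative driver loop follows; the manager and worker_manager collaborators and the loop shape are assumptions, not code from this diff.

# Illustrative only: `manager` and `worker_manager` are assumed
# collaborators. One pass hands at most one job to each ready worker;
# anything that fails to send is returned to the manager.
loop do
  ready = worker_manager.ready_workers(worker_manager.worker_sockets)
  if ready.empty?
    sleep(0.1)
    next
  end
  Chore::Strategy::WorkDistributor.fetch_and_assign_jobs(ready, manager)
end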
data/lib/chore/strategies/worker/helpers/worker_killer.rb
@@ -0,0 +1,40 @@
+require 'get_process_mem'
+
+module Chore
+  module Strategy
+    class WorkerKiller #:nodoc:
+      def initialize
+        @memory_limit = Chore.config.memory_limit_bytes
+        @request_limit = Chore.config.request_limit
+        @check_cycle = Chore.config.worker_check_cycle || 16
+        @check_count = 0
+        @current_requests = 0
+      end
+
+      def check_memory
+        return if @memory_limit.nil? || (@memory_limit == 0)
+        @check_count += 1
+
+        if @check_count == @check_cycle
+          rss = GetProcessMem.new.bytes.to_i
+          if rss > @memory_limit
+            Chore.logger.info "WK: (pid: #{Process.pid}) exceeded memory limit (#{rss} bytes > #{@memory_limit} bytes)"
+            Chore.run_hooks_for(:worker_mem_kill)
+            exit(true)
+          end
+          @check_count = 0
+        end
+      end
+
+      def check_requests
+        return if @request_limit.nil? || (@request_limit == 0)
+
+        if (@current_requests += 1) >= @request_limit
+          Chore.logger.info "WK: (pid: #{Process.pid}) exceeded max number of requests (limit: #{@request_limit})"
+          Chore.run_hooks_for(:worker_req_kill)
+          exit(true)
+        end
+      end
+    end
+  end
+end
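Both limits are opt-in: a nil or 0 value disables the corresponding check, and memory is only sampled once every worker_check_cycle calls (default 16) so the get_process_mem RSS lookup stays off the hot path. A minimal configuration sketch follows; the keys are the ones the class reads from Chore.config, while the Chore.configure block form is an assumption here.

# Sketch, assuming Chore.configure exposes these keys (the class above
# reads them from Chore.config; nil or 0 leaves a check disabled).
Chore.configure do |c|
  c.memory_limit_bytes = 512 * 1024 * 1024 # recycle a worker above 512 MiB RSS
  c.request_limit      = 10_000            # recycle a worker after 10k jobs
  c.worker_check_cycle = 16                # sample RSS on every 16th check
end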
data/lib/chore/strategies/worker/helpers/worker_manager.rb
@@ -0,0 +1,183 @@
+require 'chore/strategies/worker/helpers/ipc'
+
+module Chore
+  module Strategy
+    class WorkerManager #:nodoc:
+      include Ipc
+
+      def initialize(master_socket)
+        @master_socket = master_socket
+        @pid_to_worker = {}
+        @socket_to_worker = {}
+      end
+
+      # Create any missing workers and sockets, and attach them to the master
+      def create_and_attach_workers
+        create_workers do |num_workers|
+          attach_workers(num_workers)
+        end
+      end
+
+      # Reap dead workers and create new ones to replace them
+      def respawn_terminated_workers!
+        Chore.logger.info 'WM: Respawning terminated workers'
+        reap_workers
+        create_and_attach_workers
+      end
+
+      # Stop children with the given kill signal and wait for them to die
+      def stop_workers(sig)
+        @pid_to_worker.each do |pid, worker|
+          begin
+            Chore.logger.info { "WM: Sending #{sig} to: #{pid}" }
+            Process.kill(sig, pid)
+          rescue Errno::ESRCH => e
+            Chore.logger.error "WM: Signal to children error: #{e}"
+          end
+        end
+        # TODO: Sleep for the shutdown timeout and kill any remaining workers
+        reap_workers
+      end
+
+      # Return all the worker sockets
+      def worker_sockets
+        @socket_to_worker.keys
+      end
+
+      # Return the workers associated with a given array of sockets.
+      # +block+:: A block can be provided to perform tasks on the workers
+      # associated with the given sockets
+      def ready_workers(sockets = [], &block)
+        workers = @socket_to_worker.values_at(*sockets)
+        yield workers if block_given?
+        workers
+      end
+
+      private
+
+      # Creates worker processes until we have the number of workers defined
+      # by the configuration. Initializes and starts a worker instance in each
+      # of the new processes.
+      # +block+:: Block can be provided to run tasks on the number of newly
+      # created worker processes.
+      def create_workers(&block)
+        num_created_workers = 0
+
+        while @pid_to_worker.size < Chore.config.num_workers
+          pid = fork do
+            run_worker_instance
+          end
+
+          Chore.logger.info "WM: created_worker #{pid}"
+          # Keep track of the new worker process
+          @pid_to_worker[pid] = WorkerInfo.new(pid)
+          num_created_workers += 1
+        end
+
+        raise 'WM: Not enough workers' if inconsistent_worker_number
+        Chore.logger.info "WM: created #{num_created_workers} workers"
+        yield num_created_workers if block_given?
+        num_created_workers
+      end
+
+      # Check that the number of workers registered with the master matches
+      # the configuration
+      def inconsistent_worker_number
+        Chore.config.num_workers != @pid_to_worker.size
+      end
+
+      # Initialize and start a new worker instance
+      def run_worker_instance
+        PreforkedWorker.new.start_worker(@master_socket)
+      ensure
+        exit(true)
+      end
+
+      # Creates individual sockets for each worker to use and attaches them to
+      # the correct worker
+      def attach_workers(num)
+        Chore.logger.info "WM: Started attaching #{num} workers"
+
+        create_worker_sockets(num).each do |socket|
+          begin
+            readable, _, _ = select_sockets(socket, nil, 2)
+
+            if readable.nil?
+              Chore.logger.info "WM: #{socket} timeout waiting for a worker"
+              socket.close
+              next
+            end
+
+            r_socket = readable.first
+            reported_pid = read_msg(r_socket)
+
+            assigned_worker = @pid_to_worker[reported_pid]
+            assigned_worker.socket = socket
+            @socket_to_worker[socket] = assigned_worker
+
+            Chore.logger.info "WM: Connected #{reported_pid} with #{r_socket}"
+          rescue Errno::ECONNRESET
+            Chore.logger.info "WM: A worker failed to connect to #{socket}"
+            socket.close
+            next
+          end
+        end
+
+        # If the connection from a worker times out, we are unable to
+        # associate the process with a connection, so we kill the worker
+        # process
+        kill_unattached_workers
+        Chore.logger.info 'WM: Finished attaching workers'
+      end
+
+      # Create +num+ sockets that are available for worker connections
+      def create_worker_sockets(num)
+        Array.new(num) do
+          add_worker_socket
+        end
+      end
+
+      # Kill workers that failed to connect to the master
+      def kill_unattached_workers
+        @pid_to_worker.each do |pid, worker|
+          next unless worker.socket.nil?
+          Chore.logger.info "WM: kill_unattached_workers #{pid}"
+          Process.kill('KILL', pid)
+        end
+      end
+
+      # Wait for terminated workers to die and remove their references from
+      # the master
+      def reap_workers
+        Chore.logger.info "WM: reaping workers.."
+        dead_workers = @pid_to_worker.select do |pid, worker|
+          reap_process(pid)
+        end
+
+        dead_workers.each do |pid, worker|
+          dead_worker = @pid_to_worker.delete(pid)
+          dead_worker.socket.close
+          @socket_to_worker.delete(dead_worker.socket)
+          Chore.logger.info "WM: Removed preforked worker:#{worker.pid} - #{worker.socket}"
+        end
+      end
+
+      # Non-blocking wait for a process to die. Returns whether it stopped.
+      def reap_process(pid)
+        status = Process.wait(pid, Process::WNOHANG)
+        case status
+        when nil # Process is still running
+          return false
+        when pid # Collected status of this pid
+          return true
+        end
+      rescue Errno::ECHILD
+        # Child process has already terminated
+        true
+      end
+
+      def fork(&block)
+        Kernel.fork(&block)
+      end
+    end
+  end
+end
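Taken together, the attach handshake works like this: the master creates one socket per newly forked worker, waits up to 2 seconds for the child to report its PID over that socket, records the pid-to-socket mapping, and kills any worker that never completes the handshake. A sketch of the assumed master-side lifecycle follows; only the WorkerManager methods come from this diff, and master_socket is assumed to be the UNIX server socket set up by the Ipc helper.

# Illustrative lifecycle; only the WorkerManager API comes from the diff.
manager = Chore::Strategy::WorkerManager.new(master_socket)
manager.create_and_attach_workers      # fork workers, run the PID handshake

running = true
Signal.trap(:TERM) { running = false }

while running
  manager.respawn_terminated_workers!  # reap dead children, fork replacements
  sleep(1)
end

manager.stop_workers(:TERM)            # signal every child, then reap them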