rocketjob 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +160 -0
- data/Rakefile +28 -0
- data/bin/rocketjob +13 -0
- data/lib/rocket_job/cli.rb +76 -0
- data/lib/rocket_job/concerns/worker.rb +157 -0
- data/lib/rocket_job/config.rb +82 -0
- data/lib/rocket_job/heartbeat.rb +44 -0
- data/lib/rocket_job/job.rb +442 -0
- data/lib/rocket_job/job_exception.rb +35 -0
- data/lib/rocket_job/server.rb +346 -0
- data/lib/rocket_job/version.rb +4 -0
- data/lib/rocketjob.rb +18 -0
- data/test/config/mongo.yml +46 -0
- data/test/job_test.rb +186 -0
- data/test/jobs/test_job.rb +42 -0
- data/test/server_test.rb +39 -0
- data/test/test_helper.rb +43 -0
- data/test/worker_test.rb +89 -0
- metadata +168 -0
@@ -0,0 +1,35 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module RocketJob
|
3
|
+
# JobException
|
4
|
+
#
|
5
|
+
# Details of an exception: its class name, message, backtrace, and where it occurred
|
6
|
+
class JobException
|
7
|
+
include MongoMapper::EmbeddedDocument
|
8
|
+
|
9
|
+
# Name of the exception class
|
10
|
+
key :class_name, String
|
11
|
+
|
12
|
+
# Exception message
|
13
|
+
key :message, String
|
14
|
+
|
15
|
+
# Exception Backtrace [Array<String>]
|
16
|
+
key :backtrace, Array
|
17
|
+
|
18
|
+
# Name of the server on which this exception occurred
|
19
|
+
key :server_name, String
|
20
|
+
|
21
|
+
# The record within which this exception occurred
|
22
|
+
key :record_number, Integer
|
23
|
+
|
24
|
+
# Returns [JobException] built from the supplied exception
|
25
|
+
# Builds a new instance from a Ruby exception.
#
# Copies the exception's class name, message and backtrace. When the
# exception carries no backtrace an empty Array is stored instead.
#
# Parameters
#   exc
#     The exception to capture
#
# Returns [JobException] built from the supplied exception
def self.from_exception(exc)
  trace = exc.backtrace
  trace = [] unless trace
  new(
    class_name: exc.class.name,
    message:    exc.message,
    backtrace:  trace
  )
end
|
32
|
+
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
@@ -0,0 +1,346 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'socket'
|
3
|
+
require 'sync_attr'
|
4
|
+
require 'aasm'
|
5
|
+
module RocketJob
|
6
|
+
# Server
|
7
|
+
#
|
8
|
+
# On startup a server instance will automatically register itself
|
9
|
+
# if not already present
|
10
|
+
#
|
11
|
+
# Starting a server in the foreground:
|
12
|
+
# - Using a Rails runner:
|
13
|
+
# bin/rocketjob
|
14
|
+
#
|
15
|
+
# Starting a server in the background:
|
16
|
+
# - Using a Rails runner:
|
17
|
+
# nohup bin/rocketjob --quiet >output.log 2>&1 &
|
18
|
+
#
|
19
|
+
# Stopping a server:
|
20
|
+
# - Stop the server via the Web UI
|
21
|
+
# - Send a regular kill signal to make it shutdown once all active work is complete
|
22
|
+
# kill <pid>
|
23
|
+
# - Or, use the following Ruby code:
|
24
|
+
# server = RocketJob::Server.where(name: 'server name').first
|
25
|
+
# server.stop!
|
26
|
+
#
|
27
|
+
# Sending the kill signal locally will result in starting the shutdown process
|
28
|
+
# immediately. Via the UI or Ruby code the server can take up to 15 seconds
|
29
|
+
# (the heartbeat interval) to start shutting down.
|
30
|
+
class Server
|
31
|
+
include MongoMapper::Document
|
32
|
+
include AASM
|
33
|
+
include SyncAttr
|
34
|
+
include SemanticLogger::Loggable
|
35
|
+
|
36
|
+
# Prevent data in MongoDB from re-defining the model behavior
|
37
|
+
#self.static_keys = true
|
38
|
+
|
39
|
+
# Unique Name of this server instance
|
40
|
+
# Defaults to the `hostname` but _must_ be overridden if multiple Server instances
|
41
|
+
# are started on the same host
|
42
|
+
# The unique name is used on re-start to re-queue any jobs that were being processed
|
43
|
+
# at the time the server or host unexpectedly terminated, if any
|
44
|
+
key :name, String, default: -> { "#{Socket.gethostname}:#{$$}" }
|
45
|
+
|
46
|
+
# The maximum number of threads that this worker should use
|
47
|
+
# If set, it will override the default value in RocketJob::Config
|
48
|
+
key :max_threads, Integer, default: -> { Config.instance.max_worker_threads }
|
49
|
+
|
50
|
+
# When this server process was started
|
51
|
+
key :started_at, Time
|
52
|
+
|
53
|
+
# The heartbeat information for this server
|
54
|
+
one :heartbeat, class_name: 'RocketJob::Heartbeat'
|
55
|
+
|
56
|
+
# Current state
|
57
|
+
# Internal use only. Do not set this field directly
|
58
|
+
key :state, Symbol, default: :starting
|
59
|
+
|
60
|
+
validates_presence_of :state, :name, :max_threads
|
61
|
+
|
62
|
+
# States
|
63
|
+
# :starting -> :running -> :paused
|
64
|
+
# -> :stopping
|
65
|
+
aasm column: :state do
|
66
|
+
state :starting, initial: true
|
67
|
+
state :running
|
68
|
+
state :paused
|
69
|
+
state :stopping
|
70
|
+
|
71
|
+
event :started do
|
72
|
+
transitions from: :starting, to: :running
|
73
|
+
before do
|
74
|
+
self.started_at = Time.now
|
75
|
+
end
|
76
|
+
end
|
77
|
+
event :pause do
|
78
|
+
transitions from: :running, to: :paused
|
79
|
+
end
|
80
|
+
event :resume do
|
81
|
+
transitions from: :paused, to: :running
|
82
|
+
end
|
83
|
+
event :stop do
|
84
|
+
transitions from: :running, to: :stopping
|
85
|
+
transitions from: :paused, to: :stopping
|
86
|
+
transitions from: :starting, to: :stopping
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
attr_reader :thread_pool
|
91
|
+
|
92
|
+
# Requeue any jobs being worked by this server when it is destroyed
|
93
|
+
before_destroy :requeue_jobs
|
94
|
+
|
95
|
+
# Run the server process
|
96
|
+
# Attributes supplied are passed to #new
|
97
|
+
# Run the server process
#
# Creates a new server from the supplied attributes, gives it a heartbeat,
# saves it, creates the indexes and installs the signal handlers. It then
# calls #run on the new server and returns its result.
#
# Parameters
#   attrs
#     Attributes supplied are passed to #new
#
# Raises RuntimeError when RocketJob::SlicedJob is loaded and uses a
# different database to RocketJob::Job
def self.run(attrs={})
  server = new(attrs)
  server.build_heartbeat
  server.save!
  create_indexes
  register_signal_handlers
  # SlicedJob is not part of this gem's autoloads, so only compare the
  # database names when something else has made that class available
  if defined?(RocketJob::SlicedJob) && RocketJob::Job.database.name != RocketJob::SlicedJob.database.name
    raise "The RocketJob configuration is being applied after the system has been initialized"
  end
  logger.info "Using MongoDB Database: #{RocketJob::Job.database.name}"
  server.run
end
|
107
|
+
|
108
|
+
# Create indexes
|
109
|
+
# Ensures a unique index on the server name, built in the background,
# and then asks Job to create its own indexes
def self.create_indexes
  name_ascending = [[:name, 1]]
  ensure_index(name_ascending, background: true, unique: true)
  # The jobs collection needs its indexes too
  Job.create_indexes
end
|
114
|
+
|
115
|
+
# Destroy dead servers ( missed at least the last 4 heartbeats )
|
116
|
+
# Requeue jobs assigned to dead servers
|
117
|
+
# Destroy dead servers
|
118
|
+
# Destroys every server whose last heartbeat is at least 4 heartbeat
# intervals old.
#
# Servers that have no heartbeat, or whose heartbeat has no updated_at
# timestamp, are skipped: their liveness cannot be determined and one such
# record must not abort the sweep of the remaining servers.
def self.destroy_dead_servers
  dead_seconds = Config.instance.heartbeat_seconds * 4
  each do |server|
    heartbeat      = server.heartbeat
    last_heartbeat = heartbeat && heartbeat.updated_at
    # No timestamp to compare against, leave this server alone
    next unless last_heartbeat
    next if (Time.now - last_heartbeat) < dead_seconds
    logger.warn "Destroying server #{server.name}, and requeueing its jobs"
    server.destroy
  end
end
|
126
|
+
|
127
|
+
# Stop all running, paused, or starting servers
|
128
|
+
# Calls stop! on every server whose state is running, paused or starting
def self.stop_all
  stoppable_states = ['running', 'paused', 'starting']
  where(state: stoppable_states).each(&:stop!)
end
|
131
|
+
|
132
|
+
# Pause all running servers
|
133
|
+
# Calls pause! on every server whose state is running
def self.pause_all
  running_servers = where(state: 'running')
  running_servers.each(&:pause!)
end
|
136
|
+
|
137
|
+
# Resume all paused servers
|
138
|
+
# Walks all servers and calls resume! on those that report paused?
def self.resume_all
  each do |server|
    next unless server.paused?
    server.resume!
  end
end
|
141
|
+
|
142
|
+
# Register a handler to perform cleanups etc. whenever a server is
|
143
|
+
# explicitly destroyed
|
144
|
+
# Register a handler to perform cleanups etc. whenever a server is
# explicitly destroyed
#
# The block is appended to the list of destroy handlers. Each handler is
# later called with the name of the server being destroyed.
#
# Raises ArgumentError when no block is supplied, since a missing handler
# would otherwise only fail later, when the handlers are called
def self.register_destroy_handler(&block)
  raise ArgumentError, 'A block is required to register a destroy handler' unless block
  @@destroy_handlers << block
end
|
147
|
+
|
148
|
+
# Returns [Boolean] whether the server is shutting down
|
149
|
+
# Returns [Boolean] whether the server is shutting down
#
# Without the class level shutdown indicator the answer is simply whether
# the server is no longer running. With the indicator set, a running
# server is first moved to stopping via stop! and true is returned.
def shutting_down?
  return !running? unless self.class.shutdown

  stop! if running?
  true
end
|
157
|
+
|
158
|
+
# Returns [Array<Thread>] threads in the thread_pool
|
159
|
+
# Returns [Array<Thread>] threads in the thread_pool
# The Array is created empty on first use and the same instance is
# returned on every subsequent call
def thread_pool
  @thread_pool = [] unless @thread_pool
  @thread_pool
end
|
162
|
+
|
163
|
+
# Run this instance of the server
|
164
|
+
# Run this instance of the server
#
# Names the current thread, builds a heartbeat if there is none, fires the
# started event, starts the worker threads and saves the server. It then
# loops, once per heartbeat interval, until the server is stopping, waits
# for the worker threads to finish and finally destroys this server.
#
# Any exception is logged, and the server is destroyed regardless of how
# the method exits.
def run
  Thread.current.name = 'RocketJob main'
  heartbeat || build_heartbeat

  started
  adjust_thread_pool(true)
  save
  logger.info "RocketJob Server started with #{max_threads} workers running"

  passes_since_reload = 0
  loop do
    # Update heartbeat so that monitoring tools know that this server is alive
    set(
      'heartbeat.updated_at'      => Time.now,
      'heartbeat.current_threads' => thread_pool_count
    )

    # After 3 plain passes, the next pass reloads the server model in case
    # its config was changed, i.e. a reload on every 4th heartbeat
    # TODO make 3 configurable
    if passes_since_reload < 3
      passes_since_reload += 1
    else
      reload
      adjust_thread_pool
      passes_since_reload = 0
    end

    # Stop server if shutdown signal was raised
    if self.class.shutdown
      stop! unless stopping?
    end

    break if stopping?

    sleep Config.instance.heartbeat_seconds
  end

  logger.info 'Waiting for worker threads to stop'
  # TODO Put a timeout on join.
  # Log Thread dump for active threads
  # Compare thread dumps for any changes, force down if no change?
  # reload, if model missing: Send Shutdown exception to each thread
  #           5 more seconds then exit
  thread_pool.each(&:join)
  logger.info 'Shutdown'
rescue Exception => exc
  logger.error('RocketJob::Server is stopping due to an exception', exc)
ensure
  # Destroy this server instance
  destroy
end
|
212
|
+
|
213
|
+
# Returns [Integer] the number of threads in the thread_pool that are alive
def thread_pool_count
  thread_pool.count(&:alive?)
end
|
216
|
+
|
217
|
+
protected
|
218
|
+
|
219
|
+
# Returns [Integer] the next worker id, counting up from 1 on each call
def next_worker_id
  @worker_id = (@worker_id || 0) + 1
end
|
223
|
+
|
224
|
+
# Re-adjust the number of running threads to get it up to the
|
225
|
+
# required number of threads
|
226
|
+
# Parameters
|
227
|
+
# stagger_threads
|
228
|
+
# Whether to stagger when the threads poll for work the first time
|
229
|
+
# It spreads out the queue polling over the max_poll_seconds so
|
230
|
+
# that not all workers poll at the same time
|
231
|
+
# The worker also respond faster than max_poll_seconds when a new
|
232
|
+
# job is added.
|
233
|
+
# Brings the number of running worker threads up to max_threads
#
# Threads that are no longer alive are first removed from the thread_pool.
# Nothing further happens when the server is shutting down. Otherwise one
# new worker thread is started for every thread that is missing.
#
# Parameters
#   stagger_threads
#     Whether each new thread sleeps before its first poll for work.
#     The delay is (max_poll_seconds / max_threads) * (worker id - 1), so
#     that not all workers poll at the same time
def adjust_thread_pool(stagger_threads=false)
  alive_count = thread_pool_count

  # Drop threads that have stopped
  stopped_count = thread_pool.count - alive_count
  unless stopped_count == 0
    logger.info "Cleaning up #{stopped_count} threads that went away"
    thread_pool.keep_if(&:alive?)
  end

  return if shutting_down?

  # Already at, or above, the required number of threads?
  return unless alive_count < max_threads

  thread_count = max_threads - alive_count
  logger.info "Starting #{thread_count} threads"
  thread_count.times do
    # Start worker thread
    new_thread = Thread.new(next_worker_id) do |id|
      begin
        if stagger_threads
          sleep((Config.instance.max_poll_seconds.to_f / max_threads) * (id - 1))
        end
        worker(id)
      rescue Exception => exc
        logger.fatal('Cannot start worker thread', exc)
      end
    end
    thread_pool << new_thread
  end
end
|
260
|
+
|
261
|
+
# Keep processing jobs until server stops running
|
262
|
+
# Keep processing jobs until server stops running
#
# Names the current thread after the worker id and then loops until
# shutting_down? returns true. After a pass in which process_next_job
# returned true the worker sleeps for a random period of up to
# max_poll_seconds, otherwise it sleeps for the full max_poll_seconds.
#
# Any exception is logged as fatal and ends this worker.
#
# Parameters
#   worker_id
#     Identifier for this worker, used in the thread name
def worker(worker_id)
  Thread.current.name = "rocketjob #{worker_id}"
  logger.info 'Started'
  until shutting_down?
    if process_next_job
      # Keeps workers staggered across the poll interval so that not
      # all workers poll at the same time
      # Divide by a Float: Integer division would truncate the random
      # delay to whole seconds
      poll_milliseconds = RocketJob::Config.instance.max_poll_seconds * 1000
      sleep(rand(poll_milliseconds) / 1000.0)
    else
      sleep RocketJob::Config.instance.max_poll_seconds
    end
  end
  logger.info "Stopping. Server state: #{state.inspect}"
rescue Exception => exc
  logger.fatal('Unhandled exception in job processing thread', exc)
end
|
278
|
+
|
279
|
+
# Process the next available job
|
280
|
+
# Returns [Boolean] whether any job was actually processed
|
281
|
+
# Process the next available job
#
# Keeps asking Job.next_job for a job for this server, passing the ids of
# jobs already skipped in this call. Each job is worked inside a logging
# tag for that job.
#
# A truthy result from job.work adds the job to the skip list, per the
# original note "due to throttling or no work available", unless the
# server is shutting down, in which case processing ends immediately.
#
# Returns [Boolean] whether any job was actually processed
def process_next_job
  skipped_ids = []
  while (job = Job.next_job(name, skipped_ids))
    logger.tagged("Job #{job.id}") do
      return true unless job.work(self)
      return true if shutting_down?
      # Need to skip the specified job due to throttling or no work available
      skipped_ids << job.id
    end
  end
  false
end
|
296
|
+
|
297
|
+
# Requeue any jobs assigned to this server
|
298
|
+
# Runs when this server is about to be destroyed
#
# Moves a running or paused server to stopping, then calls every
# registered destroy handler with this server's name
def requeue_jobs
  still_active = running? || paused?
  stop! if still_active
  @@destroy_handlers.each do |handler|
    handler.call(name)
  end
end
|
302
|
+
|
303
|
+
# Mutex protected shutdown indicator
|
304
|
+
sync_cattr_accessor :shutdown do
|
305
|
+
false
|
306
|
+
end
|
307
|
+
|
308
|
+
# Register handlers for the various signals
|
309
|
+
# Term:
|
310
|
+
# Perform clean shutdown
|
311
|
+
#
|
312
|
+
# Installs the handlers for the SIGTERM and INT signals
#
# Both handlers set the shutdown indicator and log a warning. The
# indicator is assigned directly since the Mutex protected writer cannot
# be used inside a signal handler.
#
# When a handler cannot be installed a warning is logged instead of the
# exception being raised.
def self.register_signal_handlers
  previous_handlers = %w[SIGTERM INT].map do |signal|
    Signal.trap(signal) do
      # Cannot use Mutex protected writer here since it is in a signal handler
      @@shutdown = true
      logger.warn "Shutdown signal (#{signal}) received. Will shutdown as soon as active jobs/slices have completed."
    end
  end
  # Same return value as before: whatever the last Signal.trap returned
  previous_handlers.last
rescue Exception
  logger.warn "SIGTERM handler not installed. Not able to shutdown gracefully"
end
|
329
|
+
|
330
|
+
# Patch the way MongoMapper reloads a model
|
331
|
+
# Patch the way MongoMapper reloads a model
#
# Looks this document up in its collection by id and loads the result
# into this instance.
#
# Returns self
# Raises MongoMapper::DocumentNotFound when the document no longer exists
def reload
  doc = collection.find_one(:_id => id)
  unless doc
    raise MongoMapper::DocumentNotFound, "Document match #{_id.inspect} does not exist in #{collection.name} collection"
  end

  load_from_database(doc)
  self
end
|
339
|
+
|
340
|
+
private
|
341
|
+
|
342
|
+
@@destroy_handlers = ThreadSafe::Array.new
|
343
|
+
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
data/lib/rocketjob.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'mongo'
|
3
|
+
require 'mongo_ha'
|
4
|
+
require 'mongo_mapper'
|
5
|
+
require 'semantic_logger'
|
6
|
+
require 'rocket_job/version'
|
7
|
+
|
8
|
+
module RocketJob
|
9
|
+
autoload :CLI, 'rocket_job/cli'
|
10
|
+
autoload :Config, 'rocket_job/config'
|
11
|
+
autoload :Heartbeat, 'rocket_job/heartbeat'
|
12
|
+
autoload :Job, 'rocket_job/job'
|
13
|
+
autoload :JobException, 'rocket_job/job_exception'
|
14
|
+
autoload :Server, 'rocket_job/server'
|
15
|
+
module Concerns
|
16
|
+
autoload :Worker, 'rocket_job/concerns/worker'
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Some of these options are from mongo_ha
|
2
|
+
#
|
3
|
+
default_options: &default_options
|
4
|
+
:w: 1
|
5
|
+
:pool_size: 5
|
6
|
+
:pool_timeout: 5
|
7
|
+
:connect_timeout: 5
|
8
|
+
:reconnect_attempts: 53
|
9
|
+
:reconnect_retry_seconds: 0.1
|
10
|
+
:reconnect_retry_multiplier: 2
|
11
|
+
:reconnect_max_retry_seconds: 5
|
12
|
+
|
13
|
+
development:
|
14
|
+
uri: mongodb://localhost:27017/development_rocket_job
|
15
|
+
options:
|
16
|
+
<<: *default_options
|
17
|
+
|
18
|
+
development_work:
|
19
|
+
uri: mongodb://localhost:27017/development_rocket_job_work
|
20
|
+
options:
|
21
|
+
<<: *default_options
|
22
|
+
|
23
|
+
test:
|
24
|
+
uri: mongodb://localhost:27017/test_rocket_job
|
25
|
+
options:
|
26
|
+
<<: *default_options
|
27
|
+
|
28
|
+
test_work:
|
29
|
+
uri: mongodb://localhost:27017/test_rocket_job_work
|
30
|
+
options:
|
31
|
+
<<: *default_options
|
32
|
+
|
33
|
+
# Sample Production Settings
|
34
|
+
production:
|
35
|
+
uri: mongodb://mongo1.site.com:27017,mongo2.site.com:27017/production_rocket_job
|
36
|
+
options:
|
37
|
+
<<: *default_options
|
38
|
+
:pool_size: 50
|
39
|
+
:pool_timeout: 5
|
40
|
+
|
41
|
+
production_work:
|
42
|
+
uri: mongodb://mongo_local.site.com:27017/production_rocket_job_work
|
43
|
+
options:
|
44
|
+
<<: *default_options
|
45
|
+
:pool_size: 50
|
46
|
+
:pool_timeout: 5
|