rocketjob 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ # encoding: UTF-8
2
+ module RocketJob
3
# JobException
#
# Details of an exception, stored as a MongoMapper embedded document.
# NOTE(review): presumably embedded in the job that raised it - confirm
# against the Job model, which is not visible here.
class JobException
  include MongoMapper::EmbeddedDocument

  # Class name of the exception
  key :class_name, String

  # Message carried by the exception
  key :message, String

  # Backtrace of the exception [Array<String>]
  key :backtrace, Array

  # Name of the server on which the exception was raised
  key :server_name, String

  # Number of the record being processed when the exception was raised
  key :record_number, Integer

  # Build a JobException from a Ruby exception
  #
  # Only class_name, message and backtrace are populated; an exception
  # without a backtrace is stored with an empty Array.
  #
  # Returns [JobException]
  def self.from_exception(exc)
    trace = exc.backtrace || []
    new(class_name: exc.class.name, message: exc.message, backtrace: trace)
  end

end
34
+ end
35
+
@@ -0,0 +1,346 @@
1
+ # encoding: UTF-8
2
+ require 'socket'
3
+ require 'sync_attr'
4
+ require 'aasm'
5
+ module RocketJob
6
+ # Server
7
+ #
8
+ # On startup a server instance will automatically register itself
9
+ # if not already present
10
+ #
11
+ # Starting a server in the foreground:
12
+ # - Using a Rails runner:
13
+ # bin/rocketjob
14
+ #
15
+ # Starting a server in the background:
16
+ # - Using a Rails runner:
17
+ # nohup bin/rocketjob --quiet 2>&1 1>output.log &
18
+ #
19
+ # Stopping a server:
20
+ # - Stop the server via the Web UI
21
+ # - Send a regular kill signal to make it shutdown once all active work is complete
22
+ # kill <pid>
23
+ # - Or, use the following Ruby code:
24
+ # server = RocketJob::Server.where(name: 'server name').first
25
+ # server.stop!
26
+ #
27
+ # Sending the kill signal locally will result in starting the shutdown process
28
+ # immediately. Via the UI or Ruby code the server can take up to 15 seconds
29
+ # (the heartbeat interval) to start shutting down.
30
+ class Server
31
+ include MongoMapper::Document
32
+ include AASM
33
+ include SyncAttr
34
+ include SemanticLogger::Loggable
35
+
36
+ # Prevent data in MongoDB from re-defining the model behavior
37
+ #self.static_keys = true
38
+
39
+ # Unique Name of this server instance
40
+ # Defaults to the `hostname` but _must_ be overridden if multiple Server instances
41
+ # are started on the same host
42
+ # The unique name is used on re-start to re-queue any jobs that were being processed
43
+ # at the time the server or host unexpectedly terminated, if any
44
+ key :name, String, default: -> { "#{Socket.gethostname}:#{$$}" }
45
+
46
+ # The maximum number of threads that this worker should use
47
+ # If set, it will override the default value in RocketJob::Config
48
+ key :max_threads, Integer, default: -> { Config.instance.max_worker_threads }
49
+
50
+ # When this server process was started
51
+ key :started_at, Time
52
+
53
+ # The heartbeat information for this server
54
+ one :heartbeat, class_name: 'RocketJob::Heartbeat'
55
+
56
+ # Current state
57
+ # Internal use only. Do not set this field directly
58
+ key :state, Symbol, default: :starting
59
+
60
+ validates_presence_of :state, :name, :max_threads
61
+
62
+ # States
63
+ # :starting -> :running -> :paused
64
+ # -> :stopping
65
+ aasm column: :state do
66
+ state :starting, initial: true
67
+ state :running
68
+ state :paused
69
+ state :stopping
70
+
71
+ event :started do
72
+ transitions from: :starting, to: :running
73
+ before do
74
+ self.started_at = Time.now
75
+ end
76
+ end
77
+ event :pause do
78
+ transitions from: :running, to: :paused
79
+ end
80
+ event :resume do
81
+ transitions from: :paused, to: :running
82
+ end
83
+ event :stop do
84
+ transitions from: :running, to: :stopping
85
+ transitions from: :paused, to: :stopping
86
+ transitions from: :starting, to: :stopping
87
+ end
88
+ end
89
+
90
+ attr_reader :thread_pool
91
+
92
+ # Requeue any jobs being worked by this server when it is destroyed
93
+ before_destroy :requeue_jobs
94
+
95
+ # Run the server process
96
+ # Attributes supplied are passed to #new
97
+ def self.run(attrs={})
98
+ server = new(attrs)
99
+ server.build_heartbeat
100
+ server.save!
101
+ create_indexes
102
+ register_signal_handlers
103
+ raise "The RocketJob configuration is being applied after the system has been initialized" unless RocketJob::Job.database.name == RocketJob::SlicedJob.database.name
104
+ logger.info "Using MongoDB Database: #{RocketJob::Job.database.name}"
105
+ server.run
106
+ end
107
+
108
+ # Create indexes
109
+ def self.create_indexes
110
+ ensure_index [[:name, 1]], background: true, unique: true
111
+ # Also create indexes for the jobs collection
112
+ Job.create_indexes
113
+ end
114
+
115
+ # Destroy dead servers ( missed at least the last 4 heartbeats )
116
+ # Requeue jobs assigned to dead servers
117
+ # Destroy dead servers
118
+ def self.destroy_dead_servers
119
+ dead_seconds = Config.instance.heartbeat_seconds * 4
120
+ each do |server|
121
+ next if (Time.now - server.heartbeat.updated_at) < dead_seconds
122
+ logger.warn "Destroying server #{server.name}, and requeueing its jobs"
123
+ server.destroy
124
+ end
125
+ end
126
+
127
+ # Stop all running, paused, or starting servers
128
+ def self.stop_all
129
+ where(state: ['running', 'paused', 'starting']).each { |server| server.stop! }
130
+ end
131
+
132
+ # Pause all running servers
133
+ def self.pause_all
134
+ where(state: 'running').each { |server| server.pause! }
135
+ end
136
+
137
+ # Resume all paused servers
138
+ def self.resume_all
139
+ each { |server| server.resume! if server.paused? }
140
+ end
141
+
142
+ # Register a handler to perform cleanups etc. whenever a server is
143
+ # explicitly destroyed
144
+ def self.register_destroy_handler(&block)
145
+ @@destroy_handlers << block
146
+ end
147
+
148
+ # Returns [Boolean] whether the server is shutting down
149
+ def shutting_down?
150
+ if self.class.shutdown
151
+ stop! if running?
152
+ true
153
+ else
154
+ !running?
155
+ end
156
+ end
157
+
158
+ # Returns [Array<Thread>] threads in the thread_pool
159
+ def thread_pool
160
+ @thread_pool ||= []
161
+ end
162
+
163
+ # Run this instance of the server
164
+ def run
165
+ Thread.current.name = 'RocketJob main'
166
+ build_heartbeat unless heartbeat
167
+
168
+ started
169
+ adjust_thread_pool(true)
170
+ save
171
+ logger.info "RocketJob Server started with #{max_threads} workers running"
172
+
173
+ count = 0
174
+ loop do
175
+ # Update heartbeat so that monitoring tools know that this server is alive
176
+ set(
177
+ 'heartbeat.updated_at' => Time.now,
178
+ 'heartbeat.current_threads' => thread_pool_count
179
+ )
180
+
181
+ # Reload the server model every 10 heartbeats in case its config was changed
182
+ # TODO make 3 configurable
183
+ if count >= 3
184
+ reload
185
+ adjust_thread_pool
186
+ count = 0
187
+ else
188
+ count += 1
189
+ end
190
+
191
+ # Stop server if shutdown signal was raised
192
+ stop! if self.class.shutdown && !stopping?
193
+
194
+ break if stopping?
195
+
196
+ sleep Config.instance.heartbeat_seconds
197
+ end
198
+ logger.info 'Waiting for worker threads to stop'
199
+ # TODO Put a timeout on join.
200
+ # Log Thread dump for active threads
201
+ # Compare thread dumps for any changes, force down if no change?
202
+ # reload, if model missing: Send Shutdown exception to each thread
203
+ # 5 more seconds then exit
204
+ thread_pool.each { |t| t.join }
205
+ logger.info 'Shutdown'
206
+ rescue Exception => exc
207
+ logger.error('RocketJob::Server is stopping due to an exception', exc)
208
+ ensure
209
+ # Destroy this server instance
210
+ destroy
211
+ end
212
+
213
+ def thread_pool_count
214
+ thread_pool.count{ |i| i.alive? }
215
+ end
216
+
217
+ protected
218
+
219
+ def next_worker_id
220
+ @worker_id ||= 0
221
+ @worker_id += 1
222
+ end
223
+
224
+ # Re-adjust the number of running threads to get it up to the
225
+ # required number of threads
226
+ # Parameters
227
+ # stagger_threads
228
+ # Whether to stagger when the threads poll for work the first time
229
+ # It spreads out the queue polling over the max_poll_seconds so
230
+ # that not all workers poll at the same time
231
+ # The worker also respond faster than max_poll_seconds when a new
232
+ # job is added.
233
+ def adjust_thread_pool(stagger_threads=false)
234
+ count = thread_pool_count
235
+ # Cleanup threads that have stopped
236
+ if count != thread_pool.count
237
+ logger.info "Cleaning up #{thread_pool.count - count} threads that went away"
238
+ thread_pool.delete_if { |t| !t.alive? }
239
+ end
240
+
241
+ return if shutting_down?
242
+
243
+ # Need to add more threads?
244
+ if count < max_threads
245
+ thread_count = max_threads - count
246
+ logger.info "Starting #{thread_count} threads"
247
+ thread_count.times.each do
248
+ # Start worker thread
249
+ thread_pool << Thread.new(next_worker_id) do |id|
250
+ begin
251
+ sleep (Config.instance.max_poll_seconds.to_f / max_threads) * (id - 1) if stagger_threads
252
+ worker(id)
253
+ rescue Exception => exc
254
+ logger.fatal('Cannot start worker thread', exc)
255
+ end
256
+ end
257
+ end
258
+ end
259
+ end
260
+
261
+ # Keep processing jobs until server stops running
262
+ def worker(worker_id)
263
+ Thread.current.name = "rocketjob #{worker_id}"
264
+ logger.info 'Started'
265
+ while !shutting_down?
266
+ if process_next_job
267
+ # Keeps workers staggered across the poll interval so that not
268
+ # all workers poll at the same time
269
+ sleep rand(RocketJob::Config.instance.max_poll_seconds * 1000) / 1000
270
+ else
271
+ sleep RocketJob::Config.instance.max_poll_seconds
272
+ end
273
+ end
274
+ logger.info "Stopping. Server state: #{state.inspect}"
275
+ rescue Exception => exc
276
+ logger.fatal('Unhandled exception in job processing thread', exc)
277
+ end
278
+
279
+ # Process the next available job
280
+ # Returns [Boolean] whether any job was actually processed
281
+ def process_next_job
282
+ skip_job_ids = []
283
+ while job = Job.next_job(name, skip_job_ids)
284
+ logger.tagged("Job #{job.id}") do
285
+ if job.work(self)
286
+ return true if shutting_down?
287
+ # Need to skip the specified job due to throttling or no work available
288
+ skip_job_ids << job.id
289
+ else
290
+ return true
291
+ end
292
+ end
293
+ end
294
+ false
295
+ end
296
+
297
+ # Requeue any jobs assigned to this server
298
+ def requeue_jobs
299
+ stop! if running? || paused?
300
+ @@destroy_handlers.each { |handler| handler.call(name) }
301
+ end
302
+
303
+ # Mutex protected shutdown indicator
304
+ sync_cattr_accessor :shutdown do
305
+ false
306
+ end
307
+
308
+ # Register handlers for the various signals
309
+ # Term:
310
+ # Perform clean shutdown
311
+ #
312
+ def self.register_signal_handlers
313
+ begin
314
+ Signal.trap "SIGTERM" do
315
+ # Cannot use Mutex protected writer here since it is in a signal handler
316
+ @@shutdown = true
317
+ logger.warn "Shutdown signal (SIGTERM) received. Will shutdown as soon as active jobs/slices have completed."
318
+ end
319
+
320
+ Signal.trap "INT" do
321
+ # Cannot use Mutex protected writer here since it is in a signal handler
322
+ @@shutdown = true
323
+ logger.warn "Shutdown signal (INT) received. Will shutdown as soon as active jobs/slices have completed."
324
+ end
325
+ rescue Exception
326
+ logger.warn "SIGTERM handler not installed. Not able to shutdown gracefully"
327
+ end
328
+ end
329
+
330
+ # Patch the way MongoMapper reloads a model
331
+ def reload
332
+ if doc = collection.find_one(:_id => id)
333
+ load_from_database(doc)
334
+ self
335
+ else
336
+ raise MongoMapper::DocumentNotFound, "Document match #{_id.inspect} does not exist in #{collection.name} collection"
337
+ end
338
+ end
339
+
340
+ private
341
+
342
+ @@destroy_handlers = ThreadSafe::Array.new
343
+
344
+ end
345
+ end
346
+
@@ -0,0 +1,4 @@
1
+ # encoding: UTF-8
2
module RocketJob #:nodoc
  # Version of this gem; frozen so that it cannot be modified in place
  VERSION = "0.7.0".freeze
end
data/lib/rocketjob.rb ADDED
@@ -0,0 +1,18 @@
1
+ # encoding: UTF-8
2
+ require 'mongo'
3
+ require 'mongo_ha'
4
+ require 'mongo_mapper'
5
+ require 'semantic_logger'
6
+ require 'rocket_job/version'
7
+
8
+ module RocketJob
9
+ autoload :CLI, 'rocket_job/cli'
10
+ autoload :Config, 'rocket_job/config'
11
+ autoload :Heartbeat, 'rocket_job/heartbeat'
12
+ autoload :Job, 'rocket_job/job'
13
+ autoload :JobException, 'rocket_job/job_exception'
14
+ autoload :Server, 'rocket_job/server'
15
+ module Concerns
16
+ autoload :Worker, 'rocket_job/concerns/worker'
17
+ end
18
+ end
@@ -0,0 +1,46 @@
1
+ # Some of these options are from mongo_ha
2
+ #
3
+ default_options: &default_options
4
+ :w: 1
5
+ :pool_size: 5
6
+ :pool_timeout: 5
7
+ :connect_timeout: 5
8
+ :reconnect_attempts: 53
9
+ :reconnect_retry_seconds: 0.1
10
+ :reconnect_retry_multiplier: 2
11
+ :reconnect_max_retry_seconds: 5
12
+
13
+ development:
14
+ uri: mongodb://localhost:27017/development_rocket_job
15
+ options:
16
+ <<: *default_options
17
+
18
+ development_work:
19
+ uri: mongodb://localhost:27017/development_rocket_job_work
20
+ options:
21
+ <<: *default_options
22
+
23
+ test:
24
+ uri: mongodb://localhost:27017/test_rocket_job
25
+ options:
26
+ <<: *default_options
27
+
28
+ test_work:
29
+ uri: mongodb://localhost:27017/test_rocket_job_work
30
+ options:
31
+ <<: *default_options
32
+
33
+ # Sample Production Settings
34
+ production:
35
+ uri: mongodb://mongo1.site.com:27017,mongo2.site.com:27017/production_rocket_job
36
+ options:
37
+ <<: *default_options
38
+ :pool_size: 50
39
+ :pool_timeout: 5
40
+
41
+ production_work:
42
+ uri: mongodb://mongo_local.site.com:27017/production_rocket_job_work
43
+ options:
44
+ <<: *default_options
45
+ :pool_size: 50
46
+ :pool_timeout: 5