rocketjob 4.0.0 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3d1d8a5bd37991ac8ad2dd5ae28b833dc4cb59d69da1a1cc281c904ab15c3eb
4
- data.tar.gz: 39336f01d701f34e4f25e00e03fa5f06299ec19d1f7c0b52af5aff6ae4b4327b
3
+ metadata.gz: d14befce83747b072cf398356d5a5a798630d65c9aa50ec5f5de1e1f4d0e8d69
4
+ data.tar.gz: 17c5e295968836458ef1b998d1ffb79228ab6b5971c6339ec1bdc74d429d2511
5
5
  SHA512:
6
- metadata.gz: 42266518d00516c62ccd3d0139e6fa22427f48b14454a2f06f0c04f559620b9bbf71700f38b518db03b8290cd6ab7ad7353c47cfbb579aa2f5642ef2a9b2cfb5
7
- data.tar.gz: 444e848667f4a09629b18b467713fe9ae26abb22fd68ceb9048e0adb617c3bbe066ec407c9922577f9ea6bc40271a9d6c84d6064456e5e0532ddfa8f0be85a69
6
+ metadata.gz: 07b25f25ca1fd804e6c3d98ebcf3724acb8bcb564281f1b5ffd21bd815e8a05d7a99988721c5e3a25063da44f800ea3c998fcbecd7751a8b10755b214804feab
7
+ data.tar.gz: 48093ed5e152571a097e07f8e606df99fc48fcf1f8364e78b77d371e5e668acf384c972a002c078930b1af809dd7e34483ba28194fbcd52a9ae06bfb60f2dfce
@@ -47,7 +47,7 @@ module RocketJob
47
47
  opts[:max_workers] = workers if workers
48
48
  opts[:filter] = filter if filter
49
49
 
50
- Server.run(opts)
50
+ Supervisor.run(opts)
51
51
  end
52
52
 
53
53
  def rails?
@@ -96,7 +96,7 @@ module RocketJob
96
96
 
97
97
  require 'rocketjob'
98
98
  begin
99
- require 'rocketjob_batch'
99
+ require 'rocketjob_enterprise'
100
100
  rescue LoadError
101
101
  nil
102
102
  end
@@ -0,0 +1,163 @@
1
+ require 'concurrent-ruby'
2
+
3
+ module RocketJob
4
+ # RocketJob::Event
5
+ #
6
+ # Publish and Subscribe to events. Events are published immediately and usually consumed
7
+ # almost immediately by all subscriber processes.
8
+ class Event
9
+ include SemanticLogger::Loggable
10
+ include Plugins::Document
11
+ include Mongoid::Timestamps
12
+
13
+ ALL_EVENTS = '*'.freeze
14
+
15
+ # Capped collection long polling interval.
16
+ class_attribute :long_poll_seconds, instance_accessor: false
17
+ self.long_poll_seconds = 300
18
+
19
+ # Capped collection size.
20
+ # Only used the first time the collection is created.
21
+ #
22
+ # Default: 128MB.
23
+ class_attribute :capped_collection_size, instance_accessor: false
24
+ self.capped_collection_size = 128 * 1024 * 1024
25
+
26
+ # Mandatory Event Name
27
+ # Examples:
28
+ # '/rocket_job/config'
29
+ # '/rocket_job/server'
30
+ # '/rocket_job/worker'
31
+ field :name, type: String
32
+
33
+ # Event Action
34
+ # Examples:
35
+ # :shutdown
36
+ # :pause
37
+ # :updated
38
+ field :action, type: Symbol
39
+
40
+ # Hash Parameters to be sent with the event (event specific).
41
+ field :parameters, type: Hash
42
+
43
+ validates_presence_of :name
44
+
45
+ store_in collection: 'rocket_job.events'
46
+ index({created_at: 1}, background: true)
47
+
48
+ # Register a subscriber to receive the events published under its event name.
49
+ # Returns a handle to the subscription that can be used to unsubscribe
50
+ # this particular subscription
51
+ #
52
+ # Example:
53
+ # class MySubscriber
54
+ # include RocketJob::Subscriber
55
+ #
56
+ # def hello
57
+ # logger.info "Hello Action Received"
58
+ # end
59
+ #
60
+ # def show(message:)
61
+ # logger.info "Received: #{message}"
62
+ # end
63
+ # end
64
+ #
65
+ # MySubscriber.subscribe
66
+ def self.subscribe(subscriber)
67
+ if block_given?
68
+ begin
69
+ handle = add_subscriber(subscriber)
70
+ yield(subscriber)
71
+ ensure
72
+ unsubscribe(handle) if handle
73
+ end
74
+ else
75
+ add_subscriber(subscriber)
76
+ end
77
+ end
78
+
79
+ # Unsubscribes a previous subscription
80
+ def self.unsubscribe(handle)
81
+ @subscribers.each_value { |v| v.delete_if { |i| i.object_id == handle } }
82
+ end
83
+
84
+ # Indefinitely tail the capped collection looking for new events.
85
+ # time: the start time from which to start looking for new events.
86
+ def self.listener(time: @load_time)
87
+ Thread.current.name = 'rocketjob event'
88
+ create_capped_collection
89
+
90
+ logger.info('Event listener started')
91
+ tail_capped_collection(time) { |event| process_event(event) }
92
+ rescue Exception => exc
93
+ logger.error('#listener Event listener is terminating due to unhandled exception', exc)
94
+ raise(exc)
95
+ end
96
+
97
+ # Create the capped collection only if it does not exist.
98
+ # Drop the collection before calling this method to re-create it.
99
+ def self.create_capped_collection(size: capped_collection_size)
100
+ if collection_exists?
101
+ convert_to_capped_collection(size) unless collection.capped?
102
+ else
103
+ collection.client[collection_name, {capped: true, size: size}].create
104
+ end
105
+ end
106
+
107
+ private
108
+
109
+ @load_time = Time.now.utc
110
+ @subscribers = Concurrent::Map.new { Concurrent::Array.new }
111
+
112
+ def self.add_subscriber(subscriber)
113
+ name = subscriber.class.event_name
114
+ @subscribers[name] = @subscribers[name] << subscriber
115
+ subscriber.object_id
116
+ end
117
+
118
+ def self.tail_capped_collection(time)
119
+ with(socket_timeout: long_poll_seconds + 10) do
120
+ filter = {created_at: {'$gt' => time}}
121
+ collection.
122
+ find(filter).
123
+ await_data.
124
+ cursor_type(:tailable_await).
125
+ max_await_time_ms(long_poll_seconds * 1000).
126
+ sort('$natural' => 1).
127
+ each do |doc|
128
+ event = Mongoid::Factory.from_db(Event, doc)
129
+ # Recovery will occur from after the last message read
130
+ time = event.created_at
131
+ yield(event)
132
+ end
133
+ end
134
+ rescue Mongo::Error::SocketError, Mongo::Error::SocketTimeoutError, Mongo::Error::OperationFailure, Timeout::Error => exc
135
+ logger.info("Creating a new cursor and trying again: #{exc.class.name} #{exc.message}")
136
+ retry
137
+ end
138
+
139
+ # Process a new event, calling registered subscribers.
140
+ def self.process_event(event)
141
+ logger.info('Event Received', event.attributes)
142
+
143
+ if @subscribers.key?(event.name)
144
+ @subscribers[event.name].each { |subscriber| subscriber.process_action(event.action, event.parameters) }
145
+ end
146
+
147
+ if @subscribers.key?(ALL_EVENTS)
148
+ @subscribers[ALL_EVENTS].each { |subscriber| subscriber.process_event(event.name, event.action, event.parameters) }
149
+ end
150
+ rescue StandardError => exc
151
+ logger.error('Unknown subscriber. Continuing..', exc)
152
+ end
153
+
154
+ def self.collection_exists?
155
+ collection.database.collection_names.include?(collection_name.to_s)
156
+ end
157
+
158
+ # Convert a non-capped collection to capped
159
+ def self.convert_to_capped_collection(size)
160
+ collection.database.command('convertToCapped' => collection_name.to_s, 'size' => size)
161
+ end
162
+ end
163
+ end
@@ -46,13 +46,6 @@ module RocketJob
46
46
  field :queued_retention, type: Integer, user_editable: true, copy_on_restart: true
47
47
 
48
48
  def perform
49
- if destroy_zombies
50
- # Cleanup zombie servers
51
- RocketJob::Server.destroy_zombies
52
- # Requeue jobs where the worker is in the zombie state and its server has gone away
53
- RocketJob::ActiveWorker.requeue_zombies
54
- end
55
-
56
49
  RocketJob::Job.aborted.where(completed_at: {'$lte' => aborted_retention.seconds.ago}).destroy_all if aborted_retention
57
50
  if completed_retention
58
51
  RocketJob::Job.completed.where(completed_at: {'$lte' => completed_retention.seconds.ago}).destroy_all
@@ -60,6 +53,13 @@ module RocketJob
60
53
  RocketJob::Job.failed.where(completed_at: {'$lte' => failed_retention.seconds.ago}).destroy_all if failed_retention
61
54
  RocketJob::Job.paused.where(completed_at: {'$lte' => paused_retention.seconds.ago}).destroy_all if paused_retention
62
55
  RocketJob::Job.queued.where(created_at: {'$lte' => queued_retention.seconds.ago}).destroy_all if queued_retention
56
+
57
+ if destroy_zombies
58
+ # Cleanup zombie servers
59
+ RocketJob::Server.destroy_zombies
60
+ # Requeue jobs where the worker is in the zombie state and its server has gone away
61
+ RocketJob::ActiveWorker.requeue_zombies
62
+ end
63
63
  end
64
64
  end
65
65
  end
@@ -25,7 +25,7 @@ module RocketJob
25
25
  # end
26
26
  #
27
27
  # Performance
28
- # - On Ruby (MRI) an empty transaction block call takes about 1ms.
28
+ # - On CRuby an empty transaction block call takes about 1ms.
29
29
  # - On JRuby an empty transaction block call takes about 55ms.
30
30
  #
31
31
  # Note:
@@ -1,4 +1,11 @@
1
1
  module RocketJob
2
+ def self.create_indexes
3
+ # Ensure models with indexes are loaded into memory first
4
+ Job.create_indexes
5
+ Server.create_indexes
6
+ DirmonEntry.create_indexes
7
+ end
8
+
2
9
  # Whether the current process is running inside a Rocket Job server process.
3
10
  def self.server?
4
11
  @server
@@ -1,5 +1,6 @@
1
- require 'yaml'
2
- require 'concurrent'
1
+ require 'rocket_job/server/model'
2
+ require 'rocket_job/server/state_machine'
3
+
3
4
  module RocketJob
4
5
  # Server
5
6
  #
@@ -29,359 +30,7 @@ module RocketJob
29
30
  include Plugins::Document
30
31
  include Plugins::StateMachine
31
32
  include SemanticLogger::Loggable
32
-
33
- store_in collection: 'rocket_job.servers'
34
-
35
- # Unique Name of this server instance
36
- # Default: `host name:PID`
37
- # The unique name is used on re-start to re-queue any jobs that were being processed
38
- # at the time the server unexpectedly terminated, if any
39
- field :name, type: String, default: -> { "#{SemanticLogger.host}:#{$$}" }
40
-
41
- # The maximum number of workers this server should start
42
- # If set, it will override the default value in RocketJob::Config
43
- field :max_workers, type: Integer, default: -> { Config.instance.max_workers }
44
-
45
- # When this server process was started
46
- field :started_at, type: Time
47
-
48
- # Filter to apply to control which job classes this server can process
49
- field :yaml_filter, type: String
50
-
51
- # The heartbeat information for this server
52
- embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'
53
-
54
- # Current state
55
- # Internal use only. Do not set this field directly
56
- field :state, type: Symbol, default: :starting
57
-
58
- index({name: 1}, background: true, unique: true, drop_dups: true)
59
-
60
- validates_presence_of :state, :name, :max_workers
61
-
62
- # States
63
- # :starting -> :running -> :paused
64
- # -> :stopping
65
- aasm column: :state, whiny_persistence: true do
66
- state :starting, initial: true
67
- state :running
68
- state :paused
69
- state :stopping
70
-
71
- event :started do
72
- transitions from: :starting, to: :running
73
- before do
74
- self.started_at = Time.now
75
- end
76
- end
77
-
78
- event :pause do
79
- transitions from: :running, to: :paused
80
- end
81
-
82
- event :resume do
83
- transitions from: :paused, to: :running
84
- end
85
-
86
- event :stop do
87
- transitions from: :running, to: :stopping
88
- transitions from: :paused, to: :stopping
89
- transitions from: :starting, to: :stopping
90
- end
91
- end
92
-
93
- # Requeue any jobs being worked by this server when it is destroyed
94
- before_destroy :requeue_jobs
95
-
96
- # Destroys all instances of zombie servers and requeues any jobs still "running"
97
- # on those servers.
98
- def self.destroy_zombies
99
- count = 0
100
- each do |server|
101
- next unless server.zombie?
102
- logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
103
- server.destroy
104
- count += 1
105
- end
106
- count
107
- end
108
-
109
- # Stop all running, paused, or starting servers
110
- def self.stop_all
111
- where(:state.in => %i[running paused starting]).each(&:stop!)
112
- end
113
-
114
- # Pause all running servers
115
- def self.pause_all
116
- running.each(&:pause!)
117
- end
118
-
119
- # Resume all paused servers
120
- def self.resume_all
121
- paused.each(&:resume!)
122
- end
123
-
124
- # Returns [Hash<Symbol:Integer>] of the number of servers in each state.
125
- # Note: If there are no servers in that particular state then the hash will not have a value for it.
126
- #
127
- # Example servers in every state:
128
- # RocketJob::Server.counts_by_state
129
- # # => {
130
- # :aborted => 1,
131
- # :completed => 37,
132
- # :failed => 1,
133
- # :paused => 3,
134
- # :queued => 4,
135
- # :running => 1,
136
- # :queued_now => 1,
137
- # :scheduled => 3
138
- # }
139
- #
140
- # Example no servers active:
141
- # RocketJob::Server.counts_by_state
142
- # # => {}
143
- def self.counts_by_state
144
- counts = {}
145
- collection.aggregate(
146
- [
147
- {
148
- '$group' => {
149
- _id: '$state',
150
- count: {'$sum' => 1}
151
- }
152
- }
153
- ]
154
- ).each do |result|
155
- counts[result['_id'].to_sym] = result['count']
156
- end
157
- counts
158
- end
159
-
160
- # On MRI the 'concurrent-ruby-ext' gem may not be loaded
161
- if defined?(Concurrent::JavaAtomicBoolean) || defined?(Concurrent::CAtomicBoolean)
162
- # Returns [true|false] whether the shutdown indicator has been set for this server process
163
- def self.shutdown?
164
- @shutdown.value
165
- end
166
-
167
- # Set shutdown indicator for this server process
168
- def self.shutdown!
169
- @shutdown.make_true
170
- end
171
-
172
- @shutdown = Concurrent::AtomicBoolean.new(false)
173
- else
174
- # Returns [true|false] whether the shutdown indicator has been set for this server process
175
- def self.shutdown?
176
- @shutdown
177
- end
178
-
179
- # Set shutdown indicator for this server process
180
- def self.shutdown!
181
- @shutdown = true
182
- end
183
-
184
- @shutdown = false
185
- end
186
-
187
- # Run the server process
188
- # Attributes supplied are passed to #new
189
- def self.run(attrs = {})
190
- Thread.current.name = 'rocketjob main'
191
- # Create Indexes on server startup
192
- ::Mongoid::Tasks::Database.create_indexes
193
- register_signal_handlers
194
-
195
- server = create!(attrs)
196
- server.send(:run)
197
- ensure
198
- server&.destroy
199
- end
200
-
201
- # Returns [Boolean] whether the server is shutting down
202
- def shutdown?
203
- self.class.shutdown? || !running?
204
- end
205
-
206
- # Scope for all zombie servers
207
- def self.zombies(missed = 4)
208
- dead_seconds = Config.instance.heartbeat_seconds * missed
209
- last_heartbeat_time = Time.now - dead_seconds
210
- where(
211
- :state.in => %i[stopping running paused],
212
- '$or' => [
213
- {'heartbeat.updated_at' => {'$exists' => false}},
214
- {'heartbeat.updated_at' => {'$lte' => last_heartbeat_time}}
215
- ]
216
- )
217
- end
218
-
219
- # Returns [true|false] if this server has missed at least the last 4 heartbeats
220
- #
221
- # Possible causes for a server to miss its heartbeats:
222
- # - The server process has died
223
- # - The server process is "hanging"
224
- # - The server is no longer able to communicate with the MongoDB Server
225
- def zombie?(missed = 4)
226
- return false unless running? || stopping? || paused?
227
- return true if heartbeat.nil? || heartbeat.updated_at.nil?
228
- dead_seconds = Config.instance.heartbeat_seconds * missed
229
- (Time.now - heartbeat.updated_at) >= dead_seconds
230
- end
231
-
232
- # Where clause filter to apply to workers looking for jobs
233
- def filter
234
- YAML.load(yaml_filter) if yaml_filter
235
- end
236
-
237
- def filter=(hash)
238
- self.yaml_filter = hash.nil? ? nil : hash.to_yaml
239
- end
240
-
241
- private
242
-
243
- # Returns [Array<Worker>] collection of workers
244
- def workers
245
- @workers ||= []
246
- end
247
-
248
- # Management Thread
249
- def run
250
- logger.info "Using MongoDB Database: #{RocketJob::Job.collection.database.name}"
251
- logger.info('Running with filter', filter) if filter
252
- build_heartbeat(updated_at: Time.now, workers: 0)
253
- started!
254
- logger.info 'Rocket Job Server started'
255
-
256
- run_workers
257
-
258
- logger.info 'Waiting for workers to stop'
259
- # Tell each worker to shutdown cleanly
260
- workers.each(&:shutdown!)
261
-
262
- while (worker = workers.first)
263
- if worker.join(5)
264
- # Worker thread is dead
265
- workers.shift
266
- else
267
- # Timeout waiting for worker to stop
268
- find_and_update(
269
- 'heartbeat.updated_at' => Time.now,
270
- 'heartbeat.workers' => worker_count
271
- )
272
- end
273
- end
274
-
275
- logger.info 'Shutdown'
276
- rescue ::Mongoid::Errors::DocumentNotFound
277
- logger.warn('Server has been destroyed. Going down hard!')
278
- rescue Exception => exc
279
- logger.error('RocketJob::Server is stopping due to an exception', exc)
280
- ensure
281
- # Logs the backtrace for each running worker
282
- workers.each { |worker| logger.backtrace(thread: worker.thread) if worker.thread && worker.alive? }
283
- end
284
-
285
- def run_workers
286
- stagger = true
287
- while running? || paused?
288
- SemanticLogger.silence(:info) do
289
- find_and_update(
290
- 'heartbeat.updated_at' => Time.now,
291
- 'heartbeat.workers' => worker_count
292
- )
293
- end
294
- if paused?
295
- workers.each(&:shutdown!)
296
- stagger = true
297
- end
298
-
299
- # In case number of threads has been modified
300
- adjust_workers(stagger)
301
- stagger = false
302
-
303
- # Stop server if shutdown indicator was set
304
- if self.class.shutdown? && may_stop?
305
- stop!
306
- else
307
- sleep Config.instance.heartbeat_seconds
308
- end
309
- end
310
- end
311
-
312
- # Returns [Fixnum] number of workers (threads) that are alive
313
- def worker_count
314
- workers.count(&:alive?)
315
- end
316
-
317
- def next_worker_id
318
- @worker_id ||= 0
319
- @worker_id += 1
320
- end
321
-
322
- # Re-adjust the number of running workers to get it up to the
323
- # required number of workers
324
- # Parameters
325
- # stagger_workers
326
- # Whether to stagger when the workers poll for work the first time
327
- # It spreads out the queue polling over the max_poll_seconds so
328
- # that not all workers poll at the same time
329
- # The workers also respond faster than max_poll_seconds when a new
330
- # job is added.
331
- def adjust_workers(stagger_workers = false)
332
- count = worker_count
333
- # Cleanup workers that have stopped
334
- if count != workers.count
335
- logger.info "Cleaning up #{workers.count - count} workers that went away"
336
- workers.delete_if { |t| !t.alive? }
337
- end
338
-
339
- return unless running?
340
-
341
- # Need to add more workers?
342
- return unless count < max_workers
343
-
344
- worker_count = max_workers - count
345
- logger.info "Starting #{worker_count} workers"
346
- worker_count.times.each do
347
- sleep(Config.instance.max_poll_seconds.to_f / max_workers) if stagger_workers
348
- return if shutdown?
349
- # Start worker
350
- begin
351
- workers << Worker.new(id: next_worker_id, server_name: name, filter: filter)
352
- rescue Exception => exc
353
- logger.fatal('Cannot start worker', exc)
354
- end
355
- end
356
- end
357
-
358
- # Register handlers for the various signals
359
- # Term:
360
- # Perform clean shutdown
361
- #
362
- def self.register_signal_handlers
363
- Signal.trap 'SIGTERM' do
364
- shutdown!
365
- message = 'Shutdown signal (SIGTERM) received. Will shutdown as soon as active jobs/slices have completed.'
366
- # Logging uses a mutex to access Queue on MRI/CRuby
367
- defined?(JRuby) ? logger.warn(message) : puts(message)
368
- end
369
-
370
- Signal.trap 'INT' do
371
- shutdown!
372
- message = 'Shutdown signal (INT) received. Will shutdown as soon as active jobs/slices have completed.'
373
- # Logging uses a mutex to access Queue on MRI/CRuby
374
- defined?(JRuby) ? logger.warn(message) : puts(message)
375
- end
376
- rescue StandardError
377
- logger.warn 'SIGTERM handler not installed. Not able to shutdown gracefully'
378
- end
379
-
380
- private_class_method :register_signal_handlers
381
-
382
- # Requeue any jobs assigned to this server when it is destroyed
383
- def requeue_jobs
384
- RocketJob::Job.requeue_dead_server(name)
385
- end
33
+ include Server::Model
34
+ include Server::StateMachine
386
35
  end
387
36
  end