rocketjob 4.0.0 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3d1d8a5bd37991ac8ad2dd5ae28b833dc4cb59d69da1a1cc281c904ab15c3eb
4
- data.tar.gz: 39336f01d701f34e4f25e00e03fa5f06299ec19d1f7c0b52af5aff6ae4b4327b
3
+ metadata.gz: d14befce83747b072cf398356d5a5a798630d65c9aa50ec5f5de1e1f4d0e8d69
4
+ data.tar.gz: 17c5e295968836458ef1b998d1ffb79228ab6b5971c6339ec1bdc74d429d2511
5
5
  SHA512:
6
- metadata.gz: 42266518d00516c62ccd3d0139e6fa22427f48b14454a2f06f0c04f559620b9bbf71700f38b518db03b8290cd6ab7ad7353c47cfbb579aa2f5642ef2a9b2cfb5
7
- data.tar.gz: 444e848667f4a09629b18b467713fe9ae26abb22fd68ceb9048e0adb617c3bbe066ec407c9922577f9ea6bc40271a9d6c84d6064456e5e0532ddfa8f0be85a69
6
+ metadata.gz: 07b25f25ca1fd804e6c3d98ebcf3724acb8bcb564281f1b5ffd21bd815e8a05d7a99988721c5e3a25063da44f800ea3c998fcbecd7751a8b10755b214804feab
7
+ data.tar.gz: 48093ed5e152571a097e07f8e606df99fc48fcf1f8364e78b77d371e5e668acf384c972a002c078930b1af809dd7e34483ba28194fbcd52a9ae06bfb60f2dfce
@@ -47,7 +47,7 @@ module RocketJob
47
47
  opts[:max_workers] = workers if workers
48
48
  opts[:filter] = filter if filter
49
49
 
50
- Server.run(opts)
50
+ Supervisor.run(opts)
51
51
  end
52
52
 
53
53
  def rails?
@@ -96,7 +96,7 @@ module RocketJob
96
96
 
97
97
  require 'rocketjob'
98
98
  begin
99
- require 'rocketjob_batch'
99
+ require 'rocketjob_enterprise'
100
100
  rescue LoadError
101
101
  nil
102
102
  end
@@ -0,0 +1,163 @@
1
+ require 'concurrent-ruby'
2
+
3
+ module RocketJob
4
+ # RocketJob::Event
5
+ #
6
+ # Publish and Subscribe to events. Events are published immediately and usually consumed
7
+ # almost immediately by all subscriber processes.
8
+ class Event
9
+ include SemanticLogger::Loggable
10
+ include Plugins::Document
11
+ include Mongoid::Timestamps
12
+
13
+ ALL_EVENTS = '*'.freeze
14
+
15
+ # Capped collection long polling interval, in seconds.
16
+ class_attribute :long_poll_seconds, instance_accessor: false
17
+ self.long_poll_seconds = 300
18
+
19
+ # Capped collection size.
20
+ # Only used the first time the collection is created.
21
+ #
22
+ # Default: 128MB.
23
+ class_attribute :capped_collection_size, instance_accessor: false
24
+ self.capped_collection_size = 128 * 1024 * 1024
25
+
26
+ # Mandatory Event Name
27
+ # Examples:
28
+ # '/rocket_job/config'
29
+ # '/rocket_job/server'
30
+ # '/rocket_job/worker'
31
+ field :name, type: String
32
+
33
+ # Event Action
34
+ # Examples:
35
+ # :shutdown
36
+ # :pause
37
+ # :updated
38
+ field :action, type: Symbol
39
+
40
+ # Hash Parameters to be sent with the event (event specific).
41
+ field :parameters, type: Hash
42
+
43
+ validates_presence_of :name
44
+
45
+ store_in collection: 'rocket_job.events'
46
+ index({created_at: 1}, background: true)
47
+
48
+ # Register a subscriber so that it receives the events published under its class's event_name.
49
+ # Without a block: returns a handle that can be passed to unsubscribe to remove
50
+ # this particular subscription. With a block: yields the subscriber and unsubscribes it when the block exits.
51
+ #
52
+ # Example:
53
+ # class MySubscriber
54
+ # include RocketJob::Subscriber
55
+ #
56
+ # def hello
57
+ # logger.info "Hello Action Received"
58
+ # end
59
+ #
60
+ # def show(message:)
61
+ # logger.info "Received: #{message}"
62
+ # end
63
+ # end
64
+ #
65
+ # MySubscriber.subscribe
66
+ def self.subscribe(subscriber)
67
+ if block_given?
68
+ begin
69
+ handle = add_subscriber(subscriber)
70
+ yield(subscriber)
71
+ ensure
72
+ unsubscribe(handle) if handle # handle is still nil if add_subscriber raised
73
+ end
74
+ else
75
+ add_subscriber(subscriber)
76
+ end
77
+ end
78
+
79
+ # Unsubscribes a previous subscription. handle: the value returned by subscribe (the subscriber's object_id).
80
+ def self.unsubscribe(handle)
81
+ @subscribers.each_value { |v| v.delete_if { |i| i.object_id == handle } } # checks the subscriber list of every event name
82
+ end
83
+
84
+ # Indefinitely tail the capped collection looking for new events, passing each one to process_event.
85
+ # time: the start time from which to start looking for new events. Defaults to the time this class was loaded.
86
+ def self.listener(time: @load_time)
87
+ Thread.current.name = 'rocketjob event'
88
+ create_capped_collection
89
+
90
+ logger.info('Event listener started')
91
+ tail_capped_collection(time) { |event| process_event(event) }
92
+ rescue Exception => exc # rescued only so the termination is logged; the exception is re-raised below
93
+ logger.error('#listener Event listener is terminating due to unhandled exception', exc)
94
+ raise(exc)
95
+ end
96
+
97
+ # Create the capped collection only if it does not exist. An existing non-capped collection is converted to capped.
98
+ # Drop the collection before calling this method to re-create it. size: collection size in bytes.
99
+ def self.create_capped_collection(size: capped_collection_size)
100
+ if collection_exists?
101
+ convert_to_capped_collection(size) unless collection.capped? # already capped: nothing to do, size is not changed
102
+ else
103
+ collection.client[collection_name, {capped: true, size: size}].create
104
+ end
105
+ end
106
+
107
+ private # NOTE(review): a bare `private` does not apply to `def self.` methods; the class methods below stay public unless private_class_method is used
108
+
109
+ @load_time = Time.now.utc # evaluated once when the class body runs; default start time for listener
110
+ @subscribers = Concurrent::Map.new { Concurrent::Array.new } # event name => list of subscribers; the block supplies an empty list when a missing name is read
111
+
112
+ def self.add_subscriber(subscriber)
113
+ name = subscriber.class.event_name
114
+ @subscribers[name] = @subscribers[name] << subscriber
115
+ subscriber.object_id
116
+ end
117
+
118
+ def self.tail_capped_collection(time) # yields every Event with created_at after time, read in natural order from a tailable-await cursor
119
+ with(socket_timeout: long_poll_seconds + 10) do # socket timeout is 10s longer than the await time below, presumably so the long poll returns first
120
+ filter = {created_at: {'$gt' => time}}
121
+ collection.
122
+ find(filter).
123
+ await_data.
124
+ cursor_type(:tailable_await).
125
+ max_await_time_ms(long_poll_seconds * 1000).
126
+ sort('$natural' => 1).
127
+ each do |doc|
128
+ event = Mongoid::Factory.from_db(Event, doc)
129
+ # Advance the start time so that after a retry the new cursor resumes after the last event read
130
+ time = event.created_at
131
+ yield(event)
132
+ end
133
+ end
134
+ rescue Mongo::Error::SocketError, Mongo::Error::SocketTimeoutError, Mongo::Error::OperationFailure, Timeout::Error => exc
135
+ logger.info("Creating a new cursor and trying again: #{exc.class.name} #{exc.message}")
136
+ retry # NOTE(review): unbounded and without back-off; re-runs the method body using the updated time
137
+ end
138
+
139
+ # Process a new event: subscribers of event.name get process_action, subscribers of ALL_EVENTS get process_event.
140
+ def self.process_event(event)
141
+ logger.info('Event Received', event.attributes)
142
+
143
+ if @subscribers.key?(event.name)
144
+ @subscribers[event.name].each { |subscriber| subscriber.process_action(event.action, event.parameters) }
145
+ end
146
+
147
+ if @subscribers.key?(ALL_EVENTS)
148
+ @subscribers[ALL_EVENTS].each { |subscriber| subscriber.process_event(event.name, event.action, event.parameters) }
149
+ end
150
+ rescue StandardError => exc # NOTE(review): the first subscriber that raises stops delivery to the remaining subscribers; the error is logged and swallowed
151
+ logger.error('Unknown subscriber. Continuing..', exc)
152
+ end
153
+
154
+ def self.collection_exists? # whether the database's collection names include this model's collection name
155
+ collection.database.collection_names.include?(collection_name.to_s)
156
+ end
157
+
158
+ # Convert a non-capped collection to capped by issuing the convertToCapped database command with the given size.
159
+ def self.convert_to_capped_collection(size)
160
+ collection.database.command('convertToCapped' => collection_name.to_s, 'size' => size)
161
+ end
162
+ end
163
+ end
@@ -46,13 +46,6 @@ module RocketJob
46
46
  field :queued_retention, type: Integer, user_editable: true, copy_on_restart: true
47
47
 
48
48
  def perform
49
- if destroy_zombies
50
- # Cleanup zombie servers
51
- RocketJob::Server.destroy_zombies
52
- # Requeue jobs where the worker is in the zombie state and its server has gone away
53
- RocketJob::ActiveWorker.requeue_zombies
54
- end
55
-
56
49
  RocketJob::Job.aborted.where(completed_at: {'$lte' => aborted_retention.seconds.ago}).destroy_all if aborted_retention
57
50
  if completed_retention
58
51
  RocketJob::Job.completed.where(completed_at: {'$lte' => completed_retention.seconds.ago}).destroy_all
@@ -60,6 +53,13 @@ module RocketJob
60
53
  RocketJob::Job.failed.where(completed_at: {'$lte' => failed_retention.seconds.ago}).destroy_all if failed_retention
61
54
  RocketJob::Job.paused.where(completed_at: {'$lte' => paused_retention.seconds.ago}).destroy_all if paused_retention
62
55
  RocketJob::Job.queued.where(created_at: {'$lte' => queued_retention.seconds.ago}).destroy_all if queued_retention
56
+
57
+ if destroy_zombies
58
+ # Cleanup zombie servers
59
+ RocketJob::Server.destroy_zombies
60
+ # Requeue jobs where the worker is in the zombie state and its server has gone away
61
+ RocketJob::ActiveWorker.requeue_zombies
62
+ end
63
63
  end
64
64
  end
65
65
  end
@@ -25,7 +25,7 @@ module RocketJob
25
25
  # end
26
26
  #
27
27
  # Performance
28
- # - On Ruby (MRI) an empty transaction block call takes about 1ms.
28
+ # - On CRuby an empty transaction block call takes about 1ms.
29
29
  # - On JRuby an empty transaction block call takes about 55ms.
30
30
  #
31
31
  # Note:
@@ -1,4 +1,11 @@
1
1
  module RocketJob
2
+ def self.create_indexes # calls create_indexes on the Job, Server and DirmonEntry models
3
+ # Ensure models with indexes are loaded into memory first
4
+ Job.create_indexes
5
+ Server.create_indexes
6
+ DirmonEntry.create_indexes # NOTE(review): RocketJob::Event also declares an index (created_at) but is not listed here; confirm that is intentional
7
+ end
8
+
2
9
  # Whether the current process is running inside a Rocket Job server process.
3
10
  def self.server?
4
11
  @server
@@ -1,5 +1,6 @@
1
- require 'yaml'
2
- require 'concurrent'
1
+ require 'rocket_job/server/model'
2
+ require 'rocket_job/server/state_machine'
3
+
3
4
  module RocketJob
4
5
  # Server
5
6
  #
@@ -29,359 +30,7 @@ module RocketJob
29
30
  include Plugins::Document
30
31
  include Plugins::StateMachine
31
32
  include SemanticLogger::Loggable
32
-
33
- store_in collection: 'rocket_job.servers'
34
-
35
- # Unique Name of this server instance
36
- # Default: `host name:PID`
37
- # The unique name is used on re-start to re-queue any jobs that were being processed
38
- # at the time the server unexpectedly terminated, if any
39
- field :name, type: String, default: -> { "#{SemanticLogger.host}:#{$$}" }
40
-
41
- # The maximum number of workers this server should start
42
- # If set, it will override the default value in RocketJob::Config
43
- field :max_workers, type: Integer, default: -> { Config.instance.max_workers }
44
-
45
- # When this server process was started
46
- field :started_at, type: Time
47
-
48
- # Filter to apply to control which job classes this server can process
49
- field :yaml_filter, type: String
50
-
51
- # The heartbeat information for this server
52
- embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'
53
-
54
- # Current state
55
- # Internal use only. Do not set this field directly
56
- field :state, type: Symbol, default: :starting
57
-
58
- index({name: 1}, background: true, unique: true, drop_dups: true)
59
-
60
- validates_presence_of :state, :name, :max_workers
61
-
62
- # States
63
- # :starting -> :running -> :paused
64
- # -> :stopping
65
- aasm column: :state, whiny_persistence: true do
66
- state :starting, initial: true
67
- state :running
68
- state :paused
69
- state :stopping
70
-
71
- event :started do
72
- transitions from: :starting, to: :running
73
- before do
74
- self.started_at = Time.now
75
- end
76
- end
77
-
78
- event :pause do
79
- transitions from: :running, to: :paused
80
- end
81
-
82
- event :resume do
83
- transitions from: :paused, to: :running
84
- end
85
-
86
- event :stop do
87
- transitions from: :running, to: :stopping
88
- transitions from: :paused, to: :stopping
89
- transitions from: :starting, to: :stopping
90
- end
91
- end
92
-
93
- # Requeue any jobs being worked by this server when it is destroyed
94
- before_destroy :requeue_jobs
95
-
96
- # Destroy's all instances of zombie servers and requeues any jobs still "running"
97
- # on those servers.
98
- def self.destroy_zombies
99
- count = 0
100
- each do |server|
101
- next unless server.zombie?
102
- logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
103
- server.destroy
104
- count += 1
105
- end
106
- count
107
- end
108
-
109
- # Stop all running, paused, or starting servers
110
- def self.stop_all
111
- where(:state.in => %i[running paused starting]).each(&:stop!)
112
- end
113
-
114
- # Pause all running servers
115
- def self.pause_all
116
- running.each(&:pause!)
117
- end
118
-
119
- # Resume all paused servers
120
- def self.resume_all
121
- paused.each(&:resume!)
122
- end
123
-
124
- # Returns [Hash<String:Integer>] of the number of servers in each state.
125
- # Note: If there are no servers in that particular state then the hash will not have a value for it.
126
- #
127
- # Example servers in every state:
128
- # RocketJob::Server.counts_by_state
129
- # # => {
130
- # :aborted => 1,
131
- # :completed => 37,
132
- # :failed => 1,
133
- # :paused => 3,
134
- # :queued => 4,
135
- # :running => 1,
136
- # :queued_now => 1,
137
- # :scheduled => 3
138
- # }
139
- #
140
- # Example no servers active:
141
- # RocketJob::Server.counts_by_state
142
- # # => {}
143
- def self.counts_by_state
144
- counts = {}
145
- collection.aggregate(
146
- [
147
- {
148
- '$group' => {
149
- _id: '$state',
150
- count: {'$sum' => 1}
151
- }
152
- }
153
- ]
154
- ).each do |result|
155
- counts[result['_id'].to_sym] = result['count']
156
- end
157
- counts
158
- end
159
-
160
- # On MRI the 'concurrent-ruby-ext' gem may not be loaded
161
- if defined?(Concurrent::JavaAtomicBoolean) || defined?(Concurrent::CAtomicBoolean)
162
- # Returns [true|false] whether the shutdown indicator has been set for this server process
163
- def self.shutdown?
164
- @shutdown.value
165
- end
166
-
167
- # Set shutdown indicator for this server process
168
- def self.shutdown!
169
- @shutdown.make_true
170
- end
171
-
172
- @shutdown = Concurrent::AtomicBoolean.new(false)
173
- else
174
- # Returns [true|false] whether the shutdown indicator has been set for this server process
175
- def self.shutdown?
176
- @shutdown
177
- end
178
-
179
- # Set shutdown indicator for this server process
180
- def self.shutdown!
181
- @shutdown = true
182
- end
183
-
184
- @shutdown = false
185
- end
186
-
187
- # Run the server process
188
- # Attributes supplied are passed to #new
189
- def self.run(attrs = {})
190
- Thread.current.name = 'rocketjob main'
191
- # Create Indexes on server startup
192
- ::Mongoid::Tasks::Database.create_indexes
193
- register_signal_handlers
194
-
195
- server = create!(attrs)
196
- server.send(:run)
197
- ensure
198
- server&.destroy
199
- end
200
-
201
- # Returns [Boolean] whether the server is shutting down
202
- def shutdown?
203
- self.class.shutdown? || !running?
204
- end
205
-
206
- # Scope for all zombie servers
207
- def self.zombies(missed = 4)
208
- dead_seconds = Config.instance.heartbeat_seconds * missed
209
- last_heartbeat_time = Time.now - dead_seconds
210
- where(
211
- :state.in => %i[stopping running paused],
212
- '$or' => [
213
- {'heartbeat.updated_at' => {'$exists' => false}},
214
- {'heartbeat.updated_at' => {'$lte' => last_heartbeat_time}}
215
- ]
216
- )
217
- end
218
-
219
- # Returns [true|false] if this server has missed at least the last 4 heartbeats
220
- #
221
- # Possible causes for a server to miss its heartbeats:
222
- # - The server process has died
223
- # - The server process is "hanging"
224
- # - The server is no longer able to communicate with the MongoDB Server
225
- def zombie?(missed = 4)
226
- return false unless running? || stopping? || paused?
227
- return true if heartbeat.nil? || heartbeat.updated_at.nil?
228
- dead_seconds = Config.instance.heartbeat_seconds * missed
229
- (Time.now - heartbeat.updated_at) >= dead_seconds
230
- end
231
-
232
- # Where clause filter to apply to workers looking for jobs
233
- def filter
234
- YAML.load(yaml_filter) if yaml_filter
235
- end
236
-
237
- def filter=(hash)
238
- self.yaml_filter = hash.nil? ? nil : hash.to_yaml
239
- end
240
-
241
- private
242
-
243
- # Returns [Array<Worker>] collection of workers
244
- def workers
245
- @workers ||= []
246
- end
247
-
248
- # Management Thread
249
- def run
250
- logger.info "Using MongoDB Database: #{RocketJob::Job.collection.database.name}"
251
- logger.info('Running with filter', filter) if filter
252
- build_heartbeat(updated_at: Time.now, workers: 0)
253
- started!
254
- logger.info 'Rocket Job Server started'
255
-
256
- run_workers
257
-
258
- logger.info 'Waiting for workers to stop'
259
- # Tell each worker to shutdown cleanly
260
- workers.each(&:shutdown!)
261
-
262
- while (worker = workers.first)
263
- if worker.join(5)
264
- # Worker thread is dead
265
- workers.shift
266
- else
267
- # Timeout waiting for worker to stop
268
- find_and_update(
269
- 'heartbeat.updated_at' => Time.now,
270
- 'heartbeat.workers' => worker_count
271
- )
272
- end
273
- end
274
-
275
- logger.info 'Shutdown'
276
- rescue ::Mongoid::Errors::DocumentNotFound
277
- logger.warn('Server has been destroyed. Going down hard!')
278
- rescue Exception => exc
279
- logger.error('RocketJob::Server is stopping due to an exception', exc)
280
- ensure
281
- # Logs the backtrace for each running worker
282
- workers.each { |worker| logger.backtrace(thread: worker.thread) if worker.thread && worker.alive? }
283
- end
284
-
285
- def run_workers
286
- stagger = true
287
- while running? || paused?
288
- SemanticLogger.silence(:info) do
289
- find_and_update(
290
- 'heartbeat.updated_at' => Time.now,
291
- 'heartbeat.workers' => worker_count
292
- )
293
- end
294
- if paused?
295
- workers.each(&:shutdown!)
296
- stagger = true
297
- end
298
-
299
- # In case number of threads has been modified
300
- adjust_workers(stagger)
301
- stagger = false
302
-
303
- # Stop server if shutdown indicator was set
304
- if self.class.shutdown? && may_stop?
305
- stop!
306
- else
307
- sleep Config.instance.heartbeat_seconds
308
- end
309
- end
310
- end
311
-
312
- # Returns [Fixnum] number of workers (threads) that are alive
313
- def worker_count
314
- workers.count(&:alive?)
315
- end
316
-
317
- def next_worker_id
318
- @worker_id ||= 0
319
- @worker_id += 1
320
- end
321
-
322
- # Re-adjust the number of running workers to get it up to the
323
- # required number of workers
324
- # Parameters
325
- # stagger_workers
326
- # Whether to stagger when the workers poll for work the first time
327
- # It spreads out the queue polling over the max_poll_seconds so
328
- # that not all workers poll at the same time
329
- # The worker also respond faster than max_poll_seconds when a new
330
- # job is added.
331
- def adjust_workers(stagger_workers = false)
332
- count = worker_count
333
- # Cleanup workers that have stopped
334
- if count != workers.count
335
- logger.info "Cleaning up #{workers.count - count} workers that went away"
336
- workers.delete_if { |t| !t.alive? }
337
- end
338
-
339
- return unless running?
340
-
341
- # Need to add more workers?
342
- return unless count < max_workers
343
-
344
- worker_count = max_workers - count
345
- logger.info "Starting #{worker_count} workers"
346
- worker_count.times.each do
347
- sleep(Config.instance.max_poll_seconds.to_f / max_workers) if stagger_workers
348
- return if shutdown?
349
- # Start worker
350
- begin
351
- workers << Worker.new(id: next_worker_id, server_name: name, filter: filter)
352
- rescue Exception => exc
353
- logger.fatal('Cannot start worker', exc)
354
- end
355
- end
356
- end
357
-
358
- # Register handlers for the various signals
359
- # Term:
360
- # Perform clean shutdown
361
- #
362
- def self.register_signal_handlers
363
- Signal.trap 'SIGTERM' do
364
- shutdown!
365
- message = 'Shutdown signal (SIGTERM) received. Will shutdown as soon as active jobs/slices have completed.'
366
- # Logging uses a mutex to access Queue on MRI/CRuby
367
- defined?(JRuby) ? logger.warn(message) : puts(message)
368
- end
369
-
370
- Signal.trap 'INT' do
371
- shutdown!
372
- message = 'Shutdown signal (INT) received. Will shutdown as soon as active jobs/slices have completed.'
373
- # Logging uses a mutex to access Queue on MRI/CRuby
374
- defined?(JRuby) ? logger.warn(message) : puts(message)
375
- end
376
- rescue StandardError
377
- logger.warn 'SIGTERM handler not installed. Not able to shutdown gracefully'
378
- end
379
-
380
- private_class_method :register_signal_handlers
381
-
382
- # Requeue any jobs assigned to this server when it is destroyed
383
- def requeue_jobs
384
- RocketJob::Job.requeue_dead_server(name)
385
- end
33
+ include Server::Model
34
+ include Server::StateMachine
386
35
  end
387
36
  end