rocketjob 2.1.3 → 3.0.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +36 -0
- data/lib/rocket_job/active_server.rb +48 -0
- data/lib/rocket_job/cli.rb +29 -17
- data/lib/rocket_job/config.rb +19 -31
- data/lib/rocket_job/dirmon_entry.rb +15 -45
- data/lib/rocket_job/extensions/mongo/logging.rb +26 -0
- data/lib/rocket_job/extensions/rocket_job_adapter.rb +3 -5
- data/lib/rocket_job/heartbeat.rb +18 -23
- data/lib/rocket_job/job.rb +0 -1
- data/lib/rocket_job/job_exception.rb +11 -13
- data/lib/rocket_job/jobs/dirmon_job.rb +8 -8
- data/lib/rocket_job/jobs/housekeeping_job.rb +13 -15
- data/lib/rocket_job/performance.rb +5 -5
- data/lib/rocket_job/plugins/cron.rb +3 -10
- data/lib/rocket_job/plugins/document.rb +58 -33
- data/lib/rocket_job/plugins/job/model.rb +43 -71
- data/lib/rocket_job/plugins/job/persistence.rb +7 -63
- data/lib/rocket_job/plugins/job/worker.rb +24 -26
- data/lib/rocket_job/plugins/processing_window.rb +6 -9
- data/lib/rocket_job/plugins/retry.rb +3 -8
- data/lib/rocket_job/plugins/singleton.rb +1 -1
- data/lib/rocket_job/plugins/state_machine.rb +1 -7
- data/lib/rocket_job/server.rb +352 -0
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +46 -336
- data/lib/rocketjob.rb +5 -4
- data/test/config/mongoid.yml +88 -0
- data/test/config_test.rb +1 -1
- data/test/dirmon_entry_test.rb +15 -79
- data/test/dirmon_job_test.rb +6 -6
- data/test/job_test.rb +2 -2
- data/test/plugins/job/callbacks_test.rb +40 -32
- data/test/plugins/job/defaults_test.rb +10 -8
- data/test/plugins/job/model_test.rb +1 -3
- data/test/plugins/job/persistence_test.rb +11 -13
- data/test/plugins/job/worker_test.rb +45 -26
- data/test/plugins/processing_window_test.rb +4 -4
- data/test/plugins/restart_test.rb +11 -12
- data/test/plugins/state_machine_event_callbacks_test.rb +20 -18
- data/test/plugins/state_machine_test.rb +5 -5
- data/test/test_helper.rb +4 -1
- metadata +15 -29
- data/lib/rocket_job/extensions/mongo.rb +0 -23
- data/lib/rocket_job/extensions/mongo_mapper.rb +0 -30
- data/lib/rocket_job/plugins/job/defaults.rb +0 -40
- data/test/config/mongo.yml +0 -46
@@ -10,18 +10,7 @@ module RocketJob
|
|
10
10
|
|
11
11
|
included do
|
12
12
|
# Store all job types in this collection
|
13
|
-
|
14
|
-
|
15
|
-
# Create indexes
|
16
|
-
def self.create_indexes
|
17
|
-
# Used by find_and_modify in .rocket_job_retrieve
|
18
|
-
ensure_index({state: 1, priority: 1, _id: 1}, background: true)
|
19
|
-
# Remove outdated indexes if present
|
20
|
-
drop_index('state_1_run_at_1_priority_1_created_at_1_sub_state_1') rescue nil
|
21
|
-
drop_index('state_1_priority_1_created_at_1_sub_state_1') rescue nil
|
22
|
-
drop_index('state_1_priority_1_created_at_1') rescue nil
|
23
|
-
drop_index('created_at_1') rescue nil
|
24
|
-
end
|
13
|
+
store_in collection: 'rocket_job.jobs'
|
25
14
|
|
26
15
|
# Retrieves the next job to work on in priority based order
|
27
16
|
# and assigns it to this worker
|
@@ -35,40 +24,12 @@ module RocketJob
|
|
35
24
|
# skip_job_ids [Array<BSON::ObjectId>]
|
36
25
|
# Job ids to exclude when looking for the next job
|
37
26
|
def self.rocket_job_retrieve(worker_name, skip_job_ids = nil)
|
38
|
-
|
39
|
-
|
40
|
-
{run_at: {'$lte' => Time.now}}
|
41
|
-
]
|
42
|
-
update = query = nil
|
43
|
-
if defined?(RocketJobPro)
|
44
|
-
query = {
|
45
|
-
'$and' => [
|
46
|
-
{
|
47
|
-
'$or' => [
|
48
|
-
{'state' => 'queued'}, # Jobs
|
49
|
-
{'state' => 'running', 'sub_state' => :processing} # Slices
|
50
|
-
]
|
51
|
-
},
|
52
|
-
{
|
53
|
-
'$or' => run_at
|
54
|
-
}
|
55
|
-
]
|
56
|
-
}
|
57
|
-
update = {'$set' => {'worker_name' => worker_name, 'state' => 'running'}}
|
58
|
-
else
|
59
|
-
query = {'state' => 'queued', '$or' => run_at}
|
60
|
-
update = {'$set' => {'worker_name' => worker_name, 'state' => 'running', 'started_at' => Time.now}}
|
61
|
-
end
|
27
|
+
query = queued_now
|
28
|
+
update = {'$set' => {'worker_name' => worker_name, 'state' => 'running', 'started_at' => Time.now}}
|
62
29
|
|
63
|
-
query
|
30
|
+
query = query.where(:id.nin => skip_job_ids) if skip_job_ids && skip_job_ids.size > 0
|
64
31
|
|
65
|
-
|
66
|
-
query: query,
|
67
|
-
sort: {priority: 1, _id: 1},
|
68
|
-
update: update
|
69
|
-
)
|
70
|
-
load(doc)
|
71
|
-
end
|
32
|
+
query.sort(priority: 1, _id: 1).find_one_and_update(update)
|
72
33
|
end
|
73
34
|
|
74
35
|
# Returns [Hash<String:Integer>] of the number of jobs in each state
|
@@ -114,7 +75,7 @@ module RocketJob
|
|
114
75
|
|
115
76
|
# Calculate :queued_now and :scheduled if there are queued jobs
|
116
77
|
if queued_count = counts[:queued]
|
117
|
-
scheduled_count = RocketJob::Job.
|
78
|
+
scheduled_count = RocketJob::Job.scheduled.count
|
118
79
|
if scheduled_count > 0
|
119
80
|
queued_now_count = queued_count - scheduled_count
|
120
81
|
counts[:queued_now] = queued_count - scheduled_count if queued_now_count > 0
|
@@ -133,7 +94,7 @@ module RocketJob
|
|
133
94
|
return super unless destroy_on_complete
|
134
95
|
begin
|
135
96
|
super
|
136
|
-
rescue
|
97
|
+
rescue Mongoid::Errors::DocumentNotFound
|
137
98
|
unless completed?
|
138
99
|
self.state = :completed
|
139
100
|
rocket_job_set_completed_at
|
@@ -143,23 +104,6 @@ module RocketJob
|
|
143
104
|
end
|
144
105
|
end
|
145
106
|
|
146
|
-
private
|
147
|
-
|
148
|
-
# After this model is loaded, convert any hashes in the arguments list to HashWithIndifferentAccess
|
149
|
-
def load_from_database(*args)
|
150
|
-
super
|
151
|
-
if arguments.present?
|
152
|
-
self.arguments = arguments.collect { |i| i.is_a?(BSON::OrderedHash) ? i.with_indifferent_access : i }
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
# Apply RocketJob defaults after initializing default values
|
157
|
-
# but before setting attributes. after_initialize is too late
|
158
|
-
def initialize_default_values(except = {})
|
159
|
-
super
|
160
|
-
rocket_job_set_defaults
|
161
|
-
end
|
162
|
-
|
163
107
|
end
|
164
108
|
end
|
165
109
|
end
|
@@ -8,15 +8,15 @@ module RocketJob
|
|
8
8
|
module Worker
|
9
9
|
extend ActiveSupport::Concern
|
10
10
|
|
11
|
-
|
11
|
+
module ClassMethods
|
12
12
|
# Run this job later
|
13
13
|
#
|
14
14
|
# Saves it to the database for processing later by workers
|
15
|
-
def
|
15
|
+
def perform_later(args, &block)
|
16
16
|
if RocketJob::Config.inline_mode
|
17
|
-
perform_now(
|
17
|
+
perform_now(args, &block)
|
18
18
|
else
|
19
|
-
job = new(
|
19
|
+
job = new(args)
|
20
20
|
block.call(job) if block
|
21
21
|
job.save!
|
22
22
|
job
|
@@ -28,8 +28,8 @@ module RocketJob
|
|
28
28
|
# The job is not saved to the database since it is processed entirely in memory
|
29
29
|
# As a result before_save and before_destroy callbacks will not be called.
|
30
30
|
# Validations are still called however prior to calling #perform
|
31
|
-
def
|
32
|
-
job = new(
|
31
|
+
def perform_now(args, &block)
|
32
|
+
job = new(args)
|
33
33
|
block.call(job) if block
|
34
34
|
job.perform_now
|
35
35
|
job
|
@@ -48,7 +48,7 @@ module RocketJob
|
|
48
48
|
#
|
49
49
|
# Note:
|
50
50
|
# If a job is in queued state it will be started
|
51
|
-
def
|
51
|
+
def rocket_job_next_job(worker_name, skip_job_ids = nil)
|
52
52
|
while (job = rocket_job_retrieve(worker_name, skip_job_ids))
|
53
53
|
case
|
54
54
|
when job.running?
|
@@ -67,16 +67,13 @@ module RocketJob
|
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
|
-
# Requeues all jobs that were running on
|
71
|
-
def
|
72
|
-
#
|
73
|
-
running.each do |job|
|
74
|
-
job.requeue!(
|
70
|
+
# Requeues all jobs that were running on a server that died
|
71
|
+
def requeue_dead_server(server_name)
  # Requeues every job that may still be assigned to the named server.
  #
  # server_name: [String]
  #   Name of the server that died; passed through to may_requeue? / requeue!.
  #
  # Paused and failed jobs are included along with running ones, since the
  # user may have transitioned the job before it finished. The state must be
  # spelled :failed, otherwise failed jobs never match the query.
  where(:state.in => [:running, :paused, :failed]).each do |job|
    job.requeue!(server_name) if job.may_requeue?(server_name)
  end
end
|
77
|
-
|
78
|
-
# Turn off embedded callbacks. Slow and not used for Jobs
|
79
|
-
embedded_callbacks_off
|
80
77
|
end
|
81
78
|
|
82
79
|
# Runs the job now in the current thread.
|
@@ -91,14 +88,9 @@ module RocketJob
|
|
91
88
|
#
|
92
89
|
# Exceptions are _not_ suppressed and should be handled by the caller.
|
93
90
|
def perform_now
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
elsif invalid?
|
98
|
-
raise(MongoMapper::DocumentNotValid, self)
|
99
|
-
end
|
100
|
-
worker = RocketJob::Worker.new(name: 'inline')
|
101
|
-
worker.started
|
91
|
+
raise(Mongoid::Errors::Validations, self) unless valid?
|
92
|
+
|
93
|
+
worker = RocketJob::Worker.new(inline: true)
|
102
94
|
start if may_start?
|
103
95
|
# Re-Raise exceptions
|
104
96
|
rocket_job_work(worker, true) if running?
|
@@ -114,7 +106,7 @@ module RocketJob
|
|
114
106
|
# The job is automatically saved only if an exception is raised in the supplied block.
|
115
107
|
#
|
116
108
|
# worker_name: [String]
|
117
|
-
# Name of the
|
109
|
+
# Name of the server on which the exception has occurred
|
118
110
|
#
|
119
111
|
# re_raise_exceptions: [true|false]
|
120
112
|
# Re-raise the exception after updating the job
|
@@ -150,10 +142,10 @@ module RocketJob
|
|
150
142
|
run_callbacks :perform do
|
151
143
|
# Allow callbacks to fail, complete or abort the job
|
152
144
|
if running?
|
153
|
-
ret = perform
|
145
|
+
ret = perform
|
154
146
|
if collect_output?
|
155
147
|
# Result must be a Hash, if not put it in a Hash
|
156
|
-
self.result =
|
148
|
+
self.result = ret.is_a?(Hash) ? ret : {'result' => ret}
|
157
149
|
end
|
158
150
|
end
|
159
151
|
end
|
@@ -166,6 +158,12 @@ module RocketJob
|
|
166
158
|
false
|
167
159
|
end
|
168
160
|
|
161
|
+
# Returns [Hash<String:[Array<ActiveServer>]>] All servers actively working on this job
|
162
|
+
def rocket_job_active_servers
|
163
|
+
return {} unless running?
|
164
|
+
{worker_name => [ActiveServer.new(worker_name, started_at, self)]}
|
165
|
+
end
|
166
|
+
|
169
167
|
end
|
170
168
|
end
|
171
169
|
end
|
@@ -20,13 +20,10 @@ module RocketJob
|
|
20
20
|
# class BusinessHoursJob < RocketJob::Job
|
21
21
|
# include RocketJob::Plugins::ProcessingWindow
|
22
22
|
#
|
23
|
-
# #
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# # How long the processing window is:
|
28
|
-
# job.processing_duration = 12.hours
|
29
|
-
# end
|
23
|
+
# # The start of the processing window
|
24
|
+
# self.processing_schedule = "30 8 * * * America/New_York"
|
25
|
+
# # How long the processing window is:
|
26
|
+
# self.processing_duration = 12.hours
|
30
27
|
#
|
31
28
|
# def perform
|
32
29
|
# # Job will only run between 8:30am and 8:30pm Eastern
|
@@ -41,8 +38,8 @@ module RocketJob
|
|
41
38
|
extend ActiveSupport::Concern
|
42
39
|
|
43
40
|
included do
|
44
|
-
|
45
|
-
|
41
|
+
field :processing_schedule, type: String, class_attribute: true
|
42
|
+
field :processing_duration, type: Integer, class_attribute: true
|
46
43
|
|
47
44
|
before_create :rocket_job_processing_window_set_run_at
|
48
45
|
before_retry :rocket_job_processing_window_set_run_at
|
@@ -26,9 +26,7 @@ module RocketJob
|
|
26
26
|
# include RocketJob::Plugins::Retry
|
27
27
|
#
|
28
28
|
# # Set the default retry_count
|
29
|
-
#
|
30
|
-
# job.max_retries = 3
|
31
|
-
# end
|
29
|
+
# self.max_retries = 3
|
32
30
|
#
|
33
31
|
# def perform
|
34
32
|
# puts "DONE"
|
@@ -52,13 +50,10 @@ module RocketJob
|
|
52
50
|
|
53
51
|
# Maximum number of times to retry this job
|
54
52
|
# 25 is approximately 3 weeks of retries
|
55
|
-
|
53
|
+
field :max_retries, type: Integer, default: 25, class_attribute: true, user_editable: true
|
56
54
|
|
57
55
|
# List of times when this job failed
|
58
|
-
|
59
|
-
|
60
|
-
# Make max_retries editable in Rocket Job Mission Control
|
61
|
-
public_rocket_job_properties :max_retries
|
56
|
+
field :failed_times, type: Array, default: []
|
62
57
|
|
63
58
|
validates_presence_of :max_retries
|
64
59
|
end
|
@@ -17,7 +17,7 @@ module RocketJob
|
|
17
17
|
|
18
18
|
# Returns [true|false] whether another instance of this job is already active
|
19
19
|
def rocket_job_singleton_active?
|
20
|
-
self.class.where(state
|
20
|
+
self.class.where(:state.in => [:running, :queued], :id.ne => id).exists?
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
@@ -86,13 +86,7 @@ module RocketJob
|
|
86
86
|
write_attribute(attr_name, state)
|
87
87
|
|
88
88
|
begin
|
89
|
-
|
90
|
-
saved = save(validate: false)
|
91
|
-
write_attribute(attr_name, old_value) unless saved
|
92
|
-
saved
|
93
|
-
else
|
94
|
-
save!
|
95
|
-
end
|
89
|
+
save!
|
96
90
|
rescue Exception => exc
|
97
91
|
write_attribute(attr_name, old_value)
|
98
92
|
raise(exc)
|
@@ -0,0 +1,352 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'concurrent'
|
3
|
+
module RocketJob
|
4
|
+
# Server
|
5
|
+
#
|
6
|
+
# On startup a server instance will automatically register itself
|
7
|
+
# if not already present
|
8
|
+
#
|
9
|
+
# Starting a server in the foreground:
|
10
|
+
# - Using a Rails runner:
|
11
|
+
# bin/rocketjob
|
12
|
+
#
|
13
|
+
# Starting a server in the background:
|
14
|
+
# - Using a Rails runner:
|
15
|
+
# nohup bin/rocketjob --quiet 2>&1 1>output.log &
|
16
|
+
#
|
17
|
+
# Stopping a server:
|
18
|
+
# - Stop the server via the Web UI
|
19
|
+
# - Send a regular kill signal to make it shutdown once all active work is complete
|
20
|
+
# kill <pid>
|
21
|
+
# - Or, use the following Ruby code:
|
22
|
+
# server = RocketJob::Server.where(name: 'server name').first
|
23
|
+
# server.stop!
|
24
|
+
#
|
25
|
+
# Sending the kill signal locally will result in starting the shutdown process
|
26
|
+
# immediately. Via the UI or Ruby code the server can take up to 15 seconds
|
27
|
+
# (the heartbeat interval) to start shutting down.
|
28
|
+
class Server
|
29
|
+
include Plugins::Document
|
30
|
+
include Plugins::StateMachine
|
31
|
+
include SemanticLogger::Loggable
|
32
|
+
|
33
|
+
# Unique Name of this server instance
|
34
|
+
# Default: `host name:PID`
|
35
|
+
# The unique name is used on re-start to re-queue any jobs that were being processed
|
36
|
+
# at the time the server unexpectedly terminated, if any
|
37
|
+
field :name, type: String, default: -> { "#{SemanticLogger.host}:#{$$}" }
|
38
|
+
|
39
|
+
# The maximum number of workers this server should start
|
40
|
+
# If set, it will override the default value in RocketJob::Config
|
41
|
+
field :max_workers, type: Integer, default: -> { Config.instance.max_worker_threads }
|
42
|
+
|
43
|
+
# When this server process was started
|
44
|
+
field :started_at, type: Time
|
45
|
+
|
46
|
+
# The heartbeat information for this server
|
47
|
+
embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'
|
48
|
+
|
49
|
+
# Current state
|
50
|
+
# Internal use only. Do not set this field directly
|
51
|
+
field :state, type: Symbol, default: :starting
|
52
|
+
|
53
|
+
index({name: 1}, background: true, unique: true, drop_dups: true)
|
54
|
+
|
55
|
+
validates_presence_of :state, :name, :max_workers
|
56
|
+
|
57
|
+
# States
|
58
|
+
# :starting -> :running -> :paused
|
59
|
+
# -> :stopping
|
60
|
+
aasm column: :state do
|
61
|
+
state :starting, initial: true
|
62
|
+
state :running
|
63
|
+
state :paused
|
64
|
+
state :stopping
|
65
|
+
|
66
|
+
event :started do
|
67
|
+
transitions from: :starting, to: :running
|
68
|
+
before do
|
69
|
+
self.started_at = Time.now
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
event :pause do
|
74
|
+
transitions from: :running, to: :paused
|
75
|
+
end
|
76
|
+
|
77
|
+
event :resume do
|
78
|
+
transitions from: :paused, to: :running
|
79
|
+
end
|
80
|
+
|
81
|
+
event :stop do
|
82
|
+
transitions from: :running, to: :stopping
|
83
|
+
transitions from: :paused, to: :stopping
|
84
|
+
transitions from: :starting, to: :stopping
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
# Requeue any jobs being worked by this server when it is destroyed
|
89
|
+
before_destroy :requeue_jobs
|
90
|
+
|
91
|
+
# Destroys all instances of zombie servers and requeues any jobs still "running"
|
92
|
+
# on those servers
|
93
|
+
def self.destroy_zombies
|
94
|
+
count = 0
|
95
|
+
each do |server|
|
96
|
+
next unless server.zombie?
|
97
|
+
logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
|
98
|
+
server.destroy
|
99
|
+
count += 1
|
100
|
+
end
|
101
|
+
count
|
102
|
+
end
|
103
|
+
|
104
|
+
# Stop all running, paused, or starting servers
|
105
|
+
def self.stop_all
|
106
|
+
where(:state.in => [:running, :paused, :starting]).each(&:stop!)
|
107
|
+
end
|
108
|
+
|
109
|
+
# Pause all running servers
|
110
|
+
def self.pause_all
|
111
|
+
running.each(&:pause!)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Resume all paused servers
|
115
|
+
def self.resume_all
|
116
|
+
paused.each(&:resume!)
|
117
|
+
end
|
118
|
+
|
119
|
+
# Returns [Hash<String:Integer>] of the number of servers in each state.
|
120
|
+
# Note: If there are no servers in that particular state then the hash will not have a value for it.
|
121
|
+
#
|
122
|
+
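# Example servers in every state (NOTE: the keys shown below are Job states; Server states are :starting, :running, :paused and :stopping):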
|
123
|
+
# RocketJob::Server.counts_by_state
|
124
|
+
# # => {
|
125
|
+
# :aborted => 1,
|
126
|
+
# :completed => 37,
|
127
|
+
# :failed => 1,
|
128
|
+
# :paused => 3,
|
129
|
+
# :queued => 4,
|
130
|
+
# :running => 1,
|
131
|
+
# :queued_now => 1,
|
132
|
+
# :scheduled => 3
|
133
|
+
# }
|
134
|
+
#
|
135
|
+
# Example no servers active:
|
136
|
+
# RocketJob::Server.counts_by_state
|
137
|
+
# # => {}
|
138
|
+
def self.counts_by_state
|
139
|
+
counts = {}
|
140
|
+
collection.aggregate([
|
141
|
+
{
|
142
|
+
'$group' => {
|
143
|
+
_id: '$state',
|
144
|
+
count: {'$sum' => 1}
|
145
|
+
}
|
146
|
+
}
|
147
|
+
]
|
148
|
+
).each do |result|
|
149
|
+
counts[result['_id'].to_sym] = result['count']
|
150
|
+
end
|
151
|
+
counts
|
152
|
+
end
|
153
|
+
|
154
|
+
# On MRI the 'concurrent-ruby-ext' gem may not be loaded
|
155
|
+
if defined?(Concurrent::JavaAtomicBoolean) || defined?(Concurrent::CAtomicBoolean)
|
156
|
+
# Returns [true|false] whether the shutdown indicator has been set for this server process
|
157
|
+
def self.shutdown?
|
158
|
+
@@shutdown.value
|
159
|
+
end
|
160
|
+
|
161
|
+
# Set shutdown indicator for this server process
|
162
|
+
def self.shutdown!
|
163
|
+
@@shutdown.make_true
|
164
|
+
end
|
165
|
+
|
166
|
+
@@shutdown = Concurrent::AtomicBoolean.new(false)
|
167
|
+
else
|
168
|
+
# Returns [true|false] whether the shutdown indicator has been set for this server process
|
169
|
+
def self.shutdown?
|
170
|
+
@@shutdown
|
171
|
+
end
|
172
|
+
|
173
|
+
# Set shutdown indicator for this server process
|
174
|
+
def self.shutdown!
|
175
|
+
@@shutdown = true
|
176
|
+
end
|
177
|
+
|
178
|
+
@@shutdown = false
|
179
|
+
end
|
180
|
+
|
181
|
+
# Run the server process
|
182
|
+
# Attributes supplied are passed to #new
|
183
|
+
def self.run(attrs = {})
|
184
|
+
Thread.current.name = 'rocketjob main'
|
185
|
+
# Create Indexes on server startup
|
186
|
+
Mongoid::Tasks::Database.create_indexes
|
187
|
+
register_signal_handlers
|
188
|
+
|
189
|
+
server = create!(attrs)
|
190
|
+
server.send(:run)
|
191
|
+
|
192
|
+
ensure
|
193
|
+
server.destroy if server
|
194
|
+
end
|
195
|
+
|
196
|
+
# Returns [Boolean] whether the server is shutting down
|
197
|
+
def shutdown?
|
198
|
+
self.class.shutdown? || !running?
|
199
|
+
end
|
200
|
+
|
201
|
+
# Returns [true|false] if this server has missed at least the last 4 heartbeats
|
202
|
+
#
|
203
|
+
# Possible causes for a server to miss its heartbeats:
|
204
|
+
# - The server process has died
|
205
|
+
# - The server process is "hanging"
|
206
|
+
# - The server is no longer able to communicate with the MongoDB Server
|
207
|
+
def zombie?(missed = 4)
|
208
|
+
return false unless running? || stopping?
|
209
|
+
return true if heartbeat.nil? || heartbeat.updated_at.nil?
|
210
|
+
dead_seconds = Config.instance.heartbeat_seconds * missed
|
211
|
+
(Time.now - heartbeat.updated_at) >= dead_seconds
|
212
|
+
end
|
213
|
+
|
214
|
+
private
|
215
|
+
|
216
|
+
attr_reader :workers
|
217
|
+
|
218
|
+
# Returns [Array<Worker>] collection of workers
|
219
|
+
def workers
|
220
|
+
@workers ||= []
|
221
|
+
end
|
222
|
+
|
223
|
+
# Management Thread
|
224
|
+
def run
|
225
|
+
logger.info "Using MongoDB Database: #{RocketJob::Job.collection.database.name}"
|
226
|
+
build_heartbeat(updated_at: Time.now, current_threads: 0)
|
227
|
+
started!
|
228
|
+
adjust_workers(true)
|
229
|
+
logger.info "RocketJob Server started with #{workers.size} workers running"
|
230
|
+
|
231
|
+
while running? || paused?
|
232
|
+
sleep Config.instance.heartbeat_seconds
|
233
|
+
|
234
|
+
find_and_update(
|
235
|
+
'heartbeat.updated_at' => Time.now,
|
236
|
+
'heartbeat.current_threads' => worker_count
|
237
|
+
)
|
238
|
+
|
239
|
+
# In case number of threads has been modified
|
240
|
+
adjust_workers
|
241
|
+
|
242
|
+
# Stop server if shutdown indicator was set
|
243
|
+
stop! if self.class.shutdown? && may_stop?
|
244
|
+
end
|
245
|
+
|
246
|
+
logger.info 'Waiting for workers to stop'
|
247
|
+
# Tell each worker to shutdown cleanly
|
248
|
+
workers.each(&:shutdown!)
|
249
|
+
|
250
|
+
while worker = workers.first
|
251
|
+
if worker.join(5)
|
252
|
+
# Worker thread is dead
|
253
|
+
workers.shift
|
254
|
+
else
|
255
|
+
# Timeout waiting for worker to stop
|
256
|
+
begin
|
257
|
+
find_and_update(
|
258
|
+
'heartbeat.updated_at' => Time.now,
|
259
|
+
'heartbeat.current_threads' => worker_count
|
260
|
+
)
|
261
|
+
rescue Mongoid::Errors::DocumentNotFound
|
262
|
+
logger.warn('Server has been destroyed. Going down hard!')
|
263
|
+
break
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
268
|
+
# Logs the backtrace for each running worker
|
269
|
+
if SemanticLogger::VERSION.to_i >= 4
|
270
|
+
workers.each { |thread| logger.backtrace(thread: thread) }
|
271
|
+
end
|
272
|
+
logger.info 'Shutdown'
|
273
|
+
rescue Exception => exc
|
274
|
+
logger.error('RocketJob::Server is stopping due to an exception', exc)
|
275
|
+
end
|
276
|
+
|
277
|
+
# Returns [Fixnum] number of workers (threads) that are alive
|
278
|
+
def worker_count
|
279
|
+
workers.count(&:alive?)
|
280
|
+
end
|
281
|
+
|
282
|
+
def next_worker_id
|
283
|
+
@worker_id ||= 0
|
284
|
+
@worker_id += 1
|
285
|
+
end
|
286
|
+
|
287
|
+
# Re-adjust the number of running workers to get it up to the
|
288
|
+
# required number of workers
|
289
|
+
# Parameters
|
290
|
+
# stagger_workers
|
291
|
+
# Whether to stagger when the workers poll for work the first time
|
292
|
+
# It spreads out the queue polling over the max_poll_seconds so
|
293
|
+
# that not all workers poll at the same time
|
294
|
+
# The workers also respond faster than max_poll_seconds when a new
|
295
|
+
# job is added.
|
296
|
+
def adjust_workers(stagger_workers=false)
|
297
|
+
count = worker_count
|
298
|
+
# Cleanup workers that have stopped
|
299
|
+
if count != workers.count
|
300
|
+
logger.info "Cleaning up #{workers.count - count} workers that went away"
|
301
|
+
workers.delete_if { |t| !t.alive? }
|
302
|
+
end
|
303
|
+
|
304
|
+
# Need to add more workers?
|
305
|
+
if count < max_workers
|
306
|
+
worker_count = max_workers - count
|
307
|
+
logger.info "Starting #{worker_count} workers"
|
308
|
+
worker_count.times.each do
|
309
|
+
sleep (Config.instance.max_poll_seconds.to_f / max_workers) * (next_worker_id - 1) if stagger_workers
|
310
|
+
return if shutdown?
|
311
|
+
# Start worker
|
312
|
+
begin
|
313
|
+
workers << Worker.new(id: next_worker_id, server_name: name)
|
314
|
+
rescue Exception => exc
|
315
|
+
logger.fatal('Cannot start worker', exc)
|
316
|
+
end
|
317
|
+
end
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
# Register handlers for the various signals
|
322
|
+
# Term:
|
323
|
+
# Perform clean shutdown
|
324
|
+
#
|
325
|
+
def self.register_signal_handlers
|
326
|
+
begin
|
327
|
+
Signal.trap 'SIGTERM' do
|
328
|
+
shutdown!
|
329
|
+
message = 'Shutdown signal (SIGTERM) received. Will shutdown as soon as active jobs/slices have completed.'
|
330
|
+
# Logging uses a mutex to access Queue on MRI/CRuby
|
331
|
+
defined?(JRuby) ? logger.warn(message) : puts(message)
|
332
|
+
end
|
333
|
+
|
334
|
+
Signal.trap 'INT' do
|
335
|
+
shutdown!
|
336
|
+
message = 'Shutdown signal (INT) received. Will shutdown as soon as active jobs/slices have completed.'
|
337
|
+
# Logging uses a mutex to access Queue on MRI/CRuby
|
338
|
+
defined?(JRuby) ? logger.warn(message) : puts(message)
|
339
|
+
end
|
340
|
+
rescue StandardError
|
341
|
+
logger.warn 'SIGTERM handler not installed. Not able to shutdown gracefully'
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# Requeue any jobs assigned to this server when it is destroyed
|
346
|
+
def requeue_jobs
|
347
|
+
RocketJob::Job.requeue_dead_server(name)
|
348
|
+
end
|
349
|
+
|
350
|
+
end
|
351
|
+
end
|
352
|
+
|