rocketjob 2.1.3 → 3.0.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +36 -0
- data/lib/rocket_job/active_server.rb +48 -0
- data/lib/rocket_job/cli.rb +29 -17
- data/lib/rocket_job/config.rb +19 -31
- data/lib/rocket_job/dirmon_entry.rb +15 -45
- data/lib/rocket_job/extensions/mongo/logging.rb +26 -0
- data/lib/rocket_job/extensions/rocket_job_adapter.rb +3 -5
- data/lib/rocket_job/heartbeat.rb +18 -23
- data/lib/rocket_job/job.rb +0 -1
- data/lib/rocket_job/job_exception.rb +11 -13
- data/lib/rocket_job/jobs/dirmon_job.rb +8 -8
- data/lib/rocket_job/jobs/housekeeping_job.rb +13 -15
- data/lib/rocket_job/performance.rb +5 -5
- data/lib/rocket_job/plugins/cron.rb +3 -10
- data/lib/rocket_job/plugins/document.rb +58 -33
- data/lib/rocket_job/plugins/job/model.rb +43 -71
- data/lib/rocket_job/plugins/job/persistence.rb +7 -63
- data/lib/rocket_job/plugins/job/worker.rb +24 -26
- data/lib/rocket_job/plugins/processing_window.rb +6 -9
- data/lib/rocket_job/plugins/retry.rb +3 -8
- data/lib/rocket_job/plugins/singleton.rb +1 -1
- data/lib/rocket_job/plugins/state_machine.rb +1 -7
- data/lib/rocket_job/server.rb +352 -0
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +46 -336
- data/lib/rocketjob.rb +5 -4
- data/test/config/mongoid.yml +88 -0
- data/test/config_test.rb +1 -1
- data/test/dirmon_entry_test.rb +15 -79
- data/test/dirmon_job_test.rb +6 -6
- data/test/job_test.rb +2 -2
- data/test/plugins/job/callbacks_test.rb +40 -32
- data/test/plugins/job/defaults_test.rb +10 -8
- data/test/plugins/job/model_test.rb +1 -3
- data/test/plugins/job/persistence_test.rb +11 -13
- data/test/plugins/job/worker_test.rb +45 -26
- data/test/plugins/processing_window_test.rb +4 -4
- data/test/plugins/restart_test.rb +11 -12
- data/test/plugins/state_machine_event_callbacks_test.rb +20 -18
- data/test/plugins/state_machine_test.rb +5 -5
- data/test/test_helper.rb +4 -1
- metadata +15 -29
- data/lib/rocket_job/extensions/mongo.rb +0 -23
- data/lib/rocket_job/extensions/mongo_mapper.rb +0 -30
- data/lib/rocket_job/plugins/job/defaults.rb +0 -40
- data/test/config/mongo.yml +0 -46
@@ -10,18 +10,7 @@ module RocketJob
|
|
10
10
|
|
11
11
|
included do
|
12
12
|
# Store all job types in this collection
|
13
|
-
|
14
|
-
|
15
|
-
# Create indexes
|
16
|
-
def self.create_indexes
|
17
|
-
# Used by find_and_modify in .rocket_job_retrieve
|
18
|
-
ensure_index({state: 1, priority: 1, _id: 1}, background: true)
|
19
|
-
# Remove outdated indexes if present
|
20
|
-
drop_index('state_1_run_at_1_priority_1_created_at_1_sub_state_1') rescue nil
|
21
|
-
drop_index('state_1_priority_1_created_at_1_sub_state_1') rescue nil
|
22
|
-
drop_index('state_1_priority_1_created_at_1') rescue nil
|
23
|
-
drop_index('created_at_1') rescue nil
|
24
|
-
end
|
13
|
+
store_in collection: 'rocket_job.jobs'
|
25
14
|
|
26
15
|
# Retrieves the next job to work on in priority based order
|
27
16
|
# and assigns it to this worker
|
@@ -35,40 +24,12 @@ module RocketJob
|
|
35
24
|
# skip_job_ids [Array<BSON::ObjectId>]
|
36
25
|
# Job ids to exclude when looking for the next job
|
37
26
|
def self.rocket_job_retrieve(worker_name, skip_job_ids = nil)
|
38
|
-
|
39
|
-
|
40
|
-
{run_at: {'$lte' => Time.now}}
|
41
|
-
]
|
42
|
-
update = query = nil
|
43
|
-
if defined?(RocketJobPro)
|
44
|
-
query = {
|
45
|
-
'$and' => [
|
46
|
-
{
|
47
|
-
'$or' => [
|
48
|
-
{'state' => 'queued'}, # Jobs
|
49
|
-
{'state' => 'running', 'sub_state' => :processing} # Slices
|
50
|
-
]
|
51
|
-
},
|
52
|
-
{
|
53
|
-
'$or' => run_at
|
54
|
-
}
|
55
|
-
]
|
56
|
-
}
|
57
|
-
update = {'$set' => {'worker_name' => worker_name, 'state' => 'running'}}
|
58
|
-
else
|
59
|
-
query = {'state' => 'queued', '$or' => run_at}
|
60
|
-
update = {'$set' => {'worker_name' => worker_name, 'state' => 'running', 'started_at' => Time.now}}
|
61
|
-
end
|
27
|
+
query = queued_now
|
28
|
+
update = {'$set' => {'worker_name' => worker_name, 'state' => 'running', 'started_at' => Time.now}}
|
62
29
|
|
63
|
-
query
|
30
|
+
query = query.where(:id.nin => skip_job_ids) if skip_job_ids && skip_job_ids.size > 0
|
64
31
|
|
65
|
-
|
66
|
-
query: query,
|
67
|
-
sort: {priority: 1, _id: 1},
|
68
|
-
update: update
|
69
|
-
)
|
70
|
-
load(doc)
|
71
|
-
end
|
32
|
+
query.sort(priority: 1, _id: 1).find_one_and_update(update)
|
72
33
|
end
|
73
34
|
|
74
35
|
# Returns [Hash<String:Integer>] of the number of jobs in each state
|
@@ -114,7 +75,7 @@ module RocketJob
|
|
114
75
|
|
115
76
|
# Calculate :queued_now and :scheduled if there are queued jobs
|
116
77
|
if queued_count = counts[:queued]
|
117
|
-
scheduled_count = RocketJob::Job.
|
78
|
+
scheduled_count = RocketJob::Job.scheduled.count
|
118
79
|
if scheduled_count > 0
|
119
80
|
queued_now_count = queued_count - scheduled_count
|
120
81
|
counts[:queued_now] = queued_count - scheduled_count if queued_now_count > 0
|
@@ -133,7 +94,7 @@ module RocketJob
|
|
133
94
|
return super unless destroy_on_complete
|
134
95
|
begin
|
135
96
|
super
|
136
|
-
rescue
|
97
|
+
rescue Mongoid::Errors::DocumentNotFound
|
137
98
|
unless completed?
|
138
99
|
self.state = :completed
|
139
100
|
rocket_job_set_completed_at
|
@@ -143,23 +104,6 @@ module RocketJob
|
|
143
104
|
end
|
144
105
|
end
|
145
106
|
|
146
|
-
private
|
147
|
-
|
148
|
-
# After this model is loaded, convert any hashes in the arguments list to HashWithIndifferentAccess
|
149
|
-
def load_from_database(*args)
|
150
|
-
super
|
151
|
-
if arguments.present?
|
152
|
-
self.arguments = arguments.collect { |i| i.is_a?(BSON::OrderedHash) ? i.with_indifferent_access : i }
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
# Apply RocketJob defaults after initializing default values
|
157
|
-
# but before setting attributes. after_initialize is too late
|
158
|
-
def initialize_default_values(except = {})
|
159
|
-
super
|
160
|
-
rocket_job_set_defaults
|
161
|
-
end
|
162
|
-
|
163
107
|
end
|
164
108
|
end
|
165
109
|
end
|
@@ -8,15 +8,15 @@ module RocketJob
|
|
8
8
|
module Worker
|
9
9
|
extend ActiveSupport::Concern
|
10
10
|
|
11
|
-
|
11
|
+
module ClassMethods
|
12
12
|
# Run this job later
|
13
13
|
#
|
14
14
|
# Saves it to the database for processing later by workers
|
15
|
-
def
|
15
|
+
def perform_later(args, &block)
|
16
16
|
if RocketJob::Config.inline_mode
|
17
|
-
perform_now(
|
17
|
+
perform_now(args, &block)
|
18
18
|
else
|
19
|
-
job = new(
|
19
|
+
job = new(args)
|
20
20
|
block.call(job) if block
|
21
21
|
job.save!
|
22
22
|
job
|
@@ -28,8 +28,8 @@ module RocketJob
|
|
28
28
|
# The job is not saved to the database since it is processed entirely in memory
|
29
29
|
# As a result before_save and before_destroy callbacks will not be called.
|
30
30
|
# Validations are still called however prior to calling #perform
|
31
|
-
def
|
32
|
-
job = new(
|
31
|
+
def perform_now(args, &block)
|
32
|
+
job = new(args)
|
33
33
|
block.call(job) if block
|
34
34
|
job.perform_now
|
35
35
|
job
|
@@ -48,7 +48,7 @@ module RocketJob
|
|
48
48
|
#
|
49
49
|
# Note:
|
50
50
|
# If a job is in queued state it will be started
|
51
|
-
def
|
51
|
+
def rocket_job_next_job(worker_name, skip_job_ids = nil)
|
52
52
|
while (job = rocket_job_retrieve(worker_name, skip_job_ids))
|
53
53
|
case
|
54
54
|
when job.running?
|
@@ -67,16 +67,13 @@ module RocketJob
|
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
|
-
# Requeues all jobs that were running on
|
71
|
-
def
|
72
|
-
#
|
73
|
-
running.each do |job|
|
74
|
-
job.requeue!(
|
70
|
+
# Requeues all jobs that were running on a server that died
|
71
|
+
def requeue_dead_server(server_name)
|
72
|
+
# Need to requeue paused, failed since user may have transitioned job before it finished
|
73
|
+
where(:state.in => [:running, :paused, :failed]).each do |job|
|
74
|
+
job.requeue!(server_name) if job.may_requeue?(server_name)
|
75
75
|
end
|
76
76
|
end
|
77
|
-
|
78
|
-
# Turn off embedded callbacks. Slow and not used for Jobs
|
79
|
-
embedded_callbacks_off
|
80
77
|
end
|
81
78
|
|
82
79
|
# Runs the job now in the current thread.
|
@@ -91,14 +88,9 @@ module RocketJob
|
|
91
88
|
#
|
92
89
|
# Exceptions are _not_ suppressed and should be handled by the caller.
|
93
90
|
def perform_now
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
elsif invalid?
|
98
|
-
raise(MongoMapper::DocumentNotValid, self)
|
99
|
-
end
|
100
|
-
worker = RocketJob::Worker.new(name: 'inline')
|
101
|
-
worker.started
|
91
|
+
raise(Mongoid::Errors::Validations, self) unless valid?
|
92
|
+
|
93
|
+
worker = RocketJob::Worker.new(inline: true)
|
102
94
|
start if may_start?
|
103
95
|
# Re-Raise exceptions
|
104
96
|
rocket_job_work(worker, true) if running?
|
@@ -114,7 +106,7 @@ module RocketJob
|
|
114
106
|
# The job is automatically saved only if an exception is raised in the supplied block.
|
115
107
|
#
|
116
108
|
# worker_name: [String]
|
117
|
-
# Name of the
|
109
|
+
# Name of the server on which the exception has occurred
|
118
110
|
#
|
119
111
|
# re_raise_exceptions: [true|false]
|
120
112
|
# Re-raise the exception after updating the job
|
@@ -150,10 +142,10 @@ module RocketJob
|
|
150
142
|
run_callbacks :perform do
|
151
143
|
# Allow callbacks to fail, complete or abort the job
|
152
144
|
if running?
|
153
|
-
ret = perform
|
145
|
+
ret = perform
|
154
146
|
if collect_output?
|
155
147
|
# Result must be a Hash, if not put it in a Hash
|
156
|
-
self.result =
|
148
|
+
self.result = ret.is_a?(Hash) ? ret : {'result' => ret}
|
157
149
|
end
|
158
150
|
end
|
159
151
|
end
|
@@ -166,6 +158,12 @@ module RocketJob
|
|
166
158
|
false
|
167
159
|
end
|
168
160
|
|
161
|
+
# Returns [Hash<String:[Array<ActiveServer>]>] All servers actively working on this job
|
162
|
+
def rocket_job_active_servers
|
163
|
+
return {} unless running?
|
164
|
+
{worker_name => [ActiveServer.new(worker_name, started_at, self)]}
|
165
|
+
end
|
166
|
+
|
169
167
|
end
|
170
168
|
end
|
171
169
|
end
|
@@ -20,13 +20,10 @@ module RocketJob
|
|
20
20
|
# class BusinessHoursJob < RocketJob::Job
|
21
21
|
# include RocketJob::Plugins::ProcessingWindow
|
22
22
|
#
|
23
|
-
# #
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# # How long the processing window is:
|
28
|
-
# job.processing_duration = 12.hours
|
29
|
-
# end
|
23
|
+
# # The start of the processing window
|
24
|
+
# self.processing_schedule = "30 8 * * * America/New_York"
|
25
|
+
# # How long the processing window is:
|
26
|
+
# self.processing_duration = 12.hours
|
30
27
|
#
|
31
28
|
# def perform
|
32
29
|
# # Job will only run between 8:30am and 8:30pm Eastern
|
@@ -41,8 +38,8 @@ module RocketJob
|
|
41
38
|
extend ActiveSupport::Concern
|
42
39
|
|
43
40
|
included do
|
44
|
-
|
45
|
-
|
41
|
+
field :processing_schedule, type: String, class_attribute: true
|
42
|
+
field :processing_duration, type: Integer, class_attribute: true
|
46
43
|
|
47
44
|
before_create :rocket_job_processing_window_set_run_at
|
48
45
|
before_retry :rocket_job_processing_window_set_run_at
|
@@ -26,9 +26,7 @@ module RocketJob
|
|
26
26
|
# include RocketJob::Plugins::Retry
|
27
27
|
#
|
28
28
|
# # Set the default retry_count
|
29
|
-
#
|
30
|
-
# job.max_retries = 3
|
31
|
-
# end
|
29
|
+
# self.max_retries = 3
|
32
30
|
#
|
33
31
|
# def perform
|
34
32
|
# puts "DONE"
|
@@ -52,13 +50,10 @@ module RocketJob
|
|
52
50
|
|
53
51
|
# Maximum number of times to retry this job
|
54
52
|
# 25 is approximately 3 weeks of retries
|
55
|
-
|
53
|
+
field :max_retries, type: Integer, default: 25, class_attribute: true, user_editable: true
|
56
54
|
|
57
55
|
# List of times when this job failed
|
58
|
-
|
59
|
-
|
60
|
-
# Make max_retries editable in Rocket Job Mission Control
|
61
|
-
public_rocket_job_properties :max_retries
|
56
|
+
field :failed_times, type: Array, default: []
|
62
57
|
|
63
58
|
validates_presence_of :max_retries
|
64
59
|
end
|
@@ -17,7 +17,7 @@ module RocketJob
|
|
17
17
|
|
18
18
|
# Returns [true|false] whether another instance of this job is already active
|
19
19
|
def rocket_job_singleton_active?
|
20
|
-
self.class.where(state
|
20
|
+
self.class.where(:state.in => [:running, :queued], :id.ne => id).exists?
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
@@ -86,13 +86,7 @@ module RocketJob
|
|
86
86
|
write_attribute(attr_name, state)
|
87
87
|
|
88
88
|
begin
|
89
|
-
|
90
|
-
saved = save(validate: false)
|
91
|
-
write_attribute(attr_name, old_value) unless saved
|
92
|
-
saved
|
93
|
-
else
|
94
|
-
save!
|
95
|
-
end
|
89
|
+
save!
|
96
90
|
rescue Exception => exc
|
97
91
|
write_attribute(attr_name, old_value)
|
98
92
|
raise(exc)
|
@@ -0,0 +1,352 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'concurrent'
|
3
|
+
module RocketJob
|
4
|
+
# Server
|
5
|
+
#
|
6
|
+
# On startup a server instance will automatically register itself
|
7
|
+
# if not already present
|
8
|
+
#
|
9
|
+
# Starting a server in the foreground:
|
10
|
+
# - Using a Rails runner:
|
11
|
+
# bin/rocketjob
|
12
|
+
#
|
13
|
+
# Starting a server in the background:
|
14
|
+
# - Using a Rails runner:
|
15
|
+
# nohup bin/rocketjob --quiet 2>&1 1>output.log &
|
16
|
+
#
|
17
|
+
# Stopping a server:
|
18
|
+
# - Stop the server via the Web UI
|
19
|
+
# - Send a regular kill signal to make it shutdown once all active work is complete
|
20
|
+
# kill <pid>
|
21
|
+
# - Or, use the following Ruby code:
|
22
|
+
# server = RocketJob::Server.where(name: 'server name').first
|
23
|
+
# server.stop!
|
24
|
+
#
|
25
|
+
# Sending the kill signal locally will result in starting the shutdown process
|
26
|
+
# immediately. Via the UI or Ruby code the server can take up to 15 seconds
|
27
|
+
# (the heartbeat interval) to start shutting down.
|
28
|
+
class Server
|
29
|
+
include Plugins::Document
|
30
|
+
include Plugins::StateMachine
|
31
|
+
include SemanticLogger::Loggable
|
32
|
+
|
33
|
+
# Unique Name of this server instance
|
34
|
+
# Default: `host name:PID`
|
35
|
+
# The unique name is used on re-start to re-queue any jobs that were being processed
|
36
|
+
# at the time the server unexpectedly terminated, if any
|
37
|
+
field :name, type: String, default: -> { "#{SemanticLogger.host}:#{$$}" }
|
38
|
+
|
39
|
+
# The maximum number of workers this server should start
|
40
|
+
# If set, it will override the default value in RocketJob::Config
|
41
|
+
field :max_workers, type: Integer, default: -> { Config.instance.max_worker_threads }
|
42
|
+
|
43
|
+
# When this server process was started
|
44
|
+
field :started_at, type: Time
|
45
|
+
|
46
|
+
# The heartbeat information for this server
|
47
|
+
embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'
|
48
|
+
|
49
|
+
# Current state
|
50
|
+
# Internal use only. Do not set this field directly
|
51
|
+
field :state, type: Symbol, default: :starting
|
52
|
+
|
53
|
+
index({name: 1}, background: true, unique: true, drop_dups: true)
|
54
|
+
|
55
|
+
validates_presence_of :state, :name, :max_workers
|
56
|
+
|
57
|
+
# States
|
58
|
+
# :starting -> :running -> :paused
|
59
|
+
# -> :stopping
|
60
|
+
aasm column: :state do
|
61
|
+
state :starting, initial: true
|
62
|
+
state :running
|
63
|
+
state :paused
|
64
|
+
state :stopping
|
65
|
+
|
66
|
+
event :started do
|
67
|
+
transitions from: :starting, to: :running
|
68
|
+
before do
|
69
|
+
self.started_at = Time.now
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
event :pause do
|
74
|
+
transitions from: :running, to: :paused
|
75
|
+
end
|
76
|
+
|
77
|
+
event :resume do
|
78
|
+
transitions from: :paused, to: :running
|
79
|
+
end
|
80
|
+
|
81
|
+
event :stop do
|
82
|
+
transitions from: :running, to: :stopping
|
83
|
+
transitions from: :paused, to: :stopping
|
84
|
+
transitions from: :starting, to: :stopping
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
# Requeue any jobs being worked by this server when it is destroyed
|
89
|
+
before_destroy :requeue_jobs
|
90
|
+
|
91
|
+
# Destroys all instances of zombie servers and requeues any jobs still "running"
|
92
|
+
# on those servers
|
93
|
+
def self.destroy_zombies
|
94
|
+
count = 0
|
95
|
+
each do |server|
|
96
|
+
next unless server.zombie?
|
97
|
+
logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
|
98
|
+
server.destroy
|
99
|
+
count += 1
|
100
|
+
end
|
101
|
+
count
|
102
|
+
end
|
103
|
+
|
104
|
+
# Stop all running, paused, or starting servers
|
105
|
+
def self.stop_all
|
106
|
+
where(:state.in => [:running, :paused, :starting]).each(&:stop!)
|
107
|
+
end
|
108
|
+
|
109
|
+
# Pause all running servers
|
110
|
+
def self.pause_all
|
111
|
+
running.each(&:pause!)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Resume all paused servers
|
115
|
+
def self.resume_all
|
116
|
+
paused.each(&:resume!)
|
117
|
+
end
|
118
|
+
|
119
|
+
# Returns [Hash<String:Integer>] of the number of servers in each state.
|
120
|
+
# Note: If there are no servers in that particular state then the hash will not have a value for it.
|
121
|
+
#
|
122
|
+
# Example servers in every state:
|
123
|
+
# RocketJob::Server.counts_by_state
|
124
|
+
# # => {
|
125
|
+
# :aborted => 1,
|
126
|
+
# :completed => 37,
|
127
|
+
# :failed => 1,
|
128
|
+
# :paused => 3,
|
129
|
+
# :queued => 4,
|
130
|
+
# :running => 1,
|
131
|
+
# :queued_now => 1,
|
132
|
+
# :scheduled => 3
|
133
|
+
# }
|
134
|
+
#
|
135
|
+
# Example no servers active:
|
136
|
+
# RocketJob::Server.counts_by_state
|
137
|
+
# # => {}
|
138
|
+
def self.counts_by_state
|
139
|
+
counts = {}
|
140
|
+
collection.aggregate([
|
141
|
+
{
|
142
|
+
'$group' => {
|
143
|
+
_id: '$state',
|
144
|
+
count: {'$sum' => 1}
|
145
|
+
}
|
146
|
+
}
|
147
|
+
]
|
148
|
+
).each do |result|
|
149
|
+
counts[result['_id'].to_sym] = result['count']
|
150
|
+
end
|
151
|
+
counts
|
152
|
+
end
|
153
|
+
|
154
|
+
# On MRI the 'concurrent-ruby-ext' gem may not be loaded
|
155
|
+
if defined?(Concurrent::JavaAtomicBoolean) || defined?(Concurrent::CAtomicBoolean)
|
156
|
+
# Returns [true|false] whether the shutdown indicator has been set for this server process
|
157
|
+
def self.shutdown?
|
158
|
+
@@shutdown.value
|
159
|
+
end
|
160
|
+
|
161
|
+
# Set shutdown indicator for this server process
|
162
|
+
def self.shutdown!
|
163
|
+
@@shutdown.make_true
|
164
|
+
end
|
165
|
+
|
166
|
+
@@shutdown = Concurrent::AtomicBoolean.new(false)
|
167
|
+
else
|
168
|
+
# Returns [true|false] whether the shutdown indicator has been set for this server process
|
169
|
+
def self.shutdown?
|
170
|
+
@@shutdown
|
171
|
+
end
|
172
|
+
|
173
|
+
# Set shutdown indicator for this server process
|
174
|
+
def self.shutdown!
|
175
|
+
@@shutdown = true
|
176
|
+
end
|
177
|
+
|
178
|
+
@@shutdown = false
|
179
|
+
end
|
180
|
+
|
181
|
+
# Run the server process
|
182
|
+
# Attributes supplied are passed to #new
|
183
|
+
def self.run(attrs = {})
|
184
|
+
Thread.current.name = 'rocketjob main'
|
185
|
+
# Create Indexes on server startup
|
186
|
+
Mongoid::Tasks::Database.create_indexes
|
187
|
+
register_signal_handlers
|
188
|
+
|
189
|
+
server = create!(attrs)
|
190
|
+
server.send(:run)
|
191
|
+
|
192
|
+
ensure
|
193
|
+
server.destroy if server
|
194
|
+
end
|
195
|
+
|
196
|
+
# Returns [Boolean] whether the server is shutting down
|
197
|
+
def shutdown?
|
198
|
+
self.class.shutdown? || !running?
|
199
|
+
end
|
200
|
+
|
201
|
+
# Returns [true|false] if this server has missed at least the last 4 heartbeats
|
202
|
+
#
|
203
|
+
# Possible causes for a server to miss its heartbeats:
|
204
|
+
# - The server process has died
|
205
|
+
# - The server process is "hanging"
|
206
|
+
# - The server is no longer able to communicate with the MongoDB Server
|
207
|
+
def zombie?(missed = 4)
|
208
|
+
return false unless running? || stopping?
|
209
|
+
return true if heartbeat.nil? || heartbeat.updated_at.nil?
|
210
|
+
dead_seconds = Config.instance.heartbeat_seconds * missed
|
211
|
+
(Time.now - heartbeat.updated_at) >= dead_seconds
|
212
|
+
end
|
213
|
+
|
214
|
+
private
|
215
|
+
|
216
|
+
attr_reader :workers
|
217
|
+
|
218
|
+
# Returns [Array<Worker>] collection of workers
|
219
|
+
def workers
|
220
|
+
@workers ||= []
|
221
|
+
end
|
222
|
+
|
223
|
+
# Management Thread
|
224
|
+
def run
|
225
|
+
logger.info "Using MongoDB Database: #{RocketJob::Job.collection.database.name}"
|
226
|
+
build_heartbeat(updated_at: Time.now, current_threads: 0)
|
227
|
+
started!
|
228
|
+
adjust_workers(true)
|
229
|
+
logger.info "RocketJob Server started with #{workers.size} workers running"
|
230
|
+
|
231
|
+
while running? || paused?
|
232
|
+
sleep Config.instance.heartbeat_seconds
|
233
|
+
|
234
|
+
find_and_update(
|
235
|
+
'heartbeat.updated_at' => Time.now,
|
236
|
+
'heartbeat.current_threads' => worker_count
|
237
|
+
)
|
238
|
+
|
239
|
+
# In case number of threads has been modified
|
240
|
+
adjust_workers
|
241
|
+
|
242
|
+
# Stop server if shutdown indicator was set
|
243
|
+
stop! if self.class.shutdown? && may_stop?
|
244
|
+
end
|
245
|
+
|
246
|
+
logger.info 'Waiting for workers to stop'
|
247
|
+
# Tell each worker to shutdown cleanly
|
248
|
+
workers.each(&:shutdown!)
|
249
|
+
|
250
|
+
while worker = workers.first
|
251
|
+
if worker.join(5)
|
252
|
+
# Worker thread is dead
|
253
|
+
workers.shift
|
254
|
+
else
|
255
|
+
# Timeout waiting for worker to stop
|
256
|
+
begin
|
257
|
+
find_and_update(
|
258
|
+
'heartbeat.updated_at' => Time.now,
|
259
|
+
'heartbeat.current_threads' => worker_count
|
260
|
+
)
|
261
|
+
rescue Mongoid::Errors::DocumentNotFound
|
262
|
+
logger.warn('Server has been destroyed. Going down hard!')
|
263
|
+
break
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
268
|
+
# Logs the backtrace for each running worker
|
269
|
+
if SemanticLogger::VERSION.to_i >= 4
|
270
|
+
workers.each { |thread| logger.backtrace(thread: thread) }
|
271
|
+
end
|
272
|
+
logger.info 'Shutdown'
|
273
|
+
rescue Exception => exc
|
274
|
+
logger.error('RocketJob::Server is stopping due to an exception', exc)
|
275
|
+
end
|
276
|
+
|
277
|
+
# Returns [Fixnum] number of workers (threads) that are alive
|
278
|
+
def worker_count
|
279
|
+
workers.count(&:alive?)
|
280
|
+
end
|
281
|
+
|
282
|
+
def next_worker_id
|
283
|
+
@worker_id ||= 0
|
284
|
+
@worker_id += 1
|
285
|
+
end
|
286
|
+
|
287
|
+
# Re-adjust the number of running workers to get it up to the
|
288
|
+
# required number of workers
|
289
|
+
# Parameters
|
290
|
+
# stagger_workers
|
291
|
+
# Whether to stagger when the workers poll for work the first time
|
292
|
+
# It spreads out the queue polling over the max_poll_seconds so
|
293
|
+
# that not all workers poll at the same time
|
294
|
+
# The workers also respond faster than max_poll_seconds when a new
|
295
|
+
# job is added.
|
296
|
+
def adjust_workers(stagger_workers=false)
|
297
|
+
count = worker_count
|
298
|
+
# Cleanup workers that have stopped
|
299
|
+
if count != workers.count
|
300
|
+
logger.info "Cleaning up #{workers.count - count} workers that went away"
|
301
|
+
workers.delete_if { |t| !t.alive? }
|
302
|
+
end
|
303
|
+
|
304
|
+
# Need to add more workers?
|
305
|
+
if count < max_workers
|
306
|
+
worker_count = max_workers - count
|
307
|
+
logger.info "Starting #{worker_count} workers"
|
308
|
+
worker_count.times.each do
|
309
|
+
sleep (Config.instance.max_poll_seconds.to_f / max_workers) * (next_worker_id - 1) if stagger_workers
|
310
|
+
return if shutdown?
|
311
|
+
# Start worker
|
312
|
+
begin
|
313
|
+
workers << Worker.new(id: next_worker_id, server_name: name)
|
314
|
+
rescue Exception => exc
|
315
|
+
logger.fatal('Cannot start worker', exc)
|
316
|
+
end
|
317
|
+
end
|
318
|
+
end
|
319
|
+
end
|
320
|
+
|
321
|
+
# Register handlers for the various signals
|
322
|
+
# Term:
|
323
|
+
# Perform clean shutdown
|
324
|
+
#
|
325
|
+
def self.register_signal_handlers
|
326
|
+
begin
|
327
|
+
Signal.trap 'SIGTERM' do
|
328
|
+
shutdown!
|
329
|
+
message = 'Shutdown signal (SIGTERM) received. Will shutdown as soon as active jobs/slices have completed.'
|
330
|
+
# Logging uses a mutex to access Queue on MRI/CRuby
|
331
|
+
defined?(JRuby) ? logger.warn(message) : puts(message)
|
332
|
+
end
|
333
|
+
|
334
|
+
Signal.trap 'INT' do
|
335
|
+
shutdown!
|
336
|
+
message = 'Shutdown signal (INT) received. Will shutdown as soon as active jobs/slices have completed.'
|
337
|
+
# Logging uses a mutex to access Queue on MRI/CRuby
|
338
|
+
defined?(JRuby) ? logger.warn(message) : puts(message)
|
339
|
+
end
|
340
|
+
rescue StandardError
|
341
|
+
logger.warn 'SIGTERM handler not installed. Not able to shutdown gracefully'
|
342
|
+
end
|
343
|
+
end
|
344
|
+
|
345
|
+
# Requeue any jobs assigned to this server when it is destroyed
|
346
|
+
def requeue_jobs
|
347
|
+
RocketJob::Job.requeue_dead_server(name)
|
348
|
+
end
|
349
|
+
|
350
|
+
end
|
351
|
+
end
|
352
|
+
|