rocketjob 4.0.0 → 4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rocket_job/cli.rb +2 -2
- data/lib/rocket_job/event.rb +163 -0
- data/lib/rocket_job/jobs/housekeeping_job.rb +7 -7
- data/lib/rocket_job/plugins/transaction.rb +1 -1
- data/lib/rocket_job/rocket_job.rb +7 -0
- data/lib/rocket_job/server.rb +5 -356
- data/lib/rocket_job/server/model.rb +138 -0
- data/lib/rocket_job/server/state_machine.rb +60 -0
- data/lib/rocket_job/subscriber.rb +79 -0
- data/lib/rocket_job/subscribers/logger.rb +75 -0
- data/lib/rocket_job/subscribers/server.rb +71 -0
- data/lib/rocket_job/subscribers/worker.rb +61 -0
- data/lib/rocket_job/supervisor.rb +96 -0
- data/lib/rocket_job/supervisor/shutdown.rb +63 -0
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +41 -31
- data/lib/rocket_job/worker_pool.rb +103 -0
- data/lib/rocketjob.rb +17 -7
- metadata +15 -6
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'active_support/concern'
|
3
|
+
|
4
|
+
module RocketJob
  class Server
    # Model attributes, indexes, validations and query helpers for a server.
    module Model
      extend ActiveSupport::Concern

      included do
        store_in collection: 'rocket_job.servers'

        # Unique Name of this server instance
        #   Default: `host name:PID`
        # The unique name is used on re-start to re-queue any jobs that were being processed
        # at the time the server unexpectedly terminated, if any
        field :name, type: String, default: -> { "#{SemanticLogger.host}:#{$$}" }

        # The maximum number of workers this server should start
        #   If set, it will override the default value in RocketJob::Config
        field :max_workers, type: Integer, default: -> { Config.instance.max_workers }

        # When this server process was started
        field :started_at, type: Time

        # Filter to apply to control which job classes this server can process.
        # Held as YAML text; use #filter and #filter= to read and write it as a Hash.
        field :yaml_filter, type: String

        # The heartbeat information for this server
        embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'

        # Current state
        #   Internal use only. Do not set this field directly
        field :state, type: Symbol, default: :starting

        index({name: 1}, background: true, unique: true, drop_dups: true)

        validates_presence_of :state, :name, :max_workers

        # Requeue any jobs being worked by this server when it is destroyed
        before_destroy :requeue_jobs

        # Returns [Hash<Symbol:Integer>] of the number of servers in each state.
        # Note: If there are no servers in that particular state then the hash will not have a value for it.
        #
        # Example servers in every state:
        #   RocketJob::Server.counts_by_state
        #   # => {
        #          :starting => 1,
        #          :running  => 37,
        #          :paused   => 3,
        #          :stopping => 1
        #        }
        #
        # Example no servers active:
        #   RocketJob::Server.counts_by_state
        #   # => {}
        def self.counts_by_state
          counts = {}
          collection.aggregate([{'$group' => {_id: '$state', count: {'$sum' => 1}}}]).each do |result|
            counts[result['_id'].to_sym] = result['count']
          end
          counts
        end

        # Destroys all zombie servers and requeues any jobs still "running"
        # on those servers (via the `before_destroy` callback).
        #
        # Returns [Integer] the number of servers that were destroyed.
        def self.destroy_zombies
          count = 0
          each do |server|
            next unless server.zombie?

            logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
            server.destroy
            count += 1
          end
          count
        end

        # Scope for all zombie servers.
        #
        # A zombie is a server in the stopping, running or paused state whose
        # heartbeat timestamp is missing, or is at least `missed` heartbeat
        # intervals old.
        def self.zombies(missed = 4)
          dead_seconds        = Config.instance.heartbeat_seconds * missed
          last_heartbeat_time = Time.now - dead_seconds
          where(
            :state.in => %i[stopping running paused],
            '$or'     => [
              {'heartbeat.updated_at' => {'$exists' => false}},
              {'heartbeat.updated_at' => {'$lte' => last_heartbeat_time}}
            ]
          )
        end
      end

      # Where clause filter to apply to workers looking for jobs.
      #
      # Returns the deserialized `yaml_filter`, or nil when no filter is set.
      def filter
        return unless yaml_filter

        # Psych 4 turned `YAML.load` into a restricted load that raises on most
        # non-basic classes, while the filter is serialized with plain `to_yaml`
        # and may hold such values (for example a Regexp). Use the unrestricted
        # loader where it exists so previously stored filters keep loading.
        # NOTE(review): this assumes the servers collection only holds trusted data.
        YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml_filter) : YAML.load(yaml_filter)
      end

      # Stores the supplied filter hash as YAML, or clears the filter when nil.
      def filter=(hash)
        self.yaml_filter = hash.nil? ? nil : hash.to_yaml
      end

      # Returns [true|false] whether this server has missed at least the last
      # `missed` heartbeats (default: 4).
      #
      # Only running, stopping or paused servers can be zombies. Such a server
      # without a heartbeat timestamp is always considered a zombie.
      #
      # Possible causes for a server to miss its heartbeats:
      # - The server process has died
      # - The server process is "hanging"
      # - The server is no longer able to communicate with the MongoDB Server
      def zombie?(missed = 4)
        return false unless running? || stopping? || paused?
        return true if heartbeat.nil? || heartbeat.updated_at.nil?

        dead_seconds = Config.instance.heartbeat_seconds * missed
        (Time.now - heartbeat.updated_at) >= dead_seconds
      end

      # Updates the heartbeat timestamp and worker count, returning the result
      # of `find_and_update` (described upstream as a refreshed server instance).
      # The update runs inside `SemanticLogger.silence(:info)`.
      def refresh(worker_count)
        SemanticLogger.silence(:info) do
          find_and_update(
            'heartbeat.updated_at' => Time.now,
            'heartbeat.workers'    => worker_count
          )
        end
      end

      private

      # Requeue any jobs assigned to this server when it is destroyed
      def requeue_jobs
        RocketJob::Job.requeue_dead_server(name)
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'active_support/concern'
|
2
|
+
|
3
|
+
module RocketJob
  class Server
    # State machine for a server.
    #
    # Permitted transitions:
    #   started: :starting -> :running
    #   pause:   :running  -> :paused
    #   resume:  :paused   -> :running
    #   stop:    :running, :paused or :starting -> :stopping
    module StateMachine
      extend ActiveSupport::Concern

      included do
        aasm column: :state, whiny_persistence: true do
          state :starting, initial: true
          state :running
          state :paused
          state :stopping

          # Record the start time and create the initial heartbeat as the
          # server moves from :starting to :running.
          event :started do
            transitions(from: :starting, to: :running)
            before do
              self.started_at = Time.now
              build_heartbeat(updated_at: Time.now, workers: 0)
            end
          end

          event(:pause) { transitions(from: :running, to: :paused) }

          event(:resume) { transitions(from: :paused, to: :running) }

          event(:stop) { transitions(from: %i[running paused starting], to: :stopping) }
        end

        # Stop all running, paused, or starting servers
        def self.stop_all
          stoppable = where(:state.in => %i[running paused starting])
          stoppable.each { |server| server.stop! }
        end

        # Pause all running servers
        def self.pause_all
          running.each { |server| server.pause! }
        end

        # Resume all paused servers
        def self.resume_all
          paused.each { |server| server.resume! }
        end
      end
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'active_support/concern'
|
2
|
+
|
3
|
+
module RocketJob
  # Mix-in to publish and subscribe to events.
  #
  # Every public instance method of the including class is an action that can
  # be published with `.publish` and is invoked on subscribers through
  # `#process_action`.
  #
  # Example:
  #   class MySubscriber
  #     include RocketJob::Subscriber
  #
  #     def hello
  #       logger.info "Hello Action Received"
  #     end
  #
  #     def show(message:)
  #       logger.info "Received: #{message}"
  #     end
  #
  #     # If `message` is not supplied it defaults to "Hello World"
  #     def show_default(message: "Hello World")
  #       logger.info "Received: #{message}"
  #     end
  #   end
  #
  #   MySubscriber.subscribe
  module Subscriber
    extend ActiveSupport::Concern

    @test_mode = false

    # Test Mode
    #   Bypasses publishing the event and calls the subscribers directly
    def self.test_mode!
      @test_mode = true
    end

    # Whether test mode has been switched on.
    def self.test_mode?
      @test_mode
    end

    included do
      include SemanticLogger::Loggable

      # Name of the event published and subscribed to.
      # Defaults to the name of the including class.
      class_attribute :event_name, instance_accessor: false
      self.event_name = name

      # Publish `action` with the supplied keyword parameters.
      #
      # Raises ArgumentError when `action` is not a public instance method of
      # this class, and NotImplementedError when this class subscribes to all
      # events. In test mode the event is processed directly instead of saved.
      def self.publish(action, **parameters)
        unless public_method_defined?(action)
          raise(ArgumentError, "Invalid action: #{action}")
        end

        if event_name == Event::ALL_EVENTS
          raise(NotImplementedError, "Cannot publish to an all events subscriber: event_name='#{Event::ALL_EVENTS}'")
        end

        event = Event.new(name: event_name, action: action, parameters: parameters)
        if Subscriber.test_mode?
          Event.process_event(event)
        else
          event.save!
        end
      end

      # Create a subscriber instance from `args` and register it with Event.
      def self.subscribe(*args, &block)
        Event.subscribe(new(*args), &block)
      end
    end

    # Invoke the public method named `action` on this subscriber.
    #
    # Parameters are passed as keyword arguments, unless the method takes no
    # arguments or no parameters were supplied. Unknown actions are logged and
    # ignored. Any StandardError is logged rather than propagated.
    def process_action(action, parameters)
      if public_methods.include?(action)
        if method(action).arity.zero? || parameters.nil?
          public_send(action)
        else
          public_send(action, **parameters.symbolize_keys)
        end
      else
        logger.warn("Ignoring unknown action: #{action}")
        nil
      end
    rescue StandardError => exc
      logger.error('Exception calling subscriber. Resuming..', exc)
    end

    # Always raises NotImplementedError in this mix-in.
    def process_event(name, action, parameters)
      raise NotImplementedError
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'socket'
|
2
|
+
|
3
|
+
module RocketJob
  module Subscribers
    # Subscriber for changing log levels and dumping thread backtraces,
    # optionally restricted to one host name and/or process id.
    class Logger
      include RocketJob::Subscriber

      class << self
        # Allows the host name used for matching to be overridden.
        attr_writer :host_name

        # Host name that events are matched against.
        # Defaults to the host name reported by Socket.
        def host_name
          @host_name ||= Socket.gethostname
        end
      end

      # Change the log level
      #
      # Examples:
      #   # Change the global log level to :trace on all servers.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace)
      #
      #   # Change the global log level to :trace on one server.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com')
      #
      #   # Change the global log level to :trace for a specific process id.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com', pid: 34567)
      #
      #   # Change the log level for a specific class to :trace.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, class_name: 'RocketJob::Supervisor')
      def set(level: :info, class_name: nil, host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        unless class_name
          SemanticLogger.default_level = level
          return logger.info("Changed global log level to #{level}")
        end

        class_name.constantize.logger.level = level
        logger.info("Changed log level to #{level} for #{class_name}")
      end

      # Dump all backtraces to the log file.
      #
      # Examples:
      #   # Thread dump on all servers:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump)
      #
      #   # Thread dump on one server.
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com')
      #
      #   # Thread dump for a specific process id.
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com', pid: 34567)
      def thread_dump(host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        # The thread handling this event is skipped.
        Thread.list.each do |thread|
          logger.backtrace(thread: thread) unless thread == Thread.current
        end
      end

      private

      # Whether an event addressed to `host_name` / `pid` applies to this
      # process. A criterion that was not supplied matches every process.
      def for_me?(host_name, pid)
        host_matches = !host_name || host_name == self.class.host_name
        host_matches && (!pid || pid == Process.pid)
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module RocketJob
  module Subscribers
    # Subscriber for server level events: kill, pause, refresh, resume, stop
    # and thread_dump.
    #
    # Every action takes an optional `server_id`. When omitted the action
    # applies to this server; otherwise only when it matches the supervisor's
    # server id.
    class Server
      include RocketJob::Subscriber

      attr_reader :supervisor

      # supervisor: the supervisor whose server and worker pool are acted upon.
      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Stop the worker pool, wait up to `wait_timeout` for it to finish,
      # kill it, and then shut down the supervisor.
      def kill(server_id: nil, wait_timeout: 3)
        return unless my_server?(server_id)

        supervisor.synchronize do
          pool = supervisor.worker_pool
          pool.stop
          pool.join(wait_timeout)
          pool.kill
        end

        Supervisor.shutdown!
        logger.info('Killed')
      end

      # Pause this server if its state allows it, then signal the supervisor.
      def pause(server_id: nil)
        return unless my_server?(server_id)

        supervisor.synchronize do
          server = supervisor.server
          server.pause! if server.may_pause?
        end
        Supervisor.event!
        logger.info('Paused')
      end

      # Signal the supervisor that an event has occurred.
      def refresh(server_id: nil)
        return unless my_server?(server_id)

        Supervisor.event!
        logger.info('Refreshed')
      end

      # Resume this server if its state allows it, then signal the supervisor.
      def resume(server_id: nil)
        return unless my_server?(server_id)

        supervisor.synchronize do
          server = supervisor.server
          server.resume! if server.may_resume?
        end
        Supervisor.event!
        logger.info('Resumed')
      end

      # Request a shutdown of the supervisor.
      def stop(server_id: nil)
        return unless my_server?(server_id)

        Supervisor.shutdown!
        logger.info('Shutdown')
      end

      # Log the backtraces of the worker pool.
      def thread_dump(server_id: nil)
        return unless my_server?(server_id)

        logger.info('Thread dump')
        supervisor.worker_pool.log_backtraces
      end

      private

      # True when no server id was supplied, or it is the id of this server.
      def my_server?(server_id)
        server_id.nil? || server_id == supervisor.server.id
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module RocketJob
  module Subscribers
    # Subscriber for worker level events: kill, stop and thread_dump.
    #
    # Every action requires the `server_id` and `worker_id` of its target, and
    # is ignored unless the server id matches this supervisor's server and a
    # live worker with that id is found in its worker pool.
    class Worker
      include RocketJob::Subscriber

      attr_reader :supervisor

      # supervisor: the supervisor whose worker pool is searched for workers.
      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Ask the worker to shut down, wait up to `wait_timeout` for it, then kill it.
      def kill(server_id:, worker_id:, wait_timeout: 3)
        worker = worker_for(server_id, worker_id)
        return unless worker

        worker.shutdown!
        worker.join(wait_timeout)
        worker.kill

        logger.info('Killed')
      end

      # Ask the worker to shut down.
      def stop(server_id:, worker_id:)
        worker = worker_for(server_id, worker_id)
        return unless worker

        worker.shutdown!
        logger.info("Stopped Worker: #{worker_id}")
      end

      # Log the backtrace of the worker's thread.
      def thread_dump(server_id:, worker_id:)
        worker = worker_for(server_id, worker_id)
        return unless worker

        logger.info("Thread dump Worker: #{worker_id}")
        logger.backtrace(thread: worker.thread) if worker.thread && worker.alive?
      end

      private

      # Returns the live worker addressed by the event, or nil when the event
      # is for another server or no such live worker exists.
      def worker_for(server_id, worker_id)
        return unless my_server?(server_id)

        locate_worker(worker_id)
      end

      # Whether `server_id` is the id of this supervisor's server.
      def my_server?(server_id)
        server_id == supervisor.server.id
      end

      # Returns the worker with `worker_id` from the worker pool when it is
      # alive, otherwise nil.
      def locate_worker(worker_id)
        candidate = supervisor.worker_pool.find(worker_id) if worker_id
        candidate if candidate&.alive?
      end
    end
  end
end
|