rocketjob 4.0.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rocket_job/cli.rb +2 -2
- data/lib/rocket_job/event.rb +163 -0
- data/lib/rocket_job/jobs/housekeeping_job.rb +7 -7
- data/lib/rocket_job/plugins/transaction.rb +1 -1
- data/lib/rocket_job/rocket_job.rb +7 -0
- data/lib/rocket_job/server.rb +5 -356
- data/lib/rocket_job/server/model.rb +138 -0
- data/lib/rocket_job/server/state_machine.rb +60 -0
- data/lib/rocket_job/subscriber.rb +79 -0
- data/lib/rocket_job/subscribers/logger.rb +75 -0
- data/lib/rocket_job/subscribers/server.rb +71 -0
- data/lib/rocket_job/subscribers/worker.rb +61 -0
- data/lib/rocket_job/supervisor.rb +96 -0
- data/lib/rocket_job/supervisor/shutdown.rb +63 -0
- data/lib/rocket_job/version.rb +1 -1
- data/lib/rocket_job/worker.rb +41 -31
- data/lib/rocket_job/worker_pool.rb +103 -0
- data/lib/rocketjob.rb +17 -7
- metadata +15 -6
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'active_support/concern'
|
3
|
+
|
4
|
+
module RocketJob
  class Server
    # Model attributes
    #
    # Persistence concern for a server document: declares the stored fields, the
    # unique index on `name`, validations, zombie detection and the heartbeat refresh.
    module Model
      extend ActiveSupport::Concern

      included do
        store_in collection: 'rocket_job.servers'

        # Unique Name of this server instance
        #   Default: `host name:PID`
        # The unique name is used on re-start to re-queue any jobs that were being processed
        # at the time the server unexpectedly terminated, if any
        field :name, type: String, default: -> { "#{SemanticLogger.host}:#{$$}" }

        # The maximum number of workers this server should start
        #   If set, it will override the default value in RocketJob::Config
        field :max_workers, type: Integer, default: -> { Config.instance.max_workers }

        # When this server process was started
        field :started_at, type: Time

        # Filter to apply to control which job classes this server can process.
        # Stored as a YAML string; use #filter and #filter= to read and write it.
        field :yaml_filter, type: String

        # The heartbeat information for this server
        embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'

        # Current state
        #   Internal use only. Do not set this field directly
        field :state, type: Symbol, default: :starting

        index({name: 1}, background: true, unique: true, drop_dups: true)

        validates_presence_of :state, :name, :max_workers

        # Requeue any jobs being worked by this server when it is destroyed
        before_destroy :requeue_jobs

        # Returns [Hash<Symbol:Integer>] of the number of servers in each state.
        # Note: If there are no servers in that particular state then the hash will not have a value for it.
        #
        # Example servers in several states:
        #   RocketJob::Server.counts_by_state
        #   # => {
        #   #      :starting => 1,
        #   #      :running  => 37,
        #   #      :paused   => 3,
        #   #      :stopping => 1
        #   #    }
        #
        # Example no servers active:
        #   RocketJob::Server.counts_by_state
        #   # => {}
        def self.counts_by_state
          pipeline = [{'$group' => {_id: '$state', count: {'$sum' => 1}}}]
          collection.aggregate(pipeline).each_with_object({}) do |result, counts|
            counts[result['_id'].to_sym] = result['count']
          end
        end

        # Destroys all instances of zombie servers and requeues any jobs still "running"
        # on those servers.
        #
        # Returns [Integer] the number of servers that were destroyed.
        def self.destroy_zombies
          count = 0
          each do |server|
            next unless server.zombie?

            logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
            server.destroy
            count += 1
          end
          count
        end

        # Scope for all zombie servers
        #
        # Parameters
        #   missed [Integer]
        #     Number of consecutive heartbeats a server must have missed to be
        #     considered a zombie. Default: 4
        #
        # Matches servers that are stopping, running or paused and that either have no
        # heartbeat timestamp, or whose last heartbeat is at least
        # `heartbeat_seconds * missed` seconds old.
        def self.zombies(missed = 4)
          dead_seconds        = Config.instance.heartbeat_seconds * missed
          last_heartbeat_time = Time.now - dead_seconds
          where(
            :state.in => %i[stopping running paused],
            '$or'     => [
              {'heartbeat.updated_at' => {'$exists' => false}},
              {'heartbeat.updated_at' => {'$lte' => last_heartbeat_time}}
            ]
          )
        end
      end

      # Where clause filter to apply to workers looking for jobs
      #
      # Returns the object deserialized from `yaml_filter`, or nil when no filter is set.
      #
      # NOTE(review): `YAML.load` can instantiate arbitrary objects on older Psych versions.
      # It is only safe while `yaml_filter` is written exclusively through #filter=.
      # Confirm that before exposing this field to untrusted input.
      def filter
        YAML.load(yaml_filter) if yaml_filter
      end

      # Sets the filter by serializing the supplied hash to YAML.
      # Passing nil clears the filter.
      def filter=(hash)
        self.yaml_filter = hash.nil? ? nil : hash.to_yaml
      end

      # Returns [true|false] if this server has missed at least the last 4 heartbeats
      #
      # Parameters
      #   missed [Integer]
      #     Number of missed heartbeats after which the server is a zombie. Default: 4
      #
      # Only servers that are running, stopping or paused can be zombies.
      # A server in one of those states without a heartbeat timestamp is always a zombie.
      #
      # Possible causes for a server to miss its heartbeats:
      # - The server process has died
      # - The server process is "hanging"
      # - The server is no longer able to communicate with the MongoDB Server
      def zombie?(missed = 4)
        return false unless running? || stopping? || paused?
        return true if heartbeat.nil? || heartbeat.updated_at.nil?

        dead_seconds = Config.instance.heartbeat_seconds * missed
        (Time.now - heartbeat.updated_at) >= dead_seconds
      end

      # Updates the heartbeat and returns a refreshed server instance.
      #
      # Parameters
      #   worker_count
      #     Value stored in `heartbeat.workers` alongside the new timestamp.
      #
      # The update runs inside `SemanticLogger.silence(:info)`.
      def refresh(worker_count)
        SemanticLogger.silence(:info) do
          find_and_update(
            'heartbeat.updated_at' => Time.now,
            'heartbeat.workers'    => worker_count
          )
        end
      end

      private

      # Requeue any jobs assigned to this server when it is destroyed
      def requeue_jobs
        RocketJob::Job.requeue_dead_server(name)
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'active_support/concern'
|
2
|
+
|
3
|
+
module RocketJob
  class Server
    # State machine for servers
    #
    # Declares the server states and the events that move a server between them,
    # plus class level helpers to stop, pause or resume many servers at once.
    module StateMachine
      extend ActiveSupport::Concern

      included do
        # States
        #   :starting -> :running <-> :paused
        #   :starting | :running | :paused -> :stopping
        aasm column: :state, whiny_persistence: true do
          state :starting, initial: true
          state :running
          state :paused
          state :stopping

          # The server has finished starting.
          # Records the start time and creates the initial heartbeat with zero workers.
          event :started do
            transitions from: :starting, to: :running
            before do
              self.started_at = Time.now
              build_heartbeat(updated_at: Time.now, workers: 0)
            end
          end

          event :pause do
            transitions from: :running, to: :paused
          end

          event :resume do
            transitions from: :paused, to: :running
          end

          # A server can be stopped from any other state.
          event :stop do
            transitions from: :running, to: :stopping
            transitions from: :paused, to: :stopping
            transitions from: :starting, to: :stopping
          end
        end

        # Stop all running, paused, or starting servers
        def self.stop_all
          where(:state.in => %i[running paused starting]).each(&:stop!)
        end

        # Pause all running servers
        def self.pause_all
          running.each(&:pause!)
        end

        # Resume all paused servers
        def self.resume_all
          paused.each(&:resume!)
        end
      end
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require 'active_support/concern'
|
2
|
+
|
3
|
+
module RocketJob
  # Mix-in to publish and subscribe to events.
  #
  # Example:
  #   class MySubscriber
  #     include RocketJob::Subscriber
  #
  #     def hello
  #       logger.info "Hello Action Received"
  #     end
  #
  #     def show(message:)
  #       logger.info "Received: #{message}"
  #     end
  #
  #     # If `message` is not supplied it defaults to "Hello World"
  #     def show_default(message: "Hello World")
  #       logger.info "Received: #{message}"
  #     end
  #   end
  #
  #   MySubscriber.subscribe
  module Subscriber
    extend ActiveSupport::Concern

    # Test Mode
    #   Bypasses publishing the event and calls the subscribers directly
    def self.test_mode!
      @test_mode = true
    end

    # Returns [true|false] whether test mode has been enabled.
    def self.test_mode?
      @test_mode
    end

    @test_mode = false

    included do
      include SemanticLogger::Loggable

      # Name of the event published and subscribed to
      #   Defaults to the name of the including class.
      class_attribute :event_name, instance_accessor: false
      self.event_name = name

      # Publish an event for this subscriber class.
      #
      # Parameters
      #   action
      #     Name of a public instance method on this class.
      #   parameters
      #     Keyword arguments stored with the event.
      #
      # Raises ArgumentError when `action` is not a public instance method.
      # Raises NotImplementedError when this class subscribes to all events.
      #
      # In test mode the event is processed directly, otherwise it is saved.
      def self.publish(action, **parameters)
        raise(ArgumentError, "Invalid action: #{action}") unless public_method_defined?(action)
        if event_name == Event::ALL_EVENTS
          raise(NotImplementedError, "Cannot publish to an all events subscriber: event_name='#{Event::ALL_EVENTS}'")
        end

        event = Event.new(name: event_name, action: action, parameters: parameters)
        Subscriber.test_mode? ? Event.process_event(event) : event.save!
      end

      # Create an instance of this subscriber with the supplied arguments and
      # register it with Event. The optional block is passed on to Event.subscribe.
      def self.subscribe(*args, &block)
        instance = new(*args)
        Event.subscribe(instance, &block)
      end
    end

    # Call the public method named by `action` on this subscriber.
    #
    # - Unknown actions are logged as a warning and ignored.
    # - Parameters are passed as keyword arguments, unless the method takes no
    #   arguments or `parameters` is nil.
    # - Any StandardError raised by the action is logged and not re-raised.
    def process_action(action, parameters)
      unless public_methods.include?(action)
        logger.warn("Ignoring unknown action: #{action}")
        return
      end

      args = (method(action).arity == 0) || parameters.nil? ? nil : parameters.symbolize_keys
      args ? public_send(action, **args) : public_send(action)
    rescue StandardError => exc
      logger.error('Exception calling subscriber. Resuming..', exc)
    end

    # Always raises NotImplementedError in this mix-in.
    # NOTE(review): presumably overridden by subscribers of all events; confirm against Event.
    def process_event(name, action, parameters)
      raise(NotImplementedError)
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'socket'
|
2
|
+
|
3
|
+
module RocketJob
  module Subscribers
    # Subscriber to change log levels and to dump thread backtraces,
    # optionally limited to one host and/or one process id.
    class Logger
      include RocketJob::Subscriber

      # Host name that events are matched against.
      #   Defaults to the value returned by `Socket.gethostname`.
      def self.host_name
        @host_name ||= Socket.gethostname
      end

      # Override the host name that events are matched against.
      def self.host_name=(host_name)
        @host_name = host_name
      end

      # Change the log level
      #
      # Examples:
      #   # Change the global log level to :trace on all servers.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace)
      #
      #   # Change the global log level to :trace on one server.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com')
      #
      #   # Change the global log level to :trace for a specific process id.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com', pid: 34567)
      #
      #   # Change the log level for a specific class to :trace.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, class_name: 'RocketJob::Supervisor')
      def set(level: :info, class_name: nil, host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        if class_name
          class_name.constantize.logger.level = level
          logger.info "Changed log level to #{level} for #{class_name}"
        else
          SemanticLogger.default_level = level
          logger.info "Changed global log level to #{level}"
        end
      end

      # Dump all backtraces to the log file.
      #
      # Examples:
      #   # Thread dump on all servers:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump)
      #
      #   # Thread dump on one server:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com')
      #
      #   # Thread dump for a specific process id:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com', pid: 34567)
      def thread_dump(host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        Thread.list.each do |thread|
          # The thread handling this event is skipped.
          next if thread == Thread.current

          logger.backtrace(thread: thread)
        end
      end

      private

      # Returns [true|false] whether an event addressed to `host_name` and/or `pid`
      # applies to this process. A nil value matches any host or any process.
      def for_me?(host_name, pid)
        return true if host_name.nil? && pid.nil?

        return false if host_name && (host_name != self.class.host_name)
        return false if pid && (pid != $$)

        true
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module RocketJob
  module Subscribers
    # Subscriber that applies server level events to the supervisor in this process.
    #
    # Every action accepts an optional `server_id:`.
    # - nil: the action applies to every server that receives the event.
    # - otherwise: only the server whose id equals `server_id` acts on it.
    class Server
      include RocketJob::Subscriber

      attr_reader :supervisor

      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Stop the worker pool, wait up to `wait_timeout` for it, kill it,
      # and then shut the supervisor down.
      def kill(server_id: nil, wait_timeout: 3)
        if my_server?(server_id)
          supervisor.synchronize do
            pool = supervisor.worker_pool
            pool.stop
            pool.join(wait_timeout)
            pool.kill
          end

          Supervisor.shutdown!
          logger.info 'Killed'
        end
      end

      # Pause the server when its state machine allows it, then signal the supervisor.
      def pause(server_id: nil)
        if my_server?(server_id)
          supervisor.synchronize do
            server = supervisor.server
            server.pause! if server.may_pause?
          end
          Supervisor.event!
          logger.info 'Paused'
        end
      end

      # Signal the supervisor so that it refreshes.
      def refresh(server_id: nil)
        if my_server?(server_id)
          Supervisor.event!
          logger.info 'Refreshed'
        end
      end

      # Resume the server when its state machine allows it, then signal the supervisor.
      def resume(server_id: nil)
        if my_server?(server_id)
          supervisor.synchronize do
            server = supervisor.server
            server.resume! if server.may_resume?
          end
          Supervisor.event!
          logger.info 'Resumed'
        end
      end

      # Request a shutdown of the supervisor.
      def stop(server_id: nil)
        if my_server?(server_id)
          Supervisor.shutdown!
          logger.info 'Shutdown'
        end
      end

      # Write the backtraces of the worker pool to the log.
      def thread_dump(server_id: nil)
        if my_server?(server_id)
          logger.info 'Thread dump'
          supervisor.worker_pool.log_backtraces
        end
      end

      private

      # An event without a server id is for every server,
      # otherwise the id has to match this supervisor's server.
      def my_server?(server_id)
        server_id.nil? || server_id == supervisor.server.id
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module RocketJob
  module Subscribers
    # Subscriber that applies worker level events to a single worker
    # in the worker pool of this process's supervisor.
    #
    # Every action requires both `server_id:` and `worker_id:`. It is ignored unless
    # the server id matches and the worker is found in the pool and is alive.
    class Worker
      include RocketJob::Subscriber

      attr_reader :supervisor

      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Ask the worker to shut down, wait up to `wait_timeout` for it, then kill it.
      def kill(server_id:, worker_id:, wait_timeout: 3)
        target = worker_for(server_id, worker_id)
        return unless target

        target.shutdown!
        target.join(wait_timeout)
        target.kill

        logger.info 'Killed'
      end

      # Ask the worker to shut down.
      def stop(server_id:, worker_id:)
        target = worker_for(server_id, worker_id)
        return unless target

        target.shutdown!
        logger.info "Stopped Worker: #{worker_id}"
      end

      # Write the backtrace of the worker's thread to the log.
      def thread_dump(server_id:, worker_id:)
        target = worker_for(server_id, worker_id)
        return unless target

        logger.info "Thread dump Worker: #{worker_id}"
        thread = target.thread
        logger.backtrace(thread: thread) if thread && target.alive?
      end

      private

      # Returns the live worker addressed by the event,
      # or nil when the event is for another server or no such live worker exists.
      def worker_for(server_id, worker_id)
        my_server?(server_id) ? locate_worker(worker_id) : nil
      end

      def my_server?(server_id)
        server_id == supervisor.server.id
      end

      # Returns the worker with the supplied id when it is in the pool and alive, otherwise nil.
      def locate_worker(worker_id)
        if worker_id
          candidate = supervisor.worker_pool.find(worker_id)
          candidate if candidate&.alive?
        end
      end
    end
  end
end
|