rocketjob 4.0.0 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,138 @@
1
+ require 'yaml'
2
+ require 'active_support/concern'
3
+
4
module RocketJob
  class Server
    # Persisted attributes, index, validations and zombie handling for a server.
    module Model
      extend ActiveSupport::Concern

      included do
        store_in collection: 'rocket_job.servers'

        # Unique name of this server instance.
        # Default: `host name:PID`
        # On re-start the unique name is used to re-queue any jobs that were being
        # processed at the time the server unexpectedly terminated, if any.
        field :name, type: String, default: -> { "#{SemanticLogger.host}:#{Process.pid}" }

        # The maximum number of workers this server should start.
        # When not set explicitly it defaults to the value in RocketJob::Config.
        field :max_workers, type: Integer, default: -> { Config.instance.max_workers }

        # When this server process was started.
        field :started_at, type: Time

        # YAML serialized filter that controls which job classes this server can process.
        # Read and written through #filter and #filter=.
        field :yaml_filter, type: String

        # The heartbeat information for this server.
        embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'

        # Current state.
        # Internal use only. Do not set this field directly.
        field :state, type: Symbol, default: :starting

        index({name: 1}, background: true, unique: true, drop_dups: true)

        validates_presence_of :state, :name, :max_workers

        # Requeue any jobs being worked by this server when it is destroyed.
        before_destroy :requeue_jobs

        # Returns [Hash<Symbol:Integer>] of the number of servers in each state.
        # Note: A state that currently has no servers does not appear in the hash.
        #
        # Example servers in several states:
        #   RocketJob::Server.counts_by_state
        #   # => {
        #     :starting => 1,
        #     :running  => 37,
        #     :paused   => 3,
        #     :stopping => 1
        #   }
        #
        # Example no servers active:
        #   RocketJob::Server.counts_by_state
        #   # => {}
        def self.counts_by_state
          pipeline = [{'$group' => {_id: '$state', count: {'$sum' => 1}}}]
          collection.aggregate(pipeline).each_with_object({}) do |row, totals|
            totals[row['_id'].to_sym] = row['count']
          end
        end

        # Destroys every zombie server and requeues any jobs still "running"
        # on those servers (via the before_destroy callback).
        #
        # Returns [Integer] the number of servers destroyed.
        def self.destroy_zombies
          destroyed = 0
          each do |server|
            if server.zombie?
              logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
              server.destroy
              destroyed += 1
            end
          end
          destroyed
        end

        # Scope for all zombie servers.
        #
        # A server matches when it is stopping, running or paused, and its
        # heartbeat is either absent or at least `missed` heartbeat intervals old.
        def self.zombies(missed = 4)
          silent_seconds = Config.instance.heartbeat_seconds * missed
          cutoff         = Time.now - silent_seconds
          where(
            :state.in => %i[stopping running paused],
            '$or'     => [
              {'heartbeat.updated_at' => {'$exists' => false}},
              {'heartbeat.updated_at' => {'$lte' => cutoff}}
            ]
          )
        end
      end

      # Where clause filter to apply to workers looking for jobs.
      # Returns nil when no filter has been stored.
      #
      # NOTE(review): YAML.load may instantiate arbitrary objects depending on the
      # Psych version in use; confirm yaml_filter is only ever written through #filter=.
      def filter
        return unless yaml_filter

        YAML.load(yaml_filter)
      end

      # Stores the supplied filter as YAML, or clears it when given nil.
      def filter=(hash)
        self.yaml_filter = hash&.to_yaml
      end

      # Returns [true|false] whether this server has missed at least the last
      # `missed` heartbeats (default: 4).
      #
      # Only servers that are running, stopping or paused can be zombies.
      # A server in one of those states without a heartbeat timestamp is a zombie.
      #
      # Possible causes for a server to miss its heartbeats:
      # - The server process has died
      # - The server process is "hanging"
      # - The server is no longer able to communicate with the MongoDB Server
      def zombie?(missed = 4)
        return false unless running? || stopping? || paused?

        last_beat = heartbeat&.updated_at
        return true if last_beat.nil?

        silent_seconds = Config.instance.heartbeat_seconds * missed
        (Time.now - last_beat) >= silent_seconds
      end

      # Updates the heartbeat timestamp and worker count, and returns the result
      # of find_and_update. Logging is silenced to :info for the duration.
      def refresh(worker_count)
        SemanticLogger.silence(:info) do
          find_and_update(
            'heartbeat.updated_at' => Time.now,
            'heartbeat.workers'    => worker_count
          )
        end
      end

      private

      # Requeue any jobs assigned to this server when it is destroyed.
      def requeue_jobs
        RocketJob::Job.requeue_dead_server(name)
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'active_support/concern'
2
+
3
module RocketJob
  class Server
    # State machine for servers.
    module StateMachine
      extend ActiveSupport::Concern

      included do
        # States
        #   :starting -> :running <-> :paused
        #   :starting | :running | :paused -> :stopping
        aasm column: :state, whiny_persistence: true do
          state :starting, initial: true
          state :running
          state :paused
          state :stopping

          # The server has finished starting: record the start time and
          # create its first heartbeat with zero workers.
          event :started do
            transitions from: :starting, to: :running
            before do
              self.started_at = Time.now
              build_heartbeat(updated_at: Time.now, workers: 0)
            end
          end

          event :pause do
            transitions from: :running, to: :paused
          end

          event :resume do
            transitions from: :paused, to: :running
          end

          # A server can be stopped from any state other than :stopping.
          event :stop do
            transitions from: %i[running paused starting], to: :stopping
          end
        end

        # Stop all running, paused, or starting servers.
        def self.stop_all
          where(:state.in => %i[running paused starting]).each { |server| server.stop! }
        end

        # Pause all running servers.
        def self.pause_all
          running.each { |server| server.pause! }
        end

        # Resume all paused servers.
        def self.resume_all
          paused.each { |server| server.resume! }
        end
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'active_support/concern'
2
+
3
module RocketJob
  # Mix-in to publish and subscribe to events.
  #
  # Example:
  #   class MySubscriber
  #     include RocketJob::Subscriber
  #
  #     def hello
  #       logger.info "Hello Action Received"
  #     end
  #
  #     def show(message:)
  #       logger.info "Received: #{message}"
  #     end
  #
  #     # If `message` is not supplied it defaults to "Hello World"
  #     def show_default(message: "Hello World")
  #       logger.info "Received: #{message}"
  #     end
  #   end
  #
  #   MySubscriber.subscribe
  module Subscriber
    extend ActiveSupport::Concern

    @test_mode = false

    class << self
      # Test Mode
      #   Bypasses publishing the event and calls the subscribers directly.
      def test_mode!
        @test_mode = true
      end

      # Whether test mode has been turned on.
      def test_mode?
        @test_mode
      end
    end

    included do
      include SemanticLogger::Loggable

      # Name of the event published and subscribed to.
      # Defaults to the name of the including class.
      class_attribute :event_name, instance_accessor: false
      self.event_name = name

      # Publishes an event that asks subscribers to run `action` with `parameters`.
      #
      # Raises ArgumentError when `action` is not a public instance method of this class.
      # Raises NotImplementedError when this subscriber listens to all events.
      #
      # In test mode the event is handed straight to Event.process_event,
      # otherwise it is saved.
      def self.publish(action, **parameters)
        unless public_method_defined?(action)
          raise(ArgumentError, "Invalid action: #{action}")
        end

        if event_name == Event::ALL_EVENTS
          raise(NotImplementedError, "Cannot publish to an all events subscriber: event_name='#{Event::ALL_EVENTS}'")
        end

        event = Event.new(name: event_name, action: action, parameters: parameters)
        if Subscriber.test_mode?
          Event.process_event(event)
        else
          event.save!
        end
      end

      # Creates an instance of this subscriber from `args` and registers it
      # with Event.subscribe, passing the block along.
      def self.subscribe(*args, &block)
        Event.subscribe(new(*args), &block)
      end
    end

    # Calls the public method named by `action` on this subscriber.
    #
    # - An action that is not a public method is logged and ignored.
    # - Methods that take no arguments, or calls without parameters, are invoked bare.
    # - Otherwise `parameters` are passed as keyword arguments with symbolized keys.
    #
    # Any StandardError, including one raised by the action, is logged and not re-raised.
    def process_action(action, parameters)
      unless public_methods.include?(action)
        logger.warn("Ignoring unknown action: #{action}")
        return
      end

      if method(action).arity.zero? || parameters.nil?
        public_send(action)
      else
        public_send(action, **parameters.symbolize_keys)
      end
    rescue StandardError => exc
      logger.error('Exception calling subscriber. Resuming..', exc)
    end

    # Always raises NotImplementedError in this mix-in.
    def process_event(name, action, parameters)
      raise NotImplementedError
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require 'socket'
2
+
3
module RocketJob
  module Subscribers
    # Subscriber that changes log levels and dumps thread backtraces on request.
    class Logger
      include RocketJob::Subscriber

      class << self
        # Overrides the host name that events are matched against.
        attr_writer :host_name

        # Host name that events are matched against.
        # Looked up with Socket.gethostname on first use unless one was assigned.
        def host_name
          @host_name ||= Socket.gethostname
        end
      end

      # Change the log level
      #
      # Examples:
      #   # Change the global log level to :trace on all servers.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace)
      #
      #   # Change the global log level to :trace on one server.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com')
      #
      #   # Change the global log level to :trace for a specific process id.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com', pid: 34567)
      #
      #   # Change the log level for a specific class to :trace.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, class_name: 'RocketJob::Supervisor')
      def set(level: :info, class_name: nil, host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        unless class_name
          SemanticLogger.default_level = level
          return logger.info("Changed global log level to #{level}")
        end

        class_name.constantize.logger.level = level
        logger.info("Changed log level to #{level} for #{class_name}")
      end

      # Dump the backtraces of all threads, other than the current one, to the log.
      #
      # Examples:
      #   # Thread dump on all servers:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump)
      #
      #   # Thread dump on one server:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com')
      #
      #   # Thread dump for a specific process id:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com', pid: 34567)
      def thread_dump(host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        Thread.list.each do |thread|
          logger.backtrace(thread: thread) unless thread == Thread.current
        end
      end

      private

      # Whether an event addressed to `host_name` and / or `pid` targets this process.
      # A criterion that is not supplied matches every process.
      def for_me?(host_name, pid)
        (!host_name || host_name == self.class.host_name) && (!pid || pid == Process.pid)
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module RocketJob
  module Subscribers
    # Subscriber that applies server level events to the supervisor it was created with.
    #
    # Every action takes an optional `server_id`. When omitted the action applies
    # to every server, otherwise only to the server whose id matches.
    class Server
      include RocketJob::Subscriber

      attr_reader :supervisor

      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Stops the worker pool, waits up to `wait_timeout` for it to finish,
      # kills it, and then shuts the supervisor down.
      def kill(server_id: nil, wait_timeout: 3)
        return unless my_server?(server_id)

        supervisor.synchronize do
          supervisor.worker_pool.stop
          supervisor.worker_pool.join(wait_timeout)
          supervisor.worker_pool.kill
        end

        Supervisor.shutdown!
        logger.info('Killed')
      end

      # Pauses this server when its current state permits it, then raises a supervisor event.
      def pause(server_id: nil)
        return unless my_server?(server_id)

        supervisor.synchronize do
          supervisor.server.pause! if supervisor.server.may_pause?
        end
        Supervisor.event!
        logger.info('Paused')
      end

      # Raises a supervisor event without changing the server state.
      def refresh(server_id: nil)
        return unless my_server?(server_id)

        Supervisor.event!
        logger.info('Refreshed')
      end

      # Resumes this server when its current state permits it, then raises a supervisor event.
      def resume(server_id: nil)
        return unless my_server?(server_id)

        supervisor.synchronize do
          supervisor.server.resume! if supervisor.server.may_resume?
        end
        Supervisor.event!
        logger.info('Resumed')
      end

      # Shuts the supervisor down.
      def stop(server_id: nil)
        return unless my_server?(server_id)

        Supervisor.shutdown!
        logger.info('Shutdown')
      end

      # Logs the backtraces of the worker pool.
      def thread_dump(server_id: nil)
        return unless my_server?(server_id)

        logger.info('Thread dump')
        supervisor.worker_pool.log_backtraces
      end

      private

      # An event without a server id targets every server.
      def my_server?(server_id)
        server_id.nil? || server_id == supervisor.server.id
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module RocketJob
  module Subscribers
    # Subscriber that applies events to a single worker of the supervisor it was created with.
    #
    # Every action requires both the `server_id` and the `worker_id`. It is ignored
    # unless the server id matches this server and the worker is found alive in its pool.
    class Worker
      include RocketJob::Subscriber

      attr_reader :supervisor

      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Asks the worker to shut down, waits up to `wait_timeout` for it, then kills it.
      def kill(server_id:, worker_id:, wait_timeout: 3)
        with_live_worker(server_id, worker_id) do |worker|
          worker.shutdown!
          worker.join(wait_timeout)
          worker.kill

          logger.info('Killed')
        end
      end

      # Asks the worker to shut down.
      def stop(server_id:, worker_id:)
        with_live_worker(server_id, worker_id) do |worker|
          worker.shutdown!
          logger.info("Stopped Worker: #{worker_id}")
        end
      end

      # Logs the backtrace of the worker's thread, if it has one and is still alive.
      def thread_dump(server_id:, worker_id:)
        with_live_worker(server_id, worker_id) do |worker|
          logger.info("Thread dump Worker: #{worker_id}")
          logger.backtrace(thread: worker.thread) if worker.thread && worker.alive?
        end
      end

      private

      # Yields the targeted worker when the event is for this server and the
      # worker is alive; otherwise returns nil without yielding.
      def with_live_worker(server_id, worker_id)
        return unless my_server?(server_id)

        worker = locate_worker(worker_id)
        yield worker if worker
      end

      def my_server?(server_id)
        server_id == supervisor.server.id
      end

      # Returns the worker with the supplied id when it exists in the pool and
      # is alive, otherwise nil.
      def locate_worker(worker_id)
        return unless worker_id

        candidate = supervisor.worker_pool.find(worker_id)
        candidate if candidate&.alive?
      end
    end
  end
end