rocketjob 4.0.0 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,138 @@
1
+ require 'yaml'
2
+ require 'active_support/concern'
3
+
4
module RocketJob
  class Server
    # Persistence concern for RocketJob::Server.
    #
    # Declares the document fields, the unique name index, validations and the
    # destroy callback, and supplies the zombie detection / heartbeat helpers.
    module Model
      extend ActiveSupport::Concern

      included do
        store_in collection: 'rocket_job.servers'

        # Unique name of this server instance.
        # Default: `host name:PID`
        # On re-start the unique name is used to re-queue any jobs that were
        # being processed when the server unexpectedly terminated, if any.
        field :name, type: String, default: -> { "#{SemanticLogger.host}:#{Process.pid}" }

        # Maximum number of workers this server should start.
        # Defaults to the value held by RocketJob::Config; setting it overrides that default.
        field :max_workers, type: Integer, default: -> { Config.instance.max_workers }

        # Time at which this server process was started.
        field :started_at, type: Time

        # YAML serialized filter that controls which job classes this server can process.
        # Read and written through #filter and #filter=.
        field :yaml_filter, type: String

        # Heartbeat information for this server.
        embeds_one :heartbeat, class_name: 'RocketJob::Heartbeat'

        # Current state.
        # Internal use only. Do not set this field directly.
        field :state, type: Symbol, default: :starting

        index({name: 1}, background: true, unique: true, drop_dups: true)

        validates_presence_of :state, :name, :max_workers

        # Requeue any jobs being worked by this server when it is destroyed.
        before_destroy :requeue_jobs

        # Returns [Hash<Symbol, Integer>] with the number of servers in each state.
        # States that no server is currently in are omitted from the hash.
        #
        # Example with servers in several states:
        #   RocketJob::Server.counts_by_state
        #   # => {
        #   #      :starting => 1,
        #   #      :running  => 37,
        #   #      :paused   => 3,
        #   #      :stopping => 1
        #   #    }
        #
        # Example with no servers:
        #   RocketJob::Server.counts_by_state
        #   # => {}
        def self.counts_by_state
          group_by_state = {'$group' => {_id: '$state', count: {'$sum' => 1}}}
          {}.tap do |counts|
            collection.aggregate([group_by_state]).each do |row|
              counts[row['_id'].to_sym] = row['count']
            end
          end
        end

        # Destroys every server for which #zombie? is true. Destroying a server
        # triggers the before_destroy callback that requeues its jobs.
        #
        # Returns [Integer] the number of servers destroyed.
        def self.destroy_zombies
          destroyed = 0
          each do |server|
            if server.zombie?
              logger.warn "Destroying zombie server #{server.name}, and requeueing its jobs"
              server.destroy
              destroyed += 1
            end
          end
          destroyed
        end

        # Scope for all zombie servers.
        #
        # A server matches when it is stopping, running or paused and its
        # heartbeat timestamp is either absent or at least
        # `missed` heartbeat intervals old.
        def self.zombies(missed = 4)
          dead_seconds = Config.instance.heartbeat_seconds * missed
          cutoff       = Time.now - dead_seconds
          missing_beat = {'heartbeat.updated_at' => {'$exists' => false}}
          stale_beat   = {'heartbeat.updated_at' => {'$lte' => cutoff}}
          where(:state.in => %i[stopping running paused], '$or' => [missing_beat, stale_beat])
        end
      end

      # Where clause filter to apply to workers looking for jobs.
      #
      # Returns the object deserialized from `yaml_filter`, or nil when no filter is stored.
      #
      # NOTE(review): YAML.load behaves differently across Psych versions (it can
      # instantiate arbitrary objects on older ones and is restricted on newer ones).
      # Presumably `yaml_filter` is only ever written by #filter= -- confirm before
      # changing the loader.
      def filter
        return unless yaml_filter

        YAML.load(yaml_filter)
      end

      # Stores the supplied filter as YAML; nil clears the stored filter.
      def filter=(hash)
        self.yaml_filter = hash&.to_yaml
      end

      # Returns [true|false] whether this server has missed at least the last
      # `missed` heartbeats (4 by default).
      #
      # Only running, stopping and paused servers can be zombies. A server in one
      # of those states without a heartbeat timestamp is always a zombie.
      #
      # Possible causes for a server to miss its heartbeats:
      # - The server process has died
      # - The server process is "hanging"
      # - The server is no longer able to communicate with the MongoDB Server
      def zombie?(missed = 4)
        return false unless running? || stopping? || paused?

        last_beat = heartbeat&.updated_at
        return true if last_beat.nil?

        dead_seconds = Config.instance.heartbeat_seconds * missed
        Time.now - last_beat >= dead_seconds
      end

      # Updates the heartbeat timestamp and worker count through `find_and_update`,
      # wrapped in SemanticLogger.silence(:info).
      #
      # Returns whatever `find_and_update` returns; per the original description
      # that is a refreshed server instance (`find_and_update` is defined elsewhere).
      def refresh(worker_count)
        SemanticLogger.silence(:info) do
          find_and_update('heartbeat.updated_at' => Time.now, 'heartbeat.workers' => worker_count)
        end
      end

      private

      # before_destroy callback: requeue any jobs assigned to this server, looked up by its name.
      def requeue_jobs
        RocketJob::Job.requeue_dead_server(name)
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ require 'active_support/concern'
2
+
3
module RocketJob
  class Server
    # State machine for servers.
    module StateMachine
      extend ActiveSupport::Concern

      included do
        # States
        #   :starting -> :running <-> :paused
        #   :starting, :running and :paused can all move to :stopping
        aasm column: :state, whiny_persistence: true do
          state :starting, initial: true
          state :running
          state :paused
          state :stopping

          # The server has finished starting: record the start time and
          # build its first heartbeat with zero workers.
          event :started do
            transitions from: :starting, to: :running
            before do
              self.started_at = Time.now
              build_heartbeat(updated_at: Time.now, workers: 0)
            end
          end

          event :pause do
            transitions from: :running, to: :paused
          end

          event :resume do
            transitions from: :paused, to: :running
          end

          # A server can be stopped from any state other than :stopping itself.
          event :stop do
            transitions from: [:running, :paused, :starting], to: :stopping
          end
        end

        # Stop all running, paused, or starting servers
        def self.stop_all
          where(:state.in => [:running, :paused, :starting]).each { |server| server.stop! }
        end

        # Pause all running servers
        def self.pause_all
          running.each { |server| server.pause! }
        end

        # Resume all paused servers
        def self.resume_all
          paused.each { |server| server.resume! }
        end
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ require 'active_support/concern'
2
+
3
module RocketJob
  # Mix-in to publish and subscribe to events.
  #
  # Every public instance method of the including class is an action that can
  # be published to; keyword arguments of the method are the event parameters.
  #
  # Example:
  #   class MySubscriber
  #     include RocketJob::Subscriber
  #
  #     def hello
  #       logger.info "Hello Action Received"
  #     end
  #
  #     def show(message:)
  #       logger.info "Received: #{message}"
  #     end
  #
  #     # If `message` is not supplied it defaults to "Hello World"
  #     def show_default(message: "Hello World")
  #       logger.info "Received: #{message}"
  #     end
  #   end
  #
  #   MySubscriber.subscribe
  module Subscriber
    extend ActiveSupport::Concern

    @test_mode = false

    # Test Mode
    # Bypasses publishing the event and calls the subscribers directly
    def self.test_mode!
      @test_mode = true
    end

    # Whether test mode has been switched on with `test_mode!`.
    def self.test_mode?
      @test_mode
    end

    included do
      include SemanticLogger::Loggable

      # Name of the event published and subscribed to.
      # Defaults to the name of the class that includes this module.
      class_attribute :event_name, instance_accessor: false
      self.event_name = name

      # Publish `action` with the supplied keyword `parameters`.
      #
      # Raises ArgumentError when `action` is not a public instance method of this class.
      # Raises NotImplementedError when this class subscribes to all events.
      #
      # In test mode the event is handed straight to Event.process_event,
      # otherwise it is saved.
      def self.publish(action, **parameters)
        unless public_method_defined?(action)
          raise(ArgumentError, "Invalid action: #{action}")
        end

        all_events = Event::ALL_EVENTS
        if event_name == all_events
          raise(NotImplementedError, "Cannot publish to an all events subscriber: event_name='#{all_events}'")
        end

        event = Event.new(name: event_name, action: action, parameters: parameters)
        if Subscriber.test_mode?
          Event.process_event(event)
        else
          event.save!
        end
      end

      # Create an instance of this subscriber from `args` and register it with Event.
      def self.subscribe(*args, &block)
        Event.subscribe(new(*args), &block)
      end
    end

    # Call the public method named `action` on this subscriber.
    #
    # `parameters` are symbolized and passed as keyword arguments, unless the
    # method takes no arguments or `parameters` is nil.
    # Unknown actions are logged and ignored. Any StandardError raised while
    # calling the action is logged rather than propagated.
    def process_action(action, parameters)
      if public_methods.include?(action)
        if method(action).arity.zero? || parameters.nil?
          public_send(action)
        else
          public_send(action, **parameters.symbolize_keys)
        end
      else
        logger.warn("Ignoring unknown action: #{action}")
        nil
      end
    rescue StandardError => exc
      logger.error('Exception calling subscriber. Resuming..', exc)
    end

    # Placeholder: always raises NotImplementedError.
    def process_event(name, action, parameters)
      raise NotImplementedError
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require 'socket'
2
+
3
module RocketJob
  module Subscribers
    # Subscriber that changes log levels and dumps thread backtraces on request.
    class Logger
      include RocketJob::Subscriber

      class << self
        # Override the host name that targeted events are matched against.
        attr_writer :host_name

        # Host name that targeted events are matched against.
        # Defaults to, and caches, Socket.gethostname.
        def host_name
          @host_name ||= Socket.gethostname
        end
      end

      # Change the log level
      #
      # Examples:
      #   # Change the global log level to :trace on all servers.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace)
      #
      #   # Change the global log level to :trace on one server.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com')
      #
      #   # Change the global log level to :trace for a specific process id.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, host_name: 'server1.company.com', pid: 34567)
      #
      #   # Change the log level for a specific class to :trace.
      #   RocketJob::Subscribers::Logger.publish(:set, level: :trace, class_name: 'RocketJob::Supervisor')
      def set(level: :info, class_name: nil, host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        if class_name
          target = class_name.constantize
          target.logger.level = level
          logger.info "Changed log level to #{level} for #{class_name}"
        else
          SemanticLogger.default_level = level
          logger.info "Changed global log level to #{level}"
        end
      end

      # Dump the backtraces of all threads, other than the one handling this event, to the log.
      #
      # Examples:
      #   # Thread dump on all servers:
      #   RocketJob::Subscribers::Logger.publish(:thread_dump)
      #
      #   # Thread dump on one server.
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com')
      #
      #   # Thread dump for a specific process id.
      #   RocketJob::Subscribers::Logger.publish(:thread_dump, host_name: 'server1.company.com', pid: 34567)
      def thread_dump(host_name: nil, pid: nil)
        return unless for_me?(host_name, pid)

        current = Thread.current
        Thread.list.each do |thread|
          logger.backtrace(thread: thread) unless thread == current
        end
      end

      private

      # Whether an event targeted at `host_name` and / or `pid` applies to this process.
      # A target that is not supplied matches every host / process.
      def for_me?(host_name, pid)
        (!host_name || host_name == self.class.host_name) && (!pid || pid == Process.pid)
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module RocketJob
  module Subscribers
    # Subscriber for server level actions.
    #
    # Every action takes an optional `server_id`: when it is nil the action
    # applies to this server, otherwise only when it equals this server's id.
    class Server
      include RocketJob::Subscriber

      attr_reader :supervisor

      # supervisor: the supervisor whose server and worker pool the actions operate on.
      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Stop the worker pool, wait on it for up to `wait_timeout`, kill it,
      # and then shut the supervisor down.
      def kill(server_id: nil, wait_timeout: 3)
        return unless my_server?(server_id)

        supervisor.synchronize do
          supervisor.worker_pool.stop
          supervisor.worker_pool.join(wait_timeout)
          supervisor.worker_pool.kill
        end

        Supervisor.shutdown!
        logger.info 'Killed'
      end

      # Pause this server, when its state allows it, and signal the supervisor.
      def pause(server_id: nil)
        return unless my_server?(server_id)

        supervisor.synchronize do
          supervisor.server.pause! if supervisor.server.may_pause?
        end
        Supervisor.event!
        logger.info 'Paused'
      end

      # Signal the supervisor without changing the server's state.
      def refresh(server_id: nil)
        return unless my_server?(server_id)

        Supervisor.event!
        logger.info 'Refreshed'
      end

      # Resume this server, when its state allows it, and signal the supervisor.
      def resume(server_id: nil)
        return unless my_server?(server_id)

        supervisor.synchronize do
          supervisor.server.resume! if supervisor.server.may_resume?
        end
        Supervisor.event!
        logger.info 'Resumed'
      end

      # Shut the supervisor down.
      def stop(server_id: nil)
        return unless my_server?(server_id)

        Supervisor.shutdown!
        logger.info 'Shutdown'
      end

      # Log the backtraces of the worker pool.
      def thread_dump(server_id: nil)
        return unless my_server?(server_id)

        logger.info 'Thread dump'
        supervisor.worker_pool.log_backtraces
      end

      private

      # An action without a server id applies to every server.
      def my_server?(server_id)
        server_id.nil? || server_id == supervisor.server.id
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module RocketJob
  module Subscribers
    # Subscriber for actions aimed at a single worker on a single server.
    #
    # Every action requires both `server_id` and `worker_id`, and is silently
    # ignored unless `server_id` is this server's id and the worker is found
    # in the supervisor's worker pool and is alive.
    class Worker
      include RocketJob::Subscriber

      attr_reader :supervisor

      # supervisor: the supervisor whose server and worker pool the actions operate on.
      def initialize(supervisor)
        @supervisor = supervisor
      end

      # Shut the worker down, wait on it for up to `wait_timeout`, then kill it.
      def kill(server_id:, worker_id:, wait_timeout: 3)
        worker = targeted_worker(server_id, worker_id)
        return unless worker

        worker.shutdown!
        worker.join(wait_timeout)
        worker.kill

        # Identify the worker in the log, as the other actions in this class do.
        logger.info "Killed Worker: #{worker_id}"
      end

      # Ask the worker to shut down.
      def stop(server_id:, worker_id:)
        worker = targeted_worker(server_id, worker_id)
        return unless worker

        worker.shutdown!
        logger.info "Stopped Worker: #{worker_id}"
      end

      # Log the backtrace of the worker's thread.
      def thread_dump(server_id:, worker_id:)
        worker = targeted_worker(server_id, worker_id)
        return unless worker

        logger.info "Thread dump Worker: #{worker_id}"
        logger.backtrace(thread: worker.thread) if worker.thread && worker.alive?
      end

      private

      # The live worker with `worker_id`, but only when the event is addressed to this server.
      # Returns nil otherwise.
      def targeted_worker(server_id, worker_id)
        locate_worker(worker_id) if my_server?(server_id)
      end

      # Whether the event is addressed to this server.
      def my_server?(server_id)
        server_id == supervisor.server.id
      end

      # The worker with `worker_id` from the worker pool, or nil when no id was
      # supplied, the worker is not found, or it is no longer alive.
      def locate_worker(worker_id)
        return unless worker_id

        worker = supervisor.worker_pool.find(worker_id)
        return unless worker&.alive?

        worker
      end
    end
  end
end