inst-jobs 2.3.3 → 2.4.0
- checksums.yaml +4 -4
- data/db/migrate/20101216224513_create_delayed_jobs.rb +9 -7
- data/db/migrate/20110531144916_cleanup_delayed_jobs_indexes.rb +8 -13
- data/db/migrate/20110610213249_optimize_delayed_jobs.rb +8 -8
- data/db/migrate/20110831210257_add_delayed_jobs_next_in_strand.rb +25 -25
- data/db/migrate/20120510004759_delayed_jobs_delete_trigger_lock_for_update.rb +4 -8
- data/db/migrate/20120531150712_drop_psql_jobs_pop_fn.rb +1 -3
- data/db/migrate/20120607164022_delayed_jobs_use_advisory_locks.rb +11 -15
- data/db/migrate/20120607181141_index_jobs_on_locked_by.rb +1 -1
- data/db/migrate/20120608191051_add_jobs_run_at_index.rb +2 -2
- data/db/migrate/20120927184213_change_delayed_jobs_handler_to_text.rb +1 -1
- data/db/migrate/20140505215510_copy_failed_jobs_original_id.rb +2 -3
- data/db/migrate/20150807133223_add_max_concurrent_to_jobs.rb +9 -13
- data/db/migrate/20151210162949_improve_max_concurrent.rb +4 -8
- data/db/migrate/20161206323555_add_back_default_string_limits_jobs.rb +3 -2
- data/db/migrate/20181217155351_speed_up_max_concurrent_triggers.rb +13 -17
- data/db/migrate/20200330230722_add_id_to_get_delayed_jobs_index.rb +8 -8
- data/db/migrate/20200824222232_speed_up_max_concurrent_delete_trigger.rb +72 -77
- data/db/migrate/20200825011002_add_strand_order_override.rb +93 -97
- data/db/migrate/20210809145804_add_n_strand_index.rb +3 -3
- data/db/migrate/20210812210128_add_singleton_column.rb +203 -0
- data/exe/inst_jobs +3 -2
- data/lib/delayed/backend/active_record.rb +182 -148
- data/lib/delayed/backend/base.rb +79 -74
- data/lib/delayed/batch.rb +11 -9
- data/lib/delayed/cli.rb +98 -84
- data/lib/delayed/core_ext/kernel.rb +4 -2
- data/lib/delayed/daemon.rb +70 -74
- data/lib/delayed/job_tracking.rb +26 -25
- data/lib/delayed/lifecycle.rb +27 -24
- data/lib/delayed/log_tailer.rb +17 -17
- data/lib/delayed/logging.rb +13 -16
- data/lib/delayed/message_sending.rb +42 -51
- data/lib/delayed/performable_method.rb +5 -7
- data/lib/delayed/periodic.rb +66 -65
- data/lib/delayed/plugin.rb +2 -4
- data/lib/delayed/pool.rb +198 -193
- data/lib/delayed/server/helpers.rb +6 -6
- data/lib/delayed/server.rb +51 -54
- data/lib/delayed/settings.rb +93 -81
- data/lib/delayed/testing.rb +21 -22
- data/lib/delayed/version.rb +1 -1
- data/lib/delayed/work_queue/in_process.rb +21 -18
- data/lib/delayed/work_queue/parent_process/client.rb +54 -55
- data/lib/delayed/work_queue/parent_process/server.rb +215 -209
- data/lib/delayed/work_queue/parent_process.rb +52 -53
- data/lib/delayed/worker/consul_health_check.rb +21 -19
- data/lib/delayed/worker/health_check.rb +21 -12
- data/lib/delayed/worker/null_health_check.rb +3 -1
- data/lib/delayed/worker/process_helper.rb +8 -9
- data/lib/delayed/worker.rb +271 -265
- data/lib/delayed/yaml_extensions.rb +12 -10
- data/lib/delayed_job.rb +37 -38
- data/lib/inst-jobs.rb +1 -1
- data/spec/active_record_job_spec.rb +128 -135
- data/spec/delayed/cli_spec.rb +7 -7
- data/spec/delayed/daemon_spec.rb +8 -8
- data/spec/delayed/message_sending_spec.rb +8 -9
- data/spec/delayed/periodic_spec.rb +13 -12
- data/spec/delayed/server_spec.rb +38 -38
- data/spec/delayed/settings_spec.rb +26 -25
- data/spec/delayed/work_queue/in_process_spec.rb +7 -7
- data/spec/delayed/work_queue/parent_process/client_spec.rb +15 -11
- data/spec/delayed/work_queue/parent_process/server_spec.rb +43 -40
- data/spec/delayed/work_queue/parent_process_spec.rb +21 -21
- data/spec/delayed/worker/consul_health_check_spec.rb +22 -22
- data/spec/delayed/worker/health_check_spec.rb +51 -49
- data/spec/delayed/worker_spec.rb +28 -25
- data/spec/gemfiles/52.gemfile +5 -3
- data/spec/gemfiles/52.gemfile.lock +240 -0
- data/spec/gemfiles/60.gemfile +5 -3
- data/spec/gemfiles/60.gemfile.lock +1 -1
- data/spec/gemfiles/61.gemfile +5 -3
- data/spec/sample_jobs.rb +45 -15
- data/spec/shared/delayed_batch.rb +74 -67
- data/spec/shared/delayed_method.rb +143 -102
- data/spec/shared/performable_method.rb +39 -38
- data/spec/shared/shared_backend.rb +517 -441
- data/spec/shared/testing.rb +14 -14
- data/spec/shared/worker.rb +155 -147
- data/spec/shared_jobs_specs.rb +13 -13
- data/spec/spec_helper.rb +43 -40
- metadata +74 -56
- data/lib/delayed/backend/redis/bulk_update.lua +0 -50
- data/lib/delayed/backend/redis/destroy_job.lua +0 -2
- data/lib/delayed/backend/redis/enqueue.lua +0 -29
- data/lib/delayed/backend/redis/fail_job.lua +0 -5
- data/lib/delayed/backend/redis/find_available.lua +0 -3
- data/lib/delayed/backend/redis/functions.rb +0 -59
- data/lib/delayed/backend/redis/get_and_lock_next_available.lua +0 -17
- data/lib/delayed/backend/redis/includes/jobs_common.lua +0 -203
- data/lib/delayed/backend/redis/job.rb +0 -528
- data/lib/delayed/backend/redis/set_running.lua +0 -5
- data/lib/delayed/backend/redis/tickle_strand.lua +0 -2
- data/spec/redis_job_spec.rb +0 -148
data/lib/delayed/work_queue/parent_process.rb
CHANGED
@@ -1,69 +1,68 @@
 # frozen_string_literal: true

-require
-require
-require
+require "pathname"
+require "socket"
+require "timeout"

-require_relative
-require_relative
+require_relative "parent_process/client"
+require_relative "parent_process/server"

 module Delayed
+  module WorkQueue
+    # ParentProcess is a WorkQueue implementation that spawns a separate worker
+    # process for querying the queue. Each Worker child process sends requests to
+    # the ParentProcess via IPC, and receives responses. This centralized queue
+    # querying cuts down on db queries and lock contention, and allows the
+    # possibility for other centralized logic such as notifications when all workers
+    # are idle.
+    #
+    # The IPC implementation uses Unix stream sockets and Ruby's built-in Marshal
+    # functionality. The ParentProcess creates a Unix socket on the filesystem in
+    # the tmp directory, so that if a worker process dies and is restarted it can
+    # reconnect to the socket.
+    #
+    # While Unix and IP sockets are API compatible, we take a lot of shortcuts
+    # because we know it's just a local Unix socket. If we ever wanted to swap this
+    # out for a TCP/IP socket and have the WorkQueue running on another host, we'd
+    # want to be a lot more robust about partial reads/writes and timeouts.
+    class ParentProcess
+      class ProtocolError < RuntimeError
+      end

+      attr_reader :server_address

+      DEFAULT_SOCKET_NAME = "inst-jobs.sock"
+      private_constant :DEFAULT_SOCKET_NAME

+      def initialize(config = Settings.parent_process)
+        @config = config
+        @server_address = generate_socket_path(config["server_address"])
+      end

+      def server(parent_pid: nil)
+        # The unix_server_socket method takes care of cleaning up any existing
+        # socket for us if the work queue process dies and is restarted.
+        listen_socket = Socket.unix_server_socket(@server_address)
+        Server.new(listen_socket, parent_pid: parent_pid, config: @config)
+      end

+      def client
+        Client.new(Addrinfo.unix(@server_address), config: @config)
+      end

+      private

+      def generate_socket_path(supplied_path)
+        pathname = Pathname.new(supplied_path)

+        if pathname.absolute? && pathname.directory?
+          pathname.join(DEFAULT_SOCKET_NAME).to_s
+        elsif pathname.absolute?
+          supplied_path
+        else
+          generate_socket_path(Settings.expand_rails_path(supplied_path))
+        end
+      end
     end
   end
 end
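For context, a minimal sketch of how the two halves of the reworked ParentProcess API fit together. Only the constructor, `#server(parent_pid:)`, and `#client` come from the diff above; which process calls which half, and the surrounding pool/worker wiring, are assumptions.

    # Sketch only: both halves derive the socket path from
    # Settings.parent_process["server_address"], as shown in the diff above.
    work_queue = Delayed::WorkQueue::ParentProcess.new

    # In the dedicated work-queue process: bind the Unix socket and build the Server.
    server = work_queue.server(parent_pid: Process.pid)

    # In each worker process: a Client pointed at the same socket. Delayed::Worker
    # calls get_and_lock_next_available on whichever work queue it was given.
    client = work_queue.client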
data/lib/delayed/worker/consul_health_check.rb
CHANGED
@@ -1,23 +1,23 @@
 # frozen_string_literal: true

-require_relative
-require_relative
-require
+require_relative "health_check"
+require_relative "process_helper"
+require "socket"

 module Delayed
   class Worker
     class ConsulHealthCheck < HealthCheck
       self.type_name = :consul

-      CONSUL_CONFIG_KEYS = %w
-      DEFAULT_SERVICE_NAME =
+      CONSUL_CONFIG_KEYS = %w[url acl_token].map(&:freeze).freeze
+      DEFAULT_SERVICE_NAME = "inst-jobs_worker"
       attr_reader :service_client, :health_client

       def initialize(*, **)
         super
         # Because we don't want the consul client to be a hard dependency we're
         # only requiring it once it's absolutely needed
-        require
+        require "diplomat"

         if config.keys.any? { |k| CONSUL_CONFIG_KEYS.include?(k) }
           consul_config = Diplomat::Configuration.new.tap do |conf|
@@ -35,10 +35,10 @@ module Delayed

       def start
         @service_client.register({
+                                   id: worker_name,
+                                   name: service_name,
+                                   check: check_attributes
+                                 })
       end

       def stop
@@ -48,27 +48,28 @@ module Delayed
       def live_workers
         # Filter out critical workers (probably nodes failing their serf health check)
         live_nodes = @health_client.service(service_name, {
+                                              filter: "not Checks.Status == critical"
+                                            })

-        live_nodes.map { |n| n.Service[
+        live_nodes.map { |n| n.Service["ID"] }
       end

       private

       def check_attributes
         {
-          args: [
-          status:
-          interval: @config.fetch(:check_interval,
-          deregister_critical_service_after: @config.fetch(:deregister_service_delay,
+          args: ["bash", "-c", check_script],
+          status: "passing",
+          interval: @config.fetch(:check_interval, "5m"),
+          deregister_critical_service_after: @config.fetch(:deregister_service_delay, "10m")
         }.tap do |h|
-          h[:docker_container_id] = docker_container_id if @config[
+          h[:docker_container_id] = docker_container_id if @config["docker"]
         end
       end

       def check_script
         return @check_script if @check_script
+
         mtime = ProcessHelper.mtime(Process.pid)
         @check_script = ProcessHelper.check_script(Process.pid, mtime)
       end
@@ -77,12 +78,13 @@ module Delayed
       # cgroups for part of its magic and also uses the container id as the cgroup name
       def docker_container_id
         return @docker_container_id if @docker_container_id
+
         content = File.read("/proc/1/cgroup").split("\n")
         @docker_container_id = content.last.split("/").last
       end

       def service_name
-        @service_name ||= @config.fetch(
+        @service_name ||= @config.fetch("service_name", DEFAULT_SERVICE_NAME)
       end
     end
   end
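A hedged sketch of driving the Consul check directly, using only the keys and methods visible in this diff; the concrete values are illustrative, not defaults beyond the ones shown, and in practice the config normally arrives via the worker settings rather than being built by hand.

    # Sketch, not canonical configuration; keys mirror the ones read above
    # ("service_name", "docker", :check_interval, :deregister_service_delay).
    check = Delayed::Worker::HealthCheck.build(
      type: :consul,
      worker_name: Socket.gethostname,
      config: {
        "service_name" => "inst-jobs_worker",
        "docker" => true, # adds the docker_container_id read from /proc/1/cgroup
        check_interval: "5m",
        deregister_service_delay: "10m"
      }
    )
    check.start        # registers the service and its script check with the local agent
    check.live_workers # => service IDs whose checks are not critical
    check.stop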
data/lib/delayed/worker/health_check.rb
CHANGED
@@ -11,17 +11,20 @@ module Delayed

       def inherited(subclass)
         @subclasses << subclass
+        super
       end

       def build(type:, worker_name:, config: {})
         type = type.to_sym
         klass = @subclasses.find { |sc| sc.type_name == type }
         raise ArgumentError, "Unable to build a HealthCheck for type #{type}" unless klass
+
         klass.new(worker_name: worker_name, config: config)
       end

       def reschedule_abandoned_jobs
         return if Settings.worker_health_check_type == :none
+
         Delayed::Job.transaction do
           # this action is a special case, and SHOULD NOT be a periodic job
           # because if it gets wiped out suddenly during execution
@@ -32,27 +35,32 @@ module Delayed
           # operation, the transaction will end, releasing the advisory lock).
           result = attempt_advisory_lock
           return unless result
+
           checker = Worker::HealthCheck.build(
             type: Settings.worker_health_check_type,
             config: Settings.worker_health_check_config,
-            worker_name:
+            worker_name: "cleanup-crew"
           )
           live_workers = checker.live_workers

           Delayed::Job.running_jobs.each do |job|
             # prefetched jobs have their own way of automatically unlocking themselves
             next if job.locked_by.start_with?("prefetch:")
+
+            next if live_workers.include?(job.locked_by)
+
+            begin
+              Delayed::Job.transaction do
+                # double check that the job is still there. locked_by will immediately be reset
+                # to nil in this transaction by Job#reschedule
+                next unless Delayed::Job.where(id: job,
+                                               locked_by: job.locked_by)
+                            .update_all(locked_by: "abandoned job cleanup") == 1
+
+                job.reschedule
               end
+            rescue
+              ::Rails.logger.error "Failure rescheduling abandoned job #{job.id} #{$!.inspect}"
             end
           end
         end
@@ -61,7 +69,8 @@ module Delayed
       def attempt_advisory_lock
         lock_name = "Delayed::Worker::HealthCheck#reschedule_abandoned_jobs"
         conn = ActiveRecord::Base.connection
+        fn_name = conn.quote_table_name("half_md5_as_bigint")
+        conn.select_value("SELECT pg_try_advisory_xact_lock(#{fn_name}('#{lock_name}'));")
       end
     end

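A minimal sketch of invoking the cleanup shown above. Per the comments in the diff it is deliberately not meant to run as a periodic job, and the advisory lock makes concurrent callers effectively no-ops; it assumes the `half_md5_as_bigint` Postgres function from the gem's migrations is installed.

    # Callable from a console or a pool/supervisor process. Returns early unless
    # this process wins pg_try_advisory_xact_lock, then reschedules jobs whose
    # locked_by worker no longer appears in the health check's live_workers list.
    Delayed::Worker::HealthCheck.reschedule_abandoned_jobs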
data/lib/delayed/worker/process_helper.rb
CHANGED
@@ -3,14 +3,13 @@
 module Delayed
   class Worker
     module ProcessHelper
-      STAT_MAC = 'ps -o lstart -p $WORKER_PID'
+      STAT_LINUX = "stat --format=%%Y /proc/$WORKER_PID"
+      STAT_MAC = "ps -o lstart -p $WORKER_PID"
       STAT = RUBY_PLATFORM =~ /darwin/ ? STAT_MAC : STAT_LINUX
       ALIVE_CHECK_LINUX = '[ -d "/proc/$WORKER_PID" ]'
-      ALIVE_CHECK_MAC =
+      ALIVE_CHECK_MAC = "ps -p $WORKER_PID > /dev/null"
       ALIVE_CHECK = RUBY_PLATFORM =~ /darwin/ ? ALIVE_CHECK_MAC : ALIVE_CHECK_LINUX
+      SCRIPT_TEMPLATE = <<-BASH
        WORKER_PID="%<pid>d" # an example, filled from ruby when the check is created
        ORIGINAL_MTIME="%<mtime>s" # an example, filled from ruby when the check is created

@@ -31,19 +30,19 @@ module Delayed

       def self.mtime(pid)
         if RUBY_PLATFORM =~ /darwin/
-          `ps -o lstart -p #{pid}`.sub(/\n$/,
+          `ps -o lstart -p #{pid}`.sub(/\n$/, "").presence
         else
           File::Stat.new("/proc/#{pid}").mtime.to_i.to_s rescue nil
         end
       end

       def self.check_script(pid, mtime)
+        format(SCRIPT_TEMPLATE, { pid: pid, mtime: mtime })
       end

       def self.process_is_still_running?(pid, mtime)
-        system(
+        system(check_script(pid, mtime))
       end
     end
   end
+end
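A short sketch of the ProcessHelper contract as changed above. The method names come from the diff; the comments describe assumed behaviour of the generated bash snippet.

    pid    = Process.pid
    mtime  = Delayed::Worker::ProcessHelper.mtime(pid)             # start-time fingerprint
    script = Delayed::Worker::ProcessHelper.check_script(pid, mtime)
    # `script` is SCRIPT_TEMPLATE with pid/mtime substituted; the Consul check above
    # runs it as ["bash", "-c", script] so a dead or recycled pid fails the check.
    Delayed::Worker::ProcessHelper.process_is_still_running?(pid, mtime) # => true / false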
data/lib/delayed/worker.rb
CHANGED
@@ -1,261 +1,267 @@
 # frozen_string_literal: true

 module Delayed
+  class TimeoutError < RuntimeError; end
+
+  class RetriableError < RuntimeError
+    # this error is a special case. You _should_ raise
+    # it from inside the rescue block for another error,
+    # because it indicates: "something made this job fail
+    # but we're pretty sure it's transient and it's safe to try again".
+    # the workflow is still the same (retry will happen unless
+    # retries are exhausted), but it won't call the :error
+    # callback unless it can't retry anymore. It WILL call the
+    # separate ":retry" callback, which is ONLY activated
+    # for this kind of error.
   end

-  require 'tmpdir'
-  require 'set'
+  require "tmpdir"
+  require "set"
+
+  class Worker
+    include Delayed::Logging
-    SIGNALS = %i{INT TERM QUIT}
+    SIGNALS = %i[INT TERM QUIT].freeze
+
+    attr_reader :config, :queue_name, :min_priority, :max_priority, :work_queue
+
-    def self.on_max_failures=(block)
-      @@on_max_failures = block
-    cattr_reader :on_max_failures
+    class << self
+      # Callback to fire when a delayed job fails max_attempts times. If this
+      # callback is defined, then the value of destroy_failed_jobs is ignored, and
+      # the job is destroyed if this block returns true.
+      #
+      # This allows for destroying "uninteresting" failures, while keeping around
+      # interesting failures to be investigated later.
+      #
+      # The block is called with args(job, last_exception)
+      attr_accessor :on_max_failures
+    end

+    cattr_accessor :plugins
+    self.plugins = Set.new

+    def self.lifecycle
+      @lifecycle ||= Delayed::Lifecycle.new
+    end

+    def self.current_job
+      Thread.current[:running_delayed_job]
+    end

+    def self.running_job(job)
+      Thread.current[:running_delayed_job] = job
+      yield
+    ensure
+      Thread.current[:running_delayed_job] = nil
+    end
+
+    def initialize(options = {})
+      @exit = false
+      @parent_pid = options[:parent_pid]
+      @queue_name = options[:queue] ||= Settings.queue
+      @min_priority = options[:min_priority]
+      @max_priority = options[:max_priority]
+      @max_job_count = options[:worker_max_job_count].to_i
+      @max_memory_usage = options[:worker_max_memory_usage].to_i
+      @work_queue = options.delete(:work_queue) || WorkQueue::InProcess.new
+      @health_check_type = Settings.worker_health_check_type
+      @health_check_config = Settings.worker_health_check_config
+      @config = options
+      @job_count = 0
+
+      @signal_queue = []
+
+      app = Rails.application
+      if app && !app.config.cache_classes
+        Delayed::Worker.lifecycle.around(:perform) do |worker, job, &block|
+          reload = app.config.reload_classes_only_on_change != true || app.reloaders.map(&:updated?).any?
+
+          if reload
+            if defined?(ActiveSupport::Reloader)
+              Rails.application.reloader.reload!
+            else
+              ActionDispatch::Reloader.prepare!
+            end
           end

+          begin
+            block.call(worker, job)
+          ensure
+            ActionDispatch::Reloader.cleanup! if reload && !defined?(ActiveSupport::Reloader)
+          end
         end
       end

-      plugins.each { |plugin| plugin.inject! }
+      plugins.each(&:inject!)
+    end

+    def name
+      @name ||= "#{Socket.gethostname rescue 'X'}:#{id}"
+    end

-    def set_process_name(new_name)
-      $0 = "delayed:#{new_name}"
-    end
+    def process_name=(new_name)
+      $0 = "delayed:#{new_name}"
+    end

+    def exit?
+      !!@exit || parent_exited?
+    end

+    def parent_exited?
+      @parent_pid && @parent_pid != Process.ppid
+    end

+    def wake_up
+      @self_pipe[1].write_nonblock(".", exception: false)
+      work_queue.wake_up
     end

+    def start
+      logger.info "Starting worker"
+      self.process_name =
+        "start:#{Settings.worker_procname_prefix}#{@queue_name}:#{min_priority || 0}:#{max_priority || 'max'}"
+      @self_pipe = IO.pipe
+      work_queue.init
+
+      work_thread = Thread.current
+      SIGNALS.each do |sig|
+        trap(sig) do
+          @signal_queue << sig
+          wake_up
         end
       end

+      raise "Could not register health_check" unless health_check.start
+
+      signal_processor = Thread.new do
+        loop do
+          @self_pipe[0].read(1)
+          case @signal_queue.pop
+          when :INT, :TERM
+            @exit = true # get the main thread to bail early if it's waiting for a job
+            work_thread.raise(SystemExit) # Force the main thread to bail out of the current job
+            cleanup! # we're going to get SIGKILL'd in a moment, so clean up asap
+            break
+          when :QUIT
+            @exit = true
+          else
+            logger.error "Unknown signal '#{sig}' received"
+          end
+        end
       end

+      self.class.lifecycle.run_callbacks(:execute, self) do
+        run until exit?
+      end

+      logger.info "Stopping worker"
+    rescue => e
+      Rails.logger.fatal("Child process died: #{e.inspect}") rescue nil
-      self.class.lifecycle.run_callbacks(:exceptional_exit, self, e) { }
+      self.class.lifecycle.run_callbacks(:exceptional_exit, self, e) { nil }
+    ensure
+      cleanup!

+      if signal_processor
+        signal_processor.kill
+        signal_processor.join
+      end

+      @self_pipe&.each(&:close)
+      @self_pipe = nil
+    end

+    def cleanup!
+      return if cleaned?

+      health_check.stop
+      work_queue.close
+      Delayed::Job.clear_locks!(name)

+      @cleaned = true
+    end

+    def cleaned?
+      @cleaned
+    end

+    def run
+      return if exit?

+      self.class.lifecycle.run_callbacks(:loop, self) do
-      set_process_name("pop:#{Settings.worker_procname_prefix}#{@queue_name}:#{min_priority || 0}:#{max_priority || 'max'}")
+        self.process_name =
+          "pop:#{Settings.worker_procname_prefix}#{@queue_name}:#{min_priority || 0}:#{max_priority || 'max'}"
+        job = self.class.lifecycle.run_callbacks(:pop, self) do
+          work_queue.get_and_lock_next_available(name, config)
+        end

+        if job
+          configure_for_job(job) do
+            @job_count += perform(job)
+
+            if @max_job_count.positive? && @job_count >= @max_job_count
+              logger.debug "Max job count of #{@max_job_count} exceeded, dying"
               @exit = true
+            end
+
+            if @max_memory_usage.positive?
+              memory = sample_memory
+              if memory > @max_memory_usage
+                logger.debug "Memory usage of #{memory} exceeds max of #{@max_memory_usage}, dying"
+                @exit = true
+              else
+                logger.debug "Memory usage: #{memory}"
+              end
             end
           end
+        else
-        set_process_name("wait:#{Settings.worker_procname_prefix}#{@queue_name}:#{min_priority || 0}:#{max_priority || 'max'}")
+          self.process_name =
+            "wait:#{Settings.worker_procname_prefix}#{@queue_name}:#{min_priority || 0}:#{max_priority || 'max'}"
+          sleep(Settings.sleep_delay + (rand * Settings.sleep_delay_stagger)) unless exit?
         end
       end
     end

+    def perform(job)
+      begin
+        count = 1
+        raise Delayed::Backend::JobExpired, "job expired at #{job.expires_at}" if job.expired?
+
+        self.class.lifecycle.run_callbacks(:perform, self, job) do
+          self.process_name = "run:#{Settings.worker_procname_prefix}#{job.id}:#{job.name}"
+          logger.info("Processing #{log_job(job, :long)}")
+          runtime = Benchmark.realtime do
+            if job.batch?
+              # each job in the batch will have perform called on it, so we don't
+              # need a timeout around this
+              count = perform_batch(job)
+            else
+              job.invoke_job
+            end
+            job.destroy
           end
-          logger.info("Completed #{log_job(job)} #{"%.0fms" % (runtime * 1000)}")
+          logger.info("Completed #{log_job(job)} #{format('%.0fms', (runtime * 1000))}")
+        end
-      rescue ::Delayed::RetriableError => re
-        self.class.lifecycle.run_callbacks(callback_type, self, job, re) do
-          handle_failed_job(job, re)
+      rescue ::Delayed::RetriableError => e
+        can_retry = job.attempts + 1 < job.inferred_max_attempts
+        callback_type = can_retry ? :retry : :error
+        self.class.lifecycle.run_callbacks(callback_type, self, job, e) do
+          handle_failed_job(job, e)
+        end
-      rescue SystemExit => se
-        job.reschedule(se)
+      rescue SystemExit => e
+        # There wasn't really a failure here so no callbacks and whatnot needed,
+        # still reschedule the job though.
+        job.reschedule(e)
+      rescue Exception => e # rubocop:disable Lint/RescueException
+        self.class.lifecycle.run_callbacks(:error, self, job, e) do
+          handle_failed_job(job, e)
         end
       end
+      count
     end

+    def perform_batch(parent_job)
+      batch = parent_job.payload_object
+      return unless batch.mode == :serial
+
       batch.jobs.each do |job|
         job.source = parent_job.source
         job.create_and_lock!(name)
@@ -265,72 +271,72 @@ class Worker
       end
       batch.items.size
     end

+    def handle_failed_job(job, error)
+      job.last_error = "#{error.message}\n#{error.backtrace.join("\n")}"
+      logger.error("Failed with #{error.class} [#{error.message}] (#{job.attempts} attempts)")
+      job.reschedule(error)
+    end

+    def id
+      Process.pid
+    end

+    def log_job(job, format = :short)
+      case format
+      when :long
+        "#{job.full_name} #{Settings.job_detailed_log_format.call(job)}"
+      else
+        job.full_name
+      end
     end

+    # set up the session context information, so that it gets logged with the job log lines
+    # also set up a unique tmpdir, which will get removed at the end of the job.
+    def configure_for_job(job)
+      previous_tmpdir = ENV["TMPDIR"]

+      self.class.running_job(job) do
+        dir = Dir.mktmpdir("job-#{job.id}-#{name.gsub(/[^\w.]/, '.')}-")
+        begin
+          ENV["TMPDIR"] = dir
+          yield
+        ensure
+          FileUtils.remove_entry(dir, true)
+        end
       end
+    ensure
-      ENV['TMPDIR'] = previous_tmpdir
+      ENV["TMPDIR"] = previous_tmpdir
     end

+    def health_check
+      @health_check ||= HealthCheck.build(
+        type: @health_check_type,
+        worker_name: name,
+        config: @health_check_config
+      )
     end

+    # `sample` reports KB, not B
+    if File.directory?("/proc")
+      # linux w/ proc fs
+      LINUX_PAGE_SIZE = (size = `getconf PAGESIZE`.to_i
+                         size.positive? ? size : 4096)
+      def sample_memory
+        s = File.read("/proc/#{Process.pid}/statm").to_i rescue 0
+        s * LINUX_PAGE_SIZE / 1024
+      end
+    else
+      # generic unix solution
+      def sample_memory
+        if Rails.env.test?
+          0
+        else
+          # hmm this is actually resident set size, doesn't include swapped-to-disk
+          # memory.
+          `ps -o rss= -p #{Process.pid}`.to_i
+        end
       end
     end
   end
 end
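A hedged sketch of the new `Delayed::RetriableError` in use, following the comment block added at the top of worker.rb; `FlakyJob` and `do_network_call` are hypothetical names, not part of the gem.

    # Hypothetical job class; only Delayed::RetriableError comes from the diff above.
    class FlakyJob
      def perform
        do_network_call # assumed application method
      rescue Timeout::Error => e
        # Re-raise as retriable: the rescue added to Worker#perform fires the
        # :retry callback while attempts remain, and only falls back to the
        # :error path once job.inferred_max_attempts is exhausted.
        raise Delayed::RetriableError, e.message
      end
    end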