inst-jobs 2.3.3 → 2.4.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/db/migrate/20101216224513_create_delayed_jobs.rb +9 -7
- data/db/migrate/20110531144916_cleanup_delayed_jobs_indexes.rb +8 -13
- data/db/migrate/20110610213249_optimize_delayed_jobs.rb +8 -8
- data/db/migrate/20110831210257_add_delayed_jobs_next_in_strand.rb +25 -25
- data/db/migrate/20120510004759_delayed_jobs_delete_trigger_lock_for_update.rb +4 -8
- data/db/migrate/20120531150712_drop_psql_jobs_pop_fn.rb +1 -3
- data/db/migrate/20120607164022_delayed_jobs_use_advisory_locks.rb +11 -15
- data/db/migrate/20120607181141_index_jobs_on_locked_by.rb +1 -1
- data/db/migrate/20120608191051_add_jobs_run_at_index.rb +2 -2
- data/db/migrate/20120927184213_change_delayed_jobs_handler_to_text.rb +1 -1
- data/db/migrate/20140505215510_copy_failed_jobs_original_id.rb +2 -3
- data/db/migrate/20150807133223_add_max_concurrent_to_jobs.rb +9 -13
- data/db/migrate/20151210162949_improve_max_concurrent.rb +4 -8
- data/db/migrate/20161206323555_add_back_default_string_limits_jobs.rb +3 -2
- data/db/migrate/20181217155351_speed_up_max_concurrent_triggers.rb +13 -17
- data/db/migrate/20200330230722_add_id_to_get_delayed_jobs_index.rb +8 -8
- data/db/migrate/20200824222232_speed_up_max_concurrent_delete_trigger.rb +72 -77
- data/db/migrate/20200825011002_add_strand_order_override.rb +93 -97
- data/db/migrate/20210809145804_add_n_strand_index.rb +3 -3
- data/db/migrate/20210812210128_add_singleton_column.rb +200 -0
- data/db/migrate/20210917232626_add_delete_conflicting_singletons_before_unlock_trigger.rb +27 -0
- data/db/migrate/20210928174754_fix_singleton_condition_in_before_insert.rb +56 -0
- data/exe/inst_jobs +3 -2
- data/lib/delayed/backend/active_record.rb +204 -150
- data/lib/delayed/backend/base.rb +106 -82
- data/lib/delayed/batch.rb +11 -9
- data/lib/delayed/cli.rb +98 -84
- data/lib/delayed/core_ext/kernel.rb +4 -2
- data/lib/delayed/daemon.rb +70 -74
- data/lib/delayed/job_tracking.rb +26 -25
- data/lib/delayed/lifecycle.rb +27 -24
- data/lib/delayed/log_tailer.rb +17 -17
- data/lib/delayed/logging.rb +13 -16
- data/lib/delayed/message_sending.rb +43 -52
- data/lib/delayed/performable_method.rb +6 -8
- data/lib/delayed/periodic.rb +72 -65
- data/lib/delayed/plugin.rb +2 -4
- data/lib/delayed/pool.rb +198 -193
- data/lib/delayed/server/helpers.rb +6 -6
- data/lib/delayed/server.rb +51 -54
- data/lib/delayed/settings.rb +93 -81
- data/lib/delayed/testing.rb +21 -22
- data/lib/delayed/version.rb +1 -1
- data/lib/delayed/work_queue/in_process.rb +21 -18
- data/lib/delayed/work_queue/parent_process/client.rb +54 -55
- data/lib/delayed/work_queue/parent_process/server.rb +219 -208
- data/lib/delayed/work_queue/parent_process.rb +52 -53
- data/lib/delayed/worker/consul_health_check.rb +21 -19
- data/lib/delayed/worker/health_check.rb +29 -22
- data/lib/delayed/worker/null_health_check.rb +3 -1
- data/lib/delayed/worker/process_helper.rb +8 -9
- data/lib/delayed/worker.rb +271 -265
- data/lib/delayed/yaml_extensions.rb +12 -10
- data/lib/delayed_job.rb +37 -38
- data/lib/inst-jobs.rb +1 -1
- data/spec/active_record_job_spec.rb +129 -136
- data/spec/delayed/cli_spec.rb +7 -7
- data/spec/delayed/daemon_spec.rb +10 -9
- data/spec/delayed/message_sending_spec.rb +16 -9
- data/spec/delayed/periodic_spec.rb +13 -12
- data/spec/delayed/server_spec.rb +38 -38
- data/spec/delayed/settings_spec.rb +26 -25
- data/spec/delayed/work_queue/in_process_spec.rb +7 -8
- data/spec/delayed/work_queue/parent_process/client_spec.rb +17 -12
- data/spec/delayed/work_queue/parent_process/server_spec.rb +70 -41
- data/spec/delayed/work_queue/parent_process_spec.rb +21 -23
- data/spec/delayed/worker/consul_health_check_spec.rb +22 -22
- data/spec/delayed/worker/health_check_spec.rb +60 -52
- data/spec/delayed/worker_spec.rb +28 -25
- data/spec/sample_jobs.rb +45 -15
- data/spec/shared/delayed_batch.rb +74 -67
- data/spec/shared/delayed_method.rb +143 -102
- data/spec/shared/performable_method.rb +39 -38
- data/spec/shared/shared_backend.rb +547 -441
- data/spec/shared/testing.rb +14 -14
- data/spec/shared/worker.rb +155 -147
- data/spec/shared_jobs_specs.rb +13 -13
- data/spec/spec_helper.rb +46 -41
- metadata +79 -55
- data/lib/delayed/backend/redis/bulk_update.lua +0 -50
- data/lib/delayed/backend/redis/destroy_job.lua +0 -2
- data/lib/delayed/backend/redis/enqueue.lua +0 -29
- data/lib/delayed/backend/redis/fail_job.lua +0 -5
- data/lib/delayed/backend/redis/find_available.lua +0 -3
- data/lib/delayed/backend/redis/functions.rb +0 -59
- data/lib/delayed/backend/redis/get_and_lock_next_available.lua +0 -17
- data/lib/delayed/backend/redis/includes/jobs_common.lua +0 -203
- data/lib/delayed/backend/redis/job.rb +0 -528
- data/lib/delayed/backend/redis/set_running.lua +0 -5
- data/lib/delayed/backend/redis/tickle_strand.lua +0 -2
- data/spec/gemfiles/52.gemfile +0 -7
- data/spec/gemfiles/60.gemfile +0 -7
- data/spec/gemfiles/60.gemfile.lock +0 -246
- data/spec/gemfiles/61.gemfile +0 -7
- data/spec/redis_job_spec.rb +0 -148
@@ -1,23 +1,23 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative
|
5
|
-
require
|
3
|
+
require_relative "health_check"
|
4
|
+
require_relative "process_helper"
|
5
|
+
require "socket"
|
6
6
|
|
7
7
|
module Delayed
|
8
8
|
class Worker
|
9
9
|
class ConsulHealthCheck < HealthCheck
|
10
10
|
self.type_name = :consul
|
11
11
|
|
12
|
-
CONSUL_CONFIG_KEYS = %w
|
13
|
-
DEFAULT_SERVICE_NAME =
|
12
|
+
CONSUL_CONFIG_KEYS = %w[url acl_token].map(&:freeze).freeze
|
13
|
+
DEFAULT_SERVICE_NAME = "inst-jobs_worker"
|
14
14
|
attr_reader :service_client, :health_client
|
15
15
|
|
16
16
|
def initialize(*, **)
|
17
17
|
super
|
18
18
|
# Because we don't want the consul client to be a hard dependency we're
|
19
19
|
# only requiring it once it's absolutely needed
|
20
|
-
require
|
20
|
+
require "diplomat"
|
21
21
|
|
22
22
|
if config.keys.any? { |k| CONSUL_CONFIG_KEYS.include?(k) }
|
23
23
|
consul_config = Diplomat::Configuration.new.tap do |conf|
|
@@ -35,10 +35,10 @@ module Delayed
|
|
35
35
|
|
36
36
|
def start
|
37
37
|
@service_client.register({
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
38
|
+
id: worker_name,
|
39
|
+
name: service_name,
|
40
|
+
check: check_attributes
|
41
|
+
})
|
42
42
|
end
|
43
43
|
|
44
44
|
def stop
|
@@ -48,27 +48,28 @@ module Delayed
|
|
48
48
|
def live_workers
|
49
49
|
# Filter out critical workers (probably nodes failing their serf health check)
|
50
50
|
live_nodes = @health_client.service(service_name, {
|
51
|
-
|
52
|
-
|
51
|
+
filter: "not Checks.Status == critical"
|
52
|
+
})
|
53
53
|
|
54
|
-
live_nodes.map { |n| n.Service[
|
54
|
+
live_nodes.map { |n| n.Service["ID"] }
|
55
55
|
end
|
56
56
|
|
57
57
|
private
|
58
58
|
|
59
59
|
def check_attributes
|
60
60
|
{
|
61
|
-
args: [
|
62
|
-
status:
|
63
|
-
interval: @config.fetch(:check_interval,
|
64
|
-
deregister_critical_service_after: @config.fetch(:deregister_service_delay,
|
61
|
+
args: ["bash", "-c", check_script],
|
62
|
+
status: "passing",
|
63
|
+
interval: @config.fetch(:check_interval, "5m"),
|
64
|
+
deregister_critical_service_after: @config.fetch(:deregister_service_delay, "10m")
|
65
65
|
}.tap do |h|
|
66
|
-
h[:docker_container_id] = docker_container_id if @config[
|
66
|
+
h[:docker_container_id] = docker_container_id if @config["docker"]
|
67
67
|
end
|
68
68
|
end
|
69
69
|
|
70
70
|
def check_script
|
71
71
|
return @check_script if @check_script
|
72
|
+
|
72
73
|
mtime = ProcessHelper.mtime(Process.pid)
|
73
74
|
@check_script = ProcessHelper.check_script(Process.pid, mtime)
|
74
75
|
end
|
@@ -77,12 +78,13 @@ module Delayed
|
|
77
78
|
# cgroups for part of its magic and also uses the container id as the cgroup name
|
78
79
|
def docker_container_id
|
79
80
|
return @docker_container_id if @docker_container_id
|
81
|
+
|
80
82
|
content = File.read("/proc/1/cgroup").split("\n")
|
81
83
|
@docker_container_id = content.last.split("/").last
|
82
84
|
end
|
83
85
|
|
84
86
|
def service_name
|
85
|
-
@service_name ||= @config.fetch(
|
87
|
+
@service_name ||= @config.fetch("service_name", DEFAULT_SERVICE_NAME)
|
86
88
|
end
|
87
89
|
end
|
88
90
|
end
|
@@ -11,58 +11,65 @@ module Delayed
|
|
11
11
|
|
12
12
|
def inherited(subclass)
|
13
13
|
@subclasses << subclass
|
14
|
+
super
|
14
15
|
end
|
15
16
|
|
16
17
|
def build(type:, worker_name:, config: {})
|
17
18
|
type = type.to_sym
|
18
19
|
klass = @subclasses.find { |sc| sc.type_name == type }
|
19
20
|
raise ArgumentError, "Unable to build a HealthCheck for type #{type}" unless klass
|
21
|
+
|
20
22
|
klass.new(worker_name: worker_name, config: config)
|
21
23
|
end
|
22
24
|
|
23
25
|
def reschedule_abandoned_jobs
|
24
26
|
return if Settings.worker_health_check_type == :none
|
27
|
+
|
25
28
|
Delayed::Job.transaction do
|
26
29
|
# this action is a special case, and SHOULD NOT be a periodic job
|
27
30
|
# because if it gets wiped out suddenly during execution
|
28
|
-
# it can't go clean up
|
31
|
+
# it can't go clean up its abandoned self. Therefore,
|
29
32
|
# we expect it to get run from its own process forked from the job pool
|
30
33
|
# and we try to get an advisory lock when it runs. If we succeed,
|
31
34
|
# no other worker is trying to do this right now (and if we abandon the
|
32
35
|
# operation, the transaction will end, releasing the advisory lock).
|
33
|
-
result = attempt_advisory_lock
|
36
|
+
result = Delayed::Job.attempt_advisory_lock("Delayed::Worker::HealthCheck#reschedule_abandoned_jobs")
|
34
37
|
return unless result
|
38
|
+
|
39
|
+
horizon = 5.minutes.ago
|
40
|
+
|
35
41
|
checker = Worker::HealthCheck.build(
|
36
42
|
type: Settings.worker_health_check_type,
|
37
43
|
config: Settings.worker_health_check_config,
|
38
|
-
worker_name:
|
44
|
+
worker_name: "cleanup-crew"
|
39
45
|
)
|
40
46
|
live_workers = checker.live_workers
|
41
47
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
48
|
+
loop do
|
49
|
+
batch = Delayed::Job.running_jobs
|
50
|
+
.where("locked_at<?", horizon)
|
51
|
+
.where.not("locked_by LIKE 'prefetch:%'")
|
52
|
+
.where.not(locked_by: live_workers)
|
53
|
+
.limit(100)
|
54
|
+
.to_a
|
55
|
+
break if batch.empty?
|
56
|
+
|
57
|
+
batch.each do |job|
|
58
|
+
Delayed::Job.transaction do
|
59
|
+
# double check that the job is still there. locked_by will immediately be reset
|
60
|
+
# to nil in this transaction by Job#reschedule
|
61
|
+
next unless Delayed::Job.where(id: job,
|
62
|
+
locked_by: job.locked_by)
|
63
|
+
.update_all(locked_by: "abandoned job cleanup") == 1
|
64
|
+
|
65
|
+
job.reschedule
|
55
66
|
end
|
56
67
|
end
|
68
|
+
rescue
|
69
|
+
::Rails.logger.error "Failure rescheduling abandoned job #{job.id} #{$!.inspect}"
|
57
70
|
end
|
58
71
|
end
|
59
72
|
end
|
60
|
-
|
61
|
-
def attempt_advisory_lock
|
62
|
-
lock_name = "Delayed::Worker::HealthCheck#reschedule_abandoned_jobs"
|
63
|
-
conn = ActiveRecord::Base.connection
|
64
|
-
conn.select_value("SELECT pg_try_advisory_xact_lock(#{conn.quote_table_name('half_md5_as_bigint')}('#{lock_name}'));")
|
65
|
-
end
|
66
73
|
end
|
67
74
|
|
68
75
|
attr_accessor :config, :worker_name
|
@@ -3,14 +3,13 @@
|
|
3
3
|
module Delayed
|
4
4
|
class Worker
|
5
5
|
module ProcessHelper
|
6
|
-
|
7
|
-
|
8
|
-
STAT_MAC = 'ps -o lstart -p $WORKER_PID'
|
6
|
+
STAT_LINUX = "stat --format=%%Y /proc/$WORKER_PID"
|
7
|
+
STAT_MAC = "ps -o lstart -p $WORKER_PID"
|
9
8
|
STAT = RUBY_PLATFORM =~ /darwin/ ? STAT_MAC : STAT_LINUX
|
10
9
|
ALIVE_CHECK_LINUX = '[ -d "/proc/$WORKER_PID" ]'
|
11
|
-
ALIVE_CHECK_MAC =
|
10
|
+
ALIVE_CHECK_MAC = "ps -p $WORKER_PID > /dev/null"
|
12
11
|
ALIVE_CHECK = RUBY_PLATFORM =~ /darwin/ ? ALIVE_CHECK_MAC : ALIVE_CHECK_LINUX
|
13
|
-
SCRIPT_TEMPLATE = <<-BASH
|
12
|
+
SCRIPT_TEMPLATE = <<-BASH
|
14
13
|
WORKER_PID="%<pid>d" # an example, filled from ruby when the check is created
|
15
14
|
ORIGINAL_MTIME="%<mtime>s" # an example, filled from ruby when the check is created
|
16
15
|
|
@@ -31,19 +30,19 @@ module Delayed
|
|
31
30
|
|
32
31
|
def self.mtime(pid)
|
33
32
|
if RUBY_PLATFORM =~ /darwin/
|
34
|
-
`ps -o lstart -p #{pid}`.sub(/\n$/,
|
33
|
+
`ps -o lstart -p #{pid}`.sub(/\n$/, "").presence
|
35
34
|
else
|
36
35
|
File::Stat.new("/proc/#{pid}").mtime.to_i.to_s rescue nil
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
40
39
|
def self.check_script(pid, mtime)
|
41
|
-
|
40
|
+
format(SCRIPT_TEMPLATE, { pid: pid, mtime: mtime })
|
42
41
|
end
|
43
42
|
|
44
43
|
def self.process_is_still_running?(pid, mtime)
|
45
|
-
system(
|
44
|
+
system(check_script(pid, mtime))
|
46
45
|
end
|
47
46
|
end
|
48
47
|
end
|
49
|
-
end
|
48
|
+
end
|