inst-jobs 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/db/migrate/20101216224513_create_delayed_jobs.rb +9 -7
- data/db/migrate/20110531144916_cleanup_delayed_jobs_indexes.rb +8 -13
- data/db/migrate/20110610213249_optimize_delayed_jobs.rb +8 -8
- data/db/migrate/20110831210257_add_delayed_jobs_next_in_strand.rb +25 -25
- data/db/migrate/20120510004759_delayed_jobs_delete_trigger_lock_for_update.rb +4 -8
- data/db/migrate/20120531150712_drop_psql_jobs_pop_fn.rb +1 -3
- data/db/migrate/20120607164022_delayed_jobs_use_advisory_locks.rb +11 -15
- data/db/migrate/20120607181141_index_jobs_on_locked_by.rb +1 -1
- data/db/migrate/20120608191051_add_jobs_run_at_index.rb +2 -2
- data/db/migrate/20120927184213_change_delayed_jobs_handler_to_text.rb +1 -1
- data/db/migrate/20140505215510_copy_failed_jobs_original_id.rb +2 -3
- data/db/migrate/20150807133223_add_max_concurrent_to_jobs.rb +9 -13
- data/db/migrate/20151210162949_improve_max_concurrent.rb +4 -8
- data/db/migrate/20161206323555_add_back_default_string_limits_jobs.rb +3 -2
- data/db/migrate/20181217155351_speed_up_max_concurrent_triggers.rb +13 -17
- data/db/migrate/20200330230722_add_id_to_get_delayed_jobs_index.rb +8 -8
- data/db/migrate/20200824222232_speed_up_max_concurrent_delete_trigger.rb +72 -77
- data/db/migrate/20200825011002_add_strand_order_override.rb +93 -97
- data/db/migrate/20210809145804_add_n_strand_index.rb +12 -0
- data/db/migrate/20210812210128_add_singleton_column.rb +200 -0
- data/db/migrate/20210917232626_add_delete_conflicting_singletons_before_unlock_trigger.rb +27 -0
- data/db/migrate/20210928174754_fix_singleton_condition_in_before_insert.rb +56 -0
- data/db/migrate/20210929204903_update_conflicting_singleton_function_to_use_index.rb +27 -0
- data/exe/inst_jobs +3 -2
- data/lib/delayed/backend/active_record.rb +211 -168
- data/lib/delayed/backend/base.rb +110 -72
- data/lib/delayed/batch.rb +11 -9
- data/lib/delayed/cli.rb +98 -84
- data/lib/delayed/core_ext/kernel.rb +4 -2
- data/lib/delayed/daemon.rb +70 -74
- data/lib/delayed/job_tracking.rb +26 -25
- data/lib/delayed/lifecycle.rb +27 -23
- data/lib/delayed/log_tailer.rb +17 -17
- data/lib/delayed/logging.rb +13 -16
- data/lib/delayed/message_sending.rb +43 -52
- data/lib/delayed/performable_method.rb +6 -8
- data/lib/delayed/periodic.rb +72 -68
- data/lib/delayed/plugin.rb +2 -4
- data/lib/delayed/pool.rb +205 -168
- data/lib/delayed/server/helpers.rb +6 -6
- data/lib/delayed/server.rb +51 -54
- data/lib/delayed/settings.rb +94 -81
- data/lib/delayed/testing.rb +21 -22
- data/lib/delayed/version.rb +1 -1
- data/lib/delayed/work_queue/in_process.rb +21 -17
- data/lib/delayed/work_queue/parent_process/client.rb +55 -53
- data/lib/delayed/work_queue/parent_process/server.rb +245 -207
- data/lib/delayed/work_queue/parent_process.rb +52 -53
- data/lib/delayed/worker/consul_health_check.rb +32 -33
- data/lib/delayed/worker/health_check.rb +34 -26
- data/lib/delayed/worker/null_health_check.rb +3 -1
- data/lib/delayed/worker/process_helper.rb +8 -9
- data/lib/delayed/worker.rb +272 -241
- data/lib/delayed/yaml_extensions.rb +12 -10
- data/lib/delayed_job.rb +37 -37
- data/lib/inst-jobs.rb +1 -1
- data/spec/active_record_job_spec.rb +143 -139
- data/spec/delayed/cli_spec.rb +7 -7
- data/spec/delayed/daemon_spec.rb +10 -9
- data/spec/delayed/message_sending_spec.rb +16 -9
- data/spec/delayed/periodic_spec.rb +14 -21
- data/spec/delayed/server_spec.rb +38 -38
- data/spec/delayed/settings_spec.rb +26 -25
- data/spec/delayed/work_queue/in_process_spec.rb +7 -8
- data/spec/delayed/work_queue/parent_process/client_spec.rb +17 -12
- data/spec/delayed/work_queue/parent_process/server_spec.rb +117 -41
- data/spec/delayed/work_queue/parent_process_spec.rb +21 -23
- data/spec/delayed/worker/consul_health_check_spec.rb +37 -50
- data/spec/delayed/worker/health_check_spec.rb +60 -52
- data/spec/delayed/worker_spec.rb +44 -21
- data/spec/sample_jobs.rb +45 -15
- data/spec/shared/delayed_batch.rb +74 -67
- data/spec/shared/delayed_method.rb +143 -102
- data/spec/shared/performable_method.rb +39 -38
- data/spec/shared/shared_backend.rb +550 -437
- data/spec/shared/testing.rb +14 -14
- data/spec/shared/worker.rb +156 -148
- data/spec/shared_jobs_specs.rb +13 -13
- data/spec/spec_helper.rb +53 -55
- metadata +148 -82
- data/lib/delayed/backend/redis/bulk_update.lua +0 -50
- data/lib/delayed/backend/redis/destroy_job.lua +0 -2
- data/lib/delayed/backend/redis/enqueue.lua +0 -29
- data/lib/delayed/backend/redis/fail_job.lua +0 -5
- data/lib/delayed/backend/redis/find_available.lua +0 -3
- data/lib/delayed/backend/redis/functions.rb +0 -59
- data/lib/delayed/backend/redis/get_and_lock_next_available.lua +0 -17
- data/lib/delayed/backend/redis/includes/jobs_common.lua +0 -203
- data/lib/delayed/backend/redis/job.rb +0 -535
- data/lib/delayed/backend/redis/set_running.lua +0 -5
- data/lib/delayed/backend/redis/tickle_strand.lua +0 -2
- data/spec/gemfiles/42.gemfile +0 -7
- data/spec/gemfiles/50.gemfile +0 -7
- data/spec/gemfiles/51.gemfile +0 -7
- data/spec/gemfiles/52.gemfile +0 -7
- data/spec/gemfiles/60.gemfile +0 -7
- data/spec/redis_job_spec.rb +0 -148
@@ -1,77 +1,75 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative
|
5
|
-
require
|
3
|
+
require_relative "health_check"
|
4
|
+
require_relative "process_helper"
|
5
|
+
require "socket"
|
6
6
|
|
7
7
|
module Delayed
|
8
8
|
class Worker
|
9
9
|
class ConsulHealthCheck < HealthCheck
|
10
10
|
self.type_name = :consul
|
11
11
|
|
12
|
-
CONSUL_CONFIG_KEYS = %w
|
13
|
-
DEFAULT_SERVICE_NAME =
|
14
|
-
attr_reader :
|
12
|
+
CONSUL_CONFIG_KEYS = %w[url acl_token].map(&:freeze).freeze
|
13
|
+
DEFAULT_SERVICE_NAME = "inst-jobs_worker"
|
14
|
+
attr_reader :service_client, :health_client
|
15
15
|
|
16
16
|
def initialize(*, **)
|
17
17
|
super
|
18
18
|
# Because we don't want the consul client to be a hard dependency we're
|
19
19
|
# only requiring it once it's absolutely needed
|
20
|
-
require
|
20
|
+
require "diplomat"
|
21
21
|
|
22
22
|
if config.keys.any? { |k| CONSUL_CONFIG_KEYS.include?(k) }
|
23
|
-
consul_config =
|
23
|
+
consul_config = Diplomat::Configuration.new.tap do |conf|
|
24
24
|
CONSUL_CONFIG_KEYS.each do |key|
|
25
25
|
conf.send("#{key}=", config[key]) if config[key]
|
26
26
|
end
|
27
27
|
end
|
28
|
-
@
|
29
|
-
@
|
28
|
+
@service_client = Diplomat::Service.new(configuration: consul_config)
|
29
|
+
@health_client = Diplomat::Health.new(configuration: consul_config)
|
30
30
|
else
|
31
|
-
@
|
32
|
-
@
|
31
|
+
@service_client = Diplomat::Service.new
|
32
|
+
@health_client = Diplomat::Health.new
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
36
36
|
def start
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
response = @agent_client.register_service(service)
|
43
|
-
response.ok?
|
37
|
+
@service_client.register({
|
38
|
+
id: worker_name,
|
39
|
+
name: service_name,
|
40
|
+
check: check_attributes
|
41
|
+
})
|
44
42
|
end
|
45
43
|
|
46
44
|
def stop
|
47
|
-
|
48
|
-
response.ok? || response.not_found?
|
45
|
+
@service_client.deregister(worker_name)
|
49
46
|
end
|
50
47
|
|
51
48
|
def live_workers
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
49
|
+
# Filter out critical workers (probably nodes failing their serf health check)
|
50
|
+
live_nodes = @health_client.service(service_name, {
|
51
|
+
filter: "not Checks.Status == critical"
|
52
|
+
})
|
53
|
+
|
54
|
+
live_nodes.map { |n| n.Service["ID"] }
|
58
55
|
end
|
59
56
|
|
60
57
|
private
|
61
58
|
|
62
59
|
def check_attributes
|
63
60
|
{
|
64
|
-
args: [
|
65
|
-
status:
|
66
|
-
interval: @config.fetch(:check_interval,
|
67
|
-
deregister_critical_service_after: @config.fetch(:deregister_service_delay,
|
61
|
+
args: ["bash", "-c", check_script],
|
62
|
+
status: "passing",
|
63
|
+
interval: @config.fetch(:check_interval, "5m"),
|
64
|
+
deregister_critical_service_after: @config.fetch(:deregister_service_delay, "10m")
|
68
65
|
}.tap do |h|
|
69
|
-
h[:docker_container_id] = docker_container_id if @config[
|
66
|
+
h[:docker_container_id] = docker_container_id if @config["docker"]
|
70
67
|
end
|
71
68
|
end
|
72
69
|
|
73
70
|
def check_script
|
74
71
|
return @check_script if @check_script
|
72
|
+
|
75
73
|
mtime = ProcessHelper.mtime(Process.pid)
|
76
74
|
@check_script = ProcessHelper.check_script(Process.pid, mtime)
|
77
75
|
end
|
@@ -80,12 +78,13 @@ module Delayed
|
|
80
78
|
# cgroups for part of its magic and also uses the container id as the cgroup name
|
81
79
|
def docker_container_id
|
82
80
|
return @docker_container_id if @docker_container_id
|
81
|
+
|
83
82
|
content = File.read("/proc/1/cgroup").split("\n")
|
84
83
|
@docker_container_id = content.last.split("/").last
|
85
84
|
end
|
86
85
|
|
87
86
|
def service_name
|
88
|
-
@service_name ||= @config.fetch(
|
87
|
+
@service_name ||= @config.fetch("service_name", DEFAULT_SERVICE_NAME)
|
89
88
|
end
|
90
89
|
end
|
91
90
|
end
|
@@ -11,57 +11,65 @@ module Delayed
|
|
11
11
|
|
12
12
|
def inherited(subclass)
|
13
13
|
@subclasses << subclass
|
14
|
+
super
|
14
15
|
end
|
15
16
|
|
16
17
|
def build(type:, worker_name:, config: {})
|
17
18
|
type = type.to_sym
|
18
19
|
klass = @subclasses.find { |sc| sc.type_name == type }
|
19
20
|
raise ArgumentError, "Unable to build a HealthCheck for type #{type}" unless klass
|
21
|
+
|
20
22
|
klass.new(worker_name: worker_name, config: config)
|
21
23
|
end
|
22
24
|
|
23
25
|
def reschedule_abandoned_jobs
|
24
26
|
return if Settings.worker_health_check_type == :none
|
27
|
+
|
25
28
|
Delayed::Job.transaction do
|
26
|
-
# this
|
29
|
+
# this action is a special case, and SHOULD NOT be a periodic job
|
27
30
|
# because if it gets wiped out suddenly during execution
|
28
|
-
# it can't go clean up
|
29
|
-
# we
|
30
|
-
#
|
31
|
-
#
|
32
|
-
|
31
|
+
# it can't go clean up its abandoned self. Therefore,
|
32
|
+
# we expect it to get run from its own process forked from the job pool
|
33
|
+
# and we try to get an advisory lock when it runs. If we succeed,
|
34
|
+
# no other worker is trying to do this right now (and if we abandon the
|
35
|
+
# operation, the transaction will end, releasing the advisory lock).
|
36
|
+
result = Delayed::Job.attempt_advisory_lock("Delayed::Worker::HealthCheck#reschedule_abandoned_jobs")
|
33
37
|
return unless result
|
38
|
+
|
39
|
+
horizon = 5.minutes.ago
|
40
|
+
|
34
41
|
checker = Worker::HealthCheck.build(
|
35
42
|
type: Settings.worker_health_check_type,
|
36
43
|
config: Settings.worker_health_check_config,
|
37
|
-
worker_name:
|
44
|
+
worker_name: "cleanup-crew"
|
38
45
|
)
|
39
46
|
live_workers = checker.live_workers
|
40
47
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
48
|
+
loop do
|
49
|
+
batch = Delayed::Job.running_jobs
|
50
|
+
.where("locked_at<?", horizon)
|
51
|
+
.where.not("locked_by LIKE 'prefetch:%'")
|
52
|
+
.where.not(locked_by: live_workers)
|
53
|
+
.limit(100)
|
54
|
+
.to_a
|
55
|
+
break if batch.empty?
|
56
|
+
|
57
|
+
batch.each do |job|
|
58
|
+
Delayed::Job.transaction do
|
59
|
+
# double check that the job is still there. locked_by will immediately be reset
|
60
|
+
# to nil in this transaction by Job#reschedule
|
61
|
+
next unless Delayed::Job.where(id: job,
|
62
|
+
locked_by: job.locked_by)
|
63
|
+
.update_all(locked_by: "abandoned job cleanup") == 1
|
64
|
+
|
65
|
+
job.reschedule
|
54
66
|
end
|
55
67
|
end
|
68
|
+
rescue
|
69
|
+
::Rails.logger.error "Failure rescheduling abandoned job #{job.id} #{$!.inspect}"
|
56
70
|
end
|
57
71
|
end
|
58
72
|
end
|
59
|
-
|
60
|
-
def attempt_advisory_lock
|
61
|
-
lock_name = "Delayed::Worker::HealthCheck#reschedule_abandoned_jobs"
|
62
|
-
output = ActiveRecord::Base.connection.execute("SELECT pg_try_advisory_xact_lock(half_md5_as_bigint('#{lock_name}'));")
|
63
|
-
output.getvalue(0, 0)
|
64
|
-
end
|
65
73
|
end
|
66
74
|
|
67
75
|
attr_accessor :config, :worker_name
|
@@ -3,14 +3,13 @@
|
|
3
3
|
module Delayed
|
4
4
|
class Worker
|
5
5
|
module ProcessHelper
|
6
|
-
|
7
|
-
|
8
|
-
STAT_MAC = 'ps -o lstart -p $WORKER_PID'
|
6
|
+
STAT_LINUX = "stat --format=%%Y /proc/$WORKER_PID"
|
7
|
+
STAT_MAC = "ps -o lstart -p $WORKER_PID"
|
9
8
|
STAT = RUBY_PLATFORM =~ /darwin/ ? STAT_MAC : STAT_LINUX
|
10
9
|
ALIVE_CHECK_LINUX = '[ -d "/proc/$WORKER_PID" ]'
|
11
|
-
ALIVE_CHECK_MAC =
|
10
|
+
ALIVE_CHECK_MAC = "ps -p $WORKER_PID > /dev/null"
|
12
11
|
ALIVE_CHECK = RUBY_PLATFORM =~ /darwin/ ? ALIVE_CHECK_MAC : ALIVE_CHECK_LINUX
|
13
|
-
SCRIPT_TEMPLATE = <<-BASH
|
12
|
+
SCRIPT_TEMPLATE = <<-BASH
|
14
13
|
WORKER_PID="%<pid>d" # an example, filled from ruby when the check is created
|
15
14
|
ORIGINAL_MTIME="%<mtime>s" # an example, filled from ruby when the check is created
|
16
15
|
|
@@ -31,19 +30,19 @@ module Delayed
|
|
31
30
|
|
32
31
|
def self.mtime(pid)
|
33
32
|
if RUBY_PLATFORM =~ /darwin/
|
34
|
-
`ps -o lstart -p #{pid}`.sub(/\n$/,
|
33
|
+
`ps -o lstart -p #{pid}`.sub(/\n$/, "").presence
|
35
34
|
else
|
36
35
|
File::Stat.new("/proc/#{pid}").mtime.to_i.to_s rescue nil
|
37
36
|
end
|
38
37
|
end
|
39
38
|
|
40
39
|
def self.check_script(pid, mtime)
|
41
|
-
|
40
|
+
format(SCRIPT_TEMPLATE, { pid: pid, mtime: mtime })
|
42
41
|
end
|
43
42
|
|
44
43
|
def self.process_is_still_running?(pid, mtime)
|
45
|
-
system(
|
44
|
+
system(check_script(pid, mtime))
|
46
45
|
end
|
47
46
|
end
|
48
47
|
end
|
49
|
-
end
|
48
|
+
end
|