inst-jobs 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. checksums.yaml +4 -4
  2. data/db/migrate/20101216224513_create_delayed_jobs.rb +9 -7
  3. data/db/migrate/20110531144916_cleanup_delayed_jobs_indexes.rb +8 -13
  4. data/db/migrate/20110610213249_optimize_delayed_jobs.rb +8 -8
  5. data/db/migrate/20110831210257_add_delayed_jobs_next_in_strand.rb +25 -25
  6. data/db/migrate/20120510004759_delayed_jobs_delete_trigger_lock_for_update.rb +4 -8
  7. data/db/migrate/20120531150712_drop_psql_jobs_pop_fn.rb +1 -3
  8. data/db/migrate/20120607164022_delayed_jobs_use_advisory_locks.rb +11 -15
  9. data/db/migrate/20120607181141_index_jobs_on_locked_by.rb +1 -1
  10. data/db/migrate/20120608191051_add_jobs_run_at_index.rb +2 -2
  11. data/db/migrate/20120927184213_change_delayed_jobs_handler_to_text.rb +1 -1
  12. data/db/migrate/20140505215510_copy_failed_jobs_original_id.rb +2 -3
  13. data/db/migrate/20150807133223_add_max_concurrent_to_jobs.rb +9 -13
  14. data/db/migrate/20151210162949_improve_max_concurrent.rb +4 -8
  15. data/db/migrate/20161206323555_add_back_default_string_limits_jobs.rb +3 -2
  16. data/db/migrate/20181217155351_speed_up_max_concurrent_triggers.rb +13 -17
  17. data/db/migrate/20200330230722_add_id_to_get_delayed_jobs_index.rb +8 -8
  18. data/db/migrate/20200824222232_speed_up_max_concurrent_delete_trigger.rb +72 -77
  19. data/db/migrate/20200825011002_add_strand_order_override.rb +93 -97
  20. data/db/migrate/20210809145804_add_n_strand_index.rb +12 -0
  21. data/db/migrate/20210812210128_add_singleton_column.rb +200 -0
  22. data/db/migrate/20210917232626_add_delete_conflicting_singletons_before_unlock_trigger.rb +27 -0
  23. data/db/migrate/20210928174754_fix_singleton_condition_in_before_insert.rb +56 -0
  24. data/db/migrate/20210929204903_update_conflicting_singleton_function_to_use_index.rb +27 -0
  25. data/exe/inst_jobs +3 -2
  26. data/lib/delayed/backend/active_record.rb +211 -168
  27. data/lib/delayed/backend/base.rb +110 -72
  28. data/lib/delayed/batch.rb +11 -9
  29. data/lib/delayed/cli.rb +98 -84
  30. data/lib/delayed/core_ext/kernel.rb +4 -2
  31. data/lib/delayed/daemon.rb +70 -74
  32. data/lib/delayed/job_tracking.rb +26 -25
  33. data/lib/delayed/lifecycle.rb +27 -23
  34. data/lib/delayed/log_tailer.rb +17 -17
  35. data/lib/delayed/logging.rb +13 -16
  36. data/lib/delayed/message_sending.rb +43 -52
  37. data/lib/delayed/performable_method.rb +6 -8
  38. data/lib/delayed/periodic.rb +72 -68
  39. data/lib/delayed/plugin.rb +2 -4
  40. data/lib/delayed/pool.rb +205 -168
  41. data/lib/delayed/server/helpers.rb +6 -6
  42. data/lib/delayed/server.rb +51 -54
  43. data/lib/delayed/settings.rb +94 -81
  44. data/lib/delayed/testing.rb +21 -22
  45. data/lib/delayed/version.rb +1 -1
  46. data/lib/delayed/work_queue/in_process.rb +21 -17
  47. data/lib/delayed/work_queue/parent_process/client.rb +55 -53
  48. data/lib/delayed/work_queue/parent_process/server.rb +245 -207
  49. data/lib/delayed/work_queue/parent_process.rb +52 -53
  50. data/lib/delayed/worker/consul_health_check.rb +32 -33
  51. data/lib/delayed/worker/health_check.rb +34 -26
  52. data/lib/delayed/worker/null_health_check.rb +3 -1
  53. data/lib/delayed/worker/process_helper.rb +8 -9
  54. data/lib/delayed/worker.rb +272 -241
  55. data/lib/delayed/yaml_extensions.rb +12 -10
  56. data/lib/delayed_job.rb +37 -37
  57. data/lib/inst-jobs.rb +1 -1
  58. data/spec/active_record_job_spec.rb +143 -139
  59. data/spec/delayed/cli_spec.rb +7 -7
  60. data/spec/delayed/daemon_spec.rb +10 -9
  61. data/spec/delayed/message_sending_spec.rb +16 -9
  62. data/spec/delayed/periodic_spec.rb +14 -21
  63. data/spec/delayed/server_spec.rb +38 -38
  64. data/spec/delayed/settings_spec.rb +26 -25
  65. data/spec/delayed/work_queue/in_process_spec.rb +7 -8
  66. data/spec/delayed/work_queue/parent_process/client_spec.rb +17 -12
  67. data/spec/delayed/work_queue/parent_process/server_spec.rb +117 -41
  68. data/spec/delayed/work_queue/parent_process_spec.rb +21 -23
  69. data/spec/delayed/worker/consul_health_check_spec.rb +37 -50
  70. data/spec/delayed/worker/health_check_spec.rb +60 -52
  71. data/spec/delayed/worker_spec.rb +44 -21
  72. data/spec/sample_jobs.rb +45 -15
  73. data/spec/shared/delayed_batch.rb +74 -67
  74. data/spec/shared/delayed_method.rb +143 -102
  75. data/spec/shared/performable_method.rb +39 -38
  76. data/spec/shared/shared_backend.rb +550 -437
  77. data/spec/shared/testing.rb +14 -14
  78. data/spec/shared/worker.rb +156 -148
  79. data/spec/shared_jobs_specs.rb +13 -13
  80. data/spec/spec_helper.rb +53 -55
  81. metadata +148 -82
  82. data/lib/delayed/backend/redis/bulk_update.lua +0 -50
  83. data/lib/delayed/backend/redis/destroy_job.lua +0 -2
  84. data/lib/delayed/backend/redis/enqueue.lua +0 -29
  85. data/lib/delayed/backend/redis/fail_job.lua +0 -5
  86. data/lib/delayed/backend/redis/find_available.lua +0 -3
  87. data/lib/delayed/backend/redis/functions.rb +0 -59
  88. data/lib/delayed/backend/redis/get_and_lock_next_available.lua +0 -17
  89. data/lib/delayed/backend/redis/includes/jobs_common.lua +0 -203
  90. data/lib/delayed/backend/redis/job.rb +0 -535
  91. data/lib/delayed/backend/redis/set_running.lua +0 -5
  92. data/lib/delayed/backend/redis/tickle_strand.lua +0 -2
  93. data/spec/gemfiles/42.gemfile +0 -7
  94. data/spec/gemfiles/50.gemfile +0 -7
  95. data/spec/gemfiles/51.gemfile +0 -7
  96. data/spec/gemfiles/52.gemfile +0 -7
  97. data/spec/gemfiles/60.gemfile +0 -7
  98. data/spec/redis_job_spec.rb +0 -148
@@ -1,77 +1,75 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'health_check'
4
- require_relative 'process_helper'
5
- require 'socket'
3
+ require_relative "health_check"
4
+ require_relative "process_helper"
5
+ require "socket"
6
6
 
7
7
  module Delayed
8
8
  class Worker
9
9
  class ConsulHealthCheck < HealthCheck
10
10
  self.type_name = :consul
11
11
 
12
- CONSUL_CONFIG_KEYS = %w{url host port ssl token connect_timeout receive_timeout send_timeout}.map(&:freeze).freeze
13
- DEFAULT_SERVICE_NAME = 'inst-jobs_worker'.freeze
14
- attr_reader :agent_client, :catalog_client
12
+ CONSUL_CONFIG_KEYS = %w[url acl_token].map(&:freeze).freeze
13
+ DEFAULT_SERVICE_NAME = "inst-jobs_worker"
14
+ attr_reader :service_client, :health_client
15
15
 
16
16
  def initialize(*, **)
17
17
  super
18
18
  # Because we don't want the consul client to be a hard dependency we're
19
19
  # only requiring it once it's absolutely needed
20
- require 'imperium'
20
+ require "diplomat"
21
21
 
22
22
  if config.keys.any? { |k| CONSUL_CONFIG_KEYS.include?(k) }
23
- consul_config = Imperium::Configuration.new.tap do |conf|
23
+ consul_config = Diplomat::Configuration.new.tap do |conf|
24
24
  CONSUL_CONFIG_KEYS.each do |key|
25
25
  conf.send("#{key}=", config[key]) if config[key]
26
26
  end
27
27
  end
28
- @agent_client = Imperium::Agent.new(consul_config)
29
- @catalog_client = Imperium::Catalog.new(consul_config)
28
+ @service_client = Diplomat::Service.new(configuration: consul_config)
29
+ @health_client = Diplomat::Health.new(configuration: consul_config)
30
30
  else
31
- @agent_client = Imperium::Agent.default_client
32
- @catalog_client = Imperium::Catalog.default_client
31
+ @service_client = Diplomat::Service.new
32
+ @health_client = Diplomat::Health.new
33
33
  end
34
34
  end
35
35
 
36
36
  def start
37
- service = Imperium::Service.new({
38
- id: worker_name,
39
- name: service_name,
40
- })
41
- service.add_check(check_attributes)
42
- response = @agent_client.register_service(service)
43
- response.ok?
37
+ @service_client.register({
38
+ id: worker_name,
39
+ name: service_name,
40
+ check: check_attributes
41
+ })
44
42
  end
45
43
 
46
44
  def stop
47
- response = @agent_client.deregister_service(worker_name)
48
- response.ok? || response.not_found?
45
+ @service_client.deregister(worker_name)
49
46
  end
50
47
 
51
48
  def live_workers
52
- live_nodes = @catalog_client.list_nodes_for_service(service_name)
53
- if live_nodes.ok?
54
- live_nodes.map(&:service_id)
55
- else
56
- raise "Unable to read from Consul catalog: #{live_nodes.content}"
57
- end
49
+ # Filter out critical workers (probably nodes failing their serf health check)
50
+ live_nodes = @health_client.service(service_name, {
51
+ filter: "not Checks.Status == critical"
52
+ })
53
+
54
+ live_nodes.map { |n| n.Service["ID"] }
58
55
  end
59
56
 
60
57
  private
61
58
 
62
59
  def check_attributes
63
60
  {
64
- args: ['bash', '-c', check_script],
65
- status: 'passing',
66
- interval: @config.fetch(:check_interval, '5m'),
67
- deregister_critical_service_after: @config.fetch(:deregister_service_delay, '10m'),
61
+ args: ["bash", "-c", check_script],
62
+ status: "passing",
63
+ interval: @config.fetch(:check_interval, "5m"),
64
+ deregister_critical_service_after: @config.fetch(:deregister_service_delay, "10m")
68
65
  }.tap do |h|
69
- h[:docker_container_id] = docker_container_id if @config['docker']
66
+ h[:docker_container_id] = docker_container_id if @config["docker"]
70
67
  end
71
68
  end
72
69
 
73
70
  def check_script
74
71
  return @check_script if @check_script
72
+
75
73
  mtime = ProcessHelper.mtime(Process.pid)
76
74
  @check_script = ProcessHelper.check_script(Process.pid, mtime)
77
75
  end
@@ -80,12 +78,13 @@ module Delayed
80
78
  # cgroups for part of its magic and also uses the container id as the cgroup name
81
79
  def docker_container_id
82
80
  return @docker_container_id if @docker_container_id
81
+
83
82
  content = File.read("/proc/1/cgroup").split("\n")
84
83
  @docker_container_id = content.last.split("/").last
85
84
  end
86
85
 
87
86
  def service_name
88
- @service_name ||= @config.fetch('service_name', DEFAULT_SERVICE_NAME)
87
+ @service_name ||= @config.fetch("service_name", DEFAULT_SERVICE_NAME)
89
88
  end
90
89
  end
91
90
  end
@@ -11,57 +11,65 @@ module Delayed
11
11
 
12
12
  def inherited(subclass)
13
13
  @subclasses << subclass
14
+ super
14
15
  end
15
16
 
16
17
  def build(type:, worker_name:, config: {})
17
18
  type = type.to_sym
18
19
  klass = @subclasses.find { |sc| sc.type_name == type }
19
20
  raise ArgumentError, "Unable to build a HealthCheck for type #{type}" unless klass
21
+
20
22
  klass.new(worker_name: worker_name, config: config)
21
23
  end
22
24
 
23
25
  def reschedule_abandoned_jobs
24
26
  return if Settings.worker_health_check_type == :none
27
+
25
28
  Delayed::Job.transaction do
26
- # this job is a special case, and is not a singleton
29
+ # this action is a special case, and SHOULD NOT be a periodic job
27
30
  # because if it gets wiped out suddenly during execution
28
- # it can't go clean up it's abandoned self. Therefore,
29
- # we try to get an advisory lock when it runs. If we succeed,
30
- # no other job is trying to do this right now (and if we abandon the
31
- # job, the transaction will end, releasing the advisory lock).
32
- result = attempt_advisory_lock
31
+ # it can't go clean up its abandoned self. Therefore,
32
+ # we expect it to get run from its own process forked from the job pool
33
+ # and we try to get an advisory lock when it runs. If we succeed,
34
+ # no other worker is trying to do this right now (and if we abandon the
35
+ # operation, the transaction will end, releasing the advisory lock).
36
+ result = Delayed::Job.attempt_advisory_lock("Delayed::Worker::HealthCheck#reschedule_abandoned_jobs")
33
37
  return unless result
38
+
39
+ horizon = 5.minutes.ago
40
+
34
41
  checker = Worker::HealthCheck.build(
35
42
  type: Settings.worker_health_check_type,
36
43
  config: Settings.worker_health_check_config,
37
- worker_name: 'cleanup-crew'
44
+ worker_name: "cleanup-crew"
38
45
  )
39
46
  live_workers = checker.live_workers
40
47
 
41
- Delayed::Job.running_jobs.each do |job|
42
- # prefetched jobs have their own way of automatically unlocking themselves
43
- next if job.locked_by.start_with?("prefetch:")
44
- unless live_workers.include?(job.locked_by)
45
- begin
46
- Delayed::Job.transaction do
47
- # double check that the job is still there. locked_by will immediately be reset
48
- # to nil in this transaction by Job#reschedule
49
- next unless Delayed::Job.where(id: job, locked_by: job.locked_by).update_all(locked_by: "abandoned job cleanup") == 1
50
- job.reschedule
51
- end
52
- rescue
53
- ::Rails.logger.error "Failure rescheduling abandoned job #{job.id} #{$!.inspect}"
48
+ loop do
49
+ batch = Delayed::Job.running_jobs
50
+ .where("locked_at<?", horizon)
51
+ .where.not("locked_by LIKE 'prefetch:%'")
52
+ .where.not(locked_by: live_workers)
53
+ .limit(100)
54
+ .to_a
55
+ break if batch.empty?
56
+
57
+ batch.each do |job|
58
+ Delayed::Job.transaction do
59
+ # double check that the job is still there. locked_by will immediately be reset
60
+ # to nil in this transaction by Job#reschedule
61
+ next unless Delayed::Job.where(id: job,
62
+ locked_by: job.locked_by)
63
+ .update_all(locked_by: "abandoned job cleanup") == 1
64
+
65
+ job.reschedule
54
66
  end
55
67
  end
68
+ rescue
69
+ ::Rails.logger.error "Failure rescheduling abandoned job #{job.id} #{$!.inspect}"
56
70
  end
57
71
  end
58
72
  end
59
-
60
- def attempt_advisory_lock
61
- lock_name = "Delayed::Worker::HealthCheck#reschedule_abandoned_jobs"
62
- output = ActiveRecord::Base.connection.execute("SELECT pg_try_advisory_xact_lock(half_md5_as_bigint('#{lock_name}'));")
63
- output.getvalue(0, 0)
64
- end
65
73
  end
66
74
 
67
75
  attr_accessor :config, :worker_name
@@ -13,7 +13,9 @@ module Delayed
13
13
  true
14
14
  end
15
15
 
16
- def live_workers; []; end
16
+ def live_workers
17
+ []
18
+ end
17
19
  end
18
20
  end
19
21
  end
@@ -3,14 +3,13 @@
3
3
  module Delayed
4
4
  class Worker
5
5
  module ProcessHelper
6
-
7
- STAT_LINUX = 'stat --format=%%Y /proc/$WORKER_PID'
8
- STAT_MAC = 'ps -o lstart -p $WORKER_PID'
6
+ STAT_LINUX = "stat --format=%%Y /proc/$WORKER_PID"
7
+ STAT_MAC = "ps -o lstart -p $WORKER_PID"
9
8
  STAT = RUBY_PLATFORM =~ /darwin/ ? STAT_MAC : STAT_LINUX
10
9
  ALIVE_CHECK_LINUX = '[ -d "/proc/$WORKER_PID" ]'
11
- ALIVE_CHECK_MAC = 'ps -p $WORKER_PID > /dev/null'
10
+ ALIVE_CHECK_MAC = "ps -p $WORKER_PID > /dev/null"
12
11
  ALIVE_CHECK = RUBY_PLATFORM =~ /darwin/ ? ALIVE_CHECK_MAC : ALIVE_CHECK_LINUX
13
- SCRIPT_TEMPLATE = <<-BASH.freeze
12
+ SCRIPT_TEMPLATE = <<-BASH
14
13
  WORKER_PID="%<pid>d" # an example, filled from ruby when the check is created
15
14
  ORIGINAL_MTIME="%<mtime>s" # an example, filled from ruby when the check is created
16
15
 
@@ -31,19 +30,19 @@ module Delayed
31
30
 
32
31
  def self.mtime(pid)
33
32
  if RUBY_PLATFORM =~ /darwin/
34
- `ps -o lstart -p #{pid}`.sub(/\n$/, '').presence
33
+ `ps -o lstart -p #{pid}`.sub(/\n$/, "").presence
35
34
  else
36
35
  File::Stat.new("/proc/#{pid}").mtime.to_i.to_s rescue nil
37
36
  end
38
37
  end
39
38
 
40
39
  def self.check_script(pid, mtime)
41
- sprintf(SCRIPT_TEMPLATE, {pid: pid, mtime: mtime})
40
+ format(SCRIPT_TEMPLATE, { pid: pid, mtime: mtime })
42
41
  end
43
42
 
44
43
  def self.process_is_still_running?(pid, mtime)
45
- system(self.check_script(pid, mtime))
44
+ system(check_script(pid, mtime))
46
45
  end
47
46
  end
48
47
  end
49
- end
48
+ end