gouda 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +9 -0
- data/.rubocop.yml +10 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +6 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +17 -0
- data/Rakefile +19 -0
- data/gouda.gemspec +32 -0
- data/lib/active_job/queue_adapters/gouda_adapter.rb +8 -0
- data/lib/generators/gouda/install_generator.rb +29 -0
- data/lib/generators/gouda/templates/install/migrations/create_gouda_tables.rb.erb +7 -0
- data/lib/gouda/active_job_extensions/concurrency.rb +70 -0
- data/lib/gouda/active_job_extensions/interrupts.rb +46 -0
- data/lib/gouda/adapter.rb +183 -0
- data/lib/gouda/bulk.rb +39 -0
- data/lib/gouda/job_fuse.rb +6 -0
- data/lib/gouda/migrations/create_gouda_tables.rb.erb +5 -0
- data/lib/gouda/queue_constraints.rb +73 -0
- data/lib/gouda/railtie.rb +57 -0
- data/lib/gouda/scheduler.rb +108 -0
- data/lib/gouda/version.rb +5 -0
- data/lib/gouda/worker.rb +188 -0
- data/lib/gouda/workload.rb +214 -0
- data/lib/gouda.rb +116 -0
- metadata +186 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gouda
  # Rails integration for Gouda. Registers a `preload` rake task, wires the
  # application executor into the Gouda configuration, mixes the interrupt
  # extension into ActiveJob, exposes the install generator and copies the
  # settings found in `Rails.application.config.gouda` into `Gouda.config`.
  class Railtie < Rails::Railtie
    # Settings which are copied one-to-one from `Rails.application.config.gouda`
    # into `Gouda.config` when they are present there.
    RAILS_CONFIG_KEYS = %i[
      cleanup_preserved_jobs_before
      preserve_job_records
      polling_sleep_interval_seconds
      worker_thread_count
    ].freeze

    rake_tasks do
      # Eager-loads the application (only when `config.eager_load` is enabled).
      task preload: :setup do
        if defined?(Rails) && Rails.respond_to?(:application) && Rails.application.config.eager_load
          ActiveSupport.run_load_hooks(:before_eager_load, Rails.application)
          Rails.application.config.eager_load_namespaces.each(&:eager_load!)
        end
      end
    end

    initializer "gouda.configure_rails_initialization" do
      Gouda.config.app_executor = if defined?(Rails) && Rails.respond_to?(:application)
        Rails.application.executor
      else
        ActiveSupport::Executor
      end
    end

    initializer "gouda.active_job.extensions" do
      ActiveSupport.on_load :active_job do
        include Gouda::ActiveJobExtensions::Interrupts
      end
    end

    generators do
      require "generators/gouda/install_generator"
    end

    # The `to_prepare` block which is executed once in production
    # and before each request in development.
    config.to_prepare do
      Gouda::Scheduler.update_schedule_from_config!

      if defined?(Rails) && Rails.respond_to?(:application)
        config_from_rails = Rails.application.config.try(:gouda)
        if config_from_rails
          # Only copy settings which were actually provided. Assigning a missing
          # key would replace the Gouda default with nil (for example a nil
          # worker thread count or a nil polling interval).
          RAILS_CONFIG_KEYS.each do |key|
            value = config_from_rails[key]
            Gouda.config.public_send(:"#{key}=", value) unless value.nil?
          end

          if Gouda.config.logger
            Gouda.config.logger.level = config_from_rails[:log_level] || Gouda.config.log_level
          end
        end
      else
        Gouda.config.preserve_job_records = false
        Gouda.config.polling_sleep_interval_seconds = 0.2
        Gouda.config.logger.level = Gouda.config.log_level
      end
    end
  end
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# The scheduler handles jobs which run either on a cron schedule or at arbitrary time intervals
|
4
|
+
|
5
|
+
require "fugit"
|
6
|
+
module Gouda::Scheduler
|
7
|
+
# A schedule entry is triggered either by a cron pattern or by an interval in seconds,
# and describes which job class has to be enqueued (and with which arguments) when it fires.
# `args`, `kwargs` and `set` may be given as plain values or as callables which produce them.
class Entry < Struct.new(:name, :cron, :interval_seconds, :job_class, :kwargs, :args, :set, keyword_init: true)
  # The key which ties workloads to this entry. Built from the non-nil values
  # of name, interval, cron pattern and job class, joined with underscores.
  def scheduler_key
    key_parts = [name, interval_seconds, cron, job_class]
    key_parts.compact.join("_")
  end

  # Computes the time of the next run.
  #
  # @return [Time, nil] the next run time in UTC, or nil when the entry has neither an interval nor a cron pattern
  # @raise [ArgumentError] when the cron pattern cannot be parsed
  def next_at
    if interval_seconds
      # Continue from the furthest future run already scheduled for this key (if any), otherwise from now
      latest_pending = Gouda::Workload.where(scheduler_key: scheduler_key).where("scheduled_at > NOW()").order("scheduled_at DESC").pluck(:scheduled_at).first
      return (latest_pending || Time.now.utc) + interval_seconds
    end

    return unless cron

    parsed_cron = Fugit::Cron.parse(cron)
    raise ArgumentError, "Unable to parse cron pattern #{cron.inspect}" unless parsed_cron

    Time.at(parsed_cron.next_time.to_i).utc
  end

  # Instantiates the job for the next run, with `scheduled_at` and `scheduler_key` assigned.
  #
  # @return [Object, nil] the job instance (or the result of calling `set` on it when `set` options are present), nil when there is no next run
  def build_active_job
    run_at = next_at
    return unless run_at

    klass = job_class.constantize

    # Keyword arguments are only splatted when present (this supports ruby2_keywords)
    active_job = if kwargs_value.present?
      klass.new(*args_value, **kwargs_value)
    else
      klass.new(*args_value)
    end
    active_job.scheduled_at = run_at
    active_job.scheduler_key = scheduler_key

    if set_value.present?
      active_job.set(set_value)
    else
      active_job
    end
  end

  private

  # Options for `set`, defaulting to an empty Hash
  def set_value
    resolve_callable(set || {})
  end

  # Positional job arguments, defaulting to an empty Array
  def args_value
    resolve_callable(args || [])
  end

  # Keyword job arguments, nil when none were configured
  def kwargs_value
    resolve_callable(kwargs || nil)
  end

  # Calls the value when it is callable, returns it unchanged otherwise
  def resolve_callable(value)
    return value.call if value.respond_to?(:call)

    value
  end
end
|
55
|
+
|
56
|
+
# Rebuilds the in-memory schedule table (`@cron_table`) from a Hash of
# `name => entry params`.
#
# When no Hash is given (or it is blank) the table is taken from
# `Rails.application.config.gouda` (its `:cron` key, only if `:enable_cron` is set there)
# or, when Rails has no Gouda config, from `Gouda.config.cron` (only if `Gouda.config.enable_cron`).
# If no table can be determined the method returns nil and leaves the current schedule untouched.
#
# The given hashes are never modified: entry params are copied before the `:class`
# key gets renamed to `:job_class`, so frozen or shared configuration can be passed safely.
#
# @param cron_table_hash [Hash, nil] schedule definition, e.g. `{tick: {class: "TickJob", interval_seconds: 5}}`
# @return [Array<Entry>, nil] the new schedule entries
def self.update_schedule_from_config!(cron_table_hash = nil)
  Gouda.logger.info "Updating scheduled workload entries..."
  if cron_table_hash.blank?
    config_from_rails = Rails.application.config.try(:gouda)

    cron_table_hash = if config_from_rails.present?
      config_from_rails.dig(:cron).to_h if config_from_rails.dig(:enable_cron)
    elsif Gouda.config.enable_cron
      Gouda.config.cron
    end

    return unless cron_table_hash
  end

  defaults = {cron: nil, interval_seconds: nil, kwargs: nil, args: nil}
  @cron_table = cron_table_hash.map do |(name, cron_entry_params)|
    # Work on a copy so that the configuration we were given stays as it was
    entry_params = cron_entry_params.dup
    # `class` is a reserved keyword and a method that exists on every Ruby object so...
    entry_params[:job_class] ||= entry_params.delete(:class)
    Entry.new(name:, **defaults.merge(entry_params))
  end
end
|
78
|
+
|
79
|
+
# Enqueues the next run for the schedule entry a finished workload belongs to.
# Does nothing when the workload has no scheduler key, when no entry in the current
# schedule matches that key, or when the entry does not produce a next job
# (`build_active_job` returns nil if the entry has neither an interval nor a cron pattern).
#
# @param finished_workload [#scheduler_key] the workload which has just finished
# @return [Object, nil] whatever `Gouda.enqueue_jobs_via_their_adapters` returns, nil if nothing was enqueued
def self.enqueue_next_scheduled_workload_for(finished_workload)
  key = finished_workload.scheduler_key
  return unless key

  # If several entries share a key the last one wins
  timer_entry = @cron_table.to_a.reverse_each.find { |entry| entry.scheduler_key == key }
  return unless timer_entry

  next_job = timer_entry.build_active_job
  return unless next_job

  Gouda.enqueue_jobs_via_their_adapters([next_job])
end
|
88
|
+
|
89
|
+
# The schedule entries built by the last schedule update,
# or an empty Array when no schedule has been loaded yet.
#
# @return [Array<Entry>]
def self.entries
  return [] unless @cron_table

  @cron_table
end
|
92
|
+
|
93
|
+
# Synchronizes the workloads table with the current schedule: deletes scheduler-created
# workloads whose key no longer matches any schedule entry, then enqueues the next run
# of every entry. Both steps run inside one transaction.
#
# @return [Object] the result of the last logger call
def self.update_scheduled_workloads!
  table_entries = @cron_table || []

  # Remove any cron keyed workloads which no longer match config-wise
  known_keys = table_entries.map(&:scheduler_key).uniq
  Gouda::Workload.transaction do
    # Restrict the cleanup to workloads which were created by the scheduler (scheduler_key present).
    # Without that restriction an empty schedule turns `where.not(scheduler_key: [])` into an
    # always-true condition, which would delete every workload - ordinary jobs included.
    Gouda::Workload.where.not(scheduler_key: nil).where.not(scheduler_key: known_keys).delete_all

    # Insert the next iteration for every "next" entry in the crontab.
    active_jobs_to_enqueue = table_entries.filter_map(&:build_active_job)
    Gouda.logger.info "#{active_jobs_to_enqueue.size} job(s) to enqueue from the scheduler."
    enqjobs = Gouda.enqueue_jobs_via_their_adapters(active_jobs_to_enqueue)
    Gouda.logger.info "#{enqjobs.size} scheduled job(s) enqueued."
  end
end
|
108
|
+
end
|
data/lib/gouda/worker.rb
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "securerandom"
|
4
|
+
require "gouda/version"
|
5
|
+
|
6
|
+
module Gouda
|
7
|
+
POLL_INTERVAL_DURATION_SECONDS = 1
|
8
|
+
|
9
|
+
# A Set guarded by a Mutex. It keeps the IDs of the jobs currently executing on this
# worker so that their heartbeat timestamps can be updated during execution. Only three
# operations are needed - add to the set, remove from the set, and convert the set
# into an Array for a SQL query with `WHERE id IN`.
class ThreadSafeSet
  def initialize
    @mutex = Mutex.new
    @set = Set.new
  end

  # Adds the value to the set.
  # @return the value which was passed in
  def add(value)
    value.tap { |member| @mutex.synchronize { @set.add(member) } }
  end

  # Removes the value from the set (a no-op when it is not a member).
  # @return the value which was passed in
  def delete(value)
    value.tap { |member| @mutex.synchronize { @set.delete(member) } }
  end

  # @return [Array] a snapshot of the current members
  def to_a
    @mutex.synchronize do
      @set.to_a
    end
  end
end
|
33
|
+
|
34
|
+
# A shutdown check which starts returning `true` once the given number of seconds
# has elapsed since it was created (measured on the monotonic clock).
# This is useful to terminate a worker after a certain amount of time.
class TimerShutdownCheck
  # @param seconds_float [Numeric] how long to wait before the check turns true
  def initialize(seconds_float)
    @st = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    @dt = seconds_float
  end

  # @return [Boolean] true when more than the configured time has passed
  def call
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @st
    elapsed > @dt
  end
end
|
46
|
+
|
47
|
+
# Captures UNIX signals (TERM and INT) and returns `true` once one of them has arrived.
# Initializing this check installs the signal handlers, meaning that the worker will not
# raise `Interrupt` from any threads but will get the space it needs to terminate cleanly.
# At least for SIGINT and SIGTERM this is very desirable. This is the default shutdown check.
class TrapShutdownCheck
  def initialize
    @did_trap = false
    @did_log = false
    # The handlers only record which signal arrived; the reaction happens in `call`
    %i[TERM INT].each do |signal_name|
      Signal.trap(signal_name) { @did_trap = signal_name }
    end
  end

  # @return [Boolean] true once a signal was received. The first such call also prints a warning.
  def call
    return false unless @did_trap

    unless @did_log
      warn("Gouda worker signaled to terminate via SIG#{@did_trap}")
      @did_log = true
    end
    true
  end
end
|
75
|
+
|
76
|
+
# A shutdown check which returns `true` once there are no workloads waiting
# to start for the given queue constraint. This can be used to run a worker just
# as long as there are jobs to handle and then to let it quit by itself
# (handy for spot instances and the like).
class EmptyQueueShutdownCheck
  # @param queue_constraint the constraint passed on to `Gouda::Workload.waiting_to_start`
  def initialize(queue_constraint = Gouda::AnyQueue)
    @queue_constraint = queue_constraint
  end

  # @return [Boolean] true when nothing is waiting to start; false when there is work,
  #   or when the lookup raises (if the DB connection cannot be checked out etc.)
  def call
    begin
      Gouda.config.app_executor.wrap do
        waiting = Gouda::Workload.waiting_to_start(queue_constraint: @queue_constraint)
        waiting.none?
      end
    rescue
      false
    end
  end
end
|
95
|
+
|
96
|
+
# A wrapping callable which returns `true` if any of the given callables returns true.
# This can be used to combine a timed shutdown ("in 30 seconds")
# with a signal handler shutdown ("shutdown on SIGTERM/SIGINT").
class CombinedShutdownCheck
  # @param callables_for_condition[#call] other shutdown checks
  def initialize(*callables_for_condition)
    @conditions = callables_for_condition
  end

  # @return [Boolean] true once any wrapped check has returned a truthy value
  def call
    # Once one shutdown check told us to shut down there is no point to query all the others
    return @memo if @memo

    @memo = @conditions.any? { |condition| condition.call }
  end
end
|
110
|
+
|
111
|
+
# Start looping, taking work from the queue and performing it, over multiple worker threads.
# Once the `check_shutdown` callable returns `true` the threads will cleanly terminate and the method will return (so it is blocking).
#
# @param n_threads[Integer] how many _worker_ threads to start. Housekeeping (heartbeats, zombie reaping) runs on the calling thread, so ideally this should be the size of your connection pool minus 1
# @param check_shutdown[#call, Array<#call>] A callable object (can be a Proc etc.), or an Array of callables which get combined. Once it starts returning `true` the worker threads and the housekeeping loop will cleanly exit
# @param queue_constraint the queue constraint passed on to `Gouda::Workload.checkout_and_perform_one`
# @raise [ArgumentError] when fewer than 1 worker thread is requested
def self.worker_loop(n_threads:, check_shutdown: TrapShutdownCheck.new, queue_constraint: Gouda::AnyQueue)
  # Validate first, before any other setup is performed
  raise ArgumentError, "You need at least 1 worker thread, but you requested #{n_threads}" if n_threads < 1

  # We need quite a few things when starting the loop - we have to be far enough into the Rails bootup sequence
  # that both the application and the executor are available
  #
  # raise "Rails is not loaded yet" unless defined?(Rails) && Rails.respond_to?(:application)
  # raise "Rails application is not loaded yet" unless Rails.application
  # raise "Rails executor not available yet" unless Rails.application.executor

  check_shutdown = CombinedShutdownCheck.new(*check_shutdown) if !check_shutdown.respond_to?(:call) && check_shutdown.is_a?(Array)

  worker_id = [Socket.gethostname, Process.pid, SecureRandom.uuid].join("-")

  executing_workload_ids = ThreadSafeSet.new

  # How long a worker thread eases off when there was nothing to do (or when the checkout failed)
  idle_sleep_seconds = POLL_INTERVAL_DURATION_SECONDS + (POLL_INTERVAL_DURATION_SECONDS * 0.25)

  worker_threads = n_threads.times.map do
    Thread.new do
      worker_id_and_thread_id = [worker_id, "t0x#{Thread.current.object_id.to_s(16)}"].join("-")
      loop do
        break if check_shutdown.call

        did_process = Gouda.config.app_executor.wrap do
          Gouda::Workload.checkout_and_perform_one(executing_on: worker_id_and_thread_id, queue_constraint:, in_progress: executing_workload_ids)
        end

        # If no job was retrieved the queue is likely empty. Relax the polling then and ease off.
        # If a job was retrieved it is likely that a burst has just been enqueued, and we do not
        # sleep but proceed to attempt to retrieve the next job right after.
        sleep_with_interruptions(idle_sleep_seconds, check_shutdown) unless did_process
      rescue => e
        warn "Uncaught exception during perform (#{e.class} - #{e})"
        # Ease off before the next attempt. Without a pause a persistent failure (a database
        # which is down, for instance) makes this loop spin at full speed and flood the output.
        # A plain sleep is used because it cannot raise again from inside this handler.
        sleep(idle_sleep_seconds)
      end
    end
  end

  # Do the housekeeping tasks on main
  loop do
    break if check_shutdown.call

    Gouda.config.app_executor.wrap do
      # Mark known executing jobs as such. If a worker process is killed or the machine it is running on dies,
      # a stale timestamp can indicate to us that the job was orphaned and is marked as "executing"
      # even though the worker it was running on has failed for whatever reason.
      # Later on we can figure out what to do with those jobs (re-enqueue them or toss them)
      Gouda::Workload.where(id: executing_workload_ids.to_a, state: "executing").update_all(executing_on: worker_id, last_execution_heartbeat_at: Time.now.utc)

      # Find jobs which just hung and clean them up (mark them as "finished" and enqueue replacement workloads if possible)
      Gouda::Workload.reap_zombie_workloads
    rescue => e
      # Appsignal.add_exception(e)
      warn "Uncaught exception during housekeeping (#{e.class} - #{e})"
    end

    # Jitter the sleep so that the workers booted at the same time do not all dogpile
    randomized_sleep_duration_s = POLL_INTERVAL_DURATION_SECONDS + (POLL_INTERVAL_DURATION_SECONDS.to_f * rand)
    sleep_with_interruptions(randomized_sleep_duration_s, check_shutdown)
  end
ensure
  # Wait for the worker threads (if any were started) before returning
  worker_threads&.map(&:join)
end
|
177
|
+
|
178
|
+
# Sleeps for up to `n_seconds`, in short naps of `Gouda.config.polling_sleep_interval_seconds`,
# and returns early as soon as `must_abort_proc` returns a truthy value. The abort condition
# is checked before every nap, and elapsed time is measured on the monotonic clock.
#
# @param n_seconds [Numeric] the total time to sleep
# @param must_abort_proc [#call] returns true when sleeping must stop
# @return [nil]
def self.sleep_with_interruptions(n_seconds, must_abort_proc)
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  nap_seconds = Gouda.config.polling_sleep_interval_seconds
  until must_abort_proc.call || (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) >= n_seconds
    sleep(nap_seconds)
  end
end
|
188
|
+
end
|
@@ -0,0 +1,214 @@
|
|
1
|
+
# # frozen_string_literal: true
|
2
|
+
|
3
|
+
# This model is called "workload" for a reason. The ActiveJob can be enqueued multiple times with
# the same job ID which gets generated by Rails. These multiple enqueues of the same job are not
# exactly copies of one another. When you use job-iteration for example, your job will be retried with a different
# cursor position value. When you use ActiveJob `rescue_from` as well - the job will be retried and keep the same
# active job ID, but it then gets returned into the queue "in some way". What we want is that the records in our
# table represent a unit of work that the worker has to execute "at some point". If the same job gets enqueued multiple
# times due to retries or pause/resume we want the enqueues to be separate workloads, which can fail or succeed
# independently. This also allows the queue records to be "append-only" which allows the records to be pruned
# on a regular basis. This is why they are called "workloads" and not "jobs". "Executions" is a great term used
# by good_job but it seems that it is not clear what has the "identity". With the Workload the ID of the workload
# is the "provider ID" for ActiveJob. It is therefore possible (and likely) that multiple Workloads will exist
# sharing the same ActiveJob ID.
class Gouda::Workload < ActiveRecord::Base
  # A workload in the "executing" state whose heartbeat is older than this (a Postgres interval)
  # is considered abandoned and gets reaped
  ZOMBIE_MAX_THRESHOLD = "5 minutes"

  self.table_name = "gouda_workloads"
  # GoodJob calls these "enqueued" but they are more like
  # "waiting to start" - jobs which have been scheduled past now,
  # or haven't been scheduled to a particular time, are in the "enqueued"
  # state and match the queue constraint
  scope :waiting_to_start, ->(queue_constraint: Gouda::AnyQueue) {
    # A plain `NOT IN` evaluates to NULL for rows without a concurrency key as soon as the
    # sub-select returns anything, which would hide all of those rows. Workloads without
    # a concurrency key are never limited by concurrency, so they are let through explicitly.
    condition_for_ready_to_execute_jobs = <<~SQL
      #{queue_constraint.to_sql}
      AND (
        execution_concurrency_key IS NULL
        OR execution_concurrency_key NOT IN (
          SELECT execution_concurrency_key FROM #{quoted_table_name} WHERE state = 'executing' AND execution_concurrency_key IS NOT NULL
        )
      )
      AND state = 'enqueued'
      AND (scheduled_at <= clock_timestamp())
    SQL

    where(Arel.sql(condition_for_ready_to_execute_jobs))
  }

  scope :errored, -> { where("error != '{}'") }
  scope :retried, -> { where("(serialized_params -> 'exception_executions') != '{}' AND state != 'finished'") }
  scope :finished, -> { where(state: "finished") }
  scope :enqueued, -> { where(state: "enqueued") }
  scope :executing, -> { where(state: "executing") }

  # @return [Array<String>] the distinct queue names present in the table, sorted
  def self.queue_names
    connection.select_values("SELECT DISTINCT(queue_name) FROM #{quoted_table_name} ORDER BY queue_name ASC")
  end

  # Deletes finished workloads. When `Gouda.config.preserve_job_records` is enabled only
  # workloads which finished longer ago than `Gouda.config.cleanup_preserved_jobs_before` are deleted.
  #
  # @return [Integer] the number of deleted rows
  def self.prune
    if Gouda.config.preserve_job_records
      # The retention period is a configuration setting, so it is read from `Gouda.config`
      where(state: "finished").where("execution_finished_at < ?", Gouda.config.cleanup_preserved_jobs_before.ago).delete_all
    else
      where(state: "finished").delete_all
    end
  end

  # Re-enqueue zombie workloads which have been left to rot due to machine kills, worker OOM kills and the like
  # With a lock so no single zombie job gets enqueued more than once
  # And wrapped in transactions with the possibility to roll back a single workload without it rollbacking the entire batch
  def self.reap_zombie_workloads
    uncached do # again needed due to the use of clock_timestamp() in the SQL
      transaction do
        zombie_workloads_scope = Gouda::Workload.lock("FOR UPDATE SKIP LOCKED").where("state = 'executing' AND last_execution_heartbeat_at < (clock_timestamp() - interval '#{ZOMBIE_MAX_THRESHOLD}')")
        zombie_workloads_scope.find_each(batch_size: 1000) do |workload|
          # with_lock will start its own transaction
          workload.with_lock("FOR UPDATE SKIP LOCKED") do
            Gouda.logger.info { "Reviving (re-enqueueing) Gouda workload #{workload.id} after interruption" }

            # Appsignal.increment_counter("gouda_workloads_revived", 1, job_class: workload.active_job_class_name)

            interrupted_at = workload.last_execution_heartbeat_at
            workload.update!(state: "finished", interrupted_at:, last_execution_heartbeat_at: Time.now.utc, execution_finished_at: Time.now.utc)
            revived_job = ActiveJob::Base.deserialize(workload.active_job_data)
            # Save the interrupted_at timestamp so that upon execution the new job can raise an interrupt error
            # (presumably Gouda::InterruptError, via the Interrupts job extension - confirm there).
            # The exception can then be handled like any other ActiveJob exception (using rescue_from or similar).
            revived_job.interrupted_at = interrupted_at
            revived_job.enqueue
          end
        rescue ActiveRecord::RecordNotFound
          # This will happen if we have selected the zombie workload in the outer block, but
          # by the point we reload it and take a FOR UPDATE SKIP LOCKED lock another worker is
          # already reaping it - a call to `reload` will cause a RecordNotFound, since Postgres
          # will hide the row from us. This is what we want in fact - we want to progress to
          # the next row. So we allow the code to proceed, as we expect that the other worker
          # (which stole the workload from us) will have set it to "state=finished" by the time we reattempt
          # our SELECT with conditions
          Gouda.logger.debug { "Gouda workload #{workload.id} cannot be reaped as it was hijacked by another worker" }
        end
      end
    end
  end

  # Lock the next workload and mark it as executing
  #
  # @param executing_on [String] identifier of the worker (thread) which takes the workload
  # @param queue_constraint the constraint whose `to_sql` limits which queues are considered
  # @return [Gouda::Workload, nil] the workload which was checked out, nil when none is available
  def self.checkout_and_lock_one(executing_on:, queue_constraint: Gouda::AnyQueue)
    where_query = <<~SQL
      #{queue_constraint.to_sql}
      AND workloads.state = 'enqueued'
      AND NOT EXISTS (
        SELECT NULL
        FROM #{quoted_table_name} AS concurrent
        WHERE concurrent.state = 'executing'
          AND concurrent.execution_concurrency_key = workloads.execution_concurrency_key
      )
      AND workloads.scheduled_at <= clock_timestamp()
    SQL
    # Enter a txn just to mark this job as being executed "by us". This allows us to avoid any
    # locks during execution itself, including advisory locks
    jobs = Gouda::Workload
      .select("workloads.*")
      .from("#{quoted_table_name} AS workloads")
      .where(where_query)
      .order("workloads.priority ASC NULLS LAST")
      .lock("FOR UPDATE SKIP LOCKED")
      .limit(1)

    _first_available_workload = ActiveSupport::Notifications.instrument("checkout_and_lock_one.gouda", {queue_constraint: queue_constraint.to_sql}) do |payload|
      payload[:condition_sql] = jobs.to_sql
      payload[:retried_checkouts_due_to_concurrent_exec] = 0
      uncached do # Necessary because we SELECT with a clock_timestamp() which otherwise gets cached by ActiveRecord query cache
        transaction do
          jobs.first.tap do |job|
            job&.update!(state: "executing", executing_on:, last_execution_heartbeat_at: Time.now.utc, execution_started_at: Time.now.utc)
          end
        rescue ActiveRecord::RecordNotUnique
          # It can happen that due to a race the concurrency condition of the SELECT does not capture
          # a job which _just_ entered the "executing" state, apparently after we do our SELECT. This will happen regardless
          # whether we are using a CTE or a sub-SELECT
          payload[:retried_checkouts_due_to_concurrent_exec] += 1
          nil
        end
      end
    end
  end

  # Get a new workload and call perform
  # @param executing_on [String] identifier of the worker (thread) performing the workload
  # @param queue_constraint the constraint passed on to `checkout_and_lock_one`
  # @param in_progress[#add,#delete] Used for tracking work in progress for heartbeats
  # @return the result of `perform_and_update_state!`, nil when no workload was available
  def self.checkout_and_perform_one(executing_on:, queue_constraint: Gouda::AnyQueue, in_progress: Set.new)
    # Select a job and mark it as "executing" which will make it unavailable to any other checkout
    workload = checkout_and_lock_one(executing_on:, queue_constraint:)
    if workload
      in_progress.add(workload.id)
      workload.perform_and_update_state!
    end
  ensure
    in_progress.delete(workload.id) if workload
  end

  # @return [Time, nil] the enqueue time recorded in the serialized job params, nil when absent
  def enqueued_at
    Time.parse(serialized_params["enqueued_at"]) if serialized_params["enqueued_at"]
  end

  # Executes the job of this workload (unless a fuse exists for its job class), records
  # errors on the workload, marks it as finished and enqueues the next scheduled run if applicable.
  #
  # @return the job result, or the exception when the job failed without being retried
  def perform_and_update_state!
    ActiveSupport::Notifications.instrument("perform_job.gouda", {workload: self}) do |instrument_payload|
      extras = {}
      if Gouda::JobFuse.exists?(active_job_class_name: active_job_class_name)
        extras[:error] = {class_name: "WorkloadSkippedError", message: "Skipped because of a fuse at #{Time.now.utc}"}
      else
        job_result = ActiveJob::Base.execute(active_job_data)

        if job_result.is_a?(Exception)
          # When an exception is handled, let's say we have a retry_on <exception> in our job, we end up here
          # and it won't be rescued
          handled_error = job_result
          update!(error: error_hash(handled_error))
        end

        instrument_payload[:value] = job_result
        instrument_payload[:handled_error] = handled_error

        job_result
      end
    rescue => exception_not_retried_by_active_job
      # When a job fails and is not retryable it will end up here.
      update!(error: error_hash(exception_not_retried_by_active_job))
      instrument_payload[:unhandled_error] = exception_not_retried_by_active_job
      Gouda.logger.error { exception_not_retried_by_active_job }
      exception_not_retried_by_active_job # Return the exception instead of re-raising it
    ensure
      update!(state: "finished", last_execution_heartbeat_at: Time.now.utc, execution_finished_at: Time.now.utc, **extras)
      # If the workload that just finished was a scheduled workload (via timer/cron) enqueue the next execution.
      # Otherwise the next job will only get enqueued once the config is reloaded
      Gouda::Scheduler.enqueue_next_scheduled_workload_for(self)
    end
  end

  # Makes an enqueued workload due immediately. Workloads in any other state are left alone.
  def schedule_now!
    with_lock do
      # `next` leaves the block normally, so the surrounding transaction is not exited via `return`
      next if state != "enqueued"

      update!(scheduled_at: Time.now.utc)
    end
  end

  # Marks the workload as finished with a "RemovedError" and enqueues the next scheduled
  # run if the workload came from the scheduler. Does nothing if it is already finished.
  def mark_finished!
    with_lock do
      next if state == "finished"

      now = Time.now.utc
      update!(
        state: "finished", last_execution_heartbeat_at: now,
        # Keep the recorded start time and only fall back to "now" when the workload never started.
        # (Writing `execution_started_at ||= now` would create a local variable instead of reading
        # the attribute, and the real start time would always get overwritten.)
        execution_finished_at: now, execution_started_at: execution_started_at || now,
        error: {class_name: "RemovedError", message: "Manually removed at #{now}"}
      )
      Gouda::Scheduler.enqueue_next_scheduled_workload_for(self)
    end
  end

  # @param error [Exception]
  # @return [Hash] the class name, backtrace and message of the error, as stored in the `error` column
  def error_hash(error)
    {class_name: error.class.to_s, backtrace: error.backtrace.to_a, message: error.message}
  end

  # @return [Hash] a copy of the serialized job params with the workload ID (as provider job ID),
  #   the interruption timestamp and the scheduler key merged in
  def active_job_data
    serialized_params.deep_dup.merge("provider_job_id" => id, "interrupted_at" => interrupted_at, "scheduler_key" => scheduler_key) # TODO: is this memory-economical?
  end
end
|
data/lib/gouda.rb
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support"
|
4
|
+
require "active_support/core_ext/numeric/time"
|
5
|
+
require "active_support/configurable"
|
6
|
+
require "rails/railtie"
|
7
|
+
require_relative "gouda/bulk"
|
8
|
+
require_relative "gouda/adapter"
|
9
|
+
require_relative "gouda/scheduler"
|
10
|
+
require_relative "gouda/railtie" if defined?(Rails::Railtie)
|
11
|
+
require_relative "gouda/workload"
|
12
|
+
require_relative "gouda/worker"
|
13
|
+
require_relative "gouda/job_fuse"
|
14
|
+
require_relative "gouda/queue_constraints"
|
15
|
+
require_relative "gouda/active_job_extensions/interrupts"
|
16
|
+
require_relative "gouda/active_job_extensions/concurrency"
|
17
|
+
require_relative "active_job/queue_adapters/gouda_adapter"
|
18
|
+
|
19
|
+
module Gouda
  # All Gouda settings with their defaults. An instance is available via `Gouda.config`.
  class Configuration
    include ActiveSupport::Configurable

    config_accessor(:preserve_job_records, default: false)
    config_accessor(:cleanup_preserved_jobs_before, default: 3.hours)
    config_accessor(:polling_sleep_interval_seconds, default: 0.2)
    config_accessor(:worker_thread_count, default: 1)
    config_accessor(:logger, default: ActiveSupport::Logger.new($stdout))
    config_accessor(:app_executor)
    config_accessor(:cron, default: {})
    config_accessor(:enable_cron, default: true)
    # Log levels are:
    # constant | level
    # Logger::DEBUG (0)
    # Logger::INFO (1)
    # Logger::WARN (2)
    # Logger::ERROR (3)
    # Logger::FATAL (4)
    # Logger::UNKNOWN (5)
    config_accessor(:log_level, default: Logger::DEBUG)
  end

  class InterruptError < StandardError; end

  class ConcurrencyExceededError < StandardError; end

  # Refreshes the scheduled workloads and then runs the worker loop (which blocks).
  # The queues to work on are taken from the GOUDA_QUEUES environment variable
  # when it is set, otherwise any queue is served.
  def self.start
    Gouda::Scheduler.update_scheduled_workloads!

    queue_constraint = ENV["GOUDA_QUEUES"] ? Gouda.parse_queue_constraint(ENV["GOUDA_QUEUES"]) : Gouda::AnyQueue

    Gouda.logger.info("Gouda version: #{Gouda::VERSION}")
    Gouda.logger.info("Worker threads: #{Gouda.config.worker_thread_count}")

    Gouda.worker_loop(n_threads: Gouda.config.worker_thread_count, queue_constraint:)
  end

  # @return [Gouda::Configuration] the configuration, created on first use
  def self.config
    return @config if @config

    @config = Configuration.new
  end

  # Yields the configuration so that it can be changed in a block
  def self.configure
    yield config
  end

  # @return the logger set in the configuration
  def self.logger
    Gouda.config.logger
  end

  # Creates the tables, the state enum and the indexes which Gouda needs.
  #
  # @param active_record_schema the migration/schema object to issue the statements on
  def self.create_tables(active_record_schema)
    schema = active_record_schema

    schema.create_enum :gouda_workload_state, %w[enqueued executing finished]
    schema.create_table :gouda_workloads, id: :uuid do |t|
      t.uuid :active_job_id, null: false
      t.timestamp :scheduled_at, null: false
      t.timestamp :execution_started_at
      t.timestamp :execution_finished_at
      t.timestamp :last_execution_heartbeat_at
      t.timestamp :interrupted_at, null: true

      t.string :scheduler_key, null: true
      t.string :queue_name, null: false, default: "default"
      t.integer :priority
      t.string :active_job_class_name, null: false
      t.jsonb :serialized_params
      t.jsonb :error, default: {}, null: false
      t.enum :state, enum_type: :gouda_workload_state, default: "enqueued", null: false
      t.string :execution_concurrency_key
      t.string :enqueue_concurrency_key
      t.string :executing_on
      t.integer :position_in_bulk

      t.timestamps
    end

    # Partial indexes used when checking out work and when looking at heartbeats
    schema.add_index :gouda_workloads, [:priority, :id, :scheduled_at], where: "state = 'enqueued'", name: :gouda_checkout_all_index
    schema.add_index :gouda_workloads, [:id, :last_execution_heartbeat_at], where: "state = 'executing'", name: :gouda_last_heartbeat_index

    # Unique partial indexes: one enqueued workload per enqueue concurrency key and per
    # scheduler key, one executing workload per execution concurrency key
    schema.add_index :gouda_workloads, [:enqueue_concurrency_key], where: "state = 'enqueued' AND enqueue_concurrency_key IS NOT NULL", unique: true, name: :guard_double_enqueue
    schema.add_index :gouda_workloads, [:scheduler_key], where: "state = 'enqueued' AND scheduler_key IS NOT NULL", unique: true, name: :guard_double_schedule
    schema.add_index :gouda_workloads, [:execution_concurrency_key], where: "state = 'executing' AND execution_concurrency_key IS NOT NULL", unique: true, name: :guard_double_exec

    # Plain lookup indexes
    schema.add_index :gouda_workloads, [:active_job_id], name: :same_job_display_idx
    schema.add_index :gouda_workloads, [:priority], order: {priority: "ASC NULLS LAST"}, name: :ordered_priority_idx
    schema.add_index :gouda_workloads, [:last_execution_heartbeat_at], name: :index_gouda_workloads_on_last_execution_heartbeat_at
    schema.add_index :gouda_workloads, [:scheduler_key], name: :index_gouda_workloads_on_scheduler_key

    schema.create_table :gouda_job_fuses, id: false do |t|
      t.string :active_job_class_name, null: false

      t.timestamps
    end
  end
end
|