gouda 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +9 -0
- data/.rubocop.yml +10 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +6 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +17 -0
- data/Rakefile +19 -0
- data/gouda.gemspec +32 -0
- data/lib/active_job/queue_adapters/gouda_adapter.rb +8 -0
- data/lib/generators/gouda/install_generator.rb +29 -0
- data/lib/generators/gouda/templates/install/migrations/create_gouda_tables.rb.erb +7 -0
- data/lib/gouda/active_job_extensions/concurrency.rb +70 -0
- data/lib/gouda/active_job_extensions/interrupts.rb +46 -0
- data/lib/gouda/adapter.rb +183 -0
- data/lib/gouda/bulk.rb +39 -0
- data/lib/gouda/job_fuse.rb +6 -0
- data/lib/gouda/migrations/create_gouda_tables.rb.erb +5 -0
- data/lib/gouda/queue_constraints.rb +73 -0
- data/lib/gouda/railtie.rb +57 -0
- data/lib/gouda/scheduler.rb +108 -0
- data/lib/gouda/version.rb +5 -0
- data/lib/gouda/worker.rb +188 -0
- data/lib/gouda/workload.rb +214 -0
- data/lib/gouda.rb +116 -0
- metadata +186 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Gouda
|
4
|
+
# Rails integration for Gouda: registers a `preload` rake task, stores the application executor
# in `Gouda.config`, includes the interrupt extension into ActiveJob, makes the install generator
# available and copies settings from `Rails.application.config.gouda` into `Gouda.config`.
class Railtie < Rails::Railtie
  rake_tasks do
    # Eager-loads the application namespaces when `config.eager_load` is enabled.
    task preload: :setup do
      if defined?(Rails) && Rails.respond_to?(:application) && Rails.application.config.eager_load
        ActiveSupport.run_load_hooks(:before_eager_load, Rails.application)
        Rails.application.config.eager_load_namespaces.each(&:eager_load!)
      end
    end
  end

  initializer "gouda.configure_rails_initialization" do
    Gouda.config.app_executor = if defined?(Rails) && Rails.respond_to?(:application)
      Rails.application.executor
    else
      ActiveSupport::Executor
    end
  end

  initializer "gouda.active_job.extensions" do
    ActiveSupport.on_load :active_job do
      include Gouda::ActiveJobExtensions::Interrupts
    end
  end

  generators do
    require "generators/gouda/install_generator"
  end

  # The `to_prepare` block which is executed once in production
  # and before each request in development.
  config.to_prepare do
    Gouda::Scheduler.update_schedule_from_config!

    if defined?(Rails) && Rails.respond_to?(:application)
      config_from_rails = Rails.application.config.try(:gouda)
      if config_from_rails
        # Only copy settings which the application actually provides. Assigning a missing key
        # would replace a working default with nil (for example the worker thread count).
        %i[
          cleanup_preserved_jobs_before
          preserve_job_records
          polling_sleep_interval_seconds
          worker_thread_count
        ].each do |setting|
          value = config_from_rails[setting]
          Gouda.config.public_send(:"#{setting}=", value) unless value.nil?
        end

        if Gouda.config.logger
          Gouda.config.logger.level = config_from_rails[:log_level] || Gouda.config.log_level
        end
      end
    else
      Gouda.config.preserve_job_records = false
      Gouda.config.polling_sleep_interval_seconds = 0.2
      Gouda.config.logger.level = Gouda.config.log_level
    end
  end
end
|
57
|
+
end
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# The scheduler handles jobs which run either on a Cron schedule or at arbitrary time intervals
|
4
|
+
|
5
|
+
require "fugit"
|
6
|
+
module Gouda::Scheduler
|
7
|
+
# A scheduler entry is either a Cron pattern or an interval duration, and configures which job
# needs to be scheduled and when.
#
# `kwargs`, `args` and `set` may each be a plain value or a callable; callables are invoked
# exactly once per call to `build_active_job`.
class Entry < Struct.new(:name, :cron, :interval_seconds, :job_class, :kwargs, :args, :set, keyword_init: true)
  # @return [String] the key which ties enqueued workloads to this entry
  def scheduler_key
    [name, interval_seconds, cron, job_class].compact.join("_")
  end

  # @return [Time, nil] the next run time, or nil when neither `interval_seconds` nor `cron` is set
  # @raise [ArgumentError] when the cron pattern cannot be parsed
  def next_at
    if interval_seconds
      # Only the latest future workload is needed, so fetch a single value instead of all rows.
      latest_scheduled_at = Gouda::Workload
        .where(scheduler_key: scheduler_key)
        .where("scheduled_at > NOW()")
        .order("scheduled_at DESC")
        .pick(:scheduled_at)
      (latest_scheduled_at || Time.now.utc) + interval_seconds
    elsif cron
      fugit = Fugit::Cron.parse(cron)
      raise ArgumentError, "Unable to parse cron pattern #{cron.inspect}" unless fugit
      Time.at(fugit.next_time.to_i).utc
    end
  end

  # Builds the ActiveJob for the next run of this entry.
  #
  # @return [Object, nil] the job (or the result of `job.set(...)` when `set` options are
  #   present), or nil when there is no next run time
  def build_active_job
    next_at = self.next_at
    return unless next_at

    job_class = self.job_class.constantize

    # Resolve each value once: a callable could return something different on a second call.
    job_kwargs = kwargs_value
    job_args = args_value
    active_job = job_kwargs.present? ? job_class.new(*job_args, **job_kwargs) : job_class.new(*job_args) # This method supports ruby2_keywords
    active_job.scheduled_at = next_at
    active_job.scheduler_key = scheduler_key

    job_options = set_value
    job_options.present? ? active_job.set(job_options) : active_job
  end

  private

  def set_value
    resolve(set || {})
  end

  def args_value
    resolve(args || [])
  end

  def kwargs_value
    resolve(kwargs)
  end

  # Calls the value when it is callable, returns it unchanged otherwise.
  def resolve(value)
    value.respond_to?(:call) ? value.call : value
  end
end
|
55
|
+
|
56
|
+
# Rebuilds the in-memory cron table.
#
# @param cron_table_hash [Hash, nil] entries keyed by name. When blank, the table is read from
#   `Rails.application.config.gouda` (if present) or from `Gouda.config`, honouring `enable_cron`.
# @return [Array<Entry>, nil] the new table, or nil when cron is disabled (the table is then left as is)
def self.update_schedule_from_config!(cron_table_hash = nil)
  Gouda.logger.info "Updating scheduled workload entries..."
  if cron_table_hash.blank?
    config_from_rails = Rails.application.config.try(:gouda)

    cron_table_hash = if config_from_rails.present?
      config_from_rails.dig(:cron).to_h if config_from_rails.dig(:enable_cron)
    elsif Gouda.config.enable_cron
      Gouda.config.cron
    end

    return unless cron_table_hash
  end

  defaults = {cron: nil, interval_seconds: nil, kwargs: nil, args: nil}
  @cron_table = cron_table_hash.map do |(name, cron_entry_params)|
    # Work on a copy: the params usually come straight from the application config and
    # must not be rewritten in place.
    entry_params = cron_entry_params.dup
    # `class` is a reserved keyword and a method that exists on every Ruby object so...
    entry_params[:job_class] ||= entry_params.delete(:class)
    Entry.new(name:, **defaults.merge(entry_params))
  end
end
|
78
|
+
|
79
|
+
# Enqueues the next run for the scheduler entry which produced `finished_workload`.
# Does nothing when the workload has no scheduler key, when no entry in the current table
# matches that key, or when the entry has no next run to build.
#
# @param finished_workload [#scheduler_key]
def self.enqueue_next_scheduled_workload_for(finished_workload)
  scheduler_key = finished_workload.scheduler_key
  return unless scheduler_key

  timer_table = @cron_table.to_a.to_h { |entry| [entry.scheduler_key, entry] }
  timer_entry = timer_table[scheduler_key]
  return unless timer_entry

  # `build_active_job` returns nil when the entry has neither an interval nor a cron pattern.
  next_job = timer_entry.build_active_job
  return unless next_job

  Gouda.enqueue_jobs_via_their_adapters([next_job])
end
|
88
|
+
|
89
|
+
# @return [Array] the current cron table, or an empty array when none has been built yet
def self.entries
  @cron_table.to_a
end
|
92
|
+
|
93
|
+
# Synchronises the workloads table with the current cron table: removes scheduler-keyed
# workloads whose key no longer matches any entry, then enqueues the next run of every entry.
def self.update_scheduled_workloads!
  table_entries = @cron_table || []

  # Remove any cron keyed workloads which no longer match config-wise
  known_keys = table_entries.map(&:scheduler_key).uniq
  Gouda::Workload.transaction do
    # Restrict the cleanup to scheduler-keyed rows explicitly. With an empty key list a bare
    # `where.not(scheduler_key: [])` matches every row, which would delete all workloads.
    Gouda::Workload.where.not(scheduler_key: nil).where.not(scheduler_key: known_keys).delete_all

    # Insert the next iteration for every "next" entry in the crontab.
    active_jobs_to_enqueue = table_entries.filter_map(&:build_active_job)
    Gouda.logger.info "#{active_jobs_to_enqueue.size} job(s) to enqueue from the scheduler."
    enqjobs = Gouda.enqueue_jobs_via_their_adapters(active_jobs_to_enqueue)
    Gouda.logger.info "#{enqjobs.size} scheduled job(s) enqueued."
  end
end
|
108
|
+
end
|
data/lib/gouda/worker.rb
ADDED
@@ -0,0 +1,188 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "securerandom"
|
4
|
+
require "gouda/version"
|
5
|
+
|
6
|
+
module Gouda
|
7
|
+
POLL_INTERVAL_DURATION_SECONDS = 1
|
8
|
+
|
9
|
+
# Keeps the IDs of the jobs currently executing on this worker in a thread-safe way.
# These IDs are used to update the heartbeat timestamps during execution. Only three
# operations are needed: add to the set, remove from the set, and convert the set into
# an array for a SQL query with `WHERE id IN`.
class ThreadSafeSet
  def initialize
    @mutex = Mutex.new
    @set = Set.new
  end

  # Adds `value` and returns it.
  def add(value)
    @mutex.synchronize { @set << value }
    value
  end

  # Removes `value` (if present) and returns it.
  def delete(value)
    @mutex.synchronize { @set.delete(value) }
    value
  end

  # @return [Array] a snapshot of the current members
  def to_a
    @mutex.synchronize { @set.to_a }
  end
end
|
33
|
+
|
34
|
+
# Returns `true` once a given timer has elapsed.
# This is useful to terminate a worker after a certain amount of time.
class TimerShutdownCheck
  # @param seconds_float [Numeric] how long (in seconds, on the monotonic clock) until `call` returns true
  def initialize(seconds_float)
    @timeout_seconds = seconds_float
    @started_at = monotonic_now
  end

  # @return [Boolean] true when more than the configured number of seconds has passed
  def call
    elapsed = monotonic_now - @started_at
    elapsed > @timeout_seconds
  end

  private

  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
|
46
|
+
|
47
|
+
# Captures UNIX signals (TERM and INT) and returns true once one of them was received.
# Initializing this check installs signal handlers, meaning that the worker will not raise
# `Interrupt` from any threads but will get the space it needs to terminate cleanly. At least
# for SIGINT and SIGTERM this is very desirable. This is the default shutdown check.
class TrapShutdownCheck
  def initialize
    @did_trap = false
    @did_log = false
    %i[TERM INT].each do |signal_name|
      Signal.trap(signal_name) { @did_trap = signal_name }
    end
  end

  # @return [Boolean] true once a signal was trapped; the termination notice is printed only once
  def call
    return false unless @did_trap

    unless @did_log
      warn("Gouda worker signaled to terminate via SIG#{@did_trap}")
      @did_log = true
    end
    true
  end
end
|
75
|
+
|
76
|
+
# This shutdown check returns `true` once there are no enqueued jobs left to process for
# this worker. It can be used to run a worker just as long as there are jobs to handle
# and then to let it quit by itself (handy for spot instances and the like).
class EmptyQueueShutdownCheck
  # @param queue_constraint the constraint passed on to `Gouda::Workload.waiting_to_start`
  def initialize(queue_constraint = Gouda::AnyQueue)
    @queue_constraint = queue_constraint
  end

  # @return [Boolean] true when no workload is waiting to start; false when some are waiting
  #   or when the lookup raised (if the DB connection cannot be checked out etc.)
  def call
    Gouda.config.app_executor.wrap do
      waiting = Gouda::Workload.waiting_to_start(queue_constraint: @queue_constraint)
      waiting.none?
    end
  rescue
    false
  end
end
|
95
|
+
|
96
|
+
# A wrapping callable which returns `true` if any of the given callables returns true.
# This can be used to combine a timed shutdown ("in 30 seconds") with a signal handler
# shutdown ("shutdown on SIGTERM/SIGINT").
class CombinedShutdownCheck
  # @param callables_for_condition[#call] other shutdown checks
  def initialize(*callables_for_condition)
    @conditions = callables_for_condition
  end

  # @return [Boolean] whether any condition asked for a shutdown. A positive answer is
  #   remembered: once one shutdown check told us to shut down there is no point in
  #   querying all the others again.
  def call
    return @memo if @memo

    @memo = @conditions.any? { |condition| condition.call }
  end
end
|
110
|
+
|
111
|
+
# Start looping, taking work from the queue and performing it, over multiple worker threads.
# Once the `check_shutdown` callable returns `true` the threads will cleanly terminate and the
# method will return (so it is blocking).
#
# The loop needs the application executor to be available in `Gouda.config.app_executor`,
# so it has to be started far enough into the Rails bootup sequence.
#
# @param n_threads[Integer] how many _worker_ threads to start. Housekeeping runs on the calling thread, so ideally this should be the size of your connection pool minus 1
# @param check_shutdown[#call, Array<#call>] A callable object (can be a Proc etc.), or an Array of callables which gets combined. Once it starts returning `true` the worker threads and the housekeeping loop will cleanly exit
# @param queue_constraint the constraint passed to `Gouda::Workload.checkout_and_perform_one`
# @raise [ArgumentError] when fewer than 1 worker thread is requested
def self.worker_loop(n_threads:, check_shutdown: TrapShutdownCheck.new, queue_constraint: Gouda::AnyQueue)
  check_shutdown = CombinedShutdownCheck.new(*check_shutdown) if !check_shutdown.respond_to?(:call) && check_shutdown.is_a?(Array)

  worker_id = [Socket.gethostname, Process.pid, SecureRandom.uuid].join("-")

  executing_workload_ids = ThreadSafeSet.new

  raise ArgumentError, "You need at least 1 worker thread, but you requested #{n_threads}" if n_threads < 1

  # How long a worker thread eases off when the queue looks empty or the last attempt failed.
  idle_sleep_interval = POLL_INTERVAL_DURATION_SECONDS + (POLL_INTERVAL_DURATION_SECONDS * 0.25)

  worker_threads = n_threads.times.map do
    Thread.new do
      worker_id_and_thread_id = [worker_id, "t0x#{Thread.current.object_id.to_s(16)}"].join("-")
      back_off = false
      loop do
        break if check_shutdown.call

        if back_off
          # The previous iteration raised (database unavailable etc). Pause before retrying
          # so that a persistent failure does not turn this loop into a busy loop.
          back_off = false
          sleep_with_interruptions(idle_sleep_interval, check_shutdown)
          next
        end

        did_process = Gouda.config.app_executor.wrap do
          Gouda::Workload.checkout_and_perform_one(executing_on: worker_id_and_thread_id, queue_constraint:, in_progress: executing_workload_ids)
        end

        # If no job was retrieved the queue is likely empty. Relax the polling then and ease off.
        # If a job was retrieved it is likely that a burst has just been enqueued, and we do not
        # sleep but proceed to attempt to retrieve the next job right after.
        sleep_with_interruptions(idle_sleep_interval, check_shutdown) unless did_process
      rescue => e
        warn "Uncaught exception during perform (#{e.class} - #{e})"
        back_off = true
      end
    end
  end

  # Do the housekeeping tasks on main
  loop do
    break if check_shutdown.call

    Gouda.config.app_executor.wrap do
      # Mark known executing jobs as such. If a worker process is killed or the machine it is running on dies,
      # a stale timestamp can indicate to us that the job was orphaned and is marked as "executing"
      # even though the worker it was running on has failed for whatever reason.
      # Later on we can figure out what to do with those jobs (re-enqueue them or toss them)
      Gouda::Workload.where(id: executing_workload_ids.to_a, state: "executing").update_all(executing_on: worker_id, last_execution_heartbeat_at: Time.now.utc)

      # Find jobs which just hung and clean them up (mark them as "finished" and enqueue replacement workloads if possible)
      Gouda::Workload.reap_zombie_workloads
    rescue => e
      # Appsignal.add_exception(e)
      warn "Uncaught exception during housekeeping (#{e.class} - #{e})"
    end

    # Jitter the sleep so that the workers booted at the same time do not all dogpile
    randomized_sleep_duration_s = POLL_INTERVAL_DURATION_SECONDS + (POLL_INTERVAL_DURATION_SECONDS.to_f * rand)
    sleep_with_interruptions(randomized_sleep_duration_s, check_shutdown)
  end
ensure
  worker_threads&.each(&:join)
end
|
177
|
+
|
178
|
+
# Sleeps for up to `n_seconds`, waking up every `Gouda.config.polling_sleep_interval_seconds`
# to ask `must_abort_proc` whether the sleep should end early.
#
# @param n_seconds [Numeric] the maximum time to sleep
# @param must_abort_proc [#call] returns true when the sleep must be cut short
# @return [nil]
def self.sleep_with_interruptions(n_seconds, must_abort_proc)
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + n_seconds
  check_interval_seconds = Gouda.config.polling_sleep_interval_seconds
  loop do
    return if must_abort_proc.call

    remaining_seconds = deadline - Process.clock_gettime(Process::CLOCK_MONOTONIC)
    return if remaining_seconds <= 0

    # Never sleep past the deadline, even when the polling interval is longer than what is left.
    sleep([check_interval_seconds, remaining_seconds].min)
  end
end
|
188
|
+
end
|
@@ -0,0 +1,214 @@
|
|
1
|
+
# # frozen_string_literal: true
|
2
|
+
|
3
|
+
# This model is called "workload" for a reason. The ActiveJob can be enqueued multiple times with
|
4
|
+
# the same job ID which gets generated by Rails. These multiple enqueues of the same job are not
|
5
|
+
# exactly copies of one another. When you use job-iteration for example, your job will be retried with a different
|
6
|
+
# cursor position value. When you use ActiveJob `rescue_from` as well - the job will be retried and keep the same
|
7
|
+
# active job ID, but it then gets returned into the queue "in some way". What we want is that the records in our
|
8
|
+
# table represent a unit of work that the worker has to execute "at some point". If the same job gets enqueued multiple
|
9
|
+
# times due to retries or pause/resume we want the enqueues to be separate workloads, which can fail or succeed
|
10
|
+
# independently. This also allows the queue records to be "append-only" which allows the records to be pruned
|
11
|
+
# on a regular basis. This is why they are called "workloads" and not "jobs". "Executions" is a great term used
|
12
|
+
# by good_job but it seems that it is not clear what has the "identity". With the Workload the ID of the workload
|
13
|
+
# is the "provider ID" for ActiveJob. It is therefore possible (and likely) that multiple Workloads will exist
|
14
|
+
# sharing the same ActiveJob ID.
|
15
|
+
class Gouda::Workload < ActiveRecord::Base
|
16
|
+
ZOMBIE_MAX_THRESHOLD = "5 minutes"
|
17
|
+
|
18
|
+
self.table_name = "gouda_workloads"
|
19
|
+
# GoodJob calls these "enqueued" but they are more like
# "waiting to start" - jobs which have been scheduled for now or earlier, are in the "enqueued"
# state, match the queue constraint and are not blocked by an executing workload
# with the same execution concurrency key.
scope :waiting_to_start, ->(queue_constraint: Gouda::AnyQueue) {
  # Workloads without a concurrency key need the explicit IS NULL branch: in SQL,
  # `NULL NOT IN (<non-empty set>)` evaluates to NULL, which would hide them whenever
  # any keyed workload happens to be executing.
  condition_for_ready_to_execute_jobs = <<~SQL
    #{queue_constraint.to_sql}
    AND (
      execution_concurrency_key IS NULL
      OR execution_concurrency_key NOT IN (
        SELECT execution_concurrency_key FROM #{quoted_table_name} WHERE state = 'executing' AND execution_concurrency_key IS NOT NULL
      )
    )
    AND state = 'enqueued'
    AND (scheduled_at <= clock_timestamp())
  SQL

  where(Arel.sql(condition_for_ready_to_execute_jobs))
}
|
35
|
+
|
36
|
+
# Workloads whose `error` column is something other than an empty JSON object.
scope :errored, -> { where("error != '{}'") }
# Unfinished workloads whose serialized params carry a non-empty `exception_executions` value.
scope :retried, -> { where("(serialized_params -> 'exception_executions') != '{}' AND state != 'finished'") }
# Workloads filtered by their `state` column.
scope :finished, -> { where(state: "finished") }
scope :enqueued, -> { where(state: "enqueued") }
scope :executing, -> { where(state: "executing") }
|
41
|
+
|
42
|
+
# @return [Array<String>] the distinct queue names present in the workloads table, sorted ascending
def self.queue_names
  distinct_names_sql = "SELECT DISTINCT(queue_name) FROM #{quoted_table_name} ORDER BY queue_name ASC"
  connection.select_values(distinct_names_sql)
end
|
45
|
+
|
46
|
+
# Deletes finished workloads. When `Gouda.config.preserve_job_records` is set, only workloads
# which finished longer ago than `Gouda.config.cleanup_preserved_jobs_before` are deleted;
# otherwise every finished workload is deleted.
#
# @return the result of `delete_all`
def self.prune
  finished_workloads = where(state: "finished")
  if Gouda.config.preserve_job_records
    # The retention period lives on the configuration object, not on the Gouda module itself.
    cutoff = Gouda.config.cleanup_preserved_jobs_before.ago
    finished_workloads.where("execution_finished_at < ?", cutoff).delete_all
  else
    finished_workloads.delete_all
  end
end
|
53
|
+
|
54
|
+
# Re-enqueue zombie workloads which have been left to rot due to machine kills, worker OOM kills and the like.
# A zombie is a workload in the "executing" state whose last heartbeat is older than ZOMBIE_MAX_THRESHOLD.
# With a lock so no single zombie job gets enqueued more than once.
# And wrapped in transactions with the possibility to roll back a single workload without rolling back the entire batch.
def self.reap_zombie_workloads
  uncached do # again needed due to the use of clock_timestamp() in the SQL
    transaction do
      zombie_workloads_scope = Gouda::Workload.lock("FOR UPDATE SKIP LOCKED").where("state = 'executing' AND last_execution_heartbeat_at < (clock_timestamp() - interval '#{ZOMBIE_MAX_THRESHOLD}')")
      zombie_workloads_scope.find_each(batch_size: 1000) do |workload|
        # with_lock will start its own transaction
        workload.with_lock("FOR UPDATE SKIP LOCKED") do
          Gouda.logger.info { "Reviving (re-enqueueing) Gouda workload #{workload.id} after interruption" }

          # Appsignal.increment_counter("gouda_workloads_revived", 1, job_class: workload.active_job_class_name)

          # Remember the last heartbeat before it gets overwritten by the update below
          interrupted_at = workload.last_execution_heartbeat_at
          workload.update!(state: "finished", interrupted_at:, last_execution_heartbeat_at: Time.now.utc, execution_finished_at: Time.now.utc)
          revived_job = ActiveJob::Base.deserialize(workload.active_job_data)
          # Save the interrupted_at timestamp so that upon execution the new job will raise an interrupt exception.
          # The exception can then be handled like any other ActiveJob exception (using rescue_from or similar).
          revived_job.interrupted_at = interrupted_at
          revived_job.enqueue
        end
      rescue ActiveRecord::RecordNotFound
        # This will happen if we have selected the zombie workload in the outer block, but
        # by the point we reload it and take a FOR UPDATE SKIP LOCKED lock another worker is
        # already reaping it - a call to `reload` will cause a RecordNotFound, since Postgres
        # will hide the row from us. This is what we want in fact - we want to progress to
        # the next row. So we allow the code to proceed, as we expect that the other worker
        # (which stole the workload from us) will have set it to "state=finished" by the time we reattempt
        # our SELECT with conditions
        Gouda.logger.debug { "Gouda workload #{workload.id} cannot be reaped as it was hijacked by another worker" }
      end
    end
  end
end
|
89
|
+
|
90
|
+
# Lock the next workload and mark it as executing.
# Picks one workload which is enqueued, due, matches the queue constraint and has no executing
# workload with the same execution concurrency key, ordered by priority (NULL priorities last).
#
# @param executing_on [String] stored in the `executing_on` column of the checked out workload
# @param queue_constraint [#to_sql] provides the leading SQL condition of the lookup
# @return [Gouda::Workload, nil] the workload now marked as "executing", or nil when none was
#   available or the update hit a uniqueness violation
def self.checkout_and_lock_one(executing_on:, queue_constraint: Gouda::AnyQueue)
  where_query = <<~SQL
    #{queue_constraint.to_sql}
    AND workloads.state = 'enqueued'
    AND NOT EXISTS (
    SELECT NULL
    FROM #{quoted_table_name} AS concurrent
    WHERE concurrent.state = 'executing'
    AND concurrent.execution_concurrency_key = workloads.execution_concurrency_key
    )
    AND workloads.scheduled_at <= clock_timestamp()
  SQL
  # Enter a txn just to mark this job as being executed "by us". This allows us to avoid any
  # locks during execution itself, including advisory locks
  jobs = Gouda::Workload
    .select("workloads.*")
    .from("#{quoted_table_name} AS workloads")
    .where(where_query)
    .order("workloads.priority ASC NULLS LAST")
    .lock("FOR UPDATE SKIP LOCKED")
    .limit(1)

  # The instrumentation payload records the generated SQL and how often the checkout lost a race
  _first_available_workload = ActiveSupport::Notifications.instrument("checkout_and_lock_one.gouda", {queue_constraint: queue_constraint.to_sql}) do |payload|
    payload[:condition_sql] = jobs.to_sql
    payload[:retried_checkouts_due_to_concurrent_exec] = 0
    uncached do # Necessary because we SELECT with a clock_timestamp() which otherwise gets cached by ActiveRecord query cache
      transaction do
        jobs.first.tap do |job|
          job&.update!(state: "executing", executing_on:, last_execution_heartbeat_at: Time.now.utc, execution_started_at: Time.now.utc)
        end
      rescue ActiveRecord::RecordNotUnique
        # It can happen that due to a race the concurrency key check in the SELECT does not capture
        # a job which _just_ entered the "executing" state, apparently after we do our SELECT. This will happen regardless
        # whether we are using a CTE or a sub-SELECT
        payload[:retried_checkouts_due_to_concurrent_exec] += 1
        nil
      end
    end
  end
end
|
131
|
+
|
132
|
+
# Get a new workload and call perform.
#
# @param executing_on [String] passed on to `checkout_and_lock_one`
# @param queue_constraint passed on to `checkout_and_lock_one`
# @param in_progress[#add,#delete] Used for tracking work in progress for heartbeats
# @return whatever `perform_and_update_state!` returns for the checked out workload (which
#   may be falsy), or nil when no workload could be checked out
def self.checkout_and_perform_one(executing_on:, queue_constraint: Gouda::AnyQueue, in_progress: Set.new)
  # Select a job and mark it as "executing" which will make it unavailable to any other worker
  workload = checkout_and_lock_one(executing_on:, queue_constraint:)
  return unless workload

  in_progress.add(workload.id)
  workload.perform_and_update_state!
ensure
  # Runs on every exit path, so the ID is untracked even when performing raises
  in_progress.delete(workload.id) if workload
end
|
144
|
+
|
145
|
+
# @return [Time, nil] the "enqueued_at" value of the serialized params parsed into a Time,
#   or nil when that value is absent
def enqueued_at
  raw_timestamp = serialized_params["enqueued_at"]
  Time.parse(raw_timestamp) if raw_timestamp
end
|
148
|
+
|
149
|
+
# Executes the ActiveJob stored in this workload (unless a fuse exists for its job class) and
# always marks the workload as "finished" afterwards, then asks the scheduler to enqueue the
# next run in case this was a scheduled workload.
#
# @return the job result, the error hash recorded for a fuse skip, or the exception which
#   escaped ActiveJob (it is returned, not re-raised)
def perform_and_update_state!
  ActiveSupport::Notifications.instrument("perform_job.gouda", {workload: self}) do |instrument_payload|
    # Extra attributes to persist together with the "finished" state in the ensure block
    extras = {}
    if Gouda::JobFuse.exists?(active_job_class_name: active_job_class_name)
      extras[:error] = {class_name: "WorkloadSkippedError", message: "Skipped because of a fuse at #{Time.now.utc}"}
    else
      job_result = ActiveJob::Base.execute(active_job_data)

      if job_result.is_a?(Exception)
        # When an exception is handled, let's say we have a retry_on <exception> in our job, we end up here
        # and it won't be rescued
        handled_error = job_result
        update!(error: error_hash(handled_error))
      end

      instrument_payload[:value] = job_result
      instrument_payload[:handled_error] = handled_error

      job_result
    end
  rescue => exception_not_retried_by_active_job
    # When a job fails and is not retryable it will end up here.
    update!(error: error_hash(exception_not_retried_by_active_job))
    instrument_payload[:unhandled_error] = exception_not_retried_by_active_job
    Gouda.logger.error { exception_not_retried_by_active_job }
    exception_not_retried_by_active_job # Return the exception instead of re-raising it
  ensure
    update!(state: "finished", last_execution_heartbeat_at: Time.now.utc, execution_finished_at: Time.now.utc, **extras)
    # If the workload that just finished was a scheduled workload (via timer/cron) enqueue the next execution.
    # Otherwise the next job will only get enqueued once the config is reloaded
    Gouda::Scheduler.enqueue_next_scheduled_workload_for(self)
  end
end
|
182
|
+
|
183
|
+
# Moves the scheduled time of this workload to the current time, under a row lock.
# Workloads which are not in the "enqueued" state are left untouched.
#
# @return the result of `update!`, or nil when the workload was not enqueued
def schedule_now!
  with_lock do
    update!(scheduled_at: Time.now.utc) if state == "enqueued"
  end
end
|
190
|
+
|
191
|
+
# Marks this workload as finished by hand, under a row lock, recording a "RemovedError" in the
# `error` column, and asks the scheduler to enqueue the next run in case this was a scheduled
# workload. Workloads which are already finished are left untouched.
#
# @return the result of the scheduler call, or nil when the workload was already finished
def mark_finished!
  with_lock do
    # `next` leaves the block normally instead of jumping out of the lock/transaction block.
    next if state == "finished"

    now = Time.now.utc
    # Keep the recorded start time when there is one. Note that this has to read the
    # attribute: `execution_started_at ||= now` would only create a local variable.
    started_at = execution_started_at || now

    update!(
      state: "finished", last_execution_heartbeat_at: now,
      execution_finished_at: now, execution_started_at: started_at,
      error: {class_name: "RemovedError", message: "Manually removed at #{now}"}
    )
    Gouda::Scheduler.enqueue_next_scheduled_workload_for(self)
  end
end
|
206
|
+
|
207
|
+
# Converts an exception into the Hash which is stored in the `error` column.
#
# @param error [Exception]
# @return [Hash] with `:class_name`, `:backtrace` (empty array when there is none) and `:message`
def error_hash(error)
  {
    class_name: error.class.to_s,
    backtrace: error.backtrace.to_a,
    message: error.message
  }
end
|
210
|
+
|
211
|
+
# @return [Hash] a deep copy of the serialized params with "provider_job_id", "interrupted_at"
#   and "scheduler_key" set from this workload
def active_job_data
  job_data = serialized_params.deep_dup # TODO: is this memory-economical?
  job_data["provider_job_id"] = id
  job_data["interrupted_at"] = interrupted_at
  job_data["scheduler_key"] = scheduler_key
  job_data
end
|
214
|
+
end
|
data/lib/gouda.rb
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support"
|
4
|
+
require "active_support/core_ext/numeric/time"
|
5
|
+
require "active_support/configurable"
|
6
|
+
require "rails/railtie"
|
7
|
+
require_relative "gouda/bulk"
|
8
|
+
require_relative "gouda/adapter"
|
9
|
+
require_relative "gouda/scheduler"
|
10
|
+
require_relative "gouda/railtie" if defined?(Rails::Railtie)
|
11
|
+
require_relative "gouda/workload"
|
12
|
+
require_relative "gouda/worker"
|
13
|
+
require_relative "gouda/job_fuse"
|
14
|
+
require_relative "gouda/queue_constraints"
|
15
|
+
require_relative "gouda/active_job_extensions/interrupts"
|
16
|
+
require_relative "gouda/active_job_extensions/concurrency"
|
17
|
+
require_relative "active_job/queue_adapters/gouda_adapter"
|
18
|
+
|
19
|
+
module Gouda
|
20
|
+
# Holds the Gouda settings. Every setting is declared with `config_accessor`,
# most of them with a default value.
class Gouda::Configuration
  include ActiveSupport::Configurable

  # Whether finished workloads are kept (see `Gouda::Workload.prune`)
  config_accessor(:preserve_job_records, default: false)
  # How long preserved finished workloads are kept before `Gouda::Workload.prune` deletes them
  config_accessor(:cleanup_preserved_jobs_before, default: 3.hours)
  # Granularity (in seconds) of the interruptible sleep used by the worker loop
  config_accessor(:polling_sleep_interval_seconds, default: 0.2)
  # Number of worker threads started by `Gouda.start`
  config_accessor(:worker_thread_count, default: 1)
  config_accessor(:logger, default: ActiveSupport::Logger.new($stdout))
  # The executor used to wrap units of work; has no default here
  config_accessor(:app_executor)
  # Scheduler entries and the switch which enables them (see `Gouda::Scheduler`)
  config_accessor(:cron, default: {})
  config_accessor(:enable_cron, default: true)
  # Log levels are:
  # constant | level
  # Logger::DEBUG (0)
  # Logger::INFO (1)
  # Logger::WARN (2)
  # Logger::ERROR (3)
  # Logger::FATAL (4)
  # Logger::UNKNOWN (5)
  config_accessor(:log_level, default: Logger::DEBUG)
end
|
41
|
+
|
42
|
+
# Error class for interrupted jobs.
# NOTE(review): presumably raised by the Interrupts ActiveJob extension - confirm there.
class InterruptError < StandardError; end
|
44
|
+
|
45
|
+
# Error class for exceeded concurrency limits.
# NOTE(review): presumably raised by the Concurrency ActiveJob extension - confirm there.
class ConcurrencyExceededError < StandardError; end
|
47
|
+
|
48
|
+
# Synchronises the scheduled workloads and then runs the (blocking) worker loop with
# `Gouda.config.worker_thread_count` threads. The queue constraint is parsed from the
# GOUDA_QUEUES environment variable when it is set; otherwise any queue is served.
def self.start
  Gouda::Scheduler.update_scheduled_workloads!

  queues_from_env = ENV["GOUDA_QUEUES"]
  queue_constraint = queues_from_env ? Gouda.parse_queue_constraint(queues_from_env) : Gouda::AnyQueue

  thread_count = Gouda.config.worker_thread_count
  Gouda.logger.info("Gouda version: #{Gouda::VERSION}")
  Gouda.logger.info("Worker threads: #{thread_count}")

  Gouda.worker_loop(n_threads: thread_count, queue_constraint:)
end
|
62
|
+
|
63
|
+
# @return [Configuration] the configuration object, created on first use and reused afterwards
def self.config
  return @config if @config

  @config = Configuration.new
end
|
66
|
+
|
67
|
+
# Yields the configuration object to the given block.
#
# @yieldparam config the object returned by `config`
# @return the value of the block
def self.configure
  yield(config)
end
|
70
|
+
|
71
|
+
# @return the logger stored in the configuration
def self.logger
  config.logger
end
|
74
|
+
|
75
|
+
# Creates the database objects Gouda needs: the `gouda_workload_state` enum, the
# `gouda_workloads` table with its indexes, and the `gouda_job_fuses` table.
#
# @param active_record_schema an object offering `create_enum`, `create_table` and `add_index`
#   (NOTE(review): presumably the migration instance - confirm against the install generator)
def self.create_tables(active_record_schema)
  active_record_schema.create_enum :gouda_workload_state, %w[enqueued executing finished]
  active_record_schema.create_table :gouda_workloads, id: :uuid do |t|
    t.uuid :active_job_id, null: false
    t.timestamp :scheduled_at, null: false
    t.timestamp :execution_started_at
    t.timestamp :execution_finished_at
    t.timestamp :last_execution_heartbeat_at
    t.timestamp :interrupted_at, null: true

    t.string :scheduler_key, null: true
    t.string :queue_name, null: false, default: "default"
    t.integer :priority
    t.string :active_job_class_name, null: false
    t.jsonb :serialized_params
    t.jsonb :error, default: {}, null: false
    t.enum :state, enum_type: :gouda_workload_state, default: "enqueued", null: false
    t.string :execution_concurrency_key
    t.string :enqueue_concurrency_key
    t.string :executing_on
    t.integer :position_in_bulk

    t.timestamps
  end

  # Partial indexes over enqueued and executing rows
  active_record_schema.add_index :gouda_workloads, [:priority, :id, :scheduled_at], where: "state = 'enqueued'", name: :gouda_checkout_all_index
  active_record_schema.add_index :gouda_workloads, [:id, :last_execution_heartbeat_at], where: "state = 'executing'", name: :gouda_last_heartbeat_index
  # Unique partial indexes: at most one enqueued row per enqueue concurrency key and per
  # scheduler key, and at most one executing row per execution concurrency key
  active_record_schema.add_index :gouda_workloads, [:enqueue_concurrency_key], where: "state = 'enqueued' AND enqueue_concurrency_key IS NOT NULL", unique: true, name: :guard_double_enqueue
  active_record_schema.add_index :gouda_workloads, [:scheduler_key], where: "state = 'enqueued' AND scheduler_key IS NOT NULL", unique: true, name: :guard_double_schedule
  active_record_schema.add_index :gouda_workloads, [:execution_concurrency_key], where: "state = 'executing' AND execution_concurrency_key IS NOT NULL", unique: true, name: :guard_double_exec
  active_record_schema.add_index :gouda_workloads, [:active_job_id], name: :same_job_display_idx
  active_record_schema.add_index :gouda_workloads, [:priority], order: {priority: "ASC NULLS LAST"}, name: :ordered_priority_idx
  active_record_schema.add_index :gouda_workloads, [:last_execution_heartbeat_at], name: :index_gouda_workloads_on_last_execution_heartbeat_at
  active_record_schema.add_index :gouda_workloads, [:scheduler_key], name: :index_gouda_workloads_on_scheduler_key

  active_record_schema.create_table :gouda_job_fuses, id: false do |t|
    t.string :active_job_class_name, null: false

    t.timestamps
  end
end
|
116
|
+
end
|