dispatch_policy 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +185 -0
- data/README.md +30 -7
- data/app/controllers/dispatch_policy/application_controller.rb +21 -2
- data/app/controllers/dispatch_policy/dashboard_controller.rb +3 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +51 -15
- data/app/controllers/dispatch_policy/policies_controller.rb +26 -4
- data/app/models/dispatch_policy/policy_setting.rb +14 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +6 -1
- data/app/views/dispatch_policy/partitions/index.html.erb +1 -1
- data/app/views/dispatch_policy/partitions/show.html.erb +1 -1
- data/app/views/dispatch_policy/policies/index.html.erb +11 -3
- data/app/views/dispatch_policy/policies/show.html.erb +13 -4
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +9 -2
- data/app/views/layouts/dispatch_policy/application.html.erb +21 -25
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +13 -0
- data/lib/dispatch_policy/config.rb +5 -0
- data/lib/dispatch_policy/context.rb +12 -2
- data/lib/dispatch_policy/cursor_pagination.rb +24 -7
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +14 -0
- data/lib/dispatch_policy/gates/concurrency.rb +4 -0
- data/lib/dispatch_policy/gates/throttle.rb +36 -9
- data/lib/dispatch_policy/inflight_tracker.rb +72 -26
- data/lib/dispatch_policy/job_extension.rb +33 -9
- data/lib/dispatch_policy/manual_admission.rb +18 -0
- data/lib/dispatch_policy/operator_hints.rb +14 -0
- data/lib/dispatch_policy/policy.rb +12 -0
- data/lib/dispatch_policy/policy_dsl.rb +10 -2
- data/lib/dispatch_policy/railtie.rb +10 -0
- data/lib/dispatch_policy/registry.rb +8 -4
- data/lib/dispatch_policy/repository.rb +102 -30
- data/lib/dispatch_policy/tick.rb +18 -2
- data/lib/dispatch_policy/tick_loop.rb +15 -7
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +9 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +30 -2
- metadata +2 -1
|
@@ -19,7 +19,7 @@ module DispatchPolicy
|
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
def fetch(name)
|
|
22
|
-
entry = @policies[name.to_s]
|
|
22
|
+
entry = @mutex.synchronize { @policies[name.to_s] }
|
|
23
23
|
entry && entry[:policy]
|
|
24
24
|
end
|
|
25
25
|
|
|
@@ -28,15 +28,19 @@ module DispatchPolicy
|
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
def names
|
|
31
|
-
@policies.keys
|
|
31
|
+
@mutex.synchronize { @policies.keys }
|
|
32
32
|
end
|
|
33
33
|
|
|
34
34
|
def each(&block)
|
|
35
|
-
|
|
35
|
+
# Snapshot under the lock, then iterate outside it: the block may run
|
|
36
|
+
# arbitrary code (and Mutex isn't reentrant), so we must not hold the
|
|
37
|
+
# lock while yielding.
|
|
38
|
+
snapshot = @mutex.synchronize { @policies.values.map { |e| e[:policy] } }
|
|
39
|
+
snapshot.each(&block)
|
|
36
40
|
end
|
|
37
41
|
|
|
38
42
|
def size
|
|
39
|
-
@policies.size
|
|
43
|
+
@mutex.synchronize { @policies.size }
|
|
40
44
|
end
|
|
41
45
|
|
|
42
46
|
def clear
|
|
@@ -13,8 +13,9 @@ module DispatchPolicy
|
|
|
13
13
|
STAGED_TABLE = "dispatch_policy_staged_jobs"
|
|
14
14
|
PARTITIONS_TABLE = "dispatch_policy_partitions"
|
|
15
15
|
INFLIGHT_TABLE = "dispatch_policy_inflight_jobs"
|
|
16
|
-
SAMPLES_TABLE = "dispatch_policy_tick_samples"
|
|
17
|
-
ADAPTIVE_TABLE = "dispatch_policy_adaptive_concurrency_stats"
|
|
16
|
+
SAMPLES_TABLE = "dispatch_policy_tick_samples"
|
|
17
|
+
ADAPTIVE_TABLE = "dispatch_policy_adaptive_concurrency_stats"
|
|
18
|
+
POLICY_SETTINGS_TABLE = "dispatch_policy_policy_settings"
|
|
18
19
|
|
|
19
20
|
module_function
|
|
20
21
|
|
|
@@ -78,35 +79,43 @@ module DispatchPolicy
|
|
|
78
79
|
# Bulk version for perform_all_later. Receives an array of hashes with
|
|
79
80
|
# the same keys as #stage!. Performs one INSERT for staged_jobs and
|
|
80
81
|
# one UPSERT per (policy_name, partition_key) group.
|
|
82
|
+
# Rows per INSERT. Each row binds 8 params; Postgres caps a statement at
|
|
83
|
+
# 65_535 bind params, so we slice well under 65_535/8 ≈ 8_191 to leave
|
|
84
|
+
# headroom. A single perform_all_later with more rows than this would
|
|
85
|
+
# otherwise blow the limit and fail the whole batch.
|
|
86
|
+
STAGE_MANY_BATCH = 1_000
|
|
87
|
+
|
|
81
88
|
def stage_many!(rows)
|
|
82
89
|
return 0 if rows.empty?
|
|
83
90
|
|
|
84
91
|
connection.transaction(requires_new: true) do
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
92
|
+
rows.each_slice(STAGE_MANY_BATCH) do |slice|
|
|
93
|
+
values_sql = []
|
|
94
|
+
params = []
|
|
95
|
+
slice.each_with_index do |row, idx|
|
|
96
|
+
base = idx * 8
|
|
97
|
+
values_sql << "($#{base + 1}, $#{base + 2}, $#{base + 3}, $#{base + 4}, $#{base + 5}::jsonb, $#{base + 6}::jsonb, $#{base + 7}, $#{base + 8})"
|
|
98
|
+
params.push(
|
|
99
|
+
row[:policy_name],
|
|
100
|
+
row[:partition_key],
|
|
101
|
+
row[:queue_name],
|
|
102
|
+
row[:job_class],
|
|
103
|
+
JSON.dump(row[:job_data]),
|
|
104
|
+
JSON.dump(row[:context] || {}),
|
|
105
|
+
row[:scheduled_at],
|
|
106
|
+
row[:priority] || 0
|
|
107
|
+
)
|
|
108
|
+
end
|
|
109
|
+
connection.exec_query(
|
|
110
|
+
<<~SQL.squish,
|
|
111
|
+
INSERT INTO #{STAGED_TABLE}
|
|
112
|
+
(policy_name, partition_key, queue_name, job_class, job_data, context, scheduled_at, priority)
|
|
113
|
+
VALUES #{values_sql.join(", ")}
|
|
114
|
+
SQL
|
|
115
|
+
"stage_many",
|
|
116
|
+
params
|
|
99
117
|
)
|
|
100
118
|
end
|
|
101
|
-
connection.exec_query(
|
|
102
|
-
<<~SQL.squish,
|
|
103
|
-
INSERT INTO #{STAGED_TABLE}
|
|
104
|
-
(policy_name, partition_key, queue_name, job_class, job_data, context, scheduled_at, priority)
|
|
105
|
-
VALUES #{values_sql.join(", ")}
|
|
106
|
-
SQL
|
|
107
|
-
"stage_many",
|
|
108
|
-
params
|
|
109
|
-
)
|
|
110
119
|
|
|
111
120
|
rows.group_by { |r| [r[:policy_name], r[:partition_key]] }.each do |(policy_name, partition_key), group|
|
|
112
121
|
upsert_partition!(
|
|
@@ -169,6 +178,10 @@ module DispatchPolicy
|
|
|
169
178
|
AND status = 'active'
|
|
170
179
|
AND pending_count > 0
|
|
171
180
|
AND (next_eligible_at IS NULL OR next_eligible_at <= now())
|
|
181
|
+
AND NOT EXISTS (
|
|
182
|
+
SELECT 1 FROM #{POLICY_SETTINGS_TABLE} ps
|
|
183
|
+
WHERE ps.policy_name = $1 AND ps.paused
|
|
184
|
+
)
|
|
172
185
|
#{shard_sql}
|
|
173
186
|
ORDER BY last_checked_at NULLS FIRST, id
|
|
174
187
|
LIMIT $#{params.size}
|
|
@@ -306,10 +319,13 @@ module DispatchPolicy
|
|
|
306
319
|
#
|
|
307
320
|
# Each entry: { policy_name:, partition_key:, gate_state_patch:, retry_after: }.
|
|
308
321
|
# Independent per row — the join via FROM(VALUES…) makes the bulk
|
|
309
|
-
# statement equivalent to N sequential UPDATEs in correctness terms
|
|
310
|
-
#
|
|
311
|
-
# LOCKED
|
|
312
|
-
#
|
|
322
|
+
# statement equivalent to N sequential UPDATEs in correctness terms.
|
|
323
|
+
# Note: `claim_partitions` runs as its own autocommitted statement, so
|
|
324
|
+
# its `FOR UPDATE SKIP LOCKED` row locks are already released by the time
|
|
325
|
+
# we reach this flush — they do NOT guard the batch. What keeps two ticks
|
|
326
|
+
# off the same partitions is the operational invariant of one tick loop
|
|
327
|
+
# per (policy, shard), reinforced by the `last_checked_at` bump on claim
|
|
328
|
+
# (a racing claim skips recently-checked rows).
|
|
313
329
|
def bulk_record_partition_denies!(entries)
|
|
314
330
|
return if entries.empty?
|
|
315
331
|
|
|
@@ -331,7 +347,7 @@ module DispatchPolicy
|
|
|
331
347
|
UPDATE #{PARTITIONS_TABLE} p
|
|
332
348
|
SET gate_state = p.gate_state || v.gate_state_patch,
|
|
333
349
|
next_eligible_at = CASE
|
|
334
|
-
WHEN v.retry_after_secs IS NULL THEN
|
|
350
|
+
WHEN v.retry_after_secs IS NULL THEN p.next_eligible_at
|
|
335
351
|
ELSE now() + (v.retry_after_secs || ' seconds')::interval
|
|
336
352
|
END,
|
|
337
353
|
updated_at = now()
|
|
@@ -344,6 +360,24 @@ module DispatchPolicy
|
|
|
344
360
|
)
|
|
345
361
|
end
|
|
346
362
|
|
|
363
|
+
# ----- policy settings ------------------------------------------------------
|
|
364
|
+
|
|
365
|
+
# Upsert the pause flag for a policy. The tick's claim_partitions reads
|
|
366
|
+
# this row, so toggling it takes effect for every partition of the
|
|
367
|
+
# policy — including ones created after the toggle.
|
|
368
|
+
def set_policy_paused!(policy_name:, paused:)
|
|
369
|
+
connection.exec_query(
|
|
370
|
+
<<~SQL.squish,
|
|
371
|
+
INSERT INTO #{POLICY_SETTINGS_TABLE} (policy_name, paused, created_at, updated_at)
|
|
372
|
+
VALUES ($1, $2, now(), now())
|
|
373
|
+
ON CONFLICT (policy_name)
|
|
374
|
+
DO UPDATE SET paused = EXCLUDED.paused, updated_at = now()
|
|
375
|
+
SQL
|
|
376
|
+
"set_policy_paused",
|
|
377
|
+
[policy_name, paused ? true : false]
|
|
378
|
+
)
|
|
379
|
+
end
|
|
380
|
+
|
|
347
381
|
# ----- inflight tracking ---------------------------------------------------
|
|
348
382
|
|
|
349
383
|
def insert_inflight!(rows)
|
|
@@ -883,5 +917,43 @@ module DispatchPolicy
|
|
|
883
917
|
["now() + ($5 || ' seconds')::interval", [retry_after.to_f.round(3)]]
|
|
884
918
|
end
|
|
885
919
|
end
|
|
920
|
+
|
|
921
|
+
# ----- role routing ---------------------------------------------------------
|
|
922
|
+
#
|
|
923
|
+
# Every public Repository method must run against config.database_role
|
|
924
|
+
# so multi-DB setups (e.g. solid_queue on a separate :queue DB, with
|
|
925
|
+
# the gem tables living there) hit the DB the staging/admission/inflight
|
|
926
|
+
# state actually lives in. Otherwise staging writes the primary DB while
|
|
927
|
+
# the tick reads the queue DB — silent job loss — and the concurrency
|
|
928
|
+
# gate counts inflight rows in a different DB than the tracker writes.
|
|
929
|
+
#
|
|
930
|
+
# Rather than wrap ~25 method bodies by hand — and risk missing one as
|
|
931
|
+
# the API grows — we redefine each public SQL method to run inside
|
|
932
|
+
# `with_connection`. We capture the ORIGINAL as a bound closure and call
|
|
933
|
+
# it directly (no `super`, no prepended module): this is immune to the
|
|
934
|
+
# file being evaluated more than once in a process (dev reloader,
|
|
935
|
+
# integration suites that boot the dummy app under multiple require
|
|
936
|
+
# paths). Each evaluation re-wraps the freshly (re)defined originals
|
|
937
|
+
# exactly once, so wrappers never stack. `connected_to(role:)` nesting
|
|
938
|
+
# with the SAME role is a no-op, so the explicit `with_connection` blocks
|
|
939
|
+
# at the transaction boundaries (Tick, ManualAdmission) stay correct: the
|
|
940
|
+
# admission TX still opens entirely within one role context, preserving
|
|
941
|
+
# the shared-connection atomicity invariant. The `connection` accessor
|
|
942
|
+
# and the pure helpers are excluded — they issue no SQL of their own and
|
|
943
|
+
# always run inside an already-routed caller, so wrapping them would only
|
|
944
|
+
# add redundant role swaps in hot per-row loops (normalize_*/parse_jsonb
|
|
945
|
+
# run once per claimed row).
|
|
946
|
+
ROLE_ROUTING_EXCLUDED = %i[
|
|
947
|
+
connection with_connection
|
|
948
|
+
normalize_partition normalize_staged parse_jsonb
|
|
949
|
+
sample_filter next_eligible_clause trend_direction
|
|
950
|
+
].freeze
|
|
951
|
+
|
|
952
|
+
(singleton_methods(false) - ROLE_ROUTING_EXCLUDED).each do |method_name|
|
|
953
|
+
original = singleton_class.instance_method(method_name)
|
|
954
|
+
define_singleton_method(method_name) do |*args, **kwargs, &block|
|
|
955
|
+
with_connection { original.bind_call(self, *args, **kwargs, &block) }
|
|
956
|
+
end
|
|
957
|
+
end
|
|
886
958
|
end
|
|
887
959
|
end
|
data/lib/dispatch_policy/tick.rb
CHANGED
|
@@ -127,6 +127,15 @@ module DispatchPolicy
|
|
|
127
127
|
forward_failures += outcome[:failures]
|
|
128
128
|
admitted_per_partition[p["partition_key"]] += outcome[:admitted]
|
|
129
129
|
remaining -= outcome[:admitted]
|
|
130
|
+
|
|
131
|
+
# Feed pass-2 denies into the reason breakdown (e.g. a throttle
|
|
132
|
+
# that emptied after pass-1's settle) so the dashboard sees why
|
|
133
|
+
# redistribution stopped. We do NOT bump partitions_denied: the
|
|
134
|
+
# partition already counted as admitted in pass-1, and
|
|
135
|
+
# admitted + denied should stay ≈ partitions_seen.
|
|
136
|
+
if outcome[:admitted].zero?
|
|
137
|
+
outcome[:reasons].each { |r| denied_reasons[r] += 1 }
|
|
138
|
+
end
|
|
130
139
|
end
|
|
131
140
|
end
|
|
132
141
|
end
|
|
@@ -341,8 +350,15 @@ module DispatchPolicy
|
|
|
341
350
|
end
|
|
342
351
|
|
|
343
352
|
def record_sample!(**fields)
|
|
344
|
-
|
|
345
|
-
|
|
353
|
+
# These two reads go through the AR models, which the Repository role
|
|
354
|
+
# wrapper doesn't cover — wrap explicitly or, under multi-DB
|
|
355
|
+
# (config.database_role), they'd query the default writing role and
|
|
356
|
+
# either raise (swallowed below → no samples ever) or record zeros.
|
|
357
|
+
pending_total = inflight_total = nil
|
|
358
|
+
Repository.with_connection do
|
|
359
|
+
pending_total = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
|
|
360
|
+
inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
|
|
361
|
+
end
|
|
346
362
|
|
|
347
363
|
Repository.record_tick_sample!(
|
|
348
364
|
policy_name: @policy_name,
|
|
@@ -29,7 +29,7 @@ module DispatchPolicy
|
|
|
29
29
|
|
|
30
30
|
names = policy_names(policy_name)
|
|
31
31
|
if names.empty?
|
|
32
|
-
|
|
32
|
+
pause(config.idle_pause)
|
|
33
33
|
next
|
|
34
34
|
end
|
|
35
35
|
|
|
@@ -46,18 +46,26 @@ module DispatchPolicy
|
|
|
46
46
|
end
|
|
47
47
|
|
|
48
48
|
iteration += 1
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
49
|
+
# sweep_every_ticks <= 0 means "never sweep" (rather than crashing
|
|
50
|
+
# the loop with ZeroDivisionError on `iteration % 0`).
|
|
51
|
+
sweep_every = config.sweep_every_ticks.to_i
|
|
52
|
+
sweep! if sweep_every.positive? && (iteration % sweep_every).zero?
|
|
52
53
|
|
|
53
54
|
if admitted.zero?
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
pause(config.idle_pause)
|
|
56
|
+
else
|
|
57
|
+
pause(config.busy_pause)
|
|
57
58
|
end
|
|
58
59
|
end
|
|
59
60
|
end
|
|
60
61
|
|
|
62
|
+
# sleep, but never with a negative argument (which would raise
|
|
63
|
+
# ArgumentError mid-loop) — a non-positive pause just means "no pause".
|
|
64
|
+
def pause(seconds)
|
|
65
|
+
secs = seconds.to_f
|
|
66
|
+
sleep(secs) if secs.positive?
|
|
67
|
+
end
|
|
68
|
+
|
|
61
69
|
def policy_names(filter)
|
|
62
70
|
if filter
|
|
63
71
|
[filter.to_s]
|
|
@@ -91,5 +91,14 @@ class CreateDispatchPolicyTables < ActiveRecord::Migration[<%= Rails::VERSION::S
|
|
|
91
91
|
[:policy_name, :partition_key],
|
|
92
92
|
unique: true,
|
|
93
93
|
name: "idx_dp_adaptive_concurrency_lookup"
|
|
94
|
+
|
|
95
|
+
create_table :dispatch_policy_policy_settings do |t|
|
|
96
|
+
t.string :policy_name, null: false
|
|
97
|
+
t.boolean :paused, null: false, default: false
|
|
98
|
+
t.timestamps
|
|
99
|
+
end
|
|
100
|
+
add_index :dispatch_policy_policy_settings, :policy_name,
|
|
101
|
+
unique: true,
|
|
102
|
+
name: "idx_dp_policy_settings_lookup"
|
|
94
103
|
end
|
|
95
104
|
end
|
|
@@ -17,8 +17,26 @@ class DispatchTickLoopJob < ApplicationJob
|
|
|
17
17
|
<% if good_job? -%>
|
|
18
18
|
|
|
19
19
|
include GoodJob::ActiveJobExtensions::Concurrency
|
|
20
|
+
# Two valid setups depending on whether you have a cron safety net:
|
|
21
|
+
#
|
|
22
|
+
# A) enqueue_limit: 1, perform_limit: 1 (default below — no cron required)
|
|
23
|
+
# GoodJob's enqueue check excludes the running job (advisory-locked), so
|
|
24
|
+
# the self-re-enqueue at the end of perform always succeeds. perform_limit
|
|
25
|
+
# ensures a single concurrent execution. Downside: if a cron also fires
|
|
26
|
+
# while the job is running, it enqueues a second copy (enqueue_limit = 0
|
|
27
|
+
# from the cron's perspective); that second copy then hits perform_limit
|
|
28
|
+
# and raises ConcurrencyExceededError, creating a retry gap.
|
|
29
|
+
#
|
|
30
|
+
# B) total_limit: 1 (use this if you have a cron safety net)
|
|
31
|
+
# total_limit counts the running job, so the cron enqueue is blocked
|
|
32
|
+
# instead of erroring. The self-chain at the end of perform is also
|
|
33
|
+
# blocked (silently returns successfully_enqueued? == false), but the
|
|
34
|
+
# cron reschedules the job within seconds — no ConcurrencyExceededError,
|
|
35
|
+
# no retry gap. The "belt and braces" log below can be removed in this
|
|
36
|
+
# setup as a blocked self-chain is expected and harmless.
|
|
20
37
|
good_job_control_concurrency_with(
|
|
21
|
-
|
|
38
|
+
enqueue_limit: 1,
|
|
39
|
+
perform_limit: 1,
|
|
22
40
|
key: -> { "dispatch_tick_loop:#{arguments[0] || 'all'}:#{arguments[1] || 'all'}" }
|
|
23
41
|
)
|
|
24
42
|
<% elsif solid_queue? -%>
|
|
@@ -36,7 +54,17 @@ class DispatchTickLoopJob < ApplicationJob
|
|
|
36
54
|
stop_when: -> { adapter_shutting_down? || Time.current >= deadline }
|
|
37
55
|
)
|
|
38
56
|
|
|
39
|
-
self.class.set(wait: 1.second).perform_later(policy_name, shard)
|
|
57
|
+
successor = self.class.set(wait: 1.second).perform_later(policy_name, shard)
|
|
58
|
+
|
|
59
|
+
# Belt and braces: a concurrency-aborted enqueue returns the job with
|
|
60
|
+
# successfully_enqueued? == false instead of raising. If that ever
|
|
61
|
+
# happens the loop chain is dead — make it loud instead of silent.
|
|
62
|
+
if successor.respond_to?(:successfully_enqueued?) && !successor.successfully_enqueued?
|
|
63
|
+
Rails.logger.error(
|
|
64
|
+
"[dispatch_policy] DispatchTickLoopJob failed to re-enqueue itself " \
|
|
65
|
+
"(policy=#{policy_name.inspect} shard=#{shard.inspect}) — the tick loop chain has stopped"
|
|
66
|
+
)
|
|
67
|
+
end
|
|
40
68
|
end
|
|
41
69
|
|
|
42
70
|
private
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: dispatch_policy
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.3
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- José Galisteo
|
|
@@ -204,6 +204,7 @@ files:
|
|
|
204
204
|
- app/models/dispatch_policy/application_record.rb
|
|
205
205
|
- app/models/dispatch_policy/inflight_job.rb
|
|
206
206
|
- app/models/dispatch_policy/partition.rb
|
|
207
|
+
- app/models/dispatch_policy/policy_setting.rb
|
|
207
208
|
- app/models/dispatch_policy/staged_job.rb
|
|
208
209
|
- app/models/dispatch_policy/tick_sample.rb
|
|
209
210
|
- app/views/dispatch_policy/dashboard/index.html.erb
|