dispatch_policy 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +185 -0
  3. data/README.md +30 -7
  4. data/app/controllers/dispatch_policy/application_controller.rb +21 -2
  5. data/app/controllers/dispatch_policy/dashboard_controller.rb +3 -0
  6. data/app/controllers/dispatch_policy/partitions_controller.rb +51 -15
  7. data/app/controllers/dispatch_policy/policies_controller.rb +26 -4
  8. data/app/models/dispatch_policy/policy_setting.rb +14 -0
  9. data/app/views/dispatch_policy/dashboard/index.html.erb +6 -1
  10. data/app/views/dispatch_policy/partitions/index.html.erb +1 -1
  11. data/app/views/dispatch_policy/partitions/show.html.erb +1 -1
  12. data/app/views/dispatch_policy/policies/index.html.erb +11 -3
  13. data/app/views/dispatch_policy/policies/show.html.erb +13 -4
  14. data/app/views/dispatch_policy/shared/_partition_row.html.erb +9 -2
  15. data/app/views/layouts/dispatch_policy/application.html.erb +21 -25
  16. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +13 -0
  17. data/lib/dispatch_policy/config.rb +5 -0
  18. data/lib/dispatch_policy/context.rb +12 -2
  19. data/lib/dispatch_policy/cursor_pagination.rb +24 -7
  20. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +14 -0
  21. data/lib/dispatch_policy/gates/concurrency.rb +4 -0
  22. data/lib/dispatch_policy/gates/throttle.rb +36 -9
  23. data/lib/dispatch_policy/inflight_tracker.rb +72 -26
  24. data/lib/dispatch_policy/job_extension.rb +33 -9
  25. data/lib/dispatch_policy/manual_admission.rb +18 -0
  26. data/lib/dispatch_policy/operator_hints.rb +14 -0
  27. data/lib/dispatch_policy/policy.rb +12 -0
  28. data/lib/dispatch_policy/policy_dsl.rb +10 -2
  29. data/lib/dispatch_policy/railtie.rb +10 -0
  30. data/lib/dispatch_policy/registry.rb +8 -4
  31. data/lib/dispatch_policy/repository.rb +102 -30
  32. data/lib/dispatch_policy/tick.rb +18 -2
  33. data/lib/dispatch_policy/tick_loop.rb +15 -7
  34. data/lib/dispatch_policy/version.rb +1 -1
  35. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +9 -0
  36. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +30 -2
  37. metadata +2 -1
@@ -19,7 +19,7 @@ module DispatchPolicy
19
19
  end
20
20
 
21
21
  def fetch(name)
22
- entry = @policies[name.to_s]
22
+ entry = @mutex.synchronize { @policies[name.to_s] }
23
23
  entry && entry[:policy]
24
24
  end
25
25
 
@@ -28,15 +28,19 @@ module DispatchPolicy
28
28
  end
29
29
 
30
30
  def names
31
- @policies.keys
31
+ @mutex.synchronize { @policies.keys }
32
32
  end
33
33
 
34
34
  def each(&block)
35
- @policies.values.map { |e| e[:policy] }.each(&block)
35
+ # Snapshot under the lock, then iterate outside it: the block may run
36
+ # arbitrary code (and Mutex isn't reentrant), so we must not hold the
37
+ # lock while yielding.
38
+ snapshot = @mutex.synchronize { @policies.values.map { |e| e[:policy] } }
39
+ snapshot.each(&block)
36
40
  end
37
41
 
38
42
  def size
39
- @policies.size
43
+ @mutex.synchronize { @policies.size }
40
44
  end
41
45
 
42
46
  def clear
@@ -13,8 +13,9 @@ module DispatchPolicy
13
13
  STAGED_TABLE = "dispatch_policy_staged_jobs"
14
14
  PARTITIONS_TABLE = "dispatch_policy_partitions"
15
15
  INFLIGHT_TABLE = "dispatch_policy_inflight_jobs"
16
- SAMPLES_TABLE = "dispatch_policy_tick_samples"
17
- ADAPTIVE_TABLE = "dispatch_policy_adaptive_concurrency_stats"
16
+ SAMPLES_TABLE = "dispatch_policy_tick_samples"
17
+ ADAPTIVE_TABLE = "dispatch_policy_adaptive_concurrency_stats"
18
+ POLICY_SETTINGS_TABLE = "dispatch_policy_policy_settings"
18
19
 
19
20
  module_function
20
21
 
@@ -78,35 +79,43 @@ module DispatchPolicy
78
79
  # Bulk version for perform_all_later. Receives an array of hashes with
79
80
  # the same keys as #stage!. Performs one INSERT for staged_jobs and
80
81
  # one UPSERT per (policy_name, partition_key) group.
82
+ # Rows per INSERT. Each row binds 8 params; Postgres caps a statement at
83
+ # 65_535 bind params, so we slice well under 65_535/8 ≈ 8_191 to leave
84
+ # headroom. A single perform_all_later with more rows than this would
85
+ # otherwise blow the limit and fail the whole batch.
86
+ STAGE_MANY_BATCH = 1_000
87
+
81
88
  def stage_many!(rows)
82
89
  return 0 if rows.empty?
83
90
 
84
91
  connection.transaction(requires_new: true) do
85
- values_sql = []
86
- params = []
87
- rows.each_with_index do |row, idx|
88
- base = idx * 8
89
- values_sql << "($#{base + 1}, $#{base + 2}, $#{base + 3}, $#{base + 4}, $#{base + 5}::jsonb, $#{base + 6}::jsonb, $#{base + 7}, $#{base + 8})"
90
- params.push(
91
- row[:policy_name],
92
- row[:partition_key],
93
- row[:queue_name],
94
- row[:job_class],
95
- JSON.dump(row[:job_data]),
96
- JSON.dump(row[:context] || {}),
97
- row[:scheduled_at],
98
- row[:priority] || 0
92
+ rows.each_slice(STAGE_MANY_BATCH) do |slice|
93
+ values_sql = []
94
+ params = []
95
+ slice.each_with_index do |row, idx|
96
+ base = idx * 8
97
+ values_sql << "($#{base + 1}, $#{base + 2}, $#{base + 3}, $#{base + 4}, $#{base + 5}::jsonb, $#{base + 6}::jsonb, $#{base + 7}, $#{base + 8})"
98
+ params.push(
99
+ row[:policy_name],
100
+ row[:partition_key],
101
+ row[:queue_name],
102
+ row[:job_class],
103
+ JSON.dump(row[:job_data]),
104
+ JSON.dump(row[:context] || {}),
105
+ row[:scheduled_at],
106
+ row[:priority] || 0
107
+ )
108
+ end
109
+ connection.exec_query(
110
+ <<~SQL.squish,
111
+ INSERT INTO #{STAGED_TABLE}
112
+ (policy_name, partition_key, queue_name, job_class, job_data, context, scheduled_at, priority)
113
+ VALUES #{values_sql.join(", ")}
114
+ SQL
115
+ "stage_many",
116
+ params
99
117
  )
100
118
  end
101
- connection.exec_query(
102
- <<~SQL.squish,
103
- INSERT INTO #{STAGED_TABLE}
104
- (policy_name, partition_key, queue_name, job_class, job_data, context, scheduled_at, priority)
105
- VALUES #{values_sql.join(", ")}
106
- SQL
107
- "stage_many",
108
- params
109
- )
110
119
 
111
120
  rows.group_by { |r| [r[:policy_name], r[:partition_key]] }.each do |(policy_name, partition_key), group|
112
121
  upsert_partition!(
@@ -169,6 +178,10 @@ module DispatchPolicy
169
178
  AND status = 'active'
170
179
  AND pending_count > 0
171
180
  AND (next_eligible_at IS NULL OR next_eligible_at <= now())
181
+ AND NOT EXISTS (
182
+ SELECT 1 FROM #{POLICY_SETTINGS_TABLE} ps
183
+ WHERE ps.policy_name = $1 AND ps.paused
184
+ )
172
185
  #{shard_sql}
173
186
  ORDER BY last_checked_at NULLS FIRST, id
174
187
  LIMIT $#{params.size}
@@ -306,10 +319,13 @@ module DispatchPolicy
306
319
  #
307
320
  # Each entry: { policy_name:, partition_key:, gate_state_patch:, retry_after: }.
308
321
  # Independent per row — the join via FROM(VALUES…) makes the bulk
309
- # statement equivalent to N sequential UPDATEs in correctness terms;
310
- # the row-level locks held by `claim_partitions` (FOR UPDATE SKIP
311
- # LOCKED, last_checked_at bumped) keep concurrent ticks away from the
312
- # same partitions while we batch.
322
+ # statement equivalent to N sequential UPDATEs in correctness terms.
323
+ # Note: `claim_partitions` runs as its own autocommitted statement, so
324
+ # its `FOR UPDATE SKIP LOCKED` row locks are already released by the time
325
+ # we reach this flush — they do NOT guard the batch. What keeps two ticks
326
+ # off the same partitions is the operational invariant of one tick loop
327
+ # per (policy, shard), reinforced by the `last_checked_at` bump on claim
328
+ # (a racing claim skips recently-checked rows).
313
329
  def bulk_record_partition_denies!(entries)
314
330
  return if entries.empty?
315
331
 
@@ -331,7 +347,7 @@ module DispatchPolicy
331
347
  UPDATE #{PARTITIONS_TABLE} p
332
348
  SET gate_state = p.gate_state || v.gate_state_patch,
333
349
  next_eligible_at = CASE
334
- WHEN v.retry_after_secs IS NULL THEN NULL
350
+ WHEN v.retry_after_secs IS NULL THEN p.next_eligible_at
335
351
  ELSE now() + (v.retry_after_secs || ' seconds')::interval
336
352
  END,
337
353
  updated_at = now()
@@ -344,6 +360,24 @@ module DispatchPolicy
344
360
  )
345
361
  end
346
362
 
363
+ # ----- policy settings ------------------------------------------------------
364
+
365
+ # Upsert the pause flag for a policy. The tick's claim_partitions reads
366
+ # this row, so toggling it takes effect for every partition of the
367
+ # policy — including ones created after the toggle.
368
+ def set_policy_paused!(policy_name:, paused:)
369
+ connection.exec_query(
370
+ <<~SQL.squish,
371
+ INSERT INTO #{POLICY_SETTINGS_TABLE} (policy_name, paused, created_at, updated_at)
372
+ VALUES ($1, $2, now(), now())
373
+ ON CONFLICT (policy_name)
374
+ DO UPDATE SET paused = EXCLUDED.paused, updated_at = now()
375
+ SQL
376
+ "set_policy_paused",
377
+ [policy_name, paused ? true : false]
378
+ )
379
+ end
380
+
347
381
  # ----- inflight tracking ---------------------------------------------------
348
382
 
349
383
  def insert_inflight!(rows)
@@ -883,5 +917,43 @@ module DispatchPolicy
883
917
  ["now() + ($5 || ' seconds')::interval", [retry_after.to_f.round(3)]]
884
918
  end
885
919
  end
920
+
921
+ # ----- role routing ---------------------------------------------------------
922
+ #
923
+ # Every public Repository method must run against config.database_role
924
+ # so multi-DB setups (e.g. solid_queue on a separate :queue DB, with
925
+ # the gem tables living there) hit the DB the staging/admission/inflight
926
+ # state actually lives in. Otherwise staging writes the primary DB while
927
+ # the tick reads the queue DB — silent job loss — and the concurrency
928
+ # gate counts inflight rows in a different DB than the tracker writes.
929
+ #
930
+ # Rather than wrap ~25 method bodies by hand — and risk missing one as
931
+ # the API grows — we redefine each public SQL method to run inside
932
+ # `with_connection`. We capture the ORIGINAL as a bound closure and call
933
+ # it directly (no `super`, no prepended module): this is immune to the
934
+ # file being evaluated more than once in a process (dev reloader,
935
+ # integration suites that boot the dummy app under multiple require
936
+ # paths). Each evaluation re-wraps the freshly (re)defined originals
937
+ # exactly once, so wrappers never stack. `connected_to(role:)` nesting
938
+ # with the SAME role is a no-op, so the explicit `with_connection` blocks
939
+ # at the transaction boundaries (Tick, ManualAdmission) stay correct: the
940
+ # admission TX still opens entirely within one role context, preserving
941
+ # the shared-connection atomicity invariant. The `connection` accessor
942
+ # and the pure helpers are excluded — they issue no SQL of their own and
943
+ # always run inside an already-routed caller, so wrapping them would only
944
+ # add redundant role swaps in hot per-row loops (normalize_*/parse_jsonb
945
+ # run once per claimed row).
946
+ ROLE_ROUTING_EXCLUDED = %i[
947
+ connection with_connection
948
+ normalize_partition normalize_staged parse_jsonb
949
+ sample_filter next_eligible_clause trend_direction
950
+ ].freeze
951
+
952
+ (singleton_methods(false) - ROLE_ROUTING_EXCLUDED).each do |method_name|
953
+ original = singleton_class.instance_method(method_name)
954
+ define_singleton_method(method_name) do |*args, **kwargs, &block|
955
+ with_connection { original.bind_call(self, *args, **kwargs, &block) }
956
+ end
957
+ end
886
958
  end
887
959
  end
@@ -127,6 +127,15 @@ module DispatchPolicy
127
127
  forward_failures += outcome[:failures]
128
128
  admitted_per_partition[p["partition_key"]] += outcome[:admitted]
129
129
  remaining -= outcome[:admitted]
130
+
131
+ # Feed pass-2 denies into the reason breakdown (e.g. a throttle
132
+ # that emptied after pass-1's settle) so the dashboard sees why
133
+ # redistribution stopped. We do NOT bump partitions_denied: the
134
+ # partition already counted as admitted in pass-1, and
135
+ # admitted + denied should stay ≈ partitions_seen.
136
+ if outcome[:admitted].zero?
137
+ outcome[:reasons].each { |r| denied_reasons[r] += 1 }
138
+ end
130
139
  end
131
140
  end
132
141
  end
@@ -341,8 +350,15 @@ module DispatchPolicy
341
350
  end
342
351
 
343
352
  def record_sample!(**fields)
344
- pending_total = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
345
- inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
353
+ # These two reads go through the AR models, which the Repository role
354
+ # wrapper doesn't cover — wrap explicitly or, under multi-DB
355
+ # (config.database_role), they'd query the default writing role and
356
+ # either raise (swallowed below → no samples ever) or record zeros.
357
+ pending_total = inflight_total = nil
358
+ Repository.with_connection do
359
+ pending_total = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
360
+ inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
361
+ end
346
362
 
347
363
  Repository.record_tick_sample!(
348
364
  policy_name: @policy_name,
@@ -29,7 +29,7 @@ module DispatchPolicy
29
29
 
30
30
  names = policy_names(policy_name)
31
31
  if names.empty?
32
- sleep(config.idle_pause)
32
+ pause(config.idle_pause)
33
33
  next
34
34
  end
35
35
 
@@ -46,18 +46,26 @@ module DispatchPolicy
46
46
  end
47
47
 
48
48
  iteration += 1
49
- if (iteration % config.sweep_every_ticks).zero?
50
- sweep!
51
- end
49
+ # sweep_every_ticks <= 0 means "never sweep" (rather than crashing
50
+ # the loop with ZeroDivisionError on `iteration % 0`).
51
+ sweep_every = config.sweep_every_ticks.to_i
52
+ sweep! if sweep_every.positive? && (iteration % sweep_every).zero?
52
53
 
53
54
  if admitted.zero?
54
- sleep(config.idle_pause)
55
- elsif config.busy_pause.to_f.positive?
56
- sleep(config.busy_pause)
55
+ pause(config.idle_pause)
56
+ else
57
+ pause(config.busy_pause)
57
58
  end
58
59
  end
59
60
  end
60
61
 
62
+ # sleep, but never with a negative argument (which would raise
63
+ # ArgumentError mid-loop) — a non-positive pause just means "no pause".
64
+ def pause(seconds)
65
+ secs = seconds.to_f
66
+ sleep(secs) if secs.positive?
67
+ end
68
+
61
69
  def policy_names(filter)
62
70
  if filter
63
71
  [filter.to_s]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DispatchPolicy
4
- VERSION = "0.4.3"
4
+ VERSION = "0.5.0"
5
5
  end
@@ -91,5 +91,14 @@ class CreateDispatchPolicyTables < ActiveRecord::Migration[<%= Rails::VERSION::S
91
91
  [:policy_name, :partition_key],
92
92
  unique: true,
93
93
  name: "idx_dp_adaptive_concurrency_lookup"
94
+
95
+ create_table :dispatch_policy_policy_settings do |t|
96
+ t.string :policy_name, null: false
97
+ t.boolean :paused, null: false, default: false
98
+ t.timestamps
99
+ end
100
+ add_index :dispatch_policy_policy_settings, :policy_name,
101
+ unique: true,
102
+ name: "idx_dp_policy_settings_lookup"
94
103
  end
95
104
  end
@@ -17,8 +17,26 @@ class DispatchTickLoopJob < ApplicationJob
17
17
  <% if good_job? -%>
18
18
 
19
19
  include GoodJob::ActiveJobExtensions::Concurrency
20
+ # Two valid setups depending on whether you have a cron safety net:
21
+ #
22
+ # A) enqueue_limit: 1, perform_limit: 1 (default below — no cron required)
23
+ # GoodJob's enqueue check excludes the running job (advisory-locked), so
24
+ # the self-re-enqueue at the end of perform always succeeds. perform_limit
25
+ # ensures a single concurrent execution. Downside: if a cron also fires
26
+ # while the job is running, it enqueues a second copy (enqueue_limit = 0
27
+ # from the cron's perspective); that second copy then hits perform_limit
28
+ # and raises ConcurrencyExceededError, creating a retry gap.
29
+ #
30
+ # B) total_limit: 1 (use this if you have a cron safety net)
31
+ # total_limit counts the running job, so the cron enqueue is blocked
32
+ # instead of erroring. The self-chain at the end of perform is also
33
+ # blocked (silently returns successfully_enqueued? == false), but the
34
+ # cron reschedules the job within seconds — no ConcurrencyExceededError,
35
+ # no retry gap. The "belt and braces" log below can be removed in this
36
+ # setup as a blocked self-chain is expected and harmless.
20
37
  good_job_control_concurrency_with(
21
- total_limit: 1,
38
+ enqueue_limit: 1,
39
+ perform_limit: 1,
22
40
  key: -> { "dispatch_tick_loop:#{arguments[0] || 'all'}:#{arguments[1] || 'all'}" }
23
41
  )
24
42
  <% elsif solid_queue? -%>
@@ -36,7 +54,17 @@ class DispatchTickLoopJob < ApplicationJob
36
54
  stop_when: -> { adapter_shutting_down? || Time.current >= deadline }
37
55
  )
38
56
 
39
- self.class.set(wait: 1.second).perform_later(policy_name, shard)
57
+ successor = self.class.set(wait: 1.second).perform_later(policy_name, shard)
58
+
59
+ # Belt and braces: a concurrency-aborted enqueue returns the job with
60
+ # successfully_enqueued? == false instead of raising. If that ever
61
+ # happens the loop chain is dead — make it loud instead of silent.
62
+ if successor.respond_to?(:successfully_enqueued?) && !successor.successfully_enqueued?
63
+ Rails.logger.error(
64
+ "[dispatch_policy] DispatchTickLoopJob failed to re-enqueue itself " \
65
+ "(policy=#{policy_name.inspect} shard=#{shard.inspect}) — the tick loop chain has stopped"
66
+ )
67
+ end
40
68
  end
41
69
 
42
70
  private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dispatch_policy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - José Galisteo
@@ -204,6 +204,7 @@ files:
204
204
  - app/models/dispatch_policy/application_record.rb
205
205
  - app/models/dispatch_policy/inflight_job.rb
206
206
  - app/models/dispatch_policy/partition.rb
207
+ - app/models/dispatch_policy/policy_setting.rb
207
208
  - app/models/dispatch_policy/staged_job.rb
208
209
  - app/models/dispatch_policy/tick_sample.rb
209
210
  - app/views/dispatch_policy/dashboard/index.html.erb