dispatch_policy 0.4.3 → 0.5.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +185 -0
  3. data/README.md +30 -7
  4. data/app/controllers/dispatch_policy/application_controller.rb +21 -2
  5. data/app/controllers/dispatch_policy/dashboard_controller.rb +3 -0
  6. data/app/controllers/dispatch_policy/partitions_controller.rb +51 -15
  7. data/app/controllers/dispatch_policy/policies_controller.rb +26 -4
  8. data/app/models/dispatch_policy/policy_setting.rb +14 -0
  9. data/app/views/dispatch_policy/dashboard/index.html.erb +6 -1
  10. data/app/views/dispatch_policy/partitions/index.html.erb +1 -1
  11. data/app/views/dispatch_policy/partitions/show.html.erb +1 -1
  12. data/app/views/dispatch_policy/policies/index.html.erb +11 -3
  13. data/app/views/dispatch_policy/policies/show.html.erb +13 -4
  14. data/app/views/dispatch_policy/shared/_partition_row.html.erb +9 -2
  15. data/app/views/layouts/dispatch_policy/application.html.erb +21 -25
  16. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +13 -0
  17. data/lib/dispatch_policy/config.rb +5 -0
  18. data/lib/dispatch_policy/context.rb +12 -2
  19. data/lib/dispatch_policy/cursor_pagination.rb +24 -7
  20. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +14 -0
  21. data/lib/dispatch_policy/gates/concurrency.rb +4 -0
  22. data/lib/dispatch_policy/gates/throttle.rb +36 -9
  23. data/lib/dispatch_policy/inflight_tracker.rb +72 -26
  24. data/lib/dispatch_policy/job_extension.rb +33 -9
  25. data/lib/dispatch_policy/manual_admission.rb +18 -0
  26. data/lib/dispatch_policy/operator_hints.rb +14 -0
  27. data/lib/dispatch_policy/policy.rb +12 -0
  28. data/lib/dispatch_policy/policy_dsl.rb +10 -2
  29. data/lib/dispatch_policy/railtie.rb +10 -0
  30. data/lib/dispatch_policy/registry.rb +8 -4
  31. data/lib/dispatch_policy/repository.rb +102 -30
  32. data/lib/dispatch_policy/tick.rb +18 -2
  33. data/lib/dispatch_policy/tick_loop.rb +15 -7
  34. data/lib/dispatch_policy/version.rb +1 -1
  35. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +9 -0
  36. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +30 -2
  37. metadata +2 -1
@@ -1,9 +1,16 @@
1
- <tr id="partition_<%= partition.id %>" class="<%= "dp-row-paused" if partition.paused? %>">
1
+ <% policy_paused = local_assigns.fetch(:policy_paused, false) %>
2
+ <tr id="partition_<%= partition.id %>" class="<%= "dp-row-paused" if partition.paused? || policy_paused %>">
2
3
  <td><%= link_to partition.policy_name, policy_path(partition.policy_name), class: "dp-link" %></td>
3
4
  <td><code><%= partition.shard %></code></td>
4
5
  <td><%= partition.queue_name || "—" %></td>
5
6
  <td><%= link_to partition.partition_key, partition_path(partition), class: "dp-link" %></td>
6
- <td><%= partition.status %></td>
7
+ <td>
8
+ <% if policy_paused && !partition.paused? %>
9
+ <%= partition.status %> <span class="dp-warn" style="font-size:11px;">(policy paused)</span>
10
+ <% else %>
11
+ <%= partition.status %>
12
+ <% end %>
13
+ </td>
7
14
  <td class="dp-num"><%= format_count(partition.pending_count) %></td>
8
15
  <td class="dp-num"><%= format_count(partition.total_admitted) %></td>
9
16
  <td><%= format_time(partition.next_eligible_at) %></td>
@@ -70,20 +70,19 @@
70
70
  });
71
71
  }
72
72
 
73
- function bindControls() {
74
- document.querySelectorAll("[data-dp-refresh]").forEach(function (btn) {
75
- if (btn.dataset.bound) return;
76
- btn.dataset.bound = "1";
77
- btn.addEventListener("click", function (e) {
78
- e.preventDefault();
79
- setInterval(parseInt(btn.getAttribute("data-dp-refresh"), 10));
80
- });
81
- });
82
- syncControls();
83
- }
73
+ // Delegate on document, registered ONCE. Per-button listeners with a
74
+ // data-bound guard leaked: Turbo's morph refresh preserves the button
75
+ // node but drops the data-bound attribute (it's not in the server
76
+ // HTML), so every turbo:load re-bound and clicks fired N+1 times.
77
+ document.addEventListener("click", function (e) {
78
+ var btn = e.target.closest("[data-dp-refresh]");
79
+ if (!btn) return;
80
+ e.preventDefault();
81
+ setInterval(parseInt(btn.getAttribute("data-dp-refresh"), 10));
82
+ });
84
83
 
85
84
  function init() {
86
- bindControls();
85
+ syncControls();
87
86
  restart();
88
87
  }
89
88
 
@@ -140,20 +139,17 @@
140
139
  });
141
140
  }
142
141
 
143
- function bindControls() {
144
- document.querySelectorAll("[data-dp-theme]").forEach(function (btn) {
145
- if (btn.dataset.bound) return;
146
- btn.dataset.bound = "1";
147
- btn.addEventListener("click", function (e) {
148
- e.preventDefault();
149
- setTheme(btn.getAttribute("data-dp-theme"));
150
- });
151
- });
152
- syncControls();
153
- }
142
+ // Delegate on document, registered ONCE (see the refresh control
143
+ // above for why per-button binding leaks under Turbo morph).
144
+ document.addEventListener("click", function (e) {
145
+ var btn = e.target.closest("[data-dp-theme]");
146
+ if (!btn) return;
147
+ e.preventDefault();
148
+ setTheme(btn.getAttribute("data-dp-theme"));
149
+ });
154
150
 
155
- document.addEventListener("DOMContentLoaded", bindControls);
156
- document.addEventListener("turbo:load", bindControls);
151
+ document.addEventListener("DOMContentLoaded", syncControls);
152
+ document.addEventListener("turbo:load", syncControls);
157
153
  })();
158
154
  </script>
159
155
  </head>
@@ -99,5 +99,18 @@ class CreateDispatchPolicyTables < ActiveRecord::Migration[7.1]
99
99
  [:policy_name, :partition_key],
100
100
  unique: true,
101
101
  name: "idx_dp_adaptive_concurrency_lookup"
102
+
103
+ # Policy-level settings. Currently just the pause flag: a paused policy
104
+ # admits nothing, INCLUDING partitions created after the pause (the
105
+ # tick's claim checks this row). The partitions' own `status` is a
106
+ # per-partition display concern; this is the policy-wide source of truth.
107
+ create_table :dispatch_policy_policy_settings do |t|
108
+ t.string :policy_name, null: false
109
+ t.boolean :paused, null: false, default: false
110
+ t.timestamps
111
+ end
112
+ add_index :dispatch_policy_policy_settings, :policy_name,
113
+ unique: true,
114
+ name: "idx_dp_policy_settings_lookup"
102
115
  end
103
116
  end
@@ -51,6 +51,11 @@ module DispatchPolicy
51
51
  # assuming the admission was lost. Raise it if your adapter backlog
52
52
  # can exceed an hour.
53
53
  @inflight_queued_stale_after = 60 * 60
54
+ # Seconds between heartbeat_at refreshes. Each beat briefly checks out
55
+ # an EXTRA connection (one per running job) from the role's pool, so
56
+ # the DB pool needs headroom above the worker concurrency — otherwise
57
+ # beats hit ConnectionTimeoutError and long jobs get swept as stale.
58
+ # Set to 0 to disable the heartbeat thread entirely.
54
59
  @inflight_heartbeat_interval = 30
55
60
  @real_adapter = nil
56
61
  @logger = nil
@@ -19,7 +19,7 @@ module DispatchPolicy
19
19
  end
20
20
 
21
21
  def [](key)
22
- @data[key.to_s]
22
+ indifferent(@data[key.to_s])
23
23
  end
24
24
 
25
25
  def to_h
@@ -31,11 +31,21 @@ module DispatchPolicy
31
31
  end
32
32
 
33
33
  def fetch(key, *args, &block)
34
- @data.fetch(key.to_s, *args, &block)
34
+ indifferent(@data.fetch(key.to_s, *args, &block))
35
35
  end
36
36
 
37
37
  private
38
38
 
39
+ # Nested hashes are stored string-keyed (deep_stringify), so
40
+ # `ctx[:limits][:max]` would miss — the inner lookup is by symbol.
41
+ # Return nested hashes with indifferent access so symbol and string
42
+ # keys work at every depth, matching how host apps usually write
43
+ # context. to_h/to_jsonb still return the plain string-keyed hash for
44
+ # storage, untouched.
45
+ def indifferent(value)
46
+ value.is_a?(Hash) ? value.with_indifferent_access : value
47
+ end
48
+
39
49
  def deep_stringify(value)
40
50
  case value
41
51
  when Hash
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "base64"
4
4
  require "json"
5
+ require "time"
5
6
 
6
7
  module DispatchPolicy
7
8
  # Tiny keyset-pagination helper for the engine UI. Each sort mode declares
@@ -86,13 +87,22 @@ module DispatchPolicy
86
87
 
87
88
  value, last_id = cursor
88
89
  # Ignore a cursor whose value type can't be compared against this
89
- # sort's column. The numeric columns (pending_count, total_admitted)
90
- # need a Numeric; everything else compares as text (partition_key, or
91
- # the ISO8601 timestamps emitted by #extract). A mismatch — e.g. a
92
- # numeric value forged for a timestamp sort — would raise PG error;
93
- # instead we fall back to the first page.
94
- numeric_column = %w[pending_count total_admitted].include?(sort[:cursor_sql])
95
- return scope unless numeric_column ? value.is_a?(Numeric) : value.is_a?(String)
90
+ # sort's column. A mismatch — e.g. a numeric value forged for a
91
+ # timestamp sort — would raise a PG error; instead we fall back to the
92
+ # first page.
93
+ numeric_column = %w[pending_count total_admitted].include?(sort[:cursor_sql])
94
+ timestamp_column = sort[:cursor_sql].start_with?("COALESCE(")
95
+ if numeric_column
96
+ return scope unless value.is_a?(Numeric)
97
+ elsif timestamp_column
98
+ # Bound against a timestamp column: a non-parseable string (e.g. a
99
+ # hand-forged "zzz") would raise `invalid input syntax for type
100
+ # timestamp` and 500. Require a real ISO8601 value — exactly what
101
+ # #extract emits — or fall back to the first page.
102
+ return scope unless value.is_a?(String) && parseable_timestamp?(value)
103
+ else
104
+ return scope unless value.is_a?(String)
105
+ end
96
106
 
97
107
  case sort[:direction]
98
108
  when :desc
@@ -133,5 +143,12 @@ module DispatchPolicy
133
143
  else v
134
144
  end
135
145
  end
146
+
147
+ def parseable_timestamp?(str)
148
+ Time.iso8601(str)
149
+ true
150
+ rescue ArgumentError, TypeError
151
+ false
152
+ end
136
153
  end
137
154
  end
@@ -45,6 +45,20 @@ module DispatchPolicy
45
45
  raise ArgumentError, "target_lag_ms must be > 0" unless @target_lag_ms.positive?
46
46
  raise ArgumentError, "min must be >= 1" unless @min >= 1
47
47
  raise ArgumentError, "initial_max must be >= min" unless @initial_max >= @min
48
+ # Out-of-range tuning knobs invert the AIMD loop instead of erroring:
49
+ # alpha=0 freezes the EWMA at its seed so the cap grows unbounded;
50
+ # a decrease factor >= 1 turns the multiplicative *decrease* into an
51
+ # increase, a positive-feedback loop under failure/overload.
52
+ unless @ewma_alpha > 0 && @ewma_alpha <= 1
53
+ raise ArgumentError, "ewma_alpha must be in (0, 1]"
54
+ end
55
+ unless @fail_factor > 0 && @fail_factor < 1
56
+ raise ArgumentError, "failure_decrease_factor must be in (0, 1)"
57
+ end
58
+ unless @slow_factor > 0 && @slow_factor < 1
59
+ raise ArgumentError, "overload_decrease_factor must be in (0, 1)"
60
+ end
61
+ raise ArgumentError, "full_backoff must be >= 0" if @full_backoff.negative?
48
62
  end
49
63
 
50
64
  def name
@@ -17,6 +17,10 @@ module DispatchPolicy
17
17
  super()
18
18
  @max_proc = max.respond_to?(:call) ? max : ->(_ctx) { max }
19
19
  @full_backoff = full_backoff.to_f
20
+ # A negative backoff sets next_eligible_at in the past, so the
21
+ # partition is re-evaluated (COUNT(*)) every tick — the opposite of
22
+ # what full_backoff is for.
23
+ raise ArgumentError, "full_backoff must be >= 0" if @full_backoff.negative?
20
24
  end
21
25
 
22
26
  def name
@@ -14,13 +14,20 @@ module DispatchPolicy
14
14
  # The bucket lives on the staged partition row — one row per
15
15
  # `policy.partition_for(ctx)` value, one bucket per row, no dilution.
16
16
  class Throttle < Gate
17
- attr_reader :rate_proc, :per
17
+ attr_reader :rate_proc, :per_proc
18
18
 
19
19
  def initialize(rate:, per:)
20
20
  super()
21
21
  @rate_proc = rate.respond_to?(:call) ? rate : ->(_ctx) { rate }
22
- @per = duration_seconds(per)
23
- raise ArgumentError, "throttle :per must be > 0 (got #{@per})" unless @per.positive?
22
+ if per.respond_to?(:call)
23
+ # Dynamic window (per-ctx), symmetric with a dynamic rate. Validated
24
+ # per-evaluate since the value isn't known until admission time.
25
+ @per_proc = ->(ctx) { duration_seconds(per.call(ctx)) }
26
+ else
27
+ fixed = duration_seconds(per)
28
+ raise ArgumentError, "throttle :per must be > 0 (got #{fixed})" unless fixed.positive?
29
+ @per_proc = ->(_ctx) { fixed }
30
+ end
24
31
  end
25
32
 
26
33
  def name
@@ -28,16 +35,27 @@ module DispatchPolicy
28
35
  end
29
36
 
30
37
  def evaluate(ctx, partition, admit_budget)
31
- capacity = capacity_for(ctx)
32
- return Decision.deny(reason: "rate=0") if capacity <= 0
38
+ per = per_for(ctx)
39
+ rate = rate_for(ctx)
40
+ # rate <= 0 (e.g. a paused tenant) backs off for one window instead
41
+ # of denying with a NULL retry_after. A NULL retry_after leaves the
42
+ # partition immediately eligible, so it would be re-claimed and
43
+ # re-evaluated every single tick — a busy-loop that also clobbers any
44
+ # backoff a prior tick had set.
45
+ return Decision.deny(retry_after: per, reason: "rate=0") if rate <= 0
33
46
 
34
- refill_rate = capacity.to_f / @per
47
+ # The bucket holds at least one whole token; otherwise a sub-unit rate
48
+ # (e.g. rate: 0.5) could never accumulate a full token and would never
49
+ # admit. refill_rate stays at the true `rate` so the long-run pace is
50
+ # exact — the floor only sets the burst ceiling.
51
+ capacity = [rate, 1.0].max
52
+ refill_rate = rate / per
35
53
  state = (partition["gate_state"] || {})["throttle"] || {}
36
54
  tokens = (state["tokens"] || capacity).to_f
37
55
  refilled_at = (state["refilled_at"] || now).to_f
38
56
 
39
57
  elapsed = [now - refilled_at, 0.0].max
40
- tokens = [tokens + (elapsed * refill_rate), capacity.to_f].min
58
+ tokens = [tokens + (elapsed * refill_rate), capacity].min
41
59
 
42
60
  # The patch records the post-refill bucket WITHOUT deducting yet.
43
61
  # The actual deduction is deferred to #consume, which runs once
@@ -77,9 +95,18 @@ module DispatchPolicy
77
95
 
78
96
  private
79
97
 
80
- def capacity_for(ctx)
98
+ def per_for(ctx)
99
+ value = @per_proc.call(ctx)
100
+ raise ArgumentError, "throttle :per must be > 0 (got #{value})" unless value.positive?
101
+ value
102
+ end
103
+
104
+ def rate_for(ctx)
81
105
  value = @rate_proc.call(ctx)
82
- value.nil? ? 0 : Integer(value)
106
+ # Float, not Integer: a fractional rate (e.g. 2.5/sec) must keep its
107
+ # fractional part or the bucket systematically under-admits by
108
+ # truncating every refill. nil means "no rate configured" → deny.
109
+ value.nil? ? 0.0 : Float(value)
83
110
  end
84
111
 
85
112
  def now
@@ -39,33 +39,45 @@ module DispatchPolicy
39
39
  ctx = policy.build_context(job.arguments, queue_name: queue_name)
40
40
  partition_key = policy.partition_key_for(ctx)
41
41
 
42
- Repository.insert_inflight!([{
43
- policy_name: policy.name,
44
- partition_key: partition_key,
45
- active_job_id: job.job_id
46
- }])
47
-
48
42
  adaptive_gates = policy.gates.select { |g| g.name == :adaptive_concurrency }
49
- admitted_at = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
50
- perform_start = Time.current
43
+ admitted_at = nil
44
+ perform_start = nil
45
+ heartbeat = nil
46
+ started = false
47
+ succeeded = false
48
+
49
+ # insert + heartbeat spawn live INSIDE the begin so the ensure always
50
+ # cleans up: if start_heartbeat (Thread.new) raises after the row is
51
+ # inserted, the row would otherwise orphan until the stale sweeper.
52
+ begin
53
+ Repository.insert_inflight!([{
54
+ policy_name: policy.name,
55
+ partition_key: partition_key,
56
+ active_job_id: job.job_id
57
+ }])
51
58
 
52
- heartbeat = start_heartbeat(job.job_id)
59
+ admitted_at = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
60
+ perform_start = Time.current
61
+ heartbeat = start_heartbeat(job.job_id)
53
62
 
54
- succeeded = false
55
- begin
63
+ started = true
56
64
  yield
57
65
  succeeded = true
58
66
  ensure
59
67
  stop_heartbeat(heartbeat)
60
68
 
61
- record_adaptive_observations(
62
- policy: policy,
63
- gates: adaptive_gates,
64
- partition_key: partition_key,
65
- admitted_at: admitted_at,
66
- perform_start: perform_start,
67
- succeeded: succeeded
68
- )
69
+ # Only record an observation if we actually reached perform — a
70
+ # failure in setup (insert / heartbeat spawn) isn't a perform result.
71
+ if started
72
+ record_adaptive_observations(
73
+ policy: policy,
74
+ gates: adaptive_gates,
75
+ partition_key: partition_key,
76
+ admitted_at: admitted_at,
77
+ perform_start: perform_start,
78
+ succeeded: succeeded
79
+ )
80
+ end
69
81
 
70
82
  begin
71
83
  Repository.delete_inflight!(active_job_id: job.job_id)
@@ -75,17 +87,43 @@ module DispatchPolicy
75
87
  end
76
88
  end
77
89
 
90
+ # Deletes the inflight row for a job that ActiveJob discarded BEFORE
91
+ # around_perform ran — most commonly an ActiveJob::DeserializationError
92
+ # (a GlobalID whose record was deleted) on a job with
93
+ # `discard_on ActiveJob::DeserializationError`. Argument deserialization
94
+ # happens before the perform callbacks, so track's `ensure` never runs
95
+ # and the row the Tick pre-inserted would otherwise sit until the
96
+ # `inflight_queued_stale_after` sweeper reaps it (default 1h), holding a
97
+ # concurrency slot the whole time. Wired to the `discard.active_job`
98
+ # notification by the railtie. Idempotent: a no-op when no row exists
99
+ # (e.g. discard fired after track already deleted it).
100
+ def self.handle_discard(job)
101
+ return unless job
102
+ return unless job.class.respond_to?(:dispatch_policy_name) && job.class.dispatch_policy_name
103
+
104
+ Repository.delete_inflight!(active_job_id: job.job_id)
105
+ rescue StandardError => e
106
+ DispatchPolicy.config.logger&.warn(
107
+ "[dispatch_policy] failed to clean up inflight row for discarded job #{job&.job_id}: #{e.class}: #{e.message}"
108
+ )
109
+ end
110
+
78
111
  # Reads the admitted_at column from the inflight row that the Tick
79
112
  # pre-inserted. Used as the start-of-queue-wait reference for the
80
113
  # adaptive_concurrency feedback signal (queue_lag = perform_start
81
114
  # - admitted_at). nil if the row vanished or the lookup fails —
82
115
  # the observation is then skipped.
83
116
  def self.lookup_admitted_at(active_job_id)
84
- result = ActiveRecord::Base.connection.exec_query(
85
- "SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
86
- "lookup_admitted_at",
87
- [active_job_id]
88
- )
117
+ # Route through config.database_role: the inflight row lives in the
118
+ # same DB the Tick pre-inserted it into, which under multi-DB is the
119
+ # queue DB, not the default writing role of the worker process.
120
+ result = Repository.with_connection do
121
+ ActiveRecord::Base.connection.exec_query(
122
+ "SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
123
+ "lookup_admitted_at",
124
+ [active_job_id]
125
+ )
126
+ end
89
127
  row = result.first
90
128
  return nil unless row
91
129
  ts = row["admitted_at"]
@@ -147,8 +185,16 @@ module DispatchPolicy
147
185
  break if stop_flag.true?
148
186
 
149
187
  begin
150
- ActiveRecord::Base.connection_pool.with_connection do
151
- Repository.heartbeat_inflight!(active_job_id: active_job_id)
188
+ # Establish config.database_role inside this thread BEFORE the
189
+ # checkout: under multi-DB, connection_pool must resolve to the
190
+ # role's pool (where the inflight row lives), not the default
191
+ # writing pool. with_connection swaps the role thread-locally;
192
+ # the nested pool checkout then borrows/returns a dedicated
193
+ # connection from that pool per beat.
194
+ Repository.with_connection do
195
+ ActiveRecord::Base.connection_pool.with_connection do
196
+ Repository.heartbeat_inflight!(active_job_id: active_job_id)
197
+ end
152
198
  end
153
199
  rescue StandardError => e
154
200
  DispatchPolicy.config.logger&.warn("[dispatch_policy] heartbeat #{active_job_id} failed: #{e.class}: #{e.message}")
@@ -71,6 +71,25 @@ module DispatchPolicy
71
71
  (job.respond_to?(:executions) ? job.executions.to_i : 0).positive?
72
72
  end
73
73
 
74
+ # Whether a job should be staged through admission control rather than
75
+ # handed straight to the adapter. Mirrors the single-enqueue decision in
76
+ # around_enqueue_for: it needs a registered policy and must not be a
77
+ # retry on a bypass_retries policy. Used by the bulk path to split jobs
78
+ # so the ones we don't own fall through to the adapter instead of being
79
+ # silently dropped.
80
+ def self.stageable?(job)
81
+ return false unless job.class.respond_to?(:dispatch_policy_name)
82
+
83
+ name = job.class.dispatch_policy_name
84
+ return false unless name
85
+
86
+ policy = DispatchPolicy.registry.fetch(name)
87
+ return false unless policy
88
+ return false if retry_attempt?(job) && policy.bypass_retries?
89
+
90
+ true
91
+ end
92
+
74
93
  def self.scheduled_time(job)
75
94
  ts = job.scheduled_at
76
95
  return nil if ts.nil?
@@ -111,17 +130,19 @@ module DispatchPolicy
111
130
  return super unless DispatchPolicy.config.enabled
112
131
  return super unless DispatchPolicy.registry.size.positive?
113
132
 
114
- with_policy, without_policy = flat.partition do |j|
115
- j.class.respond_to?(:dispatch_policy_name) && j.class.dispatch_policy_name
116
- end
133
+ # Split exactly like the single path decides: jobs we own get staged,
134
+ # everything else (no policy, unregistered policy name, or a retry on
135
+ # a bypass_retries policy) goes straight to the adapter. Dropping them
136
+ # — as a `next unless policy` inside the row builder would — silently
137
+ # loses jobs the caller expected to be enqueued.
138
+ to_stage, to_adapter = flat.partition { |job| JobExtension.stageable?(job) }
117
139
 
118
- super(without_policy) if without_policy.any?
140
+ super(to_adapter) if to_adapter.any?
119
141
 
120
- return nil if with_policy.empty?
142
+ return nil if to_stage.empty?
121
143
 
122
- rows = with_policy.filter_map do |job|
144
+ rows = to_stage.map do |job|
123
145
  policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
124
- next unless policy
125
146
 
126
147
  # See JobExtension.ensure_arguments_materialized! — we need this
127
148
  # for the same reason as the single-enqueue path.
@@ -132,7 +153,6 @@ module DispatchPolicy
132
153
  partition_key = policy.partition_key_for(ctx)
133
154
  shard = policy.shard_for(ctx)
134
155
  payload = Serializer.serialize(job)
135
- job.successfully_enqueued = true
136
156
 
137
157
  {
138
158
  policy_name: policy.name,
@@ -147,7 +167,11 @@ module DispatchPolicy
147
167
  }
148
168
  end
149
169
 
150
- Repository.stage_many!(rows) if rows.any?
170
+ # Only mark enqueued AFTER the INSERT commits. If stage_many! raises,
171
+ # a caller that rescues and inspects successfully_enqueued? must not
172
+ # be told the jobs were enqueued when they weren't.
173
+ Repository.stage_many!(rows)
174
+ to_stage.each { |job| job.successfully_enqueued = true }
151
175
  nil # ActiveJob.perform_all_later contract returns nil
152
176
  end
153
177
  end
@@ -39,6 +39,24 @@ module DispatchPolicy
39
39
  next if rows.empty?
40
40
 
41
41
  rows.each { |row| row["job_data"]["job_id"] = SecureRandom.uuid }
42
+
43
+ # Pre-insert an inflight row per admitted job, exactly like
44
+ # Tick#admit_partition does. Without it the concurrency gate's
45
+ # COUNT(*) misses these jobs until each one starts performing and
46
+ # InflightTracker.track inserts its own row — an over-admission
47
+ # window proportional to how many jobs were force-admitted. The
48
+ # key is the canonical partition value, which for a policy-level
49
+ # partition_by is exactly the staged partition_key (see
50
+ # Concurrency#inflight_partition_key). Runs inside the same TX, so
51
+ # a rolled-back claim takes the inflight rows with it.
52
+ inflight_rows = rows.filter_map do |row|
53
+ ajid = row.dig("job_data", "job_id")
54
+ next unless ajid
55
+
56
+ { policy_name: policy_name, partition_key: partition_key, active_job_id: ajid }
57
+ end
58
+ Repository.insert_inflight!(inflight_rows) if inflight_rows.any?
59
+
42
60
  Forwarder.dispatch(rows)
43
61
  forwarded = rows.size
44
62
  end
@@ -29,10 +29,24 @@ module DispatchPolicy
29
29
  # in_backoff: int
30
30
  # total_partitions: int
31
31
  # adapter_target_jps: int|nil (config.adapter_throughput_target)
32
+ # paused: bool (policy-level pause flag)
32
33
  def for(metrics)
33
34
  hints = []
34
35
  m = metrics
35
36
 
37
+ # ---- policy paused: everything below presumes admission SHOULD be
38
+ # flowing (never_checked, drain time, pending growing), so during a
39
+ # deliberate pause those hints turn into false alarms — e.g.
40
+ # "increase partition_batch_size" while the tick is intentionally
41
+ # skipping the policy. State the pause and stop.
42
+ if m[:paused]
43
+ return [Hint.new(
44
+ level: :warn,
45
+ message: "Policy is paused — admission is stopped while staging continues " \
46
+ "(pending keeps growing). Resume to drain."
47
+ )]
48
+ end
49
+
36
50
  # ---- tick approaching deadline ---------------------------------
37
51
  if m[:tick_max_duration_ms].to_i.positive? && m[:avg_tick_ms].to_i.positive?
38
52
  ratio = m[:avg_tick_ms].to_f / m[:tick_max_duration_ms]
@@ -78,6 +78,18 @@ module DispatchPolicy
78
78
  unless %i[restage bypass].include?(@retry_strategy)
79
79
  raise InvalidPolicy, "retry_strategy must be :restage or :bypass"
80
80
  end
81
+ # Two gates of the same type would persist their state under the same
82
+ # gate_state key (e.g. both throttles share gate_state["throttle"]),
83
+ # silently corrupting each other: the merged patch keeps only the last
84
+ # gate's bucket, and on the next tick the other gate clamps that count
85
+ # to its own capacity and sees a permanently full bucket. Reject it at
86
+ # definition time — multi-window rate limiting needs separate policies.
87
+ duplicate = @gates.map(&:name).tally.find { |_, count| count > 1 }
88
+ if duplicate
89
+ raise InvalidPolicy,
90
+ "duplicate #{duplicate.first.inspect} gate: a policy may declare each gate " \
91
+ "type at most once (use separate policies for multi-window limits)"
92
+ end
81
93
  # Note: gates are NOT required. A policy with no gates uses the
82
94
  # admission_batch_size (or tick_admission_budget when set) as its
83
95
  # only ceiling, with the in-tick fairness reorder distributing
@@ -45,7 +45,11 @@ module DispatchPolicy
45
45
  end
46
46
 
47
47
  def admission_batch_size(size)
48
- @admission_batch_size = Integer(size) if size
48
+ return if size.nil?
49
+
50
+ value = Integer(size)
51
+ raise InvalidPolicy, "admission_batch_size must be > 0 (got #{value})" unless value.positive?
52
+ @admission_batch_size = value
49
53
  end
50
54
 
51
55
  # Per-policy override for the EWMA half-life used to weigh recent
@@ -62,7 +66,11 @@ module DispatchPolicy
62
66
  # nil, no global cap is enforced and per-partition admission_batch_size
63
67
  # is the only ceiling.
64
68
  def tick_admission_budget(value)
65
- @tick_admission_budget = Integer(value) if value
69
+ return if value.nil?
70
+
71
+ budget = Integer(value)
72
+ raise InvalidPolicy, "tick_admission_budget must be > 0 (got #{budget})" unless budget.positive?
73
+ @tick_admission_budget = budget
66
74
  end
67
75
 
68
76
  # Defines the partition scope. Required — every policy declares
@@ -19,6 +19,16 @@ module DispatchPolicy
19
19
  end
20
20
  end
21
21
 
22
+ # Reap the inflight row when a job is discarded before its perform
23
+ # callbacks run (e.g. discard_on ActiveJob::DeserializationError):
24
+ # InflightTracker.track's `ensure` never fires in that path, so the
25
+ # Tick's pre-inserted row would orphan until the stale sweeper.
26
+ initializer "dispatch_policy.discard_cleanup" do
27
+ ActiveSupport::Notifications.subscribe("discard.active_job") do |event|
28
+ DispatchPolicy::InflightTracker.handle_discard(event.payload[:job])
29
+ end
30
+ end
31
+
22
32
  # Hosts copy the gem's migration into their own db/migrate via
23
33
  # `rails railties:install:migrations` (or hand-write a cutover
24
34
  # migration like opstasks did). We deliberately do NOT auto-merge