RubyGems - dispatch_policy - Versions diffs - 0.4.3 → 0.5.0 - Mend

dispatch_policy 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

checksums.yaml +4 -4
data/CHANGELOG.md +185 -0
data/README.md +30 -7
data/app/controllers/dispatch_policy/application_controller.rb +21 -2
data/app/controllers/dispatch_policy/dashboard_controller.rb +3 -0
data/app/controllers/dispatch_policy/partitions_controller.rb +51 -15
data/app/controllers/dispatch_policy/policies_controller.rb +26 -4
data/app/models/dispatch_policy/policy_setting.rb +14 -0
data/app/views/dispatch_policy/dashboard/index.html.erb +6 -1
data/app/views/dispatch_policy/partitions/index.html.erb +1 -1
data/app/views/dispatch_policy/partitions/show.html.erb +1 -1
data/app/views/dispatch_policy/policies/index.html.erb +11 -3
data/app/views/dispatch_policy/policies/show.html.erb +13 -4
data/app/views/dispatch_policy/shared/_partition_row.html.erb +9 -2
data/app/views/layouts/dispatch_policy/application.html.erb +21 -25
data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +13 -0
data/lib/dispatch_policy/config.rb +5 -0
data/lib/dispatch_policy/context.rb +12 -2
data/lib/dispatch_policy/cursor_pagination.rb +24 -7
data/lib/dispatch_policy/gates/adaptive_concurrency.rb +14 -0
data/lib/dispatch_policy/gates/concurrency.rb +4 -0
data/lib/dispatch_policy/gates/throttle.rb +36 -9
data/lib/dispatch_policy/inflight_tracker.rb +72 -26
data/lib/dispatch_policy/job_extension.rb +33 -9
data/lib/dispatch_policy/manual_admission.rb +18 -0
data/lib/dispatch_policy/operator_hints.rb +14 -0
data/lib/dispatch_policy/policy.rb +12 -0
data/lib/dispatch_policy/policy_dsl.rb +10 -2
data/lib/dispatch_policy/railtie.rb +10 -0
data/lib/dispatch_policy/registry.rb +8 -4
data/lib/dispatch_policy/repository.rb +102 -30
data/lib/dispatch_policy/tick.rb +18 -2
data/lib/dispatch_policy/tick_loop.rb +15 -7
data/lib/dispatch_policy/version.rb +1 -1
data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +9 -0
data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +30 -2
metadata +2 -1

data/app/views/dispatch_policy/shared/_partition_row.html.erb CHANGED Viewed

@@ -1,9 +1,16 @@
-<tr id="partition_<%= partition.id %>" class="<%= "dp-row-paused" if partition.paused? %>">
+<% policy_paused = local_assigns.fetch(:policy_paused, false) %>
+<tr id="partition_<%= partition.id %>" class="<%= "dp-row-paused" if partition.paused? || policy_paused %>">
   <td><%= link_to partition.policy_name, policy_path(partition.policy_name), class: "dp-link" %></td>
   <td><code><%= partition.shard %></code></td>
   <td><%= partition.queue_name || "—" %></td>
   <td><%= link_to partition.partition_key, partition_path(partition), class: "dp-link" %></td>
-  <td><%= partition.status %></td>
+  <td>
+    <% if policy_paused && !partition.paused? %>
+      <%= partition.status %> <span class="dp-warn" style="font-size:11px;">(policy paused)</span>
+    <% else %>
+      <%= partition.status %>
+    <% end %>
+  </td>
   <td class="dp-num"><%= format_count(partition.pending_count) %></td>
   <td class="dp-num"><%= format_count(partition.total_admitted) %></td>
   <td><%= format_time(partition.next_eligible_at) %></td>

data/app/views/layouts/dispatch_policy/application.html.erb CHANGED Viewed

@@ -70,20 +70,19 @@
           });
         }
-        function bindControls() {
-          document.querySelectorAll("[data-dp-refresh]").forEach(function (btn) {
-            if (btn.dataset.bound) return;
-            btn.dataset.bound = "1";
-            btn.addEventListener("click", function (e) {
-              e.preventDefault();
-              setInterval(parseInt(btn.getAttribute("data-dp-refresh"), 10));
-            });
-          });
-          syncControls();
-        }
+        // Delegate on document, registered ONCE. Per-button listeners with a
+        // data-bound guard leaked: Turbo's morph refresh preserves the button
+        // node but drops the data-bound attribute (it's not in the server
+        // HTML), so every turbo:load re-bound and clicks fired N+1 times.
+        document.addEventListener("click", function (e) {
+          var btn = e.target.closest("[data-dp-refresh]");
+          if (!btn) return;
+          e.preventDefault();
+          setInterval(parseInt(btn.getAttribute("data-dp-refresh"), 10));
+        });
         function init() {
-          bindControls();
+          syncControls();
           restart();
         }
@@ -140,20 +139,17 @@
           });
         }
-        function bindControls() {
-          document.querySelectorAll("[data-dp-theme]").forEach(function (btn) {
-            if (btn.dataset.bound) return;
-            btn.dataset.bound = "1";
-            btn.addEventListener("click", function (e) {
-              e.preventDefault();
-              setTheme(btn.getAttribute("data-dp-theme"));
-            });
-          });
-          syncControls();
-        }
+        // Delegate on document, registered ONCE (see the refresh control
+        // above for why per-button binding leaks under Turbo morph).
+        document.addEventListener("click", function (e) {
+          var btn = e.target.closest("[data-dp-theme]");
+          if (!btn) return;
+          e.preventDefault();
+          setTheme(btn.getAttribute("data-dp-theme"));
+        });
-        document.addEventListener("DOMContentLoaded", bindControls);
-        document.addEventListener("turbo:load",       bindControls);
+        document.addEventListener("DOMContentLoaded", syncControls);
+        document.addEventListener("turbo:load",       syncControls);
       })();
     </script>
   </head>

data/db/migrate/20260501000001_create_dispatch_policy_tables.rb CHANGED Viewed

@@ -99,5 +99,18 @@ class CreateDispatchPolicyTables < ActiveRecord::Migration[7.1]
               [:policy_name, :partition_key],
               unique: true,
               name:   "idx_dp_adaptive_concurrency_lookup"
+    # Policy-level settings. Currently just the pause flag: a paused policy
+    # admits nothing, INCLUDING partitions created after the pause (the
+    # tick's claim checks this row). The partitions' own `status` is a
+    # per-partition display concern; this is the policy-wide source of truth.
+    create_table :dispatch_policy_policy_settings do |t|
+      t.string   :policy_name, null: false
+      t.boolean  :paused,      null: false, default: false
+      t.timestamps
+    end
+    add_index :dispatch_policy_policy_settings, :policy_name,
+              unique: true,
+              name:   "idx_dp_policy_settings_lookup"
   end
 end

data/lib/dispatch_policy/config.rb CHANGED Viewed

@@ -51,6 +51,11 @@ module DispatchPolicy
       # assuming the admission was lost. Raise it if your adapter backlog
       # can exceed an hour.
       @inflight_queued_stale_after = 60 * 60
+      # Seconds between heartbeat_at refreshes. Each beat briefly checks out
+      # an EXTRA connection (one per running job) from the role's pool, so
+      # the DB pool needs headroom above the worker concurrency — otherwise
+      # beats hit ConnectionTimeoutError and long jobs get swept as stale.
+      # Set to 0 to disable the heartbeat thread entirely.
       @inflight_heartbeat_interval = 30
       @real_adapter              = nil
       @logger                    = nil

data/lib/dispatch_policy/context.rb CHANGED Viewed

@@ -19,7 +19,7 @@ module DispatchPolicy
     end
     def [](key)
-      @data[key.to_s]
+      indifferent(@data[key.to_s])
     end
     def to_h
@@ -31,11 +31,21 @@ module DispatchPolicy
     end
     def fetch(key, *args, &block)
-      @data.fetch(key.to_s, *args, &block)
+      indifferent(@data.fetch(key.to_s, *args, &block))
     end
     private
+    # Nested hashes are stored string-keyed (deep_stringify), so
+    # `ctx[:limits][:max]` would miss — the inner lookup is by symbol.
+    # Return nested hashes with indifferent access so symbol and string
+    # keys work at every depth, matching how host apps usually write
+    # context. to_h/to_jsonb still return the plain string-keyed hash for
+    # storage, untouched.
+    def indifferent(value)
+      value.is_a?(Hash) ? value.with_indifferent_access : value
+    end
     def deep_stringify(value)
       case value
       when Hash

data/lib/dispatch_policy/cursor_pagination.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require "base64"
 require "json"
+require "time"
 module DispatchPolicy
   # Tiny keyset-pagination helper for the engine UI. Each sort mode declares
@@ -86,13 +87,22 @@ module DispatchPolicy
       value, last_id = cursor
       # Ignore a cursor whose value type can't be compared against this
-      # sort's column. The numeric columns (pending_count, total_admitted)
-      # need a Numeric; everything else compares as text (partition_key, or
-      # the ISO8601 timestamps emitted by #extract). A mismatch — e.g. a
-      # numeric value forged for a timestamp sort — would raise PG error;
-      # instead we fall back to the first page.
-      numeric_column = %w[pending_count total_admitted].include?(sort[:cursor_sql])
-      return scope unless numeric_column ? value.is_a?(Numeric) : value.is_a?(String)
+      # sort's column. A mismatch — e.g. a numeric value forged for a
+      # timestamp sort — would raise a PG error; instead we fall back to the
+      # first page.
+      numeric_column   = %w[pending_count total_admitted].include?(sort[:cursor_sql])
+      timestamp_column = sort[:cursor_sql].start_with?("COALESCE(")
+      if numeric_column
+        return scope unless value.is_a?(Numeric)
+      elsif timestamp_column
+        # Bound against a timestamp column: a non-parseable string (e.g. a
+        # hand-forged "zzz") would raise `invalid input syntax for type
+        # timestamp` and 500. Require a real ISO8601 value — exactly what
+        # #extract emits — or fall back to the first page.
+        return scope unless value.is_a?(String) && parseable_timestamp?(value)
+      else
+        return scope unless value.is_a?(String)
+      end
       case sort[:direction]
       when :desc
@@ -133,5 +143,12 @@ module DispatchPolicy
       else v
       end
     end
+    def parseable_timestamp?(str)
+      Time.iso8601(str)
+      true
+    rescue ArgumentError, TypeError
+      false
+    end
   end
 end

data/lib/dispatch_policy/gates/adaptive_concurrency.rb CHANGED Viewed

@@ -45,6 +45,20 @@ module DispatchPolicy
         raise ArgumentError, "target_lag_ms must be > 0" unless @target_lag_ms.positive?
         raise ArgumentError, "min must be >= 1"          unless @min >= 1
         raise ArgumentError, "initial_max must be >= min" unless @initial_max >= @min
+        # Out-of-range tuning knobs invert the AIMD loop instead of erroring:
+        # alpha=0 freezes the EWMA at its seed so the cap grows unbounded;
+        # a decrease factor >= 1 turns the multiplicative *decrease* into an
+        # increase, a positive-feedback loop under failure/overload.
+        unless @ewma_alpha > 0 && @ewma_alpha <= 1
+          raise ArgumentError, "ewma_alpha must be in (0, 1]"
+        end
+        unless @fail_factor > 0 && @fail_factor < 1
+          raise ArgumentError, "failure_decrease_factor must be in (0, 1)"
+        end
+        unless @slow_factor > 0 && @slow_factor < 1
+          raise ArgumentError, "overload_decrease_factor must be in (0, 1)"
+        end
+        raise ArgumentError, "full_backoff must be >= 0" if @full_backoff.negative?
       end
       def name

data/lib/dispatch_policy/gates/concurrency.rb CHANGED Viewed

@@ -17,6 +17,10 @@ module DispatchPolicy
         super()
         @max_proc     = max.respond_to?(:call) ? max : ->(_ctx) { max }
         @full_backoff = full_backoff.to_f
+        # A negative backoff sets next_eligible_at in the past, so the
+        # partition is re-evaluated (COUNT(*)) every tick — the opposite of
+        # what full_backoff is for.
+        raise ArgumentError, "full_backoff must be >= 0" if @full_backoff.negative?
       end
       def name

data/lib/dispatch_policy/gates/throttle.rb CHANGED Viewed

@@ -14,13 +14,20 @@ module DispatchPolicy
     # The bucket lives on the staged partition row — one row per
     # `policy.partition_for(ctx)` value, one bucket per row, no dilution.
     class Throttle < Gate
-      attr_reader :rate_proc, :per
+      attr_reader :rate_proc, :per_proc
       def initialize(rate:, per:)
         super()
         @rate_proc = rate.respond_to?(:call) ? rate : ->(_ctx) { rate }
-        @per       = duration_seconds(per)
-        raise ArgumentError, "throttle :per must be > 0 (got #{@per})" unless @per.positive?
+        if per.respond_to?(:call)
+          # Dynamic window (per-ctx), symmetric with a dynamic rate. Validated
+          # per-evaluate since the value isn't known until admission time.
+          @per_proc = ->(ctx) { duration_seconds(per.call(ctx)) }
+        else
+          fixed = duration_seconds(per)
+          raise ArgumentError, "throttle :per must be > 0 (got #{fixed})" unless fixed.positive?
+          @per_proc = ->(_ctx) { fixed }
+        end
       end
       def name
@@ -28,16 +35,27 @@ module DispatchPolicy
       end
       def evaluate(ctx, partition, admit_budget)
-        capacity = capacity_for(ctx)
-        return Decision.deny(reason: "rate=0") if capacity <= 0
+        per  = per_for(ctx)
+        rate = rate_for(ctx)
+        # rate <= 0 (e.g. a paused tenant) backs off for one window instead
+        # of denying with a NULL retry_after. A NULL retry_after leaves the
+        # partition immediately eligible, so it would be re-claimed and
+        # re-evaluated every single tick — a busy-loop that also clobbers any
+        # backoff a prior tick had set.
+        return Decision.deny(retry_after: per, reason: "rate=0") if rate <= 0
-        refill_rate = capacity.to_f / @per
+        # The bucket holds at least one whole token; otherwise a sub-unit rate
+        # (e.g. rate: 0.5) could never accumulate a full token and would never
+        # admit. refill_rate stays at the true `rate` so the long-run pace is
+        # exact — the floor only sets the burst ceiling.
+        capacity    = [rate, 1.0].max
+        refill_rate = rate / per
         state       = (partition["gate_state"] || {})["throttle"] || {}
         tokens      = (state["tokens"] || capacity).to_f
         refilled_at = (state["refilled_at"] || now).to_f
         elapsed     = [now - refilled_at, 0.0].max
-        tokens      = [tokens + (elapsed * refill_rate), capacity.to_f].min
+        tokens      = [tokens + (elapsed * refill_rate), capacity].min
         # The patch records the post-refill bucket WITHOUT deducting yet.
         # The actual deduction is deferred to #consume, which runs once
@@ -77,9 +95,18 @@ module DispatchPolicy
       private
-      def capacity_for(ctx)
+      def per_for(ctx)
+        value = @per_proc.call(ctx)
+        raise ArgumentError, "throttle :per must be > 0 (got #{value})" unless value.positive?
+        value
+      end
+      def rate_for(ctx)
         value = @rate_proc.call(ctx)
-        value.nil? ? 0 : Integer(value)
+        # Float, not Integer: a fractional rate (e.g. 2.5/sec) must keep its
+        # fractional part or the bucket systematically under-admits by
+        # truncating every refill. nil means "no rate configured" → deny.
+        value.nil? ? 0.0 : Float(value)
       end
       def now

data/lib/dispatch_policy/inflight_tracker.rb CHANGED Viewed

@@ -39,33 +39,45 @@ module DispatchPolicy
       ctx           = policy.build_context(job.arguments, queue_name: queue_name)
       partition_key = policy.partition_key_for(ctx)
-      Repository.insert_inflight!([{
-        policy_name:    policy.name,
-        partition_key:  partition_key,
-        active_job_id:  job.job_id
-      }])
       adaptive_gates = policy.gates.select { |g| g.name == :adaptive_concurrency }
-      admitted_at    = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
-      perform_start  = Time.current
+      admitted_at    = nil
+      perform_start  = nil
+      heartbeat      = nil
+      started        = false
+      succeeded      = false
+      # insert + heartbeat spawn live INSIDE the begin so the ensure always
+      # cleans up: if start_heartbeat (Thread.new) raises after the row is
+      # inserted, the row would otherwise orphan until the stale sweeper.
+      begin
+        Repository.insert_inflight!([{
+          policy_name:    policy.name,
+          partition_key:  partition_key,
+          active_job_id:  job.job_id
+        }])
-      heartbeat = start_heartbeat(job.job_id)
+        admitted_at   = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
+        perform_start = Time.current
+        heartbeat     = start_heartbeat(job.job_id)
-      succeeded = false
-      begin
+        started = true
         yield
         succeeded = true
       ensure
         stop_heartbeat(heartbeat)
-        record_adaptive_observations(
-          policy:        policy,
-          gates:         adaptive_gates,
-          partition_key: partition_key,
-          admitted_at:   admitted_at,
-          perform_start: perform_start,
-          succeeded:     succeeded
-        )
+        # Only record an observation if we actually reached perform — a
+        # failure in setup (insert / heartbeat spawn) isn't a perform result.
+        if started
+          record_adaptive_observations(
+            policy:        policy,
+            gates:         adaptive_gates,
+            partition_key: partition_key,
+            admitted_at:   admitted_at,
+            perform_start: perform_start,
+            succeeded:     succeeded
+          )
+        end
         begin
           Repository.delete_inflight!(active_job_id: job.job_id)
@@ -75,17 +87,43 @@ module DispatchPolicy
       end
     end
+    # Deletes the inflight row for a job that ActiveJob discarded BEFORE
+    # around_perform ran — most commonly an ActiveJob::DeserializationError
+    # (a GlobalID whose record was deleted) on a job with
+    # `discard_on ActiveJob::DeserializationError`. Argument deserialization
+    # happens before the perform callbacks, so track's `ensure` never runs
+    # and the row the Tick pre-inserted would otherwise sit until the
+    # `inflight_queued_stale_after` sweeper reaps it (default 1h), holding a
+    # concurrency slot the whole time. Wired to the `discard.active_job`
+    # notification by the railtie. Idempotent: a no-op when no row exists
+    # (e.g. discard fired after track already deleted it).
+    def self.handle_discard(job)
+      return unless job
+      return unless job.class.respond_to?(:dispatch_policy_name) && job.class.dispatch_policy_name
+      Repository.delete_inflight!(active_job_id: job.job_id)
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.warn(
+        "[dispatch_policy] failed to clean up inflight row for discarded job #{job&.job_id}: #{e.class}: #{e.message}"
+      )
+    end
     # Reads the admitted_at column from the inflight row that the Tick
     # pre-inserted. Used as the start-of-queue-wait reference for the
     # adaptive_concurrency feedback signal (queue_lag = perform_start
     # - admitted_at). nil if the row vanished or the lookup fails —
     # the observation is then skipped.
     def self.lookup_admitted_at(active_job_id)
-      result = ActiveRecord::Base.connection.exec_query(
-        "SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
-        "lookup_admitted_at",
-        [active_job_id]
-      )
+      # Route through config.database_role: the inflight row lives in the
+      # same DB the Tick pre-inserted it into, which under multi-DB is the
+      # queue DB, not the default writing role of the worker process.
+      result = Repository.with_connection do
+        ActiveRecord::Base.connection.exec_query(
+          "SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
+          "lookup_admitted_at",
+          [active_job_id]
+        )
+      end
       row = result.first
       return nil unless row
       ts = row["admitted_at"]
@@ -147,8 +185,16 @@ module DispatchPolicy
           break if stop_flag.true?
           begin
-            ActiveRecord::Base.connection_pool.with_connection do
-              Repository.heartbeat_inflight!(active_job_id: active_job_id)
+            # Establish config.database_role inside this thread BEFORE the
+            # checkout: under multi-DB, connection_pool must resolve to the
+            # role's pool (where the inflight row lives), not the default
+            # writing pool. with_connection swaps the role thread-locally;
+            # the nested pool checkout then borrows/returns a dedicated
+            # connection from that pool per beat.
+            Repository.with_connection do
+              ActiveRecord::Base.connection_pool.with_connection do
+                Repository.heartbeat_inflight!(active_job_id: active_job_id)
+              end
             end
           rescue StandardError => e
             DispatchPolicy.config.logger&.warn("[dispatch_policy] heartbeat #{active_job_id} failed: #{e.class}: #{e.message}")

data/lib/dispatch_policy/job_extension.rb CHANGED Viewed

@@ -71,6 +71,25 @@ module DispatchPolicy
       (job.respond_to?(:executions) ? job.executions.to_i : 0).positive?
     end
+    # Whether a job should be staged through admission control rather than
+    # handed straight to the adapter. Mirrors the single-enqueue decision in
+    # around_enqueue_for: it needs a registered policy and must not be a
+    # retry on a bypass_retries policy. Used by the bulk path to split jobs
+    # so the ones we don't own fall through to the adapter instead of being
+    # silently dropped.
+    def self.stageable?(job)
+      return false unless job.class.respond_to?(:dispatch_policy_name)
+      name = job.class.dispatch_policy_name
+      return false unless name
+      policy = DispatchPolicy.registry.fetch(name)
+      return false unless policy
+      return false if retry_attempt?(job) && policy.bypass_retries?
+      true
+    end
     def self.scheduled_time(job)
       ts = job.scheduled_at
       return nil if ts.nil?
@@ -111,17 +130,19 @@ module DispatchPolicy
         return super unless DispatchPolicy.config.enabled
         return super unless DispatchPolicy.registry.size.positive?
-        with_policy, without_policy = flat.partition do |j|
-          j.class.respond_to?(:dispatch_policy_name) && j.class.dispatch_policy_name
-        end
+        # Split exactly like the single path decides: jobs we own get staged,
+        # everything else (no policy, unregistered policy name, or a retry on
+        # a bypass_retries policy) goes straight to the adapter. Dropping them
+        # — as a `next unless policy` inside the row builder would — silently
+        # loses jobs the caller expected to be enqueued.
+        to_stage, to_adapter = flat.partition { |job| JobExtension.stageable?(job) }
-        super(without_policy) if without_policy.any?
+        super(to_adapter) if to_adapter.any?
-        return nil if with_policy.empty?
+        return nil if to_stage.empty?
-        rows = with_policy.filter_map do |job|
+        rows = to_stage.map do |job|
           policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
-          next unless policy
           # See JobExtension.ensure_arguments_materialized! — we need this
           # for the same reason as the single-enqueue path.
@@ -132,7 +153,6 @@ module DispatchPolicy
           partition_key = policy.partition_key_for(ctx)
           shard         = policy.shard_for(ctx)
           payload       = Serializer.serialize(job)
-          job.successfully_enqueued = true
           {
             policy_name:   policy.name,
@@ -147,7 +167,11 @@ module DispatchPolicy
           }
         end
-        Repository.stage_many!(rows) if rows.any?
+        # Only mark enqueued AFTER the INSERT commits. If stage_many! raises,
+        # a caller that rescues and inspects successfully_enqueued? must not
+        # be told the jobs were enqueued when they weren't.
+        Repository.stage_many!(rows)
+        to_stage.each { |job| job.successfully_enqueued = true }
         nil # ActiveJob.perform_all_later contract returns nil
       end
     end

data/lib/dispatch_policy/manual_admission.rb CHANGED Viewed

@@ -39,6 +39,24 @@ module DispatchPolicy
           next if rows.empty?
           rows.each { |row| row["job_data"]["job_id"] = SecureRandom.uuid }
+          # Pre-insert an inflight row per admitted job, exactly like
+          # Tick#admit_partition does. Without it the concurrency gate's
+          # COUNT(*) misses these jobs until each one starts performing and
+          # InflightTracker.track inserts its own row — an over-admission
+          # window proportional to how many jobs were force-admitted. The
+          # key is the canonical partition value, which for a policy-level
+          # partition_by is exactly the staged partition_key (see
+          # Concurrency#inflight_partition_key). Runs inside the same TX, so
+          # a rolled-back claim takes the inflight rows with it.
+          inflight_rows = rows.filter_map do |row|
+            ajid = row.dig("job_data", "job_id")
+            next unless ajid
+            { policy_name: policy_name, partition_key: partition_key, active_job_id: ajid }
+          end
+          Repository.insert_inflight!(inflight_rows) if inflight_rows.any?
           Forwarder.dispatch(rows)
           forwarded = rows.size
         end

data/lib/dispatch_policy/operator_hints.rb CHANGED Viewed

@@ -29,10 +29,24 @@ module DispatchPolicy
     #   in_backoff:             int
     #   total_partitions:       int
     #   adapter_target_jps:     int|nil  (config.adapter_throughput_target)
+    #   paused:                 bool (policy-level pause flag)
     def for(metrics)
       hints = []
       m     = metrics
+      # ---- policy paused: everything below presumes admission SHOULD be
+      # flowing (never_checked, drain time, pending growing), so during a
+      # deliberate pause those hints turn into false alarms — e.g.
+      # "increase partition_batch_size" while the tick is intentionally
+      # skipping the policy. State the pause and stop.
+      if m[:paused]
+        return [Hint.new(
+          level: :warn,
+          message: "Policy is paused — admission is stopped while staging continues " \
+                   "(pending keeps growing). Resume to drain."
+        )]
+      end
       # ---- tick approaching deadline ---------------------------------
       if m[:tick_max_duration_ms].to_i.positive? && m[:avg_tick_ms].to_i.positive?
         ratio = m[:avg_tick_ms].to_f / m[:tick_max_duration_ms]

data/lib/dispatch_policy/policy.rb CHANGED Viewed

@@ -78,6 +78,18 @@ module DispatchPolicy
       unless %i[restage bypass].include?(@retry_strategy)
         raise InvalidPolicy, "retry_strategy must be :restage or :bypass"
       end
+      # Two gates of the same type would persist their state under the same
+      # gate_state key (e.g. both throttles share gate_state["throttle"]),
+      # silently corrupting each other: the merged patch keeps only the last
+      # gate's bucket, and on the next tick the other gate clamps that count
+      # to its own capacity and sees a permanently full bucket. Reject it at
+      # definition time — multi-window rate limiting needs separate policies.
+      duplicate = @gates.map(&:name).tally.find { |_, count| count > 1 }
+      if duplicate
+        raise InvalidPolicy,
+              "duplicate #{duplicate.first.inspect} gate: a policy may declare each gate " \
+              "type at most once (use separate policies for multi-window limits)"
+      end
       # Note: gates are NOT required. A policy with no gates uses the
       # admission_batch_size (or tick_admission_budget when set) as its
       # only ceiling, with the in-tick fairness reorder distributing

data/lib/dispatch_policy/policy_dsl.rb CHANGED Viewed

@@ -45,7 +45,11 @@ module DispatchPolicy
     end
     def admission_batch_size(size)
-      @admission_batch_size = Integer(size) if size
+      return if size.nil?
+      value = Integer(size)
+      raise InvalidPolicy, "admission_batch_size must be > 0 (got #{value})" unless value.positive?
+      @admission_batch_size = value
     end
     # Per-policy override for the EWMA half-life used to weigh recent
@@ -62,7 +66,11 @@ module DispatchPolicy
     # nil, no global cap is enforced and per-partition admission_batch_size
     # is the only ceiling.
     def tick_admission_budget(value)
-      @tick_admission_budget = Integer(value) if value
+      return if value.nil?
+      budget = Integer(value)
+      raise InvalidPolicy, "tick_admission_budget must be > 0 (got #{budget})" unless budget.positive?
+      @tick_admission_budget = budget
     end
     # Defines the partition scope. Required — every policy declares

data/lib/dispatch_policy/railtie.rb CHANGED Viewed

@@ -19,6 +19,16 @@ module DispatchPolicy
       end
     end
+    # Reap the inflight row when a job is discarded before its perform
+    # callbacks run (e.g. discard_on ActiveJob::DeserializationError):
+    # InflightTracker.track's `ensure` never fires in that path, so the
+    # Tick's pre-inserted row would orphan until the stale sweeper.
+    initializer "dispatch_policy.discard_cleanup" do
+      ActiveSupport::Notifications.subscribe("discard.active_job") do |event|
+        DispatchPolicy::InflightTracker.handle_discard(event.payload[:job])
+      end
+    end
     # Hosts copy the gem's migration into their own db/migrate via
     # `rails railties:install:migrations` (or hand-write a cutover
     # migration like opstasks did). We deliberately do NOT auto-merge