RubyGems - dispatch_policy - Versions diffs - 0.4.3 → 0.5.0 - Mend

dispatch_policy 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

checksums.yaml +4 -4
data/CHANGELOG.md +185 -0
data/README.md +30 -7
data/app/controllers/dispatch_policy/application_controller.rb +21 -2
data/app/controllers/dispatch_policy/dashboard_controller.rb +3 -0
data/app/controllers/dispatch_policy/partitions_controller.rb +51 -15
data/app/controllers/dispatch_policy/policies_controller.rb +26 -4
data/app/models/dispatch_policy/policy_setting.rb +14 -0
data/app/views/dispatch_policy/dashboard/index.html.erb +6 -1
data/app/views/dispatch_policy/partitions/index.html.erb +1 -1
data/app/views/dispatch_policy/partitions/show.html.erb +1 -1
data/app/views/dispatch_policy/policies/index.html.erb +11 -3
data/app/views/dispatch_policy/policies/show.html.erb +13 -4
data/app/views/dispatch_policy/shared/_partition_row.html.erb +9 -2
data/app/views/layouts/dispatch_policy/application.html.erb +21 -25
data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +13 -0
data/lib/dispatch_policy/config.rb +5 -0
data/lib/dispatch_policy/context.rb +12 -2
data/lib/dispatch_policy/cursor_pagination.rb +24 -7
data/lib/dispatch_policy/gates/adaptive_concurrency.rb +14 -0
data/lib/dispatch_policy/gates/concurrency.rb +4 -0
data/lib/dispatch_policy/gates/throttle.rb +36 -9
data/lib/dispatch_policy/inflight_tracker.rb +72 -26
data/lib/dispatch_policy/job_extension.rb +33 -9
data/lib/dispatch_policy/manual_admission.rb +18 -0
data/lib/dispatch_policy/operator_hints.rb +14 -0
data/lib/dispatch_policy/policy.rb +12 -0
data/lib/dispatch_policy/policy_dsl.rb +10 -2
data/lib/dispatch_policy/railtie.rb +10 -0
data/lib/dispatch_policy/registry.rb +8 -4
data/lib/dispatch_policy/repository.rb +102 -30
data/lib/dispatch_policy/tick.rb +18 -2
data/lib/dispatch_policy/tick_loop.rb +15 -7
data/lib/dispatch_policy/version.rb +1 -1
data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +9 -0
data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +30 -2
metadata +2 -1

data/lib/dispatch_policy/registry.rb CHANGED Viewed

@@ -19,7 +19,7 @@ module DispatchPolicy
     end
     def fetch(name)
-      entry = @policies[name.to_s]
+      entry = @mutex.synchronize { @policies[name.to_s] }
       entry && entry[:policy]
     end
@@ -28,15 +28,19 @@ module DispatchPolicy
     end
     def names
-      @policies.keys
+      @mutex.synchronize { @policies.keys }
     end
     def each(&block)
-      @policies.values.map { |e| e[:policy] }.each(&block)
+      # Snapshot under the lock, then iterate outside it: the block may run
+      # arbitrary code (and Mutex isn't reentrant), so we must not hold the
+      # lock while yielding.
+      snapshot = @mutex.synchronize { @policies.values.map { |e| e[:policy] } }
+      snapshot.each(&block)
     end
     def size
-      @policies.size
+      @mutex.synchronize { @policies.size }
     end
     def clear

data/lib/dispatch_policy/repository.rb CHANGED Viewed

@@ -13,8 +13,9 @@ module DispatchPolicy
     STAGED_TABLE      = "dispatch_policy_staged_jobs"
     PARTITIONS_TABLE  = "dispatch_policy_partitions"
     INFLIGHT_TABLE    = "dispatch_policy_inflight_jobs"
-    SAMPLES_TABLE     = "dispatch_policy_tick_samples"
-    ADAPTIVE_TABLE    = "dispatch_policy_adaptive_concurrency_stats"
+    SAMPLES_TABLE        = "dispatch_policy_tick_samples"
+    ADAPTIVE_TABLE       = "dispatch_policy_adaptive_concurrency_stats"
+    POLICY_SETTINGS_TABLE = "dispatch_policy_policy_settings"
     module_function
@@ -78,35 +79,43 @@ module DispatchPolicy
     # Bulk version for perform_all_later. Receives an array of hashes with
     # the same keys as #stage!. Performs one INSERT into staged_jobs per
     # STAGE_MANY_BATCH slice, plus one UPSERT per (policy_name, partition_key) group.
+    # Rows per INSERT. Each row binds 8 params; Postgres caps a statement at
+    # 65_535 bind params, so we slice well under 65_535/8 ≈ 8_191 to leave
+    # headroom. A single perform_all_later with more rows than this would
+    # otherwise blow the limit and fail the whole batch.
+    STAGE_MANY_BATCH = 1_000
     def stage_many!(rows)
       return 0 if rows.empty?
       connection.transaction(requires_new: true) do
-        values_sql = []
-        params     = []
-        rows.each_with_index do |row, idx|
-          base = idx * 8
-          values_sql << "($#{base + 1}, $#{base + 2}, $#{base + 3}, $#{base + 4}, $#{base + 5}::jsonb, $#{base + 6}::jsonb, $#{base + 7}, $#{base + 8})"
-          params.push(
-            row[:policy_name],
-            row[:partition_key],
-            row[:queue_name],
-            row[:job_class],
-            JSON.dump(row[:job_data]),
-            JSON.dump(row[:context] || {}),
-            row[:scheduled_at],
-            row[:priority] || 0
+        rows.each_slice(STAGE_MANY_BATCH) do |slice|
+          values_sql = []
+          params     = []
+          slice.each_with_index do |row, idx|
+            base = idx * 8
+            values_sql << "($#{base + 1}, $#{base + 2}, $#{base + 3}, $#{base + 4}, $#{base + 5}::jsonb, $#{base + 6}::jsonb, $#{base + 7}, $#{base + 8})"
+            params.push(
+              row[:policy_name],
+              row[:partition_key],
+              row[:queue_name],
+              row[:job_class],
+              JSON.dump(row[:job_data]),
+              JSON.dump(row[:context] || {}),
+              row[:scheduled_at],
+              row[:priority] || 0
+            )
+          end
+          connection.exec_query(
+            <<~SQL.squish,
+              INSERT INTO #{STAGED_TABLE}
+                (policy_name, partition_key, queue_name, job_class, job_data, context, scheduled_at, priority)
+              VALUES #{values_sql.join(", ")}
+            SQL
+            "stage_many",
+            params
           )
         end
-        connection.exec_query(
-          <<~SQL.squish,
-            INSERT INTO #{STAGED_TABLE}
-              (policy_name, partition_key, queue_name, job_class, job_data, context, scheduled_at, priority)
-            VALUES #{values_sql.join(", ")}
-          SQL
-          "stage_many",
-          params
-        )
         rows.group_by { |r| [r[:policy_name], r[:partition_key]] }.each do |(policy_name, partition_key), group|
           upsert_partition!(
@@ -169,6 +178,10 @@ module DispatchPolicy
             AND status = 'active'
             AND pending_count > 0
             AND (next_eligible_at IS NULL OR next_eligible_at <= now())
+            AND NOT EXISTS (
+              SELECT 1 FROM #{POLICY_SETTINGS_TABLE} ps
+              WHERE ps.policy_name = $1 AND ps.paused
+            )
             #{shard_sql}
           ORDER BY last_checked_at NULLS FIRST, id
           LIMIT $#{params.size}
@@ -306,10 +319,13 @@ module DispatchPolicy
     #
     # Each entry: { policy_name:, partition_key:, gate_state_patch:, retry_after: }.
     # Independent per row — the join via FROM(VALUES…) makes the bulk
-    # statement equivalent to N sequential UPDATEs in correctness terms;
-    # the row-level locks held by `claim_partitions` (FOR UPDATE SKIP
-    # LOCKED, last_checked_at bumped) keep concurrent ticks away from the
-    # same partitions while we batch.
+    # statement equivalent to N sequential UPDATEs in correctness terms.
+    # Note: `claim_partitions` runs as its own autocommitted statement, so
+    # its `FOR UPDATE SKIP LOCKED` row locks are already released by the time
+    # we reach this flush — they do NOT guard the batch. What keeps two ticks
+    # off the same partitions is the operational invariant of one tick loop
+    # per (policy, shard), reinforced by the `last_checked_at` bump on claim
+    # (a racing claim skips recently-checked rows).
     def bulk_record_partition_denies!(entries)
       return if entries.empty?
@@ -331,7 +347,7 @@ module DispatchPolicy
           UPDATE #{PARTITIONS_TABLE} p
           SET gate_state       = p.gate_state || v.gate_state_patch,
               next_eligible_at = CASE
-                WHEN v.retry_after_secs IS NULL THEN NULL
+                WHEN v.retry_after_secs IS NULL THEN p.next_eligible_at
                 ELSE now() + (v.retry_after_secs || ' seconds')::interval
               END,
               updated_at       = now()
@@ -344,6 +360,24 @@ module DispatchPolicy
       )
     end
+    # ----- policy settings ------------------------------------------------------
+    # Upsert the pause flag for a policy. The tick's claim_partitions reads
+    # this row, so toggling it takes effect for every partition of the
+    # policy — including ones created after the toggle.
+    def set_policy_paused!(policy_name:, paused:)
+      connection.exec_query(
+        <<~SQL.squish,
+          INSERT INTO #{POLICY_SETTINGS_TABLE} (policy_name, paused, created_at, updated_at)
+          VALUES ($1, $2, now(), now())
+          ON CONFLICT (policy_name)
+          DO UPDATE SET paused = EXCLUDED.paused, updated_at = now()
+        SQL
+        "set_policy_paused",
+        [policy_name, paused ? true : false]
+      )
+    end
     # ----- inflight tracking ---------------------------------------------------
     def insert_inflight!(rows)
@@ -883,5 +917,43 @@ module DispatchPolicy
         ["now() + ($5 || ' seconds')::interval", [retry_after.to_f.round(3)]]
       end
     end
+    # ----- role routing ---------------------------------------------------------
+    #
+    # Every public Repository method must run against config.database_role
+    # so multi-DB setups (e.g. solid_queue on a separate :queue DB, with
+    # the gem tables living there) hit the DB the staging/admission/inflight
+    # state actually lives in. Otherwise staging writes the primary DB while
+    # the tick reads the queue DB — silent job loss — and the concurrency
+    # gate counts inflight rows in a different DB than the tracker writes.
+    #
+    # Rather than wrap ~25 method bodies by hand — and risk missing one as
+    # the API grows — we redefine each public SQL method to run inside
+    # `with_connection`. We capture the ORIGINAL as an UnboundMethod and
+    # `bind_call` it directly (no `super`, no prepended module): this is immune to the
+    # file being evaluated more than once in a process (dev reloader,
+    # integration suites that boot the dummy app under multiple require
+    # paths). Each evaluation re-wraps the freshly (re)defined originals
+    # exactly once, so wrappers never stack. `connected_to(role:)` nesting
+    # with the SAME role is a no-op, so the explicit `with_connection` blocks
+    # at the transaction boundaries (Tick, ManualAdmission) stay correct: the
+    # admission TX still opens entirely within one role context, preserving
+    # the shared-connection atomicity invariant. The `connection` accessor
+    # and the pure helpers are excluded — they issue no SQL of their own and
+    # always run inside an already-routed caller, so wrapping them would only
+    # add redundant role swaps in hot per-row loops (normalize_*/parse_jsonb
+    # run once per claimed row).
+    ROLE_ROUTING_EXCLUDED = %i[
+      connection with_connection
+      normalize_partition normalize_staged parse_jsonb
+      sample_filter next_eligible_clause trend_direction
+    ].freeze
+    (singleton_methods(false) - ROLE_ROUTING_EXCLUDED).each do |method_name|
+      original = singleton_class.instance_method(method_name)
+      define_singleton_method(method_name) do |*args, **kwargs, &block|
+        with_connection { original.bind_call(self, *args, **kwargs, &block) }
+      end
+    end
   end
 end

data/lib/dispatch_policy/tick.rb CHANGED Viewed

@@ -127,6 +127,15 @@ module DispatchPolicy
             forward_failures += outcome[:failures]
             admitted_per_partition[p["partition_key"]] += outcome[:admitted]
             remaining -= outcome[:admitted]
+            # Feed pass-2 denies into the reason breakdown (e.g. a throttle
+            # that emptied after pass-1's settle) so the dashboard sees why
+            # redistribution stopped. We do NOT bump partitions_denied: the
+            # partition already counted as admitted in pass-1, and
+            # admitted + denied should stay ≈ partitions_seen.
+            if outcome[:admitted].zero?
+              outcome[:reasons].each { |r| denied_reasons[r] += 1 }
+            end
           end
         end
       end
@@ -341,8 +350,15 @@ module DispatchPolicy
     end
     def record_sample!(**fields)
-      pending_total  = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
-      inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
+      # These two reads go through the AR models, which the Repository role
+      # wrapper doesn't cover — wrap explicitly or, under multi-DB
+      # (config.database_role), they'd query the default writing role and
+      # either raise (swallowed below → no samples ever) or record zeros.
+      pending_total = inflight_total = nil
+      Repository.with_connection do
+        pending_total  = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
+        inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
+      end
       Repository.record_tick_sample!(
         policy_name:    @policy_name,

data/lib/dispatch_policy/tick_loop.rb CHANGED Viewed

@@ -29,7 +29,7 @@ module DispatchPolicy
         names = policy_names(policy_name)
         if names.empty?
-          sleep(config.idle_pause)
+          pause(config.idle_pause)
           next
         end
@@ -46,18 +46,26 @@ module DispatchPolicy
         end
         iteration += 1
-        if (iteration % config.sweep_every_ticks).zero?
-          sweep!
-        end
+        # sweep_every_ticks <= 0 means "never sweep" (rather than crashing
+        # the loop with ZeroDivisionError on `iteration % 0`).
+        sweep_every = config.sweep_every_ticks.to_i
+        sweep! if sweep_every.positive? && (iteration % sweep_every).zero?
         if admitted.zero?
-          sleep(config.idle_pause)
-        elsif config.busy_pause.to_f.positive?
-          sleep(config.busy_pause)
+          pause(config.idle_pause)
+        else
+          pause(config.busy_pause)
         end
       end
     end
+    # sleep, but never with a negative argument (which would raise
+    # ArgumentError mid-loop) — a non-positive pause just means "no pause".
+    def pause(seconds)
+      secs = seconds.to_f
+      sleep(secs) if secs.positive?
+    end
     def policy_names(filter)
       if filter
         [filter.to_s]

data/lib/dispatch_policy/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module DispatchPolicy
-  VERSION = "0.4.3"
+  VERSION = "0.5.0"
 end

data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt CHANGED Viewed

@@ -91,5 +91,14 @@ class CreateDispatchPolicyTables < ActiveRecord::Migration[<%= Rails::VERSION::S
               [:policy_name, :partition_key],
               unique: true,
               name:   "idx_dp_adaptive_concurrency_lookup"
+    create_table :dispatch_policy_policy_settings do |t|
+      t.string   :policy_name, null: false
+      t.boolean  :paused,      null: false, default: false
+      t.timestamps
+    end
+    add_index :dispatch_policy_policy_settings, :policy_name,
+              unique: true,
+              name:   "idx_dp_policy_settings_lookup"
   end
 end

data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt CHANGED Viewed

@@ -17,8 +17,26 @@ class DispatchTickLoopJob < ApplicationJob
 <% if good_job? -%>
   include GoodJob::ActiveJobExtensions::Concurrency
+  # Two valid setups depending on whether you have a cron safety net:
+  #
+  # A) enqueue_limit: 1, perform_limit: 1  (default below — no cron required)
+  #    GoodJob's enqueue check excludes the running job (advisory-locked), so
+  #    the self-re-enqueue at the end of perform always succeeds. perform_limit
+  #    ensures a single concurrent execution. Downside: if a cron also fires
+  #    while the job is running, it enqueues a second copy (the enqueue check
+  #    ignores the running job, so the cron sees 0 enqueued); that copy then hits
+  #    perform_limit and raises ConcurrencyExceededError, creating a retry gap.
+  #
+  # B) total_limit: 1  (use this if you have a cron safety net)
+  #    total_limit counts the running job, so the cron enqueue is blocked
+  #    instead of erroring. The self-chain at the end of perform is also
+  #    blocked (silently returns successfully_enqueued? == false), but the
+  #    cron reschedules the job within seconds — no ConcurrencyExceededError,
+  #    no retry gap. The "belt and braces" log below can be removed in this
+  #    setup as a blocked self-chain is expected and harmless.
   good_job_control_concurrency_with(
-    total_limit: 1,
+    enqueue_limit: 1,
+    perform_limit: 1,
     key: -> { "dispatch_tick_loop:#{arguments[0] || 'all'}:#{arguments[1] || 'all'}" }
   )
 <% elsif solid_queue? -%>
@@ -36,7 +54,17 @@ class DispatchTickLoopJob < ApplicationJob
       stop_when:   -> { adapter_shutting_down? || Time.current >= deadline }
     )
-    self.class.set(wait: 1.second).perform_later(policy_name, shard)
+    successor = self.class.set(wait: 1.second).perform_later(policy_name, shard)
+    # Belt and braces: a concurrency-aborted enqueue returns the job with
+    # successfully_enqueued? == false instead of raising. If that ever
+    # happens the loop chain is dead — make it loud instead of silent.
+    if successor.respond_to?(:successfully_enqueued?) && !successor.successfully_enqueued?
+      Rails.logger.error(
+        "[dispatch_policy] DispatchTickLoopJob failed to re-enqueue itself " \
+        "(policy=#{policy_name.inspect} shard=#{shard.inspect}) — the tick loop chain has stopped"
+      )
+    end
   end
   private

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: dispatch_policy
 version: !ruby/object:Gem::Version
-  version: 0.4.3
+  version: 0.5.0
 platform: ruby
 authors:
 - José Galisteo
@@ -204,6 +204,7 @@ files:
 - app/models/dispatch_policy/application_record.rb
 - app/models/dispatch_policy/inflight_job.rb
 - app/models/dispatch_policy/partition.rb
+- app/models/dispatch_policy/policy_setting.rb
 - app/models/dispatch_policy/staged_job.rb
 - app/models/dispatch_policy/tick_sample.rb
 - app/views/dispatch_policy/dashboard/index.html.erb