dispatch_policy 0.1.0 → 0.3.0
This diff covers the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/MIT-LICENSE +16 -17
- data/README.md +449 -288
- data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +140 -216
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
- data/config/routes.rb +18 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +4 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -37
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +62 -47
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +263 -172
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +71 -46
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +101 -43
- data/CHANGELOG.md +0 -12
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -49
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -120
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
- data/lib/dispatch_policy/install_generator.rb +0 -23
data/lib/dispatch_policy/serializer.rb
ADDED
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+require "json"
+
+module DispatchPolicy
+  module Serializer
+    module_function
+
+    # Serialize an ActiveJob instance for storage in staged_jobs.job_data.
+    # Returns a Ruby hash compatible with PostgreSQL jsonb (string keys).
+    def serialize(job)
+      job.serialize
+    end
+
+    # Deserialize stored job_data into a fresh ActiveJob instance ready
+    # to be enqueued via `#enqueue`.
+    def deserialize(payload)
+      job_class = payload["job_class"] || payload[:job_class]
+      raise InvalidPolicy, "missing job_class in stored payload" unless job_class
+
+      klass = job_class.constantize
+      klass.deserialize(payload)
+    end
+
+    def dump_jsonb(value)
+      JSON.dump(value)
+    end
+
+    def load_jsonb(text)
+      return text if text.is_a?(Hash) || text.is_a?(Array)
+      return {} if text.nil? || text == ""
+
+      JSON.parse(text)
+    end
+  end
+end
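For orientation, a minimal usage sketch of how the new Serializer round-trips a job. MyJob is a hypothetical ActiveJob class, not part of the gem; the payload shape is whatever ActiveJob's own #serialize produces.

# Hypothetical usage sketch: MyJob is not part of the gem.
class MyJob < ActiveJob::Base
  def perform(user_id); end
end

payload = DispatchPolicy::Serializer.serialize(MyJob.new(42))
# => {"job_class" => "MyJob", "arguments" => [42], ...} (string keys, jsonb-friendly)

job = DispatchPolicy::Serializer.deserialize(payload)
job.enqueue # hands the revived job to the real ActiveJob adapter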
data/lib/dispatch_policy/tick.rb
CHANGED
@@ -1,214 +1,305 @@
 # frozen_string_literal: true
 
 module DispatchPolicy
+  # One pass of admission for a single policy.
+  #
+  # Records a row in dispatch_policy_tick_samples at the end so the engine UI
+  # can show throughput, denial reasons, and tick duration without sampling
+  # on the read path.
   class Tick
-
+    Result = Struct.new(:partitions_seen, :jobs_admitted, keyword_init: true)
 
-
-
-
-    def self.run(policy_name: nil)
-      return 0 unless DispatchPolicy.enabled?
+    def self.run(policy_name:, shard: nil)
+      new(policy_name, shard: shard).call
+    end
 
-
+    def initialize(policy_name, shard: nil)
+      @policy_name = policy_name
+      @shard = shard
+      @policy = DispatchPolicy.registry.fetch(policy_name) || raise(InvalidPolicy, "unknown policy #{policy_name.inspect}")
+      @config = DispatchPolicy.config
+    end
 
-
-
-
-
+    def call
+      started_at = monotonic_now_ms
+      partitions_seen = 0
+      partitions_admitted = 0
+      partitions_denied = 0
+      jobs_admitted = 0
+      forward_failures = 0
+      denied_reasons = Hash.new(0)
+
+      partitions = Repository.claim_partitions(
+        policy_name: @policy_name,
+        shard: @shard,
+        limit: @config.partition_batch_size
+      )
 
-
-
+      # Reorder by least-recent-admit-weighted (EWMA decayed_admits ASC)
+      # so under-admitted partitions get first crack at the tick budget.
+      # claim_partitions ALREADY enforced anti-stagnation via
+      # last_checked_at — every partition with pending is visited within
+      # ⌈active_partitions / partition_batch_size⌉ ticks regardless of
+      # decayed_admits. Reordering here only decides order *inside* this
+      # already-fair selection.
+      sort_partitions_for_fairness!(partitions)
+
+      # Per-partition fair share. When tick_admission_budget is set, we
+      # divide it evenly across the partitions we just claimed. Otherwise
+      # the legacy admission_batch_size is the per-partition ceiling.
+      #
+      # We deliberately do NOT clamp fair_share to a minimum of 1 when
+      # tick_cap < N. The hard global cap wins over a per-partition
+      # admit floor; partitions that don't admit this tick are still
+      # visited (last_checked_at bumped) and re-visited next tick when
+      # they'll be at the front of the in-tick decay order.
+      # Anti-stagnation comes from claim_partitions, not from forcing
+      # an admit on every claimed partition.
+      tick_cap = @policy.tick_admission_budget || @config.tick_admission_budget
+      per_part = @policy.admission_batch_size || @config.admission_batch_size
+      fair_share = if tick_cap && partitions.any?
+        (tick_cap.to_f / partitions.size).ceil
+      else
+        per_part
+      end
+
+      pending_denies = []
+      admitted_per_partition = Hash.new(0)
+      used = 0
+
+      partitions.each do |partition|
+        partitions_seen += 1
+
+        if tick_cap && used >= tick_cap
+          # Global cap exhausted in pass-1. The partition is still
+          # observed (claim_partitions bumped its last_checked_at), so
+          # the round-robin invariant for anti-stagnation holds; we
+          # just admit nothing this tick.
+          partitions_denied += 1
+          denied_reasons["tick_cap_exhausted"] += 1
+          # Nothing goes to pending_denies for this partition: the
+          # pipeline never ran (we skipped admit_partition entirely),
+          # so there is no gate_state_patch to flush on the deny path.
+          next
+        end
 
-
+        budget_for_this = if tick_cap
+          [fair_share, tick_cap - used].min
+        else
+          fair_share
+        end
+        budget_for_this = 0 if budget_for_this.negative?
+
+        outcome = admit_partition(partition, pending_denies, max_budget: budget_for_this)
+        admitted_per_partition[partition["partition_key"]] = outcome[:admitted]
+
+        jobs_admitted += outcome[:admitted]
+        forward_failures += outcome[:failures]
+        used += outcome[:admitted]
+
+        if outcome[:admitted].positive?
+          partitions_admitted += 1
+        else
+          partitions_denied += 1
+          outcome[:reasons].each { |r| denied_reasons[r] += 1 }
         end
       end
 
-
-
-
-
-
-
-
-
+      # Pass-2: redistribution. Pass-1 may have left budget unused if
+      # some partitions had less pending than their fair share. Walk the
+      # claimed partitions (still in decay-sorted order) and offer the
+      # leftover to whoever filled their fair share in pass-1 — a signal
+      # they had more pending than we let them admit.
+      if tick_cap
+        remaining = tick_cap - used
+        if remaining.positive?
+          partitions.each do |p|
+            break if remaining <= 0
+            next if admitted_per_partition[p["partition_key"]] < fair_share
+
+            extra_cap = [remaining, fair_share].min
+            outcome = admit_partition(p, pending_denies, max_budget: extra_cap)
+            jobs_admitted += outcome[:admitted]
+            forward_failures += outcome[:failures]
+            admitted_per_partition[p["partition_key"]] += outcome[:admitted]
+            remaining -= outcome[:admitted]
+          end
         end
       end
 
-
-      end
+      flush_denies!(pending_denies) if pending_denies.any?
 
-
-      ttl = DispatchPolicy.config.partition_idle_ttl
-      return if ttl.nil? || ttl <= 0
+      duration_ms = monotonic_now_ms - started_at
 
-
-
-
-
+      record_sample!(
+        duration_ms: duration_ms,
+        partitions_seen: partitions_seen,
+        partitions_admitted: partitions_admitted,
+        partitions_denied: partitions_denied,
+        jobs_admitted: jobs_admitted,
+        forward_failures: forward_failures,
+        denied_reasons: denied_reasons
+      )
 
-
-
-      model.distinct.pluck(:policy_name, :gate_name).each do |policy_name, gate_name|
-        policy = lookup_policy(policy_name)
-        next if policy && policy.gates.any? { |g| g.name == gate_name.to_sym }
+      Result.new(partitions_seen: partitions_seen, jobs_admitted: jobs_admitted)
+    end
 
-
-
+    private
+
+    # In-place sort by current decayed_admits ASC, computed in Ruby from
+    # the row's stored decayed_admits + the elapsed time since
+    # decayed_admits_at. We do this here (rather than in the SQL of
+    # claim_partitions) because:
+    #
+    # - claim_partitions's ORDER BY is anti-stagnation (last_checked_at
+    #   NULLS FIRST); reordering there would bias selection itself,
+    #   reintroducing the stagnation risk.
+    # - The math is cheap on N ≤ partition_batch_size rows already in
+    #   memory.
+    def sort_partitions_for_fairness!(partitions)
+      half_life = @policy.fairness_half_life_seconds || @config.fairness_half_life_seconds
+      return partitions if half_life.nil? || half_life <= 0
+
+      tau = half_life.to_f / Math.log(2)
+      now = Time.current.to_f
+
+      partitions.sort_by! do |p|
+        last_t = decayed_admits_epoch(p["decayed_admits_at"]) || now
+        elapsed = [now - last_t, 0.0].max
+        (p["decayed_admits"] || 0.0).to_f * Math.exp(-elapsed / tau)
       end
     end
 
-    def
-
-
-
-
-
-
-        PartitionInflightCount.decrement(
-          policy_name: staged.policy_name,
-          gate_name: gate_name.to_s,
-          partition_key: partition_key.to_s
-        )
-      end
-      staged.update!(lease_expires_at: nil, completed_at: Time.current)
-    end
+    def decayed_admits_epoch(value)
+      return nil if value.nil?
+      return value.to_f if value.is_a?(Numeric)
+      return value.to_time.to_f if value.respond_to?(:to_time)
+      Time.parse(value.to_s).to_f
+    rescue ArgumentError, TypeError
+      nil
     end
 
-    def
-
-
-
-
-
-
-
-
-
-
+    def admit_partition(partition, pending_denies, max_budget:)
+      ctx = Context.wrap(partition["context"])
+      pipe = Pipeline.new(@policy)
+      result = pipe.call(ctx, partition, max_budget)
+
+      # Pure-deny path (gate said no capacity for this partition this tick).
+      # Defer the partition state UPDATE to the bulk flush at the end of
+      # the tick instead of issuing a per-partition statement now.
+      if result.admit_count.zero?
+        pending_denies << {
+          policy_name: @policy_name,
+          partition_key: partition["partition_key"],
+          gate_state_patch: result.gate_state_patch,
+          retry_after: result.retry_after
+        }
+        return { admitted: 0, failures: 0, reasons: deduce_reasons(result) }
       end
-    end
 
-
-
+      admitted = 0
+      half_life = @policy.fairness_half_life_seconds || @config.fairness_half_life_seconds
+
+      Repository.with_connection do
+        ActiveRecord::Base.transaction(requires_new: true) do
+          rows = Repository.claim_staged_jobs!(
+            policy_name: @policy_name,
+            partition_key: partition["partition_key"],
+            limit: result.admit_count,
+            gate_state_patch: result.gate_state_patch,
+            retry_after: result.retry_after,
+            half_life_seconds: half_life
+          )
 
-
-
-
-
-
+          # `claim_staged_jobs!` always runs `record_partition_admit!` so
+          # the partition's counters and gate_state commit even when the
+          # actual DELETE returned zero rows (e.g. all staged rows are
+          # scheduled in the future, or another tick raced us to them).
+          next if rows.empty?
+
+          # Pre-insert an inflight row per admitted job so the concurrency
+          # gate sees them immediately. With a concurrency gate, use its
+          # (coarser) partition key so the gate's COUNT(*) keeps aggregating
+          # correctly across staged sub-partitions.
+          concurrency_gate = @policy.gates.find { |g| g.name == :concurrency }
+          inflight_rows = rows.filter_map do |row|
+            ajid = row.dig("job_data", "job_id")
+            next unless ajid
+
+            key = if concurrency_gate
+              concurrency_gate.inflight_partition_key(@policy_name, Context.wrap(row["context"]))
+            else
+              row["partition_key"]
+            end
+            { policy_name: @policy_name, partition_key: key, active_job_id: ajid }
+          end
+          Repository.insert_inflight!(inflight_rows) if inflight_rows.any?
+
+          # Re-enqueue to the real adapter *inside this transaction*. The
+          # adapter (good_job / solid_queue) shares ActiveRecord::Base's
+          # connection, so its INSERT into good_jobs / solid_queue_jobs
+          # participates in the same TX. If anything raises (deserialize,
+          # adapter error, network), the whole TX rolls back atomically:
+          # staged_jobs return, inflight rows vanish, partition counters
+          # revert, and the adapter rows are also reverted. This is the
+          # at-least-once guarantee — there is no window where staged is
+          # gone but the adapter never received the job.
+          Forwarder.dispatch(rows)
+          admitted = rows.size
+        end
+      end
 
-
-
-      fetch_round_robin_batch(policy)
+      if admitted.zero?
+        { admitted: 0, failures: 0, reasons: ["no_rows_claimed"] }
       else
-
+        { admitted: admitted, failures: 0, reasons: [] }
       end
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error(
+        "[dispatch_policy] forward failed for #{@policy_name}/#{partition['partition_key']}: " \
+        "#{e.class}: #{e.message}"
+      )
+      { admitted: 0, failures: 1, reasons: ["forward_failed"] }
     end
 
-    def
-
-
-
-      .
-
-        .lock("FOR UPDATE SKIP LOCKED")
-        .to_a
-    end
-
-    def self.fetch_round_robin_batch(policy)
-      quantum = DispatchPolicy.config.round_robin_quantum
-      batch_size = DispatchPolicy.config.batch_size
-      now = Time.current
-
-      sql = <<~SQL.squish
-        SELECT rows.*
-        FROM (
-          SELECT DISTINCT round_robin_key
-          FROM dispatch_policy_staged_jobs
-          WHERE policy_name = ?
-            AND admitted_at IS NULL
-            AND round_robin_key IS NOT NULL
-            AND (not_before_at IS NULL OR not_before_at <= ?)
-        ) AS keys
-        CROSS JOIN LATERAL (
-          SELECT *
-          FROM dispatch_policy_staged_jobs
-          WHERE policy_name = ?
-            AND admitted_at IS NULL
-            AND round_robin_key = keys.round_robin_key
-            AND (not_before_at IS NULL OR not_before_at <= ?)
-          ORDER BY priority, staged_at
-          LIMIT ?
-          FOR UPDATE SKIP LOCKED
-        ) AS rows
-        LIMIT ?
-      SQL
-
-      batch = StagedJob.find_by_sql([ sql, policy.name, now, policy.name, now, quantum, batch_size ])
-
-      remaining = batch_size - batch.size
-      return batch if remaining <= 0
-
-      top_up = StagedJob.pending
-        .where(policy_name: policy.name)
-        .where("not_before_at IS NULL OR not_before_at <= ?", now)
-        .where.not(id: batch.map(&:id))
-        .order(:priority, :staged_at)
-        .limit(remaining)
-        .lock("FOR UPDATE SKIP LOCKED")
-        .to_a
-
-      batch + top_up
-    end
-
-    def self.lookup_policy(policy_name)
-      job_class = DispatchPolicy.registry[policy_name] || autoload_job_for(policy_name)
-      return nil unless job_class
-      job_class.resolved_dispatch_policy
-    end
-
-    def self.autoload_job_for(policy_name)
-      const_name = policy_name.tr("-", "/").camelize
-      const_name.safe_constantize
-      DispatchPolicy.registry[policy_name]
+    def flush_denies!(entries)
+      Repository.with_connection { Repository.bulk_record_partition_denies!(entries) }
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error(
+        "[dispatch_policy] bulk_record_partition_denies failed: #{e.class}: #{e.message}"
+      )
     end
 
-
-
-
-
-
+    # When admit_count was 0, the Pipeline's `reasons` array contains entries
+    # like "throttle:rate=0" or "concurrency:concurrency_full". We keep the
+    # part after the "gate:" prefix (falling back to the bare gate name), so
+    # callers see "rate=0" / "concurrency_full".
+    def deduce_reasons(result)
+      reasons = result.reasons.map do |s|
+        gate, msg = s.split(":", 2)
+        msg.presence || gate
      end
+      reasons << "no_capacity" if reasons.empty?
+      reasons
+    end
 
-
-
-
-      partitions.each do |gate_name, partition_key|
-        gate = policy.gates.find { |g| g.name == gate_name.to_sym }
-        next unless gate&.tracks_inflight?
-
-        PartitionInflightCount.increment(
-          policy_name: policy.name,
-          gate_name: gate_name.to_s,
-          partition_key: partition_key.to_s
-        )
-      end
+    def record_sample!(**fields)
+      pending_total = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
+      inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
 
-
-
-
+      Repository.record_tick_sample!(
+        policy_name: @policy_name,
+        pending_total: pending_total,
+        inflight_total: inflight_total,
+        **fields
+      )
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.warn("[dispatch_policy] failed to record tick sample: #{e.class}: #{e.message}")
     end
 
-    def
-
-      release(policy_name: staged.policy_name, partitions: partitions)
-      staged.update_columns(
-        admitted_at: nil,
-        lease_expires_at: nil,
-        active_job_id: nil,
-        partitions: {}
-      )
+    def monotonic_now_ms
+      (Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1000).to_i
     end
   end
 end
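To make the fairness machinery above concrete, here is a standalone sketch of the same math on plain hashes. The keys and values are illustrative; this mirrors what sort_partitions_for_fairness! and the pass-1 fair-share split compute rather than calling the gem's API.

# Standalone sketch of the tick's fairness math (illustrative values).
half_life = 60.0
tau = half_life / Math.log(2) # ~86.56s; after one half-life the weight halves
now = Time.now.to_f

partitions = [
  { key: "tenant_a", decayed_admits: 10.0, decayed_admits_at: now - 30 },
  { key: "tenant_b", decayed_admits: 4.0,  decayed_admits_at: now - 5 },
  { key: "tenant_c", decayed_admits: 0.0,  decayed_admits_at: nil }
]

# Same ordering rule: current (time-decayed) admit weight, ascending,
# so the least-recently-admitted partition spends the budget first.
partitions.sort_by! do |p|
  elapsed = [now - (p[:decayed_admits_at] || now), 0.0].max
  p[:decayed_admits] * Math.exp(-elapsed / tau)
end
# => tenant_c (0.0), then tenant_b (~3.78), then tenant_a (~7.07)

# Pass-1 fair share under a global cap, as in Tick#call:
tick_cap = 10
fair_share = (tick_cap.to_f / partitions.size).ceil # => 4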
data/lib/dispatch_policy/tick_loop.rb
CHANGED
@@ -1,45 +1,78 @@
 # frozen_string_literal: true
 
 module DispatchPolicy
-  #
-  #
-  #
-
-
-
-
+  # Drives admission until `stop_when` fires (deadline, shutdown signal, etc).
+  # Runs one Tick per policy per loop iteration; sleeps `idle_pause` when no
+  # jobs were admitted across all policies. Periodically (every
+  # `sweep_every_ticks` iterations) sweeps stale inflight rows and inactive
+  # partitions.
+  module TickLoop
+    module_function
+
+    # @param policy_name [String, nil] limit to one policy. nil = all registered.
+    # @param shard [String, nil] limit to one shard. nil = all shards.
+    def run(policy_name: nil, shard: nil, stop_when: -> { false })
+      config = DispatchPolicy.config
+      logger = config.logger
+      iteration = 0
 
       loop do
         break if stop_when.call
 
+        unless DispatchPolicy.config.enabled
+          # Master switch off: stop polling. The job that drives
+          # TickLoop.run will re-schedule itself; we exit cleanly,
+          # and the re-scheduled run will see the flag and stop too.
+          logger&.info("[dispatch_policy] TickLoop exiting because config.enabled = false")
+          break
+        end
+
+        names = policy_names(policy_name)
+        if names.empty?
+          sleep(config.idle_pause)
+          next
+        end
+
         admitted = 0
-
-
-
-
+        names.each do |name|
+          break if stop_when.call
+
+          begin
+            result = Tick.run(policy_name: name, shard: shard)
+            admitted += result.jobs_admitted
+          rescue StandardError => e
+            logger&.error("[dispatch_policy] tick error policy=#{name} shard=#{shard.inspect} #{e.class}: #{e.message}\n#{e.backtrace.first(10).join("\n")}")
          end
-      rescue StandardError => e
-        Rails.logger&.error("[DispatchPolicy] tick error: #{e.class}: #{e.message}")
-        Rails.error.report(e, handled: true) if defined?(Rails) && Rails.respond_to?(:error)
        end
 
-
+        iteration += 1
+        if (iteration % config.sweep_every_ticks).zero?
+          sweep!
+        end
 
-
+        if admitted.zero?
+          sleep(config.idle_pause)
+        elsif config.busy_pause.to_f.positive?
+          sleep(config.busy_pause)
+        end
       end
     end
 
-    def
-
-
-
-
-      while remaining.positive?
-        break if stop_when.call
-        chunk = [ remaining, step ].min
-        sleep(chunk)
-        remaining -= chunk
+    def policy_names(filter)
+      if filter
+        [filter.to_s]
+      else
+        DispatchPolicy.registry.names
       end
     end
+
+    def sweep!
+      cfg = DispatchPolicy.config
+      Repository.sweep_stale_inflight!(cutoff_seconds: cfg.inflight_stale_after)
+      Repository.sweep_inactive_partitions!(cutoff_seconds: cfg.partition_inactive_after)
+      Repository.sweep_old_tick_samples!(cutoff_seconds: cfg.metrics_retention)
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error("[dispatch_policy] sweep error: #{e.class}: #{e.message}")
+    end
   end
 end