dispatch_policy 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/MIT-LICENSE +16 -17
  3. data/README.md +449 -288
  4. data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
  5. data/app/controllers/dispatch_policy/application_controller.rb +45 -1
  6. data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
  7. data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
  8. data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
  9. data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
  10. data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
  11. data/app/models/dispatch_policy/inflight_job.rb +12 -0
  12. data/app/models/dispatch_policy/partition.rb +21 -0
  13. data/app/models/dispatch_policy/staged_job.rb +4 -97
  14. data/app/models/dispatch_policy/tick_sample.rb +11 -0
  15. data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
  16. data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
  17. data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
  18. data/app/views/dispatch_policy/policies/index.html.erb +15 -37
  19. data/app/views/dispatch_policy/policies/show.html.erb +140 -216
  20. data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
  21. data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
  22. data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
  23. data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
  24. data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
  25. data/config/routes.rb +18 -2
  26. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
  27. data/lib/dispatch_policy/bypass.rb +23 -0
  28. data/lib/dispatch_policy/config.rb +85 -0
  29. data/lib/dispatch_policy/context.rb +50 -0
  30. data/lib/dispatch_policy/cursor_pagination.rb +121 -0
  31. data/lib/dispatch_policy/decision.rb +22 -0
  32. data/lib/dispatch_policy/engine.rb +4 -27
  33. data/lib/dispatch_policy/forwarder.rb +63 -0
  34. data/lib/dispatch_policy/gate.rb +10 -38
  35. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
  36. data/lib/dispatch_policy/gates/concurrency.rb +45 -26
  37. data/lib/dispatch_policy/gates/throttle.rb +65 -37
  38. data/lib/dispatch_policy/inflight_tracker.rb +174 -0
  39. data/lib/dispatch_policy/job_extension.rb +155 -0
  40. data/lib/dispatch_policy/operator_hints.rb +126 -0
  41. data/lib/dispatch_policy/pipeline.rb +48 -0
  42. data/lib/dispatch_policy/policy.rb +62 -47
  43. data/lib/dispatch_policy/policy_dsl.rb +120 -0
  44. data/lib/dispatch_policy/railtie.rb +35 -0
  45. data/lib/dispatch_policy/registry.rb +46 -0
  46. data/lib/dispatch_policy/repository.rb +723 -0
  47. data/lib/dispatch_policy/serializer.rb +36 -0
  48. data/lib/dispatch_policy/tick.rb +263 -172
  49. data/lib/dispatch_policy/tick_loop.rb +59 -26
  50. data/lib/dispatch_policy/version.rb +1 -1
  51. data/lib/dispatch_policy.rb +71 -46
  52. data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
  53. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
  54. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
  55. data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
  56. metadata +101 -43
  57. data/CHANGELOG.md +0 -12
  58. data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
  59. data/app/models/dispatch_policy/partition_observation.rb +0 -49
  60. data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
  61. data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
  62. data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
  63. data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
  64. data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
  65. data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
  66. data/lib/dispatch_policy/dispatch_context.rb +0 -53
  67. data/lib/dispatch_policy/dispatchable.rb +0 -120
  68. data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
  69. data/lib/dispatch_policy/gates/global_cap.rb +0 -26
  70. data/lib/dispatch_policy/install_generator.rb +0 -23
@@ -2,51 +2,79 @@
2
2
 
3
3
  module DispatchPolicy
4
4
  module Gates
5
+ # Token bucket throttle gate.
6
+ #
7
+ # Persists state in partitions.gate_state["throttle"] = {
8
+ # "tokens" => Float, # current tokens, capped at bucket size
9
+ # "refilled_at" => Float # epoch seconds, last refill
10
+ # }
11
+ #
12
+ # The partition scope this gate enforces against is the policy's
13
+ # `partition_by` (declared in the policy DSL block, not on the gate).
14
+ # The bucket lives on the staged partition row — one row per
15
+ # `policy.partition_key_for(ctx)` value, one bucket per row, no dilution.
5
16
  class Throttle < Gate
6
- def configure(rate:, per:, burst: nil)
7
- @rate = rate
8
- @per = per
9
- @burst = burst
17
+ attr_reader :rate_proc, :per
18
+
19
+ def initialize(rate:, per:)
20
+ super()
21
+ @rate_proc = rate.respond_to?(:call) ? rate : ->(_ctx) { rate }
22
+ @per = duration_seconds(per)
23
+ raise ArgumentError, "throttle :per must be > 0 (got #{@per})" unless @per.positive?
10
24
  end
11
25
 
12
- # Consumed tokens refill over time, no release step.
13
- def tracks_inflight?
14
- false
26
+ def name
27
+ :throttle
15
28
  end
16
29
 
17
- def filter(batch, context)
18
- by_partition = batch.group_by { |staged| partition_key_for(context.for(staged)) }
19
-
20
- admitted = []
21
- by_partition.each do |partition_key, jobs|
22
- sample_ctx = context.for(jobs.first)
23
- rate = resolve(@rate, sample_ctx).to_f
24
- per = @per.to_f
25
- burst = (resolve(@burst, sample_ctx) || rate).to_f
26
-
27
- bucket = ThrottleBucket.lock(
28
- policy_name: policy.name,
29
- gate_name: name,
30
- partition_key: partition_key,
31
- burst: burst
32
- )
33
- bucket.refill!(rate: rate, per: per, burst: burst)
34
-
35
- jobs.each do |staged|
36
- if bucket.consume(1)
37
- admitted << [ staged, partition_key ]
38
- else
39
- break
40
- end
41
- end
42
- bucket.save!
30
+ def evaluate(ctx, partition, admit_budget)
31
+ capacity = capacity_for(ctx)
32
+ return Decision.deny(reason: "rate=0") if capacity <= 0
33
+
34
+ refill_rate = capacity.to_f / @per
35
+ state = (partition["gate_state"] || {})["throttle"] || {}
36
+ tokens = (state["tokens"] || capacity).to_f
37
+ refilled_at = (state["refilled_at"] || now).to_f
38
+
39
+ elapsed = [now - refilled_at, 0.0].max
40
+ tokens = [tokens + (elapsed * refill_rate), capacity.to_f].min
41
+
42
+ whole = tokens.floor
43
+ if whole.zero?
44
+ missing = 1.0 - tokens
45
+ retry_after = missing / refill_rate
46
+ patch = { "tokens" => tokens, "refilled_at" => now }
47
+ return Decision.new(allowed: 0,
48
+ retry_after: retry_after,
49
+ gate_state_patch: { "throttle" => patch },
50
+ reason: "throttle_empty")
43
51
  end
44
52
 
45
- context.record_partitions(admitted, gate: name)
46
- admitted.map(&:first)
53
+ allowed = [whole, admit_budget].min
54
+ patch = { "tokens" => tokens - allowed, "refilled_at" => now }
55
+ Decision.new(allowed: allowed, gate_state_patch: { "throttle" => patch })
47
56
  end
48
- end
49
57
 
50
- Gate.register(:throttle, Throttle)
58
+ private
59
+
60
+ def capacity_for(ctx)
61
+ value = @rate_proc.call(ctx)
62
+ value.nil? ? 0 : Integer(value)
63
+ end
64
+
65
+ def now
66
+ DispatchPolicy.config.now.to_f
67
+ end
68
+
69
+ def duration_seconds(value)
70
+ if value.is_a?(Numeric)
71
+ value.to_f
72
+ elsif value.respond_to?(:to_f) && value.respond_to?(:seconds)
73
+ value.to_f
74
+ else
75
+ raise ArgumentError, "throttle :per must be a numeric duration in seconds (got #{value.inspect})"
76
+ end
77
+ end
78
+ end
51
79
  end
52
80
  end
@@ -0,0 +1,174 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ # Around-perform that records each job execution in
5
+ # dispatch_policy_inflight_jobs while it runs, so the concurrency gate
6
+ # can count active jobs per partition.
7
+ #
8
+ # While the job runs we spawn a heartbeat thread that bumps
9
+ # `heartbeat_at` every `config.inflight_heartbeat_interval` seconds.
10
+ # Without this, jobs longer than `inflight_stale_after` (default 5 min)
11
+ # get their inflight row prematurely swept and the concurrency gate
12
+ # over-admits.
13
+ module InflightTracker
14
+ extend ActiveSupport::Concern
15
+
16
+ class_methods do
17
+ def dispatch_policy_inflight_tracking
18
+ around_perform do |job, block|
19
+ DispatchPolicy::InflightTracker.track(job, &block)
20
+ end
21
+ end
22
+ end
23
+
24
+ def self.track(job)
25
+ policy_name = job.class.respond_to?(:dispatch_policy_name) && job.class.dispatch_policy_name
26
+ return yield unless policy_name
27
+
28
+ policy = DispatchPolicy.registry.fetch(policy_name)
29
+ return yield unless policy
30
+
31
+ ctx = policy.build_context(job.arguments, queue_name: job.queue_name&.to_s)
32
+ partition_key = policy.partition_key_for(ctx)
33
+
34
+ Repository.insert_inflight!([{
35
+ policy_name: policy.name,
36
+ partition_key: partition_key,
37
+ active_job_id: job.job_id
38
+ }])
39
+
40
+ adaptive_gates = policy.gates.select { |g| g.name == :adaptive_concurrency }
41
+ admitted_at = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
42
+ perform_start = Time.current
43
+
44
+ heartbeat = start_heartbeat(job.job_id)
45
+
46
+ succeeded = false
47
+ begin
48
+ yield
49
+ succeeded = true
50
+ ensure
51
+ stop_heartbeat(heartbeat)
52
+
53
+ record_adaptive_observations(
54
+ policy: policy,
55
+ gates: adaptive_gates,
56
+ partition_key: partition_key,
57
+ admitted_at: admitted_at,
58
+ perform_start: perform_start,
59
+ succeeded: succeeded
60
+ )
61
+
62
+ begin
63
+ Repository.delete_inflight!(active_job_id: job.job_id)
64
+ rescue StandardError => e
65
+ DispatchPolicy.config.logger&.warn("[dispatch_policy] failed to delete inflight row #{job.job_id}: #{e.class}: #{e.message}")
66
+ end
67
+ end
68
+ end
69
+
70
+ # Reads the admitted_at column from the inflight row that the Tick
71
+ # pre-inserted. Used as the start-of-queue-wait reference for the
72
+ # adaptive_concurrency feedback signal (queue_lag = perform_start
73
+ # - admitted_at). nil if the row vanished or the lookup fails —
74
+ # the queue-lag term is then treated as 0 in record_adaptive_observations.
75
+ def self.lookup_admitted_at(active_job_id)
76
+ result = ActiveRecord::Base.connection.exec_query(
77
+ "SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
78
+ "lookup_admitted_at",
79
+ [active_job_id]
80
+ )
81
+ row = result.first
82
+ return nil unless row
83
+ ts = row["admitted_at"]
84
+ ts.is_a?(Time) ? ts : Time.parse(ts.to_s)
85
+ rescue StandardError
86
+ nil
87
+ end
88
+
89
+ def self.record_adaptive_observations(policy:, gates:, partition_key:, admitted_at:, perform_start:, succeeded:)
90
+ return if gates.empty?
91
+
92
+ queue_lag_ms = if admitted_at
93
+ ((perform_start - admitted_at) * 1000).to_i
94
+ else
95
+ # No admitted_at means we can't measure queue wait. Treat as 0
96
+ # so the observation still increments sample_count and the
97
+ # cap can grow if everything else is healthy.
98
+ 0
99
+ end
100
+
101
+ gates.each do |gate|
102
+ gate.record_observation(
103
+ policy_name: policy.name,
104
+ partition_key: partition_key,
105
+ queue_lag_ms: queue_lag_ms,
106
+ succeeded: succeeded
107
+ )
108
+ rescue StandardError => e
109
+ DispatchPolicy.config.logger&.warn(
110
+ "[dispatch_policy] adaptive observation failed for #{policy.name}/#{partition_key}: #{e.class}: #{e.message}"
111
+ )
112
+ end
113
+ end
114
+
115
+ # ----- heartbeat thread -----
116
+
117
+ HEARTBEAT_KEY = :__dispatch_policy_heartbeat_token__
118
+
119
+ Heartbeat = Struct.new(:thread, :stop_flag)
120
+
121
+ def self.start_heartbeat(active_job_id)
122
+ interval = DispatchPolicy.config.inflight_heartbeat_interval.to_f
123
+ return nil if interval <= 0
124
+
125
+ stop_flag = Concurrent::AtomicBoolean.new(false) if defined?(Concurrent::AtomicBoolean)
126
+ stop_flag ||= ThreadSafeFlag.new
127
+
128
+ thread = Thread.new do
129
+ Thread.current.name = "dispatch_policy.heartbeat:#{active_job_id}"
130
+
131
+ until stop_flag.true?
132
+ # Sleep in small slices so stop is responsive without tight polling.
133
+ slept = 0.0
134
+ slice = [interval, 1.0].min
135
+ while slept < interval && !stop_flag.true?
136
+ sleep(slice)
137
+ slept += slice
138
+ end
139
+ break if stop_flag.true?
140
+
141
+ begin
142
+ ActiveRecord::Base.connection_pool.with_connection do
143
+ Repository.heartbeat_inflight!(active_job_id: active_job_id)
144
+ end
145
+ rescue StandardError => e
146
+ DispatchPolicy.config.logger&.warn("[dispatch_policy] heartbeat #{active_job_id} failed: #{e.class}: #{e.message}")
147
+ end
148
+ end
149
+ end
150
+
151
+ Heartbeat.new(thread, stop_flag)
152
+ end
153
+
154
+ def self.stop_heartbeat(heartbeat)
155
+ return if heartbeat.nil?
156
+
157
+ heartbeat.stop_flag.make_true
158
+ # Wake the thread out of any in-progress sleep so we don't wait the full slice.
159
+ heartbeat.thread.wakeup if heartbeat.thread.alive?
160
+ heartbeat.thread.join(1.0)
161
+ rescue StandardError
162
+ # Worst case: the thread is killed by GC; the inflight row gets a stale
163
+ # heartbeat_at and the sweeper will reclaim it after inflight_stale_after.
164
+ end
165
+
166
+ # Tiny fallback if concurrent-ruby isn't available (it's a Rails dep
167
+ # via active_support so it normally is).
168
+ class ThreadSafeFlag
169
+ def initialize; @mutex = Mutex.new; @value = false; end
170
+ def true?; @mutex.synchronize { @value }; end
171
+ def make_true; @mutex.synchronize { @value = true }; end
172
+ end
173
+ end
174
+ end
@@ -0,0 +1,155 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ # Hooks into ActiveJob::Base. Adds:
5
+ # - the `dispatch_policy :name do … end` class macro
6
+ # - an `around_enqueue` callback that stages jobs declaring a policy
7
+ # - a `perform_all_later` patch that handles bulk enqueue
8
+ module JobExtension
9
+ extend ActiveSupport::Concern
10
+
11
+ included do
12
+ class_attribute :dispatch_policy_name, instance_writer: false
13
+ end
14
+
15
+ class_methods do
16
+ def dispatch_policy(name, &block)
17
+ policy = PolicyDSL.build(name.to_s, &block)
18
+ DispatchPolicy.registry.register(policy, owner: self.name)
19
+ self.dispatch_policy_name = policy.name
20
+
21
+ around_enqueue do |job, block|
22
+ DispatchPolicy::JobExtension.around_enqueue_for(job, block)
23
+ end
24
+ end
25
+ end
26
+
27
+ # Called by the around_enqueue lambda. Public so it can be tested directly.
28
+ def self.around_enqueue_for(job, block)
29
+ return block.call if Bypass.active?
30
+ return block.call unless DispatchPolicy.config.enabled
31
+
32
+ policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
33
+ return block.call unless policy
34
+
35
+ if retry_attempt?(job) && policy.bypass_retries?
36
+ return block.call
37
+ end
38
+
39
+ # `klass.deserialize(payload)` (used elsewhere — see Forwarder, retries)
40
+ # only sets @serialized_arguments. ActiveJob defers the actual
41
+ # arguments deserialization to perform_now via the private
42
+ # deserialize_arguments_if_needed. If something deserializes a job
43
+ # and re-enqueues it without going through perform_now (e.g. a
44
+ # custom retry path), `job.arguments` would be []. Guard against
45
+ # that here so the context proc always sees the real args.
46
+ ensure_arguments_materialized!(job)
47
+
48
+ queue_name = job.queue_name&.to_s || policy.queue_name
49
+ ctx = policy.build_context(job.arguments, queue_name: queue_name)
50
+ partition_key = policy.partition_key_for(ctx)
51
+ shard = policy.shard_for(ctx)
52
+ payload = Serializer.serialize(job)
53
+
54
+ Repository.stage!(
55
+ policy_name: policy.name,
56
+ partition_key: partition_key,
57
+ queue_name: queue_name,
58
+ shard: shard,
59
+ job_class: job.class.name,
60
+ job_data: payload,
61
+ context: ctx.to_jsonb,
62
+ scheduled_at: scheduled_time(job),
63
+ priority: job.priority || 0
64
+ )
65
+
66
+ job.successfully_enqueued = true
67
+ false # halts the around_enqueue chain so the real adapter never sees the job
68
+ end
69
+
70
+ def self.retry_attempt?(job)
71
+ (job.respond_to?(:executions) ? job.executions.to_i : 0).positive?
72
+ end
73
+
74
+ def self.scheduled_time(job)
75
+ ts = job.scheduled_at
76
+ return nil if ts.nil?
77
+ return ts if ts.is_a?(Time)
78
+
79
+ Time.at(Float(ts))
80
+ rescue ArgumentError, TypeError
81
+ nil
82
+ end
83
+
84
+ # ActiveJob's `arguments` getter is a plain attr_accessor that returns
85
+ # the in-memory @arguments. After `klass.deserialize(payload)`, that
86
+ # array is empty until perform_now triggers
87
+ # `deserialize_arguments_if_needed` (a private method). Anywhere we
88
+ # read `job.arguments` outside of perform we must materialize first,
89
+ # or the context proc receives [] and falls back to its defaults.
90
+ def self.ensure_arguments_materialized!(job)
91
+ return unless job.respond_to?(:deserialize_arguments_if_needed, true)
92
+ job.send(:deserialize_arguments_if_needed)
93
+ end
94
+
95
+ # ---- perform_all_later support -------------------------------------------
96
+
97
+ # Rails 7.1+ exposes ActiveJob.perform_all_later. We override it to route
98
+ # jobs declaring a dispatch_policy through a single bulk INSERT, while
99
+ # delegating jobs without a policy to the original enqueue_all path.
100
+ module BulkEnqueue
101
+ def perform_all_later(*jobs)
102
+ flat = jobs.flatten
103
+ return super if flat.empty?
104
+ # Critical: respect Bypass exactly like the per-job around_enqueue
105
+ # does. Forwarder.dispatch deserializes admitted jobs and calls
106
+ # ActiveJob.perform_all_later under Bypass.with — without this
107
+ # check, BulkEnqueue would re-stage them, creating an infinite
108
+ # admission loop with the wrong context (job.arguments is still []
109
+ # at that point because ActiveJob defers deserialization).
110
+ return super if DispatchPolicy::Bypass.active?
111
+ return super unless DispatchPolicy.config.enabled
112
+ return super unless DispatchPolicy.registry.size.positive?
113
+
114
+ with_policy, without_policy = flat.partition do |j|
115
+ j.class.respond_to?(:dispatch_policy_name) && j.class.dispatch_policy_name
116
+ end
117
+
118
+ super(without_policy) if without_policy.any?
119
+
120
+ return nil if with_policy.empty?
121
+
122
+ rows = with_policy.filter_map do |job|
123
+ policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
124
+ next unless policy
125
+
126
+ # See JobExtension.ensure_arguments_materialized! — we need this
127
+ # for the same reason as the single-enqueue path.
128
+ JobExtension.ensure_arguments_materialized!(job)
129
+
130
+ queue_name = job.queue_name&.to_s || policy.queue_name
131
+ ctx = policy.build_context(job.arguments, queue_name: queue_name)
132
+ partition_key = policy.partition_key_for(ctx)
133
+ shard = policy.shard_for(ctx)
134
+ payload = Serializer.serialize(job)
135
+ job.successfully_enqueued = true
136
+
137
+ {
138
+ policy_name: policy.name,
139
+ partition_key: partition_key,
140
+ queue_name: queue_name,
141
+ shard: shard,
142
+ job_class: job.class.name,
143
+ job_data: payload,
144
+ context: ctx.to_jsonb,
145
+ scheduled_at: JobExtension.scheduled_time(job),
146
+ priority: job.priority || 0
147
+ }
148
+ end
149
+
150
+ Repository.stage_many!(rows) if rows.any?
151
+ nil # ActiveJob.perform_all_later contract returns nil
152
+ end
153
+ end
154
+ end
155
+ end
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ # Pure-Ruby hint generator. Takes a snapshot of live metrics and
5
+ # returns a list of {level:, message:} that the dashboard renders.
6
+ #
7
+ # Each predicate is intentionally conservative: hints fire on
8
+ # crossings the operator can fix from the UI or by toggling a config
9
+ # value, not on noise. Levels:
10
+ #
11
+ # :info — observation worth glancing at
12
+ # :warn — attention soon
13
+ # :critical — fix now
14
+ module OperatorHints
15
+ Hint = Struct.new(:level, :message, keyword_init: true)
16
+
17
+ module_function
18
+
19
+ # `metrics` is a hash of:
20
+ # tick_max_duration_ms: int (config tick_max_duration × 1000)
21
+ # avg_tick_ms: int
22
+ # max_tick_ms: int
23
+ # pending_total: int
24
+ # admitted_per_minute: int (last 1m)
25
+ # forward_failures: int (last 1m)
26
+ # jobs_admitted: int (last 1m, denominator for fail %)
27
+ # active_partitions: int
28
+ # never_checked: int
29
+ # in_backoff: int
30
+ # total_partitions: int
31
+ # adapter_target_jps: int|nil (config.adapter_throughput_target)
32
+ def for(metrics)
33
+ hints = []
34
+ m = metrics
35
+
36
+ # ---- tick approaching deadline ---------------------------------
37
+ if m[:tick_max_duration_ms].to_i.positive? && m[:avg_tick_ms].to_i.positive?
38
+ ratio = m[:avg_tick_ms].to_f / m[:tick_max_duration_ms]
39
+ if ratio >= 0.6
40
+ hints << Hint.new(
41
+ level: ratio >= 0.85 ? :critical : :warn,
42
+ message: "Avg tick is #{format('%.0f%%', ratio * 100)} of tick_max_duration. " \
43
+ "Lower admission_batch_size, set tick_admission_budget, or shard the policy."
44
+ )
45
+ end
46
+ end
47
+
48
+ # ---- backlog drain time ----------------------------------------
49
+ if m[:admitted_per_minute].to_i.positive? && m[:pending_total].to_i.positive?
50
+ drain_minutes = m[:pending_total].to_f / m[:admitted_per_minute]
51
+ if drain_minutes >= 30
52
+ level = drain_minutes >= 120 ? :warn : :info
53
+ hints << Hint.new(
54
+ level: level,
55
+ message: "At #{m[:admitted_per_minute]} admits/min, the current backlog of " \
56
+ "#{m[:pending_total]} pending would take ~#{drain_minutes.round} min " \
57
+ "to drain. Raise admission_batch_size, raise the gate's rate, or shard."
58
+ )
59
+ end
60
+ end
61
+
62
+ # ---- pending growing while admit rate is non-trivial -----------
63
+ # `pending_trend` compares head/tail thirds of the sparkline; a
64
+ # transient spike that already drained still leaves the tail
65
+ # average elevated. Gate on current pending > 0 so a recovered
66
+ # backlog does not raise a warning.
67
+ if m[:pending_trend] == :up &&
68
+ m[:admitted_per_minute].to_i.positive? &&
69
+ m[:pending_total].to_i.positive?
70
+ hints << Hint.new(
71
+ level: :warn,
72
+ message: "Pending is growing while we are admitting. Inflow > outflow — " \
73
+ "either the throttle rate is below the producer rate, or the worker pool can't keep up."
74
+ )
75
+ end
76
+
77
+ # ---- forward failure rate --------------------------------------
78
+ if m[:jobs_admitted].to_i.positive?
79
+ rate = m[:forward_failures].to_f / m[:jobs_admitted]
80
+ if rate >= 0.01
81
+ hints << Hint.new(
82
+ level: rate >= 0.05 ? :critical : :warn,
83
+ message: "Forward failures at #{format('%.1f%%', rate * 100)} (#{m[:forward_failures]} / " \
84
+ "#{m[:jobs_admitted]} admits). Inspect logs — adapter is rejecting enqueues."
85
+ )
86
+ end
87
+ end
88
+
89
+ # ---- never_checked > 0 with cardinality > batch ----------------
90
+ if m[:never_checked].to_i.positive?
91
+ hints << Hint.new(
92
+ level: :warn,
93
+ message: "#{m[:never_checked]} active partitions have never been checked. " \
94
+ "The tick is not getting through them — increase partition_batch_size or shard."
95
+ )
96
+ end
97
+
98
+ # ---- partition cardinality -------------------------------------
99
+ if m[:total_partitions].to_i >= 50_000
100
+ hints << Hint.new(
101
+ level: :info,
102
+ message: "#{m[:total_partitions]} partitions in DB. claim_partitions starts to feel " \
103
+ "this around 50k–100k. Consider lowering partition_inactive_after to GC " \
104
+ "drained ones earlier."
105
+ )
106
+ end
107
+
108
+ # ---- adapter ceiling proximity --------------------------------
109
+ target_jps = m[:adapter_target_jps].to_i
110
+ if target_jps.positive? && m[:admitted_per_minute].to_i.positive?
111
+ current_jps = m[:admitted_per_minute] / 60.0
112
+ ratio = current_jps / target_jps
113
+ if ratio >= 0.7
114
+ hints << Hint.new(
115
+ level: ratio >= 0.95 ? :critical : :warn,
116
+ message: "Admitting #{format('%.0f', current_jps)} jobs/sec, " \
117
+ "#{format('%.0f%%', ratio * 100)} of the configured adapter ceiling " \
118
+ "(#{target_jps}/sec). Consider an additional shard before the next traffic spike."
119
+ )
120
+ end
121
+ end
122
+
123
+ hints
124
+ end
125
+ end
126
+ end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ # Composes a sequence of gates into a single admission decision for one
5
+ # partition. Returns a value object describing how many jobs may be
6
+ # admitted right now and which gate-state patches to persist.
7
+ class Pipeline
8
+ Result = Struct.new(:admit_count, :retry_after, :gate_state_patch, :reasons, keyword_init: true)
9
+
10
+ def initialize(policy)
11
+ @policy = policy
12
+ end
13
+
14
+ def call(ctx, partition, max_budget)
15
+ budget = max_budget
16
+ retry_after = nil
17
+ patch = {}
18
+ reasons = []
19
+ decisions = []
20
+
21
+ @policy.gates.each do |gate|
22
+ decision = gate.evaluate(ctx, partition, budget)
23
+ decisions << [gate, decision]
24
+ budget = decision.allowed.finite? ? [budget, decision.allowed].min : budget
25
+ if decision.retry_after
26
+ retry_after = retry_after.nil? ? decision.retry_after : [retry_after, decision.retry_after].min
27
+ end
28
+ reasons << "#{gate.name}:#{decision.reason}" if decision.reason
29
+ break if budget.zero?
30
+ end
31
+
32
+ admit_count = budget.finite? ? budget : max_budget
33
+ admit_count = 0 if admit_count.negative?
34
+
35
+ decisions.each do |_, decision|
36
+ next unless decision.gate_state_patch
37
+ patch.merge!(decision.gate_state_patch)
38
+ end
39
+
40
+ Result.new(
41
+ admit_count: admit_count,
42
+ retry_after: retry_after,
43
+ gate_state_patch: patch,
44
+ reasons: reasons
45
+ )
46
+ end
47
+ end
48
+ end