dispatch_policy 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/MIT-LICENSE +16 -17
  3. data/README.md +449 -288
  4. data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
  5. data/app/controllers/dispatch_policy/application_controller.rb +45 -1
  6. data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
  7. data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
  8. data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
  9. data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
  10. data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
  11. data/app/models/dispatch_policy/inflight_job.rb +12 -0
  12. data/app/models/dispatch_policy/partition.rb +21 -0
  13. data/app/models/dispatch_policy/staged_job.rb +4 -97
  14. data/app/models/dispatch_policy/tick_sample.rb +11 -0
  15. data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
  16. data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
  17. data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
  18. data/app/views/dispatch_policy/policies/index.html.erb +15 -37
  19. data/app/views/dispatch_policy/policies/show.html.erb +140 -216
  20. data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
  21. data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
  22. data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
  23. data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
  24. data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
  25. data/config/routes.rb +18 -2
  26. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
  27. data/lib/dispatch_policy/bypass.rb +23 -0
  28. data/lib/dispatch_policy/config.rb +85 -0
  29. data/lib/dispatch_policy/context.rb +50 -0
  30. data/lib/dispatch_policy/cursor_pagination.rb +121 -0
  31. data/lib/dispatch_policy/decision.rb +22 -0
  32. data/lib/dispatch_policy/engine.rb +4 -27
  33. data/lib/dispatch_policy/forwarder.rb +63 -0
  34. data/lib/dispatch_policy/gate.rb +10 -38
  35. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
  36. data/lib/dispatch_policy/gates/concurrency.rb +45 -26
  37. data/lib/dispatch_policy/gates/throttle.rb +65 -37
  38. data/lib/dispatch_policy/inflight_tracker.rb +174 -0
  39. data/lib/dispatch_policy/job_extension.rb +155 -0
  40. data/lib/dispatch_policy/operator_hints.rb +126 -0
  41. data/lib/dispatch_policy/pipeline.rb +48 -0
  42. data/lib/dispatch_policy/policy.rb +62 -47
  43. data/lib/dispatch_policy/policy_dsl.rb +120 -0
  44. data/lib/dispatch_policy/railtie.rb +35 -0
  45. data/lib/dispatch_policy/registry.rb +46 -0
  46. data/lib/dispatch_policy/repository.rb +723 -0
  47. data/lib/dispatch_policy/serializer.rb +36 -0
  48. data/lib/dispatch_policy/tick.rb +263 -172
  49. data/lib/dispatch_policy/tick_loop.rb +59 -26
  50. data/lib/dispatch_policy/version.rb +1 -1
  51. data/lib/dispatch_policy.rb +71 -46
  52. data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
  53. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
  54. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
  55. data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
  56. metadata +101 -43
  57. data/CHANGELOG.md +0 -12
  58. data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
  59. data/app/models/dispatch_policy/partition_observation.rb +0 -49
  60. data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
  61. data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
  62. data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
  63. data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
  64. data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
  65. data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
  66. data/lib/dispatch_policy/dispatch_context.rb +0 -53
  67. data/lib/dispatch_policy/dispatchable.rb +0 -120
  68. data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
  69. data/lib/dispatch_policy/gates/global_cap.rb +0 -26
  70. data/lib/dispatch_policy/install_generator.rb +0 -23
data/lib/dispatch_policy/serializer.rb
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+require "json"
+
+module DispatchPolicy
+  module Serializer
+    module_function
+
+    # Serialize an ActiveJob instance for storage in staged_jobs.job_data.
+    # Returns a Ruby hash compatible with PostgreSQL jsonb (string keys).
+    def serialize(job)
+      job.serialize
+    end
+
+    # Deserialize stored job_data into a fresh ActiveJob instance ready
+    # to be enqueued via `#enqueue`.
+    def deserialize(payload)
+      job_class = payload["job_class"] || payload[:job_class]
+      raise InvalidPolicy, "missing job_class in stored payload" unless job_class
+
+      klass = job_class.constantize
+      klass.deserialize(payload)
+    end
+
+    def dump_jsonb(value)
+      JSON.dump(value)
+    end
+
+    def load_jsonb(text)
+      return text if text.is_a?(Hash) || text.is_a?(Array)
+      return {} if text.nil? || text == ""
+
+      JSON.parse(text)
+    end
+  end
+end
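Serializer leans entirely on ActiveJob's own serialization contract, so payloads round-trip unchanged. A minimal usage sketch; OrderSyncJob is a hypothetical job class, not part of this gem:

    # Hypothetical usage sketch (OrderSyncJob is illustrative only).
    job = OrderSyncJob.new("account-42")

    payload = DispatchPolicy::Serializer.serialize(job)
    # => {"job_class"=>"OrderSyncJob", "job_id"=>"...", "arguments"=>["account-42"], ...}
    # Stored as-is in staged_jobs.job_data (jsonb, string keys).

    restored = DispatchPolicy::Serializer.deserialize(payload)
    restored.enqueue # hands the job back to the real ActiveJob adapter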
data/lib/dispatch_policy/tick.rb
@@ -1,214 +1,305 @@
 # frozen_string_literal: true
 
 module DispatchPolicy
+  # One pass of admission for a single policy.
+  #
+  # Records a row in dispatch_policy_tick_samples at the end so the engine UI
+  # can show throughput, denial reasons, and tick duration without sampling
+  # on the read path.
   class Tick
-    THROTTLE_ZERO_THRESHOLD = 0.001
+    Result = Struct.new(:partitions_seen, :jobs_admitted, keyword_init: true)
 
-    # Single admission pass: fetch pending staged jobs per policy, evaluate
-    # gates, mark survivors as admitted, then enqueue them on the real
-    # backend outside the locking transaction.
-    def self.run(policy_name: nil)
-      return 0 unless DispatchPolicy.enabled?
+    def self.run(policy_name:, shard: nil)
+      new(policy_name, shard: shard).call
+    end
 
-      pending_enqueue = []
+    def initialize(policy_name, shard: nil)
+      @policy_name = policy_name
+      @shard = shard
+      @policy = DispatchPolicy.registry.fetch(policy_name) || raise(InvalidPolicy, "unknown policy #{policy_name.inspect}")
+      @config = DispatchPolicy.config
+    end
 
-      StagedJob.transaction do
-        active_policies(policy_name).each do |pname|
-          policy = lookup_policy(pname)
-          next unless policy
+    def call
+      started_at = monotonic_now_ms
+      partitions_seen = 0
+      partitions_admitted = 0
+      partitions_denied = 0
+      jobs_admitted = 0
+      forward_failures = 0
+      denied_reasons = Hash.new(0)
+
+      partitions = Repository.claim_partitions(
+        policy_name: @policy_name,
+        shard: @shard,
+        limit: @config.partition_batch_size
+      )
 
-          batch = fetch_batch(policy)
-          next if batch.empty?
+      # Reorder by least-recent-admit-weighted (EWMA decayed_admits ASC)
+      # so under-admitted partitions get first crack at the tick budget.
+      # claim_partitions ALREADY enforced anti-stagnation via
+      # last_checked_at — every partition with pending is visited within
+      # ⌈active_partitions / partition_batch_size⌉ ticks regardless of
+      # decayed_admits. Reordering here only decides order *inside* this
+      # already-fair selection.
+      sort_partitions_for_fairness!(partitions)
+
+      # Per-partition fair share. When tick_admission_budget is set, we
+      # divide it evenly across the partitions we just claimed. Otherwise
+      # the legacy admission_batch_size is the per-partition ceiling.
+      #
+      # We deliberately do NOT clamp fair_share to a minimum of 1 when
+      # tick_cap < N. The hard global cap wins over a per-partition
+      # admit floor; partitions that don't admit this tick are still
+      # visited (last_checked_at bumped) and re-visited next tick when
+      # they'll be at the front of the in-tick decay order.
+      # Anti-stagnation comes from claim_partitions, not from forcing
+      # an admit on every claimed partition.
+      tick_cap = @policy.tick_admission_budget || @config.tick_admission_budget
+      per_part = @policy.admission_batch_size || @config.admission_batch_size
+      fair_share = if tick_cap && partitions.any?
+        (tick_cap.to_f / partitions.size).ceil
+      else
+        per_part
+      end
+
+      pending_denies = []
+      admitted_per_partition = Hash.new(0)
+      used = 0
+
+      partitions.each do |partition|
+        partitions_seen += 1
+
+        if tick_cap && used >= tick_cap
+          # Global cap exhausted in pass-1. The partition is still
+          # observed (claim_partitions bumped its last_checked_at), so
+          # the round-robin invariant for anti-stagnation holds; we
+          # just admit nothing this tick.
+          partitions_denied += 1
+          denied_reasons["tick_cap_exhausted"] += 1
+          # Push this partition to the deny path so its gate state
+          # still gets persisted — the pipeline already evaluated it
+          # in admit_partition... actually we haven't called admit yet.
+          # Skip: not adding to pending_denies because the pipeline
+          # didn't run, no gate_state_patch to flush.
+          next
+        end
 
-          pending_enqueue.concat(run_policy(policy, batch))
+        budget_for_this = if tick_cap
+          [fair_share, tick_cap - used].min
+        else
+          fair_share
+        end
+        budget_for_this = 0 if budget_for_this.negative?
+
+        outcome = admit_partition(partition, pending_denies, max_budget: budget_for_this)
+        admitted_per_partition[partition["partition_key"]] = outcome[:admitted]
+
+        jobs_admitted += outcome[:admitted]
+        forward_failures += outcome[:failures]
+        used += outcome[:admitted]
+
+        if outcome[:admitted].positive?
+          partitions_admitted += 1
+        else
+          partitions_denied += 1
+          outcome[:reasons].each { |r| denied_reasons[r] += 1 }
         end
       end
 
-      admitted_count = 0
-      pending_enqueue.each do |staged, job|
-        begin
-          job.enqueue(_bypass_staging: true)
-          admitted_count += 1
-        rescue StandardError => e
-          Rails.logger&.error("[DispatchPolicy] enqueue failed staged=#{staged.id}: #{e.class}: #{e.message}")
-          revert_admission(staged)
+      # Pass-2: redistribution. Pass-1 may have left budget unused if
+      # some partitions had less pending than their fair share. Walk the
+      # claimed partitions (still in decay-sorted order) and offer the
+      # leftover to whoever filled their fair share in pass-1 — a signal
+      # they had more pending than we let them admit.
+      if tick_cap
+        remaining = tick_cap - used
+        if remaining.positive?
+          partitions.each do |p|
+            break if remaining <= 0
+            next if admitted_per_partition[p["partition_key"]] < fair_share
+
+            extra_cap = [remaining, fair_share].min
+            outcome = admit_partition(p, pending_denies, max_budget: extra_cap)
+            jobs_admitted += outcome[:admitted]
+            forward_failures += outcome[:failures]
+            admitted_per_partition[p["partition_key"]] += outcome[:admitted]
+            remaining -= outcome[:admitted]
+          end
         end
       end
 
-      admitted_count
-    end
+      flush_denies!(pending_denies) if pending_denies.any?
 
-    def self.prune_idle_partitions
-      ttl = DispatchPolicy.config.partition_idle_ttl
-      return if ttl.nil? || ttl <= 0
+      duration_ms = monotonic_now_ms - started_at
 
-      cutoff = Time.current - ttl
-      PartitionInflightCount.where(in_flight: 0).where("updated_at < ?", cutoff).delete_all
-      ThrottleBucket.where("tokens <= ? AND refilled_at < ?", THROTTLE_ZERO_THRESHOLD, cutoff).delete_all
-    end
+      record_sample!(
+        duration_ms: duration_ms,
+        partitions_seen: partitions_seen,
+        partitions_admitted: partitions_admitted,
+        partitions_denied: partitions_denied,
+        jobs_admitted: jobs_admitted,
+        forward_failures: forward_failures,
+        denied_reasons: denied_reasons
+      )
 
-    def self.prune_orphan_gate_rows
-      [ PartitionInflightCount, ThrottleBucket ].each do |model|
-        model.distinct.pluck(:policy_name, :gate_name).each do |policy_name, gate_name|
-          policy = lookup_policy(policy_name)
-          next if policy && policy.gates.any? { |g| g.name == gate_name.to_sym }
+      Result.new(partitions_seen: partitions_seen, jobs_admitted: jobs_admitted)
+    end
 
-          model.where(policy_name: policy_name, gate_name: gate_name).delete_all
-        end
+    private
+
+    # In-place sort by current decayed_admits ASC, computed in Ruby from
+    # the row's stored decayed_admits + the elapsed time since
+    # decayed_admits_at. We do this here (rather than in the SQL of
+    # claim_partitions) because:
+    #
+    # - claim_partitions's ORDER BY is anti-stagnation (last_checked_at
+    #   NULLS FIRST); reordering there would bias selection itself,
+    #   reintroducing the stagnation risk.
+    # - The math is cheap on N ≤ partition_batch_size rows already in
+    #   memory.
+    def sort_partitions_for_fairness!(partitions)
+      half_life = @policy.fairness_half_life_seconds || @config.fairness_half_life_seconds
+      return partitions if half_life.nil? || half_life <= 0
+
+      tau = half_life.to_f / Math.log(2)
+      now = Time.current.to_f
+
+      partitions.sort_by! do |p|
+        last_t = decayed_admits_epoch(p["decayed_admits_at"]) || now
+        elapsed = [now - last_t, 0.0].max
+        (p["decayed_admits"] || 0.0).to_f * Math.exp(-elapsed / tau)
       end
     end
 
-    def self.reap
-      StagedJob.expired_leases.find_each do |staged|
-        (staged.partitions || {}).each do |gate_name, partition_key|
-          policy = lookup_policy(staged.policy_name)
-          gate = policy&.gates&.find { |g| g.name == gate_name.to_sym }
-          next unless gate&.tracks_inflight?
-
-          PartitionInflightCount.decrement(
-            policy_name: staged.policy_name,
-            gate_name: gate_name.to_s,
-            partition_key: partition_key.to_s
-          )
-        end
-        staged.update!(lease_expires_at: nil, completed_at: Time.current)
-      end
+    def decayed_admits_epoch(value)
+      return nil if value.nil?
+      return value.to_f if value.is_a?(Numeric)
+      return value.to_time.to_f if value.respond_to?(:to_time)
+      Time.parse(value.to_s).to_f
+    rescue ArgumentError, TypeError
+      nil
     end
 
-    def self.release(policy_name:, partitions:)
-      partitions.each do |gate_name, partition_key|
-        policy = lookup_policy(policy_name)
-        gate = policy&.gates&.find { |g| g.name == gate_name.to_sym }
-        next unless gate&.tracks_inflight?
-
-        PartitionInflightCount.decrement(
-          policy_name: policy_name,
-          gate_name: gate_name.to_s,
-          partition_key: partition_key.to_s
-        )
+    def admit_partition(partition, pending_denies, max_budget:)
+      ctx = Context.wrap(partition["context"])
+      pipe = Pipeline.new(@policy)
+      result = pipe.call(ctx, partition, max_budget)
+
+      # Pure-deny path (gate said no capacity for this partition this tick).
+      # Defer the partition state UPDATE to the bulk flush at the end of
+      # the tick instead of issuing a per-partition statement now.
+      if result.admit_count.zero?
+        pending_denies << {
+          policy_name: @policy_name,
+          partition_key: partition["partition_key"],
+          gate_state_patch: result.gate_state_patch,
+          retry_after: result.retry_after
+        }
+        return { admitted: 0, failures: 0, reasons: deduce_reasons(result) }
      end
-    end
 
-    def self.active_policies(policy_name)
-      return [ policy_name ] if policy_name
+      admitted = 0
+      half_life = @policy.fairness_half_life_seconds || @config.fairness_half_life_seconds
+
+      Repository.with_connection do
+        ActiveRecord::Base.transaction(requires_new: true) do
+          rows = Repository.claim_staged_jobs!(
+            policy_name: @policy_name,
+            partition_key: partition["partition_key"],
+            limit: result.admit_count,
+            gate_state_patch: result.gate_state_patch,
+            retry_after: result.retry_after,
+            half_life_seconds: half_life
+          )
 
-      StagedJob.pending
-        .where("not_before_at IS NULL OR not_before_at <= ?", Time.current)
-        .distinct
-        .pluck(:policy_name)
-    end
+          # `claim_staged_jobs!` always runs `record_partition_admit!` so
+          # the partition's counters and gate_state commit even when the
+          # actual DELETE returned zero rows (e.g. all staged rows are
+          # scheduled in the future, or another tick raced us to them).
+          next if rows.empty?
+
+          # Pre-insert an inflight row per admitted job so the concurrency
+          # gate sees them immediately. With a concurrency gate, use its
+          # (coarser) partition key so the gate's COUNT(*) keeps aggregating
+          # correctly across staged sub-partitions.
+          concurrency_gate = @policy.gates.find { |g| g.name == :concurrency }
+          inflight_rows = rows.filter_map do |row|
+            ajid = row.dig("job_data", "job_id")
+            next unless ajid
+
+            key = if concurrency_gate
+              concurrency_gate.inflight_partition_key(@policy_name, Context.wrap(row["context"]))
+            else
+              row["partition_key"]
+            end
+            { policy_name: @policy_name, partition_key: key, active_job_id: ajid }
+          end
+          Repository.insert_inflight!(inflight_rows) if inflight_rows.any?
+
+          # Re-enqueue to the real adapter *inside this transaction*. The
+          # adapter (good_job / solid_queue) shares ActiveRecord::Base's
+          # connection, so its INSERT into good_jobs / solid_queue_jobs
+          # participates in the same TX. If anything raises (deserialize,
+          # adapter error, network), the whole TX rolls back atomically:
+          # staged_jobs return, inflight rows vanish, partition counters
+          # revert, and the adapter rows are also reverted. This is the
+          # at-least-once guarantee — there is no window where staged is
+          # gone but the adapter never received the job.
+          Forwarder.dispatch(rows)
+          admitted = rows.size
+        end
+      end
 
-    def self.fetch_batch(policy)
-      if policy.round_robin?
-        fetch_round_robin_batch(policy)
+      if admitted.zero?
+        { admitted: 0, failures: 0, reasons: ["no_rows_claimed"] }
       else
-        fetch_plain_batch(policy)
+        { admitted: admitted, failures: 0, reasons: [] }
      end
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error(
+        "[dispatch_policy] forward failed for #{@policy_name}/#{partition['partition_key']}: " \
+        "#{e.class}: #{e.message}"
+      )
+      { admitted: 0, failures: 1, reasons: ["forward_failed"] }
    end
 
-    def self.fetch_plain_batch(policy)
-      StagedJob.pending
-        .where(policy_name: policy.name)
-        .where("not_before_at IS NULL OR not_before_at <= ?", Time.current)
-        .order(:priority, :staged_at)
-        .limit(DispatchPolicy.config.batch_size)
-        .lock("FOR UPDATE SKIP LOCKED")
-        .to_a
-    end
-
-    def self.fetch_round_robin_batch(policy)
-      quantum = DispatchPolicy.config.round_robin_quantum
-      batch_size = DispatchPolicy.config.batch_size
-      now = Time.current
-
-      sql = <<~SQL.squish
-        SELECT rows.*
-        FROM (
-          SELECT DISTINCT round_robin_key
-          FROM dispatch_policy_staged_jobs
-          WHERE policy_name = ?
-            AND admitted_at IS NULL
-            AND round_robin_key IS NOT NULL
-            AND (not_before_at IS NULL OR not_before_at <= ?)
-        ) AS keys
-        CROSS JOIN LATERAL (
-          SELECT *
-          FROM dispatch_policy_staged_jobs
-          WHERE policy_name = ?
-            AND admitted_at IS NULL
-            AND round_robin_key = keys.round_robin_key
-            AND (not_before_at IS NULL OR not_before_at <= ?)
-          ORDER BY priority, staged_at
-          LIMIT ?
-          FOR UPDATE SKIP LOCKED
-        ) AS rows
-        LIMIT ?
-      SQL
-
-      batch = StagedJob.find_by_sql([ sql, policy.name, now, policy.name, now, quantum, batch_size ])
-
-      remaining = batch_size - batch.size
-      return batch if remaining <= 0
-
-      top_up = StagedJob.pending
-        .where(policy_name: policy.name)
-        .where("not_before_at IS NULL OR not_before_at <= ?", now)
-        .where.not(id: batch.map(&:id))
-        .order(:priority, :staged_at)
-        .limit(remaining)
-        .lock("FOR UPDATE SKIP LOCKED")
-        .to_a
-
-      batch + top_up
-    end
-
-    def self.lookup_policy(policy_name)
-      job_class = DispatchPolicy.registry[policy_name] || autoload_job_for(policy_name)
-      return nil unless job_class
-      job_class.resolved_dispatch_policy
-    end
-
-    def self.autoload_job_for(policy_name)
-      const_name = policy_name.tr("-", "/").camelize
-      const_name.safe_constantize
-      DispatchPolicy.registry[policy_name]
+    def flush_denies!(entries)
+      Repository.with_connection { Repository.bulk_record_partition_denies!(entries) }
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error(
+        "[dispatch_policy] bulk_record_partition_denies failed: #{e.class}: #{e.message}"
+      )
    end
 
-    def self.run_policy(policy, batch)
-      context = DispatchContext.new(policy: policy, batch: batch)
-      survivors = batch
-      policy.gates.each do |gate|
-        survivors = gate.filter(survivors, context)
+    # When admit_count was 0, the Pipeline's `reasons` array contains entries
+    # like "throttle:rate=0", "concurrency:concurrency_full". We strip the
+    # `gate:` prefix's value separator so callers see "throttle" / "concurrency_full".
+    def deduce_reasons(result)
+      reasons = result.reasons.map do |s|
+        gate, msg = s.split(":", 2)
+        msg.presence || gate
      end
+      reasons << "no_capacity" if reasons.empty?
+      reasons
+    end
 
-      survivors.map do |staged|
-        partitions = context.partitions_for(staged)
-
-        partitions.each do |gate_name, partition_key|
-          gate = policy.gates.find { |g| g.name == gate_name.to_sym }
-          next unless gate&.tracks_inflight?
-
-          PartitionInflightCount.increment(
-            policy_name: policy.name,
-            gate_name: gate_name.to_s,
-            partition_key: partition_key.to_s
-          )
-        end
+    def record_sample!(**fields)
+      pending_total = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
+      inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
 
-        job = staged.mark_admitted!(partitions: partitions)
-        [ staged, job ]
-      end
+      Repository.record_tick_sample!(
+        policy_name: @policy_name,
+        pending_total: pending_total,
+        inflight_total: inflight_total,
+        **fields
+      )
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.warn("[dispatch_policy] failed to record tick sample: #{e.class}: #{e.message}")
    end
 
-    def self.revert_admission(staged)
-      partitions = staged.partitions || {}
-      release(policy_name: staged.policy_name, partitions: partitions)
-      staged.update_columns(
-        admitted_at: nil,
-        lease_expires_at: nil,
-        active_job_id: nil,
-        partitions: {}
-      )
+    def monotonic_now_ms
+      (Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1000).to_i
    end
  end
 end
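To make the in-tick fairness ordering concrete, here is a small standalone sketch of the same decay formula sort_partitions_for_fairness! computes. The 60-second half-life and the admit counts are made-up numbers for illustration, not defaults taken from the gem:

    half_life = 60.0
    tau = half_life / Math.log(2)
    decay = ->(decayed_admits, elapsed_seconds) { decayed_admits * Math.exp(-elapsed_seconds / tau) }

    # A partition that admitted 10 jobs 5 seconds ago still scores high...
    decay.call(10.0, 5.0)   # => ~9.4
    # ...while one that admitted 10 jobs 5 minutes ago has mostly decayed,
    # so it sorts first (ascending) and gets the next slice of the tick budget.
    decay.call(10.0, 300.0) # => ~0.3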
data/lib/dispatch_policy/tick_loop.rb
@@ -1,45 +1,78 @@
 # frozen_string_literal: true
 
 module DispatchPolicy
-  # Shared driver for DispatchTickLoopJob and any foreground tick (e.g. a
-  # rake task). Loops Tick.reap + Tick.run with an interruptible sleep and
-  # bails when stop_when returns true.
-  class TickLoop
-    def self.run(policy_name: nil, sleep_for: nil, sleep_for_busy: nil, stop_when: -> { false })
-      idle_sleep = (sleep_for || DispatchPolicy.config.tick_sleep).to_f
-      busy_sleep = (sleep_for_busy || DispatchPolicy.config.tick_sleep_busy).to_f
+  # Drives admission until `stop_when` fires (deadline, shutdown signal, etc).
+  # Runs one Tick per policy per loop iteration; sleeps `idle_pause` when no
+  # jobs were admitted across all policies. Periodically (every
+  # `sweep_every_ticks` iterations) sweeps stale inflight rows and inactive
+  # partitions.
+  module TickLoop
+    module_function
+
+    # @param policy_name [String, nil] limit to one policy. nil = all registered.
+    # @param shard [String, nil] limit to one shard. nil = all shards.
+    def run(policy_name: nil, shard: nil, stop_when: -> { false })
+      config = DispatchPolicy.config
+      logger = config.logger
+      iteration = 0
 
      loop do
        break if stop_when.call
 
+        unless DispatchPolicy.config.enabled
+          # Master switch off: stop polling. The job that drives
+          # TickLoop.run will re-schedule itself; we exit cleanly so
+          # the next iteration sees the flag and stops too.
+          logger&.info("[dispatch_policy] TickLoop exiting because config.enabled = false")
+          break
+        end
+
+        names = policy_names(policy_name)
+        if names.empty?
+          sleep(config.idle_pause)
+          next
+        end
+
        admitted = 0
-        begin
-          ActiveRecord::Base.uncached do
-            Tick.reap
-            admitted = Tick.run(policy_name: policy_name).to_i
+        names.each do |name|
+          break if stop_when.call
+
+          begin
+            result = Tick.run(policy_name: name, shard: shard)
+            admitted += result.jobs_admitted
+          rescue StandardError => e
+            logger&.error("[dispatch_policy] tick error policy=#{name} shard=#{shard.inspect} #{e.class}: #{e.message}\n#{e.backtrace.first(10).join("\n")}")
          end
-        rescue StandardError => e
-          Rails.logger&.error("[DispatchPolicy] tick error: #{e.class}: #{e.message}")
-          Rails.error.report(e, handled: true) if defined?(Rails) && Rails.respond_to?(:error)
        end
 
-        break if stop_when.call
+        iteration += 1
+        if (iteration % config.sweep_every_ticks).zero?
+          sweep!
+        end
 
-        interruptible_sleep(admitted.positive? ? busy_sleep : idle_sleep, stop_when)
+        if admitted.zero?
+          sleep(config.idle_pause)
+        elsif config.busy_pause.to_f.positive?
+          sleep(config.busy_pause)
+        end
      end
    end
 
-    def self.interruptible_sleep(total, stop_when)
-      return unless total.positive?
-
-      remaining = total
-      step = 0.1
-      while remaining.positive?
-        break if stop_when.call
-        chunk = [ remaining, step ].min
-        sleep(chunk)
-        remaining -= chunk
+    def policy_names(filter)
+      if filter
+        [filter.to_s]
+      else
+        DispatchPolicy.registry.names
      end
    end
+
+    def sweep!
+      cfg = DispatchPolicy.config
+      Repository.sweep_stale_inflight!(cutoff_seconds: cfg.inflight_stale_after)
+      Repository.sweep_inactive_partitions!(cutoff_seconds: cfg.partition_inactive_after)
+      Repository.sweep_old_tick_samples!(cutoff_seconds: cfg.metrics_retention)
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error("[dispatch_policy] sweep error: #{e.class}: #{e.message}")
+    end
  end
 end
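The install generator also ships a dispatch_tick_loop_job.rb.tt template (see the file list above); its contents are not part of this diff, so the driver job below is only an illustrative sketch that matches the TickLoop.run signature shown here. Everything in it besides DispatchPolicy::TickLoop.run is assumed, not taken from the gem:

    class DispatchTickLoopJob < ApplicationJob
      queue_as :default

      def perform
        # Run admission ticks for roughly 5 minutes, then hand off to a fresh
        # job so a queue worker is always driving admission.
        deadline = Time.current + 5.minutes
        DispatchPolicy::TickLoop.run(stop_when: -> { Time.current >= deadline })
      ensure
        self.class.set(wait: 1.second).perform_later
      end
    end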
data/lib/dispatch_policy/version.rb
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module DispatchPolicy
-  VERSION = "0.1.0"
+  VERSION = "0.3.0"
 end