dispatch_policy 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/MIT-LICENSE +16 -17
  3. data/README.md +433 -388
  4. data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
  5. data/app/controllers/dispatch_policy/application_controller.rb +45 -1
  6. data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
  7. data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
  8. data/app/controllers/dispatch_policy/policies_controller.rb +94 -267
  9. data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
  10. data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
  11. data/app/models/dispatch_policy/inflight_job.rb +12 -0
  12. data/app/models/dispatch_policy/partition.rb +21 -0
  13. data/app/models/dispatch_policy/staged_job.rb +4 -97
  14. data/app/models/dispatch_policy/tick_sample.rb +11 -0
  15. data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
  16. data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
  17. data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
  18. data/app/views/dispatch_policy/policies/index.html.erb +15 -37
  19. data/app/views/dispatch_policy/policies/show.html.erb +139 -223
  20. data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
  21. data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
  22. data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
  23. data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
  24. data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
  25. data/config/routes.rb +18 -2
  26. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
  27. data/lib/dispatch_policy/bypass.rb +23 -0
  28. data/lib/dispatch_policy/config.rb +85 -0
  29. data/lib/dispatch_policy/context.rb +50 -0
  30. data/lib/dispatch_policy/cursor_pagination.rb +121 -0
  31. data/lib/dispatch_policy/decision.rb +22 -0
  32. data/lib/dispatch_policy/engine.rb +4 -27
  33. data/lib/dispatch_policy/forwarder.rb +63 -0
  34. data/lib/dispatch_policy/gate.rb +10 -38
  35. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
  36. data/lib/dispatch_policy/gates/concurrency.rb +45 -26
  37. data/lib/dispatch_policy/gates/throttle.rb +65 -41
  38. data/lib/dispatch_policy/inflight_tracker.rb +174 -0
  39. data/lib/dispatch_policy/job_extension.rb +155 -0
  40. data/lib/dispatch_policy/operator_hints.rb +126 -0
  41. data/lib/dispatch_policy/pipeline.rb +48 -0
  42. data/lib/dispatch_policy/policy.rb +61 -59
  43. data/lib/dispatch_policy/policy_dsl.rb +120 -0
  44. data/lib/dispatch_policy/railtie.rb +35 -0
  45. data/lib/dispatch_policy/registry.rb +46 -0
  46. data/lib/dispatch_policy/repository.rb +723 -0
  47. data/lib/dispatch_policy/serializer.rb +36 -0
  48. data/lib/dispatch_policy/tick.rb +260 -256
  49. data/lib/dispatch_policy/tick_loop.rb +59 -26
  50. data/lib/dispatch_policy/version.rb +1 -1
  51. data/lib/dispatch_policy.rb +71 -52
  52. data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
  53. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
  54. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
  55. data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
  56. metadata +101 -43
  57. data/CHANGELOG.md +0 -43
  58. data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
  59. data/app/models/dispatch_policy/partition_observation.rb +0 -76
  60. data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
  61. data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
  62. data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
  63. data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
  64. data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
  65. data/db/migrate/20260425000001_add_duration_to_partition_observations.rb +0 -8
  66. data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
  67. data/lib/dispatch_policy/dispatch_context.rb +0 -53
  68. data/lib/dispatch_policy/dispatchable.rb +0 -123
  69. data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
  70. data/lib/dispatch_policy/gates/global_cap.rb +0 -26
@@ -2,294 +2,121 @@
2
2
 
3
3
  module DispatchPolicy
4
4
  class PoliciesController < ApplicationController
5
- STALE_PENDING_THRESHOLD = 1.hour
6
- PARTITION_LIST_PAGE_SIZE = 25
5
+ before_action :find_policy, only: %i[show pause resume drain]
7
6
 
8
- before_action :load_policy, only: :show
7
+ DRAIN_MAX_PER_REQUEST = 10_000
9
8
 
10
9
  def index
11
- @policies = DispatchPolicy.registry.map do |name, job_class|
12
- scope = StagedJob.where(policy_name: name)
13
- pending = scope.pending
10
+ registry_names = DispatchPolicy.registry.names
11
+ db_names = Partition.distinct.pluck(:policy_name)
12
+ names = (registry_names + db_names).uniq.sort
13
+
14
+ in_flight_by_policy = InflightJob.where(policy_name: names).group(:policy_name).count
15
+
16
+ @rows = names.map do |name|
17
+ partitions = Partition.for_policy(name)
14
18
  {
15
- name: name,
16
- job_class: job_class,
17
- policy: job_class.resolved_dispatch_policy,
18
- pending_count: pending.count,
19
- admitted_count: scope.admitted.count,
20
- completed_24h: scope.completed.where(completed_at: 24.hours.ago..).count,
21
- oldest_pending: pending.minimum(:staged_at),
22
- stale_threshold: STALE_PENDING_THRESHOLD
19
+ name: name,
20
+ registered: registry_names.include?(name),
21
+ pending: partitions.sum(:pending_count),
22
+ in_flight: in_flight_by_policy[name] || 0,
23
+ partitions: partitions.count,
24
+ paused_count: partitions.paused.count
23
25
  }
24
- end.sort_by { |p| -p[:pending_count] }
25
-
26
- @active_partitions = PartitionInflightCount.where("in_flight > 0").count
27
- @expired_leases = StagedJob.expired_leases.count
26
+ end
28
27
  end
29
28
 
30
29
  def show
31
- scope = StagedJob.where(policy_name: @policy_name)
32
- @pending_count = scope.pending.count
33
- @pending_eligible_count = scope.pending.where("not_before_at IS NULL OR not_before_at <= ?", Time.current).count
34
- @pending_scheduled_count = @pending_count - @pending_eligible_count
35
- @admitted_count = scope.admitted.count
36
- @completed_24h = scope.completed.where(completed_at: 24.hours.ago..).count
37
-
38
- all_breakdown = partition_breakdown(scope)
39
-
40
- # "Watched" subset (passed via ?watch=a,b,c; the JS layer syncs it
41
- # with localStorage so the choice sticks across reloads).
42
- @watched_keys = (params[:watch] || "").split(",").map(&:strip).reject(&:empty?)
43
- @partition_breakdown = @watched_keys.any? ? all_breakdown.select { |r| @watched_keys.include?(r[:partition]) } : []
44
-
45
- # Browsable list of every active partition with filter + sort + pagination.
46
- @partition_search = params[:q].to_s.strip
47
- @partition_page = [ params[:page].to_i, 1 ].max
48
- @partition_sort = %w[source partition pending in_flight completed_24h last_enqueued_at last_dispatched_at].include?(params[:sort]) ? params[:sort] : "activity"
49
- @partition_dir = params[:dir] == "asc" ? "asc" : "desc"
50
-
51
- list = all_breakdown
52
- list = list.select { |r| r[:partition].to_s.downcase.include?(@partition_search.downcase) } if @partition_search.present?
53
- list = sort_partition_list(list, @partition_sort, @partition_dir)
54
-
55
- @partition_total_list = list.size
56
- offset = (@partition_page - 1) * PARTITION_LIST_PAGE_SIZE
57
- @partition_list = list[offset, PARTITION_LIST_PAGE_SIZE] || []
58
-
59
- load_adaptive_chart_data
60
- @throttle_buckets = ThrottleBucket
61
- .where(policy_name: @policy_name).order(:gate_name, :partition_key).limit(50)
62
- # Explicit select: don't load the `arguments` jsonb (job payload —
63
- # may contain PII / tokens) into memory just to render six fields.
64
- @pending_jobs = scope.pending
65
- .select(:id, :dedupe_key, :round_robin_key, :priority, :staged_at, :not_before_at)
66
- .order(:priority, :staged_at)
67
- .limit(50)
68
- end
69
-
70
- private
71
-
72
- def load_policy
73
- @policy_name = params[:policy_name]
74
- @job_class = DispatchPolicy.registry[@policy_name] ||
75
- Tick.autoload_job_for(@policy_name)
76
- raise ActiveRecord::RecordNotFound unless @job_class
77
- @policy = @job_class.resolved_dispatch_policy
78
- end
79
-
80
- # Per-(source, partition) breakdown of pending-eligible / pending-scheduled
81
- # / in-flight / completed-24h. A "source" is either a gate with a
82
- # partition_by (uses gate.partition_key_for(context)) or the policy's
83
- # round_robin_by declaration (uses the round_robin_key column directly).
84
- # All four counts come from StagedJob groupings; PartitionInflightCount
85
- # is an admission-time optimization, not the user-facing truth.
86
- def partition_breakdown(scope)
87
- sources = partition_sources
88
- return [] if sources.empty?
89
-
90
- now = Time.current
91
- now_iso = now.iso8601
92
- since_24h = 24.hours.ago.iso8601
93
- limit = DispatchPolicy.config.admin_partition_limit
94
- @partition_breakdown_truncated = false
95
-
96
- adaptive_stats = AdaptiveConcurrencyStats.where(policy_name: @policy_name)
97
- .order(updated_at: :desc)
98
- .limit(limit)
99
- .pluck(:gate_name, :partition_key, :current_max, :ewma_latency_ms)
100
- .each_with_object({}) { |(g, k, c, l), h|
101
- h[[ g, k ]] = { current_max: c, ewma_latency_ms: l.to_f.round(1) }
102
- }
103
-
104
- rows = Hash.new { |h, k|
105
- h[k] = {
106
- source: k[0],
107
- partition: k[1],
108
- eligible: 0,
109
- scheduled: 0,
110
- in_flight: 0,
111
- completed_24h: 0,
112
- last_enqueued_at: nil,
113
- last_dispatched_at: nil,
114
- current_max: nil,
115
- ewma_latency_ms: nil
116
- }
30
+ @policy_object = DispatchPolicy.registry.fetch(@policy_name)
31
+ @partitions = Partition.for_policy(@policy_name)
32
+ .order(Arel.sql("pending_count DESC, last_admit_at DESC NULLS LAST"))
33
+ .limit(100)
34
+ @top_admitted = Partition.for_policy(@policy_name)
35
+ .order(total_admitted: :desc)
36
+ .limit(20)
37
+
38
+ @totals = {
39
+ pending: Partition.for_policy(@policy_name).sum(:pending_count),
40
+ in_flight: InflightJob.where(policy_name: @policy_name).count,
41
+ partitions: Partition.for_policy(@policy_name).count
117
42
  }
118
43
 
119
- # Each aggregation below is order-by-count + limited so that a
120
- # policy with tens of thousands of distinct (context, round_robin_key)
121
- # tuples can't pull megabytes of rows into memory per request. We
122
- # show the top-N most-active partitions per axis and flip the
123
- # truncation flag for the view banner.
124
-
125
- # Activity timestamps bounded to the last 24h so the scan stays on
126
- # an index-friendly slice of staged_jobs.
127
- activity_rows = scope
128
- .where("staged_at > ?", since_24h)
129
- .group(:context, :round_robin_key)
130
- .order(Arel.sql("MAX(staged_at) DESC"))
131
- .limit(limit)
132
- .pluck(
133
- :context,
134
- :round_robin_key,
135
- Arel.sql("MAX(staged_at)"),
136
- Arel.sql("MAX(admitted_at)")
137
- )
138
- @partition_breakdown_truncated = true if activity_rows.size >= limit
139
-
140
- sources.each do |name, extract|
141
- pending_counts = scope.pending.group(:context, :round_robin_key)
142
- .order(Arel.sql("count(*) DESC"))
143
- .limit(limit)
144
- .pluck(
145
- :context,
146
- :round_robin_key,
147
- Arel.sql("count(*) filter (where not_before_at is null or not_before_at <= '#{now_iso}')"),
148
- Arel.sql("count(*) filter (where not_before_at > '#{now_iso}')")
149
- )
150
- @partition_breakdown_truncated = true if pending_counts.size >= limit
151
- pending_counts.each do |ctx, rr_key, eligible, scheduled|
152
- partition = extract.call(ctx, rr_key)
153
- row = rows[[ name, partition ]]
154
- row[:eligible] += eligible
155
- row[:scheduled] += scheduled
156
- end
157
-
158
- admitted_counts = scope.admitted.group(:context, :round_robin_key)
159
- .order(Arel.sql("count(*) DESC"))
160
- .limit(limit)
161
- .pluck(:context, :round_robin_key, Arel.sql("count(*)"))
162
- @partition_breakdown_truncated = true if admitted_counts.size >= limit
163
- admitted_counts.each do |ctx, rr_key, in_flight|
164
- partition = extract.call(ctx, rr_key)
165
- rows[[ name, partition ]][:in_flight] += in_flight
166
- end
167
-
168
- completed_counts = scope.completed.where("completed_at > ?", since_24h)
169
- .group(:context, :round_robin_key)
170
- .order(Arel.sql("count(*) DESC"))
171
- .limit(limit)
172
- .pluck(:context, :round_robin_key, Arel.sql("count(*)"))
173
- @partition_breakdown_truncated = true if completed_counts.size >= limit
174
- completed_counts.each do |ctx, rr_key, completed|
175
- partition = extract.call(ctx, rr_key)
176
- rows[[ name, partition ]][:completed_24h] += completed
177
- end
178
-
179
- activity_rows.each do |ctx, rr_key, last_staged, last_admitted|
180
- partition = extract.call(ctx, rr_key)
181
- row = rows[[ name, partition ]]
182
- row[:last_enqueued_at] = [ row[:last_enqueued_at], last_staged ].compact.max
183
- row[:last_dispatched_at] = [ row[:last_dispatched_at], last_admitted ].compact.max
184
- end
185
- end
186
-
187
- rows.each do |(source, partition), row|
188
- stats = adaptive_stats[[ source, partition ]]
189
- next unless stats
190
- row[:current_max] = stats[:current_max]
191
- row[:ewma_latency_ms] = stats[:ewma_latency_ms]
192
- end
193
-
194
- # Two different sources (say round_robin_by account_id + a gate
195
- # partitioned by account_id) producing the same partition key yield
196
- # identical counts — collapse them into one row with a merged source
197
- # label instead of listing the same numbers twice.
198
- merged = rows.values
199
- .reject { |r| r[:partition].nil? || r[:partition].empty? }
200
- .group_by { |r| [ r[:partition], r[:eligible], r[:scheduled], r[:in_flight], r[:completed_24h] ] }
201
- .map { |_, group|
202
- base = group.first.dup
203
- base[:source] = group.map { |r| r[:source] }.uniq.sort.join(" + ")
204
- group.each do |r|
205
- base[:current_max] ||= r[:current_max]
206
- base[:ewma_latency_ms] ||= r[:ewma_latency_ms]
207
- base[:last_enqueued_at] = [ base[:last_enqueued_at], r[:last_enqueued_at] ].compact.max
208
- base[:last_dispatched_at] = [ base[:last_dispatched_at], r[:last_dispatched_at] ].compact.max
209
- end
210
- base
211
- }
212
-
213
- merged.sort_by { |r|
214
- [ -(r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h]), r[:source], r[:partition] ]
44
+ now = Time.current
45
+ @windows = {
46
+ "1m" => Repository.tick_summary(policy_name: @policy_name, since: now - 60),
47
+ "5m" => Repository.tick_summary(policy_name: @policy_name, since: now - 5 * 60),
48
+ "15m" => Repository.tick_summary(policy_name: @policy_name, since: now - 15 * 60)
49
+ }
50
+ @denied_reasons = Repository.denied_reasons_summary(policy_name: @policy_name, since: now - 15 * 60)
51
+ @round_trip = Repository.partition_round_trip_stats(policy_name: @policy_name)
52
+ @sparkline = Repository.tick_samples_buckets(policy_name: @policy_name, since: now - 30 * 60, bucket_seconds: 60)
53
+ @pending_trend = Repository.trend_direction(@sparkline.map { |b| b[:pending_total] })
54
+
55
+ cfg = DispatchPolicy.config
56
+ @capacity = {
57
+ admitted_per_minute: @windows["1m"][:jobs_admitted],
58
+ adapter_target_jps: cfg.adapter_throughput_target,
59
+ avg_tick_ms: @windows["1m"][:avg_duration_ms],
60
+ max_tick_ms: @windows["1m"][:max_duration_ms],
61
+ tick_max_duration_ms: cfg.tick_max_duration.to_i * 1000
215
62
  }
216
- end
217
63
 
218
- def sort_partition_list(list, sort, dir)
219
- # Put nulls at the bottom regardless of direction (Time#to_f on nil
220
- # would crash; -Float::INFINITY sorts first, +Float::INFINITY last).
221
- key =
222
- case sort
223
- when "source" then ->(r) { [ r[:source], r[:partition] ] }
224
- when "partition" then ->(r) { r[:partition] }
225
- when "pending" then ->(r) { r[:eligible] + r[:scheduled] }
226
- when "in_flight" then ->(r) { r[:in_flight] }
227
- when "completed_24h" then ->(r) { r[:completed_24h] }
228
- when "last_enqueued_at" then ->(r) { r[:last_enqueued_at]&.to_f || 0 }
229
- when "last_dispatched_at" then ->(r) { r[:last_dispatched_at]&.to_f || 0 }
230
- else ->(r) { r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h] }
231
- end
232
- sorted = list.sort_by(&key)
233
- dir == "asc" ? sorted : sorted.reverse
64
+ @hints = OperatorHints.for(
65
+ tick_max_duration_ms: @capacity[:tick_max_duration_ms],
66
+ avg_tick_ms: @capacity[:avg_tick_ms],
67
+ max_tick_ms: @capacity[:max_tick_ms],
68
+ pending_total: @totals[:pending],
69
+ admitted_per_minute: @capacity[:admitted_per_minute],
70
+ forward_failures: @windows["1m"][:forward_failures],
71
+ jobs_admitted: @windows["1m"][:jobs_admitted],
72
+ active_partitions: @round_trip[:active_partitions],
73
+ never_checked: @round_trip[:never_checked],
74
+ in_backoff: @round_trip[:in_backoff],
75
+ total_partitions: @totals[:partitions],
76
+ adapter_target_jps: @capacity[:adapter_target_jps],
77
+ pending_trend: @pending_trend
78
+ )
234
79
  end
235
80
 
236
- # Returns [[source_name, ->(ctx, rr_key) { partition_key }], ...]
237
- # covering every partition-producing declaration on the policy: every
238
- # gate with a partition_by, plus round_robin_by if declared.
239
- def partition_sources
240
- return [] unless @policy
81
+ def pause
82
+ Partition.for_policy(@policy_name).update_all(status: "paused", updated_at: Time.current)
83
+ redirect_to policy_path(@policy_name), notice: "Policy paused."
84
+ end
241
85
 
242
- sources = @policy.gates.select(&:partition_by).map do |gate|
243
- [ gate.name.to_s, ->(ctx, _rr) { gate.partition_key_for((ctx || {}).symbolize_keys) } ]
244
- end
245
- sources << [ "round_robin_by", ->(_ctx, rr) { rr } ] if @policy.round_robin?
246
- sources
86
+ def resume
87
+ Partition.for_policy(@policy_name).update_all(status: "active", updated_at: Time.current)
88
+ redirect_to policy_path(@policy_name), notice: "Policy resumed."
247
89
  end
248
90
 
249
- # Build chart data from PartitionObservation. Two queries:
250
- # - Global aggregated (one row per minute): cheap even with 1000s of
251
- # partitions because we SUM/AVG in SQL, not in Ruby.
252
- # - Per-partition sparkline data, scoped to only the partitions we're
253
- # going to actually render (breakdown's top N).
254
- def load_adaptive_chart_data
255
- last_minute = Time.current.utc.beginning_of_minute
256
- @chart_slots = (0..59).map { |i| last_minute - (59 - i).minutes }
257
- @chart_labels = @chart_slots.map { |t| t.strftime("%H:%M") }
258
- slot_index = @chart_slots.each_with_index.to_h
91
+ # Force-admits every staged job across every partition of the policy,
92
+ # bypassing all gates. Walks partitions in pending-DESC order so the
93
+ # busiest ones drain first. Bounded at DRAIN_MAX_PER_REQUEST per click.
94
+ def drain
95
+ drained = 0
96
+ Partition.for_policy(@policy_name)
97
+ .where("pending_count > 0")
98
+ .order(pending_count: :desc, id: :asc)
99
+ .limit(500)
100
+ .each do |partition|
101
+ break if drained >= DRAIN_MAX_PER_REQUEST
102
+
103
+ batch, _ = PartitionsController.drain_partition!(partition)
104
+ drained += batch
105
+ end
259
106
 
260
- @adaptive_global = Array.new(@chart_slots.size)
261
- @completions_global = Array.new(@chart_slots.size, 0)
262
- global_rows = PartitionObservation
263
- .where(policy_name: @policy_name)
264
- .where("minute_bucket >= ?", @chart_slots.first)
265
- .group(:minute_bucket)
266
- .pluck(:minute_bucket, Arel.sql("SUM(total_lag_ms)"), Arel.sql("SUM(observation_count)"))
267
- global_rows.each do |bucket, total_lag, obs_count|
268
- idx = slot_index[bucket.utc.beginning_of_minute]
269
- next unless idx
270
- @completions_global[idx] = obs_count
271
- @adaptive_global[idx] = obs_count.positive? ? (total_lag.to_f / obs_count).round(1) : nil
107
+ remaining = Partition.for_policy(@policy_name).sum(:pending_count)
108
+ notice = if remaining.positive?
109
+ "Drained #{drained} job(s) across this policy; #{remaining} still pending — click drain again to continue."
110
+ else
111
+ "Drained #{drained} job(s); policy fully drained."
272
112
  end
113
+ redirect_to policy_path(@policy_name), notice: notice
114
+ end
273
115
 
274
- partition_keys = (@partition_breakdown || []).map { |r| r[:partition] }.uniq
275
- @adaptive_samples = {}
276
- @completions_samples = {}
277
- return if partition_keys.empty?
116
+ private
278
117
 
279
- per_partition_lag = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size) }
280
- per_partition_counts = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size, 0) }
281
- rows = PartitionObservation
282
- .where(policy_name: @policy_name, partition_key: partition_keys)
283
- .where("minute_bucket >= ?", @chart_slots.first)
284
- .pluck(:partition_key, :minute_bucket, :total_lag_ms, :observation_count)
285
- rows.each do |pk, bucket, total, count|
286
- idx = slot_index[bucket.utc.beginning_of_minute]
287
- next unless idx
288
- per_partition_lag[pk][idx] = count.positive? ? (total.to_f / count).round(1) : nil
289
- per_partition_counts[pk][idx] = count
290
- end
291
- @adaptive_samples = per_partition_lag
292
- @completions_samples = per_partition_counts
118
+ def find_policy
119
+ @policy_name = params[:name]
293
120
  end
294
121
  end
295
122
  end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class StagedJobsController < ApplicationController
5
+ def show
6
+ @job = StagedJob.find(params[:id])
7
+ end
8
+ end
9
+ end
@@ -1,89 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DispatchPolicy
4
+ # One row per (policy_name, partition_key) for partitions whose policy
5
+ # declares an `:adaptive_concurrency` gate. Holds the AIMD-tuned
6
+ # `current_max` plus the EWMA of recent queue-lag observations the cap
7
+ # adapts on.
8
+ #
9
+ # Read by `Gates::AdaptiveConcurrency#evaluate` to learn how many jobs
10
+ # this partition may admit right now. Written atomically by
11
+ # `Repository.adaptive_record!` from `InflightTracker.track`'s ensure
12
+ # block after each perform — the EWMA + AIMD update lives in a single
13
+ # SQL statement so concurrent workers can't race on read-modify-write.
4
14
  class AdaptiveConcurrencyStats < ApplicationRecord
5
15
  self.table_name = "dispatch_policy_adaptive_concurrency_stats"
6
16
 
7
- # Seed a stats row if one doesn't exist yet. Mirrors ThrottleBucket.lock.
8
- def self.seed!(policy_name:, gate_name:, partition_key:, initial_max:)
9
- now = Time.current
10
- sql = <<~SQL.squish
11
- INSERT INTO #{quoted_table_name}
12
- (policy_name, gate_name, partition_key, current_max,
13
- ewma_latency_ms, sample_count, created_at, updated_at)
14
- VALUES (?, ?, ?, ?, 0, 0, ?, ?)
15
- ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
16
- SQL
17
- connection.exec_update(
18
- sanitize_sql_array([
19
- sql, policy_name, gate_name.to_s, partition_key.to_s,
20
- initial_max.to_i, now, now
21
- ])
22
- )
23
- end
24
-
25
- def self.fetch_many(policy_name:, gate_name:, partition_keys:)
26
- return {} if partition_keys.empty?
27
- where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
28
- .pluck(:partition_key, :current_max, :ewma_latency_ms)
29
- .each_with_object({}) { |(k, c, l), h| h[k] = { current_max: c, ewma_latency_ms: l } }
30
- end
31
-
32
- # Single-statement EWMA + AIMD update so concurrent performs can't race
33
- # on read-modify-write. Seed first (INSERT ON CONFLICT DO NOTHING), then
34
- # apply the adjustment.
35
- def self.record_observation!(
36
- policy_name:, gate_name:, partition_key:,
37
- queue_lag_ms:, succeeded:,
38
- alpha:, min:, target_lag_ms:,
39
- fail_factor:, slow_factor:, initial_max:
40
- )
41
- seed!(
42
- policy_name: policy_name,
43
- gate_name: gate_name,
44
- partition_key: partition_key,
45
- initial_max: initial_max
46
- )
47
-
48
- # Feedback signal is queue_lag (admitted_at → perform_start). When
49
- # the adapter queue is empty, lag ≈ 0 → +1 grow. When the queue
50
- # backs up, lag rises past target → multiplicative shrink. Failures
51
- # shrink harder. Only `min` is enforced so a partition can't lock
52
- # out entirely.
53
- sql = <<~SQL.squish
54
- UPDATE #{quoted_table_name}
55
- SET
56
- ewma_latency_ms = ewma_latency_ms * (1 - ?) + ? * ?,
57
- sample_count = sample_count + 1,
58
- current_max = GREATEST(?, CASE
59
- WHEN ? = FALSE THEN FLOOR(current_max * ?)::int
60
- WHEN (ewma_latency_ms * (1 - ?) + ? * ?) > ? THEN FLOOR(current_max * ?)::int
61
- ELSE current_max + 1
62
- END),
63
- last_observed_at = ?,
64
- updated_at = ?
65
- WHERE policy_name = ? AND gate_name = ? AND partition_key = ?
66
- SQL
67
-
68
- now = Time.current
69
- connection.exec_update(
70
- sanitize_sql_array([
71
- sql,
72
- alpha, alpha, queue_lag_ms,
73
- min.to_i,
74
- succeeded, fail_factor,
75
- alpha, alpha, queue_lag_ms, target_lag_ms, slow_factor,
76
- now, now,
77
- policy_name, gate_name.to_s, partition_key.to_s
78
- ])
79
- )
80
- end
81
-
82
- # Quick lookup used by Dispatchable to denormalize current_max into
83
- # the generic partition observation row.
84
- def self.current_max_for(policy_name:, partition_key:)
85
- where(policy_name: policy_name, partition_key: partition_key.to_s)
86
- .limit(1).pick(:current_max)
87
- end
17
+ scope :for_policy, ->(name) { where(policy_name: name) }
88
18
  end
89
19
  end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class InflightJob < ApplicationRecord
5
+ self.table_name = "dispatch_policy_inflight_jobs"
6
+
7
+ scope :for_partition, ->(policy_name, partition_key) {
8
+ where(policy_name: policy_name, partition_key: partition_key)
9
+ }
10
+ scope :stale, ->(cutoff) { where("heartbeat_at < ?", cutoff) }
11
+ end
12
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class Partition < ApplicationRecord
5
+ self.table_name = "dispatch_policy_partitions"
6
+
7
+ scope :for_policy, ->(name) { where(policy_name: name) }
8
+ scope :for_shard, ->(s) { s ? where(shard: s) : all }
9
+ scope :active, -> { where(status: "active") }
10
+ scope :paused, -> { where(status: "paused") }
11
+ scope :pending, -> { where("pending_count > 0") }
12
+ scope :stale_inactive, ->(cutoff) {
13
+ where("pending_count = 0 AND in_flight_count = 0")
14
+ .where("last_admit_at < ? OR (last_admit_at IS NULL AND created_at < ?)", cutoff, cutoff)
15
+ }
16
+
17
+ def paused?
18
+ status == "paused"
19
+ end
20
+ end
21
+ end
@@ -4,102 +4,9 @@ module DispatchPolicy
4
4
  class StagedJob < ApplicationRecord
5
5
  self.table_name = "dispatch_policy_staged_jobs"
6
6
 
7
- scope :pending, -> { where(admitted_at: nil, completed_at: nil) }
8
- scope :admitted, -> { where.not(admitted_at: nil).where(completed_at: nil) }
9
- scope :completed, -> { where.not(completed_at: nil) }
10
- scope :active, -> { where(completed_at: nil) }
11
- scope :expired_leases, -> {
12
- admitted.where("lease_expires_at IS NOT NULL AND lease_expires_at < ?", Time.current)
13
- }
14
-
15
- # Merge the job's ActiveJob metadata (queue_name, priority) into the
16
- # context hash so gate lambdas can partition_by :queue_name without
17
- # the user having to pass it as a kwarg. User-provided keys win.
18
- def self.context_for(job_instance, policy)
19
- built = policy.context_builder.call(job_instance.arguments)
20
- return built unless built.is_a?(Hash)
21
- {
22
- queue_name: job_instance.queue_name,
23
- priority: job_instance.priority
24
- }.merge(built.symbolize_keys)
25
- end
26
-
27
- # Stages a job in the admission queue. Returns the created row, or nil if
28
- # the policy declares a dedupe_key and an active row already exists.
29
- def self.stage!(job_instance:, policy:)
30
- dedupe_key = policy.build_dedupe_key(job_instance.arguments)
31
-
32
- if dedupe_key && exists?(policy_name: policy.name, dedupe_key: dedupe_key, completed_at: nil)
33
- return nil
34
- end
35
-
36
- create!(
37
- job_class: job_instance.class.name,
38
- policy_name: policy.name,
39
- arguments: job_instance.serialize,
40
- snapshot: policy.build_snapshot(job_instance.arguments),
41
- context: context_for(job_instance, policy),
42
- priority: job_instance.priority || 100,
43
- not_before_at: job_instance.scheduled_at,
44
- staged_at: Time.current,
45
- dedupe_key: dedupe_key,
46
- round_robin_key: policy.build_round_robin_key(job_instance.arguments)
47
- )
48
- rescue ActiveRecord::RecordNotUnique
49
- nil
50
- end
51
-
52
- # Batch-insert variant of stage!.
53
- def self.stage_many!(policy:, jobs:)
54
- return 0 if jobs.empty?
55
-
56
- now = Time.current
57
- rows = jobs.map do |job_instance|
58
- {
59
- job_class: job_instance.class.name,
60
- policy_name: policy.name,
61
- arguments: job_instance.serialize,
62
- snapshot: policy.build_snapshot(job_instance.arguments),
63
- context: context_for(job_instance, policy),
64
- priority: job_instance.priority || 100,
65
- not_before_at: job_instance.scheduled_at,
66
- staged_at: now,
67
- dedupe_key: policy.build_dedupe_key(job_instance.arguments),
68
- round_robin_key: policy.build_round_robin_key(job_instance.arguments),
69
- partitions: {},
70
- created_at: now,
71
- updated_at: now
72
- }
73
- end
74
-
75
- result = insert_all(rows, unique_by: :idx_dp_staged_dedupe_active)
76
- result.rows.size
77
- end
78
-
79
- def self.mark_completed_by_active_job_id(active_job_id)
80
- return 0 if active_job_id.blank?
81
- where(active_job_id: active_job_id, completed_at: nil)
82
- .update_all(completed_at: Time.current, lease_expires_at: nil)
83
- end
84
-
85
- def mark_admitted!(partitions:)
86
- now = Time.current
87
- job = instantiate_active_job
88
- job._dispatch_partitions = partitions
89
- job._dispatch_admitted_at = now
90
-
91
- update!(
92
- admitted_at: now,
93
- lease_expires_at: now + DispatchPolicy.config.lease_duration,
94
- active_job_id: job.job_id,
95
- partitions: partitions
96
- )
97
-
98
- job
99
- end
100
-
101
- def instantiate_active_job
102
- ActiveJob::Base.deserialize(arguments)
103
- end
7
+ scope :for_policy, ->(name) { where(policy_name: name) }
8
+ scope :for_partition, ->(name, key) { where(policy_name: name, partition_key: key) }
9
+ scope :due, -> { where("scheduled_at IS NULL OR scheduled_at <= now()") }
10
+ scope :recent, -> { order(enqueued_at: :desc) }
104
11
  end
105
12
  end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class TickSample < ApplicationRecord
5
+ self.table_name = "dispatch_policy_tick_samples"
6
+
7
+ scope :for_policy, ->(name) { where(policy_name: name) }
8
+ scope :since, ->(time) { where("sampled_at >= ?", time) }
9
+ scope :recent, -> { order(sampled_at: :desc) }
10
+ end
11
+ end