dispatch_policy 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/MIT-LICENSE +16 -17
  3. data/README.md +449 -288
  4. data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
  5. data/app/controllers/dispatch_policy/application_controller.rb +45 -1
  6. data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
  7. data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
  8. data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
  9. data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
  10. data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
  11. data/app/models/dispatch_policy/inflight_job.rb +12 -0
  12. data/app/models/dispatch_policy/partition.rb +21 -0
  13. data/app/models/dispatch_policy/staged_job.rb +4 -97
  14. data/app/models/dispatch_policy/tick_sample.rb +11 -0
  15. data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
  16. data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
  17. data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
  18. data/app/views/dispatch_policy/policies/index.html.erb +15 -37
  19. data/app/views/dispatch_policy/policies/show.html.erb +140 -216
  20. data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
  21. data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
  22. data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
  23. data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
  24. data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
  25. data/config/routes.rb +18 -2
  26. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
  27. data/lib/dispatch_policy/bypass.rb +23 -0
  28. data/lib/dispatch_policy/config.rb +85 -0
  29. data/lib/dispatch_policy/context.rb +50 -0
  30. data/lib/dispatch_policy/cursor_pagination.rb +121 -0
  31. data/lib/dispatch_policy/decision.rb +22 -0
  32. data/lib/dispatch_policy/engine.rb +4 -27
  33. data/lib/dispatch_policy/forwarder.rb +63 -0
  34. data/lib/dispatch_policy/gate.rb +10 -38
  35. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
  36. data/lib/dispatch_policy/gates/concurrency.rb +45 -26
  37. data/lib/dispatch_policy/gates/throttle.rb +65 -37
  38. data/lib/dispatch_policy/inflight_tracker.rb +174 -0
  39. data/lib/dispatch_policy/job_extension.rb +155 -0
  40. data/lib/dispatch_policy/operator_hints.rb +126 -0
  41. data/lib/dispatch_policy/pipeline.rb +48 -0
  42. data/lib/dispatch_policy/policy.rb +62 -47
  43. data/lib/dispatch_policy/policy_dsl.rb +120 -0
  44. data/lib/dispatch_policy/railtie.rb +35 -0
  45. data/lib/dispatch_policy/registry.rb +46 -0
  46. data/lib/dispatch_policy/repository.rb +723 -0
  47. data/lib/dispatch_policy/serializer.rb +36 -0
  48. data/lib/dispatch_policy/tick.rb +263 -172
  49. data/lib/dispatch_policy/tick_loop.rb +59 -26
  50. data/lib/dispatch_policy/version.rb +1 -1
  51. data/lib/dispatch_policy.rb +71 -46
  52. data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
  53. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
  54. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
  55. data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
  56. metadata +101 -43
  57. data/CHANGELOG.md +0 -12
  58. data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
  59. data/app/models/dispatch_policy/partition_observation.rb +0 -49
  60. data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
  61. data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
  62. data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
  63. data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
  64. data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
  65. data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
  66. data/lib/dispatch_policy/dispatch_context.rb +0 -53
  67. data/lib/dispatch_policy/dispatchable.rb +0 -120
  68. data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
  69. data/lib/dispatch_policy/gates/global_cap.rb +0 -26
  70. data/lib/dispatch_policy/install_generator.rb +0 -23
@@ -2,268 +2,121 @@
2
2
 
3
3
  module DispatchPolicy
4
4
  class PoliciesController < ApplicationController
5
- STALE_PENDING_THRESHOLD = 1.hour
6
- PARTITION_LIST_PAGE_SIZE = 25
5
+ before_action :find_policy, only: %i[show pause resume drain]
7
6
 
8
- before_action :load_policy, only: :show
7
+ DRAIN_MAX_PER_REQUEST = 10_000
9
8
 
10
9
  def index
11
- @policies = DispatchPolicy.registry.map do |name, job_class|
12
- scope = StagedJob.where(policy_name: name)
13
- pending = scope.pending
10
+ registry_names = DispatchPolicy.registry.names
11
+ db_names = Partition.distinct.pluck(:policy_name)
12
+ names = (registry_names + db_names).uniq.sort
13
+
14
+ in_flight_by_policy = InflightJob.where(policy_name: names).group(:policy_name).count
15
+
16
+ @rows = names.map do |name|
17
+ partitions = Partition.for_policy(name)
14
18
  {
15
- name: name,
16
- job_class: job_class,
17
- policy: job_class.resolved_dispatch_policy,
18
- pending_count: pending.count,
19
- admitted_count: scope.admitted.count,
20
- completed_24h: scope.completed.where(completed_at: 24.hours.ago..).count,
21
- oldest_pending: pending.minimum(:staged_at),
22
- stale_threshold: STALE_PENDING_THRESHOLD
19
+ name: name,
20
+ registered: registry_names.include?(name),
21
+ pending: partitions.sum(:pending_count),
22
+ in_flight: in_flight_by_policy[name] || 0,
23
+ partitions: partitions.count,
24
+ paused_count: partitions.paused.count
23
25
  }
24
- end.sort_by { |p| -p[:pending_count] }
25
-
26
- @active_partitions = PartitionInflightCount.where("in_flight > 0").count
27
- @expired_leases = StagedJob.expired_leases.count
26
+ end
28
27
  end
29
28
 
30
29
  def show
31
- scope = StagedJob.where(policy_name: @policy_name)
32
- @pending_count = scope.pending.count
33
- @pending_eligible_count = scope.pending.where("not_before_at IS NULL OR not_before_at <= ?", Time.current).count
34
- @pending_scheduled_count = @pending_count - @pending_eligible_count
35
- @admitted_count = scope.admitted.count
36
- @completed_24h = scope.completed.where(completed_at: 24.hours.ago..).count
37
-
38
- all_breakdown = partition_breakdown(scope)
39
-
40
- # "Watched" subset (passed via ?watch=a,b,c; the JS layer syncs it
41
- # with localStorage so the choice sticks across reloads).
42
- @watched_keys = (params[:watch] || "").split(",").map(&:strip).reject(&:empty?)
43
- @partition_breakdown = @watched_keys.any? ? all_breakdown.select { |r| @watched_keys.include?(r[:partition]) } : []
44
-
45
- # Browsable list of every active partition with filter + sort + pagination.
46
- @partition_search = params[:q].to_s.strip
47
- @partition_page = [ params[:page].to_i, 1 ].max
48
- @partition_sort = %w[source partition pending in_flight completed_24h last_enqueued_at last_dispatched_at].include?(params[:sort]) ? params[:sort] : "activity"
49
- @partition_dir = params[:dir] == "asc" ? "asc" : "desc"
50
-
51
- list = all_breakdown
52
- list = list.select { |r| r[:partition].to_s.downcase.include?(@partition_search.downcase) } if @partition_search.present?
53
- list = sort_partition_list(list, @partition_sort, @partition_dir)
54
-
55
- @partition_total_list = list.size
56
- offset = (@partition_page - 1) * PARTITION_LIST_PAGE_SIZE
57
- @partition_list = list[offset, PARTITION_LIST_PAGE_SIZE] || []
58
-
59
- load_adaptive_chart_data
60
- @throttle_buckets = ThrottleBucket
61
- .where(policy_name: @policy_name).order(:gate_name, :partition_key).limit(50)
62
- @pending_jobs = scope.pending.order(:priority, :staged_at).limit(50)
63
- end
64
-
65
- private
66
-
67
- def load_policy
68
- @policy_name = params[:policy_name]
69
- @job_class = DispatchPolicy.registry[@policy_name] ||
70
- Tick.autoload_job_for(@policy_name)
71
- raise ActiveRecord::RecordNotFound unless @job_class
72
- @policy = @job_class.resolved_dispatch_policy
73
- end
74
-
75
- # Per-(source, partition) breakdown of pending-eligible / pending-scheduled
76
- # / in-flight / completed-24h. A "source" is either a gate with a
77
- # partition_by (uses gate.partition_key_for(context)) or the policy's
78
- # round_robin_by declaration (uses the round_robin_key column directly).
79
- # All four counts come from StagedJob groupings; PartitionInflightCount
80
- # is an admission-time optimization, not the user-facing truth.
81
- def partition_breakdown(scope)
82
- sources = partition_sources
83
- return [] if sources.empty?
84
-
85
- now = Time.current
86
- now_iso = now.iso8601
87
- since_24h = 24.hours.ago.iso8601
88
-
89
- adaptive_stats = AdaptiveConcurrencyStats.where(policy_name: @policy_name)
90
- .pluck(:gate_name, :partition_key, :current_max, :ewma_latency_ms)
91
- .each_with_object({}) { |(g, k, c, l), h|
92
- h[[ g, k ]] = { current_max: c, ewma_latency_ms: l.to_f.round(1) }
93
- }
94
-
95
- rows = Hash.new { |h, k|
96
- h[k] = {
97
- source: k[0],
98
- partition: k[1],
99
- eligible: 0,
100
- scheduled: 0,
101
- in_flight: 0,
102
- completed_24h: 0,
103
- last_enqueued_at: nil,
104
- last_dispatched_at: nil,
105
- current_max: nil,
106
- ewma_latency_ms: nil
107
- }
30
+ @policy_object = DispatchPolicy.registry.fetch(@policy_name)
31
+ @partitions = Partition.for_policy(@policy_name)
32
+ .order(Arel.sql("pending_count DESC, last_admit_at DESC NULLS LAST"))
33
+ .limit(100)
34
+ @top_admitted = Partition.for_policy(@policy_name)
35
+ .order(total_admitted: :desc)
36
+ .limit(20)
37
+
38
+ @totals = {
39
+ pending: Partition.for_policy(@policy_name).sum(:pending_count),
40
+ in_flight: InflightJob.where(policy_name: @policy_name).count,
41
+ partitions: Partition.for_policy(@policy_name).count
108
42
  }
109
43
 
110
- # Activity timestamps bounded to the last 24h so the scan stays on
111
- # an index-friendly slice of staged_jobs.
112
- activity_rows = scope
113
- .where("staged_at > ?", since_24h)
114
- .group(:context, :round_robin_key)
115
- .pluck(
116
- :context,
117
- :round_robin_key,
118
- Arel.sql("MAX(staged_at)"),
119
- Arel.sql("MAX(admitted_at)")
120
- )
121
-
122
- sources.each do |name, extract|
123
- pending_counts = scope.pending.group(:context, :round_robin_key).pluck(
124
- :context,
125
- :round_robin_key,
126
- Arel.sql("count(*) filter (where not_before_at is null or not_before_at <= '#{now_iso}')"),
127
- Arel.sql("count(*) filter (where not_before_at > '#{now_iso}')")
128
- )
129
- pending_counts.each do |ctx, rr_key, eligible, scheduled|
130
- partition = extract.call(ctx, rr_key)
131
- row = rows[[ name, partition ]]
132
- row[:eligible] += eligible
133
- row[:scheduled] += scheduled
134
- end
135
-
136
- admitted_counts = scope.admitted.group(:context, :round_robin_key).pluck(
137
- :context, :round_robin_key, Arel.sql("count(*)")
138
- )
139
- admitted_counts.each do |ctx, rr_key, in_flight|
140
- partition = extract.call(ctx, rr_key)
141
- rows[[ name, partition ]][:in_flight] += in_flight
142
- end
143
-
144
- completed_counts = scope.completed.where("completed_at > ?", since_24h)
145
- .group(:context, :round_robin_key).pluck(
146
- :context, :round_robin_key, Arel.sql("count(*)")
147
- )
148
- completed_counts.each do |ctx, rr_key, completed|
149
- partition = extract.call(ctx, rr_key)
150
- rows[[ name, partition ]][:completed_24h] += completed
151
- end
152
-
153
- activity_rows.each do |ctx, rr_key, last_staged, last_admitted|
154
- partition = extract.call(ctx, rr_key)
155
- row = rows[[ name, partition ]]
156
- row[:last_enqueued_at] = [ row[:last_enqueued_at], last_staged ].compact.max
157
- row[:last_dispatched_at] = [ row[:last_dispatched_at], last_admitted ].compact.max
158
- end
159
- end
160
-
161
- rows.each do |(source, partition), row|
162
- stats = adaptive_stats[[ source, partition ]]
163
- next unless stats
164
- row[:current_max] = stats[:current_max]
165
- row[:ewma_latency_ms] = stats[:ewma_latency_ms]
166
- end
167
-
168
- # Two different sources (say round_robin_by account_id + a gate
169
- # partitioned by account_id) producing the same partition key yield
170
- # identical counts — collapse them into one row with a merged source
171
- # label instead of listing the same numbers twice.
172
- merged = rows.values
173
- .reject { |r| r[:partition].nil? || r[:partition].empty? }
174
- .group_by { |r| [ r[:partition], r[:eligible], r[:scheduled], r[:in_flight], r[:completed_24h] ] }
175
- .map { |_, group|
176
- base = group.first.dup
177
- base[:source] = group.map { |r| r[:source] }.uniq.sort.join(" + ")
178
- group.each do |r|
179
- base[:current_max] ||= r[:current_max]
180
- base[:ewma_latency_ms] ||= r[:ewma_latency_ms]
181
- base[:last_enqueued_at] = [ base[:last_enqueued_at], r[:last_enqueued_at] ].compact.max
182
- base[:last_dispatched_at] = [ base[:last_dispatched_at], r[:last_dispatched_at] ].compact.max
183
- end
184
- base
185
- }
186
-
187
- merged.sort_by { |r|
188
- [ -(r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h]), r[:source], r[:partition] ]
44
+ now = Time.current
45
+ @windows = {
46
+ "1m" => Repository.tick_summary(policy_name: @policy_name, since: now - 60),
47
+ "5m" => Repository.tick_summary(policy_name: @policy_name, since: now - 5 * 60),
48
+ "15m" => Repository.tick_summary(policy_name: @policy_name, since: now - 15 * 60)
49
+ }
50
+ @denied_reasons = Repository.denied_reasons_summary(policy_name: @policy_name, since: now - 15 * 60)
51
+ @round_trip = Repository.partition_round_trip_stats(policy_name: @policy_name)
52
+ @sparkline = Repository.tick_samples_buckets(policy_name: @policy_name, since: now - 30 * 60, bucket_seconds: 60)
53
+ @pending_trend = Repository.trend_direction(@sparkline.map { |b| b[:pending_total] })
54
+
55
+ cfg = DispatchPolicy.config
56
+ @capacity = {
57
+ admitted_per_minute: @windows["1m"][:jobs_admitted],
58
+ adapter_target_jps: cfg.adapter_throughput_target,
59
+ avg_tick_ms: @windows["1m"][:avg_duration_ms],
60
+ max_tick_ms: @windows["1m"][:max_duration_ms],
61
+ tick_max_duration_ms: cfg.tick_max_duration.to_i * 1000
189
62
  }
190
- end
191
63
 
192
- def sort_partition_list(list, sort, dir)
193
- # Put nulls at the bottom regardless of direction (Time#to_f on nil
194
- # would crash; -Float::INFINITY sorts first, +Float::INFINITY last).
195
- key =
196
- case sort
197
- when "source" then ->(r) { [ r[:source], r[:partition] ] }
198
- when "partition" then ->(r) { r[:partition] }
199
- when "pending" then ->(r) { r[:eligible] + r[:scheduled] }
200
- when "in_flight" then ->(r) { r[:in_flight] }
201
- when "completed_24h" then ->(r) { r[:completed_24h] }
202
- when "last_enqueued_at" then ->(r) { r[:last_enqueued_at]&.to_f || 0 }
203
- when "last_dispatched_at" then ->(r) { r[:last_dispatched_at]&.to_f || 0 }
204
- else ->(r) { r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h] }
205
- end
206
- sorted = list.sort_by(&key)
207
- dir == "asc" ? sorted : sorted.reverse
64
+ @hints = OperatorHints.for(
65
+ tick_max_duration_ms: @capacity[:tick_max_duration_ms],
66
+ avg_tick_ms: @capacity[:avg_tick_ms],
67
+ max_tick_ms: @capacity[:max_tick_ms],
68
+ pending_total: @totals[:pending],
69
+ admitted_per_minute: @capacity[:admitted_per_minute],
70
+ forward_failures: @windows["1m"][:forward_failures],
71
+ jobs_admitted: @windows["1m"][:jobs_admitted],
72
+ active_partitions: @round_trip[:active_partitions],
73
+ never_checked: @round_trip[:never_checked],
74
+ in_backoff: @round_trip[:in_backoff],
75
+ total_partitions: @totals[:partitions],
76
+ adapter_target_jps: @capacity[:adapter_target_jps],
77
+ pending_trend: @pending_trend
78
+ )
208
79
  end
209
80
 
210
- # Returns [[source_name, ->(ctx, rr_key) { partition_key }], ...]
211
- # covering every partition-producing declaration on the policy: every
212
- # gate with a partition_by, plus round_robin_by if declared.
213
- def partition_sources
214
- return [] unless @policy
81
+ def pause
82
+ Partition.for_policy(@policy_name).update_all(status: "paused", updated_at: Time.current)
83
+ redirect_to policy_path(@policy_name), notice: "Policy paused."
84
+ end
215
85
 
216
- sources = @policy.gates.select(&:partition_by).map do |gate|
217
- [ gate.name.to_s, ->(ctx, _rr) { gate.partition_key_for((ctx || {}).symbolize_keys) } ]
218
- end
219
- sources << [ "round_robin_by", ->(_ctx, rr) { rr } ] if @policy.round_robin?
220
- sources
86
+ def resume
87
+ Partition.for_policy(@policy_name).update_all(status: "active", updated_at: Time.current)
88
+ redirect_to policy_path(@policy_name), notice: "Policy resumed."
221
89
  end
222
90
 
223
- # Build chart data from PartitionObservation. Two queries:
224
- # - Global aggregated (one row per minute): cheap even with 1000s of
225
- # partitions because we SUM/AVG in SQL, not in Ruby.
226
- # - Per-partition sparkline data, scoped to only the partitions we're
227
- # going to actually render (breakdown's top N).
228
- def load_adaptive_chart_data
229
- last_minute = Time.current.utc.beginning_of_minute
230
- @chart_slots = (0..59).map { |i| last_minute - (59 - i).minutes }
231
- @chart_labels = @chart_slots.map { |t| t.strftime("%H:%M") }
232
- slot_index = @chart_slots.each_with_index.to_h
91
+ # Force-admits every staged job across every partition of the policy,
92
+ # bypassing all gates. Walks partitions in pending-DESC order so the
93
+ # busiest ones drain first. Bounded at DRAIN_MAX_PER_REQUEST per click.
94
+ def drain
95
+ drained = 0
96
+ Partition.for_policy(@policy_name)
97
+ .where("pending_count > 0")
98
+ .order(pending_count: :desc, id: :asc)
99
+ .limit(500)
100
+ .each do |partition|
101
+ break if drained >= DRAIN_MAX_PER_REQUEST
102
+
103
+ batch, _ = PartitionsController.drain_partition!(partition)
104
+ drained += batch
105
+ end
233
106
 
234
- @adaptive_global = Array.new(@chart_slots.size)
235
- @completions_global = Array.new(@chart_slots.size, 0)
236
- global_rows = PartitionObservation
237
- .where(policy_name: @policy_name)
238
- .where("minute_bucket >= ?", @chart_slots.first)
239
- .group(:minute_bucket)
240
- .pluck(:minute_bucket, Arel.sql("SUM(total_lag_ms)"), Arel.sql("SUM(observation_count)"))
241
- global_rows.each do |bucket, total_lag, obs_count|
242
- idx = slot_index[bucket.utc.beginning_of_minute]
243
- next unless idx
244
- @completions_global[idx] = obs_count
245
- @adaptive_global[idx] = obs_count.positive? ? (total_lag.to_f / obs_count).round(1) : nil
107
+ remaining = Partition.for_policy(@policy_name).sum(:pending_count)
108
+ notice = if remaining.positive?
109
+ "Drained #{drained} job(s) across this policy; #{remaining} still pending — click drain again to continue."
110
+ else
111
+ "Drained #{drained} job(s); policy fully drained."
246
112
  end
113
+ redirect_to policy_path(@policy_name), notice: notice
114
+ end
247
115
 
248
- partition_keys = (@partition_breakdown || []).map { |r| r[:partition] }.uniq
249
- @adaptive_samples = {}
250
- @completions_samples = {}
251
- return if partition_keys.empty?
116
+ private
252
117
 
253
- per_partition_lag = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size) }
254
- per_partition_counts = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size, 0) }
255
- rows = PartitionObservation
256
- .where(policy_name: @policy_name, partition_key: partition_keys)
257
- .where("minute_bucket >= ?", @chart_slots.first)
258
- .pluck(:partition_key, :minute_bucket, :total_lag_ms, :observation_count)
259
- rows.each do |pk, bucket, total, count|
260
- idx = slot_index[bucket.utc.beginning_of_minute]
261
- next unless idx
262
- per_partition_lag[pk][idx] = count.positive? ? (total.to_f / count).round(1) : nil
263
- per_partition_counts[pk][idx] = count
264
- end
265
- @adaptive_samples = per_partition_lag
266
- @completions_samples = per_partition_counts
118
+ def find_policy
119
+ @policy_name = params[:name]
267
120
  end
268
121
  end
269
122
  end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class StagedJobsController < ApplicationController
5
+ def show
6
+ @job = StagedJob.find(params[:id])
7
+ end
8
+ end
9
+ end
@@ -1,89 +1,19 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module DispatchPolicy
4
+ # One row per (policy_name, partition_key) for partitions whose policy
5
+ # declares an `:adaptive_concurrency` gate. Holds the AIMD-tuned
6
+ # `current_max` plus the EWMA of recent queue-lag observations the cap
7
+ # adapts on.
8
+ #
9
+ # Read by `Gates::AdaptiveConcurrency#evaluate` to learn how many jobs
10
+ # this partition may admit right now. Written atomically by
11
+ # `Repository.adaptive_record!` from `InflightTracker.track`'s ensure
12
+ # block after each perform — the EWMA + AIMD update lives in a single
13
+ # SQL statement so concurrent workers can't race on read-modify-write.
4
14
  class AdaptiveConcurrencyStats < ApplicationRecord
5
15
  self.table_name = "dispatch_policy_adaptive_concurrency_stats"
6
16
 
7
- # Seed a stats row if one doesn't exist yet. Mirrors ThrottleBucket.lock.
8
- def self.seed!(policy_name:, gate_name:, partition_key:, initial_max:)
9
- now = Time.current
10
- sql = <<~SQL.squish
11
- INSERT INTO #{quoted_table_name}
12
- (policy_name, gate_name, partition_key, current_max,
13
- ewma_latency_ms, sample_count, created_at, updated_at)
14
- VALUES (?, ?, ?, ?, 0, 0, ?, ?)
15
- ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
16
- SQL
17
- connection.exec_update(
18
- sanitize_sql_array([
19
- sql, policy_name, gate_name.to_s, partition_key.to_s,
20
- initial_max.to_i, now, now
21
- ])
22
- )
23
- end
24
-
25
- def self.fetch_many(policy_name:, gate_name:, partition_keys:)
26
- return {} if partition_keys.empty?
27
- where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
28
- .pluck(:partition_key, :current_max, :ewma_latency_ms)
29
- .each_with_object({}) { |(k, c, l), h| h[k] = { current_max: c, ewma_latency_ms: l } }
30
- end
31
-
32
- # Single-statement EWMA + AIMD update so concurrent performs can't race
33
- # on read-modify-write. Seed first (INSERT ON CONFLICT DO NOTHING), then
34
- # apply the adjustment.
35
- def self.record_observation!(
36
- policy_name:, gate_name:, partition_key:,
37
- queue_lag_ms:, succeeded:,
38
- alpha:, min:, target_lag_ms:,
39
- fail_factor:, slow_factor:, initial_max:
40
- )
41
- seed!(
42
- policy_name: policy_name,
43
- gate_name: gate_name,
44
- partition_key: partition_key,
45
- initial_max: initial_max
46
- )
47
-
48
- # Feedback signal is queue_lag (admitted_at → perform_start). When
49
- # the adapter queue is empty, lag ≈ 0 → +1 grow. When the queue
50
- # backs up, lag rises past target → multiplicative shrink. Failures
51
- # shrink harder. Only `min` is enforced so a partition can't lock
52
- # out entirely.
53
- sql = <<~SQL.squish
54
- UPDATE #{quoted_table_name}
55
- SET
56
- ewma_latency_ms = ewma_latency_ms * (1 - ?) + ? * ?,
57
- sample_count = sample_count + 1,
58
- current_max = GREATEST(?, CASE
59
- WHEN ? = FALSE THEN FLOOR(current_max * ?)::int
60
- WHEN (ewma_latency_ms * (1 - ?) + ? * ?) > ? THEN FLOOR(current_max * ?)::int
61
- ELSE current_max + 1
62
- END),
63
- last_observed_at = ?,
64
- updated_at = ?
65
- WHERE policy_name = ? AND gate_name = ? AND partition_key = ?
66
- SQL
67
-
68
- now = Time.current
69
- connection.exec_update(
70
- sanitize_sql_array([
71
- sql,
72
- alpha, alpha, queue_lag_ms,
73
- min.to_i,
74
- succeeded, fail_factor,
75
- alpha, alpha, queue_lag_ms, target_lag_ms, slow_factor,
76
- now, now,
77
- policy_name, gate_name.to_s, partition_key.to_s
78
- ])
79
- )
80
- end
81
-
82
- # Quick lookup used by Dispatchable to denormalize current_max into
83
- # the generic partition observation row.
84
- def self.current_max_for(policy_name:, partition_key:)
85
- where(policy_name: policy_name, partition_key: partition_key.to_s)
86
- .limit(1).pick(:current_max)
87
- end
17
+ scope :for_policy, ->(name) { where(policy_name: name) }
88
18
  end
89
19
  end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class InflightJob < ApplicationRecord
5
+ self.table_name = "dispatch_policy_inflight_jobs"
6
+
7
+ scope :for_partition, ->(policy_name, partition_key) {
8
+ where(policy_name: policy_name, partition_key: partition_key)
9
+ }
10
+ scope :stale, ->(cutoff) { where("heartbeat_at < ?", cutoff) }
11
+ end
12
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class Partition < ApplicationRecord
5
+ self.table_name = "dispatch_policy_partitions"
6
+
7
+ scope :for_policy, ->(name) { where(policy_name: name) }
8
+ scope :for_shard, ->(s) { s ? where(shard: s) : all }
9
+ scope :active, -> { where(status: "active") }
10
+ scope :paused, -> { where(status: "paused") }
11
+ scope :pending, -> { where("pending_count > 0") }
12
+ scope :stale_inactive, ->(cutoff) {
13
+ where("pending_count = 0 AND in_flight_count = 0")
14
+ .where("last_admit_at < ? OR (last_admit_at IS NULL AND created_at < ?)", cutoff, cutoff)
15
+ }
16
+
17
+ def paused?
18
+ status == "paused"
19
+ end
20
+ end
21
+ end
@@ -4,102 +4,9 @@ module DispatchPolicy
4
4
  class StagedJob < ApplicationRecord
5
5
  self.table_name = "dispatch_policy_staged_jobs"
6
6
 
7
- scope :pending, -> { where(admitted_at: nil, completed_at: nil) }
8
- scope :admitted, -> { where.not(admitted_at: nil).where(completed_at: nil) }
9
- scope :completed, -> { where.not(completed_at: nil) }
10
- scope :active, -> { where(completed_at: nil) }
11
- scope :expired_leases, -> {
12
- admitted.where("lease_expires_at IS NOT NULL AND lease_expires_at < ?", Time.current)
13
- }
14
-
15
- # Merge the job's ActiveJob metadata (queue_name, priority) into the
16
- # context hash so gate lambdas can partition_by :queue_name without
17
- # the user having to pass it as a kwarg. User-provided keys win.
18
- def self.context_for(job_instance, policy)
19
- built = policy.context_builder.call(job_instance.arguments)
20
- return built unless built.is_a?(Hash)
21
- {
22
- queue_name: job_instance.queue_name,
23
- priority: job_instance.priority
24
- }.merge(built.symbolize_keys)
25
- end
26
-
27
- # Stages a job in the admission queue. Returns the created row, or nil if
28
- # the policy declares a dedupe_key and an active row already exists.
29
- def self.stage!(job_instance:, policy:)
30
- dedupe_key = policy.build_dedupe_key(job_instance.arguments)
31
-
32
- if dedupe_key && exists?(policy_name: policy.name, dedupe_key: dedupe_key, completed_at: nil)
33
- return nil
34
- end
35
-
36
- create!(
37
- job_class: job_instance.class.name,
38
- policy_name: policy.name,
39
- arguments: job_instance.serialize,
40
- snapshot: policy.build_snapshot(job_instance.arguments),
41
- context: context_for(job_instance, policy),
42
- priority: job_instance.priority || 100,
43
- not_before_at: job_instance.scheduled_at,
44
- staged_at: Time.current,
45
- dedupe_key: dedupe_key,
46
- round_robin_key: policy.build_round_robin_key(job_instance.arguments)
47
- )
48
- rescue ActiveRecord::RecordNotUnique
49
- nil
50
- end
51
-
52
- # Batch-insert variant of stage!.
53
- def self.stage_many!(policy:, jobs:)
54
- return 0 if jobs.empty?
55
-
56
- now = Time.current
57
- rows = jobs.map do |job_instance|
58
- {
59
- job_class: job_instance.class.name,
60
- policy_name: policy.name,
61
- arguments: job_instance.serialize,
62
- snapshot: policy.build_snapshot(job_instance.arguments),
63
- context: context_for(job_instance, policy),
64
- priority: job_instance.priority || 100,
65
- not_before_at: job_instance.scheduled_at,
66
- staged_at: now,
67
- dedupe_key: policy.build_dedupe_key(job_instance.arguments),
68
- round_robin_key: policy.build_round_robin_key(job_instance.arguments),
69
- partitions: {},
70
- created_at: now,
71
- updated_at: now
72
- }
73
- end
74
-
75
- result = insert_all(rows, unique_by: :idx_dp_staged_dedupe_active)
76
- result.rows.size
77
- end
78
-
79
- def self.mark_completed_by_active_job_id(active_job_id)
80
- return 0 if active_job_id.blank?
81
- where(active_job_id: active_job_id, completed_at: nil)
82
- .update_all(completed_at: Time.current, lease_expires_at: nil)
83
- end
84
-
85
- def mark_admitted!(partitions:)
86
- now = Time.current
87
- job = instantiate_active_job
88
- job._dispatch_partitions = partitions
89
- job._dispatch_admitted_at = now
90
-
91
- update!(
92
- admitted_at: now,
93
- lease_expires_at: now + DispatchPolicy.config.lease_duration,
94
- active_job_id: job.job_id,
95
- partitions: partitions
96
- )
97
-
98
- job
99
- end
100
-
101
- def instantiate_active_job
102
- ActiveJob::Base.deserialize(arguments)
103
- end
7
+ scope :for_policy, ->(name) { where(policy_name: name) }
8
+ scope :for_partition, ->(name, key) { where(policy_name: name, partition_key: key) }
9
+ scope :due, -> { where("scheduled_at IS NULL OR scheduled_at <= now()") }
10
+ scope :recent, -> { order(enqueued_at: :desc) }
104
11
  end
105
12
  end
@@ -0,0 +1,11 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DispatchPolicy
4
+ class TickSample < ApplicationRecord
5
+ self.table_name = "dispatch_policy_tick_samples"
6
+
7
+ scope :for_policy, ->(name) { where(policy_name: name) }
8
+ scope :since, ->(time) { where("sampled_at >= ?", time) }
9
+ scope :recent, -> { order(sampled_at: :desc) }
10
+ end
11
+ end