dispatch_policy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +12 -0
  3. data/MIT-LICENSE +21 -0
  4. data/README.md +435 -0
  5. data/app/controllers/dispatch_policy/application_controller.rb +9 -0
  6. data/app/controllers/dispatch_policy/policies_controller.rb +269 -0
  7. data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +89 -0
  8. data/app/models/dispatch_policy/application_record.rb +7 -0
  9. data/app/models/dispatch_policy/partition_inflight_count.rb +42 -0
  10. data/app/models/dispatch_policy/partition_observation.rb +49 -0
  11. data/app/models/dispatch_policy/staged_job.rb +105 -0
  12. data/app/models/dispatch_policy/throttle_bucket.rb +41 -0
  13. data/app/views/dispatch_policy/policies/index.html.erb +52 -0
  14. data/app/views/dispatch_policy/policies/show.html.erb +241 -0
  15. data/app/views/layouts/dispatch_policy/application.html.erb +266 -0
  16. data/config/routes.rb +6 -0
  17. data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +80 -0
  18. data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +22 -0
  19. data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +25 -0
  20. data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +32 -0
  21. data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +32 -0
  22. data/lib/dispatch_policy/dispatch_context.rb +53 -0
  23. data/lib/dispatch_policy/dispatchable.rb +120 -0
  24. data/lib/dispatch_policy/engine.rb +36 -0
  25. data/lib/dispatch_policy/gate.rb +49 -0
  26. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +123 -0
  27. data/lib/dispatch_policy/gates/concurrency.rb +43 -0
  28. data/lib/dispatch_policy/gates/fair_interleave.rb +32 -0
  29. data/lib/dispatch_policy/gates/global_cap.rb +26 -0
  30. data/lib/dispatch_policy/gates/throttle.rb +52 -0
  31. data/lib/dispatch_policy/install_generator.rb +23 -0
  32. data/lib/dispatch_policy/policy.rb +73 -0
  33. data/lib/dispatch_policy/tick.rb +214 -0
  34. data/lib/dispatch_policy/tick_loop.rb +45 -0
  35. data/lib/dispatch_policy/version.rb +5 -0
  36. data/lib/dispatch_policy.rb +64 -0
  37. metadata +182 -0
# frozen_string_literal: true

module DispatchPolicy
  # Admin UI for admission-control policies: an index of every registered
  # policy with headline counts, and a per-policy drill-down showing
  # partition-level pending / in-flight / completion activity plus chart
  # data built from PartitionObservation.
  class PoliciesController < ApplicationController
    # Pending jobs older than this are rendered as "stale" on the index.
    STALE_PENDING_THRESHOLD = 1.hour
    PARTITION_LIST_PAGE_SIZE = 25

    before_action :load_policy, only: :show

    # Overview of all registered policies, sorted by pending backlog
    # (largest first), plus global partition/lease counters.
    def index
      @policies = DispatchPolicy.registry.map do |name, job_class|
        scope = StagedJob.where(policy_name: name)
        pending = scope.pending
        {
          name: name,
          job_class: job_class,
          policy: job_class.resolved_dispatch_policy,
          pending_count: pending.count,
          admitted_count: scope.admitted.count,
          completed_24h: scope.completed.where(completed_at: 24.hours.ago..).count,
          oldest_pending: pending.minimum(:staged_at),
          stale_threshold: STALE_PENDING_THRESHOLD
        }
      end.sort_by { |p| -p[:pending_count] }

      @active_partitions = PartitionInflightCount.where("in_flight > 0").count
      @expired_leases = StagedJob.expired_leases.count
    end

    # Single-policy dashboard: headline counts, watched-partition breakdown,
    # a filterable/sortable/paginated partition list, chart data, throttle
    # buckets and a sample of pending jobs.
    def show
      scope = StagedJob.where(policy_name: @policy_name)
      @pending_count = scope.pending.count
      @pending_eligible_count = scope.pending.where("not_before_at IS NULL OR not_before_at <= ?", Time.current).count
      @pending_scheduled_count = @pending_count - @pending_eligible_count
      @admitted_count = scope.admitted.count
      @completed_24h = scope.completed.where(completed_at: 24.hours.ago..).count

      all_breakdown = partition_breakdown(scope)

      # "Watched" subset (passed via ?watch=a,b,c; the JS layer syncs it
      # with localStorage so the choice sticks across reloads).
      @watched_keys = (params[:watch] || "").split(",").map(&:strip).reject(&:empty?)
      @partition_breakdown = @watched_keys.any? ? all_breakdown.select { |r| @watched_keys.include?(r[:partition]) } : []

      # Browsable list of every active partition with filter + sort + pagination.
      @partition_search = params[:q].to_s.strip
      @partition_page = [ params[:page].to_i, 1 ].max
      # Whitelist sort/dir params so user input never reaches SQL or
      # arbitrary method dispatch.
      @partition_sort = %w[source partition pending in_flight completed_24h last_enqueued_at last_dispatched_at].include?(params[:sort]) ? params[:sort] : "activity"
      @partition_dir = params[:dir] == "asc" ? "asc" : "desc"

      list = all_breakdown
      list = list.select { |r| r[:partition].to_s.downcase.include?(@partition_search.downcase) } if @partition_search.present?
      list = sort_partition_list(list, @partition_sort, @partition_dir)

      @partition_total_list = list.size
      offset = (@partition_page - 1) * PARTITION_LIST_PAGE_SIZE
      @partition_list = list[offset, PARTITION_LIST_PAGE_SIZE] || []

      load_adaptive_chart_data
      @throttle_buckets = ThrottleBucket
        .where(policy_name: @policy_name).order(:gate_name, :partition_key).limit(50)
      @pending_jobs = scope.pending.order(:priority, :staged_at).limit(50)
    end

    private

    # Resolves params[:policy_name] to a job class via the registry (or
    # Tick's autoloader fallback) and memoizes the policy.
    # Raises ActiveRecord::RecordNotFound for unknown policy names so the
    # engine renders a 404.
    def load_policy
      @policy_name = params[:policy_name]
      @job_class = DispatchPolicy.registry[@policy_name] ||
        Tick.autoload_job_for(@policy_name)
      raise ActiveRecord::RecordNotFound unless @job_class
      @policy = @job_class.resolved_dispatch_policy
    end

    # Per-(source, partition) breakdown of pending-eligible / pending-scheduled
    # / in-flight / completed-24h. A "source" is either a gate with a
    # partition_by (uses gate.partition_key_for(context)) or the policy's
    # round_robin_by declaration (uses the round_robin_key column directly).
    # All four counts come from StagedJob groupings; PartitionInflightCount
    # is an admission-time optimization, not the user-facing truth.
    def partition_breakdown(scope)
      sources = partition_sources
      return [] if sources.empty?

      now = Time.current
      now_iso = now.iso8601
      since_24h = 24.hours.ago.iso8601

      adaptive_stats = AdaptiveConcurrencyStats.where(policy_name: @policy_name)
        .pluck(:gate_name, :partition_key, :current_max, :ewma_latency_ms)
        .each_with_object({}) { |(g, k, c, l), h|
          h[[ g, k ]] = { current_max: c, ewma_latency_ms: l.to_f.round(1) }
        }

      rows = Hash.new { |h, k|
        h[k] = {
          source: k[0],
          partition: k[1],
          eligible: 0,
          scheduled: 0,
          in_flight: 0,
          completed_24h: 0,
          last_enqueued_at: nil,
          last_dispatched_at: nil,
          current_max: nil,
          ewma_latency_ms: nil
        }
      }

      # Activity timestamps bounded to the last 24h so the scan stays on
      # an index-friendly slice of staged_jobs.
      activity_rows = scope
        .where("staged_at > ?", since_24h)
        .group(:context, :round_robin_key)
        .pluck(
          :context,
          :round_robin_key,
          Arel.sql("MAX(staged_at)"),
          Arel.sql("MAX(admitted_at)")
        )

      sources.each do |name, extract|
        # now_iso is server-generated (Time.current), never user input, so
        # interpolating it into the FILTER clause is injection-safe.
        pending_counts = scope.pending.group(:context, :round_robin_key).pluck(
          :context,
          :round_robin_key,
          Arel.sql("count(*) filter (where not_before_at is null or not_before_at <= '#{now_iso}')"),
          Arel.sql("count(*) filter (where not_before_at > '#{now_iso}')")
        )
        pending_counts.each do |ctx, rr_key, eligible, scheduled|
          partition = extract.call(ctx, rr_key)
          row = rows[[ name, partition ]]
          row[:eligible] += eligible
          row[:scheduled] += scheduled
        end

        admitted_counts = scope.admitted.group(:context, :round_robin_key).pluck(
          :context, :round_robin_key, Arel.sql("count(*)")
        )
        admitted_counts.each do |ctx, rr_key, in_flight|
          partition = extract.call(ctx, rr_key)
          rows[[ name, partition ]][:in_flight] += in_flight
        end

        completed_counts = scope.completed.where("completed_at > ?", since_24h)
          .group(:context, :round_robin_key).pluck(
            :context, :round_robin_key, Arel.sql("count(*)")
          )
        completed_counts.each do |ctx, rr_key, completed|
          partition = extract.call(ctx, rr_key)
          rows[[ name, partition ]][:completed_24h] += completed
        end

        activity_rows.each do |ctx, rr_key, last_staged, last_admitted|
          partition = extract.call(ctx, rr_key)
          row = rows[[ name, partition ]]
          row[:last_enqueued_at] = [ row[:last_enqueued_at], last_staged ].compact.max
          row[:last_dispatched_at] = [ row[:last_dispatched_at], last_admitted ].compact.max
        end
      end

      # Adaptive stats are keyed by (gate_name, partition_key), which lines
      # up with gate-derived sources; round_robin_by rows simply find no match.
      rows.each do |(source, partition), row|
        stats = adaptive_stats[[ source, partition ]]
        next unless stats
        row[:current_max] = stats[:current_max]
        row[:ewma_latency_ms] = stats[:ewma_latency_ms]
      end

      # Two different sources (say round_robin_by account_id + a gate
      # partitioned by account_id) producing the same partition key yield
      # identical counts — collapse them into one row with a merged source
      # label instead of listing the same numbers twice.
      merged = rows.values
        .reject { |r| r[:partition].nil? || r[:partition].empty? }
        .group_by { |r| [ r[:partition], r[:eligible], r[:scheduled], r[:in_flight], r[:completed_24h] ] }
        .map { |_, group|
          base = group.first.dup
          base[:source] = group.map { |r| r[:source] }.uniq.sort.join(" + ")
          group.each do |r|
            base[:current_max] ||= r[:current_max]
            base[:ewma_latency_ms] ||= r[:ewma_latency_ms]
            base[:last_enqueued_at] = [ base[:last_enqueued_at], r[:last_enqueued_at] ].compact.max
            base[:last_dispatched_at] = [ base[:last_dispatched_at], r[:last_dispatched_at] ].compact.max
          end
          base
        }

      merged.sort_by { |r|
        [ -(r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h]), r[:source], r[:partition] ]
      }
    end

    # Sorts breakdown rows by the whitelisted +sort+ column in direction
    # +dir+. Timestamp columns can be nil (no activity yet); nil maps to a
    # directional sentinel so those rows land at the BOTTOM regardless of
    # direction: +INF sorts last ascending, while -INF sorts first in the
    # ascending pre-sort and therefore last after the reverse for "desc".
    # (Previously nil mapped to 0, which floated empty rows to the top of
    # ascending sorts.)
    def sort_partition_list(list, sort, dir)
      null_time = dir == "asc" ? Float::INFINITY : -Float::INFINITY
      key =
        case sort
        when "source" then ->(r) { [ r[:source], r[:partition] ] }
        when "partition" then ->(r) { r[:partition] }
        when "pending" then ->(r) { r[:eligible] + r[:scheduled] }
        when "in_flight" then ->(r) { r[:in_flight] }
        when "completed_24h" then ->(r) { r[:completed_24h] }
        when "last_enqueued_at" then ->(r) { r[:last_enqueued_at]&.to_f || null_time }
        when "last_dispatched_at" then ->(r) { r[:last_dispatched_at]&.to_f || null_time }
        else ->(r) { r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h] }
        end
      sorted = list.sort_by(&key)
      dir == "asc" ? sorted : sorted.reverse
    end

    # Returns [[source_name, ->(ctx, rr_key) { partition_key }], ...]
    # covering every partition-producing declaration on the policy: every
    # gate with a partition_by, plus round_robin_by if declared.
    def partition_sources
      return [] unless @policy

      sources = @policy.gates.select(&:partition_by).map do |gate|
        [ gate.name.to_s, ->(ctx, _rr) { gate.partition_key_for((ctx || {}).symbolize_keys) } ]
      end
      sources << [ "round_robin_by", ->(_ctx, rr) { rr } ] if @policy.round_robin?
      sources
    end

    # Build chart data from PartitionObservation. Two queries:
    # - Global aggregated (one row per minute): cheap even with 1000s of
    #   partitions because we SUM/AVG in SQL, not in Ruby.
    # - Per-partition sparkline data, scoped to only the partitions we're
    #   going to actually render (breakdown's top N).
    def load_adaptive_chart_data
      last_minute = Time.current.utc.beginning_of_minute
      @chart_slots = (0..59).map { |i| last_minute - (59 - i).minutes }
      @chart_labels = @chart_slots.map { |t| t.strftime("%H:%M") }
      slot_index = @chart_slots.each_with_index.to_h

      @adaptive_global = Array.new(@chart_slots.size)
      @completions_global = Array.new(@chart_slots.size, 0)
      global_rows = PartitionObservation
        .where(policy_name: @policy_name)
        .where("minute_bucket >= ?", @chart_slots.first)
        .group(:minute_bucket)
        .pluck(:minute_bucket, Arel.sql("SUM(total_lag_ms)"), Arel.sql("SUM(observation_count)"))
      global_rows.each do |bucket, total_lag, obs_count|
        idx = slot_index[bucket.utc.beginning_of_minute]
        next unless idx
        @completions_global[idx] = obs_count
        @adaptive_global[idx] = obs_count.positive? ? (total_lag.to_f / obs_count).round(1) : nil
      end

      partition_keys = (@partition_breakdown || []).map { |r| r[:partition] }.uniq
      @adaptive_samples = {}
      @completions_samples = {}
      return if partition_keys.empty?

      per_partition_lag = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size) }
      per_partition_counts = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size, 0) }
      rows = PartitionObservation
        .where(policy_name: @policy_name, partition_key: partition_keys)
        .where("minute_bucket >= ?", @chart_slots.first)
        .pluck(:partition_key, :minute_bucket, :total_lag_ms, :observation_count)
      rows.each do |pk, bucket, total, count|
        idx = slot_index[bucket.utc.beginning_of_minute]
        next unless idx
        per_partition_lag[pk][idx] = count.positive? ? (total.to_f / count).round(1) : nil
        per_partition_counts[pk][idx] = count
      end
      @adaptive_samples = per_partition_lag
      @completions_samples = per_partition_counts
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Per-(policy, gate, partition) adaptive concurrency state: the current
  # concurrency ceiling plus an EWMA of observed queue lag, all maintained
  # with single-statement SQL upserts/updates.
  class AdaptiveConcurrencyStats < ApplicationRecord
    self.table_name = "dispatch_policy_adaptive_concurrency_stats"

    # Seed a stats row if one doesn't exist yet. Mirrors ThrottleBucket.lock.
    def self.seed!(policy_name:, gate_name:, partition_key:, initial_max:)
      timestamp = Time.current
      insert_sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
        (policy_name, gate_name, partition_key, current_max,
        ewma_latency_ms, sample_count, created_at, updated_at)
        VALUES (?, ?, ?, ?, 0, 0, ?, ?)
        ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
      SQL

      binds = [
        insert_sql,
        policy_name,
        gate_name.to_s,
        partition_key.to_s,
        initial_max.to_i,
        timestamp,
        timestamp
      ]
      connection.exec_update(sanitize_sql_array(binds))
    end

    # Returns { partition_key => { current_max:, ewma_latency_ms: } } for
    # every requested partition that already has a stats row.
    def self.fetch_many(policy_name:, gate_name:, partition_keys:)
      return {} if partition_keys.empty?

      matching = where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
      matching
        .pluck(:partition_key, :current_max, :ewma_latency_ms)
        .to_h { |key, max, latency| [ key, { current_max: max, ewma_latency_ms: latency } ] }
    end

    # Single-statement EWMA + AIMD update so concurrent performs can't race
    # on read-modify-write. Seed first (INSERT ON CONFLICT DO NOTHING), then
    # apply the adjustment.
    def self.record_observation!(
      policy_name:, gate_name:, partition_key:,
      queue_lag_ms:, succeeded:,
      alpha:, min:, target_lag_ms:,
      fail_factor:, slow_factor:, initial_max:
    )
      seed!(
        policy_name: policy_name,
        gate_name: gate_name,
        partition_key: partition_key,
        initial_max: initial_max
      )

      # Feedback signal is queue_lag (admitted_at → perform_start). When
      # the adapter queue is empty, lag ≈ 0 → +1 grow. When the queue
      # backs up, lag rises past target → multiplicative shrink. Failures
      # shrink harder. Only `min` is enforced so a partition can't lock
      # out entirely. Note the CASE recomputes the new EWMA: in SQL every
      # SET expression reads the pre-update row.
      update_sql = <<~SQL.squish
        UPDATE #{quoted_table_name}
        SET
        ewma_latency_ms = ewma_latency_ms * (1 - ?) + ? * ?,
        sample_count = sample_count + 1,
        current_max = GREATEST(?, CASE
        WHEN ? = FALSE THEN FLOOR(current_max * ?)::int
        WHEN (ewma_latency_ms * (1 - ?) + ? * ?) > ? THEN FLOOR(current_max * ?)::int
        ELSE current_max + 1
        END),
        last_observed_at = ?,
        updated_at = ?
        WHERE policy_name = ? AND gate_name = ? AND partition_key = ?
      SQL

      timestamp = Time.current
      binds = [
        update_sql,
        alpha, alpha, queue_lag_ms,                               # EWMA assignment
        min.to_i,                                                 # hard floor
        succeeded, fail_factor,                                   # failure branch
        alpha, alpha, queue_lag_ms, target_lag_ms, slow_factor,   # slow branch
        timestamp, timestamp,
        policy_name, gate_name.to_s, partition_key.to_s
      ]
      connection.exec_update(sanitize_sql_array(binds))
    end

    # Quick lookup used by Dispatchable to denormalize current_max into
    # the generic partition observation row.
    def self.current_max_for(policy_name:, partition_key:)
      where(policy_name: policy_name, partition_key: partition_key.to_s)
        .limit(1)
        .pick(:current_max)
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Shared abstract base class for every DispatchPolicy model.
  # `abstract_class = true` means ActiveRecord expects no table for this
  # class itself; subclasses declare their own table names.
  class ApplicationRecord < ActiveRecord::Base
    self.abstract_class = true
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Denormalized in-flight counter per (policy, gate, partition), kept
  # current with atomic SQL upserts/updates. An admission-time
  # optimization; StagedJob groupings remain the user-facing truth.
  class PartitionInflightCount < ApplicationRecord
    self.table_name = "dispatch_policy_partition_counts"

    # Returns { partition_key => in_flight }, filling in 0 for every
    # requested key that has no row yet.
    def self.fetch_many(policy_name:, gate_name:, partition_keys:)
      return {} if partition_keys.empty?

      counts = where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
        .pluck(:partition_key, :in_flight)
        .to_h
      partition_keys.each { |key| counts[key] ||= 0 }
      counts
    end

    # Sum of in-flight counts across every partition of one gate.
    def self.total_for(policy_name:, gate_name:)
      where(policy_name: policy_name, gate_name: gate_name.to_s).sum(:in_flight)
    end

    # Atomically adds +by+ to the partition's counter, creating the row on
    # first use via INSERT ... ON CONFLICT DO UPDATE.
    def self.increment(policy_name:, gate_name:, partition_key:, by: 1)
      timestamp = Time.current
      upsert_sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
        (policy_name, gate_name, partition_key, in_flight, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT (policy_name, gate_name, partition_key)
        DO UPDATE SET
        in_flight = #{quoted_table_name}.in_flight + EXCLUDED.in_flight,
        updated_at = EXCLUDED.updated_at
      SQL

      binds = [ upsert_sql, policy_name, gate_name.to_s, partition_key.to_s, by, timestamp, timestamp ]
      connection.exec_update(sanitize_sql_array(binds))
    end

    # Atomically subtracts +by+, clamped at zero so a duplicate decrement
    # cannot drive the counter negative. No-op when the row doesn't exist.
    def self.decrement(policy_name:, gate_name:, partition_key:, by: 1)
      where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_key.to_s)
        .update_all([
          "in_flight = GREATEST(in_flight - ?, 0), updated_at = ?", by, Time.current
        ])
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Minute-bucketed observability per (policy, partition). Any gate with
  # partition_by gets an observation row here — adaptive, throttle,
  # concurrency, whatever — so the admin chart shows queue lag / throughput
  # for all partitioned policies, not just the adaptive ones.
  #
  # One row per (policy, partition, minute): total_lag_ms accumulates the
  # sum of queue_lag_ms observations in that minute, observation_count
  # increments, max_lag_ms tracks the worst spike. Average lag for the
  # bucket is derived on read as total / count.
  class PartitionObservation < ApplicationRecord
    self.table_name = "dispatch_policy_partition_observations"

    # Buckets older than this are removed by prune!.
    OBSERVATION_TTL = 2 * 60 * 60 # 2 hours

    # Folds one queue-lag sample into the current minute's bucket with a
    # single upsert. No-op when the partition key is nil or blank.
    def self.observe!(policy_name:, partition_key:, queue_lag_ms:, current_max: nil)
      key = partition_key.to_s
      return if key.empty?

      timestamp = Time.current
      lag_ms = queue_lag_ms.to_i
      upsert_sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
        (policy_name, partition_key, minute_bucket,
        total_lag_ms, observation_count, max_lag_ms, current_max,
        created_at, updated_at)
        VALUES (?, ?, date_trunc('minute', ?::timestamp), ?, 1, ?, ?, ?, ?)
        ON CONFLICT (policy_name, partition_key, minute_bucket)
        DO UPDATE SET
        total_lag_ms = #{quoted_table_name}.total_lag_ms + EXCLUDED.total_lag_ms,
        observation_count = #{quoted_table_name}.observation_count + 1,
        max_lag_ms = GREATEST(#{quoted_table_name}.max_lag_ms, EXCLUDED.max_lag_ms),
        current_max = COALESCE(EXCLUDED.current_max, #{quoted_table_name}.current_max),
        updated_at = EXCLUDED.updated_at
      SQL

      binds = [
        upsert_sql, policy_name, key, timestamp,
        lag_ms, lag_ms, current_max, timestamp, timestamp
      ]
      connection.exec_update(sanitize_sql_array(binds))
    end

    # Drops every bucket older than OBSERVATION_TTL; returns rows deleted.
    def self.prune!
      cutoff = Time.current - OBSERVATION_TTL
      where("minute_bucket < ?", cutoff).delete_all
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # A job held in the admission queue. Lifecycle, as reflected in the
  # scopes below: pending (staged, not yet admitted) -> admitted (in
  # flight, lease held) -> completed.
  class StagedJob < ApplicationRecord
    self.table_name = "dispatch_policy_staged_jobs"

    scope :pending,   -> { where(admitted_at: nil, completed_at: nil) }
    scope :admitted,  -> { where.not(admitted_at: nil).where(completed_at: nil) }
    scope :completed, -> { where.not(completed_at: nil) }
    scope :active,    -> { where(completed_at: nil) }
    scope :expired_leases, -> {
      admitted.where("lease_expires_at IS NOT NULL AND lease_expires_at < ?", Time.current)
    }

    # Merge the job's ActiveJob metadata (queue_name, priority) into the
    # context hash so gate lambdas can partition_by :queue_name without
    # the user having to pass it as a kwarg. User-provided keys win.
    def self.context_for(job_instance, policy)
      built = policy.context_builder.call(job_instance.arguments)
      return built unless built.is_a?(Hash)

      defaults = {
        queue_name: job_instance.queue_name,
        priority: job_instance.priority
      }
      defaults.merge(built.symbolize_keys)
    end

    # Stages a job in the admission queue. Returns the created row, or nil if
    # the policy declares a dedupe_key and an active row already exists.
    def self.stage!(job_instance:, policy:)
      dedupe_key = policy.build_dedupe_key(job_instance.arguments)
      return nil if dedupe_key && exists?(policy_name: policy.name, dedupe_key: dedupe_key, completed_at: nil)

      create!(
        job_class: job_instance.class.name,
        policy_name: policy.name,
        arguments: job_instance.serialize,
        snapshot: policy.build_snapshot(job_instance.arguments),
        context: context_for(job_instance, policy),
        priority: job_instance.priority || 100,
        not_before_at: job_instance.scheduled_at,
        staged_at: Time.current,
        dedupe_key: dedupe_key,
        round_robin_key: policy.build_round_robin_key(job_instance.arguments)
      )
    rescue ActiveRecord::RecordNotUnique
      # A concurrent stage! won the dedupe race between the exists? check
      # and the insert — same outcome as the fast path above.
      nil
    end

    # Batch-insert variant of stage!. Returns the number of rows inserted;
    # conflicts on the active-dedupe unique index are skipped.
    def self.stage_many!(policy:, jobs:)
      return 0 if jobs.empty?

      staged_time = Time.current
      payloads = jobs.map do |job_instance|
        {
          job_class: job_instance.class.name,
          policy_name: policy.name,
          arguments: job_instance.serialize,
          snapshot: policy.build_snapshot(job_instance.arguments),
          context: context_for(job_instance, policy),
          priority: job_instance.priority || 100,
          not_before_at: job_instance.scheduled_at,
          staged_at: staged_time,
          dedupe_key: policy.build_dedupe_key(job_instance.arguments),
          round_robin_key: policy.build_round_robin_key(job_instance.arguments),
          partitions: {},
          created_at: staged_time,
          updated_at: staged_time
        }
      end

      insert_all(payloads, unique_by: :idx_dp_staged_dedupe_active).rows.size
    end

    # Marks every still-active row carrying this ActiveJob id as completed
    # and clears its lease. Returns the number of rows updated (0 for a
    # blank id).
    def self.mark_completed_by_active_job_id(active_job_id)
      return 0 if active_job_id.blank?

      where(active_job_id: active_job_id, completed_at: nil)
        .update_all(completed_at: Time.current, lease_expires_at: nil)
    end

    # Transitions this row to the admitted state and returns the rebuilt
    # ActiveJob instance, annotated with _dispatch_partitions and
    # _dispatch_admitted_at for downstream bookkeeping.
    def mark_admitted!(partitions:)
      admitted_time = Time.current
      job = instantiate_active_job
      job._dispatch_partitions = partitions
      job._dispatch_admitted_at = admitted_time

      update!(
        admitted_at: admitted_time,
        lease_expires_at: admitted_time + DispatchPolicy.config.lease_duration,
        active_job_id: job.job_id,
        partitions: partitions
      )

      job
    end

    # Rebuilds the original ActiveJob instance from the serialized payload.
    def instantiate_active_job
      ActiveJob::Base.deserialize(arguments)
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Token-bucket state per (policy, gate, partition). Rows are lazily
  # seeded (full, at +burst+ tokens) and then row-locked with FOR UPDATE
  # so refill!/consume run without racing other dispatchers.
  class ThrottleBucket < ApplicationRecord
    self.table_name = "dispatch_policy_throttle_buckets"

    # Ensures the bucket row exists, then loads it locked FOR UPDATE.
    # Raises ActiveRecord::RecordNotFound (via first!) only if the row
    # vanished between the seed and the read.
    def self.lock(policy_name:, gate_name:, partition_key:, burst:)
      timestamp = Time.current
      insert_sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
        (policy_name, gate_name, partition_key, tokens, refilled_at, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
      SQL

      binds = [
        insert_sql, policy_name, gate_name.to_s, partition_key.to_s,
        burst.to_f, timestamp, timestamp, timestamp
      ]
      connection.exec_update(sanitize_sql_array(binds))

      where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_key.to_s)
        .lock("FOR UPDATE")
        .first!
    end

    # Credits tokens accrued since refilled_at at +rate+ per +per+ seconds,
    # capped at +burst+, and advances refilled_at. Mutates attributes only;
    # the caller is responsible for saving.
    def refill!(rate:, per:, burst:)
      current_time = Time.current
      elapsed_seconds = (current_time - refilled_at).to_f
      replenished = tokens + (rate * elapsed_seconds / per)
      self.tokens = [ replenished, burst.to_f ].min
      self.refilled_at = current_time
    end

    # Takes +n+ tokens when enough are available and returns true;
    # otherwise leaves the bucket untouched and returns false. Mutates
    # attributes only; the caller is responsible for saving.
    def consume(n = 1)
      if tokens >= n
        self.tokens -= n
        true
      else
        false
      end
    end
  end
end
<%# Admin landing page: global summary counters plus one table row per
    registered policy. Data comes from PoliciesController#index. %>
<h1>DispatchPolicy</h1>
<p class="muted">Admission-control policies registered in this app.</p>

<div class="summary">
  <div class="summary-item">
    <div class="label">Active partitions</div>
    <div class="value"><%= @active_partitions %></div>
  </div>
  <div class="summary-item">
    <div class="label">Expired leases</div>
    <div class="value"><%= @expired_leases %></div>
  </div>
</div>

<h2>Policies</h2>
<% if @policies.empty? %>
  <p class="muted">No policies registered yet.</p>
<% else %>
  <table>
    <thead>
      <tr>
        <th>Policy</th>
        <th>Job class</th>
        <th>Pending</th>
        <th>Admitted</th>
        <th>Completed (24h)</th>
        <th>Oldest pending</th>
      </tr>
    </thead>
    <tbody>
      <% @policies.each do |p| %>
        <tr>
          <td><%= link_to p[:name], policy_path(policy_name: p[:name]) %></td>
          <td><code><%= p[:job_class].name %></code></td>
          <td><%= p[:pending_count] %></td>
          <td><%= p[:admitted_count] %></td>
          <td><%= p[:completed_24h] %></td>
          <td>
            <% if p[:oldest_pending] %>
              <%# Flag backlogs older than the policy's stale threshold. %>
              <% stale = p[:oldest_pending] < Time.current - p[:stale_threshold] %>
              <span class="<%= 'stale' if stale %>">
                <%= time_ago_in_words(p[:oldest_pending]) %> ago
              </span>
            <% else %>
              <span class="muted">—</span>
            <% end %>
          </td>
        </tr>
      <% end %>
    </tbody>
  </table>
<% end %>