dispatch_policy 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +12 -0
- data/MIT-LICENSE +21 -0
- data/README.md +435 -0
- data/app/controllers/dispatch_policy/application_controller.rb +9 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +269 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +89 -0
- data/app/models/dispatch_policy/application_record.rb +7 -0
- data/app/models/dispatch_policy/partition_inflight_count.rb +42 -0
- data/app/models/dispatch_policy/partition_observation.rb +49 -0
- data/app/models/dispatch_policy/staged_job.rb +105 -0
- data/app/models/dispatch_policy/throttle_bucket.rb +41 -0
- data/app/views/dispatch_policy/policies/index.html.erb +52 -0
- data/app/views/dispatch_policy/policies/show.html.erb +241 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +266 -0
- data/config/routes.rb +6 -0
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +80 -0
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +22 -0
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +25 -0
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +32 -0
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +32 -0
- data/lib/dispatch_policy/dispatch_context.rb +53 -0
- data/lib/dispatch_policy/dispatchable.rb +120 -0
- data/lib/dispatch_policy/engine.rb +36 -0
- data/lib/dispatch_policy/gate.rb +49 -0
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +123 -0
- data/lib/dispatch_policy/gates/concurrency.rb +43 -0
- data/lib/dispatch_policy/gates/fair_interleave.rb +32 -0
- data/lib/dispatch_policy/gates/global_cap.rb +26 -0
- data/lib/dispatch_policy/gates/throttle.rb +52 -0
- data/lib/dispatch_policy/install_generator.rb +23 -0
- data/lib/dispatch_policy/policy.rb +73 -0
- data/lib/dispatch_policy/tick.rb +214 -0
- data/lib/dispatch_policy/tick_loop.rb +45 -0
- data/lib/dispatch_policy/version.rb +5 -0
- data/lib/dispatch_policy.rb +64 -0
- metadata +182 -0

data/app/controllers/dispatch_policy/policies_controller.rb
@@ -0,0 +1,269 @@
# frozen_string_literal: true

module DispatchPolicy
  class PoliciesController < ApplicationController
    STALE_PENDING_THRESHOLD = 1.hour
    PARTITION_LIST_PAGE_SIZE = 25

    before_action :load_policy, only: :show

    def index
      @policies = DispatchPolicy.registry.map do |name, job_class|
        scope = StagedJob.where(policy_name: name)
        pending = scope.pending
        {
          name: name,
          job_class: job_class,
          policy: job_class.resolved_dispatch_policy,
          pending_count: pending.count,
          admitted_count: scope.admitted.count,
          completed_24h: scope.completed.where(completed_at: 24.hours.ago..).count,
          oldest_pending: pending.minimum(:staged_at),
          stale_threshold: STALE_PENDING_THRESHOLD
        }
      end.sort_by { |p| -p[:pending_count] }

      @active_partitions = PartitionInflightCount.where("in_flight > 0").count
      @expired_leases = StagedJob.expired_leases.count
    end

    def show
      scope = StagedJob.where(policy_name: @policy_name)
      @pending_count = scope.pending.count
      @pending_eligible_count = scope.pending.where("not_before_at IS NULL OR not_before_at <= ?", Time.current).count
      @pending_scheduled_count = @pending_count - @pending_eligible_count
      @admitted_count = scope.admitted.count
      @completed_24h = scope.completed.where(completed_at: 24.hours.ago..).count

      all_breakdown = partition_breakdown(scope)

      # "Watched" subset (passed via ?watch=a,b,c; the JS layer syncs it
      # with localStorage so the choice sticks across reloads).
      @watched_keys = (params[:watch] || "").split(",").map(&:strip).reject(&:empty?)
      @partition_breakdown = @watched_keys.any? ? all_breakdown.select { |r| @watched_keys.include?(r[:partition]) } : []

      # Browsable list of every active partition with filter + sort + pagination.
      @partition_search = params[:q].to_s.strip
      @partition_page = [ params[:page].to_i, 1 ].max
      @partition_sort = %w[source partition pending in_flight completed_24h last_enqueued_at last_dispatched_at].include?(params[:sort]) ? params[:sort] : "activity"
      @partition_dir = params[:dir] == "asc" ? "asc" : "desc"

      list = all_breakdown
      list = list.select { |r| r[:partition].to_s.downcase.include?(@partition_search.downcase) } if @partition_search.present?
      list = sort_partition_list(list, @partition_sort, @partition_dir)

      @partition_total_list = list.size
      offset = (@partition_page - 1) * PARTITION_LIST_PAGE_SIZE
      @partition_list = list[offset, PARTITION_LIST_PAGE_SIZE] || []

      load_adaptive_chart_data
      @throttle_buckets = ThrottleBucket
        .where(policy_name: @policy_name).order(:gate_name, :partition_key).limit(50)
      @pending_jobs = scope.pending.order(:priority, :staged_at).limit(50)
    end

    private

    def load_policy
      @policy_name = params[:policy_name]
      @job_class = DispatchPolicy.registry[@policy_name] ||
        Tick.autoload_job_for(@policy_name)
      raise ActiveRecord::RecordNotFound unless @job_class
      @policy = @job_class.resolved_dispatch_policy
    end

    # Per-(source, partition) breakdown of pending-eligible / pending-scheduled
    # / in-flight / completed-24h. A "source" is either a gate with a
    # partition_by (uses gate.partition_key_for(context)) or the policy's
    # round_robin_by declaration (uses the round_robin_key column directly).
    # All four counts come from StagedJob groupings; PartitionInflightCount
    # is an admission-time optimization, not the user-facing truth.
    def partition_breakdown(scope)
      sources = partition_sources
      return [] if sources.empty?

      now = Time.current
      now_iso = now.iso8601
      since_24h = 24.hours.ago.iso8601

      adaptive_stats = AdaptiveConcurrencyStats.where(policy_name: @policy_name)
        .pluck(:gate_name, :partition_key, :current_max, :ewma_latency_ms)
        .each_with_object({}) { |(g, k, c, l), h|
          h[[ g, k ]] = { current_max: c, ewma_latency_ms: l.to_f.round(1) }
        }

      rows = Hash.new { |h, k|
        h[k] = {
          source: k[0],
          partition: k[1],
          eligible: 0,
          scheduled: 0,
          in_flight: 0,
          completed_24h: 0,
          last_enqueued_at: nil,
          last_dispatched_at: nil,
          current_max: nil,
          ewma_latency_ms: nil
        }
      }

      # Activity timestamps bounded to the last 24h so the scan stays on
      # an index-friendly slice of staged_jobs.
      activity_rows = scope
        .where("staged_at > ?", since_24h)
        .group(:context, :round_robin_key)
        .pluck(
          :context,
          :round_robin_key,
          Arel.sql("MAX(staged_at)"),
          Arel.sql("MAX(admitted_at)")
        )

      sources.each do |name, extract|
        pending_counts = scope.pending.group(:context, :round_robin_key).pluck(
          :context,
          :round_robin_key,
          Arel.sql("count(*) filter (where not_before_at is null or not_before_at <= '#{now_iso}')"),
          Arel.sql("count(*) filter (where not_before_at > '#{now_iso}')")
        )
        pending_counts.each do |ctx, rr_key, eligible, scheduled|
          partition = extract.call(ctx, rr_key)
          row = rows[[ name, partition ]]
          row[:eligible] += eligible
          row[:scheduled] += scheduled
        end

        admitted_counts = scope.admitted.group(:context, :round_robin_key).pluck(
          :context, :round_robin_key, Arel.sql("count(*)")
        )
        admitted_counts.each do |ctx, rr_key, in_flight|
          partition = extract.call(ctx, rr_key)
          rows[[ name, partition ]][:in_flight] += in_flight
        end

        completed_counts = scope.completed.where("completed_at > ?", since_24h)
          .group(:context, :round_robin_key).pluck(
            :context, :round_robin_key, Arel.sql("count(*)")
          )
        completed_counts.each do |ctx, rr_key, completed|
          partition = extract.call(ctx, rr_key)
          rows[[ name, partition ]][:completed_24h] += completed
        end

        activity_rows.each do |ctx, rr_key, last_staged, last_admitted|
          partition = extract.call(ctx, rr_key)
          row = rows[[ name, partition ]]
          row[:last_enqueued_at] = [ row[:last_enqueued_at], last_staged ].compact.max
          row[:last_dispatched_at] = [ row[:last_dispatched_at], last_admitted ].compact.max
        end
      end

      rows.each do |(source, partition), row|
        stats = adaptive_stats[[ source, partition ]]
        next unless stats
        row[:current_max] = stats[:current_max]
        row[:ewma_latency_ms] = stats[:ewma_latency_ms]
      end

      # Two different sources (say round_robin_by account_id + a gate
      # partitioned by account_id) producing the same partition key yield
      # identical counts — collapse them into one row with a merged source
      # label instead of listing the same numbers twice.
      merged = rows.values
        .reject { |r| r[:partition].nil? || r[:partition].empty? }
        .group_by { |r| [ r[:partition], r[:eligible], r[:scheduled], r[:in_flight], r[:completed_24h] ] }
        .map { |_, group|
          base = group.first.dup
          base[:source] = group.map { |r| r[:source] }.uniq.sort.join(" + ")
          group.each do |r|
            base[:current_max] ||= r[:current_max]
            base[:ewma_latency_ms] ||= r[:ewma_latency_ms]
            base[:last_enqueued_at] = [ base[:last_enqueued_at], r[:last_enqueued_at] ].compact.max
            base[:last_dispatched_at] = [ base[:last_dispatched_at], r[:last_dispatched_at] ].compact.max
          end
          base
        }

      merged.sort_by { |r|
        [ -(r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h]), r[:source], r[:partition] ]
      }
    end

    def sort_partition_list(list, sort, dir)
      # Put nulls at the bottom regardless of direction (Time#to_f on nil
      # would crash; -Float::INFINITY sorts first, +Float::INFINITY last).
      key =
        case sort
        when "source" then ->(r) { [ r[:source], r[:partition] ] }
        when "partition" then ->(r) { r[:partition] }
        when "pending" then ->(r) { r[:eligible] + r[:scheduled] }
        when "in_flight" then ->(r) { r[:in_flight] }
        when "completed_24h" then ->(r) { r[:completed_24h] }
        when "last_enqueued_at" then ->(r) { r[:last_enqueued_at]&.to_f || 0 }
        when "last_dispatched_at" then ->(r) { r[:last_dispatched_at]&.to_f || 0 }
        else ->(r) { r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h] }
        end
      sorted = list.sort_by(&key)
      dir == "asc" ? sorted : sorted.reverse
    end

    # Returns [[source_name, ->(ctx, rr_key) { partition_key }], ...]
    # covering every partition-producing declaration on the policy: every
    # gate with a partition_by, plus round_robin_by if declared.
    def partition_sources
      return [] unless @policy

      sources = @policy.gates.select(&:partition_by).map do |gate|
        [ gate.name.to_s, ->(ctx, _rr) { gate.partition_key_for((ctx || {}).symbolize_keys) } ]
      end
      sources << [ "round_robin_by", ->(_ctx, rr) { rr } ] if @policy.round_robin?
      sources
    end

    # Build chart data from PartitionObservation. Two queries:
    # - Global aggregated (one row per minute): cheap even with 1000s of
    #   partitions because we SUM/AVG in SQL, not in Ruby.
    # - Per-partition sparkline data, scoped to only the partitions we're
    #   going to actually render (breakdown's top N).
    def load_adaptive_chart_data
      last_minute = Time.current.utc.beginning_of_minute
      @chart_slots = (0..59).map { |i| last_minute - (59 - i).minutes }
      @chart_labels = @chart_slots.map { |t| t.strftime("%H:%M") }
      slot_index = @chart_slots.each_with_index.to_h

      @adaptive_global = Array.new(@chart_slots.size)
      @completions_global = Array.new(@chart_slots.size, 0)
      global_rows = PartitionObservation
        .where(policy_name: @policy_name)
        .where("minute_bucket >= ?", @chart_slots.first)
        .group(:minute_bucket)
        .pluck(:minute_bucket, Arel.sql("SUM(total_lag_ms)"), Arel.sql("SUM(observation_count)"))
      global_rows.each do |bucket, total_lag, obs_count|
        idx = slot_index[bucket.utc.beginning_of_minute]
        next unless idx
        @completions_global[idx] = obs_count
        @adaptive_global[idx] = obs_count.positive? ? (total_lag.to_f / obs_count).round(1) : nil
      end

      partition_keys = (@partition_breakdown || []).map { |r| r[:partition] }.uniq
      @adaptive_samples = {}
      @completions_samples = {}
      return if partition_keys.empty?

      per_partition_lag = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size) }
      per_partition_counts = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size, 0) }
      rows = PartitionObservation
        .where(policy_name: @policy_name, partition_key: partition_keys)
        .where("minute_bucket >= ?", @chart_slots.first)
        .pluck(:partition_key, :minute_bucket, :total_lag_ms, :observation_count)
      rows.each do |pk, bucket, total, count|
        idx = slot_index[bucket.utc.beginning_of_minute]
        next unless idx
        per_partition_lag[pk][idx] = count.positive? ? (total.to_f / count).round(1) : nil
        per_partition_counts[pk][idx] = count
      end
      @adaptive_samples = per_partition_lag
      @completions_samples = per_partition_counts
    end
  end
end
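
To see what the collapse step at the end of `partition_breakdown` does, here is a standalone sketch with toy rows (the source and partition names are invented): two sources that report identical counts for the same partition key fold into a single row with a merged source label.

```ruby
# Standalone illustration of the duplicate-source collapse; not gem code.
rows = [
  { source: "per_account",    partition: "acct_42", eligible: 3, in_flight: 1 },
  { source: "round_robin_by", partition: "acct_42", eligible: 3, in_flight: 1 }
]

merged = rows
  .group_by { |r| [ r[:partition], r[:eligible], r[:in_flight] ] }
  .map do |_, group|
    group.first.merge(source: group.map { |r| r[:source] }.uniq.sort.join(" + "))
  end

merged
# => [{ source: "per_account + round_robin_by", partition: "acct_42",
#       eligible: 3, in_flight: 1 }]
```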

data/app/models/dispatch_policy/adaptive_concurrency_stats.rb
@@ -0,0 +1,89 @@
# frozen_string_literal: true

module DispatchPolicy
  class AdaptiveConcurrencyStats < ApplicationRecord
    self.table_name = "dispatch_policy_adaptive_concurrency_stats"

    # Seed a stats row if one doesn't exist yet. Mirrors ThrottleBucket.lock.
    def self.seed!(policy_name:, gate_name:, partition_key:, initial_max:)
      now = Time.current
      sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
          (policy_name, gate_name, partition_key, current_max,
           ewma_latency_ms, sample_count, created_at, updated_at)
        VALUES (?, ?, ?, ?, 0, 0, ?, ?)
        ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
      SQL
      connection.exec_update(
        sanitize_sql_array([
          sql, policy_name, gate_name.to_s, partition_key.to_s,
          initial_max.to_i, now, now
        ])
      )
    end

    def self.fetch_many(policy_name:, gate_name:, partition_keys:)
      return {} if partition_keys.empty?
      where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
        .pluck(:partition_key, :current_max, :ewma_latency_ms)
        .each_with_object({}) { |(k, c, l), h| h[k] = { current_max: c, ewma_latency_ms: l } }
    end

    # Single-statement EWMA + AIMD update so concurrent performs can't race
    # on read-modify-write. Seed first (INSERT ON CONFLICT DO NOTHING), then
    # apply the adjustment.
    def self.record_observation!(
      policy_name:, gate_name:, partition_key:,
      queue_lag_ms:, succeeded:,
      alpha:, min:, target_lag_ms:,
      fail_factor:, slow_factor:, initial_max:
    )
      seed!(
        policy_name: policy_name,
        gate_name: gate_name,
        partition_key: partition_key,
        initial_max: initial_max
      )

      # Feedback signal is queue_lag (admitted_at → perform_start). When
      # the adapter queue is empty, lag ≈ 0 → +1 grow. When the queue
      # backs up, lag rises past target → multiplicative shrink. Failures
      # shrink harder. Only `min` is enforced so a partition can't lock
      # out entirely.
      sql = <<~SQL.squish
        UPDATE #{quoted_table_name}
        SET
          ewma_latency_ms = ewma_latency_ms * (1 - ?) + ? * ?,
          sample_count = sample_count + 1,
          current_max = GREATEST(?, CASE
            WHEN ? = FALSE THEN FLOOR(current_max * ?)::int
            WHEN (ewma_latency_ms * (1 - ?) + ? * ?) > ? THEN FLOOR(current_max * ?)::int
            ELSE current_max + 1
          END),
          last_observed_at = ?,
          updated_at = ?
        WHERE policy_name = ? AND gate_name = ? AND partition_key = ?
      SQL

      now = Time.current
      connection.exec_update(
        sanitize_sql_array([
          sql,
          alpha, alpha, queue_lag_ms,
          min.to_i,
          succeeded, fail_factor,
          alpha, alpha, queue_lag_ms, target_lag_ms, slow_factor,
          now, now,
          policy_name, gate_name.to_s, partition_key.to_s
        ])
      )
    end

    # Quick lookup used by Dispatchable to denormalize current_max into
    # the generic partition observation row.
    def self.current_max_for(policy_name:, partition_key:)
      where(policy_name: policy_name, partition_key: partition_key.to_s)
        .limit(1).pick(:current_max)
    end
  end
end
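
To make the EWMA + AIMD arithmetic concrete, here is a plain-Ruby mirror of what that single UPDATE computes for one observation. The tuning values below are invented defaults for illustration; the gem passes them in from the gate's configuration.

```ruby
# Illustration only; mirrors the SQL above for a single observation.
def next_state(ewma:, current_max:, lag_ms:, succeeded:,
               alpha: 0.2, min: 1, target_lag_ms: 500.0,
               fail_factor: 0.5, slow_factor: 0.9)
  new_ewma = ewma * (1 - alpha) + alpha * lag_ms
  new_max =
    if !succeeded
      (current_max * fail_factor).floor   # failure: shrink hard
    elsif new_ewma > target_lag_ms
      (current_max * slow_factor).floor   # lag over target: shrink gently
    else
      current_max + 1                     # healthy: additive growth
    end
  { ewma: new_ewma, current_max: [ new_max, min ].max }
end

next_state(ewma: 400.0, current_max: 10, lag_ms: 1200, succeeded: true)
# => { ewma: 560.0, current_max: 9 }  (EWMA crossed the 500ms target)
```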

data/app/models/dispatch_policy/partition_inflight_count.rb
@@ -0,0 +1,42 @@
# frozen_string_literal: true

module DispatchPolicy
  class PartitionInflightCount < ApplicationRecord
    self.table_name = "dispatch_policy_partition_counts"

    def self.fetch_many(policy_name:, gate_name:, partition_keys:)
      return {} if partition_keys.empty?

      where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
        .pluck(:partition_key, :in_flight).to_h
        .tap { |h| partition_keys.each { |k| h[k] ||= 0 } }
    end

    def self.total_for(policy_name:, gate_name:)
      where(policy_name: policy_name, gate_name: gate_name.to_s).sum(:in_flight)
    end

    def self.increment(policy_name:, gate_name:, partition_key:, by: 1)
      now = Time.current
      sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
          (policy_name, gate_name, partition_key, in_flight, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?)
        ON CONFLICT (policy_name, gate_name, partition_key)
        DO UPDATE SET
          in_flight = #{quoted_table_name}.in_flight + EXCLUDED.in_flight,
          updated_at = EXCLUDED.updated_at
      SQL
      connection.exec_update(
        sanitize_sql_array([ sql, policy_name, gate_name.to_s, partition_key.to_s, by, now, now ])
      )
    end

    def self.decrement(policy_name:, gate_name:, partition_key:, by: 1)
      where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_key.to_s)
        .update_all([
          "in_flight = GREATEST(in_flight - ?, 0), updated_at = ?", by, Time.current
        ])
    end
  end
end
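
The intended call pattern, as a sketch: an increment at admission and a matching decrement at completion. The real call sites live in the gem's gate and dispatch code elsewhere in this diff; the policy, gate, and partition names below are invented.

```ruby
# Hypothetical names throughout.
DispatchPolicy::PartitionInflightCount.increment(
  policy_name: "reports", gate_name: :per_account, partition_key: "acct_42"
)
# ... the admitted job runs, then completes ...
DispatchPolicy::PartitionInflightCount.decrement(
  policy_name: "reports", gate_name: :per_account, partition_key: "acct_42"
)

DispatchPolicy::PartitionInflightCount.fetch_many(
  policy_name: "reports", gate_name: :per_account,
  partition_keys: [ "acct_42", "acct_7" ]
)
# => { "acct_42" => 0, "acct_7" => 0 }  (keys with no row default to 0)
```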

data/app/models/dispatch_policy/partition_observation.rb
@@ -0,0 +1,49 @@
# frozen_string_literal: true

module DispatchPolicy
  # Minute-bucketed observability per (policy, partition). Any gate with
  # partition_by gets an observation row here — adaptive, throttle,
  # concurrency, whatever — so the admin chart shows queue lag / throughput
  # for all partitioned policies, not just the adaptive ones.
  #
  # One row per (policy, partition, minute): total_lag_ms accumulates the
  # sum of queue_lag_ms observations in that minute, observation_count
  # increments, max_lag_ms tracks the worst spike. Average lag for the
  # bucket is derived on read as total / count.
  class PartitionObservation < ApplicationRecord
    self.table_name = "dispatch_policy_partition_observations"

    OBSERVATION_TTL = 2 * 60 * 60 # 2 hours

    def self.observe!(policy_name:, partition_key:, queue_lag_ms:, current_max: nil)
      return if partition_key.nil? || partition_key.to_s.empty?

      now = Time.current
      lag = queue_lag_ms.to_i
      sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
          (policy_name, partition_key, minute_bucket,
           total_lag_ms, observation_count, max_lag_ms, current_max,
           created_at, updated_at)
        VALUES (?, ?, date_trunc('minute', ?::timestamp), ?, 1, ?, ?, ?, ?)
        ON CONFLICT (policy_name, partition_key, minute_bucket)
        DO UPDATE SET
          total_lag_ms = #{quoted_table_name}.total_lag_ms + EXCLUDED.total_lag_ms,
          observation_count = #{quoted_table_name}.observation_count + 1,
          max_lag_ms = GREATEST(#{quoted_table_name}.max_lag_ms, EXCLUDED.max_lag_ms),
          current_max = COALESCE(EXCLUDED.current_max, #{quoted_table_name}.current_max),
          updated_at = EXCLUDED.updated_at
      SQL
      connection.exec_update(
        sanitize_sql_array([
          sql, policy_name, partition_key.to_s, now,
          lag, lag, current_max, now, now
        ])
      )
    end

    def self.prune!
      where("minute_bucket < ?", Time.current - OBSERVATION_TTL).delete_all
    end
  end
end
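
What the upsert accumulates within a single minute bucket, shown as plain Ruby with made-up lag values (illustration only):

```ruby
bucket = { total_lag_ms: 0, observation_count: 0, max_lag_ms: 0 }

[ 120, 340, 80 ].each do |lag_ms|          # three observations in one minute
  bucket[:total_lag_ms]      += lag_ms
  bucket[:observation_count] += 1
  bucket[:max_lag_ms]         = [ bucket[:max_lag_ms], lag_ms ].max
end

bucket  # => { total_lag_ms: 540, observation_count: 3, max_lag_ms: 340 }
bucket[:total_lag_ms].to_f / bucket[:observation_count]
# => 180.0, the average lag the dashboard derives on read
```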

data/app/models/dispatch_policy/staged_job.rb
@@ -0,0 +1,105 @@
# frozen_string_literal: true

module DispatchPolicy
  class StagedJob < ApplicationRecord
    self.table_name = "dispatch_policy_staged_jobs"

    scope :pending, -> { where(admitted_at: nil, completed_at: nil) }
    scope :admitted, -> { where.not(admitted_at: nil).where(completed_at: nil) }
    scope :completed, -> { where.not(completed_at: nil) }
    scope :active, -> { where(completed_at: nil) }
    scope :expired_leases, -> {
      admitted.where("lease_expires_at IS NOT NULL AND lease_expires_at < ?", Time.current)
    }

    # Merge the job's ActiveJob metadata (queue_name, priority) into the
    # context hash so gate lambdas can partition_by :queue_name without
    # the user having to pass it as a kwarg. User-provided keys win.
    def self.context_for(job_instance, policy)
      built = policy.context_builder.call(job_instance.arguments)
      return built unless built.is_a?(Hash)
      {
        queue_name: job_instance.queue_name,
        priority: job_instance.priority
      }.merge(built.symbolize_keys)
    end

    # Stages a job in the admission queue. Returns the created row, or nil if
    # the policy declares a dedupe_key and an active row already exists.
    def self.stage!(job_instance:, policy:)
      dedupe_key = policy.build_dedupe_key(job_instance.arguments)

      if dedupe_key && exists?(policy_name: policy.name, dedupe_key: dedupe_key, completed_at: nil)
        return nil
      end

      create!(
        job_class: job_instance.class.name,
        policy_name: policy.name,
        arguments: job_instance.serialize,
        snapshot: policy.build_snapshot(job_instance.arguments),
        context: context_for(job_instance, policy),
        priority: job_instance.priority || 100,
        not_before_at: job_instance.scheduled_at,
        staged_at: Time.current,
        dedupe_key: dedupe_key,
        round_robin_key: policy.build_round_robin_key(job_instance.arguments)
      )
    rescue ActiveRecord::RecordNotUnique
      nil
    end

    # Batch-insert variant of stage!.
    def self.stage_many!(policy:, jobs:)
      return 0 if jobs.empty?

      now = Time.current
      rows = jobs.map do |job_instance|
        {
          job_class: job_instance.class.name,
          policy_name: policy.name,
          arguments: job_instance.serialize,
          snapshot: policy.build_snapshot(job_instance.arguments),
          context: context_for(job_instance, policy),
          priority: job_instance.priority || 100,
          not_before_at: job_instance.scheduled_at,
          staged_at: now,
          dedupe_key: policy.build_dedupe_key(job_instance.arguments),
          round_robin_key: policy.build_round_robin_key(job_instance.arguments),
          partitions: {},
          created_at: now,
          updated_at: now
        }
      end

      result = insert_all(rows, unique_by: :idx_dp_staged_dedupe_active)
      result.rows.size
    end

    def self.mark_completed_by_active_job_id(active_job_id)
      return 0 if active_job_id.blank?
      where(active_job_id: active_job_id, completed_at: nil)
        .update_all(completed_at: Time.current, lease_expires_at: nil)
    end

    def mark_admitted!(partitions:)
      now = Time.current
      job = instantiate_active_job
      job._dispatch_partitions = partitions
      job._dispatch_admitted_at = now

      update!(
        admitted_at: now,
        lease_expires_at: now + DispatchPolicy.config.lease_duration,
        active_job_id: job.job_id,
        partitions: partitions
      )

      job
    end

    def instantiate_active_job
      ActiveJob::Base.deserialize(arguments)
    end
  end
end
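
A usage sketch of the staging path, assuming a hypothetical `ReportJob` whose policy declares a dedupe key over its arguments; a second `stage!` for the same key returns nil while the first row is still active:

```ruby
# ReportJob and its policy are hypothetical.
job    = ReportJob.new(42)
policy = ReportJob.resolved_dispatch_policy

DispatchPolicy::StagedJob.stage!(job_instance: job, policy: policy)
# => #<DispatchPolicy::StagedJob ...>  (pending, awaiting admission)

DispatchPolicy::StagedJob.stage!(job_instance: ReportJob.new(42), policy: policy)
# => nil  (an active row with the same dedupe_key already exists)
```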

data/app/models/dispatch_policy/throttle_bucket.rb
@@ -0,0 +1,41 @@
# frozen_string_literal: true

module DispatchPolicy
  class ThrottleBucket < ApplicationRecord
    self.table_name = "dispatch_policy_throttle_buckets"

    def self.lock(policy_name:, gate_name:, partition_key:, burst:)
      now = Time.current
      seed_sql = <<~SQL.squish
        INSERT INTO #{quoted_table_name}
          (policy_name, gate_name, partition_key, tokens, refilled_at, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
      SQL
      connection.exec_update(
        sanitize_sql_array([
          seed_sql, policy_name, gate_name.to_s, partition_key.to_s,
          burst.to_f, now, now, now
        ])
      )

      where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_key.to_s)
        .lock("FOR UPDATE")
        .first!
    end

    def refill!(rate:, per:, burst:)
      now = Time.current
      elapsed = (now - refilled_at).to_f
      new_tokens = tokens + (rate * elapsed / per)
      self.tokens = [ new_tokens, burst.to_f ].min
      self.refilled_at = now
    end

    def consume(n = 1)
      return false if tokens < n
      self.tokens -= n
      true
    end
  end
end
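
Worked token-bucket arithmetic matching `refill!`/`consume` above, with invented numbers: tokens accrue at `rate` per `per` seconds and cap at `burst`.

```ruby
# Standalone sketch of the refill/consume math; not gem code.
tokens    = 0.0               # bucket drained
rate, per = 10.0, 60.0        # 10 tokens per 60 seconds
burst     = 20.0
elapsed   = 30.0              # seconds since refilled_at

tokens = [ tokens + (rate * elapsed / per), burst ].min  # => 5.0

if tokens >= 1                # consume(1)
  tokens -= 1                 # => 4.0 (one dispatch admitted)
end
```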

data/app/views/dispatch_policy/policies/index.html.erb
@@ -0,0 +1,52 @@
<h1>DispatchPolicy</h1>
<p class="muted">Admission-control policies registered in this app.</p>

<div class="summary">
  <div class="summary-item">
    <div class="label">Active partitions</div>
    <div class="value"><%= @active_partitions %></div>
  </div>
  <div class="summary-item">
    <div class="label">Expired leases</div>
    <div class="value"><%= @expired_leases %></div>
  </div>
</div>

<h2>Policies</h2>
<% if @policies.empty? %>
  <p class="muted">No policies registered yet.</p>
<% else %>
  <table>
    <thead>
      <tr>
        <th>Policy</th>
        <th>Job class</th>
        <th>Pending</th>
        <th>Admitted</th>
        <th>Completed (24h)</th>
        <th>Oldest pending</th>
      </tr>
    </thead>
    <tbody>
      <% @policies.each do |p| %>
        <tr>
          <td><%= link_to p[:name], policy_path(policy_name: p[:name]) %></td>
          <td><code><%= p[:job_class].name %></code></td>
          <td><%= p[:pending_count] %></td>
          <td><%= p[:admitted_count] %></td>
          <td><%= p[:completed_24h] %></td>
          <td>
            <% if p[:oldest_pending] %>
              <% stale = p[:oldest_pending] < Time.current - p[:stale_threshold] %>
              <span class="<%= 'stale' if stale %>">
                <%= time_ago_in_words(p[:oldest_pending]) %> ago
              </span>
            <% else %>
              <span class="muted">—</span>
            <% end %>
          </td>
        </tr>
      <% end %>
    </tbody>
  </table>
<% end %>