dispatch_policy 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/MIT-LICENSE +16 -17
- data/README.md +449 -288
- data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +140 -216
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
- data/config/routes.rb +18 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +4 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -37
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +62 -47
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +263 -172
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +71 -46
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +101 -43
- data/CHANGELOG.md +0 -12
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -49
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -120
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
- data/lib/dispatch_policy/install_generator.rb +0 -23
|
@@ -2,268 +2,121 @@
|
|
|
2
2
|
|
|
3
3
|
module DispatchPolicy
|
|
4
4
|
class PoliciesController < ApplicationController
|
|
5
|
-
|
|
6
|
-
PARTITION_LIST_PAGE_SIZE = 25
|
|
5
|
+
before_action :find_policy, only: %i[show pause resume drain]
|
|
7
6
|
|
|
8
|
-
|
|
7
|
+
DRAIN_MAX_PER_REQUEST = 10_000
|
|
9
8
|
|
|
10
9
|
def index
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
10
|
+
registry_names = DispatchPolicy.registry.names
|
|
11
|
+
db_names = Partition.distinct.pluck(:policy_name)
|
|
12
|
+
names = (registry_names + db_names).uniq.sort
|
|
13
|
+
|
|
14
|
+
in_flight_by_policy = InflightJob.where(policy_name: names).group(:policy_name).count
|
|
15
|
+
|
|
16
|
+
@rows = names.map do |name|
|
|
17
|
+
partitions = Partition.for_policy(name)
|
|
14
18
|
{
|
|
15
|
-
name:
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
oldest_pending: pending.minimum(:staged_at),
|
|
22
|
-
stale_threshold: STALE_PENDING_THRESHOLD
|
|
19
|
+
name: name,
|
|
20
|
+
registered: registry_names.include?(name),
|
|
21
|
+
pending: partitions.sum(:pending_count),
|
|
22
|
+
in_flight: in_flight_by_policy[name] || 0,
|
|
23
|
+
partitions: partitions.count,
|
|
24
|
+
paused_count: partitions.paused.count
|
|
23
25
|
}
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
@active_partitions = PartitionInflightCount.where("in_flight > 0").count
|
|
27
|
-
@expired_leases = StagedJob.expired_leases.count
|
|
26
|
+
end
|
|
28
27
|
end
|
|
29
28
|
|
|
30
29
|
def show
|
|
31
|
-
|
|
32
|
-
@
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
@partition_breakdown = @watched_keys.any? ? all_breakdown.select { |r| @watched_keys.include?(r[:partition]) } : []
|
|
44
|
-
|
|
45
|
-
# Browsable list of every active partition with filter + sort + pagination.
|
|
46
|
-
@partition_search = params[:q].to_s.strip
|
|
47
|
-
@partition_page = [ params[:page].to_i, 1 ].max
|
|
48
|
-
@partition_sort = %w[source partition pending in_flight completed_24h last_enqueued_at last_dispatched_at].include?(params[:sort]) ? params[:sort] : "activity"
|
|
49
|
-
@partition_dir = params[:dir] == "asc" ? "asc" : "desc"
|
|
50
|
-
|
|
51
|
-
list = all_breakdown
|
|
52
|
-
list = list.select { |r| r[:partition].to_s.downcase.include?(@partition_search.downcase) } if @partition_search.present?
|
|
53
|
-
list = sort_partition_list(list, @partition_sort, @partition_dir)
|
|
54
|
-
|
|
55
|
-
@partition_total_list = list.size
|
|
56
|
-
offset = (@partition_page - 1) * PARTITION_LIST_PAGE_SIZE
|
|
57
|
-
@partition_list = list[offset, PARTITION_LIST_PAGE_SIZE] || []
|
|
58
|
-
|
|
59
|
-
load_adaptive_chart_data
|
|
60
|
-
@throttle_buckets = ThrottleBucket
|
|
61
|
-
.where(policy_name: @policy_name).order(:gate_name, :partition_key).limit(50)
|
|
62
|
-
@pending_jobs = scope.pending.order(:priority, :staged_at).limit(50)
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
private
|
|
66
|
-
|
|
67
|
-
def load_policy
|
|
68
|
-
@policy_name = params[:policy_name]
|
|
69
|
-
@job_class = DispatchPolicy.registry[@policy_name] ||
|
|
70
|
-
Tick.autoload_job_for(@policy_name)
|
|
71
|
-
raise ActiveRecord::RecordNotFound unless @job_class
|
|
72
|
-
@policy = @job_class.resolved_dispatch_policy
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
# Per-(source, partition) breakdown of pending-eligible / pending-scheduled
|
|
76
|
-
# / in-flight / completed-24h. A "source" is either a gate with a
|
|
77
|
-
# partition_by (uses gate.partition_key_for(context)) or the policy's
|
|
78
|
-
# round_robin_by declaration (uses the round_robin_key column directly).
|
|
79
|
-
# All four counts come from StagedJob groupings; PartitionInflightCount
|
|
80
|
-
# is an admission-time optimization, not the user-facing truth.
|
|
81
|
-
def partition_breakdown(scope)
|
|
82
|
-
sources = partition_sources
|
|
83
|
-
return [] if sources.empty?
|
|
84
|
-
|
|
85
|
-
now = Time.current
|
|
86
|
-
now_iso = now.iso8601
|
|
87
|
-
since_24h = 24.hours.ago.iso8601
|
|
88
|
-
|
|
89
|
-
adaptive_stats = AdaptiveConcurrencyStats.where(policy_name: @policy_name)
|
|
90
|
-
.pluck(:gate_name, :partition_key, :current_max, :ewma_latency_ms)
|
|
91
|
-
.each_with_object({}) { |(g, k, c, l), h|
|
|
92
|
-
h[[ g, k ]] = { current_max: c, ewma_latency_ms: l.to_f.round(1) }
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
rows = Hash.new { |h, k|
|
|
96
|
-
h[k] = {
|
|
97
|
-
source: k[0],
|
|
98
|
-
partition: k[1],
|
|
99
|
-
eligible: 0,
|
|
100
|
-
scheduled: 0,
|
|
101
|
-
in_flight: 0,
|
|
102
|
-
completed_24h: 0,
|
|
103
|
-
last_enqueued_at: nil,
|
|
104
|
-
last_dispatched_at: nil,
|
|
105
|
-
current_max: nil,
|
|
106
|
-
ewma_latency_ms: nil
|
|
107
|
-
}
|
|
30
|
+
@policy_object = DispatchPolicy.registry.fetch(@policy_name)
|
|
31
|
+
@partitions = Partition.for_policy(@policy_name)
|
|
32
|
+
.order(Arel.sql("pending_count DESC, last_admit_at DESC NULLS LAST"))
|
|
33
|
+
.limit(100)
|
|
34
|
+
@top_admitted = Partition.for_policy(@policy_name)
|
|
35
|
+
.order(total_admitted: :desc)
|
|
36
|
+
.limit(20)
|
|
37
|
+
|
|
38
|
+
@totals = {
|
|
39
|
+
pending: Partition.for_policy(@policy_name).sum(:pending_count),
|
|
40
|
+
in_flight: InflightJob.where(policy_name: @policy_name).count,
|
|
41
|
+
partitions: Partition.for_policy(@policy_name).count
|
|
108
42
|
}
|
|
109
43
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
.
|
|
114
|
-
.
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
:
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
)
|
|
129
|
-
pending_counts.each do |ctx, rr_key, eligible, scheduled|
|
|
130
|
-
partition = extract.call(ctx, rr_key)
|
|
131
|
-
row = rows[[ name, partition ]]
|
|
132
|
-
row[:eligible] += eligible
|
|
133
|
-
row[:scheduled] += scheduled
|
|
134
|
-
end
|
|
135
|
-
|
|
136
|
-
admitted_counts = scope.admitted.group(:context, :round_robin_key).pluck(
|
|
137
|
-
:context, :round_robin_key, Arel.sql("count(*)")
|
|
138
|
-
)
|
|
139
|
-
admitted_counts.each do |ctx, rr_key, in_flight|
|
|
140
|
-
partition = extract.call(ctx, rr_key)
|
|
141
|
-
rows[[ name, partition ]][:in_flight] += in_flight
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
completed_counts = scope.completed.where("completed_at > ?", since_24h)
|
|
145
|
-
.group(:context, :round_robin_key).pluck(
|
|
146
|
-
:context, :round_robin_key, Arel.sql("count(*)")
|
|
147
|
-
)
|
|
148
|
-
completed_counts.each do |ctx, rr_key, completed|
|
|
149
|
-
partition = extract.call(ctx, rr_key)
|
|
150
|
-
rows[[ name, partition ]][:completed_24h] += completed
|
|
151
|
-
end
|
|
152
|
-
|
|
153
|
-
activity_rows.each do |ctx, rr_key, last_staged, last_admitted|
|
|
154
|
-
partition = extract.call(ctx, rr_key)
|
|
155
|
-
row = rows[[ name, partition ]]
|
|
156
|
-
row[:last_enqueued_at] = [ row[:last_enqueued_at], last_staged ].compact.max
|
|
157
|
-
row[:last_dispatched_at] = [ row[:last_dispatched_at], last_admitted ].compact.max
|
|
158
|
-
end
|
|
159
|
-
end
|
|
160
|
-
|
|
161
|
-
rows.each do |(source, partition), row|
|
|
162
|
-
stats = adaptive_stats[[ source, partition ]]
|
|
163
|
-
next unless stats
|
|
164
|
-
row[:current_max] = stats[:current_max]
|
|
165
|
-
row[:ewma_latency_ms] = stats[:ewma_latency_ms]
|
|
166
|
-
end
|
|
167
|
-
|
|
168
|
-
# Two different sources (say round_robin_by account_id + a gate
|
|
169
|
-
# partitioned by account_id) producing the same partition key yield
|
|
170
|
-
# identical counts — collapse them into one row with a merged source
|
|
171
|
-
# label instead of listing the same numbers twice.
|
|
172
|
-
merged = rows.values
|
|
173
|
-
.reject { |r| r[:partition].nil? || r[:partition].empty? }
|
|
174
|
-
.group_by { |r| [ r[:partition], r[:eligible], r[:scheduled], r[:in_flight], r[:completed_24h] ] }
|
|
175
|
-
.map { |_, group|
|
|
176
|
-
base = group.first.dup
|
|
177
|
-
base[:source] = group.map { |r| r[:source] }.uniq.sort.join(" + ")
|
|
178
|
-
group.each do |r|
|
|
179
|
-
base[:current_max] ||= r[:current_max]
|
|
180
|
-
base[:ewma_latency_ms] ||= r[:ewma_latency_ms]
|
|
181
|
-
base[:last_enqueued_at] = [ base[:last_enqueued_at], r[:last_enqueued_at] ].compact.max
|
|
182
|
-
base[:last_dispatched_at] = [ base[:last_dispatched_at], r[:last_dispatched_at] ].compact.max
|
|
183
|
-
end
|
|
184
|
-
base
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
merged.sort_by { |r|
|
|
188
|
-
[ -(r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h]), r[:source], r[:partition] ]
|
|
44
|
+
now = Time.current
|
|
45
|
+
@windows = {
|
|
46
|
+
"1m" => Repository.tick_summary(policy_name: @policy_name, since: now - 60),
|
|
47
|
+
"5m" => Repository.tick_summary(policy_name: @policy_name, since: now - 5 * 60),
|
|
48
|
+
"15m" => Repository.tick_summary(policy_name: @policy_name, since: now - 15 * 60)
|
|
49
|
+
}
|
|
50
|
+
@denied_reasons = Repository.denied_reasons_summary(policy_name: @policy_name, since: now - 15 * 60)
|
|
51
|
+
@round_trip = Repository.partition_round_trip_stats(policy_name: @policy_name)
|
|
52
|
+
@sparkline = Repository.tick_samples_buckets(policy_name: @policy_name, since: now - 30 * 60, bucket_seconds: 60)
|
|
53
|
+
@pending_trend = Repository.trend_direction(@sparkline.map { |b| b[:pending_total] })
|
|
54
|
+
|
|
55
|
+
cfg = DispatchPolicy.config
|
|
56
|
+
@capacity = {
|
|
57
|
+
admitted_per_minute: @windows["1m"][:jobs_admitted],
|
|
58
|
+
adapter_target_jps: cfg.adapter_throughput_target,
|
|
59
|
+
avg_tick_ms: @windows["1m"][:avg_duration_ms],
|
|
60
|
+
max_tick_ms: @windows["1m"][:max_duration_ms],
|
|
61
|
+
tick_max_duration_ms: cfg.tick_max_duration.to_i * 1000
|
|
189
62
|
}
|
|
190
|
-
end
|
|
191
63
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
dir == "asc" ? sorted : sorted.reverse
|
|
64
|
+
@hints = OperatorHints.for(
|
|
65
|
+
tick_max_duration_ms: @capacity[:tick_max_duration_ms],
|
|
66
|
+
avg_tick_ms: @capacity[:avg_tick_ms],
|
|
67
|
+
max_tick_ms: @capacity[:max_tick_ms],
|
|
68
|
+
pending_total: @totals[:pending],
|
|
69
|
+
admitted_per_minute: @capacity[:admitted_per_minute],
|
|
70
|
+
forward_failures: @windows["1m"][:forward_failures],
|
|
71
|
+
jobs_admitted: @windows["1m"][:jobs_admitted],
|
|
72
|
+
active_partitions: @round_trip[:active_partitions],
|
|
73
|
+
never_checked: @round_trip[:never_checked],
|
|
74
|
+
in_backoff: @round_trip[:in_backoff],
|
|
75
|
+
total_partitions: @totals[:partitions],
|
|
76
|
+
adapter_target_jps: @capacity[:adapter_target_jps],
|
|
77
|
+
pending_trend: @pending_trend
|
|
78
|
+
)
|
|
208
79
|
end
|
|
209
80
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
return [] unless @policy
|
|
81
|
+
def pause
|
|
82
|
+
Partition.for_policy(@policy_name).update_all(status: "paused", updated_at: Time.current)
|
|
83
|
+
redirect_to policy_path(@policy_name), notice: "Policy paused."
|
|
84
|
+
end
|
|
215
85
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
sources << [ "round_robin_by", ->(_ctx, rr) { rr } ] if @policy.round_robin?
|
|
220
|
-
sources
|
|
86
|
+
def resume
|
|
87
|
+
Partition.for_policy(@policy_name).update_all(status: "active", updated_at: Time.current)
|
|
88
|
+
redirect_to policy_path(@policy_name), notice: "Policy resumed."
|
|
221
89
|
end
|
|
222
90
|
|
|
223
|
-
#
|
|
224
|
-
#
|
|
225
|
-
#
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
91
|
+
# Force-admits every staged job across every partition of the policy,
|
|
92
|
+
# bypassing all gates. Walks partitions in pending-DESC order so the
|
|
93
|
+
# busiest ones drain first. Bounded at DRAIN_MAX_PER_REQUEST per click.
|
|
94
|
+
def drain
|
|
95
|
+
drained = 0
|
|
96
|
+
Partition.for_policy(@policy_name)
|
|
97
|
+
.where("pending_count > 0")
|
|
98
|
+
.order(pending_count: :desc, id: :asc)
|
|
99
|
+
.limit(500)
|
|
100
|
+
.each do |partition|
|
|
101
|
+
break if drained >= DRAIN_MAX_PER_REQUEST
|
|
102
|
+
|
|
103
|
+
batch, _ = PartitionsController.drain_partition!(partition)
|
|
104
|
+
drained += batch
|
|
105
|
+
end
|
|
233
106
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
.group(:minute_bucket)
|
|
240
|
-
.pluck(:minute_bucket, Arel.sql("SUM(total_lag_ms)"), Arel.sql("SUM(observation_count)"))
|
|
241
|
-
global_rows.each do |bucket, total_lag, obs_count|
|
|
242
|
-
idx = slot_index[bucket.utc.beginning_of_minute]
|
|
243
|
-
next unless idx
|
|
244
|
-
@completions_global[idx] = obs_count
|
|
245
|
-
@adaptive_global[idx] = obs_count.positive? ? (total_lag.to_f / obs_count).round(1) : nil
|
|
107
|
+
remaining = Partition.for_policy(@policy_name).sum(:pending_count)
|
|
108
|
+
notice = if remaining.positive?
|
|
109
|
+
"Drained #{drained} job(s) across this policy; #{remaining} still pending — click drain again to continue."
|
|
110
|
+
else
|
|
111
|
+
"Drained #{drained} job(s); policy fully drained."
|
|
246
112
|
end
|
|
113
|
+
redirect_to policy_path(@policy_name), notice: notice
|
|
114
|
+
end
|
|
247
115
|
|
|
248
|
-
|
|
249
|
-
@adaptive_samples = {}
|
|
250
|
-
@completions_samples = {}
|
|
251
|
-
return if partition_keys.empty?
|
|
116
|
+
private
|
|
252
117
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
rows = PartitionObservation
|
|
256
|
-
.where(policy_name: @policy_name, partition_key: partition_keys)
|
|
257
|
-
.where("minute_bucket >= ?", @chart_slots.first)
|
|
258
|
-
.pluck(:partition_key, :minute_bucket, :total_lag_ms, :observation_count)
|
|
259
|
-
rows.each do |pk, bucket, total, count|
|
|
260
|
-
idx = slot_index[bucket.utc.beginning_of_minute]
|
|
261
|
-
next unless idx
|
|
262
|
-
per_partition_lag[pk][idx] = count.positive? ? (total.to_f / count).round(1) : nil
|
|
263
|
-
per_partition_counts[pk][idx] = count
|
|
264
|
-
end
|
|
265
|
-
@adaptive_samples = per_partition_lag
|
|
266
|
-
@completions_samples = per_partition_counts
|
|
118
|
+
def find_policy
|
|
119
|
+
@policy_name = params[:name]
|
|
267
120
|
end
|
|
268
121
|
end
|
|
269
122
|
end
|
|
@@ -1,89 +1,19 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module DispatchPolicy
|
|
4
|
+
# One row per (policy_name, partition_key) for partitions whose policy
|
|
5
|
+
# declares an `:adaptive_concurrency` gate. Holds the AIMD-tuned
|
|
6
|
+
# `current_max` plus the EWMA of recent queue-lag observations the cap
|
|
7
|
+
# adapts on.
|
|
8
|
+
#
|
|
9
|
+
# Read by `Gates::AdaptiveConcurrency#evaluate` to learn how many jobs
|
|
10
|
+
# this partition may admit right now. Written atomically by
|
|
11
|
+
# `Repository.adaptive_record!` from `InflightTracker.track`'s ensure
|
|
12
|
+
# block after each perform — the EWMA + AIMD update lives in a single
|
|
13
|
+
# SQL statement so concurrent workers can't race on read-modify-write.
|
|
4
14
|
class AdaptiveConcurrencyStats < ApplicationRecord
|
|
5
15
|
self.table_name = "dispatch_policy_adaptive_concurrency_stats"
|
|
6
16
|
|
|
7
|
-
|
|
8
|
-
def self.seed!(policy_name:, gate_name:, partition_key:, initial_max:)
|
|
9
|
-
now = Time.current
|
|
10
|
-
sql = <<~SQL.squish
|
|
11
|
-
INSERT INTO #{quoted_table_name}
|
|
12
|
-
(policy_name, gate_name, partition_key, current_max,
|
|
13
|
-
ewma_latency_ms, sample_count, created_at, updated_at)
|
|
14
|
-
VALUES (?, ?, ?, ?, 0, 0, ?, ?)
|
|
15
|
-
ON CONFLICT (policy_name, gate_name, partition_key) DO NOTHING
|
|
16
|
-
SQL
|
|
17
|
-
connection.exec_update(
|
|
18
|
-
sanitize_sql_array([
|
|
19
|
-
sql, policy_name, gate_name.to_s, partition_key.to_s,
|
|
20
|
-
initial_max.to_i, now, now
|
|
21
|
-
])
|
|
22
|
-
)
|
|
23
|
-
end
|
|
24
|
-
|
|
25
|
-
def self.fetch_many(policy_name:, gate_name:, partition_keys:)
|
|
26
|
-
return {} if partition_keys.empty?
|
|
27
|
-
where(policy_name: policy_name, gate_name: gate_name.to_s, partition_key: partition_keys)
|
|
28
|
-
.pluck(:partition_key, :current_max, :ewma_latency_ms)
|
|
29
|
-
.each_with_object({}) { |(k, c, l), h| h[k] = { current_max: c, ewma_latency_ms: l } }
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
# Single-statement EWMA + AIMD update so concurrent performs can't race
|
|
33
|
-
# on read-modify-write. Seed first (INSERT ON CONFLICT DO NOTHING), then
|
|
34
|
-
# apply the adjustment.
|
|
35
|
-
def self.record_observation!(
|
|
36
|
-
policy_name:, gate_name:, partition_key:,
|
|
37
|
-
queue_lag_ms:, succeeded:,
|
|
38
|
-
alpha:, min:, target_lag_ms:,
|
|
39
|
-
fail_factor:, slow_factor:, initial_max:
|
|
40
|
-
)
|
|
41
|
-
seed!(
|
|
42
|
-
policy_name: policy_name,
|
|
43
|
-
gate_name: gate_name,
|
|
44
|
-
partition_key: partition_key,
|
|
45
|
-
initial_max: initial_max
|
|
46
|
-
)
|
|
47
|
-
|
|
48
|
-
# Feedback signal is queue_lag (admitted_at → perform_start). When
|
|
49
|
-
# the adapter queue is empty, lag ≈ 0 → +1 grow. When the queue
|
|
50
|
-
# backs up, lag rises past target → multiplicative shrink. Failures
|
|
51
|
-
# shrink harder. Only `min` is enforced so a partition can't lock
|
|
52
|
-
# out entirely.
|
|
53
|
-
sql = <<~SQL.squish
|
|
54
|
-
UPDATE #{quoted_table_name}
|
|
55
|
-
SET
|
|
56
|
-
ewma_latency_ms = ewma_latency_ms * (1 - ?) + ? * ?,
|
|
57
|
-
sample_count = sample_count + 1,
|
|
58
|
-
current_max = GREATEST(?, CASE
|
|
59
|
-
WHEN ? = FALSE THEN FLOOR(current_max * ?)::int
|
|
60
|
-
WHEN (ewma_latency_ms * (1 - ?) + ? * ?) > ? THEN FLOOR(current_max * ?)::int
|
|
61
|
-
ELSE current_max + 1
|
|
62
|
-
END),
|
|
63
|
-
last_observed_at = ?,
|
|
64
|
-
updated_at = ?
|
|
65
|
-
WHERE policy_name = ? AND gate_name = ? AND partition_key = ?
|
|
66
|
-
SQL
|
|
67
|
-
|
|
68
|
-
now = Time.current
|
|
69
|
-
connection.exec_update(
|
|
70
|
-
sanitize_sql_array([
|
|
71
|
-
sql,
|
|
72
|
-
alpha, alpha, queue_lag_ms,
|
|
73
|
-
min.to_i,
|
|
74
|
-
succeeded, fail_factor,
|
|
75
|
-
alpha, alpha, queue_lag_ms, target_lag_ms, slow_factor,
|
|
76
|
-
now, now,
|
|
77
|
-
policy_name, gate_name.to_s, partition_key.to_s
|
|
78
|
-
])
|
|
79
|
-
)
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
# Quick lookup used by Dispatchable to denormalize current_max into
|
|
83
|
-
# the generic partition observation row.
|
|
84
|
-
def self.current_max_for(policy_name:, partition_key:)
|
|
85
|
-
where(policy_name: policy_name, partition_key: partition_key.to_s)
|
|
86
|
-
.limit(1).pick(:current_max)
|
|
87
|
-
end
|
|
17
|
+
scope :for_policy, ->(name) { where(policy_name: name) }
|
|
88
18
|
end
|
|
89
19
|
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DispatchPolicy
|
|
4
|
+
class InflightJob < ApplicationRecord
|
|
5
|
+
self.table_name = "dispatch_policy_inflight_jobs"
|
|
6
|
+
|
|
7
|
+
scope :for_partition, ->(policy_name, partition_key) {
|
|
8
|
+
where(policy_name: policy_name, partition_key: partition_key)
|
|
9
|
+
}
|
|
10
|
+
scope :stale, ->(cutoff) { where("heartbeat_at < ?", cutoff) }
|
|
11
|
+
end
|
|
12
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DispatchPolicy
|
|
4
|
+
class Partition < ApplicationRecord
|
|
5
|
+
self.table_name = "dispatch_policy_partitions"
|
|
6
|
+
|
|
7
|
+
scope :for_policy, ->(name) { where(policy_name: name) }
|
|
8
|
+
scope :for_shard, ->(s) { s ? where(shard: s) : all }
|
|
9
|
+
scope :active, -> { where(status: "active") }
|
|
10
|
+
scope :paused, -> { where(status: "paused") }
|
|
11
|
+
scope :pending, -> { where("pending_count > 0") }
|
|
12
|
+
scope :stale_inactive, ->(cutoff) {
|
|
13
|
+
where("pending_count = 0 AND in_flight_count = 0")
|
|
14
|
+
.where("last_admit_at < ? OR (last_admit_at IS NULL AND created_at < ?)", cutoff, cutoff)
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
def paused?
|
|
18
|
+
status == "paused"
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -4,102 +4,9 @@ module DispatchPolicy
|
|
|
4
4
|
class StagedJob < ApplicationRecord
|
|
5
5
|
self.table_name = "dispatch_policy_staged_jobs"
|
|
6
6
|
|
|
7
|
-
scope :
|
|
8
|
-
scope :
|
|
9
|
-
scope :
|
|
10
|
-
scope :
|
|
11
|
-
scope :expired_leases, -> {
|
|
12
|
-
admitted.where("lease_expires_at IS NOT NULL AND lease_expires_at < ?", Time.current)
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
# Merge the job's ActiveJob metadata (queue_name, priority) into the
|
|
16
|
-
# context hash so gate lambdas can partition_by :queue_name without
|
|
17
|
-
# the user having to pass it as a kwarg. User-provided keys win.
|
|
18
|
-
def self.context_for(job_instance, policy)
|
|
19
|
-
built = policy.context_builder.call(job_instance.arguments)
|
|
20
|
-
return built unless built.is_a?(Hash)
|
|
21
|
-
{
|
|
22
|
-
queue_name: job_instance.queue_name,
|
|
23
|
-
priority: job_instance.priority
|
|
24
|
-
}.merge(built.symbolize_keys)
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
# Stages a job in the admission queue. Returns the created row, or nil if
|
|
28
|
-
# the policy declares a dedupe_key and an active row already exists.
|
|
29
|
-
def self.stage!(job_instance:, policy:)
|
|
30
|
-
dedupe_key = policy.build_dedupe_key(job_instance.arguments)
|
|
31
|
-
|
|
32
|
-
if dedupe_key && exists?(policy_name: policy.name, dedupe_key: dedupe_key, completed_at: nil)
|
|
33
|
-
return nil
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
create!(
|
|
37
|
-
job_class: job_instance.class.name,
|
|
38
|
-
policy_name: policy.name,
|
|
39
|
-
arguments: job_instance.serialize,
|
|
40
|
-
snapshot: policy.build_snapshot(job_instance.arguments),
|
|
41
|
-
context: context_for(job_instance, policy),
|
|
42
|
-
priority: job_instance.priority || 100,
|
|
43
|
-
not_before_at: job_instance.scheduled_at,
|
|
44
|
-
staged_at: Time.current,
|
|
45
|
-
dedupe_key: dedupe_key,
|
|
46
|
-
round_robin_key: policy.build_round_robin_key(job_instance.arguments)
|
|
47
|
-
)
|
|
48
|
-
rescue ActiveRecord::RecordNotUnique
|
|
49
|
-
nil
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# Batch-insert variant of stage!.
|
|
53
|
-
def self.stage_many!(policy:, jobs:)
|
|
54
|
-
return 0 if jobs.empty?
|
|
55
|
-
|
|
56
|
-
now = Time.current
|
|
57
|
-
rows = jobs.map do |job_instance|
|
|
58
|
-
{
|
|
59
|
-
job_class: job_instance.class.name,
|
|
60
|
-
policy_name: policy.name,
|
|
61
|
-
arguments: job_instance.serialize,
|
|
62
|
-
snapshot: policy.build_snapshot(job_instance.arguments),
|
|
63
|
-
context: context_for(job_instance, policy),
|
|
64
|
-
priority: job_instance.priority || 100,
|
|
65
|
-
not_before_at: job_instance.scheduled_at,
|
|
66
|
-
staged_at: now,
|
|
67
|
-
dedupe_key: policy.build_dedupe_key(job_instance.arguments),
|
|
68
|
-
round_robin_key: policy.build_round_robin_key(job_instance.arguments),
|
|
69
|
-
partitions: {},
|
|
70
|
-
created_at: now,
|
|
71
|
-
updated_at: now
|
|
72
|
-
}
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
result = insert_all(rows, unique_by: :idx_dp_staged_dedupe_active)
|
|
76
|
-
result.rows.size
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
def self.mark_completed_by_active_job_id(active_job_id)
|
|
80
|
-
return 0 if active_job_id.blank?
|
|
81
|
-
where(active_job_id: active_job_id, completed_at: nil)
|
|
82
|
-
.update_all(completed_at: Time.current, lease_expires_at: nil)
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
def mark_admitted!(partitions:)
|
|
86
|
-
now = Time.current
|
|
87
|
-
job = instantiate_active_job
|
|
88
|
-
job._dispatch_partitions = partitions
|
|
89
|
-
job._dispatch_admitted_at = now
|
|
90
|
-
|
|
91
|
-
update!(
|
|
92
|
-
admitted_at: now,
|
|
93
|
-
lease_expires_at: now + DispatchPolicy.config.lease_duration,
|
|
94
|
-
active_job_id: job.job_id,
|
|
95
|
-
partitions: partitions
|
|
96
|
-
)
|
|
97
|
-
|
|
98
|
-
job
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
def instantiate_active_job
|
|
102
|
-
ActiveJob::Base.deserialize(arguments)
|
|
103
|
-
end
|
|
7
|
+
scope :for_policy, ->(name) { where(policy_name: name) }
|
|
8
|
+
scope :for_partition, ->(name, key) { where(policy_name: name, partition_key: key) }
|
|
9
|
+
scope :due, -> { where("scheduled_at IS NULL OR scheduled_at <= now()") }
|
|
10
|
+
scope :recent, -> { order(enqueued_at: :desc) }
|
|
104
11
|
end
|
|
105
12
|
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DispatchPolicy
|
|
4
|
+
class TickSample < ApplicationRecord
|
|
5
|
+
self.table_name = "dispatch_policy_tick_samples"
|
|
6
|
+
|
|
7
|
+
scope :for_policy, ->(name) { where(policy_name: name) }
|
|
8
|
+
scope :since, ->(time) { where("sampled_at >= ?", time) }
|
|
9
|
+
scope :recent, -> { order(sampled_at: :desc) }
|
|
10
|
+
end
|
|
11
|
+
end
|