dispatch_policy 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -28
- data/MIT-LICENSE +16 -17
- data/README.md +452 -388
- data/app/assets/images/dispatch_policy/logo-large.svg +9 -0
- data/app/assets/images/dispatch_policy/logo-small.svg +7 -0
- data/app/assets/javascripts/dispatch_policy/turbo.es2017-umd.min.js +35 -0
- data/app/assets/stylesheets/dispatch_policy/application.css +294 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/assets_controller.rb +31 -0
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -267
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +139 -223
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +164 -231
- data/config/routes.rb +21 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/assets.rb +38 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +5 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -41
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +61 -59
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +260 -256
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +72 -52
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +134 -42
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -76
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/db/migrate/20260425000001_add_duration_to_partition_observations.rb +0 -8
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -123
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
|
# frozen_string_literal: true

module DispatchPolicy
  # Serves vendored static assets at content-addressed URLs so browsers
  # can cache them forever. The digest is part of the URL, not a query
  # string, so caches keyed on path alone still bust on upgrade.
  class AssetsController < ApplicationController
    # Assets are public, idempotent GETs — CSRF protection adds nothing here.
    skip_forgery_protection

    # GET: the vendored Turbo UMD bundle.
    def turbo
      serve(Assets::TURBO_BODY, Assets::TURBO_DIGEST, "application/javascript")
    end

    # GET: the small SVG logo used in the layout header.
    def logo
      serve(Assets::LOGO_SMALL_BODY, Assets::LOGO_SMALL_DIGEST, "image/svg+xml")
    end

    private

    # Sends +body+ when the URL digest matches the current build's digest.
    # 404 on mismatch (stale link after an upgrade); 304 when the client
    # already holds this exact digest. The immutable Cache-Control directive
    # means well-behaved browsers never revalidate, but proxies and plain
    # HTTP clients still benefit from the conditional-GET path.
    def serve(body, digest, content_type)
      if params[:digest] != digest
        head :not_found
        return
      end

      etag = %("#{digest}")
      response.headers["Cache-Control"] = "public, max-age=31536000, immutable"
      response.headers["ETag"] = etag

      # Honor If-None-Match: the ETag is the content digest, so an exact
      # match means the client's copy is current — skip the body.
      if request.headers["If-None-Match"] == etag
        head :not_modified
      else
        send_data body, type: content_type, disposition: "inline"
      end
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Landing page of the admin UI: global counters, rolling tick-summary
  # windows, capacity headroom, operator hints, and a per-policy table.
  class DashboardController < ApplicationController
    # Rolling windows (label => seconds) rendered on the overview.
    WINDOWS = { "1m" => 60, "5m" => 5 * 60, "15m" => 15 * 60 }.freeze

    def index
      @totals = {
        staged: StagedJob.count,
        partitions: Partition.count,
        active_parts: Partition.active.count,
        paused_parts: Partition.paused.count,
        in_flight: InflightJob.count
      }

      now = Time.current
      @windows = WINDOWS.transform_values { |seconds| Repository.tick_summary(since: now - seconds) }
      @round_trip = Repository.partition_round_trip_stats

      # Pending trend: 30 minutes of 1-min buckets aggregated across
      # all policies. Used for the sparkline + arrow on the overview.
      @pending_buckets = Repository.tick_samples_buckets(since: now - 30 * 60, bucket_seconds: 60)
      @pending_trend = Repository.trend_direction(@pending_buckets.map { |bucket| bucket[:pending_total] })

      @capacity = build_capacity(@windows["1m"])
      @hints = build_hints
      @policies = build_policy_rows(now)
    end

    private

    # Capacity headroom: live admit rate vs configured adapter ceiling,
    # avg tick wall vs tick_max_duration. These two ratios are the
    # operator's quickest "should I shard?" signal.
    def build_capacity(one_minute)
      config = DispatchPolicy.config
      {
        admitted_per_minute: one_minute[:jobs_admitted],
        admitted_per_second: one_minute[:jobs_admitted] / 60.0,
        adapter_target_jps: config.adapter_throughput_target,
        avg_tick_ms: one_minute[:avg_duration_ms],
        max_tick_ms: one_minute[:max_duration_ms],
        tick_max_duration_ms: config.tick_max_duration.to_i * 1000
      }
    end

    # Feeds the already-computed snapshot values into the hint engine.
    # Relies on @totals / @windows / @round_trip / @capacity / @pending_trend
    # being assigned first (see #index).
    def build_hints
      OperatorHints.for(
        tick_max_duration_ms: @capacity[:tick_max_duration_ms],
        avg_tick_ms: @capacity[:avg_tick_ms],
        max_tick_ms: @capacity[:max_tick_ms],
        pending_total: @totals[:staged],
        admitted_per_minute: @capacity[:admitted_per_minute],
        forward_failures: @windows["1m"][:forward_failures],
        jobs_admitted: @windows["1m"][:jobs_admitted],
        active_partitions: @round_trip[:active_partitions],
        never_checked: @round_trip[:never_checked],
        in_backoff: @round_trip[:in_backoff],
        total_partitions: @totals[:partitions],
        adapter_target_jps: @capacity[:adapter_target_jps],
        pending_trend: @pending_trend
      )
    end

    # One row per policy name seen in either the partitions table or the
    # in-flight table. NOTE(review): issues four Repository queries per
    # policy — fine for a handful of policies, revisit if that count grows.
    def build_policy_rows(now)
      pending_by_policy = Partition
        .group(:policy_name)
        .pluck(:policy_name, Arel.sql("SUM(pending_count)::int"), Arel.sql("MAX(last_admit_at)"))
        .to_h { |name, pending, last_admit| [name, { pending: pending || 0, last_admit_at: last_admit }] }

      in_flight_by_policy = InflightJob.group(:policy_name).count

      one_min_ago = now - 60
      five_min_ago = now - 300

      (pending_by_policy.keys + in_flight_by_policy.keys).uniq.sort.map do |name|
        info = pending_by_policy[name] || {}
        last_minute = Repository.tick_summary(policy_name: name, since: one_min_ago)
        last_five = Repository.tick_summary(policy_name: name, since: five_min_ago)
        reasons = Repository.denied_reasons_summary(policy_name: name, since: one_min_ago)
        round_trip = Repository.partition_round_trip_stats(policy_name: name)

        {
          name: name,
          pending: info[:pending] || 0,
          in_flight: in_flight_by_policy[name] || 0,
          last_admit_at: info[:last_admit_at],
          admitted_1m: last_minute[:jobs_admitted],
          admitted_5m: last_five[:jobs_admitted],
          ticks_1m: last_minute[:ticks],
          avg_tick_ms_1m: last_minute[:avg_duration_ms],
          forward_failures_1m: last_minute[:forward_failures],
          oldest_age_seconds: round_trip[:oldest_age_seconds],
          p95_age_seconds: round_trip[:p95_age_seconds],
          in_backoff: round_trip[:in_backoff],
          top_denial_reason: reasons.first&.first,
          top_denial_count: reasons.first&.last
        }
      end
    end
  end
end
# frozen_string_literal: true

module DispatchPolicy
  # Browsing, manual admission, and force-draining of partitions.
  class PartitionsController < ApplicationController
    before_action :find_partition, only: %i[show drain admit]

    # Upper bound of jobs force-admitted per drain request so a huge
    # backlog can't time the controller out.
    DRAIN_MAX_PER_REQUEST = 10_000
    # Claim size per Repository round-trip while draining.
    DRAIN_BATCH_SIZE = 200

    PAGE_SIZE = 100

    # Filterable, cursor-paginated partition list.
    def index
      base = Partition.all
      base = base.for_policy(params[:policy]) if params[:policy].present?
      base = base.for_shard(params[:shard]) if params[:shard].present?
      if params[:q].present?
        # Escape LIKE metacharacters (% and _) so user input matches
        # literally instead of acting as wildcards inside the pattern.
        base = base.where("partition_key ILIKE ?", "%#{Partition.sanitize_sql_like(params[:q])}%")
      end
      base = base.where("pending_count > 0") if params[:only_pending] == "1"

      # Whitelist the sort key; fall back to the default on anything unknown.
      @sort = DispatchPolicy::CursorPagination::SORTS.key?(params[:sort]) ? params[:sort] : DispatchPolicy::CursorPagination::DEFAULT_SORT
      sort_def = DispatchPolicy::CursorPagination.sort_for(@sort)

      @total = base.count # cheap on indexed columns; nice to display
      @cursor = DispatchPolicy::CursorPagination.decode(params[:cursor])

      # Fetch one extra row to learn whether a next page exists.
      paginated = DispatchPolicy::CursorPagination.apply(base, @sort, @cursor)
        .order(Arel.sql(sort_def[:sql_order]))
        .limit(PAGE_SIZE + 1)
        .to_a

      @has_more = paginated.size > PAGE_SIZE
      @partitions = paginated.first(PAGE_SIZE)
      @next_cursor =
        if @has_more && @partitions.any?
          v, id = DispatchPolicy::CursorPagination.extract(@partitions.last, @sort)
          DispatchPolicy::CursorPagination.encode(v, id)
        end

      @policy = params[:policy]
      @shard = params[:shard]
      @query = params[:q]
      @only_pending = params[:only_pending] == "1"

      shards_scope = Partition.all
      shards_scope = shards_scope.for_policy(params[:policy]) if params[:policy].present?
      @shards = shards_scope.distinct.pluck(:shard).sort
    end

    # Build URL params preserving filters, replacing the cursor.
    def pagination_params(overrides = {})
      {
        policy: @policy.presence,
        shard: @shard.presence,
        q: @query.presence,
        sort: (@sort if @sort != DispatchPolicy::CursorPagination::DEFAULT_SORT),
        only_pending: ("1" if @only_pending),
        cursor: nil
      }.compact.merge(overrides)
    end
    helper_method :pagination_params

    def show
      @recent_jobs = StagedJob
        .for_partition(@partition.policy_name, @partition.partition_key)
        .order(:scheduled_at, :id)
        .limit(50)
      # NOTE(review): filtered by policy only, not partition_key — looks
      # intentional (InflightJob may not carry a partition key); confirm.
      @inflight = InflightJob.where(policy_name: @partition.policy_name).limit(50)
    end

    # Force-admits up to params[:count] staged jobs from this partition,
    # bypassing all gates. Malformed counts redirect back instead of 500ing;
    # the count is clamped to 1..DRAIN_MAX_PER_REQUEST.
    def admit
      count = Integer(params[:count] || 1, exception: false)
      unless count&.positive?
        redirect_to partition_path(@partition), alert: "Count must be a positive integer."
        return
      end
      count = count.clamp(1, DRAIN_MAX_PER_REQUEST)

      rows = Repository.claim_staged_jobs!(
        policy_name: @partition.policy_name,
        partition_key: @partition.partition_key,
        limit: count,
        gate_state_patch: {},
        retry_after: nil
      )
      # Forwarder.dispatch appears to return the rows that failed to
      # forward, so claimed minus returned is the success count — confirm
      # against Forwarder before relying on this elsewhere.
      forwarded = rows.size - Forwarder.dispatch(rows).size
      redirect_to partition_path(@partition), notice: "Forwarded #{forwarded} job(s)."
    end

    # Empties the partition by force-admitting every staged job through the
    # forwarder, bypassing all gates. Bounded at DRAIN_MAX_PER_REQUEST so a
    # huge backlog can't time the controller out — the operator clicks again
    # for the next batch.
    def drain
      drained, remaining = self.class.drain_partition!(@partition)
      notice = if remaining.positive?
        "Drained #{drained} job(s); #{remaining} still pending — click drain again to continue."
      else
        "Drained #{drained} job(s); partition empty."
      end
      redirect_to partition_path(@partition), notice: notice
    end

    # Claims and forwards staged jobs in DRAIN_BATCH_SIZE chunks until the
    # partition is empty or DRAIN_MAX_PER_REQUEST is reached. Class method so
    # PoliciesController#drain can reuse it. Returns [drained, remaining].
    def self.drain_partition!(partition)
      drained = 0
      while drained < DRAIN_MAX_PER_REQUEST
        batch_limit = [DRAIN_BATCH_SIZE, DRAIN_MAX_PER_REQUEST - drained].min
        rows = Repository.claim_staged_jobs!(
          policy_name: partition.policy_name,
          partition_key: partition.partition_key,
          limit: batch_limit,
          gate_state_patch: {},
          retry_after: nil
        )
        break if rows.empty?

        Forwarder.dispatch(rows)
        drained += rows.size
      end
      # Re-read the counter rather than trusting the in-memory record.
      remaining = partition.class.where(id: partition.id).pick(:pending_count) || 0
      [drained, remaining]
    end

    private

    def find_partition
      @partition = Partition.find(params[:id])
    end
  end
end
module DispatchPolicy
  # Per-policy overview: aggregate counts, tick windows, denial reasons,
  # plus pause/resume/drain operator actions.
  class PoliciesController < ApplicationController
    before_action :find_policy, only: %i[show pause resume drain]

    # Upper bound of jobs force-admitted per drain request (see #drain).
    DRAIN_MAX_PER_REQUEST = 10_000

    # Lists the union of registered policies and policies that only exist
    # in the DB (e.g. the job class was removed but partitions remain).
    def index
      registry_names = DispatchPolicy.registry.names
      db_names = Partition.distinct.pluck(:policy_name)
      names = (registry_names + db_names).uniq.sort

      in_flight_by_policy = InflightJob.where(policy_name: names).group(:policy_name).count

      @rows = names.map do |name|
        partitions = Partition.for_policy(name)
        {
          name: name,
          registered: registry_names.include?(name),
          pending: partitions.sum(:pending_count),
          in_flight: in_flight_by_policy[name] || 0,
          partitions: partitions.count,
          paused_count: partitions.paused.count
        }
      end
    end

    def show
      # BUGFIX: #index deliberately includes DB-only policy names that are
      # not in the registry, so an unguarded registry.fetch raised KeyError
      # (HTTP 500) for exactly those rows. Guard via the known names API;
      # @policy_object is nil for unregistered policies.
      @policy_object = (DispatchPolicy.registry.fetch(@policy_name) if DispatchPolicy.registry.names.include?(@policy_name))

      @partitions = Partition.for_policy(@policy_name)
        .order(Arel.sql("pending_count DESC, last_admit_at DESC NULLS LAST"))
        .limit(100)
      @top_admitted = Partition.for_policy(@policy_name)
        .order(total_admitted: :desc)
        .limit(20)

      @totals = {
        pending: Partition.for_policy(@policy_name).sum(:pending_count),
        in_flight: InflightJob.where(policy_name: @policy_name).count,
        partitions: Partition.for_policy(@policy_name).count
      }

      now = Time.current
      @windows = {
        "1m" => Repository.tick_summary(policy_name: @policy_name, since: now - 60),
        "5m" => Repository.tick_summary(policy_name: @policy_name, since: now - 5 * 60),
        "15m" => Repository.tick_summary(policy_name: @policy_name, since: now - 15 * 60)
      }
      @denied_reasons = Repository.denied_reasons_summary(policy_name: @policy_name, since: now - 15 * 60)
      @round_trip = Repository.partition_round_trip_stats(policy_name: @policy_name)
      @sparkline = Repository.tick_samples_buckets(policy_name: @policy_name, since: now - 30 * 60, bucket_seconds: 60)
      @pending_trend = Repository.trend_direction(@sparkline.map { |b| b[:pending_total] })

      cfg = DispatchPolicy.config
      @capacity = {
        admitted_per_minute: @windows["1m"][:jobs_admitted],
        adapter_target_jps: cfg.adapter_throughput_target,
        avg_tick_ms: @windows["1m"][:avg_duration_ms],
        max_tick_ms: @windows["1m"][:max_duration_ms],
        tick_max_duration_ms: cfg.tick_max_duration.to_i * 1000
      }

      @hints = OperatorHints.for(
        tick_max_duration_ms: @capacity[:tick_max_duration_ms],
        avg_tick_ms: @capacity[:avg_tick_ms],
        max_tick_ms: @capacity[:max_tick_ms],
        pending_total: @totals[:pending],
        admitted_per_minute: @capacity[:admitted_per_minute],
        forward_failures: @windows["1m"][:forward_failures],
        jobs_admitted: @windows["1m"][:jobs_admitted],
        active_partitions: @round_trip[:active_partitions],
        never_checked: @round_trip[:never_checked],
        in_backoff: @round_trip[:in_backoff],
        total_partitions: @totals[:partitions],
        adapter_target_jps: @capacity[:adapter_target_jps],
        pending_trend: @pending_trend
      )
    end

    # Pauses every partition of the policy. update_all skips callbacks by
    # design — this is a bulk status flip, not a domain event.
    def pause
      Partition.for_policy(@policy_name).update_all(status: "paused", updated_at: Time.current)
      redirect_to policy_path(@policy_name), notice: "Policy paused."
    end

    def resume
      Partition.for_policy(@policy_name).update_all(status: "active", updated_at: Time.current)
      redirect_to policy_path(@policy_name), notice: "Policy resumed."
    end

    # Force-admits every staged job across every partition of the policy,
    # bypassing all gates. Walks partitions in pending-DESC order so the
    # busiest ones drain first. Bounded at DRAIN_MAX_PER_REQUEST per click.
    def drain
      drained = 0
      Partition.for_policy(@policy_name)
        .where("pending_count > 0")
        .order(pending_count: :desc, id: :asc)
        .limit(500)
        .each do |partition|
          break if drained >= DRAIN_MAX_PER_REQUEST

          batch, _ = PartitionsController.drain_partition!(partition)
          drained += batch
        end

      remaining = Partition.for_policy(@policy_name).sum(:pending_count)
      notice = if remaining.positive?
        "Drained #{drained} job(s) across this policy; #{remaining} still pending — click drain again to continue."
      else
        "Drained #{drained} job(s); policy fully drained."
      end
      redirect_to policy_path(@policy_name), notice: notice
    end

    private

    def find_policy
      @policy_name = params[:name]
    end
  end
end