dispatch_policy 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +98 -28
  3. data/MIT-LICENSE +16 -17
  4. data/README.md +452 -388
  5. data/app/assets/images/dispatch_policy/logo-large.svg +9 -0
  6. data/app/assets/images/dispatch_policy/logo-small.svg +7 -0
  7. data/app/assets/javascripts/dispatch_policy/turbo.es2017-umd.min.js +35 -0
  8. data/app/assets/stylesheets/dispatch_policy/application.css +294 -0
  9. data/app/controllers/dispatch_policy/application_controller.rb +45 -1
  10. data/app/controllers/dispatch_policy/assets_controller.rb +31 -0
  11. data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
  12. data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
  13. data/app/controllers/dispatch_policy/policies_controller.rb +94 -267
  14. data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
  15. data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
  16. data/app/models/dispatch_policy/inflight_job.rb +12 -0
  17. data/app/models/dispatch_policy/partition.rb +21 -0
  18. data/app/models/dispatch_policy/staged_job.rb +4 -97
  19. data/app/models/dispatch_policy/tick_sample.rb +11 -0
  20. data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
  21. data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
  22. data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
  23. data/app/views/dispatch_policy/policies/index.html.erb +15 -37
  24. data/app/views/dispatch_policy/policies/show.html.erb +139 -223
  25. data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
  26. data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
  27. data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
  28. data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
  29. data/app/views/layouts/dispatch_policy/application.html.erb +164 -231
  30. data/config/routes.rb +21 -2
  31. data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
  32. data/lib/dispatch_policy/assets.rb +38 -0
  33. data/lib/dispatch_policy/bypass.rb +23 -0
  34. data/lib/dispatch_policy/config.rb +85 -0
  35. data/lib/dispatch_policy/context.rb +50 -0
  36. data/lib/dispatch_policy/cursor_pagination.rb +121 -0
  37. data/lib/dispatch_policy/decision.rb +22 -0
  38. data/lib/dispatch_policy/engine.rb +5 -27
  39. data/lib/dispatch_policy/forwarder.rb +63 -0
  40. data/lib/dispatch_policy/gate.rb +10 -38
  41. data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
  42. data/lib/dispatch_policy/gates/concurrency.rb +45 -26
  43. data/lib/dispatch_policy/gates/throttle.rb +65 -41
  44. data/lib/dispatch_policy/inflight_tracker.rb +174 -0
  45. data/lib/dispatch_policy/job_extension.rb +155 -0
  46. data/lib/dispatch_policy/operator_hints.rb +126 -0
  47. data/lib/dispatch_policy/pipeline.rb +48 -0
  48. data/lib/dispatch_policy/policy.rb +61 -59
  49. data/lib/dispatch_policy/policy_dsl.rb +120 -0
  50. data/lib/dispatch_policy/railtie.rb +35 -0
  51. data/lib/dispatch_policy/registry.rb +46 -0
  52. data/lib/dispatch_policy/repository.rb +723 -0
  53. data/lib/dispatch_policy/serializer.rb +36 -0
  54. data/lib/dispatch_policy/tick.rb +260 -256
  55. data/lib/dispatch_policy/tick_loop.rb +59 -26
  56. data/lib/dispatch_policy/version.rb +1 -1
  57. data/lib/dispatch_policy.rb +72 -52
  58. data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
  59. data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
  60. data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
  61. data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
  62. metadata +134 -42
  63. data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
  64. data/app/models/dispatch_policy/partition_observation.rb +0 -76
  65. data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
  66. data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
  67. data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
  68. data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
  69. data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
  70. data/db/migrate/20260425000001_add_duration_to_partition_observations.rb +0 -8
  71. data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
  72. data/lib/dispatch_policy/dispatch_context.rb +0 -53
  73. data/lib/dispatch_policy/dispatchable.rb +0 -123
  74. data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
  75. data/lib/dispatch_policy/gates/global_cap.rb +0 -26
@@ -0,0 +1,31 @@
# frozen_string_literal: true

module DispatchPolicy
  # Serves vendored static assets at content-addressed URLs. Because the
  # digest is embedded in the path rather than a query string, browsers
  # and caches keyed on path alone may hold responses forever; a gem
  # upgrade changes the digest — and therefore the URL — busting every
  # cache automatically.
  class AssetsController < ApplicationController
    skip_forgery_protection

    # Vendored Turbo JavaScript bundle.
    def turbo
      serve(Assets::TURBO_BODY, Assets::TURBO_DIGEST, "application/javascript")
    end

    # Small SVG logo.
    def logo
      serve(Assets::LOGO_SMALL_BODY, Assets::LOGO_SMALL_DIGEST, "image/svg+xml")
    end

    private

    # Emits +body+ inline with far-future caching headers when the digest
    # segment of the URL matches the vendored asset's digest; responds 404
    # otherwise so URLs minted by a different gem version never serve
    # mismatched content.
    def serve(body, digest, content_type)
      return head(:not_found) unless params[:digest] == digest

      response.headers["Cache-Control"] = "public, max-age=31536000, immutable"
      response.headers["ETag"] = %("#{digest}")
      send_data body, type: content_type, disposition: "inline"
    end
  end
end
@@ -0,0 +1,91 @@
# frozen_string_literal: true

module DispatchPolicy
  # Overview screen: global totals, windowed tick summaries, capacity
  # headroom, operator hints, and a per-policy breakdown table. The index
  # action is decomposed into private builders, one per page section.
  class DashboardController < ApplicationController
    # Rolling summary windows: label => window length in seconds.
    WINDOWS = { "1m" => 60, "5m" => 5 * 60, "15m" => 15 * 60 }.freeze

    def index
      now = Time.current

      @totals = {
        staged: StagedJob.count,
        partitions: Partition.count,
        active_parts: Partition.active.count,
        paused_parts: Partition.paused.count,
        in_flight: InflightJob.count
      }

      @windows = WINDOWS.transform_values { |secs| Repository.tick_summary(since: now - secs) }
      @round_trip = Repository.partition_round_trip_stats

      # Pending trend: 30 minutes of 1-min buckets aggregated across
      # all policies. Used for the sparkline + arrow on the overview.
      @pending_buckets = Repository.tick_samples_buckets(since: now - 30 * 60, bucket_seconds: 60)
      @pending_trend = Repository.trend_direction(@pending_buckets.map { |b| b[:pending_total] })

      @capacity = build_capacity
      @hints = build_hints
      @policies = build_policy_rows(now)
    end

    private

    # Capacity headroom: live admit rate vs configured adapter ceiling,
    # avg tick wall vs tick_max_duration. These two ratios are the
    # operator's quickest "should I shard?" signal. Reads @windows.
    def build_capacity
      cfg = DispatchPolicy.config
      one_min = @windows["1m"]
      {
        admitted_per_minute: one_min[:jobs_admitted],
        admitted_per_second: one_min[:jobs_admitted] / 60.0,
        adapter_target_jps: cfg.adapter_throughput_target,
        avg_tick_ms: one_min[:avg_duration_ms],
        max_tick_ms: one_min[:max_duration_ms],
        tick_max_duration_ms: cfg.tick_max_duration.to_i * 1000
      }
    end

    # Operator advice derived from the same numbers the page displays.
    # Reads @totals, @windows, @round_trip, @capacity, @pending_trend,
    # so it must run after those are assigned.
    def build_hints
      one_min = @windows["1m"]
      OperatorHints.for(
        tick_max_duration_ms: @capacity[:tick_max_duration_ms],
        avg_tick_ms: @capacity[:avg_tick_ms],
        max_tick_ms: @capacity[:max_tick_ms],
        pending_total: @totals[:staged],
        admitted_per_minute: @capacity[:admitted_per_minute],
        forward_failures: one_min[:forward_failures],
        jobs_admitted: one_min[:jobs_admitted],
        active_partitions: @round_trip[:active_partitions],
        never_checked: @round_trip[:never_checked],
        in_backoff: @round_trip[:in_backoff],
        total_partitions: @totals[:partitions],
        adapter_target_jps: @capacity[:adapter_target_jps],
        pending_trend: @pending_trend
      )
    end

    # One row per policy name seen in either partitions or in-flight jobs,
    # combining pending/in-flight counts, 1m/5m tick summaries, top denial
    # reason and round-trip stats.
    def build_policy_rows(now)
      pending_by_policy = Partition
        .group(:policy_name)
        .pluck(:policy_name, Arel.sql("SUM(pending_count)::int"), Arel.sql("MAX(last_admit_at)"))
        .to_h { |name, pending, last_admit| [name, { pending: pending || 0, last_admit_at: last_admit }] }

      in_flight_by_policy = InflightJob.group(:policy_name).count

      one_min_ago = now - 60
      five_min_ago = now - 300

      names = (pending_by_policy.keys + in_flight_by_policy.keys).uniq.sort
      names.map do |name|
        info = pending_by_policy[name] || {}
        m1 = Repository.tick_summary(policy_name: name, since: one_min_ago)
        m5 = Repository.tick_summary(policy_name: name, since: five_min_ago)
        rs = Repository.denied_reasons_summary(policy_name: name, since: one_min_ago)
        rt = Repository.partition_round_trip_stats(policy_name: name)

        {
          name: name,
          pending: info[:pending] || 0,
          in_flight: in_flight_by_policy[name] || 0,
          last_admit_at: info[:last_admit_at],
          admitted_1m: m1[:jobs_admitted],
          admitted_5m: m5[:jobs_admitted],
          ticks_1m: m1[:ticks],
          avg_tick_ms_1m: m1[:avg_duration_ms],
          forward_failures_1m: m1[:forward_failures],
          oldest_age_seconds: rt[:oldest_age_seconds],
          p95_age_seconds: rt[:p95_age_seconds],
          in_backoff: rt[:in_backoff],
          top_denial_reason: rs.first&.first,
          top_denial_count: rs.first&.last
        }
      end
    end
  end
end
@@ -0,0 +1,122 @@
# frozen_string_literal: true

module DispatchPolicy
  # Browse partitions, inspect one partition, and manually force staged
  # jobs through the forwarder (admit/drain), bypassing all policy gates.
  class PartitionsController < ApplicationController
    before_action :find_partition, only: %i[show drain admit]

    # Per-request ceiling for force-admission so a huge backlog can't
    # time the controller out — the operator clicks again for more.
    DRAIN_MAX_PER_REQUEST = 10_000
    # Jobs claimed per repository round-trip while draining.
    DRAIN_BATCH_SIZE = 200

    PAGE_SIZE = 100

    # Filterable, cursor-paginated partition list.
    def index
      base = Partition.all
      base = base.for_policy(params[:policy]) if params[:policy].present?
      base = base.for_shard(params[:shard]) if params[:shard].present?
      if params[:q].present?
        # sanitize_sql_like escapes % and _ so a user-typed wildcard is
        # matched literally rather than widening the ILIKE pattern.
        base = base.where("partition_key ILIKE ?", "%#{Partition.sanitize_sql_like(params[:q])}%")
      end
      base = base.where("pending_count > 0") if params[:only_pending] == "1"

      @sort = DispatchPolicy::CursorPagination::SORTS.key?(params[:sort]) ? params[:sort] : DispatchPolicy::CursorPagination::DEFAULT_SORT
      sort_def = DispatchPolicy::CursorPagination.sort_for(@sort)

      @total = base.count # cheap on indexed columns; nice to display
      @cursor = DispatchPolicy::CursorPagination.decode(params[:cursor])

      # Fetch one row beyond the page so we can tell whether a next page
      # exists without a second COUNT query.
      paginated = DispatchPolicy::CursorPagination.apply(base, @sort, @cursor)
        .order(Arel.sql(sort_def[:sql_order]))
        .limit(PAGE_SIZE + 1)
        .to_a

      @has_more = paginated.size > PAGE_SIZE
      @partitions = paginated.first(PAGE_SIZE)
      @next_cursor =
        if @has_more && @partitions.any?
          v, id = DispatchPolicy::CursorPagination.extract(@partitions.last, @sort)
          DispatchPolicy::CursorPagination.encode(v, id)
        end

      @policy = params[:policy]
      @shard = params[:shard]
      @query = params[:q]
      @only_pending = params[:only_pending] == "1"

      shards_scope = Partition.all
      shards_scope = shards_scope.for_policy(params[:policy]) if params[:policy].present?
      @shards = shards_scope.distinct.pluck(:shard).sort
    end

    def show
      @recent_jobs = StagedJob
        .for_partition(@partition.policy_name, @partition.partition_key)
        .order(:scheduled_at, :id)
        .limit(50)
      @inflight = InflightJob.where(policy_name: @partition.policy_name).limit(50)
    end

    # Force-admits up to params[:count] staged jobs through the forwarder,
    # bypassing all gates. Malformed or out-of-range counts are coerced
    # into 1..DRAIN_MAX_PER_REQUEST instead of raising a 500.
    def admit
      count = (Integer(params[:count] || 1, exception: false) || 1).clamp(1, DRAIN_MAX_PER_REQUEST)
      rows = Repository.claim_staged_jobs!(
        policy_name: @partition.policy_name,
        partition_key: @partition.partition_key,
        limit: count,
        gate_state_patch: {},
        retry_after: nil
      )
      # NOTE(review): assumes Forwarder.dispatch returns the rows that
      # failed to forward — confirm against Forwarder.
      forwarded = rows.size - Forwarder.dispatch(rows).size
      redirect_to partition_path(@partition), notice: "Forwarded #{forwarded} job(s)."
    end

    # Empties the partition by force-admitting every staged job through the
    # forwarder, bypassing all gates. Bounded at DRAIN_MAX_PER_REQUEST so a
    # huge backlog can't time the controller out — the operator clicks again
    # for the next batch.
    def drain
      drained, remaining = self.class.drain_partition!(@partition)
      notice = if remaining.positive?
        "Drained #{drained} job(s); #{remaining} still pending — click drain again to continue."
      else
        "Drained #{drained} job(s); partition empty."
      end
      redirect_to partition_path(@partition), notice: notice
    end

    # Claims and forwards staged jobs for +partition+ in batches until the
    # partition is empty or DRAIN_MAX_PER_REQUEST jobs have been moved.
    # Returns [drained_count, remaining_pending_count].
    def self.drain_partition!(partition)
      drained = 0
      while drained < DRAIN_MAX_PER_REQUEST
        batch_limit = [DRAIN_BATCH_SIZE, DRAIN_MAX_PER_REQUEST - drained].min
        rows = Repository.claim_staged_jobs!(
          policy_name: partition.policy_name,
          partition_key: partition.partition_key,
          limit: batch_limit,
          gate_state_patch: {},
          retry_after: nil
        )
        break if rows.empty?

        Forwarder.dispatch(rows)
        drained += rows.size
      end
      # Re-read pending_count rather than trusting the in-memory record,
      # since the tick loop may have staged more jobs meanwhile.
      remaining = partition.class.where(id: partition.id).pick(:pending_count) || 0
      [drained, remaining]
    end

    private

    # Build URL params preserving filters, replacing the cursor. Private
    # (exposed to views only via helper_method) so Rails cannot route to
    # it as a controller action.
    def pagination_params(overrides = {})
      {
        policy: @policy.presence,
        shard: @shard.presence,
        q: @query.presence,
        sort: (@sort if @sort != DispatchPolicy::CursorPagination::DEFAULT_SORT),
        only_pending: ("1" if @only_pending),
        cursor: nil
      }.compact.merge(overrides)
    end
    helper_method :pagination_params

    def find_partition
      @partition = Partition.find(params[:id])
    end
  end
end
@@ -2,294 +2,121 @@
2
2
 
3
3
  module DispatchPolicy
4
4
  class PoliciesController < ApplicationController
5
- STALE_PENDING_THRESHOLD = 1.hour
6
- PARTITION_LIST_PAGE_SIZE = 25
5
+ before_action :find_policy, only: %i[show pause resume drain]
7
6
 
8
- before_action :load_policy, only: :show
7
+ DRAIN_MAX_PER_REQUEST = 10_000
9
8
 
10
9
  def index
11
- @policies = DispatchPolicy.registry.map do |name, job_class|
12
- scope = StagedJob.where(policy_name: name)
13
- pending = scope.pending
10
+ registry_names = DispatchPolicy.registry.names
11
+ db_names = Partition.distinct.pluck(:policy_name)
12
+ names = (registry_names + db_names).uniq.sort
13
+
14
+ in_flight_by_policy = InflightJob.where(policy_name: names).group(:policy_name).count
15
+
16
+ @rows = names.map do |name|
17
+ partitions = Partition.for_policy(name)
14
18
  {
15
- name: name,
16
- job_class: job_class,
17
- policy: job_class.resolved_dispatch_policy,
18
- pending_count: pending.count,
19
- admitted_count: scope.admitted.count,
20
- completed_24h: scope.completed.where(completed_at: 24.hours.ago..).count,
21
- oldest_pending: pending.minimum(:staged_at),
22
- stale_threshold: STALE_PENDING_THRESHOLD
19
+ name: name,
20
+ registered: registry_names.include?(name),
21
+ pending: partitions.sum(:pending_count),
22
+ in_flight: in_flight_by_policy[name] || 0,
23
+ partitions: partitions.count,
24
+ paused_count: partitions.paused.count
23
25
  }
24
- end.sort_by { |p| -p[:pending_count] }
25
-
26
- @active_partitions = PartitionInflightCount.where("in_flight > 0").count
27
- @expired_leases = StagedJob.expired_leases.count
26
+ end
28
27
  end
29
28
 
30
29
  def show
31
- scope = StagedJob.where(policy_name: @policy_name)
32
- @pending_count = scope.pending.count
33
- @pending_eligible_count = scope.pending.where("not_before_at IS NULL OR not_before_at <= ?", Time.current).count
34
- @pending_scheduled_count = @pending_count - @pending_eligible_count
35
- @admitted_count = scope.admitted.count
36
- @completed_24h = scope.completed.where(completed_at: 24.hours.ago..).count
37
-
38
- all_breakdown = partition_breakdown(scope)
39
-
40
- # "Watched" subset (passed via ?watch=a,b,c; the JS layer syncs it
41
- # with localStorage so the choice sticks across reloads).
42
- @watched_keys = (params[:watch] || "").split(",").map(&:strip).reject(&:empty?)
43
- @partition_breakdown = @watched_keys.any? ? all_breakdown.select { |r| @watched_keys.include?(r[:partition]) } : []
44
-
45
- # Browsable list of every active partition with filter + sort + pagination.
46
- @partition_search = params[:q].to_s.strip
47
- @partition_page = [ params[:page].to_i, 1 ].max
48
- @partition_sort = %w[source partition pending in_flight completed_24h last_enqueued_at last_dispatched_at].include?(params[:sort]) ? params[:sort] : "activity"
49
- @partition_dir = params[:dir] == "asc" ? "asc" : "desc"
50
-
51
- list = all_breakdown
52
- list = list.select { |r| r[:partition].to_s.downcase.include?(@partition_search.downcase) } if @partition_search.present?
53
- list = sort_partition_list(list, @partition_sort, @partition_dir)
54
-
55
- @partition_total_list = list.size
56
- offset = (@partition_page - 1) * PARTITION_LIST_PAGE_SIZE
57
- @partition_list = list[offset, PARTITION_LIST_PAGE_SIZE] || []
58
-
59
- load_adaptive_chart_data
60
- @throttle_buckets = ThrottleBucket
61
- .where(policy_name: @policy_name).order(:gate_name, :partition_key).limit(50)
62
- # Explicit select: don't load the `arguments` jsonb (job payload —
63
- # may contain PII / tokens) into memory just to render six fields.
64
- @pending_jobs = scope.pending
65
- .select(:id, :dedupe_key, :round_robin_key, :priority, :staged_at, :not_before_at)
66
- .order(:priority, :staged_at)
67
- .limit(50)
68
- end
69
-
70
- private
71
-
72
- def load_policy
73
- @policy_name = params[:policy_name]
74
- @job_class = DispatchPolicy.registry[@policy_name] ||
75
- Tick.autoload_job_for(@policy_name)
76
- raise ActiveRecord::RecordNotFound unless @job_class
77
- @policy = @job_class.resolved_dispatch_policy
78
- end
79
-
80
- # Per-(source, partition) breakdown of pending-eligible / pending-scheduled
81
- # / in-flight / completed-24h. A "source" is either a gate with a
82
- # partition_by (uses gate.partition_key_for(context)) or the policy's
83
- # round_robin_by declaration (uses the round_robin_key column directly).
84
- # All four counts come from StagedJob groupings; PartitionInflightCount
85
- # is an admission-time optimization, not the user-facing truth.
86
- def partition_breakdown(scope)
87
- sources = partition_sources
88
- return [] if sources.empty?
89
-
90
- now = Time.current
91
- now_iso = now.iso8601
92
- since_24h = 24.hours.ago.iso8601
93
- limit = DispatchPolicy.config.admin_partition_limit
94
- @partition_breakdown_truncated = false
95
-
96
- adaptive_stats = AdaptiveConcurrencyStats.where(policy_name: @policy_name)
97
- .order(updated_at: :desc)
98
- .limit(limit)
99
- .pluck(:gate_name, :partition_key, :current_max, :ewma_latency_ms)
100
- .each_with_object({}) { |(g, k, c, l), h|
101
- h[[ g, k ]] = { current_max: c, ewma_latency_ms: l.to_f.round(1) }
102
- }
103
-
104
- rows = Hash.new { |h, k|
105
- h[k] = {
106
- source: k[0],
107
- partition: k[1],
108
- eligible: 0,
109
- scheduled: 0,
110
- in_flight: 0,
111
- completed_24h: 0,
112
- last_enqueued_at: nil,
113
- last_dispatched_at: nil,
114
- current_max: nil,
115
- ewma_latency_ms: nil
116
- }
30
+ @policy_object = DispatchPolicy.registry.fetch(@policy_name)
31
+ @partitions = Partition.for_policy(@policy_name)
32
+ .order(Arel.sql("pending_count DESC, last_admit_at DESC NULLS LAST"))
33
+ .limit(100)
34
+ @top_admitted = Partition.for_policy(@policy_name)
35
+ .order(total_admitted: :desc)
36
+ .limit(20)
37
+
38
+ @totals = {
39
+ pending: Partition.for_policy(@policy_name).sum(:pending_count),
40
+ in_flight: InflightJob.where(policy_name: @policy_name).count,
41
+ partitions: Partition.for_policy(@policy_name).count
117
42
  }
118
43
 
119
- # Each aggregation below is order-by-count + limited so that a
120
- # policy with tens of thousands of distinct (context, round_robin_key)
121
- # tuples can't pull megabytes of rows into memory per request. We
122
- # show the top-N most-active partitions per axis and flip the
123
- # truncation flag for the view banner.
124
-
125
- # Activity timestamps bounded to the last 24h so the scan stays on
126
- # an index-friendly slice of staged_jobs.
127
- activity_rows = scope
128
- .where("staged_at > ?", since_24h)
129
- .group(:context, :round_robin_key)
130
- .order(Arel.sql("MAX(staged_at) DESC"))
131
- .limit(limit)
132
- .pluck(
133
- :context,
134
- :round_robin_key,
135
- Arel.sql("MAX(staged_at)"),
136
- Arel.sql("MAX(admitted_at)")
137
- )
138
- @partition_breakdown_truncated = true if activity_rows.size >= limit
139
-
140
- sources.each do |name, extract|
141
- pending_counts = scope.pending.group(:context, :round_robin_key)
142
- .order(Arel.sql("count(*) DESC"))
143
- .limit(limit)
144
- .pluck(
145
- :context,
146
- :round_robin_key,
147
- Arel.sql("count(*) filter (where not_before_at is null or not_before_at <= '#{now_iso}')"),
148
- Arel.sql("count(*) filter (where not_before_at > '#{now_iso}')")
149
- )
150
- @partition_breakdown_truncated = true if pending_counts.size >= limit
151
- pending_counts.each do |ctx, rr_key, eligible, scheduled|
152
- partition = extract.call(ctx, rr_key)
153
- row = rows[[ name, partition ]]
154
- row[:eligible] += eligible
155
- row[:scheduled] += scheduled
156
- end
157
-
158
- admitted_counts = scope.admitted.group(:context, :round_robin_key)
159
- .order(Arel.sql("count(*) DESC"))
160
- .limit(limit)
161
- .pluck(:context, :round_robin_key, Arel.sql("count(*)"))
162
- @partition_breakdown_truncated = true if admitted_counts.size >= limit
163
- admitted_counts.each do |ctx, rr_key, in_flight|
164
- partition = extract.call(ctx, rr_key)
165
- rows[[ name, partition ]][:in_flight] += in_flight
166
- end
167
-
168
- completed_counts = scope.completed.where("completed_at > ?", since_24h)
169
- .group(:context, :round_robin_key)
170
- .order(Arel.sql("count(*) DESC"))
171
- .limit(limit)
172
- .pluck(:context, :round_robin_key, Arel.sql("count(*)"))
173
- @partition_breakdown_truncated = true if completed_counts.size >= limit
174
- completed_counts.each do |ctx, rr_key, completed|
175
- partition = extract.call(ctx, rr_key)
176
- rows[[ name, partition ]][:completed_24h] += completed
177
- end
178
-
179
- activity_rows.each do |ctx, rr_key, last_staged, last_admitted|
180
- partition = extract.call(ctx, rr_key)
181
- row = rows[[ name, partition ]]
182
- row[:last_enqueued_at] = [ row[:last_enqueued_at], last_staged ].compact.max
183
- row[:last_dispatched_at] = [ row[:last_dispatched_at], last_admitted ].compact.max
184
- end
185
- end
186
-
187
- rows.each do |(source, partition), row|
188
- stats = adaptive_stats[[ source, partition ]]
189
- next unless stats
190
- row[:current_max] = stats[:current_max]
191
- row[:ewma_latency_ms] = stats[:ewma_latency_ms]
192
- end
193
-
194
- # Two different sources (say round_robin_by account_id + a gate
195
- # partitioned by account_id) producing the same partition key yield
196
- # identical counts — collapse them into one row with a merged source
197
- # label instead of listing the same numbers twice.
198
- merged = rows.values
199
- .reject { |r| r[:partition].nil? || r[:partition].empty? }
200
- .group_by { |r| [ r[:partition], r[:eligible], r[:scheduled], r[:in_flight], r[:completed_24h] ] }
201
- .map { |_, group|
202
- base = group.first.dup
203
- base[:source] = group.map { |r| r[:source] }.uniq.sort.join(" + ")
204
- group.each do |r|
205
- base[:current_max] ||= r[:current_max]
206
- base[:ewma_latency_ms] ||= r[:ewma_latency_ms]
207
- base[:last_enqueued_at] = [ base[:last_enqueued_at], r[:last_enqueued_at] ].compact.max
208
- base[:last_dispatched_at] = [ base[:last_dispatched_at], r[:last_dispatched_at] ].compact.max
209
- end
210
- base
211
- }
212
-
213
- merged.sort_by { |r|
214
- [ -(r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h]), r[:source], r[:partition] ]
44
+ now = Time.current
45
+ @windows = {
46
+ "1m" => Repository.tick_summary(policy_name: @policy_name, since: now - 60),
47
+ "5m" => Repository.tick_summary(policy_name: @policy_name, since: now - 5 * 60),
48
+ "15m" => Repository.tick_summary(policy_name: @policy_name, since: now - 15 * 60)
49
+ }
50
+ @denied_reasons = Repository.denied_reasons_summary(policy_name: @policy_name, since: now - 15 * 60)
51
+ @round_trip = Repository.partition_round_trip_stats(policy_name: @policy_name)
52
+ @sparkline = Repository.tick_samples_buckets(policy_name: @policy_name, since: now - 30 * 60, bucket_seconds: 60)
53
+ @pending_trend = Repository.trend_direction(@sparkline.map { |b| b[:pending_total] })
54
+
55
+ cfg = DispatchPolicy.config
56
+ @capacity = {
57
+ admitted_per_minute: @windows["1m"][:jobs_admitted],
58
+ adapter_target_jps: cfg.adapter_throughput_target,
59
+ avg_tick_ms: @windows["1m"][:avg_duration_ms],
60
+ max_tick_ms: @windows["1m"][:max_duration_ms],
61
+ tick_max_duration_ms: cfg.tick_max_duration.to_i * 1000
215
62
  }
216
- end
217
63
 
218
- def sort_partition_list(list, sort, dir)
219
- # Put nulls at the bottom regardless of direction (Time#to_f on nil
220
- # would crash; -Float::INFINITY sorts first, +Float::INFINITY last).
221
- key =
222
- case sort
223
- when "source" then ->(r) { [ r[:source], r[:partition] ] }
224
- when "partition" then ->(r) { r[:partition] }
225
- when "pending" then ->(r) { r[:eligible] + r[:scheduled] }
226
- when "in_flight" then ->(r) { r[:in_flight] }
227
- when "completed_24h" then ->(r) { r[:completed_24h] }
228
- when "last_enqueued_at" then ->(r) { r[:last_enqueued_at]&.to_f || 0 }
229
- when "last_dispatched_at" then ->(r) { r[:last_dispatched_at]&.to_f || 0 }
230
- else ->(r) { r[:eligible] + r[:scheduled] + r[:in_flight] + r[:completed_24h] }
231
- end
232
- sorted = list.sort_by(&key)
233
- dir == "asc" ? sorted : sorted.reverse
64
+ @hints = OperatorHints.for(
65
+ tick_max_duration_ms: @capacity[:tick_max_duration_ms],
66
+ avg_tick_ms: @capacity[:avg_tick_ms],
67
+ max_tick_ms: @capacity[:max_tick_ms],
68
+ pending_total: @totals[:pending],
69
+ admitted_per_minute: @capacity[:admitted_per_minute],
70
+ forward_failures: @windows["1m"][:forward_failures],
71
+ jobs_admitted: @windows["1m"][:jobs_admitted],
72
+ active_partitions: @round_trip[:active_partitions],
73
+ never_checked: @round_trip[:never_checked],
74
+ in_backoff: @round_trip[:in_backoff],
75
+ total_partitions: @totals[:partitions],
76
+ adapter_target_jps: @capacity[:adapter_target_jps],
77
+ pending_trend: @pending_trend
78
+ )
234
79
  end
235
80
 
236
- # Returns [[source_name, ->(ctx, rr_key) { partition_key }], ...]
237
- # covering every partition-producing declaration on the policy: every
238
- # gate with a partition_by, plus round_robin_by if declared.
239
- def partition_sources
240
- return [] unless @policy
81
+ def pause
82
+ Partition.for_policy(@policy_name).update_all(status: "paused", updated_at: Time.current)
83
+ redirect_to policy_path(@policy_name), notice: "Policy paused."
84
+ end
241
85
 
242
- sources = @policy.gates.select(&:partition_by).map do |gate|
243
- [ gate.name.to_s, ->(ctx, _rr) { gate.partition_key_for((ctx || {}).symbolize_keys) } ]
244
- end
245
- sources << [ "round_robin_by", ->(_ctx, rr) { rr } ] if @policy.round_robin?
246
- sources
86
+ def resume
87
+ Partition.for_policy(@policy_name).update_all(status: "active", updated_at: Time.current)
88
+ redirect_to policy_path(@policy_name), notice: "Policy resumed."
247
89
  end
248
90
 
249
- # Build chart data from PartitionObservation. Two queries:
250
- # - Global aggregated (one row per minute): cheap even with 1000s of
251
- # partitions because we SUM/AVG in SQL, not in Ruby.
252
- # - Per-partition sparkline data, scoped to only the partitions we're
253
- # going to actually render (breakdown's top N).
254
- def load_adaptive_chart_data
255
- last_minute = Time.current.utc.beginning_of_minute
256
- @chart_slots = (0..59).map { |i| last_minute - (59 - i).minutes }
257
- @chart_labels = @chart_slots.map { |t| t.strftime("%H:%M") }
258
- slot_index = @chart_slots.each_with_index.to_h
91
+ # Force-admits every staged job across every partition of the policy,
92
+ # bypassing all gates. Walks partitions in pending-DESC order so the
93
+ # busiest ones drain first. Bounded at DRAIN_MAX_PER_REQUEST per click.
94
+ def drain
95
+ drained = 0
96
+ Partition.for_policy(@policy_name)
97
+ .where("pending_count > 0")
98
+ .order(pending_count: :desc, id: :asc)
99
+ .limit(500)
100
+ .each do |partition|
101
+ break if drained >= DRAIN_MAX_PER_REQUEST
102
+
103
+ batch, _ = PartitionsController.drain_partition!(partition)
104
+ drained += batch
105
+ end
259
106
 
260
- @adaptive_global = Array.new(@chart_slots.size)
261
- @completions_global = Array.new(@chart_slots.size, 0)
262
- global_rows = PartitionObservation
263
- .where(policy_name: @policy_name)
264
- .where("minute_bucket >= ?", @chart_slots.first)
265
- .group(:minute_bucket)
266
- .pluck(:minute_bucket, Arel.sql("SUM(total_lag_ms)"), Arel.sql("SUM(observation_count)"))
267
- global_rows.each do |bucket, total_lag, obs_count|
268
- idx = slot_index[bucket.utc.beginning_of_minute]
269
- next unless idx
270
- @completions_global[idx] = obs_count
271
- @adaptive_global[idx] = obs_count.positive? ? (total_lag.to_f / obs_count).round(1) : nil
107
+ remaining = Partition.for_policy(@policy_name).sum(:pending_count)
108
+ notice = if remaining.positive?
109
+ "Drained #{drained} job(s) across this policy; #{remaining} still pending — click drain again to continue."
110
+ else
111
+ "Drained #{drained} job(s); policy fully drained."
272
112
  end
113
+ redirect_to policy_path(@policy_name), notice: notice
114
+ end
273
115
 
274
- partition_keys = (@partition_breakdown || []).map { |r| r[:partition] }.uniq
275
- @adaptive_samples = {}
276
- @completions_samples = {}
277
- return if partition_keys.empty?
116
+ private
278
117
 
279
- per_partition_lag = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size) }
280
- per_partition_counts = Hash.new { |h, k| h[k] = Array.new(@chart_slots.size, 0) }
281
- rows = PartitionObservation
282
- .where(policy_name: @policy_name, partition_key: partition_keys)
283
- .where("minute_bucket >= ?", @chart_slots.first)
284
- .pluck(:partition_key, :minute_bucket, :total_lag_ms, :observation_count)
285
- rows.each do |pk, bucket, total, count|
286
- idx = slot_index[bucket.utc.beginning_of_minute]
287
- next unless idx
288
- per_partition_lag[pk][idx] = count.positive? ? (total.to_f / count).round(1) : nil
289
- per_partition_counts[pk][idx] = count
290
- end
291
- @adaptive_samples = per_partition_lag
292
- @completions_samples = per_partition_counts
118
+ def find_policy
119
+ @policy_name = params[:name]
293
120
  end
294
121
  end
295
122
  end
@@ -0,0 +1,9 @@
# frozen_string_literal: true

module DispatchPolicy
  # Read-only detail page for a single staged job.
  class StagedJobsController < ApplicationController
    # Looks up the staged job by id; Partition.find-style lookup raises
    # ActiveRecord::RecordNotFound (404) for unknown ids.
    def show
      @job = StagedJob.find(params[:id])
    end
  end
end