dispatch_policy 0.1.0 → 0.3.0
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/MIT-LICENSE +16 -17
- data/README.md +449 -288
- data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +140 -216
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
- data/config/routes.rb +18 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +4 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -37
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +62 -47
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +263 -172
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +71 -46
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +101 -43
- data/CHANGELOG.md +0 -12
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -49
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -120
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
- data/lib/dispatch_policy/install_generator.rb +0 -23
data/lib/dispatch_policy/gates/throttle.rb
@@ -2,51 +2,79 @@
 
 module DispatchPolicy
   module Gates
+    # Token bucket throttle gate.
+    #
+    # Persists state in partitions.gate_state["throttle"] = {
+    #   "tokens"      => Float, # current tokens, capped at bucket size
+    #   "refilled_at" => Float  # epoch seconds, last refill
+    # }
+    #
+    # The partition scope this gate enforces against is the policy's
+    # `partition_by` (declared in the policy DSL block, not on the gate).
+    # The bucket lives on the staged partition row — one row per
+    # `policy.partition_for(ctx)` value, one bucket per row, no dilution.
     class Throttle < Gate
-
-
-
-
+      attr_reader :rate_proc, :per
+
+      def initialize(rate:, per:)
+        super()
+        @rate_proc = rate.respond_to?(:call) ? rate : ->(_ctx) { rate }
+        @per = duration_seconds(per)
+        raise ArgumentError, "throttle :per must be > 0 (got #{@per})" unless @per.positive?
       end
 
-
-
-        false
+      def name
+        :throttle
       end
 
-      def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        else
-          break
-        end
-      end
-      bucket.save!
+      def evaluate(ctx, partition, admit_budget)
+        capacity = capacity_for(ctx)
+        return Decision.deny(reason: "rate=0") if capacity <= 0
+
+        refill_rate = capacity.to_f / @per
+        state = (partition["gate_state"] || {})["throttle"] || {}
+        tokens = (state["tokens"] || capacity).to_f
+        refilled_at = (state["refilled_at"] || now).to_f
+
+        elapsed = [now - refilled_at, 0.0].max
+        tokens = [tokens + (elapsed * refill_rate), capacity.to_f].min
+
+        whole = tokens.floor
+        if whole.zero?
+          missing = 1.0 - tokens
+          retry_after = missing / refill_rate
+          patch = { "tokens" => tokens, "refilled_at" => now }
+          return Decision.new(allowed: 0,
+                              retry_after: retry_after,
+                              gate_state_patch: { "throttle" => patch },
+                              reason: "throttle_empty")
         end
 
-
-
+        allowed = [whole, admit_budget].min
+        patch = { "tokens" => tokens - allowed, "refilled_at" => now }
+        Decision.new(allowed: allowed, gate_state_patch: { "throttle" => patch })
      end
-    end
 
-
+      private
+
+      def capacity_for(ctx)
+        value = @rate_proc.call(ctx)
+        value.nil? ? 0 : Integer(value)
+      end
+
+      def now
+        DispatchPolicy.config.now.to_f
+      end
+
+      def duration_seconds(value)
+        if value.is_a?(Numeric)
+          value.to_f
+        elsif value.respond_to?(:to_f) && value.respond_to?(:seconds)
+          value.to_f
+        else
+          raise ArgumentError, "throttle :per must be a numeric duration in seconds (got #{value.inspect})"
+        end
+      end
+    end
   end
 end
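A quick worked example of the refill math above, assuming `rate: 10, per: 60`: refill_rate = 10 / 60 ≈ 0.167 tokens/sec, so a drained bucket holds 2.0 tokens after 12 idle seconds and `evaluate` can admit 2 jobs; at 0.4 tokens it returns `retry_after = (1.0 - 0.4) / 0.167 ≈ 3.6` seconds. Below is a hedged usage sketch: the `throttle` and `partition_by` DSL words are inferred from the comments and error message in this hunk (the actual DSL surface lives in policy_dsl.rb, which this diff does not show), so treat the block syntax as an assumption.

    class WebhookDeliveryJob < ApplicationJob
      # Hypothetical policy declaration; `dispatch_policy` is the macro
      # added in job_extension.rb further down this diff.
      dispatch_policy :webhook_delivery do
        partition_by { |ctx| ctx[:tenant_id] }            # one bucket per tenant row
        throttle rate: ->(ctx) { ctx[:vip] ? 100 : 10 },  # rate may be a proc of ctx
                 per: 60
      end
    end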
data/lib/dispatch_policy/inflight_tracker.rb (new file)
@@ -0,0 +1,174 @@
+# frozen_string_literal: true
+
+module DispatchPolicy
+  # Around-perform that records each job execution in
+  # dispatch_policy_inflight_jobs while it runs, so the concurrency gate
+  # can count active jobs per partition.
+  #
+  # While the job runs we spawn a heartbeat thread that bumps
+  # `heartbeat_at` every `config.inflight_heartbeat_interval` seconds.
+  # Without this, jobs longer than `inflight_stale_after` (default 5 min)
+  # get their inflight row prematurely swept and the concurrency gate
+  # over-admits.
+  module InflightTracker
+    extend ActiveSupport::Concern
+
+    class_methods do
+      def dispatch_policy_inflight_tracking
+        around_perform do |job, block|
+          DispatchPolicy::InflightTracker.track(job, &block)
+        end
+      end
+    end
+
+    def self.track(job)
+      policy_name = job.class.respond_to?(:dispatch_policy_name) && job.class.dispatch_policy_name
+      return yield unless policy_name
+
+      policy = DispatchPolicy.registry.fetch(policy_name)
+      return yield unless policy
+
+      ctx = policy.build_context(job.arguments, queue_name: job.queue_name&.to_s)
+      partition_key = policy.partition_key_for(ctx)
+
+      Repository.insert_inflight!([{
+        policy_name: policy.name,
+        partition_key: partition_key,
+        active_job_id: job.job_id
+      }])
+
+      adaptive_gates = policy.gates.select { |g| g.name == :adaptive_concurrency }
+      admitted_at = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
+      perform_start = Time.current
+
+      heartbeat = start_heartbeat(job.job_id)
+
+      succeeded = false
+      begin
+        yield
+        succeeded = true
+      ensure
+        stop_heartbeat(heartbeat)
+
+        record_adaptive_observations(
+          policy: policy,
+          gates: adaptive_gates,
+          partition_key: partition_key,
+          admitted_at: admitted_at,
+          perform_start: perform_start,
+          succeeded: succeeded
+        )
+
+        begin
+          Repository.delete_inflight!(active_job_id: job.job_id)
+        rescue StandardError => e
+          DispatchPolicy.config.logger&.warn("[dispatch_policy] failed to delete inflight row #{job.job_id}: #{e.class}: #{e.message}")
+        end
+      end
+    end
+
+    # Reads the admitted_at column from the inflight row that the Tick
+    # pre-inserted. Used as the start-of-queue-wait reference for the
+    # adaptive_concurrency feedback signal (queue_lag = perform_start
+    # - admitted_at). nil if the row vanished or the lookup fails —
+    # the observation is then skipped.
+    def self.lookup_admitted_at(active_job_id)
+      result = ActiveRecord::Base.connection.exec_query(
+        "SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
+        "lookup_admitted_at",
+        [active_job_id]
+      )
+      row = result.first
+      return nil unless row
+      ts = row["admitted_at"]
+      ts.is_a?(Time) ? ts : Time.parse(ts.to_s)
+    rescue StandardError
+      nil
+    end
+
+    def self.record_adaptive_observations(policy:, gates:, partition_key:, admitted_at:, perform_start:, succeeded:)
+      return if gates.empty?
+
+      queue_lag_ms = if admitted_at
+        ((perform_start - admitted_at) * 1000).to_i
+      else
+        # No admitted_at means we can't measure queue wait. Treat as 0
+        # so the observation still increments sample_count and the
+        # cap can grow if everything else is healthy.
+        0
+      end
+
+      gates.each do |gate|
+        gate.record_observation(
+          policy_name: policy.name,
+          partition_key: partition_key,
+          queue_lag_ms: queue_lag_ms,
+          succeeded: succeeded
+        )
+      rescue StandardError => e
+        DispatchPolicy.config.logger&.warn(
+          "[dispatch_policy] adaptive observation failed for #{policy.name}/#{partition_key}: #{e.class}: #{e.message}"
+        )
+      end
+    end
+
+    # ----- heartbeat thread -----
+
+    HEARTBEAT_KEY = :__dispatch_policy_heartbeat_token__
+
+    Heartbeat = Struct.new(:thread, :stop_flag)
+
+    def self.start_heartbeat(active_job_id)
+      interval = DispatchPolicy.config.inflight_heartbeat_interval.to_f
+      return nil if interval <= 0
+
+      stop_flag = Concurrent::AtomicBoolean.new(false) if defined?(Concurrent::AtomicBoolean)
+      stop_flag ||= ThreadSafeFlag.new
+
+      thread = Thread.new do
+        Thread.current.name = "dispatch_policy.heartbeat:#{active_job_id}"
+
+        until stop_flag.true?
+          # Sleep in small slices so stop is responsive without tight polling.
+          slept = 0.0
+          slice = [interval, 1.0].min
+          while slept < interval && !stop_flag.true?
+            sleep(slice)
+            slept += slice
+          end
+          break if stop_flag.true?
+
+          begin
+            ActiveRecord::Base.connection_pool.with_connection do
+              Repository.heartbeat_inflight!(active_job_id: active_job_id)
+            end
+          rescue StandardError => e
+            DispatchPolicy.config.logger&.warn("[dispatch_policy] heartbeat #{active_job_id} failed: #{e.class}: #{e.message}")
+          end
+        end
+      end
+
+      Heartbeat.new(thread, stop_flag)
+    end
+
+    def self.stop_heartbeat(heartbeat)
+      return if heartbeat.nil?
+
+      heartbeat.stop_flag.make_true
+      # Wake the thread out of any in-progress sleep so we don't wait the full slice.
+      heartbeat.thread.wakeup if heartbeat.thread.alive?
+      heartbeat.thread.join(1.0)
+    rescue StandardError
+      # Worst case: the thread is killed by GC; the inflight row gets a stale
+      # heartbeat_at and the sweeper will reclaim it after inflight_stale_after.
+    end
+
+    # Tiny fallback if concurrent-ruby isn't available (it's a Rails dep
+    # via active_support so it normally is).
+    class ThreadSafeFlag
+      def initialize; @mutex = Mutex.new; @value = false; end
+      def true?; @mutex.synchronize { @value }; end
+      def make_true; @mutex.synchronize { @value = true }; end
+    end
+  end
+end
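A minimal opt-in sketch for the tracker. `dispatch_policy_inflight_tracking` is the macro defined above, but whether the concern is included into ActiveJob::Base by the railtie or by hand is not shown in this hunk, so the `include` line is an assumption:

    class ExportJob < ApplicationJob
      include DispatchPolicy::InflightTracker  # assumption: the railtie may already do this
      dispatch_policy_inflight_tracking        # installs the around_perform above

      def perform(account_id)
        # While this runs, a row sits in dispatch_policy_inflight_jobs and the
        # heartbeat thread bumps heartbeat_at so the stale sweeper skips it.
      end
    end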
data/lib/dispatch_policy/job_extension.rb (new file)
@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+
+module DispatchPolicy
+  # Hooks into ActiveJob::Base. Adds:
+  #   - the `dispatch_policy :name do … end` class macro
+  #   - an `around_enqueue` callback that stages jobs declaring a policy
+  #   - a `perform_all_later` patch that handles bulk enqueue
+  module JobExtension
+    extend ActiveSupport::Concern
+
+    included do
+      class_attribute :dispatch_policy_name, instance_writer: false
+    end
+
+    class_methods do
+      def dispatch_policy(name, &block)
+        policy = PolicyDSL.build(name.to_s, &block)
+        DispatchPolicy.registry.register(policy, owner: self.name)
+        self.dispatch_policy_name = policy.name
+
+        around_enqueue do |job, block|
+          DispatchPolicy::JobExtension.around_enqueue_for(job, block)
+        end
+      end
+    end
+
+    # Called by the around_enqueue lambda. Public so it can be tested directly.
+    def self.around_enqueue_for(job, block)
+      return block.call if Bypass.active?
+      return block.call unless DispatchPolicy.config.enabled
+
+      policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
+      return block.call unless policy
+
+      if retry_attempt?(job) && policy.bypass_retries?
+        return block.call
+      end
+
+      # `klass.deserialize(payload)` (used elsewhere — see Forwarder, retries)
+      # only sets @serialized_arguments. ActiveJob defers the actual
+      # arguments deserialization to perform_now via the private
+      # deserialize_arguments_if_needed. If something deserializes a job
+      # and re-enqueues it without going through perform_now (e.g. a
+      # custom retry path), `job.arguments` would be []. Guard against
+      # that here so the context proc always sees the real args.
+      ensure_arguments_materialized!(job)
+
+      queue_name = job.queue_name&.to_s || policy.queue_name
+      ctx = policy.build_context(job.arguments, queue_name: queue_name)
+      partition_key = policy.partition_key_for(ctx)
+      shard = policy.shard_for(ctx)
+      payload = Serializer.serialize(job)
+
+      Repository.stage!(
+        policy_name: policy.name,
+        partition_key: partition_key,
+        queue_name: queue_name,
+        shard: shard,
+        job_class: job.class.name,
+        job_data: payload,
+        context: ctx.to_jsonb,
+        scheduled_at: scheduled_time(job),
+        priority: job.priority || 0
+      )
+
+      job.successfully_enqueued = true
+      false # halts the around_enqueue chain so the real adapter never sees the job
+    end
+
+    def self.retry_attempt?(job)
+      (job.respond_to?(:executions) ? job.executions.to_i : 0).positive?
+    end
+
+    def self.scheduled_time(job)
+      ts = job.scheduled_at
+      return nil if ts.nil?
+      return ts if ts.is_a?(Time)
+
+      Time.at(Float(ts))
+    rescue ArgumentError, TypeError
+      nil
+    end
+
+    # ActiveJob's `arguments` getter is a plain attr_accessor that returns
+    # the in-memory @arguments. After `klass.deserialize(payload)`, that
+    # array is empty until perform_now triggers
+    # `deserialize_arguments_if_needed` (a private method). Anywhere we
+    # read `job.arguments` outside of perform we must materialize first,
+    # or the context proc receives [] and falls back to its defaults.
+    def self.ensure_arguments_materialized!(job)
+      return unless job.respond_to?(:deserialize_arguments_if_needed, true)
+      job.send(:deserialize_arguments_if_needed)
+    end
+
+    # ---- perform_all_later support -------------------------------------------
+
+    # Rails 7.1+ exposes ActiveJob.perform_all_later. We override it to route
+    # jobs declaring a dispatch_policy through a single bulk INSERT, while
+    # delegating jobs without a policy to the original enqueue_all path.
+    module BulkEnqueue
+      def perform_all_later(*jobs)
+        flat = jobs.flatten
+        return super if flat.empty?
+        # Critical: respect Bypass exactly like the per-job around_enqueue
+        # does. Forwarder.dispatch deserializes admitted jobs and calls
+        # ActiveJob.perform_all_later under Bypass.with — without this
+        # check, BulkEnqueue would re-stage them, creating an infinite
+        # admission loop with the wrong context (job.arguments is still []
+        # at that point because ActiveJob defers deserialization).
+        return super if DispatchPolicy::Bypass.active?
+        return super unless DispatchPolicy.config.enabled
+        return super unless DispatchPolicy.registry.size.positive?
+
+        with_policy, without_policy = flat.partition do |j|
+          j.class.respond_to?(:dispatch_policy_name) && j.class.dispatch_policy_name
+        end
+
+        super(without_policy) if without_policy.any?
+
+        return nil if with_policy.empty?
+
+        rows = with_policy.filter_map do |job|
+          policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
+          next unless policy
+
+          # See JobExtension.ensure_arguments_materialized! — we need this
+          # for the same reason as the single-enqueue path.
+          JobExtension.ensure_arguments_materialized!(job)
+
+          queue_name = job.queue_name&.to_s || policy.queue_name
+          ctx = policy.build_context(job.arguments, queue_name: queue_name)
+          partition_key = policy.partition_key_for(ctx)
+          shard = policy.shard_for(ctx)
+          payload = Serializer.serialize(job)
+          job.successfully_enqueued = true
+
+          {
+            policy_name: policy.name,
+            partition_key: partition_key,
+            queue_name: queue_name,
+            shard: shard,
+            job_class: job.class.name,
+            job_data: payload,
+            context: ctx.to_jsonb,
+            scheduled_at: JobExtension.scheduled_time(job),
+            priority: job.priority || 0
+          }
+        end
+
+        Repository.stage_many!(rows) if rows.any?
+        nil # ActiveJob.perform_all_later contract returns nil
+      end
+    end
+  end
+end
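A sketch of how BulkEnqueue would be wired up and what a mixed batch does. The prepend site is an assumption (this diff also adds a railtie.rb, which most likely performs the hook), but the split behavior follows directly from the code above:

    ActiveSupport.on_load(:active_job) do
      # Intercept the module-level ActiveJob.perform_all_later (Rails 7.1+).
      ActiveJob.singleton_class.prepend(DispatchPolicy::JobExtension::BulkEnqueue)
    end

    # A mixed batch then splits transparently:
    ActiveJob.perform_all_later(
      PolicyJob.new(1),  # declares dispatch_policy: staged via Repository.stage_many!
      PlainJob.new       # no policy: handed to the adapter through super
    )
    # => nil, per the perform_all_later contract noted above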
data/lib/dispatch_policy/operator_hints.rb (new file)
@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+
+module DispatchPolicy
+  # Pure-Ruby hint generator. Takes a snapshot of live metrics and
+  # returns a list of {level:, message:} that the dashboard renders.
+  #
+  # Each predicate is intentionally conservative: hints fire on
+  # crossings the operator can fix from the UI or by toggling a config
+  # value, not on noise. Levels:
+  #
+  #   :info     — observation worth glancing at
+  #   :warn     — attention soon
+  #   :critical — fix now
+  module OperatorHints
+    Hint = Struct.new(:level, :message, keyword_init: true)
+
+    module_function
+
+    # `metrics` is a hash of:
+    #   tick_max_duration_ms: int (config tick_max_duration × 1000)
+    #   avg_tick_ms: int
+    #   max_tick_ms: int
+    #   pending_total: int
+    #   admitted_per_minute: int (last 1m)
+    #   forward_failures: int (last 1m)
+    #   jobs_admitted: int (last 1m, denominator for fail %)
+    #   active_partitions: int
+    #   never_checked: int
+    #   in_backoff: int
+    #   total_partitions: int
+    #   adapter_target_jps: int|nil (config.adapter_throughput_target)
+    def for(metrics)
+      hints = []
+      m = metrics
+
+      # ---- tick approaching deadline ---------------------------------
+      if m[:tick_max_duration_ms].to_i.positive? && m[:avg_tick_ms].to_i.positive?
+        ratio = m[:avg_tick_ms].to_f / m[:tick_max_duration_ms]
+        if ratio >= 0.6
+          hints << Hint.new(
+            level: ratio >= 0.85 ? :critical : :warn,
+            message: "Avg tick is #{format('%.0f%%', ratio * 100)} of tick_max_duration. " \
+                     "Lower admission_batch_size, set tick_admission_budget, or shard the policy."
+          )
+        end
+      end
+
+      # ---- backlog drain time ----------------------------------------
+      if m[:admitted_per_minute].to_i.positive? && m[:pending_total].to_i.positive?
+        drain_minutes = m[:pending_total].to_f / m[:admitted_per_minute]
+        if drain_minutes >= 30
+          level = drain_minutes >= 120 ? :warn : :info
+          hints << Hint.new(
+            level: level,
+            message: "At #{m[:admitted_per_minute]} admits/min, the current backlog of " \
+                     "#{m[:pending_total]} pending would take ~#{drain_minutes.round} min " \
+                     "to drain. Raise admission_batch_size, raise the gate's rate, or shard."
+          )
+        end
+      end
+
+      # ---- pending growing while admit rate is non-trivial -----------
+      # `pending_trend` compares head/tail thirds of the sparkline; a
+      # transient spike that already drained still leaves the tail
+      # average elevated. Gate on current pending > 0 so a recovered
+      # backlog does not raise a warning.
+      if m[:pending_trend] == :up &&
+         m[:admitted_per_minute].to_i.positive? &&
+         m[:pending_total].to_i.positive?
+        hints << Hint.new(
+          level: :warn,
+          message: "Pending is growing while we are admitting. Inflow > outflow — " \
+                   "either the throttle rate is below the producer rate, or the worker pool can't keep up."
+        )
+      end
+
+      # ---- forward failure rate --------------------------------------
+      if m[:jobs_admitted].to_i.positive?
+        rate = m[:forward_failures].to_f / m[:jobs_admitted]
+        if rate >= 0.01
+          hints << Hint.new(
+            level: rate >= 0.05 ? :critical : :warn,
+            message: "Forward failures at #{format('%.1f%%', rate * 100)} (#{m[:forward_failures]} / " \
+                     "#{m[:jobs_admitted]} admits). Inspect logs — adapter is rejecting enqueues."
+          )
+        end
+      end
+
+      # ---- never_checked > 0 with cardinality > batch ----------------
+      if m[:never_checked].to_i.positive?
+        hints << Hint.new(
+          level: :warn,
+          message: "#{m[:never_checked]} active partitions have never been checked. " \
+                   "The tick is not getting through them — increase partition_batch_size or shard."
+        )
+      end
+
+      # ---- partition cardinality -------------------------------------
+      if m[:total_partitions].to_i >= 50_000
+        hints << Hint.new(
+          level: :info,
+          message: "#{m[:total_partitions]} partitions in DB. claim_partitions starts to feel " \
+                   "this around 50k–100k. Consider lowering partition_inactive_after to GC " \
+                   "drained ones earlier."
+        )
+      end
+
+      # ---- adapter ceiling proximity --------------------------------
+      target_jps = m[:adapter_target_jps].to_i
+      if target_jps.positive? && m[:admitted_per_minute].to_i.positive?
+        current_jps = m[:admitted_per_minute] / 60.0
+        ratio = current_jps / target_jps
+        if ratio >= 0.7
+          hints << Hint.new(
+            level: ratio >= 0.95 ? :critical : :warn,
+            message: "Admitting #{format('%.0f', current_jps)} jobs/sec, " \
+                     "#{format('%.0f%%', ratio * 100)} of the configured adapter ceiling " \
+                     "(#{target_jps}/sec). Consider an additional shard before the next traffic spike."
+          )
+        end
+      end
+
+      hints
+    end
+  end
+end
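A worked call against the drain-time predicate above (all keys come from the documented metrics hash; the numbers are illustrative):

    hints = DispatchPolicy::OperatorHints.for(
      tick_max_duration_ms: 5_000,
      avg_tick_ms: 1_200,        # 24% of the deadline, below the 0.6 threshold
      max_tick_ms: 2_100,
      pending_total: 18_000,
      admitted_per_minute: 400,  # 18_000 / 400 = 45 minutes to drain
      forward_failures: 0,
      jobs_admitted: 400,
      active_partitions: 12,
      never_checked: 0,
      in_backoff: 0,
      total_partitions: 12,
      adapter_target_jps: nil
    )
    hints.size          # => 1 (only the backlog predicate fires)
    hints.first.level   # => :info (45 min is >= 30 but < 120)
    hints.first.message # => "At 400 admits/min, the current backlog of 18000 pending would take ~45 min to drain. ..."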
data/lib/dispatch_policy/pipeline.rb (new file)
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+module DispatchPolicy
+  # Composes a sequence of gates into a single admission decision for one
+  # partition. Returns a value object describing how many jobs may be
+  # admitted right now and which gate-state patches to persist.
+  class Pipeline
+    Result = Struct.new(:admit_count, :retry_after, :gate_state_patch, :reasons, keyword_init: true)
+
+    def initialize(policy)
+      @policy = policy
+    end
+
+    def call(ctx, partition, max_budget)
+      budget = max_budget
+      retry_after = nil
+      patch = {}
+      reasons = []
+      decisions = []
+
+      @policy.gates.each do |gate|
+        decision = gate.evaluate(ctx, partition, budget)
+        decisions << [gate, decision]
+        budget = decision.allowed.finite? ? [budget, decision.allowed].min : budget
+        if decision.retry_after
+          retry_after = retry_after.nil? ? decision.retry_after : [retry_after, decision.retry_after].min
+        end
+        reasons << "#{gate.name}:#{decision.reason}" if decision.reason
+        break if budget.zero?
+      end
+
+      admit_count = budget.finite? ? budget : max_budget
+      admit_count = 0 if admit_count.negative?
+
+      decisions.each do |_, decision|
+        next unless decision.gate_state_patch
+        patch.merge!(decision.gate_state_patch)
+      end
+
+      Result.new(
+        admit_count: admit_count,
+        retry_after: retry_after,
+        gate_state_patch: patch,
+        reasons: reasons
+      )
+    end
+  end
+end
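A self-contained sketch of the budget fold in `Pipeline#call`, using stub gates and a local Decision struct whose shape is inferred from the calls in throttle.rb above (the gem's own Decision class lives in decision.rb, not shown here):

    Decision = Struct.new(:allowed, :retry_after, :gate_state_patch, :reason, keyword_init: true)

    gates = [
      ->(budget) { Decision.new(allowed: [5, budget].min) },  # e.g. a concurrency gate
      ->(budget) { Decision.new(allowed: [2, budget].min,     # e.g. a throttle gate
                                gate_state_patch: { "throttle" => { "tokens" => 0.4 } }) }
    ]

    budget = 10
    patch  = {}
    gates.each do |gate|
      decision = gate.call(budget)
      budget = [budget, decision.allowed].min  # the most restrictive gate wins
      patch.merge!(decision.gate_state_patch) if decision.gate_state_patch
      break if budget.zero?
    end

    budget  # => 2
    patch   # => {"throttle"=>{"tokens"=>0.4}}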