dispatch_policy 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -28
- data/MIT-LICENSE +16 -17
- data/README.md +452 -388
- data/app/assets/images/dispatch_policy/logo-large.svg +9 -0
- data/app/assets/images/dispatch_policy/logo-small.svg +7 -0
- data/app/assets/javascripts/dispatch_policy/turbo.es2017-umd.min.js +35 -0
- data/app/assets/stylesheets/dispatch_policy/application.css +294 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/assets_controller.rb +31 -0
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -267
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +139 -223
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +164 -231
- data/config/routes.rb +21 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/assets.rb +38 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +5 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -41
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +61 -59
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +260 -256
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +72 -52
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +134 -42
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -76
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/db/migrate/20260425000001_add_duration_to_partition_observations.rb +0 -8
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -123
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
data/lib/dispatch_policy/serializer.rb
ADDED
@@ -0,0 +1,36 @@
+# frozen_string_literal: true
+
+require "json"
+
+module DispatchPolicy
+  module Serializer
+    module_function
+
+    # Serialize an ActiveJob instance for storage in staged_jobs.job_data.
+    # Returns a Ruby hash compatible with PostgreSQL jsonb (string keys).
+    def serialize(job)
+      job.serialize
+    end
+
+    # Deserialize stored job_data into a fresh ActiveJob instance ready
+    # to be enqueued via `#enqueue`.
+    def deserialize(payload)
+      job_class = payload["job_class"] || payload[:job_class]
+      raise InvalidPolicy, "missing job_class in stored payload" unless job_class
+
+      klass = job_class.constantize
+      klass.deserialize(payload)
+    end
+
+    def dump_jsonb(value)
+      JSON.dump(value)
+    end
+
+    def load_jsonb(text)
+      return text if text.is_a?(Hash) || text.is_a?(Array)
+      return {} if text.nil? || text == ""
+
+      JSON.parse(text)
+    end
+  end
+end
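For orientation, a minimal sketch of how the new Serializer round-trip could be used when a job is staged and later re-enqueued. `MyReportJob` and the calls around it are hypothetical; only `DispatchPolicy::Serializer.serialize` / `.deserialize` come from the file above, and they delegate to ActiveJob's own `#serialize` / `.deserialize`.

```ruby
# Hypothetical job class for illustration; any ActiveJob subclass works.
class MyReportJob < ActiveJob::Base
  def perform(account_id); end
end

job = MyReportJob.new(42)

# Staging side: ActiveJob's #serialize returns a string-keyed hash
# ("job_class", "job_id", "arguments", ...) suitable for a jsonb column.
payload = DispatchPolicy::Serializer.serialize(job)

# Admission side: rebuild a fresh instance from the stored payload and
# hand it to the real queue adapter.
DispatchPolicy::Serializer.deserialize(payload).enqueue
```

Because `serialize` returns a plain string-keyed hash, the payload can be written straight into the `staged_jobs.job_data` jsonb column and read back without a custom coder.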
data/lib/dispatch_policy/tick.rb
CHANGED
@@ -1,301 +1,305 @@
 # frozen_string_literal: true
 
 module DispatchPolicy
+  # One pass of admission for a single policy.
+  #
+  # Records a row in dispatch_policy_tick_samples at the end so the engine UI
+  # can show throughput, denial reasons, and tick duration without sampling
+  # on the read path.
   class Tick
-
+    Result = Struct.new(:partitions_seen, :jobs_admitted, keyword_init: true)
 
-
-
-
-    def self.run(policy_name: nil)
-      return 0 unless DispatchPolicy.enabled?
-
-      pending_enqueue = []
-
-      StagedJob.transaction do
-        active_policies(policy_name).each do |pname|
-          policy = lookup_policy(pname)
-          next unless policy
+    def self.run(policy_name:, shard: nil)
+      new(policy_name, shard: shard).call
+    end
 
-
-
+    def initialize(policy_name, shard: nil)
+      @policy_name = policy_name
+      @shard = shard
+      @policy = DispatchPolicy.registry.fetch(policy_name) || raise(InvalidPolicy, "unknown policy #{policy_name.inspect}")
+      @config = DispatchPolicy.config
+    end
 
-
-
-
+    def call
+      started_at = monotonic_now_ms
+      partitions_seen = 0
+      partitions_admitted = 0
+      partitions_denied = 0
+      jobs_admitted = 0
+      forward_failures = 0
+      denied_reasons = Hash.new(0)
+
+      partitions = Repository.claim_partitions(
+        policy_name: @policy_name,
+        shard: @shard,
+        limit: @config.partition_batch_size
+      )
 
-[… 21 removed lines not captured in this diff view …]
+      # Reorder by least-recent-admit-weighted (EWMA decayed_admits ASC)
+      # so under-admitted partitions get first crack at the tick budget.
+      # claim_partitions ALREADY enforced anti-stagnation via
+      # last_checked_at — every partition with pending is visited within
+      # ⌈active_partitions / partition_batch_size⌉ ticks regardless of
+      # decayed_admits. Reordering here only decides order *inside* this
+      # already-fair selection.
+      sort_partitions_for_fairness!(partitions)
+
+      # Per-partition fair share. When tick_admission_budget is set, we
+      # divide it evenly across the partitions we just claimed. Otherwise
+      # the legacy admission_batch_size is the per-partition ceiling.
+      #
+      # We deliberately do NOT clamp fair_share to a minimum of 1 when
+      # tick_cap < N. The hard global cap wins over a per-partition
+      # admit floor; partitions that don't admit this tick are still
+      # visited (last_checked_at bumped) and re-visited next tick when
+      # they'll be at the front of the in-tick decay order.
+      # Anti-stagnation comes from claim_partitions, not from forcing
+      # an admit on every claimed partition.
+      tick_cap = @policy.tick_admission_budget || @config.tick_admission_budget
+      per_part = @policy.admission_batch_size || @config.admission_batch_size
+      fair_share = if tick_cap && partitions.any?
+        (tick_cap.to_f / partitions.size).ceil
+      else
+        per_part
+      end
+
+      pending_denies = []
+      admitted_per_partition = Hash.new(0)
+      used = 0
+
+      partitions.each do |partition|
+        partitions_seen += 1
+
+        if tick_cap && used >= tick_cap
+          # Global cap exhausted in pass-1. The partition is still
+          # observed (claim_partitions bumped its last_checked_at), so
+          # the round-robin invariant for anti-stagnation holds; we
+          # just admit nothing this tick.
+          partitions_denied += 1
+          denied_reasons["tick_cap_exhausted"] += 1
+          # Push this partition to the deny path so its gate state
+          # still gets persisted — the pipeline already evaluated it
+          # in admit_partition... actually we haven't called admit yet.
+          # Skip: not adding to pending_denies because the pipeline
+          # didn't run, no gate_state_patch to flush.
+          next
         end
-      end
 
-[… 6 removed lines not captured in this diff view …]
+        budget_for_this = if tick_cap
+          [fair_share, tick_cap - used].min
+        else
+          fair_share
+        end
+        budget_for_this = 0 if budget_for_this.negative?
 
-
-
-        ThrottleBucket.where("tokens <= ? AND refilled_at < ?", THROTTLE_ZERO_THRESHOLD, cutoff).delete_all
-      end
+        outcome = admit_partition(partition, pending_denies, max_budget: budget_for_this)
+        admitted_per_partition[partition["partition_key"]] = outcome[:admitted]
 
-
-
-
-        policy = lookup_policy(policy_name)
-        next if policy && policy.gates.any? { |g| g.name == gate_name.to_sym }
+        jobs_admitted += outcome[:admitted]
+        forward_failures += outcome[:failures]
+        used += outcome[:admitted]
 
-
+        if outcome[:admitted].positive?
+          partitions_admitted += 1
+        else
+          partitions_denied += 1
+          outcome[:reasons].each { |r| denied_reasons[r] += 1 }
         end
       end
-    end
 
-[… 10 removed lines not captured in this diff view …]
-        partition_key
-
+      # Pass-2: redistribution. Pass-1 may have left budget unused if
+      # some partitions had less pending than their fair share. Walk the
+      # claimed partitions (still in decay-sorted order) and offer the
+      # leftover to whoever filled their fair share in pass-1 — a signal
+      # they had more pending than we let them admit.
+      if tick_cap
+        remaining = tick_cap - used
+        if remaining.positive?
+          partitions.each do |p|
+            break if remaining <= 0
+            next if admitted_per_partition[p["partition_key"]] < fair_share
+
+            extra_cap = [remaining, fair_share].min
+            outcome = admit_partition(p, pending_denies, max_budget: extra_cap)
+            jobs_admitted += outcome[:admitted]
+            forward_failures += outcome[:failures]
+            admitted_per_partition[p["partition_key"]] += outcome[:admitted]
+            remaining -= outcome[:admitted]
+          end
         end
-        staged.update!(lease_expires_at: nil, completed_at: Time.current)
       end
-      end
 
-
-
-
-        gate = policy&.gates&.find { |g| g.name == gate_name.to_sym }
-        next unless gate&.tracks_inflight?
-
-        PartitionInflightCount.decrement(
-          policy_name: policy_name,
-          gate_name: gate_name.to_s,
-          partition_key: partition_key.to_s
-        )
-      end
-    end
+      flush_denies!(pending_denies) if pending_denies.any?
+
+      duration_ms = monotonic_now_ms - started_at
 
-
-
+      record_sample!(
+        duration_ms: duration_ms,
+        partitions_seen: partitions_seen,
+        partitions_admitted: partitions_admitted,
+        partitions_denied: partitions_denied,
+        jobs_admitted: jobs_admitted,
+        forward_failures: forward_failures,
+        denied_reasons: denied_reasons
+      )
 
-
-        .where("not_before_at IS NULL OR not_before_at <= ?", Time.current)
-        .distinct
-        .pluck(:policy_name)
+      Result.new(partitions_seen: partitions_seen, jobs_admitted: jobs_admitted)
     end
 
-[… 9 removed lines not captured in this diff view …]
+    private
+
+    # In-place sort by current decayed_admits ASC, computed in Ruby from
+    # the row's stored decayed_admits + the elapsed time since
+    # decayed_admits_at. We do this here (rather than in the SQL of
+    # claim_partitions) because:
+    #
+    # - claim_partitions's ORDER BY is anti-stagnation (last_checked_at
+    #   NULLS FIRST); reordering there would bias selection itself,
+    #   reintroducing the stagnation risk.
+    # - The math is cheap on N ≤ partition_batch_size rows already in
+    #   memory.
+    def sort_partitions_for_fairness!(partitions)
+      half_life = @policy.fairness_half_life_seconds || @config.fairness_half_life_seconds
+      return partitions if half_life.nil? || half_life <= 0
+
+      tau = half_life.to_f / Math.log(2)
+      now = Time.current.to_f
+
+      partitions.sort_by! do |p|
+        last_t = decayed_admits_epoch(p["decayed_admits_at"]) || now
+        elapsed = [now - last_t, 0.0].max
+        (p["decayed_admits"] || 0.0).to_f * Math.exp(-elapsed / tau)
       end
     end
 
-    def
-[… 6 removed lines not captured in this diff view …]
-        .to_a
+    def decayed_admits_epoch(value)
+      return nil if value.nil?
+      return value.to_f if value.is_a?(Numeric)
+      return value.to_time.to_f if value.respond_to?(:to_time)
+      Time.parse(value.to_s).to_f
+    rescue ArgumentError, TypeError
+      nil
     end
 
-    def
-[… 16 removed lines not captured in this diff view …]
-          FROM dispatch_policy_staged_jobs
-          WHERE policy_name = ?
-            AND admitted_at IS NULL
-            AND round_robin_key = keys.round_robin_key
-            AND (not_before_at IS NULL OR not_before_at <= ?)
-          ORDER BY priority, staged_at
-          LIMIT ?
-          FOR UPDATE SKIP LOCKED
-        ) AS rows
-        LIMIT ?
-      SQL
-
-      batch = StagedJob.find_by_sql([ sql, policy.name, now, policy.name, now, quantum, batch_size ])
-
-      remaining = batch_size - batch.size
-      return batch if remaining <= 0
-
-      top_up = StagedJob.pending
-        .where(policy_name: policy.name)
-        .where("not_before_at IS NULL OR not_before_at <= ?", now)
-        .where.not(id: batch.map(&:id))
-        .order(:priority, :staged_at)
-        .limit(remaining)
-        .lock("FOR UPDATE SKIP LOCKED")
-        .to_a
-
-      batch + top_up
-    end
+    def admit_partition(partition, pending_denies, max_budget:)
+      ctx = Context.wrap(partition["context"])
+      pipe = Pipeline.new(@policy)
+      result = pipe.call(ctx, partition, max_budget)
+
+      # Pure-deny path (gate said no capacity for this partition this tick).
+      # Defer the partition state UPDATE to the bulk flush at the end of
+      # the tick instead of issuing a per-partition statement now.
+      if result.admit_count.zero?
+        pending_denies << {
+          policy_name: @policy_name,
+          partition_key: partition["partition_key"],
+          gate_state_patch: result.gate_state_patch,
+          retry_after: result.retry_after
+        }
+        return { admitted: 0, failures: 0, reasons: deduce_reasons(result) }
+      end
 
-[… 13 removed lines not captured in this diff view …]
-      now = Time.current
-
-      partitions = StagedJob.pending
-        .where(policy_name: policy.name)
-        .where("not_before_at IS NULL OR not_before_at <= ?", now)
-        .where.not(round_robin_key: nil)
-        .distinct
-        .pluck(:round_robin_key)
-
-      return fetch_plain_batch(policy) if partitions.empty?
-
-      consumed = PartitionObservation.consumed_ms_by_partition(
-        policy_name: policy.name,
-        partition_keys: partitions,
-        window: policy.round_robin_window
-      )
+      admitted = 0
+      half_life = @policy.fairness_half_life_seconds || @config.fairness_half_life_seconds
+
+      Repository.with_connection do
+        ActiveRecord::Base.transaction(requires_new: true) do
+          rows = Repository.claim_staged_jobs!(
+            policy_name: @policy_name,
+            partition_key: partition["partition_key"],
+            limit: result.admit_count,
+            gate_state_patch: result.gate_state_patch,
+            retry_after: result.retry_after,
+            half_life_seconds: half_life
+          )
 
-[… 10 removed lines not captured in this diff view …]
+          # `claim_staged_jobs!` always runs `record_partition_admit!` so
+          # the partition's counters and gate_state commit even when the
+          # actual DELETE returned zero rows (e.g. all staged rows are
+          # scheduled in the future, or another tick raced us to them).
+          next if rows.empty?
+
+          # Pre-insert an inflight row per admitted job so the concurrency
+          # gate sees them immediately. With a concurrency gate, use its
+          # (coarser) partition key so the gate's COUNT(*) keeps aggregating
+          # correctly across staged sub-partitions.
+          concurrency_gate = @policy.gates.find { |g| g.name == :concurrency }
+          inflight_rows = rows.filter_map do |row|
+            ajid = row.dig("job_data", "job_id")
+            next unless ajid
+
+            key = if concurrency_gate
+              concurrency_gate.inflight_partition_key(@policy_name, Context.wrap(row["context"]))
+            else
+              row["partition_key"]
+            end
+            { policy_name: @policy_name, partition_key: key, active_job_id: ajid }
+          end
+          Repository.insert_inflight!(inflight_rows) if inflight_rows.any?
+
+          # Re-enqueue to the real adapter *inside this transaction*. The
+          # adapter (good_job / solid_queue) shares ActiveRecord::Base's
+          # connection, so its INSERT into good_jobs / solid_queue_jobs
+          # participates in the same TX. If anything raises (deserialize,
+          # adapter error, network), the whole TX rolls back atomically:
+          # staged_jobs return, inflight rows vanish, partition counters
+          # revert, and the adapter rows are also reverted. This is the
+          # at-least-once guarantee — there is no window where staged is
+          # gone but the adapter never received the job.
+          Forwarder.dispatch(rows)
+          admitted = rows.size
+        end
       end
 
-[… 4 removed lines not captured in this diff view …]
-          .where("not_before_at IS NULL OR not_before_at <= ?", now)
-          .order(:priority, :staged_at)
-          .limit(quanta[key])
-          .lock("FOR UPDATE SKIP LOCKED")
-          .to_a
-        batch.concat(rows)
-        break if batch.size >= batch_size
+      if admitted.zero?
+        { admitted: 0, failures: 0, reasons: ["no_rows_claimed"] }
+      else
+        { admitted: admitted, failures: 0, reasons: [] }
       end
-[… 6 removed lines not captured in this diff view …]
-        .where("not_before_at IS NULL OR not_before_at <= ?", now)
-        .where.not(id: batch.map(&:id))
-        .order(:priority, :staged_at)
-        .limit(remaining)
-        .lock("FOR UPDATE SKIP LOCKED")
-        .to_a
-
-      batch + top_up
-    end
-
-    def self.lookup_policy(policy_name)
-      job_class = DispatchPolicy.registry[policy_name] || autoload_job_for(policy_name)
-      return nil unless job_class
-      job_class.resolved_dispatch_policy
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error(
+        "[dispatch_policy] forward failed for #{@policy_name}/#{partition['partition_key']}: " \
+        "#{e.class}: #{e.message}"
+      )
+      { admitted: 0, failures: 1, reasons: ["forward_failed"] }
     end
 
-    def
-
-
-      DispatchPolicy.
+    def flush_denies!(entries)
+      Repository.with_connection { Repository.bulk_record_partition_denies!(entries) }
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.error(
+        "[dispatch_policy] bulk_record_partition_denies failed: #{e.class}: #{e.message}"
+      )
     end
 
-[… 5 removed lines not captured in this diff view …]
+    # When admit_count was 0, the Pipeline's `reasons` array contains entries
+    # like "throttle:rate=0", "concurrency:concurrency_full". We strip the
+    # `gate:` prefix's value separator so callers see "throttle" / "concurrency_full".
+    def deduce_reasons(result)
+      reasons = result.reasons.map do |s|
+        gate, msg = s.split(":", 2)
+        msg.presence || gate
       end
+      reasons << "no_capacity" if reasons.empty?
+      reasons
+    end
 
-
-
-
-      partitions.each do |gate_name, partition_key|
-        gate = policy.gates.find { |g| g.name == gate_name.to_sym }
-        next unless gate&.tracks_inflight?
+    def record_sample!(**fields)
+      pending_total = DispatchPolicy::Partition.for_policy(@policy_name).sum(:pending_count)
+      inflight_total = DispatchPolicy::InflightJob.where(policy_name: @policy_name).count
 
-[… 8 removed lines not captured in this diff view …]
-        [ staged, job ]
-      end
+      Repository.record_tick_sample!(
+        policy_name: @policy_name,
+        pending_total: pending_total,
+        inflight_total: inflight_total,
+        **fields
+      )
+    rescue StandardError => e
+      DispatchPolicy.config.logger&.warn("[dispatch_policy] failed to record tick sample: #{e.class}: #{e.message}")
     end
 
-    def
-
-      release(policy_name: staged.policy_name, partitions: partitions)
-      staged.update_columns(
-        admitted_at: nil,
-        lease_expires_at: nil,
-        active_job_id: nil,
-        partitions: {}
-      )
+    def monotonic_now_ms
+      (Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1000).to_i
     end
   end
 end
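To make the fairness arithmetic in `call` and `sort_partitions_for_fairness!` concrete, here is a small standalone Ruby sketch of the same math with invented numbers: stored admit counts decay exponentially with half-life `fairness_half_life_seconds` (so `tau = half_life / ln 2`), partitions are walked in ascending decayed order, and a `tick_admission_budget` of 10 across 3 claimed partitions yields a pass-1 fair share of `ceil(10 / 3) = 4`. The partition rows below are made up for illustration; only the formulas mirror the diff.

```ruby
half_life = 60.0                    # assumed fairness_half_life_seconds
tau       = half_life / Math.log(2) # same conversion the diff uses (~86.6 s)
now       = Time.now.to_f

# Invented rows shaped roughly like claim_partitions output.
partitions = [
  { "partition_key" => "tenant_a", "decayed_admits" => 8.0, "decayed_admits_at" => now - 30 },
  { "partition_key" => "tenant_b", "decayed_admits" => 2.0, "decayed_admits_at" => now - 5 },
  { "partition_key" => "tenant_c", "decayed_admits" => 0.0, "decayed_admits_at" => nil }
]

# Current decayed score = stored value * exp(-elapsed / tau):
#   tenant_a: 8.0 * exp(-30 / 86.6) ≈ 5.66
#   tenant_b: 2.0 * exp(-5 / 86.6)  ≈ 1.89
#   tenant_c: 0.0
partitions.sort_by! do |p|
  last_t  = p["decayed_admits_at"] || now
  elapsed = [now - last_t, 0.0].max
  (p["decayed_admits"] || 0.0) * Math.exp(-elapsed / tau)
end
# Visit order: tenant_c, tenant_b, tenant_a (least recently admitted first).

# Pass-1 fair share for a global budget of 10 jobs across 3 partitions:
tick_cap   = 10
fair_share = (tick_cap.to_f / partitions.size).ceil # => 4
# Each partition admits up to [fair_share, tick_cap - used].min; leftover
# budget is re-offered in pass-2 to partitions that filled their share.
```

The tick then visits partitions in that order, so a tenant that has not been admitted recently is served before one that just drained a burst.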