dispatch_policy 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +185 -0
- data/README.md +30 -7
- data/app/controllers/dispatch_policy/application_controller.rb +21 -2
- data/app/controllers/dispatch_policy/dashboard_controller.rb +3 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +51 -15
- data/app/controllers/dispatch_policy/policies_controller.rb +26 -4
- data/app/models/dispatch_policy/policy_setting.rb +14 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +6 -1
- data/app/views/dispatch_policy/partitions/index.html.erb +1 -1
- data/app/views/dispatch_policy/partitions/show.html.erb +1 -1
- data/app/views/dispatch_policy/policies/index.html.erb +11 -3
- data/app/views/dispatch_policy/policies/show.html.erb +13 -4
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +9 -2
- data/app/views/layouts/dispatch_policy/application.html.erb +21 -25
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +13 -0
- data/lib/dispatch_policy/config.rb +5 -0
- data/lib/dispatch_policy/context.rb +12 -2
- data/lib/dispatch_policy/cursor_pagination.rb +24 -7
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +14 -0
- data/lib/dispatch_policy/gates/concurrency.rb +4 -0
- data/lib/dispatch_policy/gates/throttle.rb +36 -9
- data/lib/dispatch_policy/inflight_tracker.rb +72 -26
- data/lib/dispatch_policy/job_extension.rb +33 -9
- data/lib/dispatch_policy/manual_admission.rb +18 -0
- data/lib/dispatch_policy/operator_hints.rb +14 -0
- data/lib/dispatch_policy/policy.rb +12 -0
- data/lib/dispatch_policy/policy_dsl.rb +10 -2
- data/lib/dispatch_policy/railtie.rb +10 -0
- data/lib/dispatch_policy/registry.rb +8 -4
- data/lib/dispatch_policy/repository.rb +102 -30
- data/lib/dispatch_policy/tick.rb +18 -2
- data/lib/dispatch_policy/tick_loop.rb +15 -7
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +9 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +30 -2
- metadata +2 -1
|
@@ -1,9 +1,16 @@
|
|
|
1
|
-
|
|
1
|
+
<% policy_paused = local_assigns.fetch(:policy_paused, false) %>
|
|
2
|
+
<tr id="partition_<%= partition.id %>" class="<%= "dp-row-paused" if partition.paused? || policy_paused %>">
|
|
2
3
|
<td><%= link_to partition.policy_name, policy_path(partition.policy_name), class: "dp-link" %></td>
|
|
3
4
|
<td><code><%= partition.shard %></code></td>
|
|
4
5
|
<td><%= partition.queue_name || "—" %></td>
|
|
5
6
|
<td><%= link_to partition.partition_key, partition_path(partition), class: "dp-link" %></td>
|
|
6
|
-
<td
|
|
7
|
+
<td>
|
|
8
|
+
<% if policy_paused && !partition.paused? %>
|
|
9
|
+
<%= partition.status %> <span class="dp-warn" style="font-size:11px;">(policy paused)</span>
|
|
10
|
+
<% else %>
|
|
11
|
+
<%= partition.status %>
|
|
12
|
+
<% end %>
|
|
13
|
+
</td>
|
|
7
14
|
<td class="dp-num"><%= format_count(partition.pending_count) %></td>
|
|
8
15
|
<td class="dp-num"><%= format_count(partition.total_admitted) %></td>
|
|
9
16
|
<td><%= format_time(partition.next_eligible_at) %></td>
|
|
@@ -70,20 +70,19 @@
|
|
|
70
70
|
});
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
}
|
|
73
|
+
// Delegate on document, registered ONCE. Per-button listeners with a
|
|
74
|
+
// data-bound guard leaked: Turbo's morph refresh preserves the button
|
|
75
|
+
// node but drops the data-bound attribute (it's not in the server
|
|
76
|
+
// HTML), so every turbo:load re-bound and clicks fired N+1 times.
|
|
77
|
+
document.addEventListener("click", function (e) {
|
|
78
|
+
var btn = e.target.closest("[data-dp-refresh]");
|
|
79
|
+
if (!btn) return;
|
|
80
|
+
e.preventDefault();
|
|
81
|
+
setInterval(parseInt(btn.getAttribute("data-dp-refresh"), 10));
|
|
82
|
+
});
|
|
84
83
|
|
|
85
84
|
function init() {
|
|
86
|
-
|
|
85
|
+
syncControls();
|
|
87
86
|
restart();
|
|
88
87
|
}
|
|
89
88
|
|
|
@@ -140,20 +139,17 @@
|
|
|
140
139
|
});
|
|
141
140
|
}
|
|
142
141
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
});
|
|
152
|
-
syncControls();
|
|
153
|
-
}
|
|
142
|
+
// Delegate on document, registered ONCE (see the refresh control
|
|
143
|
+
// above for why per-button binding leaks under Turbo morph).
|
|
144
|
+
document.addEventListener("click", function (e) {
|
|
145
|
+
var btn = e.target.closest("[data-dp-theme]");
|
|
146
|
+
if (!btn) return;
|
|
147
|
+
e.preventDefault();
|
|
148
|
+
setTheme(btn.getAttribute("data-dp-theme"));
|
|
149
|
+
});
|
|
154
150
|
|
|
155
|
-
document.addEventListener("DOMContentLoaded",
|
|
156
|
-
document.addEventListener("turbo:load",
|
|
151
|
+
document.addEventListener("DOMContentLoaded", syncControls);
|
|
152
|
+
document.addEventListener("turbo:load", syncControls);
|
|
157
153
|
})();
|
|
158
154
|
</script>
|
|
159
155
|
</head>
|
|
@@ -99,5 +99,18 @@ class CreateDispatchPolicyTables < ActiveRecord::Migration[7.1]
|
|
|
99
99
|
[:policy_name, :partition_key],
|
|
100
100
|
unique: true,
|
|
101
101
|
name: "idx_dp_adaptive_concurrency_lookup"
|
|
102
|
+
|
|
103
|
+
# Policy-level settings. Currently just the pause flag: a paused policy
|
|
104
|
+
# admits nothing, INCLUDING partitions created after the pause (the
|
|
105
|
+
# tick's claim checks this row). The partitions' own `status` is a
|
|
106
|
+
# per-partition display concern; this is the policy-wide source of truth.
|
|
107
|
+
create_table :dispatch_policy_policy_settings do |t|
|
|
108
|
+
t.string :policy_name, null: false
|
|
109
|
+
t.boolean :paused, null: false, default: false
|
|
110
|
+
t.timestamps
|
|
111
|
+
end
|
|
112
|
+
add_index :dispatch_policy_policy_settings, :policy_name,
|
|
113
|
+
unique: true,
|
|
114
|
+
name: "idx_dp_policy_settings_lookup"
|
|
102
115
|
end
|
|
103
116
|
end
|
|
@@ -51,6 +51,11 @@ module DispatchPolicy
|
|
|
51
51
|
# assuming the admission was lost. Raise it if your adapter backlog
|
|
52
52
|
# can exceed an hour.
|
|
53
53
|
@inflight_queued_stale_after = 60 * 60
|
|
54
|
+
# Seconds between heartbeat_at refreshes. Each beat briefly checks out
|
|
55
|
+
# an EXTRA connection (one per running job) from the role's pool, so
|
|
56
|
+
# the DB pool needs headroom above the worker concurrency — otherwise
|
|
57
|
+
# beats hit ConnectionTimeoutError and long jobs get swept as stale.
|
|
58
|
+
# Set to 0 to disable the heartbeat thread entirely.
|
|
54
59
|
@inflight_heartbeat_interval = 30
|
|
55
60
|
@real_adapter = nil
|
|
56
61
|
@logger = nil
|
|
@@ -19,7 +19,7 @@ module DispatchPolicy
|
|
|
19
19
|
end
|
|
20
20
|
|
|
21
21
|
def [](key)
|
|
22
|
-
@data[key.to_s]
|
|
22
|
+
indifferent(@data[key.to_s])
|
|
23
23
|
end
|
|
24
24
|
|
|
25
25
|
def to_h
|
|
@@ -31,11 +31,21 @@ module DispatchPolicy
|
|
|
31
31
|
end
|
|
32
32
|
|
|
33
33
|
def fetch(key, *args, &block)
|
|
34
|
-
@data.fetch(key.to_s, *args, &block)
|
|
34
|
+
indifferent(@data.fetch(key.to_s, *args, &block))
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
private
|
|
38
38
|
|
|
39
|
+
# Nested hashes are stored string-keyed (deep_stringify), so
|
|
40
|
+
# `ctx[:limits][:max]` would miss — the inner lookup is by symbol.
|
|
41
|
+
# Return nested hashes with indifferent access so symbol and string
|
|
42
|
+
# keys work at every depth, matching how host apps usually write
|
|
43
|
+
# context. to_h/to_jsonb still return the plain string-keyed hash for
|
|
44
|
+
# storage, untouched.
|
|
45
|
+
def indifferent(value)
|
|
46
|
+
value.is_a?(Hash) ? value.with_indifferent_access : value
|
|
47
|
+
end
|
|
48
|
+
|
|
39
49
|
def deep_stringify(value)
|
|
40
50
|
case value
|
|
41
51
|
when Hash
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require "base64"
|
|
4
4
|
require "json"
|
|
5
|
+
require "time"
|
|
5
6
|
|
|
6
7
|
module DispatchPolicy
|
|
7
8
|
# Tiny keyset-pagination helper for the engine UI. Each sort mode declares
|
|
@@ -86,13 +87,22 @@ module DispatchPolicy
|
|
|
86
87
|
|
|
87
88
|
value, last_id = cursor
|
|
88
89
|
# Ignore a cursor whose value type can't be compared against this
|
|
89
|
-
# sort's column.
|
|
90
|
-
#
|
|
91
|
-
#
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
numeric_column
|
|
95
|
-
|
|
90
|
+
# sort's column. A mismatch — e.g. a numeric value forged for a
|
|
91
|
+
# timestamp sort — would raise a PG error; instead we fall back to the
|
|
92
|
+
# first page.
|
|
93
|
+
numeric_column = %w[pending_count total_admitted].include?(sort[:cursor_sql])
|
|
94
|
+
timestamp_column = sort[:cursor_sql].start_with?("COALESCE(")
|
|
95
|
+
if numeric_column
|
|
96
|
+
return scope unless value.is_a?(Numeric)
|
|
97
|
+
elsif timestamp_column
|
|
98
|
+
# Bound against a timestamp column: a non-parseable string (e.g. a
|
|
99
|
+
# hand-forged "zzz") would raise `invalid input syntax for type
|
|
100
|
+
# timestamp` and 500. Require a real ISO8601 value — exactly what
|
|
101
|
+
# #extract emits — or fall back to the first page.
|
|
102
|
+
return scope unless value.is_a?(String) && parseable_timestamp?(value)
|
|
103
|
+
else
|
|
104
|
+
return scope unless value.is_a?(String)
|
|
105
|
+
end
|
|
96
106
|
|
|
97
107
|
case sort[:direction]
|
|
98
108
|
when :desc
|
|
@@ -133,5 +143,12 @@ module DispatchPolicy
|
|
|
133
143
|
else v
|
|
134
144
|
end
|
|
135
145
|
end
|
|
146
|
+
|
|
147
|
+
def parseable_timestamp?(str)
|
|
148
|
+
Time.iso8601(str)
|
|
149
|
+
true
|
|
150
|
+
rescue ArgumentError, TypeError
|
|
151
|
+
false
|
|
152
|
+
end
|
|
136
153
|
end
|
|
137
154
|
end
|
|
@@ -45,6 +45,20 @@ module DispatchPolicy
|
|
|
45
45
|
raise ArgumentError, "target_lag_ms must be > 0" unless @target_lag_ms.positive?
|
|
46
46
|
raise ArgumentError, "min must be >= 1" unless @min >= 1
|
|
47
47
|
raise ArgumentError, "initial_max must be >= min" unless @initial_max >= @min
|
|
48
|
+
# Out-of-range tuning knobs invert the AIMD loop instead of erroring:
|
|
49
|
+
# alpha=0 freezes the EWMA at its seed so the cap grows unbounded;
|
|
50
|
+
# a decrease factor >= 1 turns the multiplicative *decrease* into an
|
|
51
|
+
# increase, a positive-feedback loop under failure/overload.
|
|
52
|
+
unless @ewma_alpha > 0 && @ewma_alpha <= 1
|
|
53
|
+
raise ArgumentError, "ewma_alpha must be in (0, 1]"
|
|
54
|
+
end
|
|
55
|
+
unless @fail_factor > 0 && @fail_factor < 1
|
|
56
|
+
raise ArgumentError, "failure_decrease_factor must be in (0, 1)"
|
|
57
|
+
end
|
|
58
|
+
unless @slow_factor > 0 && @slow_factor < 1
|
|
59
|
+
raise ArgumentError, "overload_decrease_factor must be in (0, 1)"
|
|
60
|
+
end
|
|
61
|
+
raise ArgumentError, "full_backoff must be >= 0" if @full_backoff.negative?
|
|
48
62
|
end
|
|
49
63
|
|
|
50
64
|
def name
|
|
@@ -17,6 +17,10 @@ module DispatchPolicy
|
|
|
17
17
|
super()
|
|
18
18
|
@max_proc = max.respond_to?(:call) ? max : ->(_ctx) { max }
|
|
19
19
|
@full_backoff = full_backoff.to_f
|
|
20
|
+
# A negative backoff sets next_eligible_at in the past, so the
|
|
21
|
+
# partition is re-evaluated (COUNT(*)) every tick — the opposite of
|
|
22
|
+
# what full_backoff is for.
|
|
23
|
+
raise ArgumentError, "full_backoff must be >= 0" if @full_backoff.negative?
|
|
20
24
|
end
|
|
21
25
|
|
|
22
26
|
def name
|
|
@@ -14,13 +14,20 @@ module DispatchPolicy
|
|
|
14
14
|
# The bucket lives on the staged partition row — one row per
|
|
15
15
|
# `policy.partition_for(ctx)` value, one bucket per row, no dilution.
|
|
16
16
|
class Throttle < Gate
|
|
17
|
-
attr_reader :rate_proc, :
|
|
17
|
+
attr_reader :rate_proc, :per_proc
|
|
18
18
|
|
|
19
19
|
def initialize(rate:, per:)
|
|
20
20
|
super()
|
|
21
21
|
@rate_proc = rate.respond_to?(:call) ? rate : ->(_ctx) { rate }
|
|
22
|
-
|
|
23
|
-
|
|
22
|
+
if per.respond_to?(:call)
|
|
23
|
+
# Dynamic window (per-ctx), symmetric with a dynamic rate. Validated
|
|
24
|
+
# per-evaluate since the value isn't known until admission time.
|
|
25
|
+
@per_proc = ->(ctx) { duration_seconds(per.call(ctx)) }
|
|
26
|
+
else
|
|
27
|
+
fixed = duration_seconds(per)
|
|
28
|
+
raise ArgumentError, "throttle :per must be > 0 (got #{fixed})" unless fixed.positive?
|
|
29
|
+
@per_proc = ->(_ctx) { fixed }
|
|
30
|
+
end
|
|
24
31
|
end
|
|
25
32
|
|
|
26
33
|
def name
|
|
@@ -28,16 +35,27 @@ module DispatchPolicy
|
|
|
28
35
|
end
|
|
29
36
|
|
|
30
37
|
def evaluate(ctx, partition, admit_budget)
|
|
31
|
-
|
|
32
|
-
|
|
38
|
+
per = per_for(ctx)
|
|
39
|
+
rate = rate_for(ctx)
|
|
40
|
+
# rate <= 0 (e.g. a paused tenant) backs off for one window instead
|
|
41
|
+
# of denying with a NULL retry_after. A NULL retry_after leaves the
|
|
42
|
+
# partition immediately eligible, so it would be re-claimed and
|
|
43
|
+
# re-evaluated every single tick — a busy-loop that also clobbers any
|
|
44
|
+
# backoff a prior tick had set.
|
|
45
|
+
return Decision.deny(retry_after: per, reason: "rate=0") if rate <= 0
|
|
33
46
|
|
|
34
|
-
|
|
47
|
+
# The bucket holds at least one whole token; otherwise a sub-unit rate
|
|
48
|
+
# (e.g. rate: 0.5) could never accumulate a full token and would never
|
|
49
|
+
# admit. refill_rate stays at the true `rate` so the long-run pace is
|
|
50
|
+
# exact — the floor only sets the burst ceiling.
|
|
51
|
+
capacity = [rate, 1.0].max
|
|
52
|
+
refill_rate = rate / per
|
|
35
53
|
state = (partition["gate_state"] || {})["throttle"] || {}
|
|
36
54
|
tokens = (state["tokens"] || capacity).to_f
|
|
37
55
|
refilled_at = (state["refilled_at"] || now).to_f
|
|
38
56
|
|
|
39
57
|
elapsed = [now - refilled_at, 0.0].max
|
|
40
|
-
tokens = [tokens + (elapsed * refill_rate), capacity
|
|
58
|
+
tokens = [tokens + (elapsed * refill_rate), capacity].min
|
|
41
59
|
|
|
42
60
|
# The patch records the post-refill bucket WITHOUT deducting yet.
|
|
43
61
|
# The actual deduction is deferred to #consume, which runs once
|
|
@@ -77,9 +95,18 @@ module DispatchPolicy
|
|
|
77
95
|
|
|
78
96
|
private
|
|
79
97
|
|
|
80
|
-
def
|
|
98
|
+
def per_for(ctx)
|
|
99
|
+
value = @per_proc.call(ctx)
|
|
100
|
+
raise ArgumentError, "throttle :per must be > 0 (got #{value})" unless value.positive?
|
|
101
|
+
value
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def rate_for(ctx)
|
|
81
105
|
value = @rate_proc.call(ctx)
|
|
82
|
-
|
|
106
|
+
# Float, not Integer: a fractional rate (e.g. 2.5/sec) must keep its
|
|
107
|
+
# fractional part or the bucket systematically under-admits by
|
|
108
|
+
# truncating every refill. nil means "no rate configured" → deny.
|
|
109
|
+
value.nil? ? 0.0 : Float(value)
|
|
83
110
|
end
|
|
84
111
|
|
|
85
112
|
def now
|
|
@@ -39,33 +39,45 @@ module DispatchPolicy
|
|
|
39
39
|
ctx = policy.build_context(job.arguments, queue_name: queue_name)
|
|
40
40
|
partition_key = policy.partition_key_for(ctx)
|
|
41
41
|
|
|
42
|
-
Repository.insert_inflight!([{
|
|
43
|
-
policy_name: policy.name,
|
|
44
|
-
partition_key: partition_key,
|
|
45
|
-
active_job_id: job.job_id
|
|
46
|
-
}])
|
|
47
|
-
|
|
48
42
|
adaptive_gates = policy.gates.select { |g| g.name == :adaptive_concurrency }
|
|
49
|
-
admitted_at =
|
|
50
|
-
perform_start =
|
|
43
|
+
admitted_at = nil
|
|
44
|
+
perform_start = nil
|
|
45
|
+
heartbeat = nil
|
|
46
|
+
started = false
|
|
47
|
+
succeeded = false
|
|
48
|
+
|
|
49
|
+
# insert + heartbeat spawn live INSIDE the begin so the ensure always
|
|
50
|
+
# cleans up: if start_heartbeat (Thread.new) raises after the row is
|
|
51
|
+
# inserted, the row would otherwise orphan until the stale sweeper.
|
|
52
|
+
begin
|
|
53
|
+
Repository.insert_inflight!([{
|
|
54
|
+
policy_name: policy.name,
|
|
55
|
+
partition_key: partition_key,
|
|
56
|
+
active_job_id: job.job_id
|
|
57
|
+
}])
|
|
51
58
|
|
|
52
|
-
|
|
59
|
+
admitted_at = adaptive_gates.any? ? lookup_admitted_at(job.job_id) : nil
|
|
60
|
+
perform_start = Time.current
|
|
61
|
+
heartbeat = start_heartbeat(job.job_id)
|
|
53
62
|
|
|
54
|
-
|
|
55
|
-
begin
|
|
63
|
+
started = true
|
|
56
64
|
yield
|
|
57
65
|
succeeded = true
|
|
58
66
|
ensure
|
|
59
67
|
stop_heartbeat(heartbeat)
|
|
60
68
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
+
# Only record an observation if we actually reached perform — a
|
|
70
|
+
# failure in setup (insert / heartbeat spawn) isn't a perform result.
|
|
71
|
+
if started
|
|
72
|
+
record_adaptive_observations(
|
|
73
|
+
policy: policy,
|
|
74
|
+
gates: adaptive_gates,
|
|
75
|
+
partition_key: partition_key,
|
|
76
|
+
admitted_at: admitted_at,
|
|
77
|
+
perform_start: perform_start,
|
|
78
|
+
succeeded: succeeded
|
|
79
|
+
)
|
|
80
|
+
end
|
|
69
81
|
|
|
70
82
|
begin
|
|
71
83
|
Repository.delete_inflight!(active_job_id: job.job_id)
|
|
@@ -75,17 +87,43 @@ module DispatchPolicy
|
|
|
75
87
|
end
|
|
76
88
|
end
|
|
77
89
|
|
|
90
|
+
# Deletes the inflight row for a job that ActiveJob discarded BEFORE
|
|
91
|
+
# around_perform ran — most commonly an ActiveJob::DeserializationError
|
|
92
|
+
# (a GlobalID whose record was deleted) on a job with
|
|
93
|
+
# `discard_on ActiveJob::DeserializationError`. Argument deserialization
|
|
94
|
+
# happens before the perform callbacks, so track's `ensure` never runs
|
|
95
|
+
# and the row the Tick pre-inserted would otherwise sit until the
|
|
96
|
+
# `inflight_queued_stale_after` sweeper reaps it (default 1h), holding a
|
|
97
|
+
# concurrency slot the whole time. Wired to the `discard.active_job`
|
|
98
|
+
# notification by the railtie. Idempotent: a no-op when no row exists
|
|
99
|
+
# (e.g. discard fired after track already deleted it).
|
|
100
|
+
def self.handle_discard(job)
|
|
101
|
+
return unless job
|
|
102
|
+
return unless job.class.respond_to?(:dispatch_policy_name) && job.class.dispatch_policy_name
|
|
103
|
+
|
|
104
|
+
Repository.delete_inflight!(active_job_id: job.job_id)
|
|
105
|
+
rescue StandardError => e
|
|
106
|
+
DispatchPolicy.config.logger&.warn(
|
|
107
|
+
"[dispatch_policy] failed to clean up inflight row for discarded job #{job&.job_id}: #{e.class}: #{e.message}"
|
|
108
|
+
)
|
|
109
|
+
end
|
|
110
|
+
|
|
78
111
|
# Reads the admitted_at column from the inflight row that the Tick
|
|
79
112
|
# pre-inserted. Used as the start-of-queue-wait reference for the
|
|
80
113
|
# adaptive_concurrency feedback signal (queue_lag = perform_start
|
|
81
114
|
# - admitted_at). nil if the row vanished or the lookup fails —
|
|
82
115
|
# the observation is then skipped.
|
|
83
116
|
def self.lookup_admitted_at(active_job_id)
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
117
|
+
# Route through config.database_role: the inflight row lives in the
|
|
118
|
+
# same DB the Tick pre-inserted it into, which under multi-DB is the
|
|
119
|
+
# queue DB, not the default writing role of the worker process.
|
|
120
|
+
result = Repository.with_connection do
|
|
121
|
+
ActiveRecord::Base.connection.exec_query(
|
|
122
|
+
"SELECT admitted_at FROM dispatch_policy_inflight_jobs WHERE active_job_id = $1 LIMIT 1",
|
|
123
|
+
"lookup_admitted_at",
|
|
124
|
+
[active_job_id]
|
|
125
|
+
)
|
|
126
|
+
end
|
|
89
127
|
row = result.first
|
|
90
128
|
return nil unless row
|
|
91
129
|
ts = row["admitted_at"]
|
|
@@ -147,8 +185,16 @@ module DispatchPolicy
|
|
|
147
185
|
break if stop_flag.true?
|
|
148
186
|
|
|
149
187
|
begin
|
|
150
|
-
|
|
151
|
-
|
|
188
|
+
# Establish config.database_role inside this thread BEFORE the
|
|
189
|
+
# checkout: under multi-DB, connection_pool must resolve to the
|
|
190
|
+
# role's pool (where the inflight row lives), not the default
|
|
191
|
+
# writing pool. with_connection swaps the role thread-locally;
|
|
192
|
+
# the nested pool checkout then borrows/returns a dedicated
|
|
193
|
+
# connection from that pool per beat.
|
|
194
|
+
Repository.with_connection do
|
|
195
|
+
ActiveRecord::Base.connection_pool.with_connection do
|
|
196
|
+
Repository.heartbeat_inflight!(active_job_id: active_job_id)
|
|
197
|
+
end
|
|
152
198
|
end
|
|
153
199
|
rescue StandardError => e
|
|
154
200
|
DispatchPolicy.config.logger&.warn("[dispatch_policy] heartbeat #{active_job_id} failed: #{e.class}: #{e.message}")
|
|
@@ -71,6 +71,25 @@ module DispatchPolicy
|
|
|
71
71
|
(job.respond_to?(:executions) ? job.executions.to_i : 0).positive?
|
|
72
72
|
end
|
|
73
73
|
|
|
74
|
+
# Whether a job should be staged through admission control rather than
|
|
75
|
+
# handed straight to the adapter. Mirrors the single-enqueue decision in
|
|
76
|
+
# around_enqueue_for: it needs a registered policy and must not be a
|
|
77
|
+
# retry on a bypass_retries policy. Used by the bulk path to split jobs
|
|
78
|
+
# so the ones we don't own fall through to the adapter instead of being
|
|
79
|
+
# silently dropped.
|
|
80
|
+
def self.stageable?(job)
|
|
81
|
+
return false unless job.class.respond_to?(:dispatch_policy_name)
|
|
82
|
+
|
|
83
|
+
name = job.class.dispatch_policy_name
|
|
84
|
+
return false unless name
|
|
85
|
+
|
|
86
|
+
policy = DispatchPolicy.registry.fetch(name)
|
|
87
|
+
return false unless policy
|
|
88
|
+
return false if retry_attempt?(job) && policy.bypass_retries?
|
|
89
|
+
|
|
90
|
+
true
|
|
91
|
+
end
|
|
92
|
+
|
|
74
93
|
def self.scheduled_time(job)
|
|
75
94
|
ts = job.scheduled_at
|
|
76
95
|
return nil if ts.nil?
|
|
@@ -111,17 +130,19 @@ module DispatchPolicy
|
|
|
111
130
|
return super unless DispatchPolicy.config.enabled
|
|
112
131
|
return super unless DispatchPolicy.registry.size.positive?
|
|
113
132
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
133
|
+
# Split exactly like the single path decides: jobs we own get staged,
|
|
134
|
+
# everything else (no policy, unregistered policy name, or a retry on
|
|
135
|
+
# a bypass_retries policy) goes straight to the adapter. Dropping them
|
|
136
|
+
# — as a `next unless policy` inside the row builder would — silently
|
|
137
|
+
# loses jobs the caller expected to be enqueued.
|
|
138
|
+
to_stage, to_adapter = flat.partition { |job| JobExtension.stageable?(job) }
|
|
117
139
|
|
|
118
|
-
super(
|
|
140
|
+
super(to_adapter) if to_adapter.any?
|
|
119
141
|
|
|
120
|
-
return nil if
|
|
142
|
+
return nil if to_stage.empty?
|
|
121
143
|
|
|
122
|
-
rows =
|
|
144
|
+
rows = to_stage.map do |job|
|
|
123
145
|
policy = DispatchPolicy.registry.fetch(job.class.dispatch_policy_name)
|
|
124
|
-
next unless policy
|
|
125
146
|
|
|
126
147
|
# See JobExtension.ensure_arguments_materialized! — we need this
|
|
127
148
|
# for the same reason as the single-enqueue path.
|
|
@@ -132,7 +153,6 @@ module DispatchPolicy
|
|
|
132
153
|
partition_key = policy.partition_key_for(ctx)
|
|
133
154
|
shard = policy.shard_for(ctx)
|
|
134
155
|
payload = Serializer.serialize(job)
|
|
135
|
-
job.successfully_enqueued = true
|
|
136
156
|
|
|
137
157
|
{
|
|
138
158
|
policy_name: policy.name,
|
|
@@ -147,7 +167,11 @@ module DispatchPolicy
|
|
|
147
167
|
}
|
|
148
168
|
end
|
|
149
169
|
|
|
150
|
-
|
|
170
|
+
# Only mark enqueued AFTER the INSERT commits. If stage_many! raises,
|
|
171
|
+
# a caller that rescues and inspects successfully_enqueued? must not
|
|
172
|
+
# be told the jobs were enqueued when they weren't.
|
|
173
|
+
Repository.stage_many!(rows)
|
|
174
|
+
to_stage.each { |job| job.successfully_enqueued = true }
|
|
151
175
|
nil # ActiveJob.perform_all_later contract returns nil
|
|
152
176
|
end
|
|
153
177
|
end
|
|
@@ -39,6 +39,24 @@ module DispatchPolicy
|
|
|
39
39
|
next if rows.empty?
|
|
40
40
|
|
|
41
41
|
rows.each { |row| row["job_data"]["job_id"] = SecureRandom.uuid }
|
|
42
|
+
|
|
43
|
+
# Pre-insert an inflight row per admitted job, exactly like
|
|
44
|
+
# Tick#admit_partition does. Without it the concurrency gate's
|
|
45
|
+
# COUNT(*) misses these jobs until each one starts performing and
|
|
46
|
+
# InflightTracker.track inserts its own row — an over-admission
|
|
47
|
+
# window proportional to how many jobs were force-admitted. The
|
|
48
|
+
# key is the canonical partition value, which for a policy-level
|
|
49
|
+
# partition_by is exactly the staged partition_key (see
|
|
50
|
+
# Concurrency#inflight_partition_key). Runs inside the same TX, so
|
|
51
|
+
# a rolled-back claim takes the inflight rows with it.
|
|
52
|
+
inflight_rows = rows.filter_map do |row|
|
|
53
|
+
ajid = row.dig("job_data", "job_id")
|
|
54
|
+
next unless ajid
|
|
55
|
+
|
|
56
|
+
{ policy_name: policy_name, partition_key: partition_key, active_job_id: ajid }
|
|
57
|
+
end
|
|
58
|
+
Repository.insert_inflight!(inflight_rows) if inflight_rows.any?
|
|
59
|
+
|
|
42
60
|
Forwarder.dispatch(rows)
|
|
43
61
|
forwarded = rows.size
|
|
44
62
|
end
|
|
@@ -29,10 +29,24 @@ module DispatchPolicy
|
|
|
29
29
|
# in_backoff: int
|
|
30
30
|
# total_partitions: int
|
|
31
31
|
# adapter_target_jps: int|nil (config.adapter_throughput_target)
|
|
32
|
+
# paused: bool (policy-level pause flag)
|
|
32
33
|
def for(metrics)
|
|
33
34
|
hints = []
|
|
34
35
|
m = metrics
|
|
35
36
|
|
|
37
|
+
# ---- policy paused: everything below presumes admission SHOULD be
|
|
38
|
+
# flowing (never_checked, drain time, pending growing), so during a
|
|
39
|
+
# deliberate pause those hints turn into false alarms — e.g.
|
|
40
|
+
# "increase partition_batch_size" while the tick is intentionally
|
|
41
|
+
# skipping the policy. State the pause and stop.
|
|
42
|
+
if m[:paused]
|
|
43
|
+
return [Hint.new(
|
|
44
|
+
level: :warn,
|
|
45
|
+
message: "Policy is paused — admission is stopped while staging continues " \
|
|
46
|
+
"(pending keeps growing). Resume to drain."
|
|
47
|
+
)]
|
|
48
|
+
end
|
|
49
|
+
|
|
36
50
|
# ---- tick approaching deadline ---------------------------------
|
|
37
51
|
if m[:tick_max_duration_ms].to_i.positive? && m[:avg_tick_ms].to_i.positive?
|
|
38
52
|
ratio = m[:avg_tick_ms].to_f / m[:tick_max_duration_ms]
|
|
@@ -78,6 +78,18 @@ module DispatchPolicy
|
|
|
78
78
|
unless %i[restage bypass].include?(@retry_strategy)
|
|
79
79
|
raise InvalidPolicy, "retry_strategy must be :restage or :bypass"
|
|
80
80
|
end
|
|
81
|
+
# Two gates of the same type would persist their state under the same
|
|
82
|
+
# gate_state key (e.g. both throttles share gate_state["throttle"]),
|
|
83
|
+
# silently corrupting each other: the merged patch keeps only the last
|
|
84
|
+
# gate's bucket, and on the next tick the other gate clamps that count
|
|
85
|
+
# to its own capacity and sees a permanently full bucket. Reject it at
|
|
86
|
+
# definition time — multi-window rate limiting needs separate policies.
|
|
87
|
+
duplicate = @gates.map(&:name).tally.find { |_, count| count > 1 }
|
|
88
|
+
if duplicate
|
|
89
|
+
raise InvalidPolicy,
|
|
90
|
+
"duplicate #{duplicate.first.inspect} gate: a policy may declare each gate " \
|
|
91
|
+
"type at most once (use separate policies for multi-window limits)"
|
|
92
|
+
end
|
|
81
93
|
# Note: gates are NOT required. A policy with no gates uses the
|
|
82
94
|
# admission_batch_size (or tick_admission_budget when set) as its
|
|
83
95
|
# only ceiling, with the in-tick fairness reorder distributing
|
|
@@ -45,7 +45,11 @@ module DispatchPolicy
|
|
|
45
45
|
end
|
|
46
46
|
|
|
47
47
|
def admission_batch_size(size)
|
|
48
|
-
|
|
48
|
+
return if size.nil?
|
|
49
|
+
|
|
50
|
+
value = Integer(size)
|
|
51
|
+
raise InvalidPolicy, "admission_batch_size must be > 0 (got #{value})" unless value.positive?
|
|
52
|
+
@admission_batch_size = value
|
|
49
53
|
end
|
|
50
54
|
|
|
51
55
|
# Per-policy override for the EWMA half-life used to weigh recent
|
|
@@ -62,7 +66,11 @@ module DispatchPolicy
|
|
|
62
66
|
# nil, no global cap is enforced and per-partition admission_batch_size
|
|
63
67
|
# is the only ceiling.
|
|
64
68
|
def tick_admission_budget(value)
|
|
65
|
-
|
|
69
|
+
return if value.nil?
|
|
70
|
+
|
|
71
|
+
budget = Integer(value)
|
|
72
|
+
raise InvalidPolicy, "tick_admission_budget must be > 0 (got #{budget})" unless budget.positive?
|
|
73
|
+
@tick_admission_budget = budget
|
|
66
74
|
end
|
|
67
75
|
|
|
68
76
|
# Defines the partition scope. Required — every policy declares
|
|
@@ -19,6 +19,16 @@ module DispatchPolicy
|
|
|
19
19
|
end
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
+
# Reap the inflight row when a job is discarded before its perform
|
|
23
|
+
# callbacks run (e.g. discard_on ActiveJob::DeserializationError):
|
|
24
|
+
# InflightTracker.track's `ensure` never fires in that path, so the
|
|
25
|
+
# Tick's pre-inserted row would orphan until the stale sweeper.
|
|
26
|
+
initializer "dispatch_policy.discard_cleanup" do
|
|
27
|
+
ActiveSupport::Notifications.subscribe("discard.active_job") do |event|
|
|
28
|
+
DispatchPolicy::InflightTracker.handle_discard(event.payload[:job])
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
22
32
|
# Hosts copy the gem's migration into their own db/migrate via
|
|
23
33
|
# `rails railties:install:migrations` (or hand-write a cutover
|
|
24
34
|
# migration like opstasks did). We deliberately do NOT auto-merge
|