dispatch_policy 0.1.0 → 0.3.0
This diff shows the changes between publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/MIT-LICENSE +16 -17
- data/README.md +449 -288
- data/app/assets/stylesheets/dispatch_policy/application.css +157 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -241
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +140 -216
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +95 -238
- data/config/routes.rb +18 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +4 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -37
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +62 -47
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +263 -172
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +71 -46
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +101 -43
- data/CHANGELOG.md +0 -12
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -49
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -120
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
- data/lib/dispatch_policy/install_generator.rb +0 -23
data/lib/dispatch_policy/cursor_pagination.rb
ADDED
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require "base64"
+require "json"
+
+module DispatchPolicy
+  # Tiny keyset-pagination helper for the engine UI. Each sort mode declares
+  # a single sortable column plus the row id as a deterministic tiebreaker
+  # so two rows can never share the same cursor. NULLable columns are
+  # coalesced to a sentinel ('1970-01-01' for timestamps) so the cursor
+  # clause stays a simple tuple comparison.
+  module CursorPagination
+    SENTINEL_TS = "1970-01-01 00:00:00".freeze
+
+    # name => { sql_order:, cursor_sql:, direction:, label: }
+    # cursor_sql is the expression to extract the sort key for a row
+    # (used both in ORDER BY and to build the cursor tuple).
+    SORTS = {
+      "pending" => {
+        sql_order: "pending_count DESC, id ASC",
+        cursor_sql: "pending_count",
+        direction: :desc,
+        label: "pending desc"
+      },
+      "admitted" => {
+        sql_order: "total_admitted DESC, id ASC",
+        cursor_sql: "total_admitted",
+        direction: :desc,
+        label: "lifetime admitted"
+      },
+      "stale" => {
+        sql_order: "COALESCE(last_checked_at, TIMESTAMP '#{SENTINEL_TS}') ASC, id ASC",
+        cursor_sql: "COALESCE(last_checked_at, TIMESTAMP '#{SENTINEL_TS}')",
+        direction: :asc,
+        label: "stalest (round-trip)"
+      },
+      "recent" => {
+        sql_order: "COALESCE(last_admit_at, TIMESTAMP '#{SENTINEL_TS}') DESC, id ASC",
+        cursor_sql: "COALESCE(last_admit_at, TIMESTAMP '#{SENTINEL_TS}')",
+        direction: :desc,
+        label: "recent admit"
+      },
+      "key" => {
+        sql_order: "partition_key ASC, id ASC",
+        cursor_sql: "partition_key",
+        direction: :asc,
+        label: "partition key"
+      }
+    }.freeze
+
+    DEFAULT_SORT = "pending"
+
+    module_function
+
+    def sort_for(name)
+      SORTS[name] || SORTS.fetch(DEFAULT_SORT)
+    end
+
+    def encode(value, id)
+      Base64.urlsafe_encode64(JSON.dump([value, id]), padding: false)
+    end
+
+    def decode(cursor)
+      return nil if cursor.nil? || cursor.empty?
+
+      decoded = JSON.parse(Base64.urlsafe_decode64(cursor))
+      return nil unless decoded.is_a?(Array) && decoded.size == 2
+
+      decoded
+    rescue StandardError
+      nil
+    end
+
+    # Apply a cursor tuple (value, id) to an AR scope under the given sort.
+    # The tiebreaker on id is always ASC so id strictly advances forward.
+    def apply(scope, sort_name, cursor)
+      sort = sort_for(sort_name)
+      return scope if cursor.nil?
+
+      value, last_id = cursor
+      case sort[:direction]
+      when :desc
+        scope.where(
+          "(#{sort[:cursor_sql]} < ?) OR (#{sort[:cursor_sql]} = ? AND id > ?)",
+          value, value, last_id
+        )
+      when :asc
+        scope.where(
+          "(#{sort[:cursor_sql]} > ?) OR (#{sort[:cursor_sql]} = ? AND id > ?)",
+          value, value, last_id
+        )
+      end
+    end
+
+    # Read the cursor key from a row using the given sort. Returns the
+    # raw value the cursor was built from (for emitting to the next link).
+    def extract(row, sort_name)
+      sort = sort_for(sort_name)
+      column = sort[:cursor_sql]
+      # cursor_sql may include a COALESCE(...). For row-side extraction we
+      # mirror that with Ruby. The columns we coalesce are timestamps; we
+      # use Time.at(0) as the equivalent sentinel.
+      raw = case column
+            when "pending_count", "total_admitted", "partition_key"
+              row.send(column)
+            when /COALESCE\(last_checked_at,/
+              row.last_checked_at || Time.at(0)
+            when /COALESCE\(last_admit_at,/
+              row.last_admit_at || Time.at(0)
+            end
+      [serialize_value(raw), row.id]
+    end
+
+    def serialize_value(v)
+      case v
+      when Time, ActiveSupport::TimeWithZone then v.utc.iso8601(6)
+      else v
+      end
+    end
+  end
+end
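Taken together, decode/apply/extract/encode form one paging round-trip. A minimal sketch of how the engine UI might drive it (the `Partition` scope, the param names, and the page size are assumptions for illustration, not taken from this diff):

    sort_name = params[:sort].presence || DispatchPolicy::CursorPagination::DEFAULT_SORT
    sort      = DispatchPolicy::CursorPagination.sort_for(sort_name)
    cursor    = DispatchPolicy::CursorPagination.decode(params[:cursor])

    scope = DispatchPolicy::Partition.all
    scope = DispatchPolicy::CursorPagination.apply(scope, sort_name, cursor)
    rows  = scope.order(Arel.sql(sort[:sql_order])).limit(50).to_a

    # The next-page link carries the cursor built from the last row shown.
    if (last = rows.last)
      value, id    = DispatchPolicy::CursorPagination.extract(last, sort_name)
      @next_cursor = DispatchPolicy::CursorPagination.encode(value, id)
    end

Because the id tiebreaker always advances ASC, replaying a cursor can never return a row twice, even when many rows share the same sort value.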
data/lib/dispatch_policy/decision.rb
ADDED
@@ -0,0 +1,22 @@
+# frozen_string_literal: true
+
+module DispatchPolicy
+  class Decision
+    attr_reader :allowed, :retry_after, :gate_state_patch, :reason
+
+    def initialize(allowed:, retry_after: nil, gate_state_patch: nil, reason: nil)
+      @allowed = allowed
+      @retry_after = retry_after
+      @gate_state_patch = gate_state_patch
+      @reason = reason
+    end
+
+    def self.unlimited
+      new(allowed: Float::INFINITY)
+    end
+
+    def self.deny(retry_after: nil, reason: nil)
+      new(allowed: 0, retry_after: retry_after, reason: reason)
+    end
+  end
+end
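The three constructors cover the whole decision space a gate can express; illustrative values only:

    DispatchPolicy::Decision.unlimited                  # allowed = Float::INFINITY, no cap
    DispatchPolicy::Decision.deny(retry_after: 2.0,
                                  reason: "throttled")  # allowed = 0, skip this partition for ~2s
    DispatchPolicy::Decision.new(allowed: 5)            # admit at most 5 jobs this tick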
data/lib/dispatch_policy/engine.rb
CHANGED
@@ -3,34 +3,11 @@
 require "rails/engine"
 
 module DispatchPolicy
+  # Mounted by the host app. Views, controllers, and AR models live under
+  # `app/`; the layout inlines the engine CSS by reading
+  # `app/assets/stylesheets/dispatch_policy/application.css` at render time,
+  # so no asset pipeline integration is required.
   class Engine < ::Rails::Engine
     isolate_namespace DispatchPolicy
-
-    initializer "dispatch_policy.reference_gates" do
-      config.to_prepare do
-        # Reference the built-in gates so they register in Gate.registry.
-        DispatchPolicy::Gates::Concurrency
-        DispatchPolicy::Gates::Throttle
-        DispatchPolicy::Gates::GlobalCap
-        DispatchPolicy::Gates::FairInterleave
-        DispatchPolicy::Gates::AdaptiveConcurrency
-
-        DispatchPolicy::ActiveJobPerformAllLaterPatch
-      end
-    end
-
-    initializer "dispatch_policy.boot_prune", after: :load_config_initializers do
-      config.to_prepare do
-        begin
-          DispatchPolicy::Tick.prune_orphan_gate_rows
-          DispatchPolicy::Tick.prune_idle_partitions
-          DispatchPolicy::PartitionObservation.prune!
-        rescue ActiveRecord::NoDatabaseError,
-               ActiveRecord::StatementInvalid,
-               ActiveRecord::ConnectionNotEstablished
-          # DB not ready — skip silently.
-        end
-      end
-    end
   end
 end
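After this change the engine body is just namespace isolation; gate registration and boot-time pruning no longer live in engine initializers. The host app mounts it in the usual Rails way (the mount path here is an assumption):

    # config/routes.rb in the host application
    Rails.application.routes.draw do
      mount DispatchPolicy::Engine => "/dispatch_policy"
    end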
data/lib/dispatch_policy/forwarder.rb
ADDED
@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+
+module DispatchPolicy
+  # Re-enqueues admitted jobs onto the real ActiveJob adapter under a
+  # `Bypass.with` block, so the around_enqueue callback that staged them
+  # in the first place lets the call through.
+  #
+  # Called from inside Tick's admission transaction. With a PG-backed
+  # adapter (good_job / solid_queue) the adapter's INSERT shares the
+  # transaction, so any exception here aborts the whole admission
+  # atomically (staged_jobs return, inflight rows disappear, partition
+  # counters revert, adapter rows revert). There is intentionally no
+  # rescue here: failures must propagate to roll back the surrounding TX.
+  #
+  # Bulk path: rows without scheduled_at go through ActiveJob.perform_all_later,
+  # which collapses to a single multi-row INSERT on adapters that implement
+  # enqueue_all natively (good_job, solid_queue). Rows with scheduled_at
+  # keep the per-row path because perform_all_later doesn't accept a
+  # wait_until per job.
+  module Forwarder
+    module_function
+
+    # @param rows [Array<Hash>] admitted staged_job rows (already deleted from staging)
+    # @raise StandardError propagates any error from deserialize / adapter enqueue
+    # @raise EnqueueFailed if the adapter's enqueue_all returned without
+    #   raising but flagged any job as not-successfully-enqueued (the
+    #   atomic contract requires caller-visible failure so the surrounding
+    #   TX rolls back).
+    def dispatch(rows)
+      return if rows.empty?
+
+      scheduled, immediate = rows.partition { |row| row["scheduled_at"] }
+
+      if immediate.any?
+        jobs = immediate.map { |row| Serializer.deserialize(row["job_data"]) }
+        Bypass.with { ::ActiveJob.perform_all_later(jobs) }
+        not_enqueued = jobs.reject { |j| j.respond_to?(:successfully_enqueued?) ? j.successfully_enqueued? : true }
+        if not_enqueued.any?
+          ids = not_enqueued.map(&:job_id).join(", ")
+          raise EnqueueFailed,
+                "perform_all_later soft-failed #{not_enqueued.size}/#{jobs.size} jobs (#{ids})"
+        end
+      end
+
+      scheduled.each do |row|
+        job = Serializer.deserialize(row["job_data"])
+        wait_until = enqueue_wait_until(row)
+        Bypass.with { job.set(wait_until: wait_until).enqueue }
+        if job.respond_to?(:successfully_enqueued?) && !job.successfully_enqueued?
+          raise EnqueueFailed, "scheduled enqueue soft-failed for #{job.job_id}"
+        end
+      end
+    end
+
+    def enqueue_wait_until(row)
+      ts = row["scheduled_at"]
+      return nil unless ts
+      ts.is_a?(Time) ? ts : Time.parse(ts.to_s)
+    rescue ArgumentError
+      nil
+    end
+  end
+end
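The bulk/per-row split keys entirely off `scheduled_at` in each staged row. A sketch of the two paths (the payload variables stand in for whatever `Serializer.serialize` stored at staging time):

    rows = [
      { "job_data" => immediate_payload, "scheduled_at" => nil },                  # joins the perform_all_later batch
      { "job_data" => delayed_payload,   "scheduled_at" => "2026-05-01 12:00:00" } # per-row, enqueued with wait_until
    ]

    # Must run inside Tick's admission transaction: any raise here,
    # including the soft-failure EnqueueFailed, rolls the whole admission back.
    DispatchPolicy::Forwarder.dispatch(rows)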
data/lib/dispatch_policy/gate.rb
CHANGED
@@ -2,48 +2,20 @@
 
 module DispatchPolicy
   class Gate
-    class << self
-      def registry
-        @registry ||= {}
-      end
-
-      def register(name, klass)
-        registry[name.to_sym] = klass
-      end
-    end
-
-    attr_reader :policy, :partition_by, :name
-
-    def initialize(policy:, name:, partition_by: nil, **opts)
-      @policy = policy
-      @name = name
-      @partition_by = partition_by
-      configure(**opts)
-    end
-
-    def configure(**_opts); end
-
-    # Resolve a partition key for a given context.
-    def partition_key_for(ctx)
-      return "default" if @partition_by.nil?
-      @partition_by.call(ctx).to_s
-    end
-
-    # Subclasses must implement.
-    def filter(_batch, _context)
+    def name
       raise NotImplementedError
     end
 
-    #
-    #
-
-
+    # @param ctx [DispatchPolicy::Context]
+    # @param partition [Hash] the partitions row (string keys)
+    # @param admit_budget [Integer] the budget remaining from earlier gates
+    # @return [DispatchPolicy::Decision]
+    def evaluate(_ctx, _partition, _admit_budget)
+      raise NotImplementedError
     end
 
-
-
-    def resolve(value, ctx)
-      value.respond_to?(:call) ? value.call(ctx) : value
-    end
+    # Called after a successful admit to update gate-local state.
+    # Returns a hash patch to merge into partition.gate_state, or nil.
+    def consume(_decision, _admitted_count); nil; end
   end
 end
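The new contract is deliberately small: a gate is `name` plus `evaluate` returning a `Decision`, with `consume` as an optional hook for gate-local state. A minimal custom gate under that contract (the business-hours rule is invented for illustration):

    module DispatchPolicy
      module Gates
        # Hypothetical gate: refuse all admissions outside a fixed UTC window.
        class BusinessHours < Gate
          def name
            :business_hours
          end

          def evaluate(_ctx, _partition, admit_budget)
            return Decision.deny(retry_after: 60.0, reason: "after_hours") unless (9..17).cover?(Time.now.utc.hour)

            Decision.new(allowed: admit_budget) # pass the remaining budget through
          end
        end
      end
    end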
data/lib/dispatch_policy/gates/adaptive_concurrency.rb
CHANGED
@@ -2,122 +2,124 @@
 
 module DispatchPolicy
   module Gates
-    #
-    #
-    #
-    #
-    # perform_start — so it reflects "are we admitting too fast?" without
-    # getting polluted by how long the external work takes.
+    # Self-tuning concurrency gate. Like :concurrency but with a
+    # per-partition cap (`current_max`) that grows when the adapter
+    # queue is empty and shrinks when it builds up. AIMD loop persisted
+    # in `dispatch_policy_adaptive_concurrency_stats`.
     #
-    #
-    #
+    # Feedback signal is `queue_lag_ms = perform_start - admitted_at`
+    # (time the job spent waiting in the adapter after admission).
+    # Pure saturation signal — slow performs in the downstream service
+    # don't punish admissions if workers still drain the queue quickly.
+    #
+    # Update rule applied after each perform (in InflightTracker.track):
+    #
+    #   succeeded? & ewma_lag <= target_lag_ms → current_max += 1
+    #   succeeded? & ewma_lag >  target_lag_ms → current_max *= slow_factor
+    #   failed?                                → current_max *= fail_factor
+    #
+    # Always clamped to >= min. Never grows without bound — the
+    # algorithm self-limits via target_lag_ms.
     class AdaptiveConcurrency < Gate
-
-      #
-      #
-
-      DEFAULT_EWMA_ALPHA = 0.5
-      DEFAULT_FAIL_FACTOR = 0.5
-      DEFAULT_SLOW_FACTOR = 0.95
+      DEFAULT_FULL_BACKOFF = 1.0  # seconds
+      DEFAULT_EWMA_ALPHA = 0.5    # weight of the new sample in the EWMA
+      DEFAULT_FAIL_FACTOR = 0.5   # halve on perform raise
+      DEFAULT_SLOW_FACTOR = 0.95  # gentle shrink on overload
 
-
-
-      def configure(initial_max:,
-                    target_lag_ms: nil,
-                    target_latency: nil,
-                    min: 1,
-                    ewma_alpha: DEFAULT_EWMA_ALPHA,
-                    failure_decrease_factor: DEFAULT_FAIL_FACTOR,
-                    overload_decrease_factor: DEFAULT_SLOW_FACTOR)
-        @initial_max = initial_max
-        @min = min
-        @target_lag_ms = target_lag_ms || target_latency
-        @ewma_alpha = ewma_alpha
-        @fail_factor = failure_decrease_factor
-        @slow_factor = overload_decrease_factor
-        raise ArgumentError, "adaptive_concurrency requires target_lag_ms" if @target_lag_ms.nil?
-      end
+      attr_reader :initial_max, :target_lag_ms, :min,
+                  :ewma_alpha, :fail_factor, :slow_factor, :full_backoff
 
-      def
-
+      def initialize(initial_max:, target_lag_ms:, min: 1,
+                     ewma_alpha: DEFAULT_EWMA_ALPHA,
+                     failure_decrease_factor: DEFAULT_FAIL_FACTOR,
+                     overload_decrease_factor: DEFAULT_SLOW_FACTOR,
+                     full_backoff: DEFAULT_FULL_BACKOFF)
+        super()
+        @initial_max = Integer(initial_max)
+        @target_lag_ms = Float(target_lag_ms)
+        @min = Integer(min)
+        @ewma_alpha = Float(ewma_alpha)
+        @fail_factor = Float(failure_decrease_factor)
+        @slow_factor = Float(overload_decrease_factor)
+        @full_backoff = Float(full_backoff)
+        raise ArgumentError, "target_lag_ms must be > 0" unless @target_lag_ms.positive?
+        raise ArgumentError, "min must be >= 1" unless @min >= 1
+        raise ArgumentError, "initial_max must be >= min" unless @initial_max >= @min
       end
 
-
-
-
-      def filter(batch, context)
-        by_partition = batch.group_by { |staged| partition_key_for(context.for(staged)) }
-
-        # Seed any missing stats rows so the first admission has something
-        # to read. Cheap: one INSERT ... ON CONFLICT DO NOTHING per key.
-        by_partition.each_key do |key|
-          AdaptiveConcurrencyStats.seed!(
-            policy_name: policy.name,
-            gate_name: name,
-            partition_key: key,
-            initial_max: resolve(@initial_max, nil).to_i
-          )
-        end
+      def name
+        :adaptive_concurrency
+      end
 
-
-
-
-          partition_keys: by_partition.keys
-        )
+      def evaluate(ctx, partition, admit_budget)
+        policy_name = partition["policy_name"]
+        key = inflight_partition_key(policy_name, ctx)
 
-
-
-
-
+        # Seed lazily so the very first admission has a row to read
+        # (and so record_observation can UPDATE without a check).
+        Repository.adaptive_seed!(
+          policy_name: policy_name,
+          partition_key: key,
+          initial_max: @initial_max
        )
 
-
+        cap = Repository.adaptive_current_max(
+          policy_name: policy_name,
+          partition_key: key
+        ) || @initial_max
+        cap = [cap, @min].max
 
-
-
-
-
-
+        in_flight = Repository.count_inflight(
+          policy_name: policy_name,
+          partition_key: key
+        )
+        remaining = cap - in_flight
 
-
-
-
-
-          effective_max = [ effective_max, resolve(@initial_max, nil).to_i ].max
-        end
+        # Safety valve. AIMD can shrink current_max during a slow burst;
+        # if the partition then idles, no observations come in to grow
+        # the cap back. When in_flight == 0 we ensure at least
+        # initial_max so the partition never fossilizes at min.
+        remaining = [remaining, @initial_max].max if in_flight.zero?
 
-
-
-
-
-        end
+        if remaining <= 0
+          return Decision.new(allowed: 0,
+                              retry_after: @full_backoff,
+                              reason: "adaptive_concurrency_full")
         end
 
-
-        admitted.map(&:first)
+        Decision.new(allowed: [remaining, admit_budget].min)
       end
 
-      #
-      #
-      #
-      def
-
-
-
-
-
-
-
-
-
-
-
-
+      # Same canonical scope as the staged_jobs partition_key — every
+      # gate in a policy uses `policy.partition_for(ctx)` so the
+      # inflight count and the adaptive stats line up exactly.
+      def inflight_partition_key(policy_name, ctx)
+        policy = DispatchPolicy.registry.fetch(policy_name)
+        raise InvalidPolicy, "unknown policy #{policy_name.inspect}" unless policy
+        policy.partition_for(ctx)
+      end
+
+      # Called from InflightTracker.track after each perform completes
+      # (success or failure). Updates the AIMD state atomically in one
+      # SQL statement.
+      def record_observation(policy_name:, partition_key:, queue_lag_ms:, succeeded:)
+        Repository.adaptive_seed!(
+          policy_name: policy_name,
+          partition_key: partition_key.to_s,
+          initial_max: @initial_max
+        )
+        Repository.adaptive_record!(
+          policy_name: policy_name,
+          partition_key: partition_key.to_s,
+          queue_lag_ms: queue_lag_ms,
+          succeeded: succeeded,
+          alpha: @ewma_alpha,
+          target_lag_ms: @target_lag_ms,
+          fail_factor: @fail_factor,
+          slow_factor: @slow_factor,
+          min: @min
        )
      end
    end
-
-    Gate.register(:adaptive_concurrency, AdaptiveConcurrency)
  end
 end
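The update rule from the class comment, written out as one plain-Ruby step per observation. This mirrors what `Repository.adaptive_record!` is described as doing in a single SQL statement; the method name and default values are local to this sketch:

    # One AIMD step: additive increase while healthy, multiplicative
    # decrease on overload or failure, always clamped to min.
    def aimd_step(current_max:, ewma_lag:, sample_lag_ms:, succeeded:,
                  alpha: 0.5, target_lag_ms: 250.0,
                  fail_factor: 0.5, slow_factor: 0.95, min: 1)
      ewma = alpha * sample_lag_ms + (1 - alpha) * (ewma_lag || sample_lag_ms)
      next_max =
        if !succeeded
          current_max * fail_factor   # failed?            -> *= fail_factor
        elsif ewma > target_lag_ms
          current_max * slow_factor   # success, but slow  -> *= slow_factor
        else
          current_max + 1             # success, healthy   -> += 1
        end
      [[next_max, min].max, ewma]     # new current_max (>= min) and new EWMA
    end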
data/lib/dispatch_policy/gates/concurrency.rb
CHANGED
@@ -2,42 +2,61 @@
 
 module DispatchPolicy
   module Gates
+    # Concurrency gate: caps in-flight jobs per partition.
+    #
+    # The partition scope is the policy's `partition_by`. Inflight rows
+    # are written by InflightTracker around_perform with the same key,
+    # so this gate's COUNT(*) aggregates the same canonical scope as
+    # the staged_jobs row.
     class Concurrency < Gate
-
-
+      DEFAULT_FULL_BACKOFF = 1.0 # seconds
+
+      attr_reader :max_proc, :full_backoff
+
+      def initialize(max:, full_backoff: DEFAULT_FULL_BACKOFF)
+        super()
+        @max_proc = max.respond_to?(:call) ? max : ->(_ctx) { max }
+        @full_backoff = full_backoff.to_f
      end
 
-      def
-
+      def name
+        :concurrency
      end
 
-      def
-
+      def evaluate(ctx, partition, admit_budget)
+        cap = capacity_for(ctx)
+        return Decision.deny(retry_after: @full_backoff, reason: "max=0") if cap <= 0
 
-        in_flight =
-          policy_name:
-
-          partition_keys: by_partition.keys
+        in_flight = Repository.count_inflight(
+          policy_name: partition["policy_name"],
+          partition_key: inflight_partition_key(partition["policy_name"], ctx)
        )
-
-
-
-        jobs
-
-        limit = resolve(@max, ctx).to_i
-        used = in_flight.fetch(partition_key, 0)
-        if used < limit
-          admitted << [ staged, partition_key ]
-          in_flight[partition_key] = used + 1
-        end
-      end
+        remaining = cap - in_flight
+        if remaining <= 0
+          # Stop hammering this partition with COUNT(*) every tick — back off
+          # until enough jobs are likely to have finished.
+          return Decision.new(allowed: 0, retry_after: @full_backoff, reason: "concurrency_full")
        end
 
-
-        admitted.map(&:first)
+        Decision.new(allowed: [remaining, admit_budget].min)
      end
-    end
 
-
+      # The inflight key is always the policy's canonical partition
+      # value — same as what's stored in staged_jobs.partition_key.
+      # This is what makes throttle + concurrency in the same policy
+      # enforce their state at exactly one consistent scope.
+      def inflight_partition_key(policy_name, ctx)
+        policy = DispatchPolicy.registry.fetch(policy_name)
+        raise InvalidPolicy, "unknown policy #{policy_name.inspect}" unless policy
+        policy.partition_for(ctx)
+      end
+
+      private
+
+      def capacity_for(ctx)
+        value = @max_proc.call(ctx)
+        value.nil? ? 0 : Integer(value)
+      end
+    end
  end
 end
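Because `max` is normalized into `@max_proc` at construction, the cap can be a static integer or a per-context callable resolved on every evaluate. Two hypothetical constructions (the tenant lookup is invented for illustration):

    # Fixed cap: at most 10 in-flight jobs per partition, 1s backoff when full.
    DispatchPolicy::Gates::Concurrency.new(max: 10)

    # Dynamic cap: premium tenants get more head-room.
    DispatchPolicy::Gates::Concurrency.new(
      max: ->(ctx) { ctx.tenant_premium? ? 50 : 10 },  # ctx method invented for illustration
      full_backoff: 2.0
    )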