dispatch_policy 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +98 -28
- data/MIT-LICENSE +16 -17
- data/README.md +452 -388
- data/app/assets/images/dispatch_policy/logo-large.svg +9 -0
- data/app/assets/images/dispatch_policy/logo-small.svg +7 -0
- data/app/assets/javascripts/dispatch_policy/turbo.es2017-umd.min.js +35 -0
- data/app/assets/stylesheets/dispatch_policy/application.css +294 -0
- data/app/controllers/dispatch_policy/application_controller.rb +45 -1
- data/app/controllers/dispatch_policy/assets_controller.rb +31 -0
- data/app/controllers/dispatch_policy/dashboard_controller.rb +91 -0
- data/app/controllers/dispatch_policy/partitions_controller.rb +122 -0
- data/app/controllers/dispatch_policy/policies_controller.rb +94 -267
- data/app/controllers/dispatch_policy/staged_jobs_controller.rb +9 -0
- data/app/models/dispatch_policy/adaptive_concurrency_stats.rb +11 -81
- data/app/models/dispatch_policy/inflight_job.rb +12 -0
- data/app/models/dispatch_policy/partition.rb +21 -0
- data/app/models/dispatch_policy/staged_job.rb +4 -97
- data/app/models/dispatch_policy/tick_sample.rb +11 -0
- data/app/views/dispatch_policy/dashboard/index.html.erb +109 -0
- data/app/views/dispatch_policy/partitions/index.html.erb +63 -0
- data/app/views/dispatch_policy/partitions/show.html.erb +106 -0
- data/app/views/dispatch_policy/policies/index.html.erb +15 -37
- data/app/views/dispatch_policy/policies/show.html.erb +139 -223
- data/app/views/dispatch_policy/shared/_capacity.html.erb +67 -0
- data/app/views/dispatch_policy/shared/_hints.html.erb +13 -0
- data/app/views/dispatch_policy/shared/_partition_row.html.erb +12 -0
- data/app/views/dispatch_policy/staged_jobs/show.html.erb +31 -0
- data/app/views/layouts/dispatch_policy/application.html.erb +164 -231
- data/config/routes.rb +21 -2
- data/db/migrate/20260501000001_create_dispatch_policy_tables.rb +103 -0
- data/lib/dispatch_policy/assets.rb +38 -0
- data/lib/dispatch_policy/bypass.rb +23 -0
- data/lib/dispatch_policy/config.rb +85 -0
- data/lib/dispatch_policy/context.rb +50 -0
- data/lib/dispatch_policy/cursor_pagination.rb +121 -0
- data/lib/dispatch_policy/decision.rb +22 -0
- data/lib/dispatch_policy/engine.rb +5 -27
- data/lib/dispatch_policy/forwarder.rb +63 -0
- data/lib/dispatch_policy/gate.rb +10 -38
- data/lib/dispatch_policy/gates/adaptive_concurrency.rb +99 -97
- data/lib/dispatch_policy/gates/concurrency.rb +45 -26
- data/lib/dispatch_policy/gates/throttle.rb +65 -41
- data/lib/dispatch_policy/inflight_tracker.rb +174 -0
- data/lib/dispatch_policy/job_extension.rb +155 -0
- data/lib/dispatch_policy/operator_hints.rb +126 -0
- data/lib/dispatch_policy/pipeline.rb +48 -0
- data/lib/dispatch_policy/policy.rb +61 -59
- data/lib/dispatch_policy/policy_dsl.rb +120 -0
- data/lib/dispatch_policy/railtie.rb +35 -0
- data/lib/dispatch_policy/registry.rb +46 -0
- data/lib/dispatch_policy/repository.rb +723 -0
- data/lib/dispatch_policy/serializer.rb +36 -0
- data/lib/dispatch_policy/tick.rb +260 -256
- data/lib/dispatch_policy/tick_loop.rb +59 -26
- data/lib/dispatch_policy/version.rb +1 -1
- data/lib/dispatch_policy.rb +72 -52
- data/lib/generators/dispatch_policy/install/install_generator.rb +70 -0
- data/lib/generators/dispatch_policy/install/templates/create_dispatch_policy_tables.rb.tt +95 -0
- data/lib/generators/dispatch_policy/install/templates/dispatch_tick_loop_job.rb.tt +53 -0
- data/lib/generators/dispatch_policy/install/templates/initializer.rb.tt +11 -0
- metadata +134 -42
- data/app/models/dispatch_policy/partition_inflight_count.rb +0 -42
- data/app/models/dispatch_policy/partition_observation.rb +0 -76
- data/app/models/dispatch_policy/throttle_bucket.rb +0 -41
- data/db/migrate/20260424000001_create_dispatch_policy_tables.rb +0 -80
- data/db/migrate/20260424000002_create_adaptive_concurrency_stats.rb +0 -22
- data/db/migrate/20260424000003_create_adaptive_concurrency_samples.rb +0 -25
- data/db/migrate/20260424000004_rename_samples_to_partition_observations.rb +0 -32
- data/db/migrate/20260425000001_add_duration_to_partition_observations.rb +0 -8
- data/lib/dispatch_policy/active_job_perform_all_later_patch.rb +0 -32
- data/lib/dispatch_policy/dispatch_context.rb +0 -53
- data/lib/dispatch_policy/dispatchable.rb +0 -123
- data/lib/dispatch_policy/gates/fair_interleave.rb +0 -32
- data/lib/dispatch_policy/gates/global_cap.rb +0 -26
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# frozen_string_literal: true

module DispatchPolicy
  # Runtime settings for the gem. The host app mutates a single shared
  # instance from an initializer; every knob is a plain read/write
  # accessor with a sensible default assigned in #initialize.
  class Config
    attr_accessor :enabled,
                  :tick_max_duration,
                  :partition_batch_size,
                  :admission_batch_size,
                  :idle_pause,
                  :busy_pause,
                  :partition_inactive_after,
                  :inflight_stale_after,
                  :inflight_heartbeat_interval,
                  :real_adapter,
                  :logger,
                  :clock,
                  :sweep_every_ticks,
                  :metrics_retention,
                  :database_role,
                  :fairness_half_life_seconds,
                  :tick_admission_budget,
                  :adapter_throughput_target

    def initialize
      # Master kill switch. When flipped off, the around_enqueue hook and
      # the bulk-enqueue patch pass straight through to the real adapter
      # (no staging) and the TickLoop exits early. Used during cutovers
      # to drain the staging table without taking traffic offline.
      @enabled = true
      @tick_max_duration = 25
      @partition_batch_size = 50
      @admission_batch_size = 100
      @idle_pause = 0.5
      # Pause between iterations after a tick that admitted > 0 jobs.
      # 0 keeps the historical "busy = no pause" behavior; a small
      # positive value (e.g. 0.02) backs off the DB when several
      # TickLoops compete for connections — the per-loop throughput
      # ceiling then becomes admission_batch_size / busy_pause.
      @busy_pause = 0.0
      @partition_inactive_after = 24 * 60 * 60
      @inflight_stale_after = 5 * 60
      @inflight_heartbeat_interval = 30
      @real_adapter = nil
      @logger = nil
      @clock = -> { Time.now.utc }
      @sweep_every_ticks = 50
      @metrics_retention = 24 * 60 * 60
      # ActiveRecord role for the admission transaction. nil uses the
      # default connection; set to e.g. :queue when the host runs the
      # job backend (solid_queue) on a separate database.
      @database_role = nil
      # Fairness: half-life (seconds) of the per-partition decayed_admits
      # EWMA. With 60, a partition's "recent activity" weight halves per
      # minute of idleness; Tick reorders claimed partitions by lowest
      # decayed_admits first so under-admitted ones get first crack.
      @fairness_half_life_seconds = 60
      # Optional global cap on admissions per tick. nil = no cap (each
      # partition is bounded only by admission_batch_size). When set,
      # fair_share = ceil(cap / partitions_seen) is the per-partition
      # ceiling, with leftover budget redistributed after pass-1.
      @tick_admission_budget = nil
      # Operator-declared jobs/sec ceiling of the underlying adapter.
      # The dashboard renders the live admit rate as a percentage of
      # this and fires a hint near saturation. nil shows only the
      # absolute rate. (Locally measured against good_job: ~3500
      # jobs/sec per worker.)
      @adapter_throughput_target = nil
    end

    # Current time per the configured (test-injectable) clock.
    def now
      @clock.call
    end

    # Effective logger: an explicitly assigned logger wins, then the
    # Rails logger when present, then a lazily-built stdout logger.
    def logger
      return @logger if @logger

      rails_logger = defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
      rails_logger || default_logger
    end

    private

    # Fallback stdout logger, built (and memoized) on first use.
    def default_logger
      require "logger"
      @default_logger ||= Logger.new($stdout, level: Logger::INFO)
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true

module DispatchPolicy
  # Immutable, string-keyed view over the metadata a policy evaluates.
  # All keys are deep-stringified at construction so lookups behave the
  # same whether callers pass symbols or strings.
  class Context
    # Coerce +value+ into a Context. An existing Context passes through
    # untouched; a Hash (or nil, treated as an empty hash) is wrapped.
    # Anything else raises InvalidPolicy.
    def self.wrap(value)
      if value.is_a?(Context)
        value
      elsif value.is_a?(Hash)
        new(value)
      elsif value.nil?
        new({})
      else
        raise InvalidPolicy, "context must be a Hash, got #{value.class}"
      end
    end

    attr_reader :data

    def initialize(hash)
      @data = deep_stringify(hash).freeze
    end

    # Indifferent read: symbol and string keys are equivalent.
    def [](key)
      @data[key.to_s]
    end

    def to_h
      @data
    end

    # The raw hash, handed to a jsonb column as-is.
    def to_jsonb
      @data
    end

    # Hash#fetch semantics (default value / block) with key coercion.
    def fetch(key, *args, &block)
      @data.fetch(key.to_s, *args, &block)
    end

    private

    # Recursively convert every hash key to a String, descending into
    # nested hashes and arrays; non-collection values pass through.
    def deep_stringify(value)
      if value.is_a?(Hash)
        value.each_with_object({}) do |(key, val), out|
          out[key.to_s] = deep_stringify(val)
        end
      elsif value.is_a?(Array)
        value.map { |item| deep_stringify(item) }
      else
        value
      end
    end
  end
end
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "base64"
require "json"
require "time" # Time#iso8601 — a no-op under ActiveSupport, required standalone

module DispatchPolicy
  # Tiny keyset-pagination helper for the engine UI. Each sort mode declares
  # a single sortable column plus the row id as a deterministic tiebreaker
  # so two rows can never share the same cursor. NULLable columns are
  # coalesced to a sentinel ('1970-01-01' for timestamps) so the cursor
  # clause stays a simple tuple comparison.
  module CursorPagination
    SENTINEL_TS = "1970-01-01 00:00:00".freeze

    # name => { sql_order:, cursor_sql:, direction:, label: }
    # cursor_sql is the expression to extract the sort key for a row
    # (used both in ORDER BY and to build the cursor tuple).
    SORTS = {
      "pending" => {
        sql_order: "pending_count DESC, id ASC",
        cursor_sql: "pending_count",
        direction: :desc,
        label: "pending desc"
      },
      "admitted" => {
        sql_order: "total_admitted DESC, id ASC",
        cursor_sql: "total_admitted",
        direction: :desc,
        label: "lifetime admitted"
      },
      "stale" => {
        sql_order: "COALESCE(last_checked_at, TIMESTAMP '#{SENTINEL_TS}') ASC, id ASC",
        cursor_sql: "COALESCE(last_checked_at, TIMESTAMP '#{SENTINEL_TS}')",
        direction: :asc,
        label: "stalest (round-trip)"
      },
      "recent" => {
        sql_order: "COALESCE(last_admit_at, TIMESTAMP '#{SENTINEL_TS}') DESC, id ASC",
        cursor_sql: "COALESCE(last_admit_at, TIMESTAMP '#{SENTINEL_TS}')",
        direction: :desc,
        label: "recent admit"
      },
      "key" => {
        sql_order: "partition_key ASC, id ASC",
        cursor_sql: "partition_key",
        direction: :asc,
        label: "partition key"
      }
    }.freeze

    DEFAULT_SORT = "pending"

    module_function

    # Look up a sort config by name, falling back to DEFAULT_SORT for
    # unknown (or attacker-supplied) names.
    def sort_for(name)
      SORTS[name] || SORTS.fetch(DEFAULT_SORT)
    end

    # Opaque, URL-safe cursor token for a (sort value, row id) pair.
    def encode(value, id)
      Base64.urlsafe_encode64(JSON.dump([value, id]), padding: false)
    end

    # Inverse of #encode. Returns the [value, id] pair, or nil for a
    # blank, malformed, or wrong-shaped token (cursors arrive from
    # query params, so anything can show up here).
    def decode(cursor)
      return nil if cursor.nil? || cursor.empty?

      decoded = JSON.parse(Base64.urlsafe_decode64(cursor))
      return nil unless decoded.is_a?(Array) && decoded.size == 2

      decoded
    rescue StandardError
      nil
    end

    # Apply a cursor tuple (value, id) to an AR scope under the given sort.
    # The tiebreaker on id is always ASC so id strictly advances forward.
    def apply(scope, sort_name, cursor)
      sort = sort_for(sort_name)
      return scope if cursor.nil?

      value, last_id = cursor
      case sort[:direction]
      when :desc
        scope.where(
          "(#{sort[:cursor_sql]} < ?) OR (#{sort[:cursor_sql]} = ? AND id > ?)",
          value, value, last_id
        )
      when :asc
        scope.where(
          "(#{sort[:cursor_sql]} > ?) OR (#{sort[:cursor_sql]} = ? AND id > ?)",
          value, value, last_id
        )
      end
    end

    # Read the cursor key from a row using the given sort. Returns the
    # raw value the cursor was built from (for emitting to the next link).
    def extract(row, sort_name)
      sort = sort_for(sort_name)
      column = sort[:cursor_sql]
      # cursor_sql may include a COALESCE(...). For row-side extraction we
      # mirror that with Ruby. The columns we coalesce are timestamps; we
      # use Time.at(0) as the equivalent sentinel.
      raw = case column
            when "pending_count", "total_admitted", "partition_key"
              row.send(column)
            when /COALESCE\(last_checked_at,/
              row.last_checked_at || Time.at(0)
            when /COALESCE\(last_admit_at,/
              row.last_admit_at || Time.at(0)
            end
      [serialize_value(raw), row.id]
    end

    # Normalize a raw sort-key value for embedding in a cursor: times
    # become microsecond-precision UTC ISO-8601 strings so they survive
    # the JSON round-trip; everything else passes through unchanged.
    #
    # The TimeWithZone check is guarded with defined? so this module
    # stays loadable (and callable) without ActiveSupport — the original
    # `when Time, ActiveSupport::TimeWithZone` raised NameError on every
    # call in a standalone process.
    def serialize_value(v)
      time_like = v.is_a?(Time) ||
                  (defined?(ActiveSupport::TimeWithZone) && v.is_a?(ActiveSupport::TimeWithZone))
      time_like ? v.utc.iso8601(6) : v
    end
  end
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# frozen_string_literal: true

module DispatchPolicy
  # Verdict returned by a gate's #evaluate: how many jobs may be
  # admitted, an optional retry hint for denied partitions, an optional
  # patch to merge into the partition's gate_state, and a human-readable
  # reason for observability.
  class Decision
    attr_reader :allowed, :retry_after, :gate_state_patch, :reason

    # A decision that places no limit on admissions.
    def self.unlimited
      new(allowed: Float::INFINITY)
    end

    # A zero-admission decision, optionally carrying a retry hint and a
    # reason string.
    def self.deny(retry_after: nil, reason: nil)
      new(allowed: 0, retry_after: retry_after, reason: reason)
    end

    def initialize(allowed:, retry_after: nil, gate_state_patch: nil, reason: nil)
      @allowed = allowed
      @retry_after = retry_after
      @gate_state_patch = gate_state_patch
      @reason = reason
    end
  end
end
|
|
@@ -3,34 +3,12 @@
|
|
|
3
3
|
require "rails/engine"

module DispatchPolicy
  # Mounted by the host app. Views, controllers, and AR models live under
  # `app/`; the layout inlines the engine CSS by reading
  # `app/assets/stylesheets/dispatch_policy/application.css` at render
  # time, and serves the vendored Turbo bundle through `AssetsController`
  # at a content-addressed URL — no asset pipeline integration required.
  class Engine < ::Rails::Engine
    # Keep the engine's routes, helpers, and models namespaced so they
    # cannot collide with same-named host-app constants.
    isolate_namespace DispatchPolicy
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Time.parse (used in #enqueue_wait_until) lives in the stdlib "time"
# extension; without this require a non-Rails process would hit a
# NoMethodError that the ArgumentError rescue does not catch.
require "time"

module DispatchPolicy
  # Re-enqueues admitted jobs onto the real ActiveJob adapter under a
  # `Bypass.with` block, so the around_enqueue callback that staged them
  # in the first place lets the call through.
  #
  # Called from inside Tick's admission transaction. With a PG-backed
  # adapter (good_job / solid_queue) the adapter's INSERT shares the
  # transaction, so any exception here aborts the whole admission
  # atomically (staged_jobs return, inflight rows disappear, partition
  # counters revert, adapter rows revert). There is intentionally no
  # rescue here: failures must propagate to roll back the surrounding TX.
  #
  # Bulk path: rows without scheduled_at go through ActiveJob.perform_all_later,
  # which collapses to a single multi-row INSERT on adapters that implement
  # enqueue_all natively (good_job, solid_queue). Rows with scheduled_at
  # keep the per-row path because perform_all_later doesn't accept a
  # wait_until per job.
  module Forwarder
    module_function

    # @param rows [Array<Hash>] admitted staged_job rows (already deleted from staging)
    # @raise StandardError propagates any error from deserialize / adapter enqueue
    # @raise EnqueueFailed if the adapter's enqueue_all returned without
    #   raising but flagged any job as not-successfully-enqueued (the
    #   atomic contract requires caller-visible failure so the surrounding
    #   TX rolls back).
    def dispatch(rows)
      return if rows.empty?

      scheduled, immediate = rows.partition { |row| row["scheduled_at"] }

      if immediate.any?
        jobs = immediate.map { |row| Serializer.deserialize(row["job_data"]) }
        Bypass.with { ::ActiveJob.perform_all_later(jobs) }
        # Jobs lacking successfully_enqueued? are assumed to have raised
        # on failure, so they count as enqueued here.
        not_enqueued = jobs.reject { |j| j.respond_to?(:successfully_enqueued?) ? j.successfully_enqueued? : true }
        if not_enqueued.any?
          ids = not_enqueued.map(&:job_id).join(", ")
          raise EnqueueFailed,
                "perform_all_later soft-failed #{not_enqueued.size}/#{jobs.size} jobs (#{ids})"
        end
      end

      scheduled.each do |row|
        job = Serializer.deserialize(row["job_data"])
        wait_until = enqueue_wait_until(row)
        Bypass.with { job.set(wait_until: wait_until).enqueue }
        if job.respond_to?(:successfully_enqueued?) && !job.successfully_enqueued?
          raise EnqueueFailed, "scheduled enqueue soft-failed for #{job.job_id}"
        end
      end
    end

    # Coerce a staged row's scheduled_at into a Time for `wait_until:`.
    # A Time passes through untouched; anything else is parsed from its
    # string form. Unparseable values fall back to nil (enqueue now)
    # rather than failing the admission.
    def enqueue_wait_until(row)
      ts = row["scheduled_at"]
      return nil unless ts

      ts.is_a?(Time) ? ts : Time.parse(ts.to_s)
    rescue ArgumentError
      nil
    end
  end
end
|
data/lib/dispatch_policy/gate.rb
CHANGED
|
@@ -2,48 +2,20 @@
|
|
|
2
2
|
|
|
3
3
|
module DispatchPolicy
  # Abstract base class for admission gates. Concrete gates override
  # #name and #evaluate; #consume is an optional hook whose default
  # records nothing.
  class Gate
    # Symbol identifying the gate type. Subclasses must override.
    def name
      raise NotImplementedError
    end

    # Decide how many jobs this gate permits for one partition.
    #
    # @param ctx [DispatchPolicy::Context]
    # @param partition [Hash] the partitions row (string keys)
    # @param admit_budget [Integer] the budget remaining from earlier gates
    # @return [DispatchPolicy::Decision]
    def evaluate(_ctx, _partition, _admit_budget)
      raise NotImplementedError
    end

    # Called after a successful admit to update gate-local state.
    # Returns a hash patch to merge into partition.gate_state, or nil
    # when there is nothing to record.
    def consume(_decision, _admitted_count)
      nil
    end
  end
end
|
|
@@ -2,122 +2,124 @@
|
|
|
2
2
|
|
|
3
3
|
module DispatchPolicy
  module Gates
    # Self-tuning concurrency gate. Like :concurrency but with a
    # per-partition cap (`current_max`) that grows when the adapter
    # queue is empty and shrinks when it builds up. AIMD loop persisted
    # in `dispatch_policy_adaptive_concurrency_stats`.
    #
    # Feedback signal is `queue_lag_ms = perform_start - admitted_at`
    # (time the job spent waiting in the adapter after admission).
    # Pure saturation signal — slow performs in the downstream service
    # don't punish admissions if workers still drain the queue quickly.
    #
    # Update rule applied after each perform (in InflightTracker.track):
    #
    #   succeeded? & ewma_lag <= target_lag_ms → current_max += 1
    #   succeeded? & ewma_lag > target_lag_ms  → current_max *= slow_factor
    #   failed?                                → current_max *= fail_factor
    #
    # Always clamped to >= min. Never grows without bound — the
    # algorithm self-limits via target_lag_ms.
    class AdaptiveConcurrency < Gate
      DEFAULT_FULL_BACKOFF = 1.0 # seconds
      DEFAULT_EWMA_ALPHA = 0.5 # weight of the new sample in the EWMA
      DEFAULT_FAIL_FACTOR = 0.5 # halve on perform raise
      DEFAULT_SLOW_FACTOR = 0.95 # gentle shrink on overload

      attr_reader :initial_max, :target_lag_ms, :min,
                  :ewma_alpha, :fail_factor, :slow_factor, :full_backoff

      # All numeric options are strictly coerced (Integer()/Float()
      # raise on garbage) so misconfiguration fails at boot, not
      # mid-tick.
      def initialize(initial_max:, target_lag_ms:, min: 1,
                     ewma_alpha: DEFAULT_EWMA_ALPHA,
                     failure_decrease_factor: DEFAULT_FAIL_FACTOR,
                     overload_decrease_factor: DEFAULT_SLOW_FACTOR,
                     full_backoff: DEFAULT_FULL_BACKOFF)
        super()
        @initial_max = Integer(initial_max)
        @target_lag_ms = Float(target_lag_ms)
        @min = Integer(min)
        @ewma_alpha = Float(ewma_alpha)
        @fail_factor = Float(failure_decrease_factor)
        @slow_factor = Float(overload_decrease_factor)
        @full_backoff = Float(full_backoff)
        raise ArgumentError, "target_lag_ms must be > 0" unless @target_lag_ms.positive?
        raise ArgumentError, "min must be >= 1" unless @min >= 1
        raise ArgumentError, "initial_max must be >= min" unless @initial_max >= @min
      end

      def name
        :adaptive_concurrency
      end

      # Admit up to (current_max - in_flight) jobs for this partition,
      # never exceeding the budget left over from earlier gates.
      def evaluate(ctx, partition, admit_budget)
        policy_name = partition["policy_name"]
        key = inflight_partition_key(policy_name, ctx)

        # Seed lazily so the very first admission has a row to read
        # (and so record_observation can UPDATE without a check).
        Repository.adaptive_seed!(
          policy_name: policy_name,
          partition_key: key,
          initial_max: @initial_max
        )

        # || @initial_max covers the race where the seeded row is not
        # yet visible; the @min clamp re-applies the floor in case a
        # stored current_max drifted below it.
        cap = Repository.adaptive_current_max(
          policy_name: policy_name,
          partition_key: key
        ) || @initial_max
        cap = [cap, @min].max

        in_flight = Repository.count_inflight(
          policy_name: policy_name,
          partition_key: key
        )
        remaining = cap - in_flight

        # Safety valve. AIMD can shrink current_max during a slow burst;
        # if the partition then idles, no observations come in to grow
        # the cap back. When in_flight == 0 we ensure at least
        # initial_max so the partition never fossilizes at min.
        remaining = [remaining, @initial_max].max if in_flight.zero?

        if remaining <= 0
          return Decision.new(allowed: 0,
                              retry_after: @full_backoff,
                              reason: "adaptive_concurrency_full")
        end

        Decision.new(allowed: [remaining, admit_budget].min)
      end

      # Same canonical scope as the staged_jobs partition_key — every
      # gate in a policy uses `policy.partition_for(ctx)` so the
      # inflight count and the adaptive stats line up exactly.
      def inflight_partition_key(policy_name, ctx)
        # NOTE(review): if DispatchPolicy.registry has Hash#fetch
        # semantics, an unknown policy raises KeyError here before the
        # InvalidPolicy guard below can fire — confirm the registry's
        # fetch contract (nil-returning vs raising).
        policy = DispatchPolicy.registry.fetch(policy_name)
        raise InvalidPolicy, "unknown policy #{policy_name.inspect}" unless policy
        policy.partition_for(ctx)
      end

      # Called from InflightTracker.track after each perform completes
      # (success or failure). Updates the AIMD state atomically in one
      # SQL statement.
      def record_observation(policy_name:, partition_key:, queue_lag_ms:, succeeded:)
        # Seed first: the stats row may not exist if the partition was
        # created by a different process since the last evaluate.
        Repository.adaptive_seed!(
          policy_name: policy_name,
          partition_key: partition_key.to_s,
          initial_max: @initial_max
        )
        Repository.adaptive_record!(
          policy_name: policy_name,
          partition_key: partition_key.to_s,
          queue_lag_ms: queue_lag_ms,
          succeeded: succeeded,
          alpha: @ewma_alpha,
          target_lag_ms: @target_lag_ms,
          fail_factor: @fail_factor,
          slow_factor: @slow_factor,
          min: @min
        )
      end
    end
  end
end
|