dispatch-kit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dispatch_kit/__init__.py +88 -0
- dispatch_kit/approval.py +44 -0
- dispatch_kit/budget.py +156 -0
- dispatch_kit/dispatch.py +102 -0
- dispatch_kit/egress.py +108 -0
- dispatch_kit/estimate.py +58 -0
- dispatch_kit/py.typed +0 -0
- dispatch_kit/routing.py +157 -0
- dispatch_kit-0.1.0.dist-info/METADATA +88 -0
- dispatch_kit-0.1.0.dist-info/RECORD +12 -0
- dispatch_kit-0.1.0.dist-info/WHEEL +5 -0
- dispatch_kit-0.1.0.dist-info/top_level.txt +1 -0
dispatch_kit/__init__.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""dispatch-kit — a pure, fail-closed library for gating expensive remote/external work.
|
|
2
|
+
|
|
3
|
+
Decide whether an expensive operation may run, how much it will cost, and where it should run —
|
|
4
|
+
the same machinery for a cloud GPU job (Cloud Run L4 over a tailnet) and a paid LLM/SDK API call:
|
|
5
|
+
|
|
6
|
+
from dispatch_kit import BudgetCap, BudgetState, admits, estimate_cost # the hard $ cap
|
|
7
|
+
from dispatch_kit import select_backend, BackendKind, ToolRequirements # where it runs
|
|
8
|
+
from dispatch_kit import SecretRef, ExternalEndpoint, log_egress # opt-in API egress
|
|
9
|
+
from dispatch_kit import Approval # the approval audit fact
|
|
10
|
+
|
|
11
|
+
Pure domain — stdlib only, no I/O, no provider SDK code; every check is fail-closed (default budget
|
|
12
|
+
``0`` = paid work off; SDK egress opt-in only; a missing key/over-budget refuses). The transport
|
|
13
|
+
auth (who) is a separate concern — pair this with ``tailnet-guard``. The consuming app keeps its job
|
|
14
|
+
entity, persistence, and executor; this owns the policy.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from .approval import Approval, ApprovalOutcome
|
|
20
|
+
from .budget import (
|
|
21
|
+
AdmissionDecision,
|
|
22
|
+
BudgetCap,
|
|
23
|
+
BudgetState,
|
|
24
|
+
BudgetWindow,
|
|
25
|
+
CostRates,
|
|
26
|
+
admits,
|
|
27
|
+
estimate_cost,
|
|
28
|
+
)
|
|
29
|
+
from .dispatch import (
|
|
30
|
+
DispatchError,
|
|
31
|
+
JobStore,
|
|
32
|
+
Lease,
|
|
33
|
+
Transport,
|
|
34
|
+
WorkerExecutor,
|
|
35
|
+
is_lease_stale,
|
|
36
|
+
should_give_up,
|
|
37
|
+
)
|
|
38
|
+
from .egress import (
|
|
39
|
+
EnvLookup,
|
|
40
|
+
ExternalEndpoint,
|
|
41
|
+
SecretMissingError,
|
|
42
|
+
SecretRef,
|
|
43
|
+
log_egress,
|
|
44
|
+
)
|
|
45
|
+
from .estimate import CostEstimate, HostCapabilities, vram_fits
|
|
46
|
+
from .routing import (
|
|
47
|
+
BackendCapabilities,
|
|
48
|
+
BackendKind,
|
|
49
|
+
NodeIdentity,
|
|
50
|
+
NoEligibleBackendError,
|
|
51
|
+
Routable,
|
|
52
|
+
ToolRequirements,
|
|
53
|
+
select_backend,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Names re-exported by ``from dispatch_kit import *`` (kept in the same sorted order).
__all__ = [
    "AdmissionDecision", "Approval", "ApprovalOutcome",
    "BackendCapabilities", "BackendKind",
    "BudgetCap", "BudgetState", "BudgetWindow",
    "CostEstimate", "CostRates",
    "DispatchError",
    "EnvLookup", "ExternalEndpoint",
    "HostCapabilities",
    "JobStore",
    "Lease",
    "NoEligibleBackendError", "NodeIdentity",
    "Routable",
    "SecretMissingError", "SecretRef",
    "ToolRequirements", "Transport",
    "WorkerExecutor",
    "admits", "estimate_cost", "is_lease_stale", "log_egress", "select_backend",
    "should_give_up", "vram_fits",
]
|
dispatch_kit/approval.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Approval — the human decision + audit record that gates an expensive (non-local) job.
|
|
2
|
+
|
|
3
|
+
A gated job leaves "awaiting approval" only by a human decision; recording WHO decided, WHEN, and
|
|
4
|
+
WHY keeps a paid run attributable (append-only in spirit). The state machine — which states are
|
|
5
|
+
gated and the transitions between them — lives in the consuming app's job entity; this is the
|
|
6
|
+
shared audit fact and the binary outcome both apps record.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import enum
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ApprovalOutcome(enum.StrEnum):
|
|
16
|
+
"""The decision a human made on a gated job."""
|
|
17
|
+
|
|
18
|
+
APPROVED = "approved"
|
|
19
|
+
REJECTED = "rejected"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True, slots=True)
|
|
23
|
+
class Approval:
|
|
24
|
+
"""Who approved or rejected a gated job, when, and why — the audit fact.
|
|
25
|
+
|
|
26
|
+
``author``/``reason``/``decided_at`` are all required: a decision is never anonymous,
|
|
27
|
+
unexplained, or untimestamped. ``decided_at`` is ISO-8601, server-assigned at the decision.
|
|
28
|
+
``outcome`` defaults to APPROVED so the common "approve" path constructs positionally; a
|
|
29
|
+
rejection passes ``ApprovalOutcome.REJECTED``.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
author: str
|
|
33
|
+
reason: str
|
|
34
|
+
decided_at: str
|
|
35
|
+
outcome: ApprovalOutcome = ApprovalOutcome.APPROVED
|
|
36
|
+
|
|
37
|
+
def __post_init__(self) -> None:
|
|
38
|
+
for name, value in (
|
|
39
|
+
("author", self.author),
|
|
40
|
+
("reason", self.reason),
|
|
41
|
+
("decided_at", self.decided_at),
|
|
42
|
+
):
|
|
43
|
+
if not value:
|
|
44
|
+
raise ValueError(f"an approval needs a non-empty {name}")
|
dispatch_kit/budget.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Budget ledger — a hard, fail-closed spend cap that gates expensive (non-local) work.
|
|
2
|
+
|
|
3
|
+
Cloud GPU jobs and paid API calls cost real money, so a **hard** spend cap is a circuit breaker
|
|
4
|
+
against errant testing or an agent flooding dispatch. The cap is enforced by **reserving** a job's
|
|
5
|
+
upper-bound cost the moment it is approved/queued, against BOTH a per-run window and a per-month
|
|
6
|
+
window: a job is admitted only if ``reserved + spent + its estimate`` stays within both caps. A
|
|
7
|
+
burst of approvals reserves cumulatively, so once a cap is reached the next job is refused — you
|
|
8
|
+
cannot even *queue* past it. The default cap is **zero**, so paid work stays off until you
|
|
9
|
+
deliberately set a budget.
|
|
10
|
+
|
|
11
|
+
This is pure domain (the rule); the persisted ledger rows + the per-backend rates live in the
|
|
12
|
+
consuming app's storage layer. Money is a :class:`~decimal.Decimal` (exact), never a float — a
|
|
13
|
+
fraction of a cent per GPU-second compounds, and a cost guard that drifts is not a guard.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import enum
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from decimal import Decimal
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BudgetWindow(enum.StrEnum):
|
|
24
|
+
"""The windows a reservation is checked against — BOTH must have room (fail closed).
|
|
25
|
+
|
|
26
|
+
``RUN`` bounds a single batch/session (stops one runaway loop); ``MONTH`` bounds slow
|
|
27
|
+
accumulation across sessions. A job must fit under both to be admitted.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
RUN = "run"
|
|
31
|
+
MONTH = "month"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True, slots=True)
|
|
35
|
+
class BudgetCap:
|
|
36
|
+
"""The hard ceiling per window, in USD. Default ``0`` means paid work is OFF (fail closed)."""
|
|
37
|
+
|
|
38
|
+
run_usd: Decimal = Decimal(0)
|
|
39
|
+
month_usd: Decimal = Decimal(0)
|
|
40
|
+
|
|
41
|
+
def __post_init__(self) -> None:
|
|
42
|
+
if self.run_usd < 0 or self.month_usd < 0:
|
|
43
|
+
raise ValueError("a budget cap cannot be negative")
|
|
44
|
+
|
|
45
|
+
def for_window(self, window: BudgetWindow) -> Decimal:
|
|
46
|
+
"""The cap for ``window`` (the single source the admission rule reads)."""
|
|
47
|
+
return self.run_usd if window is BudgetWindow.RUN else self.month_usd
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True, slots=True)
|
|
51
|
+
class BudgetState:
|
|
52
|
+
"""Committed USD for a window: ``reserved`` (approved, not yet run) + ``spent`` (reconciled).
|
|
53
|
+
|
|
54
|
+
Reserving on approval and only reconciling ``reserved -> spent`` on completion is what makes a
|
|
55
|
+
burst of approvals count immediately — the cap sees the whole queue's cost, not just what ran.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
reserved_usd: Decimal = Decimal(0)
|
|
59
|
+
spent_usd: Decimal = Decimal(0)
|
|
60
|
+
|
|
61
|
+
def __post_init__(self) -> None:
|
|
62
|
+
if self.reserved_usd < 0 or self.spent_usd < 0:
|
|
63
|
+
raise ValueError("budget reserved/spent cannot be negative")
|
|
64
|
+
|
|
65
|
+
def committed(self) -> Decimal:
|
|
66
|
+
"""The total already committed against the cap (reserved + spent)."""
|
|
67
|
+
return self.reserved_usd + self.spent_usd
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True, slots=True)
|
|
71
|
+
class CostRates:
|
|
72
|
+
"""Per-second USD rates for a backend — plain config data (no provider SDK code here).
|
|
73
|
+
|
|
74
|
+
``idle_tail_s`` is the post-request warm tail you still pay (a GPU Cloud Run instance bills for
|
|
75
|
+
~10 min after the last request before scaling to zero); folding it into the estimate keeps the
|
|
76
|
+
reservation an upper bound. A token-priced API can model itself with ``gpu_usd_per_s`` = 0 and a
|
|
77
|
+
flat per-call cost via :func:`estimate_cost` inputs, or extend this with its own rate object.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
gpu_usd_per_s: Decimal
|
|
81
|
+
vcpu_usd_per_s: Decimal
|
|
82
|
+
gib_usd_per_s: Decimal
|
|
83
|
+
idle_tail_s: Decimal = Decimal(0)
|
|
84
|
+
|
|
85
|
+
def __post_init__(self) -> None:
|
|
86
|
+
for name, value in (
|
|
87
|
+
("gpu_usd_per_s", self.gpu_usd_per_s),
|
|
88
|
+
("vcpu_usd_per_s", self.vcpu_usd_per_s),
|
|
89
|
+
("gib_usd_per_s", self.gib_usd_per_s),
|
|
90
|
+
("idle_tail_s", self.idle_tail_s),
|
|
91
|
+
):
|
|
92
|
+
if value < 0:
|
|
93
|
+
raise ValueError(f"{name} cannot be negative")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass(frozen=True, slots=True)
|
|
97
|
+
class AdmissionDecision:
|
|
98
|
+
"""Whether a job fits the budget, and — when it does not — which window refused and why.
|
|
99
|
+
|
|
100
|
+
``refused_window`` is the first window (run, then month) that lacked room; ``None`` on admit.
|
|
101
|
+
The reason is human-facing, so the refusal is legible at approval, never a silent drop.
|
|
102
|
+
"""
|
|
103
|
+
|
|
104
|
+
admitted: bool
|
|
105
|
+
refused_window: BudgetWindow | None
|
|
106
|
+
reason: str
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def admits(
    estimate_usd: Decimal,
    run_state: BudgetState,
    month_state: BudgetState,
    cap: BudgetCap,
) -> AdmissionDecision:
    """Decide whether a job with upper-bound cost ``estimate_usd`` fits BOTH budget windows.

    The run window is checked first, then the month window; the job is admitted only when
    ``committed + estimate <= cap`` holds for each. A negative estimate is refused outright
    with no window named. With the default zero caps only a zero-cost job is admitted.

    Returns:
        An :class:`AdmissionDecision`; on refusal it names the first window that lacked room
        and carries a human-readable reason.
    """
    if estimate_usd < 0:
        return AdmissionDecision(False, None, "a cost estimate cannot be negative")

    checks = (
        (BudgetWindow.RUN, run_state),
        (BudgetWindow.MONTH, month_state),
    )
    for window, state in checks:
        ceiling = cap.for_window(window)
        if state.committed() + estimate_usd > ceiling:
            explanation = (
                f"would exceed the {window.value} budget cap (${ceiling}): "
                f"${state.committed()} committed + ${estimate_usd} estimate"
            )
            return AdmissionDecision(False, window, explanation)

    return AdmissionDecision(True, None, "within budget")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def estimate_cost(
    rates: CostRates,
    *,
    max_runtime_s: int,
    vcpus: int,
    memory_gib: int,
) -> Decimal:
    """Return the upper-bound USD cost of one job. Pure.

    The cost is ``(max_runtime_s + rates.idle_tail_s)`` seconds multiplied by the combined
    per-second rate: the GPU rate, plus the vCPU rate times ``vcpus``, plus the memory rate
    times ``memory_gib``. Using the declared maximum runtime plus the idle tail makes this
    an over-estimate, which is the safe direction for a hard cap.

    Raises:
        ValueError: if ``max_runtime_s``, ``vcpus`` or ``memory_gib`` is negative.
    """
    if any(quantity < 0 for quantity in (max_runtime_s, vcpus, memory_gib)):
        raise ValueError("runtime/vcpus/memory cannot be negative")

    seconds_billed = Decimal(max_runtime_s) + rates.idle_tail_s
    usd_per_second = (
        rates.gpu_usd_per_s + rates.vcpu_usd_per_s * vcpus + rates.gib_usd_per_s * memory_gib
    )
    return seconds_billed * usd_per_second
|
dispatch_kit/dispatch.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Dispatch contract for expensive async jobs that run anywhere on the tailnet.
|
|
2
|
+
|
|
3
|
+
Both a PULL worker (it polls: claim a job, run it, complete it) and a PUSH orchestrator (it posts a
|
|
4
|
+
job to a worker and waits) need the SAME guarantees: a job is CLAIMED atomically so it runs exactly
|
|
5
|
+
once, a stale result is REJECTED, and a worker that dies mid-job has its lease RECOVERED. This
|
|
6
|
+
module is the shared CONTRACT — the pure lease rules + the store/transport/worker protocols. The
|
|
7
|
+
store (SQLite vs SQLAlchemy), the transport (pull vs push), and the payload (a transcribe request vs
|
|
8
|
+
a tool invocation) are per-app ADAPTERS, so two apps converge on one model WITHOUT a shared DB.
|
|
9
|
+
|
|
10
|
+
Auth (who may run a job) is :mod:`tailnet_guard`; policy (afford / route / approve) is the rest of
|
|
11
|
+
:mod:`dispatch_kit`. This is only the run-it-once-recoverably engine on top of those.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Sequence
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Any, Protocol
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DispatchError(RuntimeError):
    """Raised when a dispatch invariant is broken.

    Examples are a stale completion or a job recovered past its attempt cap.
    """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True, slots=True)
|
|
26
|
+
class Lease:
|
|
27
|
+
"""A claim on a job: when it was leased and how many times it has been recovered.
|
|
28
|
+
|
|
29
|
+
``leased_at`` is epoch seconds, stamped at the atomic claim. ``attempts`` counts recoveries (a
|
|
30
|
+
worker that died and had its lease reclaimed); a job past ``max_attempts`` fails rather than
|
|
31
|
+
re-leasing forever.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
job_id: str
|
|
35
|
+
leased_at: float
|
|
36
|
+
attempts: int = 0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def is_lease_stale(leased_at: float, now: float, ttl_seconds: float) -> bool:
    """Report whether a lease taken at ``leased_at`` has reached its TTL as of ``now``.

    A lease is stale once its age (``now - leased_at``) is at least ``ttl_seconds``; the
    boundary itself counts as stale. Choose a TTL comfortably longer than a healthy run so
    a slow job is not taken from a live worker.
    """
    lease_age = now - leased_at
    return lease_age >= ttl_seconds
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def should_give_up(attempts: int, max_attempts: int) -> bool:
    """Report whether a job has used up its recovery allowance.

    Returns ``True`` once ``attempts`` has reached ``max_attempts``, meaning the job should
    be failed rather than leased again (guards against a job that crashes every worker).
    """
    allowance_exhausted = attempts >= max_attempts
    return allowance_exhausted
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class JobStore(Protocol):
    """Protocol for the authoritative job store, where claim/complete are decided atomically.

    Implementations are expected to make ``claim`` atomic so two workers never claim the
    same job, to have ``complete`` reject (return ``False`` for) a result whose job is not
    currently leased/running, and to have ``recover_stale`` re-lease jobs whose lease
    outlived the TTL while failing those past ``max_attempts``. The payload and result
    types belong to the consuming application.
    """

    def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
        """Atomically claim one runnable job in ``lanes``.

        Returns ``(job_id, payload)``, or ``None`` when nothing is runnable.
        """
        ...

    def complete(self, job_id: str, result: Any) -> bool:
        """Apply ``result`` only if the job is leased/running; return ``False`` if stale."""
        ...

    def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
        """Re-lease jobs with a stale lease and return their ids; fail those past the cap."""
        ...
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class Transport(Protocol):
    """Protocol for the hop that carries a job to where it runs (the PUSH seam).

    In a PULL setup the worker calls the store's claim/complete itself and no transport is
    involved. In a PUSH setup the orchestrator hands the encoded job to a worker endpoint
    through ``submit`` and receives the encoded result back.
    """

    def submit(self, envelope: dict[str, Any], token: str) -> dict[str, Any]:
        """Send the encoded job ``envelope`` with its capability ``token``.

        Returns the worker's response.
        """
        ...
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class WorkerExecutor(Protocol):
    """Protocol for the worker-side run step: decode (integrity-checked), run, encode.

    The contract expects the auth guard and the artifact hash checks to have passed before
    ``execute`` is reached, so a tampered or unauthorized job never gets this far. The
    return value is the encoded result the store/transport carries back.
    """

    def execute(self, payload: Any) -> Any:
        """Run an already-authorized, integrity-checked ``payload``.

        Returns its encoded result.
        """
        ...
|
dispatch_kit/egress.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""External-API egress — the ONE sanctioned way data leaves the trust boundary, opt-in + logged.
|
|
2
|
+
|
|
3
|
+
A hosted SDK or LLM API (Rowan, Gemini, Claude, ...) is public TLS + an API key — it intentionally
|
|
4
|
+
breaks the tailnet-only / local-only default. This module gives the three non-negotiable properties
|
|
5
|
+
WITHOUT each caller re-rolling them:
|
|
6
|
+
|
|
7
|
+
* the key is a SECRET REFERENCE (an env-var name), resolved at call time, never inlined/logged;
|
|
8
|
+
* a missing key fails closed (raises) — never an unauthenticated call;
|
|
9
|
+
* every dispatch logs the egress (that data left the boundary, to which host) — never the key.
|
|
10
|
+
|
|
11
|
+
The actual HTTP request stays in the consuming app (it knows its payload shape); this module owns
|
|
12
|
+
the credential discipline + the egress audit so every external call is consistent and never the
|
|
13
|
+
default. Pair it with :func:`dispatch_kit.routing.select_backend` (SDK is opt-in) and the budget.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
from collections.abc import Callable
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from urllib.parse import urlsplit
|
|
23
|
+
|
|
24
|
+
_LOGGER = logging.getLogger("dispatch_kit.egress")
|
|
25
|
+
|
|
26
|
+
#: A lookup from an env-var name to its value (or ``None``) — injectable so tests need no real env.
|
|
27
|
+
EnvLookup = Callable[[str], "str | None"]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SecretMissingError(RuntimeError):
|
|
31
|
+
"""The referenced secret (API key) is absent from the environment — fail closed.
|
|
32
|
+
|
|
33
|
+
A missing key is a hard stop, not a silent unauthenticated call: the opt-in egress exception is
|
|
34
|
+
only valid with an explicit, configured credential.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True, slots=True)
|
|
39
|
+
class SecretRef:
|
|
40
|
+
"""A reference to a secret by ENV VAR NAME — never the value.
|
|
41
|
+
|
|
42
|
+
The credential is sourced from a secret at call time (:meth:`resolve`) and cannot leak into
|
|
43
|
+
source or a serialized config: a config object holds only the *name* of the env var.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
env_var: str
|
|
47
|
+
|
|
48
|
+
def __post_init__(self) -> None:
|
|
49
|
+
if not self.env_var:
|
|
50
|
+
raise ValueError("a SecretRef needs a non-empty env var name")
|
|
51
|
+
|
|
52
|
+
def resolve(self, env: EnvLookup = os.environ.get) -> str:
|
|
53
|
+
"""Read the secret from the environment now; raise :class:`SecretMissingError` if unset."""
|
|
54
|
+
value = env(self.env_var)
|
|
55
|
+
if not value:
|
|
56
|
+
raise SecretMissingError(
|
|
57
|
+
f"secret env var {self.env_var!r} is unset — refusing an unauthenticated call"
|
|
58
|
+
)
|
|
59
|
+
return value
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass(frozen=True, slots=True)
|
|
63
|
+
class ExternalEndpoint:
|
|
64
|
+
"""A hosted external API the app may call (opt-in egress): an https base URL + its key ref.
|
|
65
|
+
|
|
66
|
+
Construction refuses a non-``https://`` URL (public egress must be TLS). ``name`` is a stable
|
|
67
|
+
display label used in the egress audit; ``secret`` references the key by env-var name only.
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
name: str
|
|
71
|
+
base_url: str
|
|
72
|
+
secret: SecretRef
|
|
73
|
+
|
|
74
|
+
def __post_init__(self) -> None:
|
|
75
|
+
if not self.name:
|
|
76
|
+
raise ValueError("an external endpoint needs a non-empty name")
|
|
77
|
+
if not self.base_url.startswith("https://"):
|
|
78
|
+
raise ValueError(
|
|
79
|
+
f"external endpoint {self.name!r} must use https:// (public TLS); got "
|
|
80
|
+
f"{self.base_url!r}"
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
def host(self) -> str:
|
|
84
|
+
"""The endpoint host, for the egress audit (never the full URL with query/credentials)."""
|
|
85
|
+
return urlsplit(self.base_url).hostname or self.base_url
|
|
86
|
+
|
|
87
|
+
def bearer(self, env: EnvLookup = os.environ.get) -> str:
|
|
88
|
+
"""Resolve the key now and return the ``Authorization`` header value (``Bearer <key>``).
|
|
89
|
+
|
|
90
|
+
Raises :class:`SecretMissingError` if the key is unset, so a request is never built without
|
|
91
|
+
a credential. The returned string contains the key — it is for the header only, never a log.
|
|
92
|
+
"""
|
|
93
|
+
return f"Bearer {self.secret.resolve(env)}"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def log_egress(endpoint: ExternalEndpoint, *, detail: str = "") -> None:
    """Emit a WARNING-level audit record that data is leaving the trust boundary.

    Logs the endpoint's name and host only; the secret is never touched here. Intended to
    be called immediately before the external request.

    Args:
        endpoint: the external endpoint about to be called.
        detail: optional non-sensitive note (e.g. a tool/model name), appended in square
            brackets when non-empty.
    """
    if detail:
        suffix = f" [{detail}]"
    else:
        suffix = ""
    _LOGGER.warning(
        "egress: %r -> host %r (data leaves the trust boundary)%s",
        endpoint.name,
        endpoint.host(),
        suffix,
    )
|
dispatch_kit/estimate.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Pre-dispatch cost + feasibility estimates — refuse a job that cannot fit BEFORE it runs.
|
|
2
|
+
|
|
3
|
+
A backend declares how much compute it has (:class:`HostCapabilities`); an adapter declares what a
|
|
4
|
+
job will cost and need (:class:`CostEstimate`). The shared :func:`vram_fits` rule — "no GPU means a
|
|
5
|
+
GPU job is infeasible" — lives in exactly one place so the local cost gate and the backend router
|
|
6
|
+
agree. All pure: no I/O, no side effects.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def vram_fits(available_gb: float | None, required_gb: float | None) -> bool:
    """Apply the single VRAM-feasibility rule. Pure.

    * ``required_gb`` is ``None`` (a CPU job): always fits.
    * ``available_gb`` is ``None`` (no GPU): a GPU job never fits.
    * otherwise: fits when ``available_gb`` is at least ``required_gb``.
    """
    if required_gb is None:
        return True
    return available_gb is not None and available_gb >= required_gb
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True, slots=True)
|
|
30
|
+
class HostCapabilities:
|
|
31
|
+
"""A machine's compute budget, supplied per dispatch so the cost gate can fail closed.
|
|
32
|
+
|
|
33
|
+
``available_vram_gb`` of ``None`` means no GPU is present (a GPU job is then infeasible). A
|
|
34
|
+
CPU-only tool ignores this; a GPU tool compares its ``estimated_vram_gb`` against the budget so
|
|
35
|
+
a too-large job is refused before the executor runs. The default models a no-GPU host so a
|
|
36
|
+
missing budget fails closed, never open.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
available_vram_gb: float | None = None
|
|
40
|
+
|
|
41
|
+
def can_fit_vram(self, required_gb: float | None) -> bool:
|
|
42
|
+
"""True if a job needing ``required_gb`` of VRAM fits here (a CPU job always fits)."""
|
|
43
|
+
return vram_fits(self.available_vram_gb, required_gb)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True, slots=True)
|
|
47
|
+
class CostEstimate:
|
|
48
|
+
"""A pre-dispatch estimate so a too-large/too-long job is refused before it runs.
|
|
49
|
+
|
|
50
|
+
``feasible_locally`` is the gate result (does it fit the host that would run it); ``reason`` is
|
|
51
|
+
the human-facing rationale carried on a refusal. ``estimated_seconds`` + ``estimated_vram_gb``
|
|
52
|
+
also feed the budget reservation (:func:`dispatch_kit.budget.estimate_cost`).
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
estimated_seconds: float
|
|
56
|
+
estimated_vram_gb: float | None
|
|
57
|
+
feasible_locally: bool
|
|
58
|
+
reason: str
|
dispatch_kit/py.typed
ADDED
|
File without changes
|
dispatch_kit/routing.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Backend routing — the PURE preference policy that picks where a job runs.
|
|
2
|
+
|
|
3
|
+
A job carries its :class:`ToolRequirements` (what it needs); a backend declares its
|
|
4
|
+
:class:`BackendCapabilities` (what it can run). :func:`select_backend` is pure: it filters
|
|
5
|
+
to the backends that can satisfy the job, then applies the fixed policy preference —
|
|
6
|
+
**LOCAL -> LAN worker -> CLOUD worker -> SDK** — returning the first match. The SDK (the deliberate
|
|
7
|
+
public-egress exception) is admitted only when the caller explicitly opts in; otherwise it is never
|
|
8
|
+
selected, even if it is the only fit (fail closed on egress).
|
|
9
|
+
|
|
10
|
+
The router never runs anything and holds no executor — it only *chooses*, over any objects that are
|
|
11
|
+
:class:`Routable` (have a ``kind`` and a ``can_satisfy``). The consuming app binds the actual
|
|
12
|
+
executor onto its own backend type; this library stays free of that I/O.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import enum
|
|
18
|
+
from collections.abc import Iterable
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Protocol, TypeVar
|
|
21
|
+
|
|
22
|
+
from .estimate import vram_fits
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BackendKind(enum.StrEnum):
|
|
26
|
+
"""Where a job's compute runs — sets the router's preference and lands on provenance/audit."""
|
|
27
|
+
|
|
28
|
+
LOCAL = "local"
|
|
29
|
+
"""In-process, or a local subprocess/container on the orchestrator host."""
|
|
30
|
+
LAN_WORKER = "lan_worker"
|
|
31
|
+
"""A spare LAN desktop running a worker agent, reached over the tailnet."""
|
|
32
|
+
CLOUD_WORKER = "cloud_worker"
|
|
33
|
+
"""A cloud GPU (e.g. GCP L4 on Cloud Run) running a worker agent, reached over the tailnet."""
|
|
34
|
+
SDK = "sdk"
|
|
35
|
+
"""A third-party hosted SDK / API over public TLS — the one opt-in egress exception."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Routing preference, most preferred first: LOCAL (cheapest, no egress), then a LAN worker,
# then a cloud worker, and finally the SDK, which is eligible only when explicitly opted in.
# The tuple itself is the policy: editing the order here changes the preference in one place.
_PREFERENCE_ORDER: tuple[BackendKind, ...] = (
    BackendKind.LOCAL,
    BackendKind.LAN_WORKER,
    BackendKind.CLOUD_WORKER,
    BackendKind.SDK,
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass(frozen=True, slots=True)
|
|
50
|
+
class NodeIdentity:
|
|
51
|
+
"""The identity of a node a job ran on — pinned for audit + reproducibility.
|
|
52
|
+
|
|
53
|
+
For a local job this is the orchestrator host. For a remote worker it is the worker's pinned
|
|
54
|
+
tailnet identity (its MagicDNS name / tailnet IP), so an audit records not just *that* a job ran
|
|
55
|
+
remotely but *on which authorized worker* — the same identity the trust barrier pins.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
name: str
|
|
59
|
+
address: str | None = None
|
|
60
|
+
|
|
61
|
+
def __post_init__(self) -> None:
|
|
62
|
+
if not self.name:
|
|
63
|
+
raise ValueError("a NodeIdentity requires a non-empty name")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True, slots=True)
|
|
67
|
+
class ToolRequirements:
|
|
68
|
+
"""What a job needs of a backend. Pure data.
|
|
69
|
+
|
|
70
|
+
``min_vram_gb`` of ``None`` is a CPU job (any backend's compute fits). ``image`` set means the
|
|
71
|
+
job runs in a pinned container, so a backend must be able to run that image. ``tool_id`` lets a
|
|
72
|
+
backend that ships only certain tools (no image) declare it can run them. The consuming app
|
|
73
|
+
derives these from its own manifest/invocation — this library does not couple to those types.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
tool_id: str
|
|
77
|
+
min_vram_gb: float | None = None
|
|
78
|
+
image: str | None = None
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True, slots=True)
class BackendCapabilities:
    """A backend's side of the match: what it is able to run. Pure data.

    The router compares these against a job's :class:`ToolRequirements`.

    * ``available_vram_gb`` — ``None`` means no GPU (a GPU job cannot run here).
    * ``images`` / ``tool_ids`` — ``None`` means "no restriction" (a general-purpose backend that
      runs any pinned image or any tool); a non-empty set restricts the backend to exactly those.

    NOTE(review): ``has_gpu`` is not consulted by :meth:`can_satisfy`; feasibility is decided from
    ``available_vram_gb`` alone.
    """

    has_gpu: bool = False
    available_vram_gb: float | None = None
    images: frozenset[str] | None = None
    tool_ids: frozenset[str] | None = None

    def can_satisfy(self, requirements: ToolRequirements) -> bool:
        """Report whether a backend with these capabilities can run ``requirements``. Pure.

        The VRAM question is delegated to the shared :func:`dispatch_kit.estimate.vram_fits` rule
        (the one home of "no GPU means a GPU job is infeasible"). The image/tool gate is decided
        here: a job with a pinned image is checked against ``images``, a job without one is
        checked against ``tool_ids``, and a ``None`` allow-list admits everything.
        """
        if not vram_fits(self.available_vram_gb, requirements.min_vram_gb):
            return False

        pinned_image = requirements.image
        if pinned_image is not None:
            # Image-pinned job: only the image allow-list matters.
            return self.images is None or pinned_image in self.images

        # No pinned image: fall back to the tool allow-list.
        return self.tool_ids is None or requirements.tool_id in self.tool_ids
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class Routable(Protocol):
    """Structural interface for anything the router may choose among.

    A routable backend exposes a :class:`BackendKind` and can answer whether it is able to run a
    given job. The consuming app's executor-bound backend type satisfies this structurally, so
    :func:`select_backend` can choose without this library ever knowing about executors.
    """

    @property
    def kind(self) -> BackendKind:
        """The kind of this backend, which sets its routing preference."""
        ...

    def can_satisfy(self, requirements: ToolRequirements) -> bool:
        """Return whether this backend can run a job with the given requirements."""
        ...
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class NoEligibleBackendError(RuntimeError):
    """Signals that, under the policy, no registered backend can satisfy a job's requirements.

    Fail closed: a job with nowhere to run is a hard error — never a silent local fallback that
    might OOM, and never an SDK call the caller did not opt into.
    """
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# Generic over the caller's concrete backend type, so the router hands back exactly what it was
# given rather than a bare ``Routable``.
_R = TypeVar("_R", bound=Routable)


def select_backend(
    backends: Iterable[_R],
    requirements: ToolRequirements,
    *,
    allow_sdk: bool = False,
) -> _R:
    """Choose the most-preferred backend able to run ``requirements`` — PURE policy.

    The fixed preference order (LOCAL -> LAN -> CLOUD -> SDK) is walked kind by kind; within a
    kind, the first registered backend whose ``can_satisfy`` accepts the job wins. SDK backends
    take part ONLY when ``allow_sdk`` is true: that egress exception is deliberate and opt-in, so
    by default an SDK backend is skipped even if it is the only fit.

    Args:
        backends: The registered backends to choose among; consumed once into a list.
        requirements: What the job needs of a backend.
        allow_sdk: Explicit opt-in that lets SDK backends be considered.

    Returns:
        The first eligible backend in preference order.

    Raises:
        NoEligibleBackendError: If nothing eligible matches.
    """
    # Materialize once: ``backends`` may be a one-shot iterable and is scanned once per kind.
    pool = list(backends)
    permitted_kinds = [
        kind for kind in _PREFERENCE_ORDER if allow_sdk or kind is not BackendKind.SDK
    ]
    for preferred in permitted_kinds:
        for candidate in pool:
            if candidate.kind is not preferred:
                continue
            if candidate.can_satisfy(requirements):
                return candidate
    raise NoEligibleBackendError(
        f"no eligible backend for tool {requirements.tool_id!r} "
        f"(vram>={requirements.min_vram_gb}, image={requirements.image}, allow_sdk={allow_sdk})"
    )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dispatch-kit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
|
|
5
|
+
Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: budget,cost,cloud,gpu,llm,dispatch,egress,approval
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
20
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
21
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
22
|
+
Requires-Dist: flake8>=6.0; extra == "dev"
|
|
23
|
+
Requires-Dist: pylint>=3.0; extra == "dev"
|
|
24
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
25
|
+
|
|
26
|
+
# dispatch-kit
|
|
27
|
+
|
|
28
|
+
A tiny, **pure, dependency-free** library for gating expensive remote/external work — the same
|
|
29
|
+
machinery for a **cloud GPU job** (a Cloud Run L4 reached over a tailnet) and a **paid LLM/SDK API
|
|
30
|
+
call** (Gemini, Claude, Rowan). It answers three questions, fail-closed:
|
|
31
|
+
|
|
32
|
+
- **Can we afford it?** — a hard, reserve-on-approval **budget cap** (per-run + per-month).
|
|
33
|
+
- **Where should it run?** — a pure **router**: `LOCAL → LAN → CLOUD → SDK`, SDK opt-in only.
|
|
34
|
+
- **Is the external call safe?** — opt-in, audited **API egress** with reference-only secrets.
|
|
35
|
+
|
|
36
|
+
It owns the *policy* (afford / route / approve / egress); your app keeps its job entity,
|
|
37
|
+
persistence, and executor. The transport *auth* (who may talk) is a separate concern — pair this
|
|
38
|
+
with [`tailnet-guard`](https://github.com/falahat/tailnet-guard). Stdlib only; every check is
|
|
39
|
+
fail-closed (default budget `0` = paid work off; SDK never auto-selected; a missing key refuses).
|
|
40
|
+
|
|
41
|
+
## Use
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from decimal import Decimal
|
|
45
|
+
from dispatch_kit import (
|
|
46
|
+
BudgetCap, BudgetState, CostRates, admits, estimate_cost, # the hard $ cap
|
|
47
|
+
select_backend, BackendKind, ToolRequirements, # the where
|
|
48
|
+
SecretRef, ExternalEndpoint, log_egress, # opt-in API egress
|
|
49
|
+
Approval, # the approval audit fact
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# 1. Reserve-on-approval: refuse a job that would push past the cap (both windows).
|
|
53
|
+
rates = CostRates(gpu_usd_per_s=Decimal("0.0008"), vcpu_usd_per_s=Decimal("0.00001"),
|
|
54
|
+
gib_usd_per_s=Decimal("0.000002"), idle_tail_s=Decimal(600))
|
|
55
|
+
cost = estimate_cost(rates, max_runtime_s=3600, vcpus=8, memory_gib=32) # an UPPER bound
|
|
56
|
+
decision = admits(cost, run_state, month_state, BudgetCap(run_usd=Decimal(50), month_usd=Decimal(500)))
|
|
57
|
+
if not decision.admitted:
|
|
58
|
+
raise OverBudget(decision.reason) # default cap is $0 — paid work is off until you set one
|
|
59
|
+
|
|
60
|
+
# 2. Pick where it runs — LOCAL first, SDK only if explicitly allowed.
|
|
61
|
+
backend = select_backend(my_backends, ToolRequirements(tool_id="cofold", min_vram_gb=24.0))
|
|
62
|
+
|
|
63
|
+
# 3. An LLM/SDK key is a REFERENCE (env var name), resolved at call time, never logged.
|
|
64
|
+
gemini = ExternalEndpoint("gemini", "https://generativelanguage.googleapis.com",
|
|
65
|
+
SecretRef("GEMINI_API_KEY"))
|
|
66
|
+
log_egress(gemini, detail="summarize") # audit that data left the boundary
|
|
67
|
+
headers = {"Authorization": gemini.bearer()} # raises if the key is unset (never an unauth call)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## What's in the box
|
|
71
|
+
|
|
72
|
+
| Module | Purpose |
|
|
73
|
+
|---|---|
|
|
74
|
+
| `budget` | `BudgetCap` / `BudgetState` / `CostRates` / `admits` / `estimate_cost` — the hard, Decimal-exact, reserve-on-approval cap across a run + month window |
|
|
75
|
+
| `estimate` | `CostEstimate` / `HostCapabilities` / `vram_fits` — the one "no GPU ⇒ a GPU job is infeasible" rule, shared by the gate and the router |
|
|
76
|
+
| `routing` | `BackendKind` / `BackendCapabilities` / `ToolRequirements` / `select_backend` (generic over a `Routable`) — the pure `LOCAL→LAN→CLOUD→SDK` policy; SDK opt-in |
|
|
77
|
+
| `egress` | `SecretRef` / `ExternalEndpoint` / `log_egress` — reference-only API keys, https-only, fail-closed on a missing key, audited egress (SDKs **and** LLM APIs) |
|
|
78
|
+
| `approval` | `Approval` / `ApprovalOutcome` — the who/when/why audit fact for a gated job |
|
|
79
|
+
| `dispatch` | `JobStore` / `Transport` / `WorkerExecutor` protocols + `is_lease_stale` / `should_give_up` / `Lease` — the run-it-once-recoverably contract (atomic claim, stale-reject, lease recovery); push vs pull is only the `Transport` adapter |
|
|
80
|
+
|
|
81
|
+
## Notes
|
|
82
|
+
|
|
83
|
+
- **The budget cap lives in your dispatch service, never the UI** — an agent hitting the API
|
|
84
|
+
directly is still gated. Default cap `$0`; if spend can't be computed, refuse.
|
|
85
|
+
- **Reserve on approval, reconcile on completion** — approving reserves the estimate immediately so
|
|
86
|
+
a burst counts against the cap; the worker's true runtime reconciles `reserved → spent`.
|
|
87
|
+
- **SDK / external egress is the one deliberate exception** — never the default (`allow_sdk` /
|
|
88
|
+
opt-in), always logged, the key sourced from a secret at call time and never written to a log.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
dispatch_kit/__init__.py,sha256=PUWm1pmCcji8QpDA9ZLZ2m3mzZhOzhumFSFypqlVLoA,2356
|
|
2
|
+
dispatch_kit/approval.py,sha256=6capLUQJbognPTSAzsoRyrNwWPJwRbxt6tkyDLqvxgs,1552
|
|
3
|
+
dispatch_kit/budget.py,sha256=m-2MAkyWq0kg7asp1g5t6lfnyOAXjjcvjM_oJy8K9wI,6329
|
|
4
|
+
dispatch_kit/dispatch.py,sha256=h2u2wVOFoQMWT_K-3affUilPm6E28VNDnRWPap3zPPk,5003
|
|
5
|
+
dispatch_kit/egress.py,sha256=HPNCAJh0LTVG9pfkjyQbHWpaiPkSyUG4jUkKMpX4RRY,4392
|
|
6
|
+
dispatch_kit/estimate.py,sha256=dnUv4iDAh3UyAjCgkq-mRWrhZP1t984aUhlfL9PHN6g,2507
|
|
7
|
+
dispatch_kit/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
dispatch_kit/routing.py,sha256=BO0z8lUvteNOl19cyCAgEoDhFfOzDVvi_ffmnITkW1M,6698
|
|
9
|
+
dispatch_kit-0.1.0.dist-info/METADATA,sha256=CfBSqIDhM-wX5e7TfPuw6qhOLo0d1dJOK9TXDKM13xU,5256
|
|
10
|
+
dispatch_kit-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
11
|
+
dispatch_kit-0.1.0.dist-info/top_level.txt,sha256=MQSrew1pPSR4Ei9U80LdMbq3gEjKk03Xa597gflZrsE,13
|
|
12
|
+
dispatch_kit-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dispatch_kit
|