dispatch-kit 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/PKG-INFO +1 -1
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/pyproject.toml +4 -2
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/__init__.py +13 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/dispatch.py +29 -7
- dispatch_kit-0.3.0/src/dispatch_kit/engine.py +63 -0
- dispatch_kit-0.3.0/src/dispatch_kit/faults.py +43 -0
- dispatch_kit-0.3.0/src/dispatch_kit/observe.py +183 -0
- dispatch_kit-0.3.0/src/dispatch_kit/provenance.py +86 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/PKG-INFO +1 -1
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/SOURCES.txt +8 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_dispatch.py +11 -1
- dispatch_kit-0.3.0/tests/test_engine.py +103 -0
- dispatch_kit-0.3.0/tests/test_faults.py +26 -0
- dispatch_kit-0.3.0/tests/test_observe.py +151 -0
- dispatch_kit-0.3.0/tests/test_provenance.py +61 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/README.md +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/setup.cfg +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/approval.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/budget.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/egress.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/estimate.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/py.typed +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/routing.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/dependency_links.txt +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/requires.txt +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/top_level.txt +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_approval.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_budget.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_egress.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_estimate.py +0 -0
- {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_routing.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dispatch-kit
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
|
|
5
5
|
Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "dispatch-kit"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress."
|
|
9
9
|
authors = [{ name = "Aryan Falahatpisheh", email = "aryanfalahat@gmail.com" }]
|
|
10
10
|
readme = "README.md"
|
|
@@ -77,6 +77,8 @@ source-roots = ["src"]
|
|
|
77
77
|
max-line-length = 100
|
|
78
78
|
|
|
79
79
|
[tool.pylint.design]
|
|
80
|
-
# Pure value-object domain: frozen data records with few/no methods
|
|
80
|
+
# Pure value-object domain: frozen data records with few/no methods — and, for a full
|
|
81
|
+
# reproducibility record (Provenance), many fields — are expected, not a design smell.
|
|
81
82
|
min-public-methods = 0
|
|
82
83
|
max-args = 6
|
|
84
|
+
max-attributes = 15
|
|
@@ -30,6 +30,7 @@ from .dispatch import (
|
|
|
30
30
|
DispatchError,
|
|
31
31
|
JobStore,
|
|
32
32
|
Lease,
|
|
33
|
+
RetriableError,
|
|
33
34
|
Transport,
|
|
34
35
|
WorkerExecutor,
|
|
35
36
|
is_lease_stale,
|
|
@@ -42,7 +43,11 @@ from .egress import (
|
|
|
42
43
|
SecretRef,
|
|
43
44
|
log_egress,
|
|
44
45
|
)
|
|
46
|
+
from .engine import drain
|
|
45
47
|
from .estimate import CostEstimate, HostCapabilities, vram_fits
|
|
48
|
+
from .faults import Contained
|
|
49
|
+
from .observe import Observe, ObserveConfig
|
|
50
|
+
from .provenance import Determinism, GpuContext, Provenance
|
|
46
51
|
from .routing import (
|
|
47
52
|
BackendCapabilities,
|
|
48
53
|
BackendKind,
|
|
@@ -62,16 +67,23 @@ __all__ = [
|
|
|
62
67
|
"BudgetCap",
|
|
63
68
|
"BudgetState",
|
|
64
69
|
"BudgetWindow",
|
|
70
|
+
"Contained",
|
|
65
71
|
"CostEstimate",
|
|
66
72
|
"CostRates",
|
|
73
|
+
"Determinism",
|
|
67
74
|
"DispatchError",
|
|
68
75
|
"EnvLookup",
|
|
69
76
|
"ExternalEndpoint",
|
|
77
|
+
"GpuContext",
|
|
70
78
|
"HostCapabilities",
|
|
71
79
|
"JobStore",
|
|
72
80
|
"Lease",
|
|
73
81
|
"NoEligibleBackendError",
|
|
74
82
|
"NodeIdentity",
|
|
83
|
+
"Observe",
|
|
84
|
+
"ObserveConfig",
|
|
85
|
+
"Provenance",
|
|
86
|
+
"RetriableError",
|
|
75
87
|
"Routable",
|
|
76
88
|
"SecretMissingError",
|
|
77
89
|
"SecretRef",
|
|
@@ -79,6 +91,7 @@ __all__ = [
|
|
|
79
91
|
"Transport",
|
|
80
92
|
"WorkerExecutor",
|
|
81
93
|
"admits",
|
|
94
|
+
"drain",
|
|
82
95
|
"estimate_cost",
|
|
83
96
|
"is_lease_stale",
|
|
84
97
|
"log_egress",
|
|
@@ -2,8 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
Both a PULL worker (it polls: claim a job, run it, complete it) and a PUSH orchestrator (it posts a
|
|
4
4
|
job to a worker and waits) need the SAME guarantees: a job is CLAIMED atomically so it runs exactly
|
|
5
|
-
once, a stale result is REJECTED,
|
|
6
|
-
|
|
5
|
+
once, a stale result is REJECTED, a clean failure is RECORDED (terminally, or retriably so it is
|
|
6
|
+
re-queued up to a cap), and a worker that dies mid-job has its lease RECOVERED. This module is the
|
|
7
|
+
shared CONTRACT — the pure lease rules + the store/transport/worker protocols. The
|
|
7
8
|
store (SQLite vs SQLAlchemy), the transport (pull vs push), and the payload (a transcribe request vs
|
|
8
9
|
a tool invocation) are per-app ADAPTERS, so two apps converge on one model WITHOUT a shared DB.
|
|
9
10
|
|
|
@@ -22,6 +23,15 @@ class DispatchError(RuntimeError):
|
|
|
22
23
|
"""A dispatch invariant was violated (a stale complete, or a job recovered past its cap)."""
|
|
23
24
|
|
|
24
25
|
|
|
26
|
+
class RetriableError(Exception):
|
|
27
|
+
"""An executor raises this to mark a failure TRANSIENT, so the engine re-queues the job for
|
|
28
|
+
another attempt (bumping the attempt count, then poisoning past the cap) instead of failing it
|
|
29
|
+
terminally. Use it for an infrastructure hiccup a retry can fix — a timeout, an OOM, a rate
|
|
30
|
+
limit — NOT for a bad input or a logic error, where retrying only wastes attempts. Any OTHER
|
|
31
|
+
exception from the executor is treated as terminal.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
|
|
25
35
|
@dataclass(frozen=True, slots=True)
|
|
26
36
|
class Lease:
|
|
27
37
|
"""A claim on a job: when it was leased and how many times it has been recovered.
|
|
@@ -56,15 +66,17 @@ def should_give_up(attempts: int, max_attempts: int) -> bool:
|
|
|
56
66
|
|
|
57
67
|
|
|
58
68
|
class JobStore(Protocol):
|
|
59
|
-
"""The authoritative job store — the ONE place
|
|
69
|
+
"""The authoritative job store — the ONE place claim/complete/fail is decided, ATOMICALLY.
|
|
60
70
|
|
|
61
71
|
``claim`` must be atomic (one transaction / compare-and-set): two concurrent workers can never
|
|
62
72
|
both claim the same job — that single claim is the run-exactly-once guarantee. ``complete`` must
|
|
63
73
|
REJECT a result for a job not currently leased/running (a stale resubmit after the lease was
|
|
64
|
-
recovered), returning ``False`` so a re-run cannot clobber a fresh result. ``
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
the
|
|
74
|
+
recovered), returning ``False`` so a re-run cannot clobber a fresh result. ``fail`` records a
|
|
75
|
+
clean failure: terminal (a bad input), or retriable — re-queued for another attempt, bumping
|
|
76
|
+
attempts and poisoned past the cap (like a recovered crash). ``recover_stale`` re-leases jobs
|
|
77
|
+
whose lease outlived the TTL (see :func:`is_lease_stale`), bumping the attempt count and
|
|
78
|
+
failing a job past ``max_attempts`` (see :func:`should_give_up`). The payload type is the
|
|
79
|
+
app's own (a transcribe request, a tool invocation, ...).
|
|
68
80
|
"""
|
|
69
81
|
|
|
70
82
|
def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
|
|
@@ -73,6 +85,16 @@ class JobStore(Protocol):
|
|
|
73
85
|
def complete(self, job_id: str, result: Any) -> bool:
|
|
74
86
|
"""Apply a result IFF the job is leased/running; ``False`` if stale (done/recovered)."""
|
|
75
87
|
|
|
88
|
+
def fail(self, job_id: str, error: str, *, retriable: bool, max_attempts: int) -> bool:
|
|
89
|
+
"""Record a clean failure of a leased job; ``False`` if it was not leased/running.
|
|
90
|
+
|
|
91
|
+
``retriable=False`` fails it terminally now — a bad input or logic error a retry cannot
|
|
92
|
+
fix. ``retriable=True`` returns it to the queue for another attempt, bumping the attempt
|
|
93
|
+
count; once it has been attempted ``max_attempts`` times it is failed terminally instead
|
|
94
|
+
(the poison rule of :func:`should_give_up`), so a persistently-transient job can never
|
|
95
|
+
loop forever.
|
|
96
|
+
"""
|
|
97
|
+
|
|
76
98
|
def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
|
|
77
99
|
"""Re-lease jobs whose lease is stale (return their ids); fail those past the cap."""
|
|
78
100
|
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""The run-exactly-once engine — drain a :class:`~dispatch_kit.dispatch.JobStore`.
|
|
2
|
+
|
|
3
|
+
Pure coordination over the dispatch ports: :func:`drain` owns the claim -> run -> record skeleton
|
|
4
|
+
AND the failure classification, so every app gets robust draining without re-implementing it. The
|
|
5
|
+
JobStore (persistence + atomic claim + retry timing) and the ``execute`` callable (the actual work)
|
|
6
|
+
are the app's adapters — pull a transcribe request, run a tool invocation, whatever; the engine only
|
|
7
|
+
coordinates.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from collections.abc import Callable, Sequence
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from .dispatch import JobStore, RetriableError
|
|
16
|
+
from .faults import Contained
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def drain(
|
|
20
|
+
store: JobStore,
|
|
21
|
+
execute: Callable[[Any], Any],
|
|
22
|
+
lanes: Sequence[str],
|
|
23
|
+
*,
|
|
24
|
+
max_attempts: int,
|
|
25
|
+
) -> int:
|
|
26
|
+
"""Claim and run every currently-runnable job in ``lanes``; return how many were handled.
|
|
27
|
+
|
|
28
|
+
Each job is isolated (:class:`~dispatch_kit.faults.Contained`): a failure is recorded —
|
|
29
|
+
terminal, or retriable if ``execute`` raised :class:`RetriableError` — and the drain CONTINUES,
|
|
30
|
+
so one bad job never stalls the queue. Loops until ``claim`` returns ``None``, so call it on a
|
|
31
|
+
poll interval. Whether a retriable re-queue is retried this pass or later (a backoff) is the
|
|
32
|
+
store's policy, not the engine's.
|
|
33
|
+
"""
|
|
34
|
+
handled = 0
|
|
35
|
+
while (claimed := store.claim(lanes)) is not None:
|
|
36
|
+
job_id, payload = claimed
|
|
37
|
+
_record(store, execute, job_id, payload, max_attempts=max_attempts)
|
|
38
|
+
handled += 1
|
|
39
|
+
return handled
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _record(
|
|
43
|
+
store: JobStore,
|
|
44
|
+
execute: Callable[[Any], Any],
|
|
45
|
+
job_id: str,
|
|
46
|
+
payload: Any,
|
|
47
|
+
*,
|
|
48
|
+
max_attempts: int,
|
|
49
|
+
) -> None:
|
|
50
|
+
"""Run one claimed job and record its outcome: complete, or fail (terminal / retriable)."""
|
|
51
|
+
result: Any = None
|
|
52
|
+
with Contained() as box:
|
|
53
|
+
result = execute(payload)
|
|
54
|
+
error = box.error
|
|
55
|
+
if error is None:
|
|
56
|
+
store.complete(job_id, result)
|
|
57
|
+
else:
|
|
58
|
+
store.fail(
|
|
59
|
+
job_id,
|
|
60
|
+
str(error),
|
|
61
|
+
retriable=isinstance(error, RetriableError),
|
|
62
|
+
max_attempts=max_attempts,
|
|
63
|
+
)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Fault isolation — contain a unit of work's failure instead of letting it kill the loop.
|
|
2
|
+
|
|
3
|
+
A drain loop / worker / handler must not die because one job raised. :class:`Contained` records an
|
|
4
|
+
ordinary exception via the context-manager protocol, so there is exactly ONE audited place where
|
|
5
|
+
arbitrary failures stop (no scattered ``except Exception``). A ``BaseException`` that is not an
|
|
6
|
+
``Exception`` (``KeyboardInterrupt`` / ``SystemExit``) still propagates, so shutdown works.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from types import TracebackType
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Contained:
|
|
15
|
+
"""Context manager that records, rather than raises, an ordinary exception.
|
|
16
|
+
|
|
17
|
+
Read ``.error`` after the block to handle what happened::
|
|
18
|
+
|
|
19
|
+
with Contained() as box:
|
|
20
|
+
result = risky()
|
|
21
|
+
if box.error is not None:
|
|
22
|
+
... # handle the failure (``result`` is unset)
|
|
23
|
+
|
|
24
|
+
It contains failures through ``__exit__`` (returning ``True`` swallows) rather than
|
|
25
|
+
``except Exception``, so there is exactly one audited boundary where arbitrary failures stop.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(self) -> None:
|
|
29
|
+
self.error: Exception | None = None
|
|
30
|
+
|
|
31
|
+
def __enter__(self) -> Contained:
|
|
32
|
+
return self
|
|
33
|
+
|
|
34
|
+
def __exit__(
|
|
35
|
+
self,
|
|
36
|
+
exc_type: type[BaseException] | None,
|
|
37
|
+
exc: BaseException | None,
|
|
38
|
+
traceback: TracebackType | None,
|
|
39
|
+
) -> bool:
|
|
40
|
+
if exc is None or not isinstance(exc, Exception):
|
|
41
|
+
return False # clean exit, or a BaseException we must let propagate
|
|
42
|
+
self.error = exc
|
|
43
|
+
return True # contained: swallow the exception
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Optional observability facade — OpenTelemetry traces + Sentry errors + structured JSON logs.
|
|
2
|
+
|
|
3
|
+
Opt-in, OFF by default: nothing leaves the box unless the operator sets an env var. Construct one
|
|
4
|
+
:class:`Observe` per service with a service name + an env-var prefix; until a DSN/endpoint is set,
|
|
5
|
+
every call is a zero-cost no-op, so instrumentation can live permanently at the call sites. The
|
|
6
|
+
export SDKs are loaded via :func:`importlib.import_module` (a runtime call, so this module — and the
|
|
7
|
+
whole package — stays import-clean and dependency-free without them); the facade degrades to no-ops
|
|
8
|
+
when they are absent.
|
|
9
|
+
|
|
10
|
+
This owns the parts every service shares: traces, error reporting, JSON log formatting. A service
|
|
11
|
+
that also exports *metrics* builds its own meter, reusing identity via :meth:`Observe.resource`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import importlib
|
|
17
|
+
import json
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
from collections.abc import Iterator
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
|
|
27
|
+
class ObserveConfig:
|
|
28
|
+
"""Per-service observability settings.
|
|
29
|
+
|
|
30
|
+
``env_prefix`` namespaces the opt-in vars: ``<PREFIX>_SENTRY_DSN`` (falling back to the standard
|
|
31
|
+
``SENTRY_DSN``) enables Sentry, ``<PREFIX>_LOG_JSON`` turns on JSON logs; OTLP traces follow the
|
|
32
|
+
standard ``OTEL_EXPORTER_OTLP_ENDPOINT``. ``pip_hint`` is shown when an export is requested but
|
|
33
|
+
the optional SDK extra is not installed. ``tracer_name`` names the tracer/logger.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
service_name: str
|
|
37
|
+
env_prefix: str
|
|
38
|
+
pip_hint: str
|
|
39
|
+
tracer_name: str
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _JsonFormatter(logging.Formatter):
|
|
43
|
+
"""One-line JSON per record (ts/level/logger/msg + service/command) so logs are parseable
|
|
44
|
+
and shippable — searchable across containers, not just grep on one box."""
|
|
45
|
+
|
|
46
|
+
def __init__(self, service: str, command: str | None) -> None:
|
|
47
|
+
super().__init__()
|
|
48
|
+
self._base: dict[str, str] = {"service": service}
|
|
49
|
+
if command:
|
|
50
|
+
self._base["command"] = command
|
|
51
|
+
|
|
52
|
+
def format(self, record: logging.LogRecord) -> str:
|
|
53
|
+
payload: dict[str, object] = {
|
|
54
|
+
"ts": self.formatTime(record),
|
|
55
|
+
"level": record.levelname,
|
|
56
|
+
"logger": record.name,
|
|
57
|
+
"msg": record.getMessage(),
|
|
58
|
+
**self._base,
|
|
59
|
+
}
|
|
60
|
+
if record.exc_info:
|
|
61
|
+
payload["exc"] = self.formatException(record.exc_info)
|
|
62
|
+
return json.dumps(payload)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class Observe:
|
|
66
|
+
"""A per-service observability facade: traces (OTel), errors (Sentry), JSON logs.
|
|
67
|
+
|
|
68
|
+
Construct with an :class:`ObserveConfig`, call :meth:`setup` once at process start, then use
|
|
69
|
+
:meth:`span` / :meth:`capture_exception` freely — both no-op when export is off. The
|
|
70
|
+
``build_sentry`` / ``build_tracer`` methods are the seams tests patch.
|
|
71
|
+
"""
|
|
72
|
+
|
|
73
|
+
def __init__(self, config: ObserveConfig) -> None:
|
|
74
|
+
self._cfg = config
|
|
75
|
+
self._log = logging.getLogger(config.tracer_name)
|
|
76
|
+
self._sentry: Any = None
|
|
77
|
+
self._tracer: Any = None
|
|
78
|
+
|
|
79
|
+
def setup(self, *, command: str | None = None) -> None:
|
|
80
|
+
"""Wire whichever exporters the environment opts into. Idempotent — resets first, so it can
|
|
81
|
+
be called once per process or re-run in tests."""
|
|
82
|
+
self.reset()
|
|
83
|
+
self._init_logging(command)
|
|
84
|
+
self._init_sentry()
|
|
85
|
+
self._init_tracer(command)
|
|
86
|
+
|
|
87
|
+
def reset(self) -> None:
|
|
88
|
+
"""Deactivate every exporter (before a re-:meth:`setup` and in tests)."""
|
|
89
|
+
self._sentry = None
|
|
90
|
+
self._tracer = None
|
|
91
|
+
|
|
92
|
+
def is_active(self) -> dict[str, bool]:
|
|
93
|
+
"""Which exporters are live — for a health check and asserted by tests."""
|
|
94
|
+
return {"sentry": self._sentry is not None, "otel": self._tracer is not None}
|
|
95
|
+
|
|
96
|
+
@contextmanager
|
|
97
|
+
def span(self, name: str, **attributes: object) -> Iterator[Any]:
|
|
98
|
+
"""Trace the wrapped block as one OTel span (with ``attributes``). A no-op context yielding
|
|
99
|
+
``None`` when tracing is off, so call sites need no guard."""
|
|
100
|
+
tracer = self._tracer
|
|
101
|
+
if tracer is None:
|
|
102
|
+
yield None
|
|
103
|
+
return
|
|
104
|
+
with tracer.start_as_current_span(name) as active:
|
|
105
|
+
for key, value in attributes.items():
|
|
106
|
+
active.set_attribute(key, value)
|
|
107
|
+
yield active
|
|
108
|
+
|
|
109
|
+
def capture_exception(self, exc: BaseException, **tags: object) -> None:
|
|
110
|
+
"""Report a caught exception to Sentry with ``tags`` for grouping. No-op when off — the
|
|
111
|
+
caller still records its own failure normally."""
|
|
112
|
+
sentry = self._sentry
|
|
113
|
+
if sentry is None:
|
|
114
|
+
return
|
|
115
|
+
with sentry.new_scope() as scope:
|
|
116
|
+
for key, value in tags.items():
|
|
117
|
+
scope.set_tag(key, str(value))
|
|
118
|
+
sentry.capture_exception(exc)
|
|
119
|
+
|
|
120
|
+
def otlp_endpoint(self) -> str | None:
|
|
121
|
+
"""The OTLP endpoint if set — a service building its own meter checks this."""
|
|
122
|
+
return os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or None
|
|
123
|
+
|
|
124
|
+
def resource(self, **extra: Any) -> Any:
|
|
125
|
+
"""An OTel ``Resource`` carrying ``service.name`` + ``extra`` — so a service's own meter
|
|
126
|
+
shares this service's identity. Requires the opentelemetry SDK."""
|
|
127
|
+
resources = importlib.import_module("opentelemetry.sdk.resources")
|
|
128
|
+
return resources.Resource.create({"service.name": self._cfg.service_name, **extra})
|
|
129
|
+
|
|
130
|
+
def _init_logging(self, command: str | None) -> None:
|
|
131
|
+
if not os.environ.get(f"{self._cfg.env_prefix}_LOG_JSON"):
|
|
132
|
+
return
|
|
133
|
+
fmt = _JsonFormatter(self._cfg.service_name, command)
|
|
134
|
+
root = logging.getLogger()
|
|
135
|
+
if not root.handlers:
|
|
136
|
+
root.addHandler(logging.StreamHandler())
|
|
137
|
+
for handler in root.handlers:
|
|
138
|
+
handler.setFormatter(fmt)
|
|
139
|
+
|
|
140
|
+
def _init_sentry(self) -> None:
|
|
141
|
+
dsn = os.environ.get(f"{self._cfg.env_prefix}_SENTRY_DSN") or os.environ.get("SENTRY_DSN")
|
|
142
|
+
if not dsn:
|
|
143
|
+
return
|
|
144
|
+
try:
|
|
145
|
+
self._sentry = self.build_sentry(dsn)
|
|
146
|
+
except ImportError:
|
|
147
|
+
self._log.warning("SENTRY_DSN set but sentry-sdk is missing — %s", self._cfg.pip_hint)
|
|
148
|
+
return
|
|
149
|
+
self._log.info("Sentry error reporting enabled")
|
|
150
|
+
|
|
151
|
+
def _init_tracer(self, command: str | None) -> None:
|
|
152
|
+
endpoint = self.otlp_endpoint()
|
|
153
|
+
if not endpoint:
|
|
154
|
+
return
|
|
155
|
+
try:
|
|
156
|
+
self._tracer = self.build_tracer(command)
|
|
157
|
+
except ImportError:
|
|
158
|
+
self._log.warning(
|
|
159
|
+
"OTEL_EXPORTER_OTLP_ENDPOINT set but opentelemetry is missing — %s",
|
|
160
|
+
self._cfg.pip_hint,
|
|
161
|
+
)
|
|
162
|
+
return
|
|
163
|
+
self._log.info("OpenTelemetry export enabled -> %s", endpoint)
|
|
164
|
+
|
|
165
|
+
def build_sentry(self, dsn: str) -> Any:
|
|
166
|
+
"""Initialise the real Sentry SDK (errors only; OTel owns traces). The dynamic import is the
|
|
167
|
+
seam tests patch + the ImportError surface when the extra is absent."""
|
|
168
|
+
sentry_sdk = importlib.import_module("sentry_sdk")
|
|
169
|
+
sentry_sdk.init(dsn=dsn, send_default_pii=False, traces_sample_rate=0.0)
|
|
170
|
+
return sentry_sdk
|
|
171
|
+
|
|
172
|
+
def build_tracer(self, command: str | None) -> Any:
|
|
173
|
+
"""Register an OTLP-exporting tracer provider over this service's resource; return its
|
|
174
|
+
tracer. Endpoint/headers are read from the standard ``OTEL_EXPORTER_OTLP_*`` env."""
|
|
175
|
+
trace = importlib.import_module("opentelemetry.trace")
|
|
176
|
+
otlp = importlib.import_module("opentelemetry.exporter.otlp.proto.http.trace_exporter")
|
|
177
|
+
sdk_trace = importlib.import_module("opentelemetry.sdk.trace")
|
|
178
|
+
sdk_export = importlib.import_module("opentelemetry.sdk.trace.export")
|
|
179
|
+
extra = {f"{self._cfg.tracer_name}.command": command} if command else {}
|
|
180
|
+
provider = sdk_trace.TracerProvider(resource=self.resource(**extra))
|
|
181
|
+
provider.add_span_processor(sdk_export.BatchSpanProcessor(otlp.OTLPSpanExporter()))
|
|
182
|
+
trace.set_tracer_provider(provider)
|
|
183
|
+
return trace.get_tracer(self._cfg.tracer_name)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Provenance — the reproducibility record stamped on a completed job.
|
|
2
|
+
|
|
3
|
+
Generic across apps (a tool invocation, an ML transcription, any dispatched compute): every field
|
|
4
|
+
that makes a run reproducible is mandatory unless genuinely inapplicable (no GPU on a CPU tool, no
|
|
5
|
+
seed on a deterministic one), and those exceptions are explicit ``None``, never silent omissions —
|
|
6
|
+
a job that cannot state its provenance cannot commit. Invalid provenance is unrepresentable: a
|
|
7
|
+
seed/determinism contradiction is rejected at construction.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import enum
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
|
|
15
|
+
from .routing import BackendKind, NodeIdentity
|
|
16
|
+
|
|
17
|
+
__all__ = ["Determinism", "GpuContext", "Provenance"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Determinism(enum.StrEnum):
|
|
21
|
+
"""How reproducible a tool's output is — recorded so non-determinism is declared, not hidden."""
|
|
22
|
+
|
|
23
|
+
DETERMINISTIC = "deterministic"
|
|
24
|
+
"""Same inputs always give bit-identical outputs."""
|
|
25
|
+
SEEDED = "seeded"
|
|
26
|
+
"""Reproducible given the recorded seed."""
|
|
27
|
+
NONDETERMINISTIC = "nondeterministic"
|
|
28
|
+
"""Output varies run-to-run (e.g. unseeded sampling); flagged for ensemble handling."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True, slots=True)
|
|
32
|
+
class GpuContext:
|
|
33
|
+
"""The GPU a job ran on; ``None`` at the Provenance level means a CPU-only job."""
|
|
34
|
+
|
|
35
|
+
model: str
|
|
36
|
+
vram_gb: float
|
|
37
|
+
|
|
38
|
+
def __post_init__(self) -> None:
|
|
39
|
+
if self.vram_gb <= 0:
|
|
40
|
+
raise ValueError(f"GPU vram_gb must be positive; got {self.vram_gb}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True, slots=True)
|
|
44
|
+
class Provenance:
|
|
45
|
+
"""The full reproducibility record stamped onto a completed job.
|
|
46
|
+
|
|
47
|
+
Hashes (weights sha256, params, input) and reference-data versions let a re-run be checked for
|
|
48
|
+
drift; ``determinism`` plus ``seed`` say whether an identical result is even expected. A
|
|
49
|
+
``NONDETERMINISTIC`` job with a seed, or a ``SEEDED`` job without one, is a contradiction and is
|
|
50
|
+
rejected at construction — invalid provenance is unrepresentable.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
tool_id: str
|
|
54
|
+
tool_version: str
|
|
55
|
+
weights_id: str | None
|
|
56
|
+
weights_sha256: str | None
|
|
57
|
+
params_hash: str
|
|
58
|
+
input_hash: str
|
|
59
|
+
reference_data_versions: dict[str, str]
|
|
60
|
+
container_digest: str | None
|
|
61
|
+
seed: int | None
|
|
62
|
+
determinism: Determinism
|
|
63
|
+
runtime_seconds: float
|
|
64
|
+
gpu: GpuContext | None = None
|
|
65
|
+
backend: BackendKind = BackendKind.LOCAL
|
|
66
|
+
"""Where compute ran. Defaults to LOCAL: a job with no remote dispatch ran here."""
|
|
67
|
+
node: NodeIdentity | None = None
|
|
68
|
+
"""The pinned identity of the node that ran the job; ``None`` only when the host is unknown."""
|
|
69
|
+
extra: dict[str, str] = field(default_factory=dict)
|
|
70
|
+
|
|
71
|
+
def __post_init__(self) -> None:
|
|
72
|
+
if not self.tool_id or not self.tool_version:
|
|
73
|
+
raise ValueError("provenance requires a tool_id and tool_version")
|
|
74
|
+
if not self.params_hash or not self.input_hash:
|
|
75
|
+
raise ValueError("provenance requires params_hash and input_hash")
|
|
76
|
+
if self.runtime_seconds < 0:
|
|
77
|
+
raise ValueError(f"runtime_seconds must be non-negative; got {self.runtime_seconds}")
|
|
78
|
+
self._validate_seed_determinism()
|
|
79
|
+
|
|
80
|
+
def _validate_seed_determinism(self) -> None:
|
|
81
|
+
if self.determinism is Determinism.SEEDED and self.seed is None:
|
|
82
|
+
raise ValueError("a SEEDED job must record its seed")
|
|
83
|
+
if self.determinism is Determinism.NONDETERMINISTIC and self.seed is not None:
|
|
84
|
+
raise ValueError(
|
|
85
|
+
"a NONDETERMINISTIC job must not carry a seed (a seed implies reproducibility)"
|
|
86
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dispatch-kit
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
|
|
5
5
|
Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -6,7 +6,11 @@ src/dispatch_kit/approval.py
|
|
|
6
6
|
src/dispatch_kit/budget.py
|
|
7
7
|
src/dispatch_kit/dispatch.py
|
|
8
8
|
src/dispatch_kit/egress.py
|
|
9
|
+
src/dispatch_kit/engine.py
|
|
9
10
|
src/dispatch_kit/estimate.py
|
|
11
|
+
src/dispatch_kit/faults.py
|
|
12
|
+
src/dispatch_kit/observe.py
|
|
13
|
+
src/dispatch_kit/provenance.py
|
|
10
14
|
src/dispatch_kit/py.typed
|
|
11
15
|
src/dispatch_kit/routing.py
|
|
12
16
|
src/dispatch_kit.egg-info/PKG-INFO
|
|
@@ -18,5 +22,9 @@ tests/test_approval.py
|
|
|
18
22
|
tests/test_budget.py
|
|
19
23
|
tests/test_dispatch.py
|
|
20
24
|
tests/test_egress.py
|
|
25
|
+
tests/test_engine.py
|
|
21
26
|
tests/test_estimate.py
|
|
27
|
+
tests/test_faults.py
|
|
28
|
+
tests/test_observe.py
|
|
29
|
+
tests/test_provenance.py
|
|
22
30
|
tests/test_routing.py
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from dispatch_kit import Lease, is_lease_stale, should_give_up
|
|
5
|
+
from dispatch_kit import Lease, RetriableError, is_lease_stale, should_give_up
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def test_lease_is_stale_only_past_the_ttl() -> None:
|
|
@@ -23,3 +23,13 @@ def test_lease_carries_its_attempt_count() -> None:
|
|
|
23
23
|
assert fresh.attempts == 0
|
|
24
24
|
recovered = Lease(job_id="j1", leased_at=200.0, attempts=1)
|
|
25
25
|
assert recovered.attempts == 1
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_retriable_error_marks_a_failure_for_retry() -> None:
|
|
29
|
+
# An executor raises RetriableError to opt a transient failure into re-queue (vs terminal);
|
|
30
|
+
# it is a plain Exception the engine catches like any other, routing it to fail(retriable=True).
|
|
31
|
+
assert issubclass(RetriableError, Exception)
|
|
32
|
+
try:
|
|
33
|
+
raise RetriableError("transient")
|
|
34
|
+
except RetriableError as exc:
|
|
35
|
+
assert str(exc) == "transient"
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""The drain engine over a fake in-memory JobStore: complete, terminal/retriable fail, isolation.
|
|
2
|
+
|
|
3
|
+
The fake store implements just enough of the contract to drive the engine. Its ``fail(retriable)``
|
|
4
|
+
re-queues immediately (no backoff), so a retriable job reaches the poison cap within one ``drain``
|
|
5
|
+
pass — a real store adds a backoff so a transient failure isn't burned to poison instantly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from dispatch_kit import RetriableError, drain
|
|
14
|
+
from dispatch_kit.dispatch import should_give_up
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _FakeStore:
|
|
18
|
+
def __init__(self, jobs: dict[str, dict[str, Any]]) -> None:
|
|
19
|
+
self.jobs = jobs # id -> {payload, lane, status, attempts}
|
|
20
|
+
self.results: dict[str, Any] = {}
|
|
21
|
+
self.errors: dict[str, str] = {}
|
|
22
|
+
|
|
23
|
+
def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
|
|
24
|
+
for job_id, job in self.jobs.items():
|
|
25
|
+
if job["status"] == "pending" and job["lane"] in lanes:
|
|
26
|
+
job["status"] = "working"
|
|
27
|
+
return job_id, job["payload"]
|
|
28
|
+
return None
|
|
29
|
+
|
|
30
|
+
def complete(self, job_id: str, result: Any) -> bool:
|
|
31
|
+
job = self.jobs.get(job_id)
|
|
32
|
+
if job is None or job["status"] != "working":
|
|
33
|
+
return False
|
|
34
|
+
job["status"] = "done"
|
|
35
|
+
self.results[job_id] = result
|
|
36
|
+
return True
|
|
37
|
+
|
|
38
|
+
def fail(self, job_id: str, error: str, *, retriable: bool, max_attempts: int) -> bool:
|
|
39
|
+
job = self.jobs.get(job_id)
|
|
40
|
+
if job is None or job["status"] != "working":
|
|
41
|
+
return False
|
|
42
|
+
if retriable:
|
|
43
|
+
job["attempts"] += 1
|
|
44
|
+
if should_give_up(job["attempts"], max_attempts):
|
|
45
|
+
job["status"] = "failed"
|
|
46
|
+
self.errors[job_id] = error
|
|
47
|
+
else:
|
|
48
|
+
job["status"] = "pending" # re-queue
|
|
49
|
+
else:
|
|
50
|
+
job["status"] = "failed"
|
|
51
|
+
self.errors[job_id] = error
|
|
52
|
+
return True
|
|
53
|
+
|
|
54
|
+
def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
|
|
55
|
+
return []
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _job(payload: str, *, lane: str = "ml") -> dict[str, Any]:
|
|
59
|
+
return {"payload": payload, "lane": lane, "status": "pending", "attempts": 0}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_drain_runs_every_pending_job_and_completes_it() -> None:
    """Two pending jobs in the drained lane are both executed and marked done."""
    queue = {"j1": _job("a"), "j2": _job("b")}
    store = _FakeStore(queue)

    count = drain(store, str.upper, ["ml"], max_attempts=3)

    assert count == 2
    assert store.results == {"j1": "A", "j2": "B"}
    statuses = [record["status"] for record in store.jobs.values()]
    assert statuses == ["done", "done"]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_terminal_failure_is_recorded_and_draining_continues() -> None:
    """A non-retriable error fails its job without stopping the rest of the drain."""

    def execute(payload: str) -> str:
        # Only the "bad" payload blows up; everything else is echoed back.
        if payload != "bad":
            return payload
        raise ValueError("nope")

    store = _FakeStore({"j1": _job("bad"), "j2": _job("ok")})
    drain(store, execute, ["ml"], max_attempts=3)

    broken = store.jobs["j1"]
    assert broken["status"] == "failed"
    assert "nope" in store.errors["j1"]
    # the good job still ran despite the bad one
    assert store.results == {"j2": "ok"}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def test_retriable_failure_requeues_then_poisons_at_the_cap() -> None:
    """A job that always raises ``RetriableError`` ends up failed at ``max_attempts``."""

    def execute(_payload: str) -> str:
        raise RetriableError("transient")

    store = _FakeStore({"j1": _job("x")})
    drain(store, execute, ["ml"], max_attempts=2)

    poisoned = store.jobs["j1"]
    assert poisoned["status"] == "failed"  # poisoned after the cap
    assert poisoned["attempts"] == 2
    assert "transient" in store.errors["j1"]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_drain_only_touches_its_lanes() -> None:
    """Draining the ``ml`` lane leaves a pending ``sync`` job alone."""
    queue = {
        "j1": _job("a", lane="ml"),
        "j2": _job("b", lane="sync"),
    }
    store = _FakeStore(queue)

    count = drain(store, str.upper, ["ml"], max_attempts=3)

    assert count == 1
    assert store.jobs["j1"]["status"] == "done"
    # the other lane is untouched
    assert store.jobs["j2"]["status"] == "pending"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def test_drain_returns_zero_when_nothing_runnable() -> None:
    """An empty store yields a handled count of zero."""
    empty_store = _FakeStore({})
    count = drain(empty_store, str.upper, ["ml"], max_attempts=3)
    assert count == 0
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Contained records an ordinary failure but lets a BaseException through."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from dispatch_kit import Contained
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_contained_records_an_ordinary_exception() -> None:
    """A ``ValueError`` raised inside the block is kept on ``box.error``."""
    with Contained() as box:
        raise ValueError("boom")

    caught = box.error
    assert isinstance(caught, ValueError)
    assert str(caught) == "boom"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_contained_is_clean_when_nothing_raises() -> None:
    """With no exception in the block, ``box.error`` stays ``None``."""
    with Contained() as box:
        ...  # nothing to do: the body must simply not raise
    assert box.error is None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_contained_lets_base_exceptions_propagate() -> None:
    """``KeyboardInterrupt`` escapes ``Contained`` instead of being recorded."""
    # KeyboardInterrupt / SystemExit must NOT be swallowed — shutdown + cancellation still work.
    with pytest.raises(KeyboardInterrupt):
        with Contained():
            raise KeyboardInterrupt
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""The observability facade is OFF by default and a safe no-op until an env var opts in.
|
|
2
|
+
|
|
3
|
+
The real SDKs are patched via the ``build_sentry`` / ``build_tracer`` seams, so these tests need
|
|
4
|
+
neither the export extras nor a network endpoint.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
|
|
14
|
+
from dispatch_kit.observe import Observe, ObserveConfig, _JsonFormatter
|
|
15
|
+
|
|
16
|
+
# Shared facade configuration: the "SVC" prefix is what makes the tests below
# use the SVC_SENTRY_DSN / SVC_LOG_JSON environment variable names.
_CFG = ObserveConfig(
    **{
        "service_name": "svc",
        "env_prefix": "SVC",
        "pip_hint": "pip install 'svc[observe]'",
        "tracer_name": "svc",
    }
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.fixture
def obs(monkeypatch: pytest.MonkeyPatch) -> Observe:
    """A fresh facade with every opt-in env var cleared."""
    opt_in_vars = (
        "SVC_SENTRY_DSN",
        "SENTRY_DSN",
        "OTEL_EXPORTER_OTLP_ENDPOINT",
        "SVC_LOG_JSON",
    )
    for name in opt_in_vars:
        # raising=False: the variable may legitimately be absent already.
        monkeypatch.delenv(name, raising=False)
    return Observe(_CFG)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class _FakeScope:
|
|
33
|
+
def __init__(self) -> None:
|
|
34
|
+
self.tags: dict[str, str] = {}
|
|
35
|
+
|
|
36
|
+
def __enter__(self) -> _FakeScope:
|
|
37
|
+
return self
|
|
38
|
+
|
|
39
|
+
def __exit__(self, *_exc: object) -> bool:
|
|
40
|
+
return False
|
|
41
|
+
|
|
42
|
+
def set_tag(self, key: str, value: str) -> None:
|
|
43
|
+
self.tags[key] = value
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class _FakeSentry:
    """Test double that records captured exceptions with the current scope's tags."""

    def __init__(self) -> None:
        self.captured: list[tuple[BaseException, dict[str, str]]] = []
        self._scope = _FakeScope()

    def new_scope(self) -> _FakeScope:
        """Replace the current scope with a fresh one and return it."""
        fresh = _FakeScope()
        self._scope = fresh
        return fresh

    def capture_exception(self, exc: BaseException) -> None:
        """Append ``exc`` with a snapshot of the current scope's tags."""
        tag_snapshot = dict(self._scope.tags)  # copy so later tag changes don't leak in
        self.captured.append((exc, tag_snapshot))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class _FakeSpan:
|
|
60
|
+
def __init__(self, name: str) -> None:
|
|
61
|
+
self.name = name
|
|
62
|
+
self.attrs: dict[str, object] = {}
|
|
63
|
+
|
|
64
|
+
def __enter__(self) -> _FakeSpan:
|
|
65
|
+
return self
|
|
66
|
+
|
|
67
|
+
def __exit__(self, *_exc: object) -> bool:
|
|
68
|
+
return False
|
|
69
|
+
|
|
70
|
+
def set_attribute(self, key: str, value: object) -> None:
|
|
71
|
+
self.attrs[key] = value
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class _FakeTracer:
    """Test double for a tracer that keeps every span it hands out, in order."""

    def __init__(self) -> None:
        self.spans: list[_FakeSpan] = []

    def start_as_current_span(self, name: str) -> _FakeSpan:
        """Create a ``_FakeSpan`` called ``name``, remember it, and return it."""
        created = _FakeSpan(name)
        self.spans += [created]
        return created
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def test_inactive_by_default_is_a_safe_noop(obs: Observe) -> None:
    """With no opt-in env var set, both backends are off and calls do nothing."""
    obs.setup()

    expected = {"sentry": False, "otel": False}
    assert obs.is_active() == expected

    with obs.span("op", tag="t") as active:
        assert active is None

    # must not raise
    obs.capture_exception(ValueError("x"), op="t")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_sentry_opt_in_forwards_exception_with_tags(
    obs: Observe, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Setting ``SVC_SENTRY_DSN`` activates Sentry and forwards the exception with its tags."""
    fake = _FakeSentry()

    def _build(_dsn: str) -> _FakeSentry:
        return fake

    monkeypatch.setenv("SVC_SENTRY_DSN", "https://k@example.test/1")
    monkeypatch.setattr(obs, "build_sentry", _build)
    obs.setup()
    assert obs.is_active()["sentry"] is True

    err = ValueError("boom")
    obs.capture_exception(err, op="x", id="r1")

    expected_tags = {"op": "x", "id": "r1"}
    assert fake.captured == [(err, expected_tags)]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_otel_opt_in_records_spans(obs: Observe, monkeypatch: pytest.MonkeyPatch) -> None:
    """Setting ``OTEL_EXPORTER_OTLP_ENDPOINT`` activates tracing and records a span."""
    tracer = _FakeTracer()

    def _build(_cmd: str) -> _FakeTracer:
        return tracer

    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
    monkeypatch.setattr(obs, "build_tracer", _build)
    obs.setup(command="work")
    assert obs.is_active()["otel"] is True

    with obs.span("job", op="x") as active:
        assert active is not None

    first = tracer.spans[0]
    assert first.name == "job"
    assert first.attrs == {"op": "x"}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def test_missing_extra_logs_and_degrades(
    obs: Observe, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture
) -> None:
    """An ``ImportError`` from the Sentry builder leaves Sentry off and logs a hint."""

    def _no_sdk(_dsn: str) -> object:
        raise ImportError("no sentry_sdk")

    monkeypatch.setenv("SVC_SENTRY_DSN", "https://k@example.test/1")
    monkeypatch.setattr(obs, "build_sentry", _no_sdk)

    with caplog.at_level(logging.WARNING):
        obs.setup()

    assert obs.is_active()["sentry"] is False
    logged = [rec.message for rec in caplog.records]
    assert any("sentry-sdk" in text for text in logged)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def test_setup_is_idempotent_and_resets(obs: Observe, monkeypatch: pytest.MonkeyPatch) -> None:
    """Calling ``setup`` again after the DSN is removed turns Sentry back off."""
    fake = _FakeSentry()

    def _build(_dsn: str) -> _FakeSentry:
        return fake

    monkeypatch.setenv("SVC_SENTRY_DSN", "https://k@example.test/1")
    monkeypatch.setattr(obs, "build_sentry", _build)

    # First pass: the DSN is present, so Sentry comes up.
    obs.setup()
    assert obs.is_active()["sentry"] is True

    # Second pass: the DSN is gone, so the earlier activation must not stick.
    monkeypatch.delenv("SVC_SENTRY_DSN")
    obs.setup()
    assert obs.is_active()["sentry"] is False
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_json_formatter_emits_structured_record() -> None:
    """The formatter emits JSON with service, command, level, logger, msg and ts."""
    record = logging.LogRecord(
        name="svc.x",
        level=logging.INFO,
        pathname="f.py",
        lineno=1,
        msg="hello %s",
        args=("world",),
        exc_info=None,
    )
    rendered = _JsonFormatter("svc", "work").format(record)
    payload = json.loads(rendered)

    expected_fields = {
        "service": "svc",
        "command": "work",
        "level": "INFO",
        "logger": "svc.x",
        "msg": "hello world",  # %-args are interpolated before serialising
    }
    for field, value in expected_fields.items():
        assert payload[field] == value
    assert "ts" in payload
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Provenance is a reproducibility record whose invalid states are unrepresentable."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from dispatch_kit import BackendKind, Determinism, GpuContext, Provenance
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _prov(
    *,
    tool_id: str = "tool",
    params_hash: str = "p",
    runtime_seconds: float = 1.0,
    determinism: Determinism = Determinism.DETERMINISTIC,
    seed: int | None = None,
) -> Provenance:
    """Build a ``Provenance`` with fixed filler values, varying only the given fields."""
    fields = {
        "tool_id": tool_id,
        "tool_version": "1.0",
        "weights_id": None,
        "weights_sha256": None,
        "params_hash": params_hash,
        "input_hash": "i",
        "reference_data_versions": {},
        "container_digest": None,
        "seed": seed,
        "determinism": determinism,
        "runtime_seconds": runtime_seconds,
    }
    return Provenance(**fields)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_a_valid_deterministic_provenance_constructs() -> None:
    """The default record builds, with a LOCAL backend and no GPU context."""
    record = _prov()
    # ran here unless a remote dispatch says otherwise
    assert record.backend is BackendKind.LOCAL
    # CPU-only by default
    assert record.gpu is None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_missing_tool_or_hashes_are_rejected() -> None:
    """An empty ``tool_id`` or ``params_hash`` raises a ``ValueError`` naming the field."""
    for field in ("tool_id", "params_hash"):
        with pytest.raises(ValueError, match=field):
            _prov(**{field: ""})
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_negative_runtime_is_rejected() -> None:
    """A negative ``runtime_seconds`` raises a ``ValueError`` naming the field."""
    negative = -1.0
    with pytest.raises(ValueError, match="runtime_seconds"):
        _prov(runtime_seconds=negative)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_seed_determinism_contradictions_are_rejected() -> None:
    """SEEDED without a seed, and NONDETERMINISTIC with one, are both rejected."""
    contradictions = (
        (Determinism.SEEDED, None, "must record its seed"),
        (Determinism.NONDETERMINISTIC, 42, "must not carry a seed"),
    )
    for mode, seed, message in contradictions:
        with pytest.raises(ValueError, match=message):
            _prov(determinism=mode, seed=seed)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_gpu_vram_must_be_positive() -> None:
    """Zero ``vram_gb`` is rejected; a positive value is stored unchanged."""
    with pytest.raises(ValueError, match="vram_gb"):
        GpuContext(model="L4", vram_gb=0.0)

    gpu = GpuContext(model="L4", vram_gb=24.0)
    assert gpu.vram_gb == 24.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|