dispatch-kit 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/PKG-INFO +1 -1
  2. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/pyproject.toml +4 -2
  3. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/__init__.py +13 -0
  4. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/dispatch.py +29 -7
  5. dispatch_kit-0.3.0/src/dispatch_kit/engine.py +63 -0
  6. dispatch_kit-0.3.0/src/dispatch_kit/faults.py +43 -0
  7. dispatch_kit-0.3.0/src/dispatch_kit/observe.py +183 -0
  8. dispatch_kit-0.3.0/src/dispatch_kit/provenance.py +86 -0
  9. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/PKG-INFO +1 -1
  10. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/SOURCES.txt +8 -0
  11. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_dispatch.py +11 -1
  12. dispatch_kit-0.3.0/tests/test_engine.py +103 -0
  13. dispatch_kit-0.3.0/tests/test_faults.py +26 -0
  14. dispatch_kit-0.3.0/tests/test_observe.py +151 -0
  15. dispatch_kit-0.3.0/tests/test_provenance.py +61 -0
  16. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/README.md +0 -0
  17. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/setup.cfg +0 -0
  18. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/approval.py +0 -0
  19. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/budget.py +0 -0
  20. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/egress.py +0 -0
  21. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/estimate.py +0 -0
  22. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/py.typed +0 -0
  23. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit/routing.py +0 -0
  24. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/dependency_links.txt +0 -0
  25. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/requires.txt +0 -0
  26. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/src/dispatch_kit.egg-info/top_level.txt +0 -0
  27. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_approval.py +0 -0
  28. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_budget.py +0 -0
  29. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_egress.py +0 -0
  30. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_estimate.py +0 -0
  31. {dispatch_kit-0.1.0 → dispatch_kit-0.3.0}/tests/test_routing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dispatch-kit
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
5
5
  Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dispatch-kit"
7
- version = "0.1.0"
7
+ version = "0.3.0"
8
8
  description = "Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress."
9
9
  authors = [{ name = "Aryan Falahatpisheh", email = "aryanfalahat@gmail.com" }]
10
10
  readme = "README.md"
@@ -77,6 +77,8 @@ source-roots = ["src"]
77
77
  max-line-length = 100
78
78
 
79
79
  [tool.pylint.design]
80
- # Pure value-object domain: frozen data records with few/no methods are expected.
80
+ # Pure value-object domain: frozen data records with few/no methods and, for a full
81
+ # reproducibility record (Provenance), many fields — are expected, not a design smell.
81
82
  min-public-methods = 0
82
83
  max-args = 6
84
+ max-attributes = 15
@@ -30,6 +30,7 @@ from .dispatch import (
30
30
  DispatchError,
31
31
  JobStore,
32
32
  Lease,
33
+ RetriableError,
33
34
  Transport,
34
35
  WorkerExecutor,
35
36
  is_lease_stale,
@@ -42,7 +43,11 @@ from .egress import (
42
43
  SecretRef,
43
44
  log_egress,
44
45
  )
46
+ from .engine import drain
45
47
  from .estimate import CostEstimate, HostCapabilities, vram_fits
48
+ from .faults import Contained
49
+ from .observe import Observe, ObserveConfig
50
+ from .provenance import Determinism, GpuContext, Provenance
46
51
  from .routing import (
47
52
  BackendCapabilities,
48
53
  BackendKind,
@@ -62,16 +67,23 @@ __all__ = [
62
67
  "BudgetCap",
63
68
  "BudgetState",
64
69
  "BudgetWindow",
70
+ "Contained",
65
71
  "CostEstimate",
66
72
  "CostRates",
73
+ "Determinism",
67
74
  "DispatchError",
68
75
  "EnvLookup",
69
76
  "ExternalEndpoint",
77
+ "GpuContext",
70
78
  "HostCapabilities",
71
79
  "JobStore",
72
80
  "Lease",
73
81
  "NoEligibleBackendError",
74
82
  "NodeIdentity",
83
+ "Observe",
84
+ "ObserveConfig",
85
+ "Provenance",
86
+ "RetriableError",
75
87
  "Routable",
76
88
  "SecretMissingError",
77
89
  "SecretRef",
@@ -79,6 +91,7 @@ __all__ = [
79
91
  "Transport",
80
92
  "WorkerExecutor",
81
93
  "admits",
94
+ "drain",
82
95
  "estimate_cost",
83
96
  "is_lease_stale",
84
97
  "log_egress",
@@ -2,8 +2,9 @@
2
2
 
3
3
  Both a PULL worker (it polls: claim a job, run it, complete it) and a PUSH orchestrator (it posts a
4
4
  job to a worker and waits) need the SAME guarantees: a job is CLAIMED atomically so it runs exactly
5
- once, a stale result is REJECTED, and a worker that dies mid-job has its lease RECOVERED. This
6
- module is the shared CONTRACT the pure lease rules + the store/transport/worker protocols. The
5
+ once, a stale result is REJECTED, a clean failure is RECORDED (terminally, or retriably so it is
6
+ re-queued up to a cap), and a worker that dies mid-job has its lease RECOVERED. This module is the
7
+ shared CONTRACT — the pure lease rules + the store/transport/worker protocols. The
7
8
  store (SQLite vs SQLAlchemy), the transport (pull vs push), and the payload (a transcribe request vs
8
9
  a tool invocation) are per-app ADAPTERS, so two apps converge on one model WITHOUT a shared DB.
9
10
 
@@ -22,6 +23,15 @@ class DispatchError(RuntimeError):
22
23
  """A dispatch invariant was violated (a stale complete, or a job recovered past its cap)."""
23
24
 
24
25
 
26
+ class RetriableError(Exception):
27
+ """An executor raises this to mark a failure TRANSIENT, so the engine re-queues the job for
28
+ another attempt (bumping the attempt count, then poisoning past the cap) instead of failing it
29
+ terminally. Use it for an infrastructure hiccup a retry can fix — a timeout, an OOM, a rate
30
+ limit — NOT for a bad input or a logic error, where retrying only wastes attempts. Any OTHER
31
+ exception from the executor is treated as terminal.
32
+ """
33
+
34
+
25
35
  @dataclass(frozen=True, slots=True)
26
36
  class Lease:
27
37
  """A claim on a job: when it was leased and how many times it has been recovered.
@@ -56,15 +66,17 @@ def should_give_up(attempts: int, max_attempts: int) -> bool:
56
66
 
57
67
 
58
68
  class JobStore(Protocol):
59
- """The authoritative job store — the ONE place a job's claim/complete is decided, ATOMICALLY.
69
+ """The authoritative job store — the ONE place claim/complete/fail is decided, ATOMICALLY.
60
70
 
61
71
  ``claim`` must be atomic (one transaction / compare-and-set): two concurrent workers can never
62
72
  both claim the same job — that single claim is the run-exactly-once guarantee. ``complete`` must
63
73
  REJECT a result for a job not currently leased/running (a stale resubmit after the lease was
64
- recovered), returning ``False`` so a re-run cannot clobber a fresh result. ``recover_stale``
65
- re-leases jobs whose lease outlived the TTL (see :func:`is_lease_stale`), bumping the attempt
66
- count and failing a job past ``max_attempts`` (see :func:`should_give_up`). The payload type is
67
- the app's own (a transcribe request, a tool invocation, ...).
74
+ recovered), returning ``False`` so a re-run cannot clobber a fresh result. ``fail`` records a
75
+ clean failure: terminal (a bad input), or retriable re-queued for another attempt, bumping
76
+ attempts and poisoned past the cap (like a recovered crash). ``recover_stale`` re-leases jobs
77
+ whose lease outlived the TTL (see :func:`is_lease_stale`), bumping the attempt count and
78
+ failing a job past ``max_attempts`` (see :func:`should_give_up`). The payload type is the
79
+ app's own (a transcribe request, a tool invocation, ...).
68
80
  """
69
81
 
70
82
  def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
@@ -73,6 +85,16 @@ class JobStore(Protocol):
73
85
  def complete(self, job_id: str, result: Any) -> bool:
74
86
  """Apply a result IFF the job is leased/running; ``False`` if stale (done/recovered)."""
75
87
 
88
+ def fail(self, job_id: str, error: str, *, retriable: bool, max_attempts: int) -> bool:
89
+ """Record a clean failure of a leased job; ``False`` if it was not leased/running.
90
+
91
+ ``retriable=False`` fails it terminally now — a bad input or logic error a retry cannot
92
+ fix. ``retriable=True`` returns it to the queue for another attempt, bumping the attempt
93
+ count; once it has been attempted ``max_attempts`` times it is failed terminally instead
94
+ (the poison rule of :func:`should_give_up`), so a persistently-transient job can never
95
+ loop forever.
96
+ """
97
+
76
98
  def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
77
99
  """Re-lease jobs whose lease is stale (return their ids); fail those past the cap."""
78
100
 
@@ -0,0 +1,63 @@
1
+ """The run-exactly-once engine — drain a :class:`~dispatch_kit.dispatch.JobStore`.
2
+
3
+ Pure coordination over the dispatch ports: :func:`drain` owns the claim -> run -> record skeleton
4
+ AND the failure classification, so every app gets robust draining without re-implementing it. The
5
+ JobStore (persistence + atomic claim + retry timing) and the ``execute`` callable (the actual work)
6
+ are the app's adapters — pull a transcribe request, run a tool invocation, whatever; the engine only
7
+ coordinates.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from collections.abc import Callable, Sequence
13
+ from typing import Any
14
+
15
+ from .dispatch import JobStore, RetriableError
16
+ from .faults import Contained
17
+
18
+
19
+ def drain(
20
+ store: JobStore,
21
+ execute: Callable[[Any], Any],
22
+ lanes: Sequence[str],
23
+ *,
24
+ max_attempts: int,
25
+ ) -> int:
26
+ """Claim and run every currently-runnable job in ``lanes``; return how many were handled.
27
+
28
+ Each job is isolated (:class:`~dispatch_kit.faults.Contained`): a failure is recorded —
29
+ terminal, or retriable if ``execute`` raised :class:`RetriableError` — and the drain CONTINUES,
30
+ so one bad job never stalls the queue. Loops until ``claim`` returns ``None``, so call it on a
31
+ poll interval. Whether a retriable re-queue is retried this pass or later (a backoff) is the
32
+ store's policy, not the engine's.
33
+ """
34
+ handled = 0
35
+ while (claimed := store.claim(lanes)) is not None:
36
+ job_id, payload = claimed
37
+ _record(store, execute, job_id, payload, max_attempts=max_attempts)
38
+ handled += 1
39
+ return handled
40
+
41
+
42
+ def _record(
43
+ store: JobStore,
44
+ execute: Callable[[Any], Any],
45
+ job_id: str,
46
+ payload: Any,
47
+ *,
48
+ max_attempts: int,
49
+ ) -> None:
50
+ """Run one claimed job and record its outcome: complete, or fail (terminal / retriable)."""
51
+ result: Any = None
52
+ with Contained() as box:
53
+ result = execute(payload)
54
+ error = box.error
55
+ if error is None:
56
+ store.complete(job_id, result)
57
+ else:
58
+ store.fail(
59
+ job_id,
60
+ str(error),
61
+ retriable=isinstance(error, RetriableError),
62
+ max_attempts=max_attempts,
63
+ )
@@ -0,0 +1,43 @@
1
+ """Fault isolation — contain a unit of work's failure instead of letting it kill the loop.
2
+
3
+ A drain loop / worker / handler must not die because one job raised. :class:`Contained` records an
4
+ ordinary exception via the context-manager protocol, so there is exactly ONE audited place where
5
+ arbitrary failures stop (no scattered ``except Exception``). A ``BaseException`` that is not an
6
+ ``Exception`` (``KeyboardInterrupt`` / ``SystemExit``) still propagates, so shutdown works.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from types import TracebackType
12
+
13
+
14
+ class Contained:
15
+ """Context manager that records, rather than raises, an ordinary exception.
16
+
17
+ Read ``.error`` after the block to handle what happened::
18
+
19
+ with Contained() as box:
20
+ result = risky()
21
+ if box.error is not None:
22
+ ... # handle the failure (``result`` is unset)
23
+
24
+ It contains failures through ``__exit__`` (returning ``True`` swallows) rather than
25
+ ``except Exception``, so there is exactly one audited boundary where arbitrary failures stop.
26
+ """
27
+
28
+ def __init__(self) -> None:
29
+ self.error: Exception | None = None
30
+
31
+ def __enter__(self) -> Contained:
32
+ return self
33
+
34
+ def __exit__(
35
+ self,
36
+ exc_type: type[BaseException] | None,
37
+ exc: BaseException | None,
38
+ traceback: TracebackType | None,
39
+ ) -> bool:
40
+ if exc is None or not isinstance(exc, Exception):
41
+ return False # clean exit, or a BaseException we must let propagate
42
+ self.error = exc
43
+ return True # contained: swallow the exception
@@ -0,0 +1,183 @@
1
+ """Optional observability facade — OpenTelemetry traces + Sentry errors + structured JSON logs.
2
+
3
+ Opt-in, OFF by default: nothing leaves the box unless the operator sets an env var. Construct one
4
+ :class:`Observe` per service with a service name + an env-var prefix; until a DSN/endpoint is set,
5
+ every call is a zero-cost no-op, so instrumentation can live permanently at the call sites. The
6
+ export SDKs are loaded via :func:`importlib.import_module` (a runtime call, so this module — and the
7
+ whole package — stays import-clean and dependency-free without them); the facade degrades to no-ops
8
+ when they are absent.
9
+
10
+ This owns the parts every service shares: traces, error reporting, JSON log formatting. A service
11
+ that also exports *metrics* builds its own meter, reusing identity via :meth:`Observe.resource`.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import importlib
17
+ import json
18
+ import logging
19
+ import os
20
+ from collections.abc import Iterator
21
+ from contextlib import contextmanager
22
+ from dataclasses import dataclass
23
+ from typing import Any
24
+
25
+
26
+ @dataclass(frozen=True)
27
+ class ObserveConfig:
28
+ """Per-service observability settings.
29
+
30
+ ``env_prefix`` namespaces the opt-in vars: ``<PREFIX>_SENTRY_DSN`` (falling back to the standard
31
+ ``SENTRY_DSN``) enables Sentry, ``<PREFIX>_LOG_JSON`` turns on JSON logs; OTLP traces follow the
32
+ standard ``OTEL_EXPORTER_OTLP_ENDPOINT``. ``pip_hint`` is shown when an export is requested but
33
+ the optional SDK extra is not installed. ``tracer_name`` names the tracer/logger.
34
+ """
35
+
36
+ service_name: str
37
+ env_prefix: str
38
+ pip_hint: str
39
+ tracer_name: str
40
+
41
+
42
+ class _JsonFormatter(logging.Formatter):
43
+ """One-line JSON per record (ts/level/logger/msg + service/command) so logs are parseable
44
+ and shippable — searchable across containers, not just grep on one box."""
45
+
46
+ def __init__(self, service: str, command: str | None) -> None:
47
+ super().__init__()
48
+ self._base: dict[str, str] = {"service": service}
49
+ if command:
50
+ self._base["command"] = command
51
+
52
+ def format(self, record: logging.LogRecord) -> str:
53
+ payload: dict[str, object] = {
54
+ "ts": self.formatTime(record),
55
+ "level": record.levelname,
56
+ "logger": record.name,
57
+ "msg": record.getMessage(),
58
+ **self._base,
59
+ }
60
+ if record.exc_info:
61
+ payload["exc"] = self.formatException(record.exc_info)
62
+ return json.dumps(payload)
63
+
64
+
65
+ class Observe:
66
+ """A per-service observability facade: traces (OTel), errors (Sentry), JSON logs.
67
+
68
+ Construct with an :class:`ObserveConfig`, call :meth:`setup` once at process start, then use
69
+ :meth:`span` / :meth:`capture_exception` freely — both no-op when export is off. The
70
+ ``build_sentry`` / ``build_tracer`` methods are the seams tests patch.
71
+ """
72
+
73
+ def __init__(self, config: ObserveConfig) -> None:
74
+ self._cfg = config
75
+ self._log = logging.getLogger(config.tracer_name)
76
+ self._sentry: Any = None
77
+ self._tracer: Any = None
78
+
79
+ def setup(self, *, command: str | None = None) -> None:
80
+ """Wire whichever exporters the environment opts into. Idempotent — resets first, so it can
81
+ be called once per process or re-run in tests."""
82
+ self.reset()
83
+ self._init_logging(command)
84
+ self._init_sentry()
85
+ self._init_tracer(command)
86
+
87
+ def reset(self) -> None:
88
+ """Deactivate every exporter (before a re-:meth:`setup` and in tests)."""
89
+ self._sentry = None
90
+ self._tracer = None
91
+
92
+ def is_active(self) -> dict[str, bool]:
93
+ """Which exporters are live — for a health check and asserted by tests."""
94
+ return {"sentry": self._sentry is not None, "otel": self._tracer is not None}
95
+
96
+ @contextmanager
97
+ def span(self, name: str, **attributes: object) -> Iterator[Any]:
98
+ """Trace the wrapped block as one OTel span (with ``attributes``). A no-op context yielding
99
+ ``None`` when tracing is off, so call sites need no guard."""
100
+ tracer = self._tracer
101
+ if tracer is None:
102
+ yield None
103
+ return
104
+ with tracer.start_as_current_span(name) as active:
105
+ for key, value in attributes.items():
106
+ active.set_attribute(key, value)
107
+ yield active
108
+
109
+ def capture_exception(self, exc: BaseException, **tags: object) -> None:
110
+ """Report a caught exception to Sentry with ``tags`` for grouping. No-op when off — the
111
+ caller still records its own failure normally."""
112
+ sentry = self._sentry
113
+ if sentry is None:
114
+ return
115
+ with sentry.new_scope() as scope:
116
+ for key, value in tags.items():
117
+ scope.set_tag(key, str(value))
118
+ sentry.capture_exception(exc)
119
+
120
+ def otlp_endpoint(self) -> str | None:
121
+ """The OTLP endpoint if set — a service building its own meter checks this."""
122
+ return os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT") or None
123
+
124
+ def resource(self, **extra: Any) -> Any:
125
+ """An OTel ``Resource`` carrying ``service.name`` + ``extra`` — so a service's own meter
126
+ shares this service's identity. Requires the opentelemetry SDK."""
127
+ resources = importlib.import_module("opentelemetry.sdk.resources")
128
+ return resources.Resource.create({"service.name": self._cfg.service_name, **extra})
129
+
130
+ def _init_logging(self, command: str | None) -> None:
131
+ if not os.environ.get(f"{self._cfg.env_prefix}_LOG_JSON"):
132
+ return
133
+ fmt = _JsonFormatter(self._cfg.service_name, command)
134
+ root = logging.getLogger()
135
+ if not root.handlers:
136
+ root.addHandler(logging.StreamHandler())
137
+ for handler in root.handlers:
138
+ handler.setFormatter(fmt)
139
+
140
+ def _init_sentry(self) -> None:
141
+ dsn = os.environ.get(f"{self._cfg.env_prefix}_SENTRY_DSN") or os.environ.get("SENTRY_DSN")
142
+ if not dsn:
143
+ return
144
+ try:
145
+ self._sentry = self.build_sentry(dsn)
146
+ except ImportError:
147
+ self._log.warning("SENTRY_DSN set but sentry-sdk is missing — %s", self._cfg.pip_hint)
148
+ return
149
+ self._log.info("Sentry error reporting enabled")
150
+
151
+ def _init_tracer(self, command: str | None) -> None:
152
+ endpoint = self.otlp_endpoint()
153
+ if not endpoint:
154
+ return
155
+ try:
156
+ self._tracer = self.build_tracer(command)
157
+ except ImportError:
158
+ self._log.warning(
159
+ "OTEL_EXPORTER_OTLP_ENDPOINT set but opentelemetry is missing — %s",
160
+ self._cfg.pip_hint,
161
+ )
162
+ return
163
+ self._log.info("OpenTelemetry export enabled -> %s", endpoint)
164
+
165
+ def build_sentry(self, dsn: str) -> Any:
166
+ """Initialise the real Sentry SDK (errors only; OTel owns traces). The dynamic import is the
167
+ seam tests patch + the ImportError surface when the extra is absent."""
168
+ sentry_sdk = importlib.import_module("sentry_sdk")
169
+ sentry_sdk.init(dsn=dsn, send_default_pii=False, traces_sample_rate=0.0)
170
+ return sentry_sdk
171
+
172
+ def build_tracer(self, command: str | None) -> Any:
173
+ """Register an OTLP-exporting tracer provider over this service's resource; return its
174
+ tracer. Endpoint/headers are read from the standard ``OTEL_EXPORTER_OTLP_*`` env."""
175
+ trace = importlib.import_module("opentelemetry.trace")
176
+ otlp = importlib.import_module("opentelemetry.exporter.otlp.proto.http.trace_exporter")
177
+ sdk_trace = importlib.import_module("opentelemetry.sdk.trace")
178
+ sdk_export = importlib.import_module("opentelemetry.sdk.trace.export")
179
+ extra = {f"{self._cfg.tracer_name}.command": command} if command else {}
180
+ provider = sdk_trace.TracerProvider(resource=self.resource(**extra))
181
+ provider.add_span_processor(sdk_export.BatchSpanProcessor(otlp.OTLPSpanExporter()))
182
+ trace.set_tracer_provider(provider)
183
+ return trace.get_tracer(self._cfg.tracer_name)
@@ -0,0 +1,86 @@
1
+ """Provenance — the reproducibility record stamped on a completed job.
2
+
3
+ Generic across apps (a tool invocation, an ML transcription, any dispatched compute): every field
4
+ that makes a run reproducible is mandatory unless genuinely inapplicable (no GPU on a CPU tool, no
5
+ seed on a deterministic one), and those exceptions are explicit ``None``, never silent omissions —
6
+ a job that cannot state its provenance cannot commit. Invalid provenance is unrepresentable: a
7
+ seed/determinism contradiction is rejected at construction.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import enum
13
+ from dataclasses import dataclass, field
14
+
15
+ from .routing import BackendKind, NodeIdentity
16
+
17
+ __all__ = ["Determinism", "GpuContext", "Provenance"]
18
+
19
+
20
+ class Determinism(enum.StrEnum):
21
+ """How reproducible a tool's output is — recorded so non-determinism is declared, not hidden."""
22
+
23
+ DETERMINISTIC = "deterministic"
24
+ """Same inputs always give bit-identical outputs."""
25
+ SEEDED = "seeded"
26
+ """Reproducible given the recorded seed."""
27
+ NONDETERMINISTIC = "nondeterministic"
28
+ """Output varies run-to-run (e.g. unseeded sampling); flagged for ensemble handling."""
29
+
30
+
31
+ @dataclass(frozen=True, slots=True)
32
+ class GpuContext:
33
+ """The GPU a job ran on; ``None`` at the Provenance level means a CPU-only job."""
34
+
35
+ model: str
36
+ vram_gb: float
37
+
38
+ def __post_init__(self) -> None:
39
+ if self.vram_gb <= 0:
40
+ raise ValueError(f"GPU vram_gb must be positive; got {self.vram_gb}")
41
+
42
+
43
+ @dataclass(frozen=True, slots=True)
44
+ class Provenance:
45
+ """The full reproducibility record stamped onto a completed job.
46
+
47
+ Hashes (weights sha256, params, input) and reference-data versions let a re-run be checked for
48
+ drift; ``determinism`` plus ``seed`` say whether an identical result is even expected. A
49
+ ``NONDETERMINISTIC`` job with a seed, or a ``SEEDED`` job without one, is a contradiction and is
50
+ rejected at construction — invalid provenance is unrepresentable.
51
+ """
52
+
53
+ tool_id: str
54
+ tool_version: str
55
+ weights_id: str | None
56
+ weights_sha256: str | None
57
+ params_hash: str
58
+ input_hash: str
59
+ reference_data_versions: dict[str, str]
60
+ container_digest: str | None
61
+ seed: int | None
62
+ determinism: Determinism
63
+ runtime_seconds: float
64
+ gpu: GpuContext | None = None
65
+ backend: BackendKind = BackendKind.LOCAL
66
+ """Where compute ran. Defaults to LOCAL: a job with no remote dispatch ran here."""
67
+ node: NodeIdentity | None = None
68
+ """The pinned identity of the node that ran the job; ``None`` only when the host is unknown."""
69
+ extra: dict[str, str] = field(default_factory=dict)
70
+
71
+ def __post_init__(self) -> None:
72
+ if not self.tool_id or not self.tool_version:
73
+ raise ValueError("provenance requires a tool_id and tool_version")
74
+ if not self.params_hash or not self.input_hash:
75
+ raise ValueError("provenance requires params_hash and input_hash")
76
+ if self.runtime_seconds < 0:
77
+ raise ValueError(f"runtime_seconds must be non-negative; got {self.runtime_seconds}")
78
+ self._validate_seed_determinism()
79
+
80
+ def _validate_seed_determinism(self) -> None:
81
+ if self.determinism is Determinism.SEEDED and self.seed is None:
82
+ raise ValueError("a SEEDED job must record its seed")
83
+ if self.determinism is Determinism.NONDETERMINISTIC and self.seed is not None:
84
+ raise ValueError(
85
+ "a NONDETERMINISTIC job must not carry a seed (a seed implies reproducibility)"
86
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dispatch-kit
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
5
5
  Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
6
6
  License: MIT
@@ -6,7 +6,11 @@ src/dispatch_kit/approval.py
6
6
  src/dispatch_kit/budget.py
7
7
  src/dispatch_kit/dispatch.py
8
8
  src/dispatch_kit/egress.py
9
+ src/dispatch_kit/engine.py
9
10
  src/dispatch_kit/estimate.py
11
+ src/dispatch_kit/faults.py
12
+ src/dispatch_kit/observe.py
13
+ src/dispatch_kit/provenance.py
10
14
  src/dispatch_kit/py.typed
11
15
  src/dispatch_kit/routing.py
12
16
  src/dispatch_kit.egg-info/PKG-INFO
@@ -18,5 +22,9 @@ tests/test_approval.py
18
22
  tests/test_budget.py
19
23
  tests/test_dispatch.py
20
24
  tests/test_egress.py
25
+ tests/test_engine.py
21
26
  tests/test_estimate.py
27
+ tests/test_faults.py
28
+ tests/test_observe.py
29
+ tests/test_provenance.py
22
30
  tests/test_routing.py
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from dispatch_kit import Lease, is_lease_stale, should_give_up
5
+ from dispatch_kit import Lease, RetriableError, is_lease_stale, should_give_up
6
6
 
7
7
 
8
8
  def test_lease_is_stale_only_past_the_ttl() -> None:
@@ -23,3 +23,13 @@ def test_lease_carries_its_attempt_count() -> None:
23
23
  assert fresh.attempts == 0
24
24
  recovered = Lease(job_id="j1", leased_at=200.0, attempts=1)
25
25
  assert recovered.attempts == 1
26
+
27
+
28
+ def test_retriable_error_marks_a_failure_for_retry() -> None:
29
+ # An executor raises RetriableError to opt a transient failure into re-queue (vs terminal);
30
+ # it is a plain Exception the engine catches like any other, routing it to fail(retriable=True).
31
+ assert issubclass(RetriableError, Exception)
32
+ try:
33
+ raise RetriableError("transient")
34
+ except RetriableError as exc:
35
+ assert str(exc) == "transient"
@@ -0,0 +1,103 @@
1
+ """The drain engine over a fake in-memory JobStore: complete, terminal/retriable fail, isolation.
2
+
3
+ The fake store implements just enough of the contract to drive the engine. Its ``fail(retriable)``
4
+ re-queues immediately (no backoff), so a retriable job reaches the poison cap within one ``drain``
5
+ pass — a real store adds a backoff so a transient failure isn't burned to poison instantly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Sequence
11
+ from typing import Any
12
+
13
+ from dispatch_kit import RetriableError, drain
14
+ from dispatch_kit.dispatch import should_give_up
15
+
16
+
17
+ class _FakeStore:
18
+ def __init__(self, jobs: dict[str, dict[str, Any]]) -> None:
19
+ self.jobs = jobs # id -> {payload, lane, status, attempts}
20
+ self.results: dict[str, Any] = {}
21
+ self.errors: dict[str, str] = {}
22
+
23
+ def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
24
+ for job_id, job in self.jobs.items():
25
+ if job["status"] == "pending" and job["lane"] in lanes:
26
+ job["status"] = "working"
27
+ return job_id, job["payload"]
28
+ return None
29
+
30
+ def complete(self, job_id: str, result: Any) -> bool:
31
+ job = self.jobs.get(job_id)
32
+ if job is None or job["status"] != "working":
33
+ return False
34
+ job["status"] = "done"
35
+ self.results[job_id] = result
36
+ return True
37
+
38
+ def fail(self, job_id: str, error: str, *, retriable: bool, max_attempts: int) -> bool:
39
+ job = self.jobs.get(job_id)
40
+ if job is None or job["status"] != "working":
41
+ return False
42
+ if retriable:
43
+ job["attempts"] += 1
44
+ if should_give_up(job["attempts"], max_attempts):
45
+ job["status"] = "failed"
46
+ self.errors[job_id] = error
47
+ else:
48
+ job["status"] = "pending" # re-queue
49
+ else:
50
+ job["status"] = "failed"
51
+ self.errors[job_id] = error
52
+ return True
53
+
54
+ def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
55
+ return []
56
+
57
+
58
+ def _job(payload: str, *, lane: str = "ml") -> dict[str, Any]:
59
+ return {"payload": payload, "lane": lane, "status": "pending", "attempts": 0}
60
+
61
+
62
+ def test_drain_runs_every_pending_job_and_completes_it() -> None:
63
+ store = _FakeStore({"j1": _job("a"), "j2": _job("b")})
64
+ handled = drain(store, str.upper, ["ml"], max_attempts=3)
65
+ assert handled == 2
66
+ assert store.results == {"j1": "A", "j2": "B"}
67
+ assert all(job["status"] == "done" for job in store.jobs.values())
68
+
69
+
70
+ def test_terminal_failure_is_recorded_and_draining_continues() -> None:
71
+ def execute(payload: str) -> str:
72
+ if payload == "bad":
73
+ raise ValueError("nope")
74
+ return payload
75
+
76
+ store = _FakeStore({"j1": _job("bad"), "j2": _job("ok")})
77
+ drain(store, execute, ["ml"], max_attempts=3)
78
+ assert store.jobs["j1"]["status"] == "failed"
79
+ assert "nope" in store.errors["j1"]
80
+ assert store.results == {"j2": "ok"} # the good job still ran despite the bad one
81
+
82
+
83
+ def test_retriable_failure_requeues_then_poisons_at_the_cap() -> None:
84
+ def execute(_payload: str) -> str:
85
+ raise RetriableError("transient")
86
+
87
+ store = _FakeStore({"j1": _job("x")})
88
+ drain(store, execute, ["ml"], max_attempts=2)
89
+ assert store.jobs["j1"]["status"] == "failed" # poisoned after the cap
90
+ assert store.jobs["j1"]["attempts"] == 2
91
+ assert "transient" in store.errors["j1"]
92
+
93
+
94
+ def test_drain_only_touches_its_lanes() -> None:
95
+ store = _FakeStore({"j1": _job("a", lane="ml"), "j2": _job("b", lane="sync")})
96
+ handled = drain(store, str.upper, ["ml"], max_attempts=3)
97
+ assert handled == 1
98
+ assert store.jobs["j1"]["status"] == "done"
99
+ assert store.jobs["j2"]["status"] == "pending" # the other lane is untouched
100
+
101
+
102
+ def test_drain_returns_zero_when_nothing_runnable() -> None:
103
+ assert drain(_FakeStore({}), str.upper, ["ml"], max_attempts=3) == 0
@@ -0,0 +1,26 @@
1
+ """Contained records an ordinary failure but lets a BaseException through."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from dispatch_kit import Contained
8
+
9
+
10
+ def test_contained_records_an_ordinary_exception() -> None:
11
+ with Contained() as box:
12
+ raise ValueError("boom")
13
+ assert isinstance(box.error, ValueError)
14
+ assert str(box.error) == "boom"
15
+
16
+
17
+ def test_contained_is_clean_when_nothing_raises() -> None:
18
+ with Contained() as box:
19
+ pass
20
+ assert box.error is None
21
+
22
+
23
+ def test_contained_lets_base_exceptions_propagate() -> None:
24
+ # KeyboardInterrupt / SystemExit must NOT be swallowed — shutdown + cancellation still work.
25
+ with pytest.raises(KeyboardInterrupt), Contained():
26
+ raise KeyboardInterrupt
@@ -0,0 +1,151 @@
1
+ """The observability facade is OFF by default and a safe no-op until an env var opts in.
2
+
3
+ The real SDKs are patched via the ``build_sentry`` / ``build_tracer`` seams, so these tests need
4
+ neither the export extras nor a network endpoint.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+
12
+ import pytest
13
+
14
+ from dispatch_kit.observe import Observe, ObserveConfig, _JsonFormatter
15
+
16
# Configuration handed to the ``Observe`` facade that the ``obs`` fixture builds.
_CFG = ObserveConfig(
    env_prefix="SVC",
    service_name="svc",
    tracer_name="svc",
    pip_hint="pip install 'svc[observe]'",
)
22
+
23
+
24
@pytest.fixture
def obs(monkeypatch: pytest.MonkeyPatch) -> Observe:
    """Return a new ``Observe`` built from ``_CFG`` after removing every opt-in env var."""
    opt_in_vars = (
        "SVC_SENTRY_DSN",
        "SENTRY_DSN",
        "OTEL_EXPORTER_OTLP_ENDPOINT",
        "SVC_LOG_JSON",
    )
    for name in opt_in_vars:
        # raising=False: a variable that is already absent is not an error.
        monkeypatch.delenv(name, raising=False)
    return Observe(_CFG)
30
+
31
+
32
+ class _FakeScope:
33
+ def __init__(self) -> None:
34
+ self.tags: dict[str, str] = {}
35
+
36
+ def __enter__(self) -> _FakeScope:
37
+ return self
38
+
39
+ def __exit__(self, *_exc: object) -> bool:
40
+ return False
41
+
42
+ def set_tag(self, key: str, value: str) -> None:
43
+ self.tags[key] = value
44
+
45
+
46
class _FakeSentry:
    """Test double that records each captured exception together with the current scope's tags."""

    def __init__(self) -> None:
        self.captured: list[tuple[BaseException, dict[str, str]]] = []
        self._scope = _FakeScope()

    def new_scope(self) -> _FakeScope:
        """Replace the current scope with a new, untagged one and return it."""
        fresh = _FakeScope()
        self._scope = fresh
        return fresh

    def capture_exception(self, exc: BaseException) -> None:
        """Append ``exc`` and a copy of the current scope's tags to ``captured``."""
        # Copy the tags so later changes to the scope do not alter what was recorded.
        snapshot = dict(self._scope.tags)
        self.captured.append((exc, snapshot))
57
+
58
+
59
+ class _FakeSpan:
60
+ def __init__(self, name: str) -> None:
61
+ self.name = name
62
+ self.attrs: dict[str, object] = {}
63
+
64
+ def __enter__(self) -> _FakeSpan:
65
+ return self
66
+
67
+ def __exit__(self, *_exc: object) -> bool:
68
+ return False
69
+
70
+ def set_attribute(self, key: str, value: object) -> None:
71
+ self.attrs[key] = value
72
+
73
+
74
class _FakeTracer:
    """Test double that hands out ``_FakeSpan`` objects and keeps them in creation order."""

    def __init__(self) -> None:
        self.spans: list[_FakeSpan] = []

    def start_as_current_span(self, name: str) -> _FakeSpan:
        """Create a span called ``name``, remember it in ``spans``, and return it."""
        created = _FakeSpan(name)
        self.spans.append(created)
        return created
82
+
83
+
84
def test_inactive_by_default_is_a_safe_noop(obs: Observe) -> None:
    """With the opt-in env vars cleared, both backends report inactive and calls stay harmless."""
    obs.setup()

    status = obs.is_active()
    assert status == {"sentry": False, "otel": False}

    with obs.span("op", tag="t") as active:
        assert active is None

    error = ValueError("x")
    obs.capture_exception(error, op="t")  # must not raise
90
+
91
+
92
def test_sentry_opt_in_forwards_exception_with_tags(
    obs: Observe, monkeypatch: pytest.MonkeyPatch
) -> None:
    """With ``SVC_SENTRY_DSN`` set, a captured exception reaches the fake with its keyword tags."""
    fake = _FakeSentry()

    def _build(_dsn: object) -> _FakeSentry:
        return fake

    monkeypatch.setenv("SVC_SENTRY_DSN", "https://k@example.test/1")
    monkeypatch.setattr(obs, "build_sentry", _build)
    obs.setup()
    assert obs.is_active()["sentry"] is True

    err = ValueError("boom")
    obs.capture_exception(err, op="x", id="r1")

    expected_tags = {"op": "x", "id": "r1"}
    assert fake.captured == [(err, expected_tags)]
103
+
104
+
105
def test_otel_opt_in_records_spans(obs: Observe, monkeypatch: pytest.MonkeyPatch) -> None:
    """With ``OTEL_EXPORTER_OTLP_ENDPOINT`` set, ``span`` opens a span carrying its keyword attrs."""
    tracer = _FakeTracer()

    def _build(_cmd: object) -> _FakeTracer:
        return tracer

    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")
    monkeypatch.setattr(obs, "build_tracer", _build)
    obs.setup(command="work")
    assert obs.is_active()["otel"] is True

    with obs.span("job", op="x") as active:
        assert active is not None

    first = tracer.spans[0]
    assert first.name == "job"
    assert first.attrs == {"op": "x"}
115
+
116
+
117
def test_missing_extra_logs_and_degrades(
    obs: Observe, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture
) -> None:
    """If building Sentry raises ``ImportError``, setup logs a hint and Sentry stays inactive."""

    def _no_sdk(_dsn: str) -> object:
        raise ImportError("no sentry_sdk")

    monkeypatch.setenv("SVC_SENTRY_DSN", "https://k@example.test/1")
    monkeypatch.setattr(obs, "build_sentry", _no_sdk)

    with caplog.at_level(logging.WARNING):
        obs.setup()

    assert obs.is_active()["sentry"] is False
    messages = [rec.message for rec in caplog.records]
    assert any("sentry-sdk" in text for text in messages)
129
+
130
+
131
def test_setup_is_idempotent_and_resets(obs: Observe, monkeypatch: pytest.MonkeyPatch) -> None:
    """Calling ``setup`` again after the DSN is removed switches Sentry back to inactive."""
    fake = _FakeSentry()

    def _build(_dsn: object) -> _FakeSentry:
        return fake

    monkeypatch.setenv("SVC_SENTRY_DSN", "https://k@example.test/1")
    monkeypatch.setattr(obs, "build_sentry", _build)

    # First run: the DSN is present, so Sentry becomes active.
    obs.setup()
    assert obs.is_active()["sentry"] is True

    # Second run: the DSN is gone, so the earlier activation must not linger.
    monkeypatch.delenv("SVC_SENTRY_DSN")
    obs.setup()
    assert obs.is_active()["sentry"] is False
140
+
141
+
142
def test_json_formatter_emits_structured_record() -> None:
    """The formatter's output parses as JSON with service, command, level, logger, msg and ts."""
    formatter = _JsonFormatter("svc", "work")
    record = logging.LogRecord("svc.x", logging.INFO, "f.py", 1, "hello %s", ("world",), None)

    payload = json.loads(formatter.format(record))

    expected = {
        "service": "svc",
        "command": "work",
        "level": "INFO",
        "logger": "svc.x",
        "msg": "hello world",
    }
    for key, value in expected.items():
        assert payload[key] == value
    assert "ts" in payload
@@ -0,0 +1,61 @@
1
+ """Provenance is a reproducibility record whose invalid states are unrepresentable."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from dispatch_kit import BackendKind, Determinism, GpuContext, Provenance
8
+
9
+
10
def _prov(
    *,
    tool_id: str = "tool",
    params_hash: str = "p",
    runtime_seconds: float = 1.0,
    determinism: Determinism = Determinism.DETERMINISTIC,
    seed: int | None = None,
) -> Provenance:
    """Build a ``Provenance`` from fixed filler values, overriding only the keyword fields given."""
    # Fields the tests never vary.
    fixed = {
        "tool_version": "1.0",
        "weights_id": None,
        "weights_sha256": None,
        "input_hash": "i",
        "reference_data_versions": {},
        "container_digest": None,
    }
    return Provenance(
        tool_id=tool_id,
        params_hash=params_hash,
        seed=seed,
        determinism=determinism,
        runtime_seconds=runtime_seconds,
        **fixed,
    )
31
+
32
+
33
def test_a_valid_deterministic_provenance_constructs() -> None:
    """The default ``_prov()`` record constructs with a local backend and no GPU."""
    record = _prov()
    # ran here unless a remote dispatch says otherwise
    assert record.backend is BackendKind.LOCAL
    # CPU-only by default
    assert record.gpu is None
37
+
38
+
39
def test_missing_tool_or_hashes_are_rejected() -> None:
    """An empty ``tool_id`` or ``params_hash`` raises ``ValueError`` naming that field."""
    for field in ("tool_id", "params_hash"):
        with pytest.raises(ValueError, match=field):
            _prov(**{field: ""})
44
+
45
+
46
def test_negative_runtime_is_rejected() -> None:
    """A negative ``runtime_seconds`` raises ``ValueError`` naming the field."""
    negative = -1.0
    with pytest.raises(ValueError, match="runtime_seconds"):
        _prov(runtime_seconds=negative)
49
+
50
+
51
def test_seed_determinism_contradictions_are_rejected() -> None:
    """SEEDED without a seed, and NONDETERMINISTIC with one, each raise ``ValueError``."""
    contradictions = (
        (Determinism.SEEDED, None, "must record its seed"),
        (Determinism.NONDETERMINISTIC, 42, "must not carry a seed"),
    )
    for determinism, seed, message in contradictions:
        with pytest.raises(ValueError, match=message):
            _prov(determinism=determinism, seed=seed)
56
+
57
+
58
def test_gpu_vram_must_be_positive() -> None:
    """A zero ``vram_gb`` is rejected, while 24.0 is accepted and stored unchanged."""
    with pytest.raises(ValueError, match="vram_gb"):
        GpuContext(model="L4", vram_gb=0.0)

    valid = GpuContext(model="L4", vram_gb=24.0)
    assert valid.vram_gb == 24.0
File without changes
File without changes