dispatch-kit 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/PKG-INFO +1 -1
  2. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/pyproject.toml +4 -2
  3. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/__init__.py +10 -0
  4. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/dispatch.py +29 -7
  5. dispatch_kit-0.2.0/src/dispatch_kit/engine.py +63 -0
  6. dispatch_kit-0.2.0/src/dispatch_kit/faults.py +43 -0
  7. dispatch_kit-0.2.0/src/dispatch_kit/provenance.py +86 -0
  8. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit.egg-info/PKG-INFO +1 -1
  9. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit.egg-info/SOURCES.txt +6 -0
  10. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/tests/test_dispatch.py +11 -1
  11. dispatch_kit-0.2.0/tests/test_engine.py +103 -0
  12. dispatch_kit-0.2.0/tests/test_faults.py +26 -0
  13. dispatch_kit-0.2.0/tests/test_provenance.py +61 -0
  14. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/README.md +0 -0
  15. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/setup.cfg +0 -0
  16. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/approval.py +0 -0
  17. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/budget.py +0 -0
  18. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/egress.py +0 -0
  19. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/estimate.py +0 -0
  20. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/py.typed +0 -0
  21. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit/routing.py +0 -0
  22. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit.egg-info/dependency_links.txt +0 -0
  23. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit.egg-info/requires.txt +0 -0
  24. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/src/dispatch_kit.egg-info/top_level.txt +0 -0
  25. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/tests/test_approval.py +0 -0
  26. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/tests/test_budget.py +0 -0
  27. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/tests/test_egress.py +0 -0
  28. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/tests/test_estimate.py +0 -0
  29. {dispatch_kit-0.1.0 → dispatch_kit-0.2.0}/tests/test_routing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dispatch-kit
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
5
5
  Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "dispatch-kit"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress."
9
9
  authors = [{ name = "Aryan Falahatpisheh", email = "aryanfalahat@gmail.com" }]
10
10
  readme = "README.md"
@@ -77,6 +77,8 @@ source-roots = ["src"]
77
77
  max-line-length = 100
78
78
 
79
79
  [tool.pylint.design]
80
- # Pure value-object domain: frozen data records with few/no methods are expected.
80
+ # Pure value-object domain: frozen data records with few/no methods — and, for a full
81
+ # reproducibility record (Provenance), many fields — are expected, not a design smell.
81
82
  min-public-methods = 0
82
83
  max-args = 6
84
+ max-attributes = 15
@@ -30,6 +30,7 @@ from .dispatch import (
30
30
  DispatchError,
31
31
  JobStore,
32
32
  Lease,
33
+ RetriableError,
33
34
  Transport,
34
35
  WorkerExecutor,
35
36
  is_lease_stale,
@@ -42,7 +43,10 @@ from .egress import (
42
43
  SecretRef,
43
44
  log_egress,
44
45
  )
46
+ from .engine import drain
45
47
  from .estimate import CostEstimate, HostCapabilities, vram_fits
48
+ from .faults import Contained
49
+ from .provenance import Determinism, GpuContext, Provenance
46
50
  from .routing import (
47
51
  BackendCapabilities,
48
52
  BackendKind,
@@ -62,16 +66,21 @@ __all__ = [
62
66
  "BudgetCap",
63
67
  "BudgetState",
64
68
  "BudgetWindow",
69
+ "Contained",
65
70
  "CostEstimate",
66
71
  "CostRates",
72
+ "Determinism",
67
73
  "DispatchError",
68
74
  "EnvLookup",
69
75
  "ExternalEndpoint",
76
+ "GpuContext",
70
77
  "HostCapabilities",
71
78
  "JobStore",
72
79
  "Lease",
73
80
  "NoEligibleBackendError",
74
81
  "NodeIdentity",
82
+ "Provenance",
83
+ "RetriableError",
75
84
  "Routable",
76
85
  "SecretMissingError",
77
86
  "SecretRef",
@@ -79,6 +88,7 @@ __all__ = [
79
88
  "Transport",
80
89
  "WorkerExecutor",
81
90
  "admits",
91
+ "drain",
82
92
  "estimate_cost",
83
93
  "is_lease_stale",
84
94
  "log_egress",
@@ -2,8 +2,9 @@
2
2
 
3
3
  Both a PULL worker (it polls: claim a job, run it, complete it) and a PUSH orchestrator (it posts a
4
4
  job to a worker and waits) need the SAME guarantees: a job is CLAIMED atomically so it runs exactly
5
- once, a stale result is REJECTED, and a worker that dies mid-job has its lease RECOVERED. This
6
- module is the shared CONTRACT the pure lease rules + the store/transport/worker protocols. The
5
+ once, a stale result is REJECTED, a clean failure is RECORDED (terminally, or retriably so it is
6
+ re-queued up to a cap), and a worker that dies mid-job has its lease RECOVERED. This module is the
7
+ shared CONTRACT — the pure lease rules + the store/transport/worker protocols. The
7
8
  store (SQLite vs SQLAlchemy), the transport (pull vs push), and the payload (a transcribe request vs
8
9
  a tool invocation) are per-app ADAPTERS, so two apps converge on one model WITHOUT a shared DB.
9
10
 
@@ -22,6 +23,15 @@ class DispatchError(RuntimeError):
22
23
  """A dispatch invariant was violated (a stale complete, or a job recovered past its cap)."""
23
24
 
24
25
 
26
class RetriableError(Exception):
    """Signal from an executor that a failure is TRANSIENT and worth another attempt.

    Raising this makes the engine re-queue the job (bumping the attempt count, then poisoning it
    past the cap) instead of failing it terminally. Use it for an infrastructure hiccup a retry
    can fix — a timeout, an OOM, a rate limit — NOT for a bad input or a logic error, where
    retrying only wastes attempts. Any OTHER exception from the executor is treated as terminal.
    """
33
+
34
+
25
35
  @dataclass(frozen=True, slots=True)
26
36
  class Lease:
27
37
  """A claim on a job: when it was leased and how many times it has been recovered.
@@ -56,15 +66,17 @@ def should_give_up(attempts: int, max_attempts: int) -> bool:
56
66
 
57
67
 
58
68
  class JobStore(Protocol):
59
- """The authoritative job store — the ONE place a job's claim/complete is decided, ATOMICALLY.
69
+ """The authoritative job store — the ONE place claim/complete/fail is decided, ATOMICALLY.
60
70
 
61
71
  ``claim`` must be atomic (one transaction / compare-and-set): two concurrent workers can never
62
72
  both claim the same job — that single claim is the run-exactly-once guarantee. ``complete`` must
63
73
  REJECT a result for a job not currently leased/running (a stale resubmit after the lease was
64
- recovered), returning ``False`` so a re-run cannot clobber a fresh result. ``recover_stale``
65
- re-leases jobs whose lease outlived the TTL (see :func:`is_lease_stale`), bumping the attempt
66
- count and failing a job past ``max_attempts`` (see :func:`should_give_up`). The payload type is
67
- the app's own (a transcribe request, a tool invocation, ...).
74
+ recovered), returning ``False`` so a re-run cannot clobber a fresh result. ``fail`` records a
75
+ clean failure: terminal (a bad input), or retriable re-queued for another attempt, bumping
76
+ attempts and poisoned past the cap (like a recovered crash). ``recover_stale`` re-leases jobs
77
+ whose lease outlived the TTL (see :func:`is_lease_stale`), bumping the attempt count and
78
+ failing a job past ``max_attempts`` (see :func:`should_give_up`). The payload type is the
79
+ app's own (a transcribe request, a tool invocation, ...).
68
80
  """
69
81
 
70
82
  def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
@@ -73,6 +85,16 @@ class JobStore(Protocol):
73
85
  def complete(self, job_id: str, result: Any) -> bool:
74
86
  """Apply a result IFF the job is leased/running; ``False`` if stale (done/recovered)."""
75
87
 
88
+ def fail(self, job_id: str, error: str, *, retriable: bool, max_attempts: int) -> bool:
89
+ """Record a clean failure of a leased job; ``False`` if it was not leased/running.
90
+
91
+ ``retriable=False`` fails it terminally now — a bad input or logic error a retry cannot
92
+ fix. ``retriable=True`` returns it to the queue for another attempt, bumping the attempt
93
+ count; once it has been attempted ``max_attempts`` times it is failed terminally instead
94
+ (the poison rule of :func:`should_give_up`), so a persistently-transient job can never
95
+ loop forever.
96
+ """
97
+
76
98
  def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
77
99
  """Re-lease jobs whose lease is stale (return their ids); fail those past the cap."""
78
100
 
@@ -0,0 +1,63 @@
1
+ """The run-exactly-once engine — drain a :class:`~dispatch_kit.dispatch.JobStore`.
2
+
3
+ Pure coordination over the dispatch ports: :func:`drain` owns the claim -> run -> record skeleton
4
+ AND the failure classification, so every app gets robust draining without re-implementing it. The
5
+ JobStore (persistence + atomic claim + retry timing) and the ``execute`` callable (the actual work)
6
+ are the app's adapters — pull a transcribe request, run a tool invocation, whatever; the engine only
7
+ coordinates.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from collections.abc import Callable, Sequence
13
+ from typing import Any
14
+
15
+ from .dispatch import JobStore, RetriableError
16
+ from .faults import Contained
17
+
18
+
19
def drain(
    store: JobStore,
    execute: Callable[[Any], Any],
    lanes: Sequence[str],
    *,
    max_attempts: int,
) -> int:
    """Claim and run every currently-runnable job in ``lanes``; return how many were handled.

    Each job is isolated (:class:`~dispatch_kit.faults.Contained`): a failure is recorded —
    terminal, or retriable if ``execute`` raised :class:`RetriableError` — and the drain CONTINUES,
    so one bad job never stalls the queue. Loops until ``claim`` returns ``None``, so call it on a
    poll interval. Whether a retriable re-queue is retried this pass or later (a backoff) is the
    store's policy, not the engine's.

    ``max_attempts`` is passed through unchanged to ``store.fail``; the engine itself never
    counts attempts. The returned count includes failed jobs as well as completed ones.
    """
    handled = 0
    # ``claim`` hands back ``(job_id, payload)`` or ``None`` once nothing is runnable.
    while (claimed := store.claim(lanes)) is not None:
        job_id, payload = claimed
        _record(store, execute, job_id, payload, max_attempts=max_attempts)
        handled += 1
    return handled
40
+
41
+
42
def _record(
    store: JobStore,
    execute: Callable[[Any], Any],
    job_id: str,
    payload: Any,
    *,
    max_attempts: int,
) -> None:
    """Run one claimed job and record its outcome: complete, or fail (terminal / retriable).

    ``execute(payload)`` runs inside :class:`Contained`, so an ordinary ``Exception`` is captured
    rather than raised; a ``BaseException`` such as ``KeyboardInterrupt`` still propagates. A
    captured :class:`RetriableError` is recorded with ``retriable=True``; anything else is
    recorded as terminal.
    """
    result: Any = None
    with Contained() as box:
        result = execute(payload)
    error = box.error
    if error is None:
        # The boolean returned by ``complete`` (``False`` for a stale lease) is not inspected.
        store.complete(job_id, result)
        return
    store.fail(
        job_id,
        # A message-less exception (``ValueError()``) stringifies to "", which would record a
        # blank error; fall back to its repr so the failure stays identifiable.
        str(error) or repr(error),
        retriable=isinstance(error, RetriableError),
        max_attempts=max_attempts,
    )
@@ -0,0 +1,43 @@
1
+ """Fault isolation — contain a unit of work's failure instead of letting it kill the loop.
2
+
3
+ A drain loop / worker / handler must not die because one job raised. :class:`Contained` records an
4
+ ordinary exception via the context-manager protocol, so there is exactly ONE audited place where
5
+ arbitrary failures stop (no scattered ``except Exception``). A ``BaseException`` that is not an
6
+ ``Exception`` (``KeyboardInterrupt`` / ``SystemExit``) still propagates, so shutdown works.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from types import TracebackType
12
+
13
+
14
class Contained:
    """Context manager that records, rather than raises, an ordinary exception.

    Read ``.error`` after the block to handle what happened::

        with Contained() as box:
            result = risky()
        if box.error is not None:
            ...  # handle the failure (``result`` is unset)

    It contains failures through ``__exit__`` (returning ``True`` swallows) rather than
    ``except Exception``, so there is exactly one audited boundary where arbitrary failures stop.
    A ``BaseException`` that is not an ``Exception`` (``KeyboardInterrupt`` / ``SystemExit``) is
    never recorded and always propagates.
    """

    def __init__(self) -> None:
        self.error: Exception | None = None

    def __enter__(self) -> Contained:
        # Clear any failure left from an earlier use, so a reused instance never reports a
        # stale error after a block that succeeded.
        self.error = None
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        traceback: TracebackType | None,
    ) -> bool:
        if exc is None or not isinstance(exc, Exception):
            return False  # clean exit, or a BaseException we must let propagate
        self.error = exc
        return True  # contained: swallow the exception
@@ -0,0 +1,86 @@
1
+ """Provenance — the reproducibility record stamped on a completed job.
2
+
3
+ Generic across apps (a tool invocation, an ML transcription, any dispatched compute): every field
4
+ that makes a run reproducible is mandatory unless genuinely inapplicable (no GPU on a CPU tool, no
5
+ seed on a deterministic one), and those exceptions are explicit ``None``, never silent omissions —
6
+ a job that cannot state its provenance cannot commit. Invalid provenance is unrepresentable: a
7
+ seed/determinism contradiction is rejected at construction.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import enum
13
+ from dataclasses import dataclass, field
14
+
15
+ from .routing import BackendKind, NodeIdentity
16
+
17
+ __all__ = ["Determinism", "GpuContext", "Provenance"]
18
+
19
+
20
+ class Determinism(enum.StrEnum):
21
+ """How reproducible a tool's output is — recorded so non-determinism is declared, not hidden."""
22
+
23
+ DETERMINISTIC = "deterministic"
24
+ """Same inputs always give bit-identical outputs."""
25
+ SEEDED = "seeded"
26
+ """Reproducible given the recorded seed."""
27
+ NONDETERMINISTIC = "nondeterministic"
28
+ """Output varies run-to-run (e.g. unseeded sampling); flagged for ensemble handling."""
29
+
30
+
31
+ @dataclass(frozen=True, slots=True)
32
+ class GpuContext:
33
+ """The GPU a job ran on; ``None`` at the Provenance level means a CPU-only job."""
34
+
35
+ model: str
36
+ vram_gb: float
37
+
38
+ def __post_init__(self) -> None:
39
+ if self.vram_gb <= 0:
40
+ raise ValueError(f"GPU vram_gb must be positive; got {self.vram_gb}")
41
+
42
+
43
@dataclass(frozen=True, slots=True)
class Provenance:
    """The full reproducibility record stamped onto a completed job.

    Hashes (weights sha256, params, input) and reference-data versions let a re-run be checked for
    drift; ``determinism`` plus ``seed`` say whether an identical result is even expected. A
    ``NONDETERMINISTIC`` job with a seed, or a ``SEEDED`` job without one, is a contradiction and is
    rejected at construction — invalid provenance is unrepresentable.

    Construction raises ``ValueError`` when ``tool_id``, ``tool_version``, ``params_hash`` or
    ``input_hash`` is empty, when ``runtime_seconds`` is negative or NaN, or on a
    seed/determinism contradiction.
    """

    # NOTE: ``frozen=True`` only blocks rebinding attributes; the two dict fields stay mutable,
    # and because dicts are unhashable, calling ``hash()`` on an instance raises ``TypeError``.
    tool_id: str
    tool_version: str
    weights_id: str | None
    weights_sha256: str | None
    params_hash: str
    input_hash: str
    reference_data_versions: dict[str, str]
    container_digest: str | None
    seed: int | None
    determinism: Determinism
    runtime_seconds: float
    gpu: GpuContext | None = None
    backend: BackendKind = BackendKind.LOCAL
    """Where compute ran. Defaults to LOCAL: a job with no remote dispatch ran here."""
    node: NodeIdentity | None = None
    """The pinned identity of the node that ran the job; ``None`` only when the host is unknown."""
    extra: dict[str, str] = field(default_factory=dict)

    def __post_init__(self) -> None:
        if not self.tool_id or not self.tool_version:
            raise ValueError("provenance requires a tool_id and tool_version")
        if not self.params_hash or not self.input_hash:
            raise ValueError("provenance requires params_hash and input_hash")
        # ``not x >= 0`` rather than ``x < 0``: NaN compares false both ways, so the original
        # form accepted a NaN runtime.
        if not self.runtime_seconds >= 0:
            raise ValueError(f"runtime_seconds must be non-negative; got {self.runtime_seconds}")
        self._validate_seed_determinism()

    def _validate_seed_determinism(self) -> None:
        """Reject the two seed/determinism contradictions.

        A ``DETERMINISTIC`` job is accepted with or without a seed.
        """
        if self.determinism is Determinism.SEEDED and self.seed is None:
            raise ValueError("a SEEDED job must record its seed")
        if self.determinism is Determinism.NONDETERMINISTIC and self.seed is not None:
            raise ValueError(
                "a NONDETERMINISTIC job must not carry a seed (a seed implies reproducibility)"
            )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dispatch-kit
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Pure, fail-closed cost-gating for expensive remote/external work: a hard $ budget cap, backend routing (local->cloud->SDK), and opt-in audited API egress.
5
5
  Author-email: Aryan Falahatpisheh <aryanfalahat@gmail.com>
6
6
  License: MIT
@@ -6,7 +6,10 @@ src/dispatch_kit/approval.py
6
6
  src/dispatch_kit/budget.py
7
7
  src/dispatch_kit/dispatch.py
8
8
  src/dispatch_kit/egress.py
9
+ src/dispatch_kit/engine.py
9
10
  src/dispatch_kit/estimate.py
11
+ src/dispatch_kit/faults.py
12
+ src/dispatch_kit/provenance.py
10
13
  src/dispatch_kit/py.typed
11
14
  src/dispatch_kit/routing.py
12
15
  src/dispatch_kit.egg-info/PKG-INFO
@@ -18,5 +21,8 @@ tests/test_approval.py
18
21
  tests/test_budget.py
19
22
  tests/test_dispatch.py
20
23
  tests/test_egress.py
24
+ tests/test_engine.py
21
25
  tests/test_estimate.py
26
+ tests/test_faults.py
27
+ tests/test_provenance.py
22
28
  tests/test_routing.py
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from dispatch_kit import Lease, is_lease_stale, should_give_up
5
+ from dispatch_kit import Lease, RetriableError, is_lease_stale, should_give_up
6
6
 
7
7
 
8
8
  def test_lease_is_stale_only_past_the_ttl() -> None:
@@ -23,3 +23,13 @@ def test_lease_carries_its_attempt_count() -> None:
23
23
  assert fresh.attempts == 0
24
24
  recovered = Lease(job_id="j1", leased_at=200.0, attempts=1)
25
25
  assert recovered.attempts == 1
26
+
27
+
28
def test_retriable_error_marks_a_failure_for_retry() -> None:
    """RetriableError is an ordinary Exception subclass that keeps its message."""
    # An executor raises RetriableError to opt a transient failure into re-queue (vs terminal);
    # it is a plain Exception the engine catches like any other, routing it to fail(retriable=True).
    assert issubclass(RetriableError, Exception)
    try:
        raise RetriableError("transient")
    except RetriableError as exc:
        assert str(exc) == "transient"
@@ -0,0 +1,103 @@
1
+ """The drain engine over a fake in-memory JobStore: complete, terminal/retriable fail, isolation.
2
+
3
+ The fake store implements just enough of the contract to drive the engine. Its ``fail(retriable)``
4
+ re-queues immediately (no backoff), so a retriable job reaches the poison cap within one ``drain``
5
+ pass — a real store adds a backoff so a transient failure isn't burned to poison instantly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Sequence
11
+ from typing import Any
12
+
13
+ from dispatch_kit import RetriableError, drain
14
+ from dispatch_kit.dispatch import should_give_up
15
+
16
+
17
class _FakeStore:
    """In-memory stand-in for a JobStore, with just enough behaviour to drive ``drain``.

    A retriable failure is re-queued immediately (no backoff), so a persistently failing job
    reaches the poison cap within a single ``drain`` pass.
    """

    def __init__(self, jobs: dict[str, dict[str, Any]]) -> None:
        self.jobs = jobs  # id -> {payload, lane, status, attempts}
        self.results: dict[str, Any] = {}
        self.errors: dict[str, str] = {}

    def claim(self, lanes: Sequence[str]) -> tuple[str, Any] | None:
        """Mark the first pending job in one of ``lanes`` as working and return it."""
        for job_id, job in self.jobs.items():
            if job["status"] == "pending" and job["lane"] in lanes:
                job["status"] = "working"
                return job_id, job["payload"]
        return None

    def complete(self, job_id: str, result: Any) -> bool:
        """Store ``result`` if the job is working; ``False`` for an unknown or non-working job."""
        job = self.jobs.get(job_id)
        if job is None or job["status"] != "working":
            return False
        job["status"] = "done"
        self.results[job_id] = result
        return True

    def fail(self, job_id: str, error: str, *, retriable: bool, max_attempts: int) -> bool:
        """Fail a working job terminally, or re-queue it while it is still under the cap."""
        job = self.jobs.get(job_id)
        if job is None or job["status"] != "working":
            return False
        if retriable:
            job["attempts"] += 1
            if should_give_up(job["attempts"], max_attempts):
                job["status"] = "failed"
                self.errors[job_id] = error
            else:
                job["status"] = "pending"  # re-queue
        else:
            job["status"] = "failed"
            self.errors[job_id] = error
        return True

    def recover_stale(self, *, now: float, ttl_seconds: float, max_attempts: int) -> Sequence[str]:
        """Nothing ever goes stale in the fake store."""
        return []
56
+
57
+
58
def _job(payload: str, *, lane: str = "ml") -> dict[str, Any]:
    """Build a fresh pending job record with zero attempts."""
    return {"payload": payload, "lane": lane, "status": "pending", "attempts": 0}
60
+
61
+
62
def test_drain_runs_every_pending_job_and_completes_it() -> None:
    """Every pending job in the lane is executed and its result stored."""
    store = _FakeStore({"j1": _job("a"), "j2": _job("b")})
    handled = drain(store, str.upper, ["ml"], max_attempts=3)
    assert handled == 2
    assert store.results == {"j1": "A", "j2": "B"}
    assert all(job["status"] == "done" for job in store.jobs.values())
68
+
69
+
70
def test_terminal_failure_is_recorded_and_draining_continues() -> None:
    """A non-retriable exception fails its job without stopping the drain."""

    def execute(payload: str) -> str:
        if payload == "bad":
            raise ValueError("nope")
        return payload

    store = _FakeStore({"j1": _job("bad"), "j2": _job("ok")})
    drain(store, execute, ["ml"], max_attempts=3)
    assert store.jobs["j1"]["status"] == "failed"
    assert "nope" in store.errors["j1"]
    assert store.results == {"j2": "ok"}  # the good job still ran despite the bad one
81
+
82
+
83
def test_retriable_failure_requeues_then_poisons_at_the_cap() -> None:
    """A job that always raises RetriableError is retried up to the cap, then failed."""

    def execute(_payload: str) -> str:
        raise RetriableError("transient")

    store = _FakeStore({"j1": _job("x")})
    drain(store, execute, ["ml"], max_attempts=2)
    assert store.jobs["j1"]["status"] == "failed"  # poisoned after the cap
    assert store.jobs["j1"]["attempts"] == 2
    assert "transient" in store.errors["j1"]
92
+
93
+
94
def test_drain_only_touches_its_lanes() -> None:
    """Jobs outside the requested lanes are neither claimed nor counted."""
    store = _FakeStore({"j1": _job("a", lane="ml"), "j2": _job("b", lane="sync")})
    handled = drain(store, str.upper, ["ml"], max_attempts=3)
    assert handled == 1
    assert store.jobs["j1"]["status"] == "done"
    assert store.jobs["j2"]["status"] == "pending"  # the other lane is untouched
100
+
101
+
102
def test_drain_returns_zero_when_nothing_runnable() -> None:
    """An empty store yields a handled count of zero."""
    assert drain(_FakeStore({}), str.upper, ["ml"], max_attempts=3) == 0
@@ -0,0 +1,26 @@
1
+ """Contained records an ordinary failure but lets a BaseException through."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from dispatch_kit import Contained
8
+
9
+
10
def test_contained_records_an_ordinary_exception() -> None:
    """An Exception raised inside the block is swallowed and exposed on ``.error``."""
    with Contained() as box:
        raise ValueError("boom")
    assert isinstance(box.error, ValueError)
    assert str(box.error) == "boom"
15
+
16
+
17
def test_contained_is_clean_when_nothing_raises() -> None:
    """A block that finishes normally leaves ``.error`` as ``None``."""
    with Contained() as box:
        pass
    assert box.error is None
21
+
22
+
23
def test_contained_lets_base_exceptions_propagate() -> None:
    """A non-``Exception`` BaseException escapes the Contained block."""
    # KeyboardInterrupt / SystemExit must NOT be swallowed — shutdown + cancellation still work.
    with pytest.raises(KeyboardInterrupt), Contained():
        raise KeyboardInterrupt
@@ -0,0 +1,61 @@
1
+ """Provenance is a reproducibility record whose invalid states are unrepresentable."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+ from dispatch_kit import BackendKind, Determinism, GpuContext, Provenance
8
+
9
+
10
def _prov(
    *,
    tool_id: str = "tool",
    params_hash: str = "p",
    runtime_seconds: float = 1.0,
    determinism: Determinism = Determinism.DETERMINISTIC,
    seed: int | None = None,
) -> Provenance:
    """Build a valid CPU-only Provenance; each keyword overrides one field under test."""
    return Provenance(
        tool_id=tool_id,
        tool_version="1.0",
        weights_id=None,
        weights_sha256=None,
        params_hash=params_hash,
        input_hash="i",
        reference_data_versions={},
        container_digest=None,
        seed=seed,
        determinism=determinism,
        runtime_seconds=runtime_seconds,
    )
31
+
32
+
33
def test_a_valid_deterministic_provenance_constructs() -> None:
    """The default record constructs and carries the documented defaults."""
    prov = _prov()
    assert prov.backend is BackendKind.LOCAL  # ran here unless a remote dispatch says otherwise
    assert prov.gpu is None  # CPU-only by default
37
+
38
+
39
def test_missing_tool_or_hashes_are_rejected() -> None:
    """An empty tool_id or params_hash is refused at construction."""
    with pytest.raises(ValueError, match="tool_id"):
        _prov(tool_id="")
    with pytest.raises(ValueError, match="params_hash"):
        _prov(params_hash="")
44
+
45
+
46
def test_negative_runtime_is_rejected() -> None:
    """A negative runtime_seconds is refused at construction."""
    with pytest.raises(ValueError, match="runtime_seconds"):
        _prov(runtime_seconds=-1.0)
49
+
50
+
51
def test_seed_determinism_contradictions_are_rejected() -> None:
    """SEEDED without a seed, and NONDETERMINISTIC with one, both raise."""
    with pytest.raises(ValueError, match="must record its seed"):
        _prov(determinism=Determinism.SEEDED, seed=None)
    with pytest.raises(ValueError, match="must not carry a seed"):
        _prov(determinism=Determinism.NONDETERMINISTIC, seed=42)
56
+
57
+
58
def test_gpu_vram_must_be_positive() -> None:
    """Zero VRAM is refused; a positive value round-trips."""
    with pytest.raises(ValueError, match="vram_gb"):
        GpuContext(model="L4", vram_gb=0.0)
    assert GpuContext(model="L4", vram_gb=24.0).vram_gb == 24.0
File without changes
File without changes