seedloop 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seedloop/__init__.py +41 -0
- seedloop/_audit.py +107 -0
- seedloop/_entropy.py +96 -0
- seedloop/_loop.py +165 -0
- seedloop/_net.py +190 -0
- seedloop/_run.py +88 -0
- seedloop/_trace.py +29 -0
- seedloop/_world.py +112 -0
- seedloop/demos/__init__.py +1 -0
- seedloop/demos/raft.py +181 -0
- seedloop/errors.py +57 -0
- seedloop/py.typed +0 -0
- seedloop-0.3.0.dist-info/METADATA +180 -0
- seedloop-0.3.0.dist-info/RECORD +17 -0
- seedloop-0.3.0.dist-info/WHEEL +5 -0
- seedloop-0.3.0.dist-info/licenses/LICENSE +21 -0
- seedloop-0.3.0.dist-info/top_level.txt +1 -0
seedloop/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""seedloop — deterministic simulation testing for Python asyncio.
|
|
2
|
+
|
|
3
|
+
Write a scenario against a :class:`World`, then ``check`` it across many seeds; a failing seed is
|
|
4
|
+
the reproduction — ``replay`` it to debug. The deterministic core (loop, virtual clock, seeded
|
|
5
|
+
entropy), the simulated network with fault injection (loss, duplication, partitions), the invariant
|
|
6
|
+
API, and the non-determinism auditor are in place; a worked Raft demo ships in ``seedloop.demos``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from seedloop._audit import audit_mode
|
|
10
|
+
from seedloop._entropy import ensure_hash_seed
|
|
11
|
+
from seedloop._net import Address, Endpoint, Message, Transport
|
|
12
|
+
from seedloop._run import CheckResult, Scenario, check, replay
|
|
13
|
+
from seedloop._world import Node, World
|
|
14
|
+
from seedloop.errors import (
|
|
15
|
+
BoundaryError,
|
|
16
|
+
DeadlockError,
|
|
17
|
+
EntropyLeakError,
|
|
18
|
+
InvariantError,
|
|
19
|
+
SeedloopError,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"Address",
|
|
24
|
+
"BoundaryError",
|
|
25
|
+
"CheckResult",
|
|
26
|
+
"DeadlockError",
|
|
27
|
+
"Endpoint",
|
|
28
|
+
"EntropyLeakError",
|
|
29
|
+
"InvariantError",
|
|
30
|
+
"Message",
|
|
31
|
+
"Node",
|
|
32
|
+
"Scenario",
|
|
33
|
+
"SeedloopError",
|
|
34
|
+
"Transport",
|
|
35
|
+
"World",
|
|
36
|
+
"audit_mode",
|
|
37
|
+
"check",
|
|
38
|
+
"ensure_hash_seed",
|
|
39
|
+
"replay",
|
|
40
|
+
]
|
|
41
|
+
__version__ = "0.3.0"
|
seedloop/_audit.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""The non-determinism auditor: runtime tripwires for uncontrolled entropy (ADR-0008).
|
|
2
|
+
|
|
3
|
+
A run is a pure function of its seed only if every entropy source it touches is the World's seeded
|
|
4
|
+
one. The loop already rejects the I/O boundary (``run_in_executor``, real sockets, DNS) in every
|
|
5
|
+
mode. This adds an opt-in *audit mode* that closes the Python-level entropy sources the loop does
|
|
6
|
+
not see: real wall-clock time, the unseeded global ``random``, ``os.urandom``/``secrets``, and a
|
|
7
|
+
bare ``threading.Thread``. In audit mode each raises instead of running, so a leak is a loud,
|
|
8
|
+
reproducible failure on the seed that hit it — the boundary enforced, not just stated (scope.md).
|
|
9
|
+
|
|
10
|
+
The tripwires patch only module-level entry points, never ``random.Random`` itself, so the World's
|
|
11
|
+
seeded ``rng`` keeps working; they are pure raises that touch no entropy and leave a clean run's
|
|
12
|
+
timeline unchanged; and they are restored on exit even on error.
|
|
13
|
+
|
|
14
|
+
Like any monkeypatch (and the CSPRNG shim), a tripwire catches a call that looks the name up at call
|
|
15
|
+
time — ``time.monotonic()``, ``random.random()`` — but not a reference bound *before* audit started
|
|
16
|
+
(``from time import monotonic`` then ``monotonic()``). The common attribute-call form is caught; the
|
|
17
|
+
same C-level caveat as ``scope.md`` applies below Python.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import os
|
|
23
|
+
import random
|
|
24
|
+
import threading
|
|
25
|
+
import time
|
|
26
|
+
from collections.abc import Callable, Iterator
|
|
27
|
+
from contextlib import contextmanager
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
from seedloop.errors import BoundaryError, EntropyLeakError
|
|
31
|
+
|
|
32
|
+
# Real-time entry points (the loop owns virtual time via loop.time(), so any direct call is a leak).
|
|
33
|
+
_REAL_TIME = (
    "time",
    "monotonic",
    "perf_counter",
    "time_ns",
    "monotonic_ns",
    "perf_counter_ns",
)

# The entropy-drawing functions exposed at module level by `random` (all of which draw from the
# global, unseeded instance). Listing the whole set rather than a popular subset means a leak
# through a less common one (e.g. expovariate for latency jitter) still trips. `random.Random`
# instances — including the World's seeded rng — are not affected by patching these names.
_RANDOM_FUNCS = (
    "random", "uniform", "triangular",
    "randint", "randrange",
    "choice", "choices", "shuffle", "sample",
    "getrandbits", "randbytes",
    "betavariate", "expovariate", "gammavariate", "gauss",
    "lognormvariate", "normalvariate", "vonmisesvariate",
    "paretovariate", "weibullvariate",
    "binomialvariate",  # 3.12+
)

# Tripwire table: one (module, attribute, display name) entry per patched entry point, in patch
# order. The `time` and `random` names are filtered with hasattr so a name missing on a given
# interpreter is skipped instead of crashing the patcher. os.urandom and the random._urandom
# alias that secrets draws through are listed unconditionally.
_ENTROPY_SURFACES: list[tuple[Any, str, str]] = (
    [(time, name, f"time.{name}") for name in _REAL_TIME if hasattr(time, name)]
    + [
        (os, "urandom", "os.urandom"),
        (random, "_urandom", "secrets/os.urandom"),
    ]
    + [(random, name, f"random.{name}") for name in _RANDOM_FUNCS if hasattr(random, name)]
)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _entropy_tripwire(source: str) -> Callable[..., Any]:
    """Build a stand-in callable that raises ``EntropyLeakError(source)`` whenever it is called.

    The returned function accepts and ignores any arguments, so it can replace an entry point of
    any signature. ``source`` is the display name passed to the error.
    """

    def _tripped(*_ignored: Any, **_ignored_kw: Any) -> Any:
        # Never runs the real function: the call itself is the failure.
        raise EntropyLeakError(source)

    return _tripped
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _thread_tripwire(*_args: Any, **_kwargs: Any) -> Any:
    """Raise ``BoundaryError`` unconditionally; accepts and ignores any arguments."""
    reason = (
        "threading.Thread (a real thread) cannot be made deterministic and is out of scope in a "
        "simulated run (see docs/scope.md)"
    )
    raise BoundaryError(reason)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@contextmanager
def audit_mode() -> Iterator[None]:
    """Trip on uncontrolled entropy for the duration of the context.

    Inside the context, real time, the unseeded global ``random``, ``os.urandom``/``secrets``, and
    ``threading.Thread.start`` raise (``EntropyLeakError`` for entropy, ``BoundaryError`` for the
    thread) instead of running. The World's seeded ``rng`` and virtual clock are unaffected. Use it
    via ``check(..., audit=True)`` / ``replay(..., audit=True)``, or directly to wrap your own run.
    All patches are restored on exit, even on error — including an error raised while the
    tripwires are still being installed.
    """
    # Snapshot every original before touching anything, so the restore list is complete even if
    # installation is interrupted part-way.
    saved = [(mod, attr, getattr(mod, attr)) for mod, attr, _ in _ENTROPY_SURFACES]
    saved_thread_start = threading.Thread.start
    try:
        # Installation happens inside the try: an exception here (e.g. KeyboardInterrupt between
        # two setattr calls) must not leave the process with half the tripwires in place.
        for mod, attr, name in _ENTROPY_SURFACES:
            setattr(mod, attr, _entropy_tripwire(name))
        threading.Thread.start = _thread_tripwire  # type: ignore[method-assign]
        yield
    finally:
        # Restoring an attribute that was never patched just rebinds its original; harmless.
        for mod, attr, original in saved:
            setattr(mod, attr, original)
        threading.Thread.start = saved_thread_start  # type: ignore[method-assign]
|
seedloop/_entropy.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Seeded entropy: per-component sub-streams, a CSPRNG shim, and a hash-seed launcher.
|
|
2
|
+
|
|
3
|
+
A run is a pure function of its seed, so every source of randomness must derive from it. The root
|
|
4
|
+
seed is split into independent named sub-streams (ADR-0009) so adding a draw in one component does
|
|
5
|
+
not perturb another's sequence. The CSPRNG shim routes ``os.urandom``/``secrets`` to a seeded
|
|
6
|
+
source for the duration of a run, and the launcher pins ``PYTHONHASHSEED`` before the interpreter
|
|
7
|
+
starts so set/dict iteration order is fixed (ADR-0010).
|
|
8
|
+
|
|
9
|
+
Verified against the interpreter during design: shimming ``os.urandom`` alone does *not* control
|
|
10
|
+
``secrets``/``random``, because ``random`` binds ``from os import urandom as _urandom`` at import —
|
|
11
|
+
so the shim patches ``random._urandom`` too; and two child processes launched with the same
|
|
12
|
+
``PYTHONHASHSEED`` hash identically while a different value differs.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
import os
|
|
19
|
+
import random
|
|
20
|
+
import sys
|
|
21
|
+
from collections.abc import Callable, Iterator
|
|
22
|
+
from contextlib import contextmanager
|
|
23
|
+
|
|
24
|
+
_REEXEC_GUARD = "_SEEDLOOP_HASHSEED_REEXEC"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def substream(root_seed: int, label: str) -> random.Random:
    """Return a reproducible ``random.Random`` dedicated to the component named ``label``.

    The generator depends only on ``(root_seed, label)``: the text ``f"{root_seed}:{label}"`` is
    hashed with ``blake2b`` (not the builtin ``hash()``, which is randomized per process) and the
    32-byte digest, read as a big-endian integer, seeds the generator. Because the seed is
    formatted as text, any ``int`` is accepted, including negative or very large values.
    """
    canonical = f"{root_seed}:{label}".encode()
    hasher = hashlib.blake2b(digest_size=32)
    hasher.update(canonical)
    seed_value = int.from_bytes(hasher.digest(), "big")
    return random.Random(seed_value)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@contextmanager
def csprng_shim(stream: random.Random) -> Iterator[None]:
    """Make ``os.urandom`` and ``secrets`` draw from ``stream`` while the context is active.

    Two names are rebound to the same seeded function: ``os.urandom`` and the private
    ``random._urandom`` alias that ``secrets`` draws through. Both originals are put back on exit,
    whether the body finished normally or raised. Intended for one run at a time; runs do not
    overlap in one process.
    """
    # Remember what to put back before rebinding anything.
    originals = (os.urandom, random._urandom)  # type: ignore[attr-defined]
    replacement = _seeded_urandom(stream)
    os.urandom = replacement
    random._urandom = replacement  # type: ignore[attr-defined]
    try:
        yield
    finally:
        os.urandom, random._urandom = originals  # type: ignore[attr-defined]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _seeded_urandom(stream: random.Random) -> Callable[[int], bytes]:
    """Return an ``os.urandom``-shaped function whose bytes come from ``stream``."""

    def seeded_urandom(n: int) -> bytes:
        # A zero-length request returns empty bytes without drawing from the stream.
        if not n:
            return b""
        bits = stream.getrandbits(n * 8)
        return bits.to_bytes(n, "big")

    return seeded_urandom
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def hash_seed_for(root_seed: int) -> int:
    """The ``PYTHONHASHSEED`` value (0..4294967295) a run pins, derived from its root seed."""
    material = f"{root_seed}:hashseed".encode()
    # A 4-byte digest read as an unsigned big-endian integer always fits in 32 bits.
    return int(hashlib.blake2b(material, digest_size=4).hexdigest(), 16)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def ensure_hash_seed(root_seed: int) -> None:
    """Ensure the interpreter runs with the run's pinned ``PYTHONHASHSEED``.

    ``PYTHONHASHSEED`` is read once at interpreter start, so it cannot be set from inside a run;
    this re-runs the interpreter with the pinned value when needed. If already pinned, returns and
    the caller proceeds in-process. Otherwise it launches a pinned child running the same command
    and does not return — on POSIX by replacing the process (``execve``), on Windows (no true
    ``exec``) by spawning a child and exiting with its return code. A guard env var prevents
    infinite recursion.

    Anything already written to ``sys.stdout``/``sys.stderr`` is flushed before control is handed
    to the child, so buffered output is neither lost (``execve`` discards Python-level buffers)
    nor printed after the child's output.
    """
    target = str(hash_seed_for(root_seed))
    if os.environ.get(_REEXEC_GUARD) == target or os.environ.get("PYTHONHASHSEED") == target:
        return  # already pinned (our child, or started correctly); proceed in-process
    child_env = dict(os.environ, PYTHONHASHSEED=target, **{_REEXEC_GUARD: target})
    # sys.orig_argv is the full original command (including -c / -m and their payload), so the
    # child re-runs exactly what the parent ran; reconstructing from sys.argv would drop -c code.
    argv = [sys.executable, *sys.orig_argv[1:]]
    # exec replaces the process image immediately and does not flush open file objects, so flush
    # the standard streams first. They can be None (e.g. a windowless interpreter), hence the guard.
    for stream in (sys.stdout, sys.stderr):
        if stream is not None:
            stream.flush()
    if os.name == "posix":
        os.execve(sys.executable, argv, child_env)
    else:
        # Windows has no in-place exec; spawn a pinned child and propagate its exit code.
        import subprocess

        completed = subprocess.run(argv, env=child_env)
        sys.exit(completed.returncode)
|
seedloop/_loop.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""The deterministic event loop.
|
|
2
|
+
|
|
3
|
+
``asyncio``'s loop is already single-threaded and its scheduling is deterministic by
|
|
4
|
+
construction; the one nondeterministic seam is the I/O poll (``selector.select()``). seedloop
|
|
5
|
+
subclasses :class:`asyncio.BaseEventLoop` and overrides only ``_run_once`` to remove that poll
|
|
6
|
+
(ADR-0013): the ready queue is drained in faithful ``call_soon`` FIFO order (ADR-0012), and the
|
|
7
|
+
real-I/O surface is rejected rather than run (``docs/scope.md``).
|
|
8
|
+
|
|
9
|
+
Time is virtual: ``loop.time()`` starts at 0 and never advances by waiting. When every task is
|
|
10
|
+
blocked, the loop jumps the clock to the next scheduled timer (the autojump of ADR-0005), so a
|
|
11
|
+
ten-second ``sleep`` resolves instantly. Timers live in a heap keyed ``(when, seq)``, so equal
|
|
12
|
+
deadlines fire in scheduling order — a deterministic tie-break CPython's ``TimerHandle`` (ordered by
|
|
13
|
+
deadline alone) lacks. ``BaseEventLoop`` (unlike ``BaseSelectorEventLoop``) creates no selector and
|
|
14
|
+
no self-pipe, so no real socket exists in the loop.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import heapq
|
|
21
|
+
from collections.abc import Callable
|
|
22
|
+
from typing import Any, NoReturn
|
|
23
|
+
|
|
24
|
+
from seedloop.errors import BoundaryError, DeadlockError
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DeterministicLoop(asyncio.BaseEventLoop):
    """A single-threaded ``asyncio`` loop with no real I/O and a virtual clock.

    Timers are kept in this class's own heap (``_sl_timers``) rather than the base class's
    scheduled list, ``_run_once`` is replaced with a poll-free version, and every method that
    would touch real I/O, threads, subprocesses or signals raises ``BoundaryError``.
    """

    def __init__(self) -> None:
        super().__init__()
        self._sl_time = 0.0  # virtual monotonic time; advanced only by the autojump
        # Timer heap of (when, seq, handle); the monotonic seq is the deterministic tie-break,
        # so equal deadlines fire in scheduling order. Because seq is unique per entry, tuple
        # comparison is always decided by (when, seq) and never reaches the handle.
        self._sl_timers: list[tuple[float, int, asyncio.TimerHandle]] = []
        self._sl_timer_seq = 0
        # Optional hook the World uses to check invariants after each step; None by default, so a
        # run without invariants is unchanged. It may raise to fail the run.
        self._sl_after_step: Callable[[], None] | None = None

    def time(self) -> float:
        """Return the current virtual time (starts at 0.0)."""
        return self._sl_time

    def call_at(  # type: ignore[override]
        self, when: float, callback: Any, *args: Any, context: Any = None
    ) -> asyncio.TimerHandle:
        """Schedule ``callback(*args)`` at virtual time ``when`` and return its handle.

        The timer is pushed onto this loop's own heap together with the next sequence number.
        """
        self._check_closed()  # type: ignore[attr-defined]  # BaseEventLoop guard, not in the stubs
        timer = asyncio.TimerHandle(when, callback, args, self, context)
        heapq.heappush(self._sl_timers, (when, self._sl_timer_seq, timer))
        self._sl_timer_seq += 1
        return timer

    def call_later(  # type: ignore[override]
        self, delay: float, callback: Any, *args: Any, context: Any = None
    ) -> asyncio.TimerHandle:
        """Schedule ``callback(*args)`` ``delay`` virtual seconds after the current virtual time."""
        return self.call_at(self._sl_time + delay, callback, *args, context=context)

    def _timer_handle_cancelled(self, handle: asyncio.TimerHandle) -> None:
        # Cancelled timers are tombstoned and skipped when popped; no count bookkeeping needed.
        pass

    def _run_once(self) -> None:
        """Run one step: autojump if idle, promote due timers, then run the ready batch.

        Raises ``DeadlockError`` when nothing is ready, no live timer remains, and the loop is
        not stopping.
        """
        # Deterministic replacement for BaseEventLoop._run_once: no select(), no real I/O. When
        # nothing is ready, advance virtual time to the next timer (autojump); then promote every
        # timer now due and run the ready batch in faithful FIFO order (ADR-0012).
        ready: Any = self._ready  # type: ignore[attr-defined]  # BaseEventLoop's ready deque
        if not ready:
            # Purge first so the heap head (if any) is a live timer: a heap holding only
            # cancelled timers must count as empty for the deadlock check below.
            self._purge_cancelled_timers()
            if self._sl_timers:
                self._sl_time = max(self._sl_time, self._sl_timers[0][0])  # jump forward only
            elif not self._stopping:  # type: ignore[attr-defined]  # BaseEventLoop stop flag
                raise DeadlockError(
                    "the run is quiescent: every task is blocked and no timer is scheduled to "
                    "wake one"
                )
        self._fire_due_timers()
        # Run the batch ready at step start in registration order; callbacks scheduled mid-batch
        # run on the next step (the len() bound), matching CPython.
        for _ in range(len(ready)):
            handle = ready.popleft()
            if not handle.cancelled():
                handle._run()
        if self._sl_after_step is not None:
            self._sl_after_step()  # check invariants; may raise to fail the run

    def _fire_due_timers(self) -> None:
        # Promote every timer whose deadline has arrived (<= the clock) to the ready queue.
        # Timers are popped in (when, seq) order, so they join the ready queue in that order;
        # cancelled ones are dropped here rather than promoted.
        ready = self._ready  # type: ignore[attr-defined]
        while self._sl_timers and self._sl_timers[0][0] <= self._sl_time:
            handle = heapq.heappop(self._sl_timers)[2]
            if not handle.cancelled():
                ready.append(handle)

    def _purge_cancelled_timers(self) -> None:
        # Drop cancelled timers from the heap head so the earliest entry is a live deadline.
        # Only the head is examined; cancelled timers deeper in the heap stay until they surface.
        while self._sl_timers and self._sl_timers[0][2].cancelled():
            heapq.heappop(self._sl_timers)

    # --- boundary: operations that cannot be made deterministic are rejected (ADR-0002) ---

    def _reject(self, what: str) -> NoReturn:
        """Raise ``BoundaryError`` naming the rejected operation ``what``."""
        raise BoundaryError(
            f"{what} cannot be made deterministic and is out of scope inside a simulated run "
            f"(see docs/scope.md)"
        )

    # Each override below accepts any arguments and rejects unconditionally. The ``async def``
    # ones raise when the returned coroutine is awaited, not when the method is called.

    def run_in_executor(self, *args: Any, **kwargs: Any) -> NoReturn:  # type: ignore[override]
        self._reject("run_in_executor (real threads)")

    def call_soon_threadsafe(self, *args: Any, **kwargs: Any) -> NoReturn:  # type: ignore[override]
        self._reject("call_soon_threadsafe (another thread)")

    def add_reader(self, *args: Any, **kwargs: Any) -> NoReturn:  # type: ignore[override]
        self._reject("add_reader (real I/O)")

    def add_writer(self, *args: Any, **kwargs: Any) -> NoReturn:  # type: ignore[override]
        self._reject("add_writer (real I/O)")

    async def sock_recv(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("sock_recv (real socket)")

    async def sock_sendall(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("sock_sendall (real socket)")

    async def sock_connect(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("sock_connect (real socket)")

    async def getaddrinfo(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("getaddrinfo (real DNS)")

    async def getnameinfo(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("getnameinfo (real DNS)")

    async def create_connection(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("create_connection (real socket)")

    async def create_server(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("create_server (real socket)")

    async def create_datagram_endpoint(self, *args: Any, **kwargs: Any) -> NoReturn:
        # BaseEventLoop's version opens and binds a real UDP socket before failing; reject first.
        self._reject("create_datagram_endpoint (real socket)")

    async def connect_read_pipe(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("connect_read_pipe (real pipe)")

    async def connect_write_pipe(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("connect_write_pipe (real pipe)")

    async def subprocess_exec(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("subprocess_exec (real subprocess)")

    async def subprocess_shell(self, *args: Any, **kwargs: Any) -> NoReturn:
        self._reject("subprocess_shell (real subprocess)")

    def add_signal_handler(self, *args: Any, **kwargs: Any) -> NoReturn:  # type: ignore[override]
        self._reject("add_signal_handler (real signals)")

    # _process_events and _write_to_self are abstract on BaseEventLoop. We never poll and never
    # need a cross-thread wakeup, so both are inert.
    def _process_events(self, event_list: Any) -> None:
        pass

    def _write_to_self(self) -> None:
        pass
|
seedloop/_net.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""The simulated network: messages delivered as seeded timer events, with faults.
|
|
2
|
+
|
|
3
|
+
A message in flight is an ordinary timer on the loop's heap (``docs/network.md``). ``send`` draws a
|
|
4
|
+
latency from the seed's ``"net"`` sub-stream and schedules a delivery at ``now + latency``; ``recv``
|
|
5
|
+
blocks in virtual time until a message is queued. Reordering is emergent — two messages sent close
|
|
6
|
+
together draw independent latencies, so arrival order can differ from send order, reproducibly.
|
|
7
|
+
|
|
8
|
+
Faults — loss, duplication, and partitions — are drawn from the seed's ``"faults"`` sub-stream
|
|
9
|
+
(independent of ``"net"``, so enabling a fault does not shift surviving messages' latencies). An
|
|
10
|
+
endpoint can opt into a reliable, ordered channel. No real socket exists; the "network" is queues
|
|
11
|
+
and timers.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
from collections import deque
|
|
18
|
+
from random import Random
|
|
19
|
+
from typing import Protocol, runtime_checkable
|
|
20
|
+
|
|
21
|
+
from seedloop._trace import Timeline
|
|
22
|
+
from seedloop.errors import SeedloopError
|
|
23
|
+
|
|
24
|
+
Address = int # a node's address on the simulated network
|
|
25
|
+
Message = object # an opaque payload; seedloop schedules and orders it, never inspects it
|
|
26
|
+
|
|
27
|
+
# Default per-message latency range, in virtual seconds. Wide enough that two near-simultaneous
|
|
28
|
+
# sends can reorder.
|
|
29
|
+
_LAT_MIN = 0.001
|
|
30
|
+
_LAT_MAX = 0.020
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@runtime_checkable
class Endpoint(Protocol):
    """Structural type for a node's bound handle on the network.

    Anything exposing an ``address`` plus ``send`` and ``recv`` coroutines satisfies it.
    """

    # The address this endpoint is bound to.
    address: Address

    async def send(self, dst: Address, msg: Message) -> None:
        """Send ``msg`` towards the endpoint bound at ``dst``."""

    async def recv(self) -> tuple[Address, Message]:
        """Receive one message as an ``(address, message)`` pair."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class Transport:
    """The simulated network behind ``world.net``.

    Endpoints are registered by address; a message in flight is a timer on the loop. Latencies
    are drawn from ``net_rng`` and fault decisions from ``faults_rng``; every send, drop,
    duplication and delivery is recorded on the timeline.
    """

    def __init__(
        self,
        loop: asyncio.AbstractEventLoop,
        net_rng: Random,
        faults_rng: Random,
        timeline: Timeline,
    ) -> None:
        self._loop = loop
        self._timeline = timeline
        self._net = net_rng
        self._faults = faults_rng
        self._endpoints: dict[Address, _Endpoint] = {}
        # Groups of mutually reachable addresses; None means full connectivity.
        self._partition: list[set[Address]] | None = None
        # Latest scheduled delivery time per (src, dst) link, used to keep reliable links FIFO.
        self._reliable_clock: dict[tuple[Address, Address], float] = {}
        # Monotonic message id — the stable timeline identity, not Python id().
        self._next_mid = 0

    def bind(
        self,
        address: Address,
        *,
        reliable: bool = False,
        loss: float = 0.0,
        duplicate: float = 0.0,
    ) -> Endpoint:
        """Create and register the endpoint for ``address``.

        ``loss`` and ``duplicate`` are per-message probabilities applied to messages this
        endpoint sends. With ``reliable=True`` its messages are delivered in order with no loss
        or duplication, and ``loss``/``duplicate`` are ignored.

        Raises ``SeedloopError`` if ``address`` is already bound or if either probability lies
        outside ``[0, 1]``.
        """
        if address in self._endpoints:
            raise SeedloopError(f"address {address} is already bound")
        for label, probability in (("loss", loss), ("duplicate", duplicate)):
            if not 0.0 <= probability <= 1.0:
                raise SeedloopError(
                    f"{label} must be a probability in [0, 1], got {probability}"
                )
        bound = _Endpoint(self, address, reliable=reliable, loss=loss, duplicate=duplicate)
        self._endpoints[address] = bound
        return bound

    def partition(self, *groups: set[Address]) -> None:
        """Split the network into ``groups``; different groups cannot reach each other.

        The split lasts until ``heal`` (or another ``partition`` call replaces it). An address
        that appears in none of the groups keeps full connectivity.
        """
        # Copy each group so later mutation of the caller's sets cannot change the partition.
        self._partition = [set(members) for members in groups]

    def heal(self) -> None:
        """Remove any partition, restoring full connectivity."""
        self._partition = None

    def _reachable(self, src: Address, dst: Address) -> bool:
        # With no partition in force everything is reachable.
        groups = self._partition
        if groups is None:
            return True
        # The first group containing each address decides its side of the split.
        src_group = next((members for members in groups if src in members), None)
        dst_group = next((members for members in groups if dst in members), None)
        # An address in no group is unpartitioned and reaches everyone; otherwise both ends
        # must sit in the very same group.
        return src_group is None or dst_group is None or src_group is dst_group

    def _send(self, endpoint: _Endpoint, dst: Address, msg: Message) -> None:
        src = endpoint.address
        mid = self._next_mid
        self._next_mid = mid + 1
        record = self._timeline.record
        record((self._loop.time(), "send", mid, src, dst))

        if endpoint._reliable:
            # Reliable links bypass the fault draws entirely.
            self._schedule_reliable(mid, src, dst, msg)
            return

        # The fault stream is only drawn from when the corresponding probability is non-zero.
        lost = endpoint._loss > 0.0 and self._faults.random() < endpoint._loss
        if lost:
            record((self._loop.time(), "drop", mid, src, dst))
            return

        self._schedule_delivery(mid, src, dst, msg)

        duplicated = endpoint._duplicate > 0.0 and self._faults.random() < endpoint._duplicate
        if duplicated:
            # The copy gets its own independently drawn latency.
            record((self._loop.time(), "duplicate", mid, src, dst))
            self._schedule_delivery(mid, src, dst, msg)

    def _schedule_delivery(self, mid: int, src: Address, dst: Address, msg: Message) -> None:
        delay = self._net.uniform(_LAT_MIN, _LAT_MAX)
        self._loop.call_later(delay, self._deliver, mid, src, dst, msg)

    def _schedule_reliable(self, mid: int, src: Address, dst: Address, msg: Message) -> None:
        # Delivery times on one (src, dst) link never decrease; ties fire in send order through
        # the timer (when, seq) tie-break — so a reliable link delivers in order, with no loss or
        # duplication.
        delay = self._net.uniform(_LAT_MIN, _LAT_MAX)
        link = (src, dst)
        earliest = self._loop.time() + delay
        when = max(earliest, self._reliable_clock.get(link, 0.0))
        self._reliable_clock[link] = when
        self._loop.call_at(when, self._deliver, mid, src, dst, msg)

    def _deliver(self, mid: int, src: Address, dst: Address, msg: Message) -> None:
        record = self._timeline.record
        # Reachability is judged at the moment the delivery fires, not when the message was
        # sent: a partition opened in flight cuts the message, one healed in time lets it pass.
        if not self._reachable(src, dst):
            record((self._loop.time(), "drop-partitioned", mid, src, dst))
            return
        record((self._loop.time(), "deliver", mid, src, dst))
        receiver = self._endpoints.get(dst)
        if receiver is not None:
            receiver._enqueue((src, msg))
        # Otherwise nothing is bound at dst: the datagram is dropped, like sending into the void.
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class _Endpoint:
    """Concrete endpoint: a receive queue, at most one pending waiter, and its send policy."""

    def __init__(
        self,
        transport: Transport,
        address: Address,
        *,
        reliable: bool,
        loss: float,
        duplicate: float,
    ) -> None:
        self.address = address
        self._transport = transport
        # Outgoing-link policy, read by the transport when this endpoint sends.
        self._reliable = reliable
        self._loss = loss
        self._duplicate = duplicate
        # Delivered (sender, message) pairs not yet consumed by recv.
        self._queue: deque[tuple[Address, Message]] = deque()
        # Future the blocked receiver is awaiting, or None when nobody is blocked in recv.
        self._waiter: asyncio.Future[None] | None = None

    async def send(self, dst: Address, msg: Message) -> None:
        # Hands the message to the transport and returns at once; it never waits for delivery.
        self._transport._send(self, dst, msg)

    async def recv(self) -> tuple[Address, Message]:
        # One endpoint has one logical receiver. A second recv while the first is blocked would
        # replace (and so orphan) its waiter, so refuse loudly instead of corrupting delivery.
        if self._waiter is not None:
            raise SeedloopError("concurrent recv on one endpoint is not supported")
        inbox = self._queue
        while not inbox:
            pending = self._transport._loop.create_future()
            self._waiter = pending
            try:
                await pending
            finally:
                # Clear the slot on wake-up and on cancellation alike.
                self._waiter = None
        return inbox.popleft()

    def _enqueue(self, item: tuple[Address, Message]) -> None:
        self._queue.append(item)
        pending = self._waiter
        if pending is None or pending.done():
            return  # nobody is blocked, or the receiver has already been woken
        pending.set_result(None)
|
seedloop/_run.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Running scenarios: ``check`` sweeps seeds, ``replay`` reproduces one.
|
|
2
|
+
|
|
3
|
+
The contract (ADR-0003): a run is a pure function of its seed, so a failing seed *is* the
|
|
4
|
+
reproduction. ``check`` runs a scenario once per seed and reports the first failing seed; ``replay``
|
|
5
|
+
rebuilds that exact run. A fresh :class:`World` is built per seed with no shared mutable state, so
|
|
6
|
+
one run cannot bleed into the next.
|
|
7
|
+
|
|
8
|
+
``check``/``replay`` do not pin ``PYTHONHASHSEED`` (ADR-0015): the launcher re-runs the whole
|
|
9
|
+
interpreter, which is wrong to trigger implicitly from inside a test runner. The guarantee instead
|
|
10
|
+
rests on library code never depending on hash order; a user whose own code does can call
|
|
11
|
+
``seedloop.ensure_hash_seed`` at their entry point.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Awaitable, Callable, Iterable, Sequence
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Literal
|
|
19
|
+
|
|
20
|
+
from seedloop._audit import audit_mode
|
|
21
|
+
from seedloop._entropy import csprng_shim, substream
|
|
22
|
+
from seedloop._world import World
|
|
23
|
+
|
|
24
|
+
Scenario = Callable[[World], Awaitable[None]]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class CheckResult:
    """Immutable summary of one seed sweep.

    Attributes:
        checked: How many seeds ran.
        failing_seed: The first failing seed, or ``None`` if all passed.
        error: The exception that seed raised, or ``None``.
    """

    checked: int
    failing_seed: int | None
    error: Exception | None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _run_one(scenario: Scenario, seed: int, *, audit: bool = False) -> Sequence[object]:
    """Build a fresh ``World`` for ``seed``, drive ``scenario`` in it, and return the timeline.

    Exactly one guard is active while the scenario runs:

    * by default, the CSPRNG shim fed from the seed's ``"csprng"`` substream, so
      ``os.urandom`` and ``secrets`` draw from the seed and the seeded source is removed again
      afterwards rather than leaking into later runs;
    * with ``audit=True``, the non-determinism auditor instead, under which uncontrolled
      entropy (real time, the unseeded global ``random``, ``os.urandom``/``secrets``, a real
      thread) raises rather than being seeded or run, so a leak fails on this seed (ADR-0008).
    """
    world = World(seed)
    if audit:
        guard = audit_mode()
    else:
        guard = csprng_shim(substream(seed, "csprng"))
    with guard:
        run = scenario(world)
        world._drive(run)
    return world.timeline
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def check(
    scenario: Scenario,
    *,
    seeds: int | Iterable[int] = 1000,
    on_failure: Literal["raise", "return"] = "raise",
    audit: bool = False,
) -> CheckResult:
    """Run ``scenario`` once per seed; report the first seed that fails.

    Args:
        scenario: The async scenario to run against a fresh ``World`` per seed.
        seeds: ``N`` runs seeds ``0..N-1``; an iterable runs exactly those seeds, in order.
        on_failure: ``"raise"`` re-raises the failure tagged with its seed; ``"return"`` stops
            the sweep and returns the :class:`CheckResult`.
        audit: Run each seed under the non-determinism auditor, so an uncontrolled entropy
            source fails it (ADR-0008).

    Returns:
        A :class:`CheckResult` with ``failing_seed=None`` when every seed passed, or describing
        the first failure when ``on_failure="return"``.

    Raises:
        ValueError: if ``on_failure`` is neither ``"raise"`` nor ``"return"``.
        Exception: with ``on_failure="raise"``, whatever the first failing run raised — an
            ``assert``, a ``SeedloopError``, or any exception from the scenario.
    """
    # Validate before running anything: an unrecognised mode (e.g. a typo) would otherwise fall
    # through to the "return" branch and silently swallow a failure the caller expected to raise.
    if on_failure not in ("raise", "return"):
        raise ValueError(f"on_failure must be 'raise' or 'return', got {on_failure!r}")
    seed_iter: Iterable[int] = range(seeds) if isinstance(seeds, int) else seeds
    checked = 0
    for seed in seed_iter:
        checked += 1
        try:
            _run_one(scenario, seed, audit=audit)
        except Exception as error:
            # Only a scenario *failure* is caught; KeyboardInterrupt/SystemExit propagate so a
            # long sweep stays abortable (and is never mis-tagged as a failing seed).
            error.add_note(f"seedloop: failing seed={seed} (replay with seedloop.replay)")
            if on_failure == "raise":
                raise
            return CheckResult(checked=checked, failing_seed=seed, error=error)
    return CheckResult(checked=checked, failing_seed=None, error=None)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def replay(scenario: Scenario, *, seed: int, audit: bool = False) -> None:
    """Run ``scenario`` exactly once for ``seed``; a failure propagates to the caller.

    This is the same single-seed run ``check`` performs. Pass ``audit=True`` to reproduce the
    run under the non-determinism auditor (ADR-0008).
    """
    # The timeline the run returns is not surfaced here.
    _run_one(scenario, seed=seed, audit=audit)
|
seedloop/_trace.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""The timeline: an append-only record of a run's events.
|
|
2
|
+
|
|
3
|
+
Determinism is proven by replay — running the same scenario twice must produce an identical
|
|
4
|
+
timeline (``docs/testing.md``). This slice records the events a scenario chooses to log; later
|
|
5
|
+
slices add scheduled network and fault events with stable identities.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Sequence
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Timeline:
    """Append-only event log for a single run, kept in recording order."""

    def __init__(self) -> None:
        # Backing store; only ever appended to.
        self._events: list[object] = []

    def record(self, event: object) -> None:
        """Add ``event`` at the end of the log."""
        log = self._events
        log.append(event)

    @property
    def events(self) -> Sequence[object]:
        """A tuple copy of everything recorded so far; later records do not alter it."""
        return (*self._events,)

    def __repr__(self) -> str:
        return "Timeline({!r})".format(self._events)
|
seedloop/_world.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""The World: everything for one deterministic run, derived from one seed.
|
|
2
|
+
|
|
3
|
+
A run is a pure function of its seed. The World assembles the deterministic loop, the virtual clock,
|
|
4
|
+
and the seeded entropy into one object, exposes the user's seeded ``rng`` and the virtual clock, and
|
|
5
|
+
records a timeline so two runs of a seed can be compared. Users do not construct a World;
|
|
6
|
+
``check``/``replay`` build it and pass it to the scenario.
|
|
7
|
+
|
|
8
|
+
Scheduling stays faithful FIFO (ADR-0012), so the seed's observable effect is ``rng``, timer timing,
|
|
9
|
+
and the simulated network's delivery timing — not callback order.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
from collections.abc import Awaitable, Callable
|
|
16
|
+
from typing import Protocol, runtime_checkable
|
|
17
|
+
|
|
18
|
+
from seedloop._entropy import substream
|
|
19
|
+
from seedloop._loop import DeterministicLoop
|
|
20
|
+
from seedloop._net import Transport
|
|
21
|
+
from seedloop._trace import Timeline
|
|
22
|
+
from seedloop.errors import InvariantError
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@runtime_checkable
class Node(Protocol):
    """Structural type for user code the World can start: anything with an async ``run``."""

    async def run(self) -> None:
        """Execute the node's body."""
        ...
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class World:
    """One deterministic run, all derived from ``seed``.

    Bundles the deterministic loop, the seeded user ``rng``, the simulated network ``net`` and
    the run's timeline. ``check``/``replay`` construct it and hand it to the scenario.
    """

    def __init__(self, seed: int) -> None:
        self.seed = seed
        self.rng = substream(seed, "user")  # the user's entropy; never the global random
        self._loop = DeterministicLoop()
        self._timeline = Timeline()
        # Tasks created by start(), in start order; inspected for failures after the scenario.
        self._started: list[asyncio.Task[None]] = []
        # (name, predicate) pairs registered through always().
        self._invariants: list[tuple[str, Callable[[], bool]]] = []
        # The network gets its own substreams, separate from the user's rng.
        self.net = Transport(
            self._loop, substream(seed, "net"), substream(seed, "faults"), self._timeline
        )

    def now(self) -> float:
        """Current virtual time in seconds (advances by autojump, never by real waiting)."""
        return self._loop.time()

    def record(self, event: object) -> None:
        """Append an event to the run's timeline, stamped with the current virtual time.

        The timeline is the artifact that proves determinism: two runs of a seed must record an
        identical sequence. A scenario records the decisions whose reproducibility it cares
        about. Each entry is stored as a ``(virtual_time, event)`` pair.
        """
        self._timeline.record((self._loop.time(), event))

    def always(self, predicate: Callable[[], bool], *, name: str) -> None:
        """Register a safety property that must hold throughout the run.

        ``predicate`` is evaluated after every step (not during teardown); the first step where
        it is false raises ``InvariantError(name)``, which ``check`` reports. It must be pure and
        read-only — a predicate that mutates state or draws entropy would break determinism. A
        started node's body runs a step after ``start``, so a predicate over node state sees its
        initial value on the first check.
        """
        self._invariants.append((name, predicate))
        self._loop._sl_after_step = self._check_invariants  # check from the next step on

    def _check_invariants(self) -> None:
        """Evaluate every registered invariant in registration order; raise on the first false."""
        for name, predicate in self._invariants:
            if not predicate():
                raise InvariantError(name, self._loop.time())

    def start(self, *nodes: Node) -> None:
        """Schedule each node's ``run()`` coroutine as a task on the loop.

        A started node that raises fails the run (its exception surfaces from the run), rather
        than being orphaned and silently logged — a failure the seed must report.
        """
        for node in nodes:
            self._started.append(self._loop.create_task(node.run()))

    @property
    def timeline(self) -> tuple[object, ...]:
        """The recorded timeline so far (a read-only snapshot)."""
        return tuple(self._timeline.events)

    def _drive(self, main: Awaitable[None]) -> None:
        """Run the scenario to completion, surface any started-node failure, then close the loop.

        The loop is closed on every exit path, including when teardown itself raises.
        """
        try:
            self._loop.run_until_complete(main)
            # The scenario finished without raising; surface the first started node that failed
            # (a crashed node would otherwise be an orphaned task, only logged).
            for task in self._started:
                exc = task.exception() if task.done() and not task.cancelled() else None
                if exc is not None:
                    raise exc
        finally:
            try:
                # Invariants describe the logical run, not cancellation cleanup — stop checking
                # them before teardown, so a node mutating observed state in its cancel handler
                # cannot raise a spurious InvariantError (and cannot mask the real failure
                # raised above).
                self._loop._sl_after_step = None
                # Cancel every task still pending — started nodes and any the scenario spawned —
                # and let the cancellations process, so the loop closes without "Task was
                # destroyed but it is pending" warnings (a node loop that never returns, or a
                # recv stuck under a fault).
                # NOTE(review): all_tasks() returns a set, so the cancel order below follows set
                # iteration order — confirm cancel handlers cannot record order-sensitive events.
                pending = [t for t in asyncio.all_tasks(self._loop) if not t.done()]
                for task in pending:
                    task.cancel()
                if pending:
                    self._loop.run_until_complete(
                        asyncio.gather(*pending, return_exceptions=True)
                    )
            finally:
                # Processing the cancellations can itself raise (e.g. a task raising SystemExit
                # while being cancelled); the loop must still be released.
                self._loop.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Worked demos that run real protocols under seedloop."""
|
seedloop/demos/raft.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""A small Raft leader election, run under seedloop — the worked proof.
|
|
2
|
+
|
|
3
|
+
This is election only (terms, ``RequestVote``, majority, heartbeats); log replication, persistence,
|
|
4
|
+
and membership changes are out of scope. It exists to demonstrate one thing end to end: seedloop
|
|
5
|
+
finds a real class of consensus bug and replays it from a seed.
|
|
6
|
+
|
|
7
|
+
The bug is a deliberate, labelled toggle, not a claimed discovery in canonical Raft. With
|
|
8
|
+
``buggy=True`` a node omits the single-vote-per-term rule, so it can grant a vote to two candidates
|
|
9
|
+
in the same term; in a three-node cluster that lets both reach a majority and become leader in one
|
|
10
|
+
term — split-brain, the exact failure the majority rule exists to prevent. With ``buggy=False`` the
|
|
11
|
+
rule is enforced and the same seed sweep finds no violation. That two-sided result is the proof: the
|
|
12
|
+
violation is the toggled flaw, not an accident of the harness.
|
|
13
|
+
|
|
14
|
+
Run it: ``python -m seedloop.demos.raft``
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import sys
|
|
21
|
+
from typing import cast
|
|
22
|
+
|
|
23
|
+
import seedloop
|
|
24
|
+
from seedloop import World
|
|
25
|
+
|
|
26
|
+
FOLLOWER, CANDIDATE, LEADER = "follower", "candidate", "leader"
|
|
27
|
+
|
|
28
|
+
# Election timeouts are drawn from world.rng (so the seed owns the race); the leader's heartbeat is
|
|
29
|
+
# faster than any election timeout, so a stable leader suppresses new elections.
|
|
30
|
+
_ELECTION_MIN, _ELECTION_MAX = 0.15, 0.30
|
|
31
|
+
_HEARTBEAT = 0.05
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class RaftNode:
|
|
35
|
+
"""One node's election logic, sans-I/O against ``world.net``."""
|
|
36
|
+
|
|
37
|
+
def __init__(self, world: World, addr: int, peers: list[int], *, buggy: bool) -> None:
|
|
38
|
+
self._world = world
|
|
39
|
+
self._ep = world.net.bind(addr)
|
|
40
|
+
self.addr = addr
|
|
41
|
+
self._peers = peers
|
|
42
|
+
self._all = len(peers) + 1
|
|
43
|
+
self._buggy = buggy
|
|
44
|
+
self.term = 0
|
|
45
|
+
self.role = FOLLOWER
|
|
46
|
+
self._voted_for: int | None = None
|
|
47
|
+
self._votes: set[int] = set()
|
|
48
|
+
|
|
49
|
+
def _election_timeout(self) -> float:
|
|
50
|
+
return self._world.rng.uniform(_ELECTION_MIN, _ELECTION_MAX)
|
|
51
|
+
|
|
52
|
+
async def _broadcast(self, msg: object) -> None:
|
|
53
|
+
for p in self._peers:
|
|
54
|
+
await self._ep.send(p, msg)
|
|
55
|
+
|
|
56
|
+
def _adopt_newer_term(self, term: int) -> None:
|
|
57
|
+
# Only a strictly higher term resets the vote — a node votes at most once per term, so
|
|
58
|
+
# stepping down within the same term must NOT clear who it already voted for.
|
|
59
|
+
if term > self.term:
|
|
60
|
+
self.term = term
|
|
61
|
+
self.role = FOLLOWER
|
|
62
|
+
self._voted_for = None
|
|
63
|
+
self._votes = set()
|
|
64
|
+
|
|
65
|
+
async def run(self) -> None:
|
|
66
|
+
while True:
|
|
67
|
+
timeout = _HEARTBEAT if self.role == LEADER else self._election_timeout()
|
|
68
|
+
try:
|
|
69
|
+
src, msg = await asyncio.wait_for(self._ep.recv(), timeout=timeout)
|
|
70
|
+
except TimeoutError:
|
|
71
|
+
if self.role == LEADER:
|
|
72
|
+
await self._broadcast(("heartbeat", self.term, self.addr))
|
|
73
|
+
else:
|
|
74
|
+
await self._begin_election()
|
|
75
|
+
continue
|
|
76
|
+
await self._handle(src, msg)
|
|
77
|
+
|
|
78
|
+
async def _begin_election(self) -> None:
|
|
79
|
+
self.term += 1
|
|
80
|
+
self.role = CANDIDATE
|
|
81
|
+
self._voted_for = self.addr
|
|
82
|
+
self._votes = {self.addr} # vote for self
|
|
83
|
+
await self._broadcast(("request_vote", self.term, self.addr))
|
|
84
|
+
|
|
85
|
+
async def _handle(self, src: int, msg: object) -> None:
|
|
86
|
+
fields = cast("tuple[object, ...]", msg)
|
|
87
|
+
kind = fields[0]
|
|
88
|
+
if kind == "request_vote":
|
|
89
|
+
await self._on_request_vote(src, cast("int", fields[1]))
|
|
90
|
+
elif kind == "vote":
|
|
91
|
+
await self._on_vote(
|
|
92
|
+
cast("int", fields[1]), cast("int", fields[2]), cast("bool", fields[3])
|
|
93
|
+
)
|
|
94
|
+
elif kind == "heartbeat":
|
|
95
|
+
self._on_heartbeat(cast("int", fields[1]))
|
|
96
|
+
|
|
97
|
+
async def _on_request_vote(self, src: int, term: int) -> None:
|
|
98
|
+
self._adopt_newer_term(term)
|
|
99
|
+
grant = False
|
|
100
|
+
# The bug: the correct rule grants at most one vote per term (`_voted_for` guard); the buggy
|
|
101
|
+
# path drops the guard, so a node can vote for two candidates in one term.
|
|
102
|
+
if (
|
|
103
|
+
term == self.term
|
|
104
|
+
and self.role != LEADER
|
|
105
|
+
and (self._buggy or self._voted_for in (None, src))
|
|
106
|
+
):
|
|
107
|
+
grant = True
|
|
108
|
+
self._voted_for = src
|
|
109
|
+
await self._ep.send(src, ("vote", self.term, self.addr, grant))
|
|
110
|
+
|
|
111
|
+
async def _on_vote(self, term: int, voter: int, granted: bool) -> None:
|
|
112
|
+
if term == self.term and self.role == CANDIDATE and granted:
|
|
113
|
+
self._votes.add(voter) # distinct voters; a majority of them elects
|
|
114
|
+
if len(self._votes) > self._all // 2: # a majority elects
|
|
115
|
+
self.role = LEADER
|
|
116
|
+
await self._broadcast(("heartbeat", self.term, self.addr))
|
|
117
|
+
|
|
118
|
+
def _on_heartbeat(self, term: int) -> None:
|
|
119
|
+
self._adopt_newer_term(term)
|
|
120
|
+
if term == self.term and self.role != FOLLOWER:
|
|
121
|
+
self.role = FOLLOWER # a leader exists this term; step down but keep our vote
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def leaders_by_term(nodes: list[RaftNode]) -> dict[int, set[int]]:
    """Group the addresses of nodes whose role is currently leader by the term they hold.

    Terms with no leader are absent from the result.
    """
    by_term: dict[int, set[int]] = {}
    for node in nodes:
        if node.role != LEADER:
            continue
        term = node.term
        if term not in by_term:
            by_term[term] = set()
        by_term[term].add(node.addr)
    return by_term
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def at_most_one_leader_per_term(nodes: list[RaftNode]) -> bool:
    """Raft's election-safety property: return False as soon as some term has two leaders."""
    for leaders in leaders_by_term(nodes).values():
        if len(leaders) > 1:
            return False
    return True
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def election_scenario(*, buggy: bool, nodes: int = 3, seconds: float = 3.0) -> seedloop.Scenario:
    """Build a scenario that runs a ``nodes``-sized cluster for ``seconds`` of virtual time.

    Election safety is registered as an ``always`` invariant, so it is asserted throughout the
    run rather than only at the end. ``buggy`` is passed through to every node.
    """

    async def scenario(world: World) -> None:
        addrs = list(range(nodes))
        cluster: list[RaftNode] = []
        for addr in addrs:
            others = [p for p in addrs if p != addr]
            cluster.append(RaftNode(world, addr, others, buggy=buggy))
        for node in cluster:
            world.start(node)

        def election_is_safe() -> bool:
            return at_most_one_leader_per_term(cluster)

        world.always(election_is_safe, name="at-most-one-leader-per-term")
        # Let elections run; the invariant is checked each step.
        await asyncio.sleep(seconds)

    return scenario
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def find_split_brain(seeds: int = 200) -> int | None:
    """Sweep seeds ``0..seeds-1`` of the buggy election; return the first failing seed, if any.

    Returns ``None`` when no swept seed fails.
    """
    buggy_election = election_scenario(buggy=True)
    outcome = seedloop.check(buggy_election, seeds=seeds, on_failure="return")
    return outcome.failing_seed
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def main() -> None:
    """Run the two-sided demo and exit non-zero unless both sides of the proof hold.

    Side one: a seed sweep of the buggy election finds a split-brain, and replaying that seed
    raises the same ``InvariantError``. Side two: the correct election passes the same sweep.
    Any other outcome exits with status 1, so a run that did not prove anything cannot pass.
    """
    sweep = 200  # both sweeps use the same seeds, so the comparison is like for like
    print("seedloop Raft election demo - hunting for split-brain\n")
    seed = find_split_brain(sweep)
    if seed is None:
        print("no split-brain found in the swept seeds (try more seeds)")
        sys.exit(1)  # the proof did not reproduce — fail loudly (CI runs this)
    print(f"buggy election: split-brain found at seed={seed}")
    print(f" reproduce it: seedloop.replay(election_scenario(buggy=True), seed={seed})")
    try:
        seedloop.replay(election_scenario(buggy=True), seed=seed)
    except seedloop.InvariantError as exc:
        print(f" replay reproduces it: {exc}")
    else:
        # The failing seed is supposed to *be* the reproduction; a clean replay means the run
        # was not a pure function of its seed, which is exactly what this demo must catch.
        print(" replay did NOT reproduce the violation")
        sys.exit(1)
    clean = seedloop.check(election_scenario(buggy=False), seeds=sweep, on_failure="return")
    verdict = (
        "no violation" if clean.failing_seed is None else f"FAILED at seed={clean.failing_seed}"
    )
    print(f"\ncorrect election (single-vote rule enforced): {verdict} over the same {sweep} seeds")
    if clean.failing_seed is not None:
        # The correct election failing would contradict the conclusion below; do not print it.
        sys.exit(1)
    print("-> the violation is the toggled flaw, not the harness.")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
if __name__ == "__main__":
|
|
181
|
+
main()
|
seedloop/errors.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Exceptions seedloop raises.
|
|
2
|
+
|
|
3
|
+
One specific exception per failure mode; nothing is swallowed. The hierarchy is rooted at
|
|
4
|
+
``SeedloopError`` so everything seedloop raises can be caught with a single class.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SeedloopError(Exception):
    """Base class for every error seedloop raises."""


class BoundaryError(SeedloopError):
    """A simulated run reached outside the determinism boundary.

    Real threads, ``run_in_executor``, subprocesses, real sockets, and cross-thread wakeups
    cannot be made deterministic, so they are rejected rather than run silently (``docs/scope.md``).
    """


class DeadlockError(SeedloopError):
    """The run cannot progress and nothing is scheduled to wake it.

    A real ``asyncio`` program would hang here; a simulated run raises instead of spinning, so
    the deadlock is a visible failure tied to the seed that produced it.
    """


class InvariantError(SeedloopError):
    """An ``always(...)`` invariant was violated during a run.

    A continuous safety property (e.g. "at most one leader") that must hold throughout, checked
    after every step; the first step where it is false raises this, which ``check`` reports as the
    failure. Carries the invariant's ``name`` and the virtual ``time`` of the violation.
    """

    def __init__(self, name: str, time: float) -> None:
        super().__init__(f"invariant {name!r} violated at t={time}")
        self.name = name
        self.time = time

    def __reduce__(self) -> tuple[object, ...]:
        # ``args`` holds only the formatted message, so the default reduction would rebuild the
        # error as ``InvariantError(message)`` and fail on the missing ``time``. Rebuild from the
        # real constructor arguments; the instance dict carries along any attached notes.
        return (type(self), (self.name, self.time), dict(vars(self)))


class EntropyLeakError(BoundaryError):
    """An uncontrolled entropy source was touched inside a simulated run.

    In audit mode the non-determinism auditor raises this when code reaches for real
    ``os.urandom``/``secrets``, real time, or the unseeded global ``random`` instead of the World's
    seeded source (``docs/decisions.md`` ADR-0008). Carries the offending ``source``.
    """

    def __init__(self, source: str) -> None:
        super().__init__(
            f"uncontrolled entropy source {source!r} used inside a run; route it through the seed "
            f"(world.rng) or the virtual clock — see docs/scope.md"
        )
        self.source = source

    def __reduce__(self) -> tuple[object, ...]:
        # Same reason as InvariantError: ``args`` is the rendered message, not ``source``, so
        # copying or unpickling must go back through the real constructor argument.
        return (type(self), (self.source,), dict(vars(self)))
|
seedloop/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seedloop
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Deterministic simulation testing for Python asyncio.
|
|
5
|
+
Author-email: Vojtěch Klíma <vojtechklima02@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/klimavojtech2002/seedloop
|
|
8
|
+
Project-URL: Repository, https://github.com/klimavojtech2002/seedloop
|
|
9
|
+
Project-URL: Issues, https://github.com/klimavojtech2002/seedloop/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/klimavojtech2002/seedloop/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: asyncio,testing,determinism,simulation,concurrency
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Testing
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
24
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-timeout>=2; extra == "dev"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# seedloop
|
|
30
|
+
|
|
31
|
+
Deterministic simulation testing for Python. Run your concurrent async logic through thousands of
|
|
32
|
+
controlled, reproducible timelines — varying message timing and delivery order, injecting network
|
|
33
|
+
faults, partitions, and delays — to surface the rare concurrency bug that shows up once in a million
|
|
34
|
+
runs, and replay it exactly from a seed.
|
|
35
|
+
|
|
36
|
+
It brings the FoundationDB / TigerBeetle / Antithesis style of reliability testing — until now living
|
|
37
|
+
only in Rust, C++, and Java — to Python's `asyncio`, as a `pip`-installable library.
|
|
38
|
+
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
## The problem
|
|
42
|
+
|
|
43
|
+
Concurrency bugs are the worst bugs. A protocol or state machine works in every test, then once a
|
|
44
|
+
week in CI a test fails, and nobody can reproduce it — because the failure depended on an exact
|
|
45
|
+
interleaving of events, a message arriving late, a partition healing at the wrong moment. You cannot
|
|
46
|
+
fix what you cannot reproduce, so these bugs are patched by guesswork and survive for years.
|
|
47
|
+
|
|
48
|
+
Deterministic simulation testing (DST) inverts this. It takes total control of every source of
|
|
49
|
+
nondeterminism — scheduling order, time, randomness, the network — and drives them all from a single
|
|
50
|
+
seed. The same seed produces the same timeline, so the same bug, every time. You explore thousands of
|
|
51
|
+
seeds to hunt for failures, and when one is found, the seed *is* the reproduction: replay it and the
|
|
52
|
+
bug happens again, deterministically, every run.
|
|
53
|
+
|
|
54
|
+
This is how FoundationDB reached its reliability record. It exists as a polished library in Rust
|
|
55
|
+
(`madsim`, `turmoil`). In Python — where a great deal of distributed and protocol code is written — it
|
|
56
|
+
does not exist at all. `seedloop` is that library.
|
|
57
|
+
|
|
58
|
+
## What you do with it
|
|
59
|
+
|
|
60
|
+
You write your protocol or algorithm against an abstract transport (the
|
|
61
|
+
[sans-I/O](https://sans-io.readthedocs.io/) style), and `seedloop` runs it inside a deterministic
|
|
62
|
+
world it fully controls. A test looks like this (`World`, `check`, `replay`, the network `world.net`
|
|
63
|
+
with loss/duplication/partitions, the `world.always` invariant API, and the `audit=True`
|
|
64
|
+
non-determinism auditor are all implemented; the seed-scheduled `world.run_for` fault schedule is the
|
|
65
|
+
next phase, specified in [docs/api.md](docs/api.md)):
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
import seedloop
|
|
69
|
+
|
|
70
|
+
async def scenario(world: seedloop.World) -> None:
|
|
71
|
+
# Spin up your nodes; they send messages through the simulated network.
|
|
72
|
+
nodes = [RaftNode(addr, world.net) for addr in range(5)]
|
|
73
|
+
world.start(*nodes)
|
|
74
|
+
|
|
75
|
+
# State the invariant that must hold at every step, not just at the end.
|
|
76
|
+
world.always(lambda: at_most_one_leader(nodes), name="at-most-one-leader")
|
|
77
|
+
|
|
78
|
+
# Inject chaos the seed decides the details of.
|
|
79
|
+
await world.run_for(seconds=10, faults=[world.partition(), world.slow_link()])
|
|
80
|
+
|
|
81
|
+
# Hunt across 10,000 seeded timelines; on failure, print the seed.
|
|
82
|
+
seedloop.check(scenario, seeds=10_000)
|
|
83
|
+
# A failing run re-raises tagged with its seed ("failing seed=4823") → replay with seedloop.replay(scenario, seed=4823)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
`seedloop.replay(scenario, seed=4823)` re-runs that exact timeline, deterministically, as many times
|
|
87
|
+
as you need to debug it. The full API is in [docs/api.md](docs/api.md).
|
|
88
|
+
|
|
89
|
+
## The worked proof: a Raft split-brain, found and replayed
|
|
90
|
+
|
|
91
|
+
A small Raft leader election ships as a demo. With a deliberate, labelled flaw — a node that omits the
|
|
92
|
+
single-vote-per-term rule — a seed sweep finds the timing where two nodes both win an election in the
|
|
93
|
+
same term (split-brain), and replays it from the seed. The corrected election passes the same sweep, so
|
|
94
|
+
the violation is the toggled flaw, not the harness: in a three-node cluster the shared third voter can
|
|
95
|
+
only break the tie once under the single-vote rule, so one candidate gets two votes and the other one —
|
|
96
|
+
never two leaders.
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
$ python -m seedloop.demos.raft
|
|
100
|
+
seedloop Raft election demo - hunting for split-brain
|
|
101
|
+
|
|
102
|
+
buggy election: split-brain found at seed=7
|
|
103
|
+
reproduce it: seedloop.replay(election_scenario(buggy=True), seed=7)
|
|
104
|
+
replay reproduces it: invariant 'at-most-one-leader-per-term' violated at t=0.229...
|
|
105
|
+
correct election (single-vote rule enforced): no violation over the same 200 seeds
|
|
106
|
+
-> the violation is the toggled flaw, not the harness.
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The election logic is in [`src/seedloop/demos/raft.py`](src/seedloop/demos/raft.py). It is election only
|
|
110
|
+
(terms, `RequestVote`, majority, heartbeats) — log replication, persistence, and membership changes are
|
|
111
|
+
out of scope.
|
|
112
|
+
|
|
113
|
+
## What it does
|
|
114
|
+
|
|
115
|
+
- A **deterministic event loop** that makes `asyncio` task scheduling reproducible and drives the I/O
|
|
116
|
+
seam — where nondeterminism actually enters — from the seed.
|
|
117
|
+
- A **virtual clock** — `sleep` and timeouts advance simulated time instantly; no run is slower for
|
|
118
|
+
testing a 10-second scenario.
|
|
119
|
+
- **Seeded randomness** everywhere, so a run is a pure function of its seed.
|
|
120
|
+
- A **simulated network** with seeded latency, reordering, message loss, and partitions.
|
|
121
|
+
- **Fault injection** driven by the seed, so chaos is reproducible rather than random.
|
|
122
|
+
- **Invariants** — `world.always(...)` checks a continuous safety property at every step.
|
|
123
|
+
- A **non-determinism auditor** — `audit=True` turns any uncontrolled entropy source into a loud,
|
|
124
|
+
reproducible failure, so the determinism boundary is enforced, not just stated.
|
|
125
|
+
- **Seed replay** — the whole point: any failure reduces to a single integer you can replay forever.
|
|
126
|
+
|
|
127
|
+
## Scope — what it tests, and what it deliberately does not
|
|
128
|
+
|
|
129
|
+
The honesty in this section is the point. `seedloop` makes your async *logic* deterministic; it does
|
|
130
|
+
not make your *infrastructure* deterministic, and it does not pretend to. The full boundary, and the engineering reasons behind it, are in
|
|
131
|
+
[docs/scope.md](docs/scope.md). In short:
|
|
132
|
+
|
|
133
|
+
- **It is for** pure-Python async code that talks to an abstract transport: consensus (Raft/Paxos),
|
|
134
|
+
replication, gossip, CRDTs, custom wire protocols, schedulers, retry/backoff/circuit-breaker logic,
|
|
135
|
+
rate limiters — code where the *logic* holds the concurrency bugs.
|
|
136
|
+
- **It is not for** I/O-heavy applications bound to real drivers. Real threads, `multiprocessing`,
|
|
137
|
+
`uvloop`, and C-extension drivers (`asyncpg`, `grpcio`) are explicitly out of scope, because their
|
|
138
|
+
scheduling cannot be controlled from Python — the same wall that stops deterministic testing in Go.
|
|
139
|
+
`seedloop` tests your algorithm, not your database driver.
|
|
140
|
+
|
|
141
|
+
Choosing this boundary deliberately — rather than promising determinism it cannot deliver — is what
|
|
142
|
+
keeps the guarantee real.
|
|
143
|
+
|
|
144
|
+
## Status
|
|
145
|
+
|
|
146
|
+
The planned build is **complete through v0.3.0**: the deterministic core (custom event loop, virtual
|
|
147
|
+
clock with autojump, seeded entropy, the `World` / `check` / `replay` API), the simulated network with
|
|
148
|
+
fault injection (loss, duplication, partitions), the `world.always` invariant API, the non-determinism
|
|
149
|
+
auditor (`audit=True`), and the worked Raft demo (which runs today) — so `asyncio` runs are reproducible
|
|
150
|
+
and instant, a partition- or timing-dependent bug replays identically from its seed, and an uncontrolled
|
|
151
|
+
entropy source fails loudly under audit. Deferred: the seed-scheduled `world.run_for` fault schedule and
|
|
152
|
+
an optional Hypothesis integration (`seedloop[hypothesis]`). The full API target is in
|
|
153
|
+
[docs/api.md](docs/api.md) and the phased build in [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md).
|
|
154
|
+
|
|
155
|
+
## Why it exists
|
|
156
|
+
|
|
157
|
+
There is no `pip`-installable deterministic simulation testing framework for Python `asyncio` — the
|
|
158
|
+
capability lives in Rust (`madsim`, `turmoil`), C++ (FoundationDB), Java (OpenDST), and behind a
|
|
159
|
+
commercial hypervisor (Antithesis), but not in Python. Meanwhile the discipline is rising fast among
|
|
160
|
+
serious engineers (Antithesis raised a $105M round led by Jane Street to standardize DST; AWS has
|
|
161
|
+
codified deterministic and formal methods as standing practice). As one of its proponents puts it:
|
|
162
|
+
*writing code is no longer the bottleneck — making sure it does the right thing is.* `seedloop` is a
|
|
163
|
+
tool for exactly that, in the language that lacked it.
|
|
164
|
+
|
|
165
|
+
## Documentation
|
|
166
|
+
|
|
167
|
+
The design is specified before the code:
|
|
168
|
+
|
|
169
|
+
- [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) — how `asyncio` is made deterministic, and the phased build.
|
|
170
|
+
- [docs/api.md](docs/api.md) — the public API: `World`, `check`/`replay`, the transport, faults.
|
|
171
|
+
- [docs/internals.md](docs/internals.md) — the loop, virtual clock, entropy control, network and fault scheduling.
|
|
172
|
+
- [docs/network.md](docs/network.md) — the simulated transport and fault model.
|
|
173
|
+
- [docs/scope.md](docs/scope.md) — the determinism boundary: what is controlled and what is not.
|
|
174
|
+
- [docs/testing.md](docs/testing.md) — how determinism is proven by replay.
|
|
175
|
+
- [docs/decisions.md](docs/decisions.md) — the decision records (ADRs).
|
|
176
|
+
- [docs/glossary.md](docs/glossary.md) — the vocabulary.
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
seedloop/__init__.py,sha256=v0u4D7SHFXW-Jo_FGEJbr9GhtXy4jhtOLwW8QvIQDhM,1191
|
|
2
|
+
seedloop/_audit.py,sha256=FH0go-ybFodw1h3AIr3IRim3kPUPMAc-U2KFC0z6P0o,4517
|
|
3
|
+
seedloop/_entropy.py,sha256=4ZZfXN4HsZ9EiZ3024-zxjvbXR2XWaxQxqT2MQEXGP8,4499
|
|
4
|
+
seedloop/_loop.py,sha256=UT0lh6lawusSogsmXvAJ52AwG7tK0W0OYizGpt3rZ-s,7885
|
|
5
|
+
seedloop/_net.py,sha256=GqywpytBmLuPLtpB8KJpCbotYsl-OLXnjNkt0LFKwg0,7979
|
|
6
|
+
seedloop/_run.py,sha256=OPZQgCN5f9IG4xN_E616s_YhP0UAn2M7dSVvpePx31c,3911
|
|
7
|
+
seedloop/_trace.py,sha256=TEfJGP0E5M5Y1L-MEvPOsBb6QR3moZH49z5CloMOmkk,921
|
|
8
|
+
seedloop/_world.py,sha256=Pn9mTP4yu8PsmhX6hdIPwuknjJsyd_dsUJZBBXn9qHM,5253
|
|
9
|
+
seedloop/errors.py,sha256=absRq_4jWgzLlxIOBcJwMGYenlldml2p3maJVGwYbfI,2168
|
|
10
|
+
seedloop/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
seedloop/demos/__init__.py,sha256=7ux1hd1HRarZjNPHN3cELwwBXo7dHR-NTuCkKOVMdEs,59
|
|
12
|
+
seedloop/demos/raft.py,sha256=FUl06jOu-8axF4nplkOmZCZZEaW01O4UqN6ZpKPnS10,7520
|
|
13
|
+
seedloop-0.3.0.dist-info/licenses/LICENSE,sha256=bq78RJMIno1EDrCmF4-Q6E8TqWGPk12dM1yauYtEGqM,1072
|
|
14
|
+
seedloop-0.3.0.dist-info/METADATA,sha256=A4-L13cM1e35LrMNKYK5hKA_uIvEis9jRIDJrB1HAlg,10201
|
|
15
|
+
seedloop-0.3.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
16
|
+
seedloop-0.3.0.dist-info/top_level.txt,sha256=CDWSLLQIpYsE2ds0ATOpEYX_eQUGUjng4tE8l2sO9MA,9
|
|
17
|
+
seedloop-0.3.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Vojtěch Klíma
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
seedloop
|