dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/lane_journal.py
ADDED
|
@@ -0,0 +1,1001 @@
|
|
|
1
|
+
"""Lane-journal — a write-ahead log for the lane-lease arbiter (LJ-series).
|
|
2
|
+
|
|
3
|
+
The pure lane arbiter (`arbiter.arbitrate`) decides admission from a *live-lease
|
|
4
|
+
set* — current state only, no history. Without a durable record of what the
|
|
5
|
+
arbiter *decided*, "why was I refused at 14:03?", "when did this orphan die and
|
|
6
|
+
who reclaimed it?", and "reconstruct the lane state after a crash" are all
|
|
7
|
+
unanswerable, and the live set itself has nowhere durable to live across
|
|
8
|
+
processes.
|
|
9
|
+
|
|
10
|
+
This module is the **write-ahead log** that classic schedulers and lock managers
|
|
11
|
+
always keep: every lane decision (ACQUIRE / RELEASE / HEARTBEAT / SCAVENGE /
|
|
12
|
+
REFUSE / HALT / RECONCILE / ENFORCE / ADOPT / ATTEMPT / SPAWN) is appended — and `fsync`'d — to an append-only JSONL
|
|
13
|
+
file. `replay()` folds the log back into the authoritative live-lease set (so the
|
|
14
|
+
journal *is* the cross-process registry — there is no second store to keep in
|
|
15
|
+
sync), and `tail`/`read_all` answer history queries. The generic writer is the
|
|
16
|
+
Layer-3 `lane_lease` shell (`acquire`/`release`/`heartbeat`/`halt`) plus the
|
|
17
|
+
supervisor driver's `scavenge`; each appends inside its own `_Mutex`, so journal
|
|
18
|
+
append order equals decision order — the WAL invariant. `replay` folds by append
|
|
19
|
+
order and ignores `seq` (which is cosmetic), so an `O_APPEND` write under that
|
|
20
|
+
mutex is sufficient.
|
|
21
|
+
|
|
22
|
+
Design rules (the LJ scope boundary):
|
|
23
|
+
|
|
24
|
+
* **Pure where it can be.** `replay()` / `compact()` take entries and return
|
|
25
|
+
entries — entries in, list out, no disk — so the suite replays and compacts
|
|
26
|
+
them without touching a file. Only `append` / `read_all` / `tail` touch disk.
|
|
27
|
+
* **Log under the lock.** The writer appends inside the lease mutex that
|
|
28
|
+
serializes the decision, so a reader's `replay` sees a consistent order.
|
|
29
|
+
* **Torn-tail tolerant.** A process killed mid-`append` can leave a partial
|
|
30
|
+
final line. `read_all` skips an unparseable *trailing* line (and only the
|
|
31
|
+
trailing one) rather than raising — a half-written record is "didn't happen",
|
|
32
|
+
the safe WAL reading. A non-trailing corrupt line is kept as a `_CORRUPT`
|
|
33
|
+
sentinel so an audit still sees the integrity breach (and `compact` preserves
|
|
34
|
+
it — a rewrite must never silently erase it).
|
|
35
|
+
* **Host-local.** One journal per host. Every entry stamps `host_id` so a future
|
|
36
|
+
cross-host merge is *possible*, but cross-host coordination is out of scope.
|
|
37
|
+
* **Bounded by an explicit compaction, not auto-rotation.** The WAL is
|
|
38
|
+
append-only; `compact()` folds it to a single CHECKPOINT snapshot of the live
|
|
39
|
+
set when an operator runs `dos journal compact`. It is **live-set-preserving**
|
|
40
|
+
(`replay(compact(E)) == replay(E)` — the arbiter sees the identical leases), but
|
|
41
|
+
NOT liveness-fold-preserving: a CHECKPOINT carries no `ts`, so a mid-flight
|
|
42
|
+
compaction makes a still-live run read STALLED until its next beat (always the
|
|
43
|
+
safe direction — compaction can never fabricate a beat/event). Run it in a quiet
|
|
44
|
+
window. An automatic size/age trigger + a `[journal]` retention seam is deferred.
|
|
45
|
+
|
|
46
|
+
Read::
|
|
47
|
+
|
|
48
|
+
dos journal tail [N] # last N entries (default 20)
|
|
49
|
+
dos journal replay # reconstructed live-lease set
|
|
50
|
+
dos journal seq # current max seq
|
|
51
|
+
dos journal compact # fold to a CHECKPOINT snapshot (bound the file)
|
|
52
|
+
|
|
53
|
+
Write is library-only (the writers are `lane_lease` / the supervisor driver, each
|
|
54
|
+
under its own mutex) — there is deliberately no `append` CLI subcommand, so
|
|
55
|
+
nothing can journal a decision outside the lock that serializes it.
|
|
56
|
+
"""
|
|
57
|
+
from __future__ import annotations
|
|
58
|
+
|
|
59
|
+
import datetime as dt
|
|
60
|
+
import json
|
|
61
|
+
import os
|
|
62
|
+
import sys
|
|
63
|
+
from pathlib import Path
|
|
64
|
+
from typing import Any, Iterable
|
|
65
|
+
|
|
66
|
+
# Best-effort: switch both standard streams to UTF-8 with replacement of
# unencodable characters. Any failure (for example a replaced stream object
# without `reconfigure`) is ignored on purpose and aborts the remaining
# stream, so import never fails because of console setup.
try:
    for _stream in (sys.stdout, sys.stderr):
        _stream.reconfigure(encoding="utf-8", errors="replace")
except Exception:
    pass
|
|
71
|
+
|
|
72
|
+
from dos import config as _config
|
|
73
|
+
from dos import durable_schema as _schema
|
|
74
|
+
|
|
75
|
+
# The durable-schema family + version for lane-journal records that carry a tag.
|
|
76
|
+
# Today ONLY the OP_ATTEMPT event tags itself (docs/207 §3) — the lease ops predate
|
|
77
|
+
# the tag contract and replay reads them as UNTAGGED (the tolerant legacy floor, so
|
|
78
|
+
# no existing journal needs migrating). The version is bumped ONLY on a non-additive
|
|
79
|
+
# change to a tagged record's shape; a new field is additive and never bumps it.
|
|
80
|
+
SCHEMA_FAMILY = "lane-journal"
|
|
81
|
+
LANE_JOURNAL_SCHEMA = 1
|
|
82
|
+
|
|
83
|
+
# Host-local WAL. The default resolves against the ACTIVE WORKSPACE (the injected
|
|
84
|
+
# config), never the package's own tree (the workspace-root rule). The
|
|
85
|
+
# `DISPATCH_LANE_JOURNAL_PATH` env override is the workspace-neutral alias;
|
|
86
|
+
# `JOB_LANE_JOURNAL_PATH` is a back-compat alias an early consumer still sets.
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _default_journal_path() -> Path:
    """Return the lane-journal path recorded in the active workspace config."""
    active_config = _config.active()
    return active_config.paths.lane_journal
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Module-level convenience handle, resolved LAZILY (PEP 562 `__getattr__`) the
|
|
94
|
+
# first time `lane_journal.JOURNAL_PATH` is actually read — NOT at import. The
|
|
95
|
+
# original eager `JOURNAL_PATH = Path(... or _default_journal_path())` forced
|
|
96
|
+
# `config.active()` (→ `default_config` → the git-SHA subprocess + the WMI
|
|
97
|
+
# platform probe in `gather_env_print`) to run the instant `import dos`
|
|
98
|
+
# happened, taxing EVERY consumer's cold start ~tens of ms for a path almost no
|
|
99
|
+
# caller reads as a value (the live functions all call `_journal_path()` below,
|
|
100
|
+
# which re-resolves per call so a test that sets the env override after import
|
|
101
|
+
# still redirects). Deferring it keeps `import dos` cheap; the name stays exported
|
|
102
|
+
# for back-compat (`from dos.lane_journal import *` / the host re-export shims).
|
|
103
|
+
def __getattr__(name: str) -> Any:  # noqa: D401 — PEP 562 module hook
    """Serve lazily computed module attributes (PEP 562).

    Only ``JOURNAL_PATH`` is provided; it is re-resolved through
    ``_journal_path()`` on every access rather than cached. Any other name
    raises ``AttributeError`` with the standard "module has no attribute"
    wording.
    """
    if name != "JOURNAL_PATH":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return _journal_path()
|
|
107
|
+
|
|
108
|
+
# The decision vocabulary. ACQUIRE/RELEASE ship in LJ1 (the throughline);
|
|
109
|
+
# the rest are wired in LJ2/LJ5 but the replay folder already understands
|
|
110
|
+
# them so a forward-compatible journal replays cleanly the day they appear.
|
|
111
|
+
OP_ACQUIRE = "ACQUIRE"
|
|
112
|
+
OP_RELEASE = "RELEASE"
|
|
113
|
+
OP_HEARTBEAT = "HEARTBEAT"
|
|
114
|
+
OP_SCAVENGE = "SCAVENGE"
|
|
115
|
+
OP_REFUSE = "REFUSE" # LJ2 — recorded, but does NOT mutate lease state
|
|
116
|
+
OP_RECONCILE = "RECONCILE" # LJ5 — crash-recovery reconcile, recorded. NO
|
|
117
|
+
# in-kernel writer: RECONCILE re-asserts a lease into
|
|
118
|
+
# a SEPARATE live registry the WAL says is held. This
|
|
119
|
+
# kernel has ONE store — `replay` reconstructs the
|
|
120
|
+
# registry FROM the WAL — so there is no second store
|
|
121
|
+
# to reconcile into, and the writer is host-side (a
|
|
122
|
+
# host with its own execution-state.yaml). The op +
|
|
123
|
+
# the replay fold (folds it identically to ACQUIRE)
|
|
124
|
+
# stay for that forward-compat; the kernel just never
|
|
125
|
+
# emits one. (Contrast SCAVENGE, which IS in-repo —
|
|
126
|
+
# eviction is a real action against the one WAL.)
|
|
127
|
+
OP_HALT = "HALT" # docs/99 — a STOP DECISION for an in-flight run;
|
|
128
|
+
# recorded as INTENT, does NOT mutate lease state
|
|
129
|
+
# (the eventual RELEASE/SCAVENGE confirms eviction)
|
|
130
|
+
OP_ENFORCE = "ENFORCE" # docs/189 §C4 — an ENFORCEMENT OUTCOME: a handler
|
|
131
|
+
# (dos.enforce) proposed an effect on an intervention
|
|
132
|
+
# decision (observe/warn/block/defer). Recorded for
|
|
133
|
+
# forensics like REFUSE/HALT — it grants/removes NO
|
|
134
|
+
# lease, so replay ignores it for state. This is the
|
|
135
|
+
# missing PRODUCER that makes "which call was blocked,
|
|
136
|
+
# by which handler, and what was substituted?" answerable
|
|
137
|
+
# from the spine (the ARIES-recovery gap a blocking
|
|
138
|
+
# handler otherwise left no trace of). The kernel records
|
|
139
|
+
# the proposal; a host PEP performed (or did not) the act.
|
|
140
|
+
OP_ADOPT = "ADOPT" # C5 (docs/95) — a lease OWNERSHIP TRANSFER: a new
|
|
141
|
+
# acquirer takes over a lease whose holder is gone but
|
|
142
|
+
# whose recorded children are still live. replay rewrites
|
|
143
|
+
# the live lease's holder/pid/host_id to the adopter while
|
|
144
|
+
# KEEPING its (loop_ts, lane) identity, tree, and children
|
|
145
|
+
# — adoption is an ownership rewrite, NEVER a kill (the
|
|
146
|
+
# grandchildren keep running). The host decides WHEN to
|
|
147
|
+
# adopt (it measures child liveness at the boundary, now
|
|
148
|
+
# keyed on the kernel's recorded child pids via the
|
|
149
|
+
# proc-liveness rung); the kernel provides only the
|
|
150
|
+
# non-forgeable child-identity ANCHOR + this transfer op.
|
|
151
|
+
OP_ATTEMPT = "ATTEMPT" # docs/207 §3 — a PICK ATTEMPT was made on a unit, with
|
|
152
|
+
# its outcome when known. The anti-churn cross-run memory
|
|
153
|
+
# the bare loop lacked: `cooldown.cooldown_verdict` folds
|
|
154
|
+
# these to answer "have I already tried this unit and it
|
|
155
|
+
# didn't move?" Like REFUSE/HALT/ENFORCE it grants/removes
|
|
156
|
+
# NO lease, so replay ignores it for state — it is a
|
|
157
|
+
# forensic event the cooldown fold reads via `read_all`,
|
|
158
|
+
# never `replay`. Carries a `durable_schema` tag (the FIRST
|
|
159
|
+
# lane-journal record to do so — older readers see UNTAGGED and a
|
|
160
|
+
# tolerant fold accepts it; the tag future-proofs the fold).
|
|
161
|
+
OP_SPAWN = "SPAWN" # docs/reports/2026-06-09 (the dos-top visibility gap) —
|
|
162
|
+
# an INTENT-TO-TAKE-A-LANE recorded the instant a launcher
|
|
163
|
+
# commits to a lane, BEFORE preflight and before the durable
|
|
164
|
+
# ACQUIRE lands. It closes the SPAWN→ACQUIRE blind window:
|
|
165
|
+
# `dos top` reads only the WAL, so a loop that has decided
|
|
166
|
+
# its lane but not yet acquired is invisible (a *successful*
|
|
167
|
+
# `arbitrate` PERSISTS nothing — purity boundary). Like
|
|
168
|
+
# REFUSE/HALT/ENFORCE/ATTEMPT it grants/removes NO lease, so
|
|
169
|
+
# it is NOT in `_STATE_MUTATING_OPS` and `replay` ignores it
|
|
170
|
+
# for state — a not-yet-real run can therefore NEVER
|
|
171
|
+
# double-book a region (the docs/281 phantom-lease failure
|
|
172
|
+
# mode is structurally impossible here: an intention is not a
|
|
173
|
+
# hold). It is the durable, cross-process home for the
|
|
174
|
+
# supervisor's in-memory `pending` field (`supervise.py:106`):
|
|
175
|
+
# `dispatch_top` folds the RECENT SPAWNs for a lane with no
|
|
176
|
+
# live lease into a `SPAWNING` chip with a short TTL, so a
|
|
177
|
+
# launch that dies in preflight ages out on its own (the same
|
|
178
|
+
# self-heal `_expire_dead` gives a crashed holder). The
|
|
179
|
+
# eventual ACQUIRE supersedes the SPAWN (a held lease wins the
|
|
180
|
+
# chip); a RELEASE with no intervening ACQUIRE is a
|
|
181
|
+
# launch-aborted record.
|
|
182
|
+
OP_CHECKPOINT = "CHECKPOINT" # LJ compaction (docs/82) — a SNAPSHOT of the live
|
|
183
|
+
# set written at the head of a compacted journal.
|
|
184
|
+
# NOT a state-mutating op in the incremental sense:
|
|
185
|
+
# `replay` handles it specially — it RESETS the
|
|
186
|
+
# reconstructed live set to the checkpoint's payload,
|
|
187
|
+
# then folds the tail of fresh entries that follow it.
|
|
188
|
+
# This is what lets `compact` discard the long history
|
|
189
|
+
# of dead leases without losing a still-live one: the
|
|
190
|
+
# surviving leases ride forward in the snapshot, not as
|
|
191
|
+
# their (now-deleted) original ACQUIRE lines.
|
|
192
|
+
|
|
193
|
+
# Ops that change the reconstructed lease set. REFUSE is a decision worth
|
|
194
|
+
# logging (someone wanted a lane and couldn't have it) but it grants nothing,
|
|
195
|
+
# so replay ignores it for state reconstruction. HALT is likewise a recorded
|
|
196
|
+
# DECISION (docs/99): "stop this run that is not done" — but it is the kernel's
|
|
197
|
+
# *intent*, decoupled from the *fact* of the lease ending (the kernel cannot
|
|
198
|
+
# know the host's stop signal landed), so like REFUSE it grants/removes nothing
|
|
199
|
+
# in replay; a later RELEASE/SCAVENGE the driver appends is what actually evicts.
|
|
200
|
+
# SPAWN is the symmetrical INTENT on the acquire side: "a run is coming to this
|
|
201
|
+
# lane" — also decoupled from the *fact* of the hold, which only the eventual
|
|
202
|
+
# ACQUIRE records, so it too grants nothing in replay (an intention that never
|
|
203
|
+
# acquires can never strand a phantom hold). This is what lets an auditor tell a
|
|
204
|
+
# *kill* (HALT→SCAVENGE) from a *natural death* (RELEASE), and a *coming* run
|
|
205
|
+
# (SPAWN→ACQUIRE) from a *held* one — the forensic point of the closed op
|
|
206
|
+
# vocabulary.
|
|
207
|
+
_STATE_MUTATING_OPS = frozenset(
|
|
208
|
+
{OP_ACQUIRE, OP_RELEASE, OP_HEARTBEAT, OP_SCAVENGE, OP_RECONCILE, OP_ADOPT}
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def journal_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``.

    The stamp is second-resolution and always carries the literal ``Z``
    suffix. It is meant to keep the log readable within a minute; ordering
    of entries comes from append order, not from this string.
    """
    now_utc = dt.datetime.now(tz=dt.timezone.utc)
    return f"{now_utc:%Y-%m-%dT%H:%M:%S}Z"
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _journal_path() -> Path:
    """Resolve the journal path, honouring the environment overrides.

    The environment is consulted on every call (never cached), so an
    override set after import still takes effect. The first non-empty value
    of ``DISPATCH_LANE_JOURNAL_PATH`` then ``JOB_LANE_JOURNAL_PATH`` wins;
    only when neither is set is the active workspace default looked up.
    """
    for env_name in ("DISPATCH_LANE_JOURNAL_PATH", "JOB_LANE_JOURNAL_PATH"):
        override = os.environ.get(env_name)
        if override:
            return Path(override)
    return Path(_default_journal_path())
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def read_all(path: Path | None = None) -> list[dict]:
    """Return every journal entry in append order.

    Args:
        path: Journal file to read; defaults to ``_journal_path()``.

    Returns:
        One dict per parseable JSON-object line, in file order. A missing or
        unreadable file yields ``[]``. An unparseable line that is the very
        last line of the file is treated as a torn write and dropped; an
        unparseable line anywhere else is kept as
        ``{"op": "_CORRUPT", "_raw": <text>, "_line": <0-based index>}`` so
        the integrity breach stays visible. Lines that parse to something
        other than a JSON object are skipped.
    """
    p = path or _journal_path()
    try:
        raw = p.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Covers a missing file (FileNotFoundError) as well as an unreadable one.
        return []

    # Records are separated by "\n" only. `str.splitlines()` would also break
    # on U+0085 / U+2028 / U+2029, which the writer emits verbatim inside
    # string values (`ensure_ascii=False`), tearing one valid record into
    # several "corrupt" fragments. Text-mode reading has already normalised
    # "\r\n" and "\r" to "\n".
    lines = raw.split("\n")
    if lines and lines[-1] == "":
        lines.pop()  # the terminator of the final record, not an extra line

    last_index = len(lines) - 1
    out: list[dict] = []
    for i, line in enumerate(lines):
        s = line.strip()
        if not s:
            continue
        try:
            obj = json.loads(s)
        except json.JSONDecodeError:
            # Only a torn FINAL line is tolerated (crash mid-append); it is
            # read as "did not happen". Anything earlier is surfaced.
            if i == last_index:
                break
            out.append({"op": "_CORRUPT", "_raw": s, "_line": i})
            continue
        if isinstance(obj, dict):
            out.append(obj)
    return out
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def tail(n: int = 20, path: Path | None = None) -> list[dict]:
    """Return the last ``n`` journal entries.

    The whole journal is loaded through ``read_all`` and then sliced, so the
    cost grows with the file. A non-positive ``n`` returns every entry.

    Args:
        n: How many trailing entries to keep (default 20).
        path: Journal file to read; passed straight to ``read_all``.
    """
    everything = read_all(path)
    if n > 0:
        return everything[-n:]
    return everything
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _coerce_seq(value: Any) -> int:
    """Best-effort integer reading of a journal sequence field; junk reads as 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: `json.loads` accepts `Infinity` (and literals such as
        # 1e999 that overflow to it); `int(inf)` raises OverflowError, which
        # would otherwise abort every subsequent append.
        return 0


def next_seq(path: Path | None = None) -> int:
    """Return the sequence number for the next entry (1-based).

    The result is one more than the largest value found in any entry's
    ``seq`` or ``seq_watermark`` field. The watermark is included so that a
    checkpoint which replaced older lines still bounds the counter and a
    previously issued number is not handed out again. Fields that are
    missing, non-numeric or non-finite count as 0.

    The whole journal is read through ``read_all``. The caller is expected
    to hold whatever lock serialises journal writes, otherwise two writers
    can obtain the same number.

    Args:
        path: Journal file to scan; passed straight to ``read_all``.
    """
    high_water = 0
    for entry in read_all(path):
        high_water = max(
            high_water,
            _coerce_seq(entry.get("seq")),
            _coerce_seq(entry.get("seq_watermark")),
        )
    return high_water + 1
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _ends_without_newline(p: Path) -> bool:
    """Report whether ``p`` is non-empty and its last byte is not a newline."""
    try:
        with open(p, "rb") as fh:
            fh.seek(0, os.SEEK_END)
            if fh.tell() == 0:
                return False
            fh.seek(-1, os.SEEK_END)
            return fh.read(1) != b"\n"
    except OSError:
        # Missing (about to be created) or unreadable: nothing to repair.
        return False


def append(entry: dict, path: Path | None = None) -> dict:
    """Append one entry to the journal and ``fsync`` it to disk.

    A copy of ``entry`` is stamped with ``seq`` (from ``next_seq``) and
    ``ts`` (from ``journal_now_iso``) when those keys are absent, written as
    a single sorted-key JSON line ending in a newline, and flushed with
    ``os.fsync`` before the function returns. The input dict is not mutated.

    If the file currently ends in an unterminated fragment (a torn write
    from an earlier crash), the new record is started on a fresh line.
    Without that, the record would be glued onto the fragment, the combined
    line would not parse, and the new decision would be lost with it.

    This function takes no lock. The caller must hold the lock that
    serialises the surrounding decision, so journal order matches decision
    order.

    Args:
        entry: The decision payload to record.
        path: Journal file to append to; defaults to ``_journal_path()``.

    Returns:
        The stamped copy that was written.
    """
    p = path or _journal_path()
    e = dict(entry)
    # Stamp only what is missing: `next_seq` scans the whole journal, so it
    # is skipped when the caller already supplied a sequence number.
    if "seq" not in e:
        e["seq"] = next_seq(p)
    if "ts" not in e:
        e["ts"] = journal_now_iso()
    line = json.dumps(e, sort_keys=True, default=str, ensure_ascii=False) + "\n"
    data = line.encode("utf-8")
    p.parent.mkdir(parents=True, exist_ok=True)
    if _ends_without_newline(p):
        # Isolate the torn fragment on its own line so this record survives.
        data = b"\n" + data
    # O_APPEND positions every write at end-of-file at the OS level; the
    # caller's lock remains the primary serialisation.
    fd = os.open(str(p), os.O_WRONLY | os.O_APPEND | os.O_CREAT, 0o644)
    try:
        os.write(fd, data)
        os.fsync(fd)
    finally:
        os.close(fd)
    return e
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _lease_identity(rec: dict) -> tuple[str, str]:
    """Return the ``(loop_ts, lane)`` pair that identifies a lease.

    Both parts are read from ``rec`` and stringified; a missing or falsy
    value becomes the empty string. The lane is part of the key because two
    loops working on different lanes may carry the same ``loop_ts``.
    """
    loop_ts, lane = (str(rec.get(field) or "") for field in ("loop_ts", "lane"))
    return (loop_ts, lane)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def replay(entries: Iterable[dict]) -> list[dict]:
    """Fold a sequence of journal entries into the live-lease list.

    Pure: entries in, lease dicts out, no disk access. Entries are folded in
    the order given; ``seq`` is not consulted. Rules, keyed on the entry's
    ``(loop_ts, lane)`` identity:

    * CHECKPOINT          -> discard everything folded so far and restart
      from the dict rows of its ``leases`` list, in payload order.
    * ACQUIRE / RECONCILE -> add the lease, or replace it in place; the
      payload is the nested ``lease`` dict, or else the known lease keys
      found inline on the entry.
    * RELEASE / SCAVENGE  -> remove the lease.
    * HEARTBEAT           -> set ``heartbeat_at`` on a lease that is live
      (from ``heartbeat_at``, falling back to ``ts``); otherwise a no-op.
    * ADOPT               -> on a lease that is live, overwrite
      ``holder`` / ``pid`` / ``host_id`` with the entry's non-None values
      and refresh ``heartbeat_at``; never creates a lease.
    * anything else (REFUSE, HALT, ENFORCE, ATTEMPT, SPAWN, ``_CORRUPT``,
      unknown ops) -> ignored for state.

    Entries and checkpoint rows whose identity is entirely empty are
    skipped.

    Returns:
        The live leases in first-acquired order. A lease that is released
        and later re-acquired moves to the end.
    """
    # A dict keeps insertion order, and re-assigning an existing key keeps
    # its slot, so this one mapping carries both the live set and the
    # first-acquired ordering.
    held: dict[tuple[str, str], dict] = {}

    inline_lease_fields = (
        "lane", "lane_kind", "tree", "loop_ts", "host_id",
        "pid", "acquired_at", "heartbeat_at", "ttl_minutes",
        "holder", "run_id",
    )

    for entry in entries:
        op = str(entry.get("op") or "")

        # Checkpoints are handled before the state-mutating gate: they are a
        # re-base of the whole fold, not an incremental change.
        if op == OP_CHECKPOINT:
            held.clear()
            snapshot = entry.get("leases")
            if isinstance(snapshot, list):
                for row in snapshot:
                    if not isinstance(row, dict):
                        continue
                    ident = _lease_identity(row)
                    if ident[0] or ident[1]:
                        held[ident] = dict(row)
            continue

        if op not in _STATE_MUTATING_OPS:
            # REFUSE, HALT, ENFORCE, ATTEMPT, SPAWN, _CORRUPT, unknown —
            # recorded for forensics, never state.
            continue

        ident = _lease_identity(entry)
        if not (ident[0] or ident[1]):
            continue

        if op in (OP_ACQUIRE, OP_RECONCILE):
            body = entry.get("lease")
            if not isinstance(body, dict):
                # No nested payload: rebuild it from the lease keys present
                # inline on the entry.
                body = {f: entry[f] for f in inline_lease_fields if f in entry}
            held[ident] = dict(body)
            continue

        if op in (OP_RELEASE, OP_SCAVENGE):
            held.pop(ident, None)
            continue

        if op == OP_HEARTBEAT:
            current = held.get(ident)
            if current is not None:
                beat = entry.get("heartbeat_at") or entry.get("ts")
                if beat:
                    current["heartbeat_at"] = beat
            continue

        if op == OP_ADOPT:
            # Ownership transfer of an EXISTING hold only; an ADOPT against
            # a key that is not live changes nothing.
            current = held.get(ident)
            if current is not None:
                for fld in ("holder", "pid", "host_id"):
                    if entry.get(fld) is not None:
                        current[fld] = entry[fld]
                beat = entry.get("heartbeat_at") or entry.get("ts")
                if beat:
                    current["heartbeat_at"] = beat

    return list(held.values())
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
# --------------------------------------------------------------------------
|
|
455
|
+
# Entry builders — the writer (`lane_lease` / the supervisor driver) uses these
|
|
456
|
+
# so the entry shape is defined HERE (one home), not duplicated at each call
|
|
457
|
+
# site. Pure constructors.
|
|
458
|
+
# --------------------------------------------------------------------------
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def adopt_entry(
    lease: dict,
    *,
    new_holder: str,
    new_pid: Any = None,
    new_host_id: str = "",
    heartbeat_at: str = "",
    reason: str = "",
) -> dict:
    """Build an ADOPT entry handing a live lease over to `new_holder` (C5).

    ADOPT is the non-evicting counterpart of `scavenge_entry`. SCAVENGE drops a
    lease whose holder is gone and whose work is finished; ADOPT re-homes a lease
    whose holder is gone while its recorded children are STILL LIVE, so the lane
    keeps its in-flight grandchildren rather than being reclaimed underneath them
    or left wedged until TTL. On replay only the ownership fields
    (`holder`/`pid`/`host_id`) are rewritten and the heartbeat refreshed; the
    lease's identity, tree, ttl and children are kept.

    The KERNEL never decides to adopt: telling "orphaned but working" from
    "stalled and dead" needs grandchild liveness, which is host boundary I/O
    (the proc-liveness rung). The host collects that evidence, decides, and
    appends this entry; the kernel supplies only the transfer mechanism and the
    child-identity anchor that `acquire_entry` records.

    Args:
        lease: The live lease being adopted; supplies `lane`, `loop_ts`, the
            previous `holder`, and the fallback `host_id`.
        new_holder: Tag of the party taking ownership.
        new_pid: Process id of the new owner, recorded as given.
        new_host_id: Host of the new owner; an empty value keeps the lease's
            current `host_id`.
        heartbeat_at: Liveness stamp for the adopted lease; an empty value
            defaults to `journal_now_iso()` so the lease is not instantly stale
            under its new owner.
        reason: Free-text explanation recorded with the entry.

    Returns:
        The ADOPT journal entry as a plain dict.
    """
    # Empty overrides fall back: the host to the lease's own, the beat to "now".
    adopted_host = new_host_id if new_host_id else lease.get("host_id")
    beat = heartbeat_at if heartbeat_at else journal_now_iso()

    entry: dict = {"op": OP_ADOPT}
    entry["lane"] = lease.get("lane")
    entry["loop_ts"] = lease.get("loop_ts")
    entry["holder"] = new_holder
    entry["pid"] = new_pid
    entry["host_id"] = adopted_host
    # The outgoing owner is kept for forensics.
    entry["prev_holder"] = lease.get("holder")
    entry["heartbeat_at"] = beat
    entry["reason"] = reason
    return entry
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def acquire_entry(
    lease: dict,
    *,
    reason: str = "",
    prev_holder: Any = None,
    env_digest: str = "",
    children: Any = None,
    run_id: Any = None,
) -> dict:
    """Build an ACQUIRE entry from the lease dict the writer just minted.

    The entry lifts the lease's identity fields to the top level and also nests
    a copy of the whole lease under `lease`, so `replay` can reconstruct it
    exactly. Three OPTIONAL, purely ADDITIVE fields may ride along; when absent
    the entry replays unchanged (the lane-journal forward-compat contract), and
    all three are recorded, never adjudicated on (the docs/76
    record-don't-decide line).

    `run_id` (docs/118 Size S / docs/137) is the CID spine id of the run that
    took this lease — the WAL↔spine join key. `refuse_entry` and `halt_entry`
    already carried one; the GRANT side did not, so a *held* lane could not be
    traced back to the run that wanted it (the gap docs/118 measured at `0`
    join-ready ACQUIREs). It is written onto the NESTED lease so replay carries
    it onto the live lease (and an ADOPT preserves it), where readers keyed on
    `run_id` (`decisions`, `trajectory_audit._lease_run_id`, `dos trace`) find it.

    `children` (C5) is the list of child identities the holder spawned —
    `[{"run_id": ..., "pid": ...}, ...]` — the non-forgeable ANCHOR that lets a
    later acquirer distinguish "holder gone, grandchildren still working" from
    "this lease is simply dead". The kernel only RECORDS it; measuring the
    children's liveness is host boundary I/O via the proc-liveness rung. It also
    rides on the nested lease so replay and a later ADOPT keep it.

    `env_digest` is the holder's environment-print digest
    (`env_print.EnvPrint.digest`, docs/115 primitive 1). The ACQUIRE is where a
    lease is BORN, so it is the entry that records *under what* the hold
    happened; only the cheap key rides here, while the full print lands once per
    run-dir in the intent ledger's INTENT record. The `FLEET_ENV_MISMATCH` gate
    that COMPARES a digest to a pin is a later phase and lives in the arbiter.

    Args:
        lease: The freshly minted lease.
        reason: Free-text reason recorded with the entry.
        prev_holder: Previous holder, recorded as given.
        env_digest: Environment digest; added at the top level only when truthy.
        children: Child identities; an explicit argument wins, otherwise a
            `children` value already on `lease` is honored.
        run_id: Spine id; an explicit argument wins, otherwise a `run_id`
            already on `lease` is honored. Stored as a string.

    Returns:
        The ACQUIRE journal entry as a plain dict.
    """
    # Work on a private copy so the caller's lease dict is never mutated.
    nested_lease = dict(lease)

    # Explicit argument first, then whatever the host stamped at mint time.
    spine_id = lease.get("run_id") if run_id is None else run_id
    if spine_id:
        nested_lease["run_id"] = str(spine_id)

    child_ids = lease.get("children") if children is None else children
    if child_ids:
        nested_lease["children"] = list(child_ids)

    entry = {
        "op": OP_ACQUIRE,
        "lane": lease.get("lane"),
        "lane_kind": lease.get("lane_kind"),
        "tree": lease.get("tree"),
        "loop_ts": lease.get("loop_ts"),
        "host_id": lease.get("host_id"),
        "pid": lease.get("pid"),
        "ttl_minutes": lease.get("ttl_minutes"),
        "prev_holder": prev_holder,
        "reason": reason,
        # The full lease is nested so replay reconstructs it exactly.
        "lease": nested_lease,
    }
    if env_digest:
        entry["env_digest"] = env_digest
    return entry
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def release_entry(lease: dict, *, reason: str = "explicit") -> dict:
    """Build a RELEASE entry for a dropped lease.

    Args:
        lease: The lease being released; only `lane`, `loop_ts` and `host_id`
            are read from it.
        reason: Why the lease was dropped (defaults to "explicit").

    Returns:
        The RELEASE journal entry as a plain dict.
    """
    entry: dict = {"op": OP_RELEASE}
    # Only the lease's identity is carried, not its full body.
    for field in ("lane", "loop_ts", "host_id"):
        entry[field] = lease.get(field)
    entry["reason"] = reason
    return entry
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
def heartbeat_entry(lease: dict, *, heartbeat_at: str = "") -> dict:
    """Build a HEARTBEAT entry refreshing a live lease's liveness stamp.

    The HEARTBEAT path is complete end-to-end: this builder, the `replay` fold
    (which sets a live lease's `heartbeat_at` from this entry's `heartbeat_at`
    or, failing that, its `ts`), the `journal_delta._HEARTBEAT_OPS` freshness
    rung, and the effectful writer (`lane_lease.heartbeat`, the verb behind
    `dos lease-lane heartbeat`). That writer is what makes liveness SPINNING
    reachable from real journal evidence; before it nothing emitted an
    OP_HEARTBEAT, so the newest beat was always the boundary ACQUIRE, which aged
    out to STALLED.

    A HEARTBEAT is a *beat*, not a state change. Replay keys it on the
    `(loop_ts, lane)` identity and only updates the freshness of an
    already-live lease (a no-op if that lease is not currently held), so the
    entry carries just the identity plus the stamp rather than the full lease
    body. It is deliberately EXCLUDED from `journal_delta._EVENT_OPS`, so a
    fresh beat proves life without counting as progress (the SPINNING rung).

    Args:
        lease: The live lease being refreshed; supplies `lane`, `loop_ts` and
            `host_id`.
        heartbeat_at: Explicit stamp. When empty the key is omitted and the fold
            falls back to the entry `ts` (filled by `append`);
            `lane_lease.heartbeat` passes the append instant explicitly so the
            fold trusts the writer's own clock.

    Returns:
        The HEARTBEAT journal entry as a plain dict.
    """
    identity = {name: lease.get(name) for name in ("lane", "loop_ts", "host_id")}
    entry = {"op": OP_HEARTBEAT, **identity}
    if not heartbeat_at:
        # No explicit stamp: leave the key out entirely.
        return entry
    entry["heartbeat_at"] = heartbeat_at
    return entry
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def attempt_entry(
    unit_id: str,
    *,
    outcome: str,
    run_id: Any = None,
    lane: str = "",
    loop_ts: str = "",
    host_id: Any = None,
) -> dict:
    """Build an OP_ATTEMPT entry — a recorded PICK ATTEMPT on a unit (docs/207 §3).

    This is the anti-churn cross-run memory the bare loop lacked: once a unit's
    claim TTL lapsed, a loop would re-pick the same drained unit on every
    iteration (measured ~5% of runs shipping). The entry records that a pick was
    ATTEMPTED, with its outcome when known, so `cooldown.cooldown_verdict` can
    fold recent history and answer "have I already tried this unit and it
    didn't move?" — the `RECENTLY_ATTEMPTED` hold that skips a just-drained unit
    instead of re-dispatching it.

    Like REFUSE/HALT/ENFORCE this is a FORENSIC event. OP_ATTEMPT is NOT in
    `_STATE_MUTATING_OPS`, so `replay` ignores it when reconstructing lease
    state (a pick attempt grants or removes no lease); journaling every attempt
    can never lose or invent a live lease, it only adds the history the cooldown
    fold reads via `read_all`. It carries a `durable_schema` tag (the FIRST
    lane-journal record to do so); `append` merges the tag if absent, keeping
    the fold version-forward-compatible.

    Args:
        unit_id: The unit that was picked; stored as a string.
        outcome: Typed token read by the cooldown fold (the closed set lives in
            `dos.cooldown.AttemptOutcome`, e.g. ``"shipped"`` / ``"drained"`` /
            ``"blocked"`` / ``"error"``); recorded verbatim as a string and
            interpreted only by the fold.
        run_id: CID spine id of the attempting run; the key is omitted when
            this is None, otherwise it is stored as a string.
        lane: Lane correlating the attempt to a lease, when known.
        loop_ts: Loop timestamp correlating the attempt to a lease, when known.
        host_id: Host correlating the attempt to a lease, when known.

    Returns:
        The OP_ATTEMPT journal entry as a plain dict.
    """
    # The schema tag goes first so its keys lead the record.
    entry: dict = {**_schema.tag(SCHEMA_FAMILY, LANE_JOURNAL_SCHEMA)}
    entry.update(
        {
            "op": OP_ATTEMPT,
            "unit_id": str(unit_id),
            "outcome": str(outcome),
            "lane": lane,
            "loop_ts": loop_ts,
            "host_id": host_id,
        }
    )
    if run_id is None:
        return entry
    entry["run_id"] = str(run_id)
    return entry
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def spawn_entry(
    *,
    lane: str,
    loop_ts: str = "",
    holder: str = "",
    host_id: Any = None,
    pid: Any = None,
    run_id: Any = None,
    reason: str = "",
) -> dict:
    """Build an OP_SPAWN entry — a recorded INTENT TO TAKE A LANE (the dos-top gap).

    SPAWN is the acquire-side sibling of `halt_entry`: a HALT says "a held run
    is going to stop", a SPAWN says "a run is *coming* to this lane". It is
    recorded the instant a launcher commits to a lane, BEFORE preflight and
    before the durable ACQUIRE, to close the SPAWN→ACQUIRE blind window the
    audit names: `dos top` reads only the WAL and a *successful* `arbitrate`
    persists nothing, so between launch and the first ACQUIRE a loop is
    invisible on the only surface the watchdog reads.

    Like `halt_entry`/`refuse_entry`/`attempt_entry` this is a FORENSIC INTENT,
    not a grant. OP_SPAWN is NOT in `_STATE_MUTATING_OPS`, so `replay` ignores
    it for lease reconstruction — which is the whole safety argument: an intent
    that never acquires can never strand a phantom hold (the docs/281 failure
    mode), and a not-yet-real run can never double-book a region the arbiter
    admits against. The `dispatch_top` SPAWNING chip is a SEPARATE fold over
    recent SPAWNs (TTL-bounded, no-live-lease only), never the admission live
    set.

    Args:
        lane: The region being committed to (the only required field).
        loop_ts: Loop timestamp of the eventual ACQUIRE, when known.
        holder: Holder tag of the eventual ACQUIRE, when known.
        host_id: Host of the eventual ACQUIRE, when known.
        pid: Process id of the eventual ACQUIRE, when known.
        run_id: Spine id of the run; the key is omitted when this is None,
            otherwise it is stored as a string.
        reason: Free text for the operator (e.g. the launch context).

    Returns:
        The OP_SPAWN journal entry as a plain dict. Its identity fields mirror
        the tuple `acquire_entry` stamps, so a reader can join SPAWN→ACQUIRE.
    """
    entry: dict = dict(
        op=OP_SPAWN,
        lane=lane,
        loop_ts=loop_ts,
        holder=holder,
        host_id=host_id,
        pid=pid,
        reason=reason,
    )
    if run_id is None:
        return entry
    entry["run_id"] = str(run_id)
    return entry
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def scavenge_entry(
    lease: dict,
    *,
    reason: str = "scavenged",
    prev_holder: Any = None,
) -> dict:
    """Build a SCAVENGE entry for an evicted (orphaned) lease.

    SCAVENGE is the eviction sibling of `release_entry`. Replay folds
    OP_SCAVENGE exactly like OP_RELEASE (it removes the `(loop_ts, lane)`
    lease), so the entry carries the same eviction key: `loop_ts` + `lane` +
    `host_id` + `reason`. Because a scavenge is an *eviction* rather than a
    voluntary drop, it ALSO records the forensic pair `pid` + `prev_holder`
    (the same two `acquire_entry` stamps), letting an operator see which
    process/holder was reclaimed, and why, without re-joining to the prior
    ACQUIRE. (The supervisor driver writes this when `supervise()` returns a
    REAP for a STALLED lease.)

    Args:
        lease: The lease being evicted; supplies `lane`, `loop_ts`, `host_id`
            and `pid`.
        reason: Why the lease was reclaimed (defaults to "scavenged").
        prev_holder: Holder that was evicted, recorded as given.

    Returns:
        The SCAVENGE journal entry as a plain dict.
    """
    entry: dict = {"op": OP_SCAVENGE}
    # Eviction key plus the evicted process id, all read off the lease.
    for field in ("lane", "loop_ts", "host_id", "pid"):
        entry[field] = lease.get(field)
    entry["prev_holder"] = prev_holder
    entry["reason"] = reason
    return entry
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
def halt_entry(
    handle: str,
    *,
    reason: str = "",
    lane: str = "",
    loop_ts: str = "",
    host_id: Any = None,
    run_id: Any = None,
    command: Any = None,
) -> dict:
    """Build a HALT entry — a recorded STOP DECISION for an in-flight run (docs/99).

    The contract is DOMAIN-FREE. `handle` is an **opaque** identifier the HOST
    supplies for the thing to stop — a pid string, a container id, a
    remote-task token, a harness `Workflow` id. The kernel records it verbatim
    and interprets NOTHING about it (it never learns "a run is a pid on this
    host"; that is domain knowledge a substrate must not carry, docs/99 §3).

    Unlike `scavenge_entry`, a HALT carries no lease payload and removes no
    lease in `replay` (it is NOT in `_STATE_MUTATING_OPS`). It is the kernel's
    *intent* to stop, decoupled from the *fact* of the lease ending, which only
    a later RELEASE/SCAVENGE appended by the driver (once the stop is
    confirmed) records.

    Args:
        handle: Opaque host-supplied identifier of the thing to stop.
        reason: Free-text reason for the stop decision.
        lane: Lane of the targeted lease, when known.
        loop_ts: Loop timestamp of the targeted lease, when known.
        host_id: Host of the targeted lease, when known.
        run_id: Spine id of the run being stopped, recorded as given.
        command: Host-supplied stop command echoed onto the spine for
            forensics; the kernel records the proposed command and never runs
            it.

    Returns:
        The HALT journal entry as a plain dict. `lane`/`loop_ts`/`host_id` are
        carried purely so an operator can correlate the HALT to the lease it
        targeted without re-joining to the ACQUIRE.
    """
    entry: dict = dict(
        op=OP_HALT,
        handle=handle,
        lane=lane,
        loop_ts=loop_ts,
        host_id=host_id,
        run_id=run_id,
        command=command,
        reason=reason,
    )
    return entry
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def refuse_entry(
    decision: Any,
    *,
    owner: str,
    lane: str = "",
    loop_ts: str = "",
    host_id: Any = None,
    run_id: Any = None,
    reason_class: str = "",
) -> dict:
    """Build a REFUSE entry — a recorded DENIED lane request (LJ2 / docs/82).

    REFUSE is the forensic sibling of `acquire_entry`: an ACQUIRE records that
    someone GOT a lane, a REFUSE records that someone WANTED one and could not
    have it. Without it the journal cannot answer the question its own module
    docstring poses — "why was I refused at 14:03?" — because a denied
    `arbitrate` leaves no trace at all. Three readers already CONSUME
    `OP_REFUSE` (the decisions queue, the central-index home, the trajectory
    audit); this builder is the missing PRODUCER.

    Crucially, `OP_REFUSE` is NOT in `_STATE_MUTATING_OPS`, so `replay` ignores
    it for state reconstruction (a denied request grants nothing). Journaling
    every refusal can never lose or invent a live lease — it only adds history.

    Args:
        decision: Duck-typed off the pure `arbiter.LaneDecision` (or any object
            exposing `.reason` / `.lane`). Only those two attributes are read,
            which keeps this a pure stdlib leaf with no kernel import of the
            arbiter.
        owner: Requester tag, recorded as `holder` (mirroring how
            `acquire_entry` threads the lease holder).
        lane: Explicit lane; when empty the decision's `.lane` is used.
        loop_ts: Loop timestamp of the request, when known.
        host_id: Host of the requester, when known.
        run_id: Spine id of the requesting run, recorded as given.
        reason_class: *Typed* refusal token for a future arbiter surface that
            carries one (`AdmissionVerdict.reason_class`); defaults to `""`,
            which the readers degrade gracefully.

    Returns:
        The REFUSE journal entry as a plain dict.
    """
    # An explicit lane wins; otherwise take the decision's, normalising a
    # missing or falsy value to "".
    refused_lane = lane
    if not refused_lane:
        refused_lane = getattr(decision, "lane", "") or ""

    denial = getattr(decision, "reason", "")
    if not denial:
        denial = ""

    entry: dict = dict(
        op=OP_REFUSE,
        lane=refused_lane,
        loop_ts=loop_ts,
        host_id=host_id,
        run_id=run_id,
        holder=owner,
        reason=denial,
        reason_class=reason_class,
    )
    return entry
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def enforce_entry(
    proposal: Any,
    *,
    owner: str = "",
    lane: str = "",
    loop_ts: str = "",
    host_id: Any = None,
    run_id: Any = None,
    tool: str = "",
) -> dict:
    """Build an OP_ENFORCE entry — a recorded ENFORCEMENT OUTCOME (docs/189 §C4).

    The forensic sibling of `refuse_entry`, for the actuation seam
    (`dos.enforce`): a REFUSE records that a lane request was denied; an ENFORCE
    records that a handler PROPOSED an effect on an intervention decision —
    observe / warn / block (with a synthetic substitute) / defer. Without it, a
    handler that withholds a tool call leaves no trace on the spine, so an
    auditor (or a `resume`) cannot answer "which call was blocked at 14:03, by
    which handler, and what was substituted?" — the ARIES-recovery gap docs/189
    names.

    `proposal` is duck-typed off `dos.enforce.EffectProposal`, so this stays a
    pure stdlib leaf with no kernel import of the enforce module — the same
    discipline `refuse_entry` uses to read a `LaneDecision` without importing
    the arbiter. The body is resolved in this order:

    1. `proposal.to_dict()` when the object exposes it;
    2. `dict(proposal or {})` for a mapping (or a falsy value such as `None`);
    3. a shallow snapshot of the object's instance attributes when it is
       neither — the documented "bare attributes" fallback. Previously such an
       object raised `TypeError` from `dict(...)` before the attribute
       fallbacks below could be reached.

    The proposal body is stored under `proposal`; the chosen rung is lifted to a
    top-level `intervention` for cheap filtering, the typed `reason_class` is
    lifted to the top level (the SAME closed-vocab token `refuse_entry` writes —
    the decisions queue and the cause-resolution fold read it there, never the
    nested body), and `dispatch_call` / `withheld` make "did the real call
    fire?" answerable without re-reading the body.

    Crucially, `OP_ENFORCE` is NOT in `_STATE_MUTATING_OPS`, so `replay` ignores
    it for state reconstruction (an enforcement proposal grants/removes no
    lease): journaling every enforcement outcome can never lose or invent a live
    lease — it only adds history.

    Args:
        proposal: The effect proposal (object with `to_dict()`, mapping,
            attribute-only object, or `None`).
        owner: Requester/actor tag, recorded as `holder` (mirroring
            `acquire_entry`/`refuse_entry`).
        lane: Lane the decision relates to, when known.
        loop_ts: Loop timestamp the decision relates to, when known.
        host_id: Host the decision relates to, when known.
        run_id: Spine id of the run, recorded as given.
        tool: Host-supplied name of the tool call the decision was about
            (opaque to the kernel, echoed for correlation).

    Returns:
        The OP_ENFORCE journal entry as a plain dict. `dispatch_call` and
        `withheld` are `None` when the proposal does not say whether the real
        call fires.
    """
    if hasattr(proposal, "to_dict"):
        body = proposal.to_dict()
    else:
        try:
            body = dict(proposal or {})
        except (TypeError, ValueError):
            # Attribute-only proposal (not a mapping, no `to_dict`): snapshot
            # its instance attributes instead of crashing. Values are stored
            # as-is; objects without a `__dict__` yield an empty body and rely
            # on the attribute lookups below for the lifted fields.
            body = dict(getattr(proposal, "__dict__", None) or {})
    # Lift the rung + dispatch flag to the top level for cheap forensic filtering,
    # tolerating either an EffectProposal (`.intervention` is an enum) or a raw dict.
    rung = body.get("intervention", getattr(proposal, "intervention", ""))
    rung = getattr(rung, "value", rung) or ""
    dispatch = body.get("dispatch_call")
    if dispatch is None:
        dispatch = getattr(proposal, "dispatch_call", None)
    # The TYPED refusal token is lifted to the top level for the SAME reason
    # `refuse_entry` lifts it: the decisions queue and the cause-resolution fold
    # (`decisions._refusal_kind`, `picker_oracle.resolve_cause`) read the
    # top-level `reason_class`, NOT the nested `proposal` body. Without this lift
    # an ENFORCE-recorded refusal is LESS forensically recoverable than a
    # REFUSE-recorded one — the closed-vocab token the refusal-recovery story
    # turns on would be buried where no reader looks, so a SELF_MODIFY block
    # would read as an UNCLASSIFIED refusal. An absent token degrades to ""
    # exactly as `refuse_entry`'s does.
    reason_class = (
        body.get("reason_class", getattr(proposal, "reason_class", "")) or ""
    )
    return {
        "op": OP_ENFORCE,
        "lane": lane,
        "loop_ts": loop_ts,
        "host_id": host_id,
        "run_id": run_id,
        "holder": owner,
        "tool": tool,
        "intervention": str(rung),
        "dispatch_call": bool(dispatch) if dispatch is not None else None,
        "withheld": (not dispatch) if dispatch is not None else None,
        "handler": body.get("handler", getattr(proposal, "handler", "")) or "",
        "reason": body.get("reason", getattr(proposal, "reason", "")) or "",
        "reason_class": reason_class,
        "proposal": body,
    }
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def checkpoint_entry(leases: list[dict], *, seq_watermark: int) -> dict:
    """Build an OP_CHECKPOINT snapshot of the authoritative live-lease set.

    A checkpoint is written at the HEAD of a compacted journal (`compact`). It
    carries the full live set folded from the discarded history, so `replay`
    can reconstitute that set without the original ACQUIRE lines. It also
    carries `seq_watermark` — the max `seq` seen in the discarded history — so
    `next_seq` stays monotonic across a rewrite that deleted the lines holding
    the prior high-water mark. Pure constructor.

    Args:
        leases: The live leases to snapshot; each one is shallow-copied.
        seq_watermark: Highest sequence number seen so far; coerced with
            `int()`.

    Returns:
        The OP_CHECKPOINT journal entry as a plain dict.
    """
    # Copy each lease so the snapshot does not alias the caller's dicts.
    snapshot = list(map(dict, leases))
    high_water = int(seq_watermark)
    return dict(op=OP_CHECKPOINT, leases=snapshot, seq_watermark=high_water)
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
def compact(entries: Iterable[dict]) -> list[dict]:
    """Fold a journal down to a single CHECKPOINT (+ preserved corrupt sentinels).

    PURE — entries in, a SHORTER entry list out; no disk, no clock. This is the
    compaction core that the I/O shell (`lane_lease.compact_journal`) writes
    back over the WAL crash-safely. Discarding the long history of dead leases
    is safe for the same reason `replay` is: fold to the authoritative live set,
    then SNAPSHOT it, so a still-live ACQUIRE older than any cutoff survives in
    the checkpoint payload. A naive "delete old lines" would forget a held lane
    and let the kernel false-ADMIT a colliding tree — the catastrophic
    lost-live-lease bug this fold-to-snapshot design forecloses.

    The DIFFERENTIAL-EQUIVALENCE invariant (pinned by a test):

        replay(compact(E)) == replay(E)

    holds because `replay`'s CHECKPOINT branch RESETS its live set to exactly
    the payload written here. The equivalence is for the ARBITER's live set,
    NOT for the liveness fold: a CHECKPOINT carries no `ts` and is in neither
    `journal_delta._EVENT_OPS` nor `_HEARTBEAT_OPS`, so a still-live run's beat
    anchor is dropped by compaction and it reads STALLED to the liveness oracle
    until its next ACQUIRE/HEARTBEAT. That is always the SAFE direction —
    compaction can never fabricate an event or beat, so it can never cause a
    false ADVANCING/SPINNING — but it is why compaction is an operator verb for
    a quiet window, not an automatic per-append rotation.

    `seq_watermark` is derived from the input alone (the max of every `seq` and
    every pre-existing `seq_watermark`), so `next_seq` over the compacted
    journal is `>=` `next_seq` over the original — never a reused seq. Any
    `_CORRUPT` sentinel in the input is PRESERVED in the output, after the
    checkpoint: a mid-file integrity breach is real signal an audit must still
    see, never silently erased by a rewrite.

    Args:
        entries: The journal entries to fold; consumed exactly once.

    Returns:
        A list holding one CHECKPOINT entry followed by copies of any
        `_CORRUPT` sentinels, in their original order.
    """

    def _int_field(record: dict, key: str) -> int:
        # A missing, falsy or non-numeric value counts as 0.
        try:
            return int(record.get(key) or 0)
        except (TypeError, ValueError):
            return 0

    # Materialise once: the input is walked twice and may be a one-shot iterator.
    journal = list(entries)
    live_leases = replay(journal)

    high_water = 0
    sentinels: list[dict] = []
    for record in journal:
        # A pre-existing checkpoint's watermark bounds the next seq as well.
        high_water = max(
            high_water,
            _int_field(record, "seq"),
            _int_field(record, "seq_watermark"),
        )
        if str(record.get("op") or "") == "_CORRUPT":
            sentinels.append(dict(record))

    return [checkpoint_entry(live_leases, seq_watermark=high_water), *sentinels]
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for inspecting the lane journal.

    Subcommands:
        tail [n] [--json]: print the last `n` entries (default 20), either as
            one aligned row per entry or, with `--json`, as raw JSONL.
        replay: print the reconstructed live-lease set as indented JSON.
        seq: print the current max seq (`next_seq() - 1`).

    Args:
        argv: Argument list to parse; `None` lets argparse read `sys.argv`.

    Returns:
        0 on success; 2 if no known subcommand was matched (argparse normally
        rejects that case first, since the subcommand is required).
    """
    import argparse

    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = ap.add_subparsers(dest="cmd", required=True)
    p_tail = sub.add_parser("tail", help="print the last N entries")
    p_tail.add_argument("n", nargs="?", type=int, default=20)
    p_tail.add_argument("--json", action="store_true", help="raw JSONL")
    sub.add_parser("replay", help="print the reconstructed live-lease set")
    sub.add_parser("seq", help="print the current max seq")
    args = ap.parse_args(argv)

    if args.cmd == "tail":
        entries = tail(args.n)
        if args.json:
            for e in entries:
                print(json.dumps(e, sort_keys=True, default=str))
        else:
            if not entries:
                print("(journal empty)")
            for e in entries:
                seq = e.get("seq", "?")
                ts = e.get("ts", "?")
                op = e.get("op", "?")
                lane = e.get("lane", "")
                extra = e.get("reason") or ""
                loop = e.get("loop_ts") or ""
                # Every padded column goes through str() first: a journal line
                # may carry an explicit null (or another non-string) for any
                # field, and applying a width spec to None raises TypeError.
                print(f"#{str(seq):<5} {ts} {str(op):9} {str(lane):14} "
                      f"{str(loop):16} {extra}")
        return 0

    if args.cmd == "replay":
        leases = replay(read_all())
        print(json.dumps(leases, indent=2, sort_keys=True, default=str))
        return 0

    if args.cmd == "seq":
        print(next_seq() - 1)
        return 0

    return 2


# Script entry point: run the CLI and exit with its return code.
if __name__ == "__main__":
    raise SystemExit(main())
|