dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
@@ -0,0 +1,274 @@
1
+ """dos.drivers.state_diff — the canonical-state-diff read-back witness (docs/181).
2
+
3
+ The second concept lifted from the 2026 result-state-witness SOTA (docs/180):
4
+ **Agent-Diff** (arXiv 2602.11224) defines task success as "whether the expected
5
+ change in environment STATE was achieved", computed as a canonical state diff
6
+ (inserted / deleted / updated entities) over containerized replicas — explicitly
7
+ rejecting "fuzzy trace or parameter matching". This driver is DOS's domain-free,
8
+ floor-disciplined version: it snapshots a state surface BEFORE and AFTER the agent's
9
+ turn, computes the delta, and witnesses whether a claimed effect-key is PRESENT in
10
+ that delta — feeding `effect_witness.witness_effect` a non-forgeable read-back.
11
+
12
+ Why this is a NON-FORGEABLE witness (the VAGEN "different surface" principle)
13
+ =============================================================================
14
+
15
+ The third SOTA concept (VAGEN, arXiv 2602.00575) is that the verifier must take its
16
+ OWN read actions — read the world from a surface the agent did not author. This
17
+ driver reads the **state surface itself** (a snapshot of the live store), NOT the
18
+ agent's narration of what it wrote and NOT the agent's own tool-call response bytes.
19
+ The snapshot is authored by the store, not the agent: the agent cannot forge "row 42
20
+ now exists in the orders table" without the row actually existing, because the
21
+ snapshot is a fresh read of the store. So this source is tagged `OS_RECORDED` when
22
+ the snapshot comes from a local store the kernel reads directly (a sqlite file, a
23
+ JSON state dump the OS wrote), and a host that reads a remote third-party store
24
+ (a SaaS API, a cloud DB) would register a `THIRD_PARTY`-tagged sibling.
25
+
26
+ The honest boundary: a snapshot the AGENT produced is forgeable
27
+ ===============================================================
28
+
29
+ If the "before"/"after" snapshots are bytes the agent itself emitted (it ran a query
30
+ and pasted the result), then actor==witness and the read-back is `AGENT_AUTHORED` —
31
+ the forgeable floor, structurally incapable of CONFIRMED under
32
+ `effect_witness`/`believe_under_floor`. This driver is only a sound witness when the
33
+ KERNEL reads the snapshot (it opens the sqlite file / re-fetches the dump), which is
34
+ why the snapshot reader is a parameter (`read_state`) the HOST supplies as an
35
+ independent reader, and why the default file reader is the safe, kernel-authored path.
36
+
37
+ The delta grammar (canonical, domain-free)
38
+ ==========================================
39
+
40
+ A "state" is a mapping of `entity_key -> entity_value` (rows by id, files by path,
41
+ records by key). The canonical diff over two snapshots is:
42
+
43
+ * inserted = keys in AFTER not in BEFORE
44
+ * deleted = keys in BEFORE not in AFTER
45
+ * updated = keys in both whose value differs
46
+
47
+ A claimed effect-key is PRESENT iff it appears in inserted ∪ updated (the agent
48
+ claimed it *made* a change to that entity). ABSENT iff it does not. This is the
49
+ domain-free "claim ⊆ witnessed-delta" presence check `effect_witness` wants — not a
50
+ gold-state correctness check (which a live runtime cannot have; docs/181 §"why
51
+ presence not correctness").
52
+
53
+ Shape & layering
54
+ ================
55
+
56
+ A driver — it has the I/O surface the kernel forbids (reading a state store). It
57
+ implements the `evidence.EvidenceSource` Protocol so it drops straight into
58
+ `gather_evidence` and the belief fold, and a thin `witness_effect_via_state_diff`
59
+ convenience that snapshots → diffs → joins the claim. It imports the kernel; the
60
+ kernel never imports it (the `drivers/__init__` rule). Advisory: it reports a
61
+ read-back; it never mutates state or refuses a lease.
62
+ """
63
+
64
+ from __future__ import annotations
65
+
66
+ import argparse
67
+ import json
68
+ from dataclasses import dataclass
69
+ from typing import Mapping
70
+
71
+ # Imports the kernel — never the other way round (the driver rule).
72
+ from dos.evidence import Accountability, EvidenceFacts
73
+ from dos.effect_witness import EffectClaim, EffectWitnessVerdict, witness_effect
74
+
75
+
76
+ # A state snapshot: entity-key -> an opaque, comparable value (str/number/JSON-able).
77
+ State = Mapping[str, object]
78
+
79
+
80
@dataclass(frozen=True)
class StateDelta:
    """Immutable result of diffing two snapshots, held as three sets of entity keys."""

    inserted: frozenset[str]  # keys found only in the AFTER snapshot
    deleted: frozenset[str]   # keys found only in the BEFORE snapshot
    updated: frozenset[str]   # keys found in both snapshots with unequal values

    @property
    def changed(self) -> frozenset[str]:
        """Return inserted ∪ updated — the keys counted as 'present' for a claim.

        Deleted keys are deliberately left out: a delete is reported on the delta
        but is not a 'made this entity' claim (a host that cares about
        delete-claims inspects `deleted` itself).
        """
        made = self.inserted | self.updated
        return made

    def to_dict(self) -> dict:
        """Return a JSON-friendly dict mapping each bucket name to its sorted keys."""
        return {
            bucket: sorted(getattr(self, bucket))
            for bucket in ("inserted", "deleted", "updated")
        }
101
+
102
+
103
def diff_state(before: State, after: State) -> StateDelta:
    """Compute the canonical diff between two snapshots. Pure: performs no I/O.

    A key only in `after` is inserted, a key only in `before` is deleted, and a
    key in both whose values compare unequal (`!=`) is updated. Because values
    are compared by plain equality, a host with unstable values (timestamps,
    auto-ids) should normalize them in its reader before snapshotting so the
    diff reflects semantic change rather than churn.
    """
    old_keys, new_keys = set(before.keys()), set(after.keys())

    modified = set()
    for key in new_keys & old_keys:
        if before[key] != after[key]:
            modified.add(key)

    return StateDelta(
        inserted=frozenset(new_keys - old_keys),
        deleted=frozenset(old_keys - new_keys),
        updated=frozenset(modified),
    )
120
+
121
+
122
class StateDiffEvidenceSource:
    """Evidence source that checks a claimed effect-key against a precomputed delta.

    The `StateDelta` is computed by the caller (snapshotting and diffing happen at
    the boundary) and handed in together with the `accountability` rung that
    describes who read the snapshots. An agent-authored rung is rejected at
    construction time, because a diff over snapshots the agent itself produced is
    not an independent witness.

    `gather(subject, config)` treats `subject` as the effect-key. A key found in
    `delta.changed` yields an attest; a key not found yields a refute (a computed
    delta is a reached read, so absence is a positive 'not there'). The only
    no-signal answer is for a blank/empty subject, where there is nothing to look
    up. Unreadable snapshots are handled one level up: no source is built at all.
    """

    name = "state_diff"

    def __init__(
        self,
        delta: StateDelta,
        *,
        accountability: Accountability = Accountability.OS_RECORDED,
    ) -> None:
        # Enforce the soundness contract loudly: actor == witness is not a witness.
        # A host that only has agent-authored snapshots must not use this source.
        if accountability.is_agent_authored:
            raise ValueError(
                "state_diff witness requires a non-forgeable snapshot rung "
                "(OS_RECORDED/THIRD_PARTY); an agent-authored snapshot is not a witness"
            )
        self._delta = delta
        self.accountability = accountability

    def gather(self, subject: str, config: object) -> EvidenceFacts:
        """Look `subject` up in the delta and return the matching evidence facts.

        `config` is accepted for interface compatibility and is not read here.
        """
        effect_key = (subject or "").strip()
        delta = self._delta

        if not effect_key:
            return EvidenceFacts.no_signal(
                self.name, self.accountability, subject,
                detail="no effect-key given — nothing to look for in the delta",
            )

        if effect_key not in delta.changed:
            absent_detail = (
                f"effect-key {effect_key!r} is NOT in the state delta "
                f"(inserted={len(delta.inserted)} updated={len(delta.updated)}) "
                f"— the claimed change is absent from the world"
            )
            return EvidenceFacts.refute(
                self.name, self.accountability, effect_key,
                detail=absent_detail,
            )

        # Present: say which bucket of the delta carried the key.
        bucket = "inserted" if effect_key in delta.inserted else "updated"
        return EvidenceFacts.attest(
            self.name, self.accountability, effect_key,
            detail=f"effect-key {effect_key!r} is in the state delta ({bucket})",
        )
171
+
172
+
173
def witness_effect_via_state_diff(
    claim: EffectClaim,
    before: State,
    after: State,
    *,
    accountability: Accountability = Accountability.OS_RECORDED,
) -> EffectWitnessVerdict:
    """One-call convenience for a host holding two snapshots: diff, witness, join.

    Diffs `before` against `after`, wraps the delta in a `StateDiffEvidenceSource`
    tagged with `accountability`, gathers facts for `claim.probe_subject()`, and
    passes that single read-back to `effect_witness.witness_effect`.

    The snapshots must have been read by the kernel/host (a non-forgeable reader),
    not pasted by the agent; that is the contract the `accountability` rung
    expresses. An agent-authored rung makes the source constructor raise
    ValueError.
    """
    witness = StateDiffEvidenceSource(
        diff_state(before, after),
        accountability=accountability,
    )
    readback = witness.gather(claim.probe_subject(), None)
    return witness_effect(claim, [readback])
191
+
192
+
193
+ # ---------------------------------------------------------------------------
194
+ # A safe, kernel-authored snapshot reader: a JSON state-dump file.
195
+ # `read_state_json(path)` reads a {key: value} JSON object the STORE wrote. Because
196
+ # the kernel opens the file (the agent did not hand us the bytes), the resulting
197
+ # snapshot is OS_RECORDED. A host with a sqlite store / a SaaS API writes its own
198
+ # reader and tags the rung accordingly.
199
+ # ---------------------------------------------------------------------------
200
+
201
+
202
def read_state_json(path: str) -> State:
    """Load a `{entity_key: value}` JSON object from `path` as a state snapshot.

    Nothing is swallowed here: an unreadable file, invalid JSON, or a top-level
    value that is not an object all raise. The caller owns the fail-safe — a
    missing snapshot should become UNWITNESSED, never a fabricated empty delta
    that would falsely REFUTE every claim.

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: the content is not valid JSON (`json.JSONDecodeError` is a
            ValueError) or the top-level JSON value is not an object.
    """
    with open(path, "r", encoding="utf-8") as handle:
        snapshot = json.load(handle)
    if isinstance(snapshot, dict):
        return snapshot
    raise ValueError(f"state snapshot at {path!r} is a {type(snapshot).__name__}, not an object")
211
+
212
+
213
+ # ---------------------------------------------------------------------------
214
+ # CLI — `python -m dos.drivers.state_diff KEY --before B.json --after A.json`
215
+ # witnesses whether the claimed effect-key is present in the file-snapshot delta.
216
+ # ---------------------------------------------------------------------------
217
+
218
+
219
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: witness whether an effect-key is in a file-snapshot delta.

    Usage: ``python -m dos.drivers.state_diff KEY --before B.json --after A.json``.

    Args:
        argv: Argument list without the program name. ``None`` lets argparse read
            the process arguments.

    Returns:
        The process exit code: 0 when the verdict is confirmed, 1 when it is
        refuted, and 3 otherwise (including when a snapshot cannot be read).
    """
    # Under `python -OO` the module docstring is stripped and `__doc__` is None,
    # so fall back to no description instead of crashing before parsing starts.
    doc_lines = (__doc__ or "").splitlines()
    ap = argparse.ArgumentParser(
        prog="dos.drivers.state_diff",
        description=doc_lines[0] if doc_lines else None,
    )
    ap.add_argument("effect_key", help="the claimed effect-key to look for in the state delta")
    ap.add_argument("--before", required=True, help="path to the BEFORE state snapshot (JSON object the STORE wrote)")
    ap.add_argument("--after", required=True, help="path to the AFTER state snapshot")
    ap.add_argument("--narrated", default="", help="the agent's original claim phrasing (for the operator surface)")
    ap.add_argument("--third-party", action="store_true",
                    help="tag the snapshot rung THIRD_PARTY (a remote store) instead of OS_RECORDED")
    ap.add_argument("--json", action="store_true", help="machine-readable verdict")
    args = ap.parse_args(argv)

    rung = Accountability.THIRD_PARTY if args.third_party else Accountability.OS_RECORDED
    claim = EffectClaim(key=args.effect_key, narrated=args.narrated)

    # Fail-safe at the boundary: an unreadable snapshot → UNWITNESSED (no claim of
    # absence), never a fabricated empty delta.
    try:
        before = read_state_json(args.before)
        after = read_state_json(args.after)
    except (OSError, ValueError) as e:
        # ValueError already covers json.JSONDecodeError and UnicodeDecodeError.
        # `witness_effect` is the module-level import; no local re-import needed.
        v = witness_effect(claim, [])  # no read-backs → UNWITNESSED
        v_dict = v.to_dict()
        v_dict["reason"] = f"UNWITNESSED — could not read a state snapshot ({e}); cannot tell"
        if args.json:
            print(json.dumps(v_dict, indent=2))
        else:
            print(f"VERDICT UNWITNESSED\nWHY could not read a snapshot: {e}")
        return 3

    # The delta is computed here only for display; the verdict call diffs again.
    delta = diff_state(before, after)
    v = witness_effect_via_state_diff(claim, before, after, accountability=rung)

    if args.json:
        out = v.to_dict()
        out["delta"] = delta.to_dict()
        print(json.dumps(out, indent=2))
    else:
        print(f"EFFECT {args.effect_key}")
        print(f"DELTA +{len(delta.inserted)} ~{len(delta.updated)} -{len(delta.deleted)}")
        print(f"VERDICT {v.verdict.value} (believe={v.believe} refuted={v.refuted})")
        print(f"WITNESS {v.witness or '(none)'} ({v.accountability.value if v.accountability else '-'})")
        print(f"WHY {v.reason}")

    # Exit-code contract: refuted → 1, confirmed → 0, anything else → 3.
    if v.is_refuted:
        return 1
    if v.is_confirmed:
        return 0
    return 3
271
+
272
+
273
+ if __name__ == "__main__":
274
+ raise SystemExit(main())
@@ -0,0 +1,347 @@
1
+ """dos.drivers.supervisor — the long-lived watchdog that ENACTS `supervise()`.
2
+
3
+ The supervisor verdict (`dos.supervise`, docs/99) is a PURE per-tick plan:
4
+ SPAWN these free lanes, REAP these STALLED leases, FLAG these spinners. The
5
+ kernel emits the plan and stops there — `dos loop` prints it, it never launches a
6
+ worker or writes the journal. This driver is the layer that *acts on* the plan:
7
+ each tick it gathers the evidence (reusing the kernel boundary helper
8
+ `cli._supervise_evidence`), calls the pure verdict, then turns the plan into
9
+ effects — `subprocess.Popen` a worker dispatch-loop per SPAWN, append a SCAVENGE
10
+ to the lane journal per REAP.
11
+
12
+ It is a **driver** (layer 4): the one place where subprocess + journal-write +
13
+ policy live. The kernel never imports it (the `import dos.drivers` litmus); it
14
+ `import dos` like any consumer. It is the population-axis analogue of the loop
15
+ *screenplay* a host builds over `liveness` — the kernel ships the verdict, the
16
+ driver puts it on a cadence and gives it hands.
17
+
18
+ ## Why a driver may write the journal (and must serialize)
19
+
20
+ `lane_journal.append` is deliberately lock-free: "journal order must equal
21
+ registry-mutation order and only the caller knows the surrounding critical
22
+ section." Today the kernel ships no in-tree writer; this driver is the first.
23
+ So it brings its own serialization — a single `O_CREAT|O_EXCL` lock file next to
24
+ the journal, held only across the append. The supervisor is single-writer-per-host
25
+ by design, so the lock serializes the supervisor's OWN appends; it does NOT (and
26
+ need not) coordinate with a worker's `lane_journal.append` ACQUIRE, which stays
27
+ lock-free — `seq` is cosmetic for `replay` (it folds by append order and ignores
28
+ `seq`), so an ACQUIRE/SCAVENGE seq-collision is benign. The lock's real job is
29
+ **crash-safety**: a supervisor killed mid-append (SIGKILL / OOM / power-loss on
30
+ this multi-day watchdog) must not wedge every future reap. So, like
31
+ `archive_lock`, it STEALS a lock older than a short TTL, and `run()` clears any
32
+ pre-existing lock once at startup (safe: single-writer-per-host).
33
+
34
+ ## The double-spawn race belt (the driver half of the kernel guard)
35
+
36
+ Between the tick that `Popen`s a worker and the tick where that worker's ACQUIRE
37
+ lands in the journal, the lane reads FREE — so a naive re-tick would launch a
38
+ second worker. The driver keeps a `launched: {lane: launched_at_ms}` set and, on
39
+ the next tick, marks every lane launched within `cooldown_ms` as `pending=True`
40
+ in the evidence. The pure verdict then counts it alive-or-coming and does not
41
+ re-emit a SPAWN for it (the kernel's `pending` guard). The belt bounds the race
42
+ to at most one extra worker per lane per cooldown window — never an unbounded
43
+ stampede. A lane drops out of `launched` once its lease is visible (its ACQUIRE
44
+ journalled), so a worker that came up healthy stops being treated as pending.
45
+
46
+ ## Structure (testable without real I/O)
47
+
48
+ `plan_tick(cfg, *, target, now_ms, launched, cooldown_ms)` is near-pure: it
49
+ derives `pending` from `launched`, gathers evidence, calls `supervise()`, and
50
+ returns the verdict — NO effects. `tick(...)` calls `plan_tick` and then performs
51
+ the effects (Popen + scavenge), returning `(verdict, actions)`. `run(...)` loops
52
+ `tick` + sleep. Tests drive `plan_tick`/`tick` with `subprocess.Popen` and
53
+ `lane_journal.append` monkeypatched, so no real `claude` and no real git run.
54
+ """
55
+
56
+ from __future__ import annotations
57
+
58
+ import os
59
+ import subprocess
60
+ import time
61
+ from dataclasses import dataclass, field
62
+ from pathlib import Path
63
+ from typing import Optional
64
+
65
+ from dos import config as _config
66
+ from dos import lane_journal, run_id, supervise
67
+
68
+ # The worker launch argv the SPAWN plan turns into. Generic + host-free: it shells
69
+ # the `/dos-dispatch-loop` slash-skill, never a host's fat script (the emitted
70
+ # command names no host — the same rule the `dos loop` CLI emission keeps).
71
+ WORKER_PROCESS_ID = "PROC-dos-dispatch-loop"
72
+ DEFAULT_INTERVAL_S = 300.0 # the watchdog wakes rarely — init's reaper cadence
73
+ DEFAULT_COOLDOWN_MS = 120_000 # ~2 min: covers a worker's cold-start + first ACQUIRE
74
+
75
+
76
def _worker_argv(lane: str) -> list[str]:
    """Build the `claude -p` command line that runs one dispatch-loop on `lane`.

    The prompt names only the generic `/dos-dispatch-loop` skill, so the argv is
    host-free.
    """
    prompt = f"/dos-dispatch-loop --lane {lane}"
    return ["claude", "-p", prompt]
79
+
80
+
81
+ # --------------------------------------------------------------------------
82
+ # Journal write-lock — a dedicated O_CREAT|O_EXCL lock file next to the journal,
83
+ # held only across an append. The supervisor is single-writer-per-host by design,
84
+ # so this lock serializes the supervisor's OWN appends (it does NOT, and need not,
85
+ # coordinate with a worker's `lane_journal.append` ACQUIRE — that path is lock-free
86
+ # and `seq` is cosmetic for `replay`, which folds by append order and ignores it).
87
+ # Its real job is crash-safety: it MUST recover from a stale lock a crashed
88
+ # supervisor (SIGKILL / OOM / power-loss) left behind, or every future reap wedges
89
+ # forever. So, like `archive_lock`, it STEALS a lock older than a short TTL — the
90
+ # append is sub-second, so a few seconds is ample — and `run()` clears any
91
+ # pre-existing lock once at startup (safe: single-writer-per-host).
92
+ # --------------------------------------------------------------------------
93
+ _LOCK_TTL_S = 10.0 # an append is sub-second; a lock older than this is a crash orphan
94
+
95
+
96
def _journal_lock_path(cfg) -> Path:
    """Return the write-lock path: the lane-journal path plus `.supervisor.lock`."""
    journal = str(cfg.paths.lane_journal)
    return Path(f"{journal}.supervisor.lock")
98
+
99
+
100
def _lock_age_s(lp: Path) -> "float | None":
    """Return how many seconds ago `lp` was last modified, clamped at zero.

    Returns None when the file cannot be stat'ed (for example, it does not exist).
    """
    try:
        now = time.time()
        modified_at = lp.stat().st_mtime
    except OSError:
        return None
    # A modification time in the future clamps to 0.0 rather than going negative.
    return max(0.0, now - modified_at)
106
+
107
+
108
def _clear_stale_lock(cfg) -> None:
    """Best-effort removal of the journal write-lock file.

    Serves both as startup cleanup and as the 'steal' step for a lock older than
    the TTL. The design assumption is single-writer-per-host: at `run()` startup
    no other legitimate holder exists, so a lock found there is a crash orphan
    from a prior run. Any OSError from the unlink (including the lock simply not
    existing) is ignored.
    """
    lock_file = _journal_lock_path(cfg)
    try:
        lock_file.unlink()
    except OSError:
        return
120
+
121
+
122
def _scavenge_under_lock(cfg, lease: dict, *, reason: str) -> bool:
    """Append a SCAVENGE for `lease` to the lane journal under a write-lock.

    Args:
        cfg: Config object; `cfg.paths.lane_journal` locates the journal and,
            via `_journal_lock_path`, the lock file next to it.
        lease: The lease dict being reaped; its `host_id` (if any) is recorded
            as the previous holder.
        reason: Human-readable reason stored on the SCAVENGE entry.

    Returns:
        True on a clean append. False if a FRESH lock was held (skip this tick,
        the next one retries), if the lock could not be created, or if the
        append failed. A failed reap is never fatal: the lane stays STALLED and
        the next tick re-emits the REAP.

    Crash-safety: a lock older than `_LOCK_TTL_S` is treated as a crash orphan
    (a real append is sub-second), so it is STOLEN — unlinked and re-created —
    rather than deferred forever.
    """
    lp = _journal_lock_path(cfg)
    open_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL

    # Creating the lock's directory is part of the reap; honour the "never
    # fatal" contract by deferring to the next tick if it fails.
    try:
        lp.parent.mkdir(parents=True, exist_ok=True)
    except OSError:
        return False

    try:
        fd = os.open(str(lp), open_flags)
    except FileExistsError:
        # A lock is present. If it is older than the TTL it is a crash orphan —
        # steal it and retry once. A fresh lock means a real concurrent append
        # (only possible if someone ran two supervisors); defer to the next tick.
        age = _lock_age_s(lp)
        if age is None or age <= _LOCK_TTL_S:
            return False
        _clear_stale_lock(cfg)
        try:
            fd = os.open(str(lp), open_flags)
        except OSError:
            return False  # lost the steal race — retry next tick
    except OSError:
        return False

    try:
        try:
            os.write(fd, f"supervisor pid={os.getpid()}\n".encode("utf-8"))
        finally:
            # Always release the descriptor, even if the write failed: a leaked
            # fd accumulates in this long-lived process and, on Windows, an open
            # handle makes the unlink below fail and leaves the lock behind.
            os.close(fd)
        entry = lane_journal.scavenge_entry(lease, reason=reason,
                                            prev_holder=lease.get("host_id"))
        lane_journal.append(entry, path=cfg.paths.lane_journal)
        return True
    except Exception:  # noqa: BLE001 — a failed reap is non-fatal; retry next tick
        return False
    finally:
        # The lock file stays in place across the append and is removed here on
        # every exit path.
        try:
            lp.unlink()
        except OSError:
            pass
167
+
168
+
169
+ # --------------------------------------------------------------------------
170
+ # The tick — plan (near-pure) then enact (effects).
171
+ # --------------------------------------------------------------------------
172
@dataclass
class TickActions:
    """Audit record of what one tick actually did; tests assert on these lists."""

    # Lanes for which a worker process was Popen'd this tick.
    spawned: list[str] = field(default_factory=list)
    # Lanes for which a SCAVENGE entry was appended to the journal.
    reaped: list[str] = field(default_factory=list)
    # Lanes surfaced to the operator (advisory only).
    flagged: list[str] = field(default_factory=list)
    # Lanes whose REAP was deferred because the scavenge did not go through.
    skipped_reaps: list[str] = field(default_factory=list)
    # Lanes for which a *proposed* halt was surfaced (acting-on-spin, docs/90 §5).
    # Strictly advisory, exactly like `flagged`: recording a lane here launches
    # nothing, writes NO OP_RELEASE / OP_SCAVENGE and kills no process. The
    # spinner STILL holds its lease; enacting the halt is the operator's explicit
    # `dos halt`, never the supervisor's (the docs/99 floor).
    proposed_halts: list[str] = field(default_factory=list)
186
+
187
+
188
def _pending_from_launched(launched: dict, *, now_ms: int, cooldown_ms: int) -> frozenset:
    """Return the lanes whose launch is strictly less than `cooldown_ms` old.

    `launched` maps lane -> launch time in ms. The result is the race belt's
    `pending` set; a lane launched exactly `cooldown_ms` ago is no longer pending.
    """
    still_pending = set()
    for lane, launched_at in launched.items():
        if now_ms - launched_at < cooldown_ms:
            still_pending.add(lane)
    return frozenset(still_pending)
193
+
194
+
195
def plan_tick(cfg, *, target, now_ms, launched, cooldown_ms=DEFAULT_COOLDOWN_MS):
    """Plan one supervise tick and return the verdict, enacting nothing.

    This is the testable seam. `launched` is the driver's {lane: launched_at_ms}
    mapping; lanes still inside the cooldown window are passed to the evidence
    gatherer as `pending_lanes`, so the verdict does not re-spawn a worker whose
    ACQUIRE has not been journalled yet. Evidence is gathered through
    `cli._supervise_evidence`, the same boundary code `dos loop` uses.

    The population policy is `cfg.supervise` (the workspace's `dos.toml
    [supervise]` declaration) with only `target` replaced by this run's effective
    target, so the watchdog and the hand-run emitter share one config-sourced
    policy.
    """
    import dataclasses

    from dos import cli  # consumer→consumer import (driver may import the CLI)

    in_flight = _pending_from_launched(launched, now_ms=now_ms, cooldown_ms=cooldown_ms)
    evidence = cli._supervise_evidence(
        cfg,
        target=target,
        now_ms=now_ms,
        pending_lanes=in_flight,
    )
    effective_policy = dataclasses.replace(cfg.supervise, target=target)
    return supervise.supervise(evidence, effective_policy)
219
+
220
+
221
def tick(
    cfg,
    *,
    target,
    now_ms,
    launched,
    root_run=None,
    cooldown_ms=DEFAULT_COOLDOWN_MS,
    popen=subprocess.Popen,
):
    """One supervise tick: plan, then enact (Popen spawns + scavenge reaps).

    Parameters:
        cfg: the substrate config handed to the planner, the lease lookup and
            the scavenger.
        target: effective population target for this run (forwarded to
            `plan_tick`).
        now_ms: the tick's timestamp in ms; recorded as the launch time of
            every lane spawned here.
        launched: the driver's {lane: launched_at_ms} dict. Mutated in place:
            each successful spawn is recorded, a reaped lane is removed, and a
            lane whose lease is already visible is removed so it stops being
            treated as pending.
        root_run: optional supervisor root run-id; when given, each worker gets
            a child run-id whose lineage env is merged into its environment.
        cooldown_ms: pending window forwarded to `plan_tick`.
        popen: injectable launcher so tests record launches without a real
            subprocess.

    Returns:
        `(verdict, TickActions)`.
    """
    verdict = plan_tick(cfg, target=target, now_ms=now_ms, launched=launched,
                        cooldown_ms=cooldown_ms)
    actions = TickActions()

    # Reap first (free the dead lanes' journal state before refilling). Look up the
    # live lease dict to pass the real (loop_ts, lane) identity to scavenge_entry.
    # NOTE: `live` is a snapshot taken BEFORE any reap/spawn of this tick.
    live = _live_leases_by_lane(cfg)
    for plan in verdict.reap:
        lease = live.get(plan.lane) or {"lane": plan.lane}
        if _scavenge_under_lock(cfg, lease, reason="supervisor: STALLED"):
            actions.reaped.append(plan.lane)
            launched.pop(plan.lane, None)  # a reaped lane is no longer in-flight
        else:
            actions.skipped_reaps.append(plan.lane)

    # Spawn the free admissible lanes the plan named. Each worker gets its OWN
    # run-id minted as a CHILD of the supervisor root (process-id WORKER_PROCESS_ID),
    # so the correlation spine records "this dispatch-loop was launched by this
    # supervisor" across the `claude -p` boundary via the CID_* lineage env.
    spawned_now = set()  # lanes launched in THIS tick (see housekeeping below)
    for plan in verdict.spawn:
        env = dict(os.environ)
        if root_run is not None:
            child = run_id.mint(WORKER_PROCESS_ID, parent=root_run)
            env.update(run_id.lineage_env(child))
        try:
            popen(_worker_argv(plan.lane), env=env)
            launched[plan.lane] = now_ms
            actions.spawned.append(plan.lane)
            spawned_now.add(plan.lane)
        except Exception:  # noqa: BLE001 — a failed launch is non-fatal; retry next tick
            pass

    actions.flagged = [p.lane for p in verdict.flag]

    # Acting-on-spin (docs/90 §5): surface the *proposed* halts, advisory-only.
    # CRITICAL: this is a SURFACE, not an actuation — we record the lanes and do
    # NOT Popen, NOT scavenge, NOT release a lease. A proposed halt of a live
    # spinner stays the operator's to enact (`dos halt`); the supervisor never
    # kills a live worker (the docs/99 PDP-not-PEP floor). Note we read the
    # SEPARATE `verdict.proposed_halt` tuple, never `verdict.reap` — so a proposal
    # can never flow into the reap/scavenge path above.
    actions.proposed_halts = [p.lane for p in verdict.proposed_halt]

    # Housekeeping: a lane whose lease is now visible (ACQUIRE journalled) is no
    # longer in-flight — drop it from `launched` so it stops counting as pending.
    # Lanes spawned in this very tick are exempt: `live` predates their launch,
    # so a lease seen there cannot be the new worker's ACQUIRE (it is the old,
    # possibly just-reaped one). Dropping them here would strip the pending
    # race-belt and let the next tick spawn the same lane a second time.
    for lane in list(launched):
        if lane in live and lane not in spawned_now:
            launched.pop(lane, None)

    return verdict, actions
287
+
288
+
289
def _live_leases_by_lane(cfg: _config.SubstrateConfig) -> dict:
    """Replay the lane journal and index the resulting leases by lane (read-only).

    Any exception raised while reading or replaying the journal — a missing
    journal, for example — yields an empty dict instead of propagating. A lease
    with no (or a falsy) "lane" value is keyed under the empty string.
    """
    try:
        leases = lane_journal.replay(
            lane_journal.read_all(path=cfg.paths.lane_journal)
        )
    except Exception:  # noqa: BLE001
        return {}
    by_lane = {}
    for lease in leases:
        by_lane[str(lease.get("lane") or "")] = lease
    return by_lane
297
+
298
+
299
def run(
    config=None,
    *,
    target: Optional[int] = None,
    interval: float = DEFAULT_INTERVAL_S,
    max_ticks: Optional[int] = None,
    cooldown_ms: int = DEFAULT_COOLDOWN_MS,
    clock_ms=None,
    sleep=time.sleep,
    popen=subprocess.Popen,
) -> int:
    """Drive the supervisor watchdog until `max_ticks` ticks or an operator interrupt.

    A root run-id is minted once (`run_id.mint("dos-supervise")`) and handed to
    every tick, so each launched worker carries the supervisor's lineage across
    the `claude -p` boundary (the correlation spine). Every tick gathers, plans
    and enacts; between consecutive ticks the loop sleeps `interval` (long — a
    watchdog, not a busy-poll). There is no sleep after the final tick.
    `clock_ms`/`sleep`/`popen` are injectable for deterministic tests.

    `target` falls back to the workspace's standing `dos.toml [supervise]`
    target (`cfg.supervise.target`) when not given, so a watchdog started with
    no explicit population keeps the declared one; pass an int to override it
    for this process. The two booleans (count_spinning_as_alive / reap_stalled)
    always come from the config policy via `plan_tick`.

    Returns 0 both on reaching `max_ticks` and on KeyboardInterrupt.
    """
    cfg = _config.ensure(config)
    effective_target = cfg.supervise.target if target is None else target

    # Startup crash-recovery: clear any journal write-lock a prior (crashed)
    # supervisor left behind. Safe because the supervisor is single-writer-per-host
    # — at startup there is no other legitimate holder, so a present lock is a
    # crash orphan that would otherwise wedge the first reap.
    _clear_stale_lock(cfg)
    root_run = run_id.mint("dos-supervise")

    def _wall_clock_ms():
        # Default clock: current wall time in whole milliseconds.
        return int(time.time() * 1000)

    read_clock = _wall_clock_ms if clock_ms is None else clock_ms
    in_flight: dict = {}
    completed = 0
    try:
        while max_ticks is None or completed < max_ticks:
            if completed:
                # Pause between ticks only — never before the first one.
                sleep(interval)
            stamp = read_clock()
            tick(cfg, target=effective_target, now_ms=stamp, launched=in_flight,
                 root_run=root_run, cooldown_ms=cooldown_ms, popen=popen)
            completed += 1
    except KeyboardInterrupt:
        # An operator interrupt is a clean stop, same exit code as running out.
        pass
    return 0