dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/coverage.py ADDED
@@ -0,0 +1,387 @@
1
+ r"""coverage — the cheap, NON-GIT fan-out coverage fold for a self-reporting fleet (docs/197 §7(1)).
2
+
3
+ > **An HONEST AGGREGATOR, not a label factory.** It folds N already-adjudicated
4
+ > `result_state` terminal-state verdicts (each minted by `verify-result`, the
5
+ > §7(1) keystone) against the workflow-DECLARED expected count N, into ONE coverage
6
+ > headline + a per-class breakdown. It mints **ZERO** new ground-truth labels —
7
+ > every per-worker DEAD/HEALTHY fact it counts was already decided by
8
+ > `result_state.classify_terminal`; this batches them into one coverage answer the
9
+ > synthesizer can read. That honesty is load-bearing: re-counting the N
10
+ > already-adjudicated verdicts as "N new labels" would be the consistency-not-
11
+ > grounding sin (the docs/179 design law). The data-multiplier in the docs/179 set
12
+ > is `firing_label` (it JOINS a firing to a git outcome to DECIDE a previously-unknown
13
+ > label); `coverage` is the `fleet_roll` sibling — a fold over an already-labeled set.
14
+
15
+ What it IS — the win that is real (and narrow)
16
+ ==============================================
17
+
18
+ The dominant ultracode subagent is a **pure-text research/read worker that produces
19
+ no git commits**, so `completion.classify` (which folds `declared − git-ancestry-
20
+ verified` over an `intent_ledger`) returns INDETERMINATE for it — there is no ledger.
21
+ The only fossil a read-only worker leaves is its transcript's terminal record. So
22
+ `coverage` is the form of "is the fan-out actually done?" that works on the cheap
23
+ rung `result_state` already provides, and it earns its keep in two narrow, defensible
24
+ ways:
25
+
26
+ 1. It makes the denominator **`declared` (a separate, workflow-authored integer)**,
27
+ NOT `len(returns)`. The pervasive laundering bug is `failed = N − survivors.length`
28
+ and `results.filter(Boolean)` (89/114 real scripts): a harness-synthesized death
29
+ returns a non-null error string that survives the filter, so a 4-of-7 fan-out is
30
+ silently banked as 7/7. Because `declared` is independent of the survivor list, a
31
+ short survivor list CANNOT read as FULL here — the laundering is structurally
32
+ impossible.
33
+ 2. It **surfaces a count the prior pipeline discarded** — `unaccounted` (declared
34
+ slots that produced neither a HEALTHY return nor a witnessed death) — and hands
35
+ the whole partition to the synthesizer as legible text, instead of `log()`-ing it
36
+ and throwing it away (today's behavior, the follow-up #1 premise).
37
+
38
+ Both are "better denominator hygiene," not a new per-datum label. Stated honestly so
39
+ the module ships in agreement with docs/179, not in contradiction with it.
40
+
41
+ The fold-mints-data law (docs/179) — applied, and the honest ruling
42
+ ===================================================================
43
+
44
+ The two facts the fold touches: **declared N** (workflow-authored) and the **multiset
45
+ of `result_state` terminal-states** (harness-authored, via the `model=='<synthetic>'`
46
+ gate). They were not compared at the fold before — but the comparison is *arithmetic*
47
+ (`healthy == declared?`), and it decides NO new truth value about any worker: each
48
+ worker's DEAD/HEALTHY was already adjudicated by `result_state`. So this is the
49
+ `fleet_roll` case (fold an already-labeled set → one headline + breakdown, 0 new
50
+ labels), NOT the `firing_label` case (join two facts to DECIDE an unknown label). The
51
+ `unaccounted`/`absent` surfacing is exactly what `fleet_roll.absent` does without
52
+ claiming to mint data. See [[project-dos-fold-mints-data-law]].
53
+
54
+ The byte-author law / advisory floor / reuse notes
55
+ ==================================================
56
+
57
+ The `healthy` count is grounded TRANSITIVELY: it derives from `result_state`'s
58
+ `model=='<synthetic>'` gate, a byte the Claude Code HARNESS — not the worker —
59
+ authored, so a worker cannot forge its slot HEALTHY when the harness killed it
60
+ (the docs/138 grounding-not-consistency invariant). BUT the pure core can only be as
61
+ grounded as the verdicts handed in: if a caller asserts terminal-states directly
62
+ (the CLI `--states` path) instead of letting `coverage_from_transcripts` run
63
+ `result_state.verify_transcript`, the count is **workflow-asserted, not harness-
64
+ grounded**. The CLI stamps that distinction (`grounded: false` vs `true`) so a
65
+ consumer knows whether the denominator was re-grounded; the pure `classify_coverage`
66
+ counts whatever states it is given and never re-grounds (it is pure — no I/O).
67
+
68
+ ADVISORY (PDP, not PEP — the docs/197 §6.5 / docs/99 line): it REPORTS a coverage
69
+ verdict + a synthesizer-legible `prompt_line`; it never re-runs a dead worker
70
+ (re-dispatch of the dead slot's OWN unit is the conductor's act) and never re-prompts
71
+ the synthesizer mid-plan (the −9 pp DEFER derail). It also does NOT judge the
72
+ CORRECTNESS of a HEALTHY return — a 7/7 FULL coverage of seven WRONG answers is still
73
+ FULL; coverage certifies the denominator, never the values. Whether a healthy finding
74
+ is true is `effect_witness` / `believe_under_floor`'s job (the witness-routing rung,
75
+ docs/197 §7(2)).
76
+
77
+ ⚓ Kernel discipline (the litmus): a PURE verdict + a boundary reader. It imports only
78
+ the sibling kernel module `result_state` (+ stdlib) — NOT `resume`/`intent_ledger`/
79
+ `scope_source` (those are `completion`'s git-ledger imports; folding them in would drag
80
+ git concepts into the pure-text path). Names no host, resolves nothing against
81
+ `__file__`, takes no lease. The transcript I/O is the caller's boundary
82
+ (`coverage_from_transcripts`, which delegates to `result_state.verify_transcript`),
83
+ exactly the `liveness.classify` over `git_delta` shape, one rung over. It mirrors
84
+ `completion`'s SHAPE (a `str`-enum verdict + frozen `*Verdict` + `to_dict` +
85
+ `fraction`-style legibility), but shares no body — a new leaf, the third sibling of
86
+ the "is the fan-out done, or only declared done?" family.
87
+ """
88
+
89
+ from __future__ import annotations
90
+
91
+ import enum
92
+ from dataclasses import dataclass
93
+ from typing import Optional, Sequence, Union
94
+
95
+ from dos.result_state import ResultStateVerdict, TerminalClass, TerminalState
96
+
97
+
98
+ # ───────────────────────────── the coverage verdict ───────────────────────────
99
class Coverage(str, enum.Enum):
    """Five mutually exclusive coverage outcomes for one fan-out.

    Members are ``str``-valued, so each serializes as its own name (for a
    ``--json`` token or an exit-code map) without a lookup table.

    * FULL        -- healthy == declared: every declared worker returned a real
                     result; fold all of them.
    * UNDERFILLED -- 0 < healthy < declared: a sub-quorum returned; fold WITH a
                     caveat and keep the gap in the denominator.
    * STARVED     -- healthy == 0 while declared > 0: nothing real came back, so
                     there is nothing to synthesize.
    * OVERFILLED  -- healthy > declared: more healthy returns than were declared
                     (a dispatch/glob bug); surfaced rather than reported as FULL.
    * EMPTY       -- declared == 0: nothing was fanned out. Degenerate, not an
                     error.
    """

    FULL = "FULL"
    UNDERFILLED = "UNDERFILLED"
    STARVED = "STARVED"
    OVERFILLED = "OVERFILLED"
    EMPTY = "EMPTY"

    def __str__(self) -> str:  # pragma: no cover - trivial
        return self.value

    @property
    def foldable(self) -> bool:
        """Whether there is real material to synthesize from.

        Only STARVED is not foldable. OVERFILLED still is: its caveat concerns
        the count mismatch, not an absence of healthy results.
        """
        return not (self is Coverage.STARVED)

    @property
    def should_caveat(self) -> bool:
        """Whether the synthesis prompt MUST carry a coverage caveat.

        True for UNDERFILLED, STARVED and OVERFILLED; False for FULL and EMPTY.
        """
        return self not in (Coverage.FULL, Coverage.EMPTY)
138
+
139
+
140
@dataclass(frozen=True)
class CoveragePolicy:
    """Policy knobs for the coverage verdict, kept apart from the mechanism.

    ``min_quorum`` is a legibility-only threshold. When it is set, the fold
    reports ``quorum_met = healthy/declared >= min_quorum`` alongside the
    verdict. It never alters the verdict itself: FULL remains strict equality,
    and whether a partial fan-out is acceptable is left to the consumer.
    ``None`` (the default) means no quorum flag is computed.
    """

    min_quorum: Optional[float] = None


# Shared default policy: no quorum threshold configured.
DEFAULT_COVERAGE_POLICY = CoveragePolicy()
156
+
157
+
158
@dataclass(frozen=True)
class ReturnState:
    """The witnessed terminal-state of a single declared worker slot.

    This is the smallest datum the coverage fold counts. ``state`` is the
    ``result_state.TerminalState`` that gets tallied; ``agent_id`` is an
    optional label kept purely for legibility (a per-slot breakdown) and
    defaults to the empty string. Nothing else about the return matters to the
    fold; whether a HEALTHY return is *correct* is out of scope here.
    """

    state: TerminalState
    agent_id: str = ""
168
+
169
+
170
@dataclass(frozen=True)
class CoverageVerdict:
    """The single verdict `classify_coverage` returns, with the partition echoed back.

    Fields:
        state: the `Coverage` outcome.
        declared: the workflow-authored denominator, independent of the
            survivor list.
        healthy / dead / unreadable: the partition of the WITNESSED slots.
        reason: a short human-readable explanation of `state`.
        dead_classes: sorted ``(class-name, count)`` pairs describing the
            deaths. Populated only when class detail was available for the
            counted returns; empty otherwise.
        quorum_met: advisory flag, or None when no quorum was evaluated.

    `unaccounted`, `fraction` and `prompt_line` are derived properties;
    `to_dict` is the `--json` shape (including the synthesizer-legible
    `prompt_line`).
    """

    state: Coverage
    declared: int
    healthy: int
    dead: int
    unreadable: int
    reason: str
    dead_classes: tuple[tuple[str, int], ...] = ()
    quorum_met: Optional[bool] = None

    @property
    def unaccounted(self) -> int:
        """Declared slots that produced no witnessed verdict at all.

        Computed as ``declared - (healthy + dead + unreadable)`` and floored at
        0, so an over-fill never shows up as a negative count.
        """
        witnessed = self.healthy + self.dead + self.unreadable
        return max(0, self.declared - witnessed)

    @property
    def fraction(self) -> Optional[float]:
        """``healthy / declared``, or None when nothing was declared.

        A legibility aid only; it never drives the verdict. It can exceed 1.0
        when there are more healthy returns than declared slots. A non-positive
        `declared` (zero OR negative) yields None, so a degenerate negative
        denominator cannot surface as a negative fraction.
        """
        if self.declared <= 0:
            return None
        return self.healthy / self.declared

    @property
    def prompt_line(self) -> str:
        """The deterministic sentence a workflow interpolates VERBATIM into its
        synthesis prompt.

        The gap clause is built from the actual ``(dead, unreadable,
        unaccounted)`` partition: an unreadable slot is reported as "could not
        be read", a missing slot as "did not return a transcript", and only a
        non-zero `dead` count produces the word "died".
        """
        d = self.declared
        if self.state is Coverage.EMPTY:
            return "No workers were fanned out (declared == 0); there is nothing to fold."
        if self.state is Coverage.FULL:
            return (f"All {self.healthy} of {d} fan-out workers returned a real result; "
                    f"this is full coverage.")
        # Build the gap clause from the actual partition, never a hardcoded "died".
        parts = []
        if self.dead:
            cls = self._dead_class_phrase()
            parts.append(f"{self.dead} died on a harness-authored terminal{cls}")
        if self.unreadable:
            parts.append(f"{self.unreadable} could not be read (NOT a witnessed death)")
        if self.unaccounted:
            parts.append(f"{self.unaccounted} did not return a transcript")
        gap = "; ".join(parts) if parts else "the missing slots are unaccounted"
        if self.state is Coverage.STARVED:
            # 0 healthy: the text spells out WHY (deaths vs unreadable vs
            # missing) because the operator's next action differs for each.
            return (f"COVERAGE FAILURE: 0 of {d} fan-out workers returned a real result "
                    f"({gap}). There is no real material to synthesize. Do NOT fabricate "
                    f"findings; report the fan-out as failed and act on the gap above "
                    f"(re-dispatch deaths; fix the read path for unreadable; locate "
                    f"missing transcripts).")
        if self.state is Coverage.OVERFILLED:
            return (f"COVERAGE ANOMALY: {self.healthy} workers returned a real result but "
                    f"only {d} were declared — more results than expected (a re-dispatch "
                    f"double-count or a stale transcript glob). Treat the count as "
                    f"unreliable and reconcile the dispatch before trusting coverage.")
        # UNDERFILLED
        return (f"COVERAGE CAVEAT: only {self.healthy} of {d} fan-out workers returned a "
                f"real result ({gap}). Treat the findings below as a SUB-QUORUM SAMPLE "
                f"({self.healthy}/{d}), not an exhaustive survey; do not state or imply "
                f"full coverage, and flag the gap above.")

    def _dead_class_phrase(self) -> str:
        """Return a short `` (name/name/...)`` phrase built from `dead_classes`.

        Each class name is lower-cased with underscores turned into hyphens.
        Returns '' when `dead_classes` is empty, i.e. no class detail was
        recorded for the deaths.
        """
        if not self.dead_classes:
            return ""
        names = "/".join(c.lower().replace("_", "-") for c, _ in self.dead_classes)
        return f" ({names})"

    def to_dict(self) -> dict:
        """Return the JSON-ready mapping of this verdict.

        Includes the derived `unaccounted`, `fraction` (rounded to 4 places, or
        None), the state's `foldable` / `should_caveat` flags and `prompt_line`.
        """
        # Evaluate the property once rather than once for the test and once
        # for the rounding.
        fraction = self.fraction
        return {
            "state": self.state.value,
            "declared": self.declared,
            "healthy": self.healthy,
            "dead": self.dead,
            "unreadable": self.unreadable,
            "unaccounted": self.unaccounted,
            "fraction": (round(fraction, 4) if fraction is not None else None),
            "foldable": self.state.foldable,
            "should_caveat": self.state.should_caveat,
            "dead_classes": [list(c) for c in self.dead_classes],
            "quorum_met": self.quorum_met,
            "prompt_line": self.prompt_line,
            "reason": self.reason,
        }
275
+
276
+
277
+ # ───────────────────────────── the pure fold ──────────────────────────────────
278
# Any of the three shapes a caller may hand to the fold for one worker slot.
_Return = Union[ReturnState, ResultStateVerdict, TerminalState]


def _as_state(r: _Return) -> tuple[TerminalState, Optional[TerminalClass]]:
    """Normalize one return element to ``(TerminalState, TerminalClass | None)``. PURE.

    * a bare `TerminalState` passes through with no class detail;
    * a `ResultStateVerdict` contributes both its `state` and its `cls`;
    * a `ReturnState` wrapper contributes its `state` with no class detail.

    Raises:
        TypeError: for any other type, so an unexpected element is rejected
            rather than silently miscounted.
    """
    if isinstance(r, TerminalState):
        witnessed, detail = r, None
    elif isinstance(r, ResultStateVerdict):
        witnessed, detail = r.state, r.cls
    elif isinstance(r, ReturnState):
        witnessed, detail = r.state, None
    else:
        raise TypeError(
            f"coverage: a return must be a TerminalState, ResultStateVerdict, or "
            f"ReturnState, not {type(r).__name__}"
        )
    return (witnessed, detail)
297
+
298
+
299
def classify_coverage(
    declared: int,
    returns: Sequence[_Return],
    policy: CoveragePolicy = DEFAULT_COVERAGE_POLICY,
) -> CoverageVerdict:
    """Fold the witnessed terminal-states against the declared count. PURE, no I/O.

    Every element of `returns` is normalized by `_as_state` and tallied as
    healthy, unreadable, or dead. An UNREADABLE return is deliberately NOT
    counted as a death; any state other than HEALTHY/UNREADABLE is. The
    coverage state then follows from `healthy` versus `declared`, checked in
    this order:

        declared <= 0        -> EMPTY
        healthy > declared   -> OVERFILLED
        healthy == declared  -> FULL
        healthy == 0         -> STARVED
        otherwise            -> UNDERFILLED

    Args:
        declared: the workflow-declared number of fan-out slots.
        returns: the witnessed returns, in any of the `_Return` shapes.
        policy: optional knobs; only `min_quorum` is read, and it only feeds
            the advisory `quorum_met` flag (evaluated when `declared > 0`).

    Returns:
        A `CoverageVerdict` carrying the state, the counts, a reason string,
        the sorted death-class tally and the quorum flag.

    Raises:
        TypeError: propagated from `_as_state` for an unsupported element.
    """
    n_healthy = 0
    n_dead = 0
    n_unreadable = 0
    death_tally: dict[str, int] = {}
    for item in returns:
        terminal, terminal_cls = _as_state(item)
        if terminal is TerminalState.HEALTHY:
            n_healthy += 1
            continue
        if terminal is TerminalState.UNREADABLE:
            # Fail-safe: a read fault is never counted as a witnessed death.
            n_unreadable += 1
            continue
        # Every remaining terminal-state is counted as a death.
        n_dead += 1
        if terminal_cls is None or terminal_cls is TerminalClass.NONE:
            continue
        key = terminal_cls.value
        death_tally[key] = death_tally.get(key, 0) + 1

    if declared <= 0:
        verdict = Coverage.EMPTY
        why = "nothing was fanned out (declared == 0)"
    elif n_healthy > declared:
        verdict = Coverage.OVERFILLED
        why = (f"{n_healthy} healthy returns but only {declared} declared — more "
               f"results than expected (a dispatch/glob bug)")
    elif n_healthy == declared:
        verdict = Coverage.FULL
        why = f"all {declared} declared worker(s) returned a real result"
    elif n_healthy == 0:
        verdict = Coverage.STARVED
        why = f"0 of {declared} declared worker(s) returned a real result — nothing to synthesize"
    else:
        verdict = Coverage.UNDERFILLED
        why = f"{n_healthy} of {declared} declared worker(s) returned a real result (sub-quorum)"

    # The quorum flag is advisory only and needs a positive denominator.
    wants_quorum = policy.min_quorum is not None and declared > 0
    quorum = ((n_healthy / declared) >= policy.min_quorum) if wants_quorum else None

    return CoverageVerdict(
        state=verdict,
        declared=declared,
        healthy=n_healthy,
        dead=n_dead,
        unreadable=n_unreadable,
        reason=why,
        dead_classes=tuple(sorted(death_tally.items())),
        quorum_met=quorum,
    )
367
+
368
+
369
+ # ───────────────────────────── boundary I/O ───────────────────────────────────
370
def coverage_from_transcripts(
    declared: int,
    paths: Sequence[str],
    policy: CoveragePolicy = DEFAULT_COVERAGE_POLICY,
) -> CoverageVerdict:
    """Build a coverage verdict from subagent transcript paths. NOT pure.

    Each path is stringified and handed to `result_state.verify_transcript`;
    the resulting verdicts are then folded by the pure `classify_coverage`
    together with `declared` and `policy`. All I/O lives in this boundary
    function, and only data reaches the pure core.

    Args:
        declared: the workflow-declared number of fan-out slots.
        paths: transcript locations, one per witnessed worker.
        policy: forwarded unchanged to `classify_coverage`.

    Returns:
        The `CoverageVerdict` produced by `classify_coverage`.
    """
    from dos import result_state

    witnessed = []
    for transcript in paths:
        witnessed.append(result_state.verify_transcript(str(transcript)))
    return classify_coverage(declared, witnessed, policy)
dos/dangling_intent.py ADDED
@@ -0,0 +1,287 @@
1
+ """DI — the dangling-intent verdict: *did the agent stop right after admitting unfinished work?*
2
+
3
+ docs/150 (the steelman of docs/149). docs/149 measured that **~92 % of real EnterpriseOps-Gym
4
+ failures are "the action never happened"** — Premature Completion, the model declaring done and
5
+ stopping with required rows unwritten — and concluded DOS could not own it byte-cleanly, because
6
+ the `completion` verdict's inputs (`declared − verified`) are both forgeable here (no git ancestry,
7
+ no env-authored per-step checkpoint, the declared scope is the agent's own). That conclusion is
8
+ **cracked, not overturned**, by one observation docs/149 §3 over-generalized past:
9
+
10
+ > docs/149 wrote "`verified` = the agent's own narration → forgeable." That is true **only toward
11
+ > "I'm done."** A self-report of *incompleteness* — "Now I need to allocate the personnel…" right
12
+ > before stopping — is an **admission against interest**: no premature-completing agent benefits
13
+ > from falsely confessing unfinished work. It is the one self-report class DOS *already* believes
14
+ > — `resume` keeps a `STEP_CLAIMED`-but-unverified step IN the residual (it trusts the agent only
15
+ > when the agent admits *more* work, never when it claims *less*). So the against-interest axis is
16
+ > not forgeable-in-favor, and a verdict over it is not the §5a mirror-verifier.
17
+
18
+ The byte-clean question (over two surfaces, one with an env-authored corroborator)
19
+ ==================================================================================
20
+
21
+ > Does the agent's LAST authored turn contain a fixed, task-INDEPENDENT
22
+ > committed-future-intent discourse marker ("I need to", "I still have to", "next I will",
23
+ > "I was unable to", "I cannot … yet") — AND did NO env-authored tool result land after it?
24
+
25
+ It is **not a planner**: the cue set is an English-discourse-marker grammar writable having read
26
+ *zero task prompts*. The litmus the verb→tool-table mechanisms FAILED (they had to read *this
27
+ task's* prose to know "allocate" → `add_group_member` not `add_new_user_group`): this verdict is
28
+ **invariant under task-swap**. It never reads a domain noun, never enumerates a step-list, never
29
+ decides what the agent *should* do next. It does not even need to know what the unfinished step
30
+ *is* — only that a standing-obligation envelope was emitted at the moment of stopping. Key on the
31
+ first-person-future-intent *envelope*, NEVER on content ("manager", "members", "allocate"); the
32
+ instant it keys on a domain noun it has become a planner and is dead (the load-bearing knife-edge).
33
+
34
+ It is **not a satisfaction predicate**: it reads no DB, never asks "is the work correct / enough".
35
+ And the agent-authored surface (the narration) is corroborated by an **env-authored absence** — the
36
+ gym writes a tool `result` only when a tool actually executed, so "no result after the terminal
37
+ turn" is a non-forgeable byte (the `precursor_gate` existence-of-a-result rung). That corroborator
38
+ drives the dominant false-positive (named-it-then-did-it) toward zero by construction: a marker
39
+ followed by a real subsequent call is ABSTAIN, not a fire.
40
+
41
+ The honest forgeable hole, named (the `arg_provenance` discipline)
42
+ ==================================================================
43
+
44
+ This reads ONE agent-authored surface, so it is **forgeable by SUPPRESSION**: an agent trained
45
+ against the gate simply stops narrating "I need to…" and the signal goes quiet. Acceptable ONLY
46
+ because suppression degrades to **false-DONE → baseline** (the safe fail-toward-done direction),
47
+ and because the value is on *today's* uncoached cheap models that narrate their abandoned plans —
48
+ not on an adversary optimizing against the gate. It catches the **honest-but-premature** stopper;
49
+ it is blind to the **silent** stopper (stops with no narration — the majority of the measured 92 %)
50
+ and to the **adversarial** one. That is a forgeable hole stated plainly, exactly as
51
+ `arg_provenance` names its false-SUPPORTED miss. The recall ceiling is the *narrating* subset —
52
+ measured ~15-30 % of the missing-row failures (`replay_dangling.py`).
53
+
54
+ Advisory only — it never supplies the plan (DETECT, not FIX)
55
+ ============================================================
56
+
57
+ The verdict maps to `Intervention.WARN` and nothing harder (the type has no other rung). On a fire
58
+ the consumer re-surfaces **the agent's own abandoned sentence** ("your final message says you still
59
+ needed to X, and no tool ran after — continue or confirm"). It authors **no directive and no step**
60
+ — so it cannot inject the foreign instruction that caused the −9 pp derailment (docs/143: a
61
+ verifier-authored directive on a correct path). Its worst case is replaying a sentence the agent
62
+ already wrote (a one-turn iteration tax), never a derailment. It does not and structurally cannot
63
+ tell the model *what* call to make — that is the +14-35 pp planner lever, forfeit by doctrine. So
64
+ the claim is exactly "DOS can byte-cleanly DETECT a slice of premature completion," never "DOS can
65
+ fix it."
66
+
67
+ ⚓ Pure kernel, I/O on the edge (the dos idiom — mirrors `claim_extract.extract_claims`,
68
+ `liveness.classify`, `precursor_gate.classify_call`): `classify_stop(StopEvidence, policy) ->
69
+ DanglingVerdict` is a frozen datum in, a frozen verdict out. The boundary reader gathers the
70
+ terminal narration (`claim_extract.assistant_text_from_transcript`) and counts env-authored tool
71
+ results after it AT THE CALL EDGE; the kernel never reads a file, a clock, or a DB.
72
+ """
73
+
74
+ from __future__ import annotations
75
+
76
+ import enum
77
+ import re
78
+ from dataclasses import dataclass
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # The typed verdict — two-valued (the EvidenceStance REFUTED/NO_SIGNAL shape).
83
+ # ---------------------------------------------------------------------------
84
class Dangling(str, enum.Enum):
    """Two-state dangling-intent verdict; `str`-valued so it survives a CLI token / JSON trip.

    DANGLING_INTENT — the terminal authored turn declared a committed-future obligation AND no
                      env-authored tool result followed it. The single actionable rung: the
                      consumer replays the agent's own sentence (WARN). It does NOT assert the
                      work is truly incomplete — only that the agent SAID so and then stopped.
    ABSTAIN         — the fail-safe zero: no future-intent marker in the terminal turn, OR a real
                      tool result came after it (the agent named a step and then acted), OR the
                      cue set is empty. An honest no-signal; never a block, always
                      fail-toward-done.
    """

    DANGLING_INTENT = "DANGLING_INTENT"
    ABSTAIN = "ABSTAIN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than the default "Dangling.X" form.
        token: str = self.value
        return token
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # The cue grammar — task-INDEPENDENT first-person-future-intent markers.
105
+ # ---------------------------------------------------------------------------
106
# Each cue is a regex searched against the CASEFOLDED terminal narration, so cues must themselves
# be written in lowercase. They key ONLY on the first-person committed-future / unfulfilled-intent
# ENVELOPE — never a domain noun. This is the difference between a fixed grammar (writable having
# read zero task prompts) and a planner (per-task prose reasoning). A host may override/extend via
# `dos.toml [dangling.cues]` (config-as-data); an EMPTY set ABSTAINs everything
# (fail-toward-done). Kept deliberately conservative: a missed marker is a safe ABSTAIN; the bias
# is to under-fire (the `arg_provenance` posture).
#
# Contractions attach directly to the pronoun ("i'll", "haven't"), so the separating space lives
# INSIDE the long-form alternative ("i will", "have not") rather than before the group; otherwise
# the contraction branch could never match real text.
DEFAULT_CUES: tuple[str, ...] = (
    r"\bi (?:still )?need to\b",
    r"\bi (?:still )?have to\b",
    # "i will need to …" / "i'll now proceed to …"
    r"\bi(?: will|'ll) (?:now |then )?(?:need to |have to |proceed to )",
    # "next, i will" / "next i'll" / "next, i need"
    r"\bnext,? i(?: will|'ll| need| have| should)\b",
    r"\bi should (?:now |next )?(?:proceed|continue|identify|create|add|assign|update)\b",
    r"\bi was unable to\b",
    # "i have not yet completed" / "i haven't finished" / "i had not been able" / "i hadn't done"
    r"\bi (?:(?:have|had) not|haven't|hadn't) (?:yet )?(?:been able|completed|finished|done)\b",
    r"\bi cannot .{0,40}\byet\b",
    r"\b(?:still|yet) to be (?:done|completed|added|assigned|created)\b",
    r"\bremains? to be (?:done|completed|added|assigned)\b",
    r"\bto (?:do|complete|finish) this,? i (?:need|will|have|must)\b",
    r"\bnow,? to\b.{0,40}\bi (?:need|will|must|have to)\b",
)

# Words that, when they sit in the same sentence as a cue, mark it as a COMPLETED report, not an
# open one — "I needed to X, which I have now done" must NOT fire. A conservative negative guard
# (the cue itself is the primary signal; this only suppresses an obvious past-tense-resolved
# phrasing).
_RESOLVED_GUARD_RE = re.compile(
    r"\b(?:have|has|already|now) (?:been )?(?:done|completed|finished|created|added|assigned|set up)\b",
    re.IGNORECASE,
)
134
+
135
+
136
@dataclass(frozen=True)
class DanglingPolicy:
    """Cue grammar plus knobs: the mechanism is kernel, the cue list is config (the
    `ProvenancePolicy` / `StreamPolicy` seam). Defaults are GENERIC; a host declares its own in
    `dos.toml [dangling]`.

    cues       — committed-future-intent marker regexes, matched against casefolded text. An
                 EMPTY tuple means ABSTAIN-all (the fail-toward-done floor: with no cues
                 declared no accusation is possible).
    tail_chars — how many trailing characters of the terminal narration are scanned. An
                 obligation voiced in the MIDDLE of a long turn that then keeps acting is not a
                 *terminal* dangle; the signal is "ended ON the admission". 0 scans the whole
                 turn.

    Raises ValueError at construction when `tail_chars` is negative.
    """

    cues: tuple[str, ...] = DEFAULT_CUES
    tail_chars: int = 600

    def __post_init__(self) -> None:
        # A negative window is meaningless; reject it up front rather than mis-slice later.
        window_is_negative = self.tail_chars < 0
        if window_is_negative:
            raise ValueError("tail_chars must be >= 0")


# Shared, immutable default: the built-in cue grammar with a 600-character tail window.
DEFAULT_POLICY = DanglingPolicy()
157
+
158
+
159
+ # ---------------------------------------------------------------------------
160
+ # Frozen input — the pure datum the boundary gathers and hands in.
161
+ # ---------------------------------------------------------------------------
162
@dataclass(frozen=True)
class StopEvidence:
    """The frozen input to `classify_stop`, assembled by the CALLER at the stop event. No I/O here.

    final_turn_text    — the agent's LAST authored narration (the terminal `ai_message` /
                         `model_response`), read at the boundary by
                         `claim_extract.assistant_text_from_transcript`. Agent-authored, yet
                         distrusted on the AGAINST-INTEREST axis only.
    results_after_turn — how many env-authored tool `result` entries landed AFTER the terminal
                         turn. This is the ENV-AUTHORED corroborator: the gym writes a result
                         only when a tool really executed, so a value >0 means the agent named a
                         step and then ACTED → ABSTAIN (not a terminal dangle). Defaults to 0,
                         the common stop case where the last turn is narration with nothing
                         after it.

    Raises ValueError at construction when `results_after_turn` is negative.
    """

    final_turn_text: str
    results_after_turn: int = 0

    def __post_init__(self) -> None:
        # A count cannot be negative; refuse the datum instead of letting it read as "no results".
        count_is_negative = self.results_after_turn < 0
        if count_is_negative:
            raise ValueError("results_after_turn must be >= 0")
183
+
184
+
185
+ # ---------------------------------------------------------------------------
186
+ # Frozen verdict — advisory only.
187
+ # ---------------------------------------------------------------------------
188
@dataclass(frozen=True)
class DanglingVerdict:
    """What `classify_stop` hands back: the typed state plus the matched cue for the WARN string.

    `matched_cue` carries the marker text that fired (empty on ABSTAIN) so a consumer's WARN can
    quote the agent's OWN words back ("your final message says: '<…>' — and no tool ran after").
    `reason` is a one-line operator summary. Advisory only: it never blocks the stop.
    """

    verdict: Dangling
    matched_cue: str
    reason: str

    @property
    def is_dangling(self) -> bool:
        """True only for the single actionable state, `Dangling.DANGLING_INTENT`."""
        fired = self.verdict is Dangling.DANGLING_INTENT
        return fired

    def to_dict(self) -> dict:
        """Plain-dict form for JSON output; the verdict is flattened to its string value."""
        return dict(
            verdict=self.verdict.value,
            matched_cue=self.matched_cue,
            reason=self.reason,
        )
211
+
212
+
213
+ # ---------------------------------------------------------------------------
214
+ # The pure verdict.
215
+ # ---------------------------------------------------------------------------
216
def _unfolded_span(scan: str, start: int, end: int) -> str:
    """Map the span [start, end) of `scan.casefold()` back onto `scan` and return that slice."""
    if start >= end:
        return ""
    # owners[j] = index in `scan` of the character whose casefold produced folded char j.
    # Casefolding is per-character and may expand one character into several ("ß" -> "ss").
    owners: list[int] = []
    for idx, ch in enumerate(scan):
        owners.extend([idx] * len(ch.casefold()))
    return scan[owners[start]:owners[end - 1] + 1]


def _find_cue(text: str, policy: DanglingPolicy) -> str:
    """Return the first unresolved committed-future-intent cue in the (tail of the) text, or "".

    Cues are tried in `policy.cues` order; for each cue every occurrence is examined in text
    order, and the first occurrence whose own sentence is not marked resolved by
    `_RESOLVED_GUARD_RE` wins. The returned string is the agent's original (un-casefolded)
    wording, stripped of surrounding whitespace.

    text   — the terminal narration; only the last `policy.tail_chars` characters are scanned
             (the whole text when `tail_chars` is 0).
    policy — supplies `cues` (regexes matched against the casefolded text) and `tail_chars`.
             An empty cue set returns "" immediately.
    """
    if not policy.cues:
        return ""
    scan = text if policy.tail_chars == 0 else text[-policy.tail_chars:]
    low = scan.casefold()
    # Casefolding can lengthen the text, so offsets found in `low` are only valid indices into
    # `scan` when the fold was one-to-one. All offset arithmetic below therefore stays in `low`
    # coordinates; the original wording is recovered only for the returned quote.
    one_to_one = len(low) == len(scan)
    terminators = ".!?\n"
    for cue in policy.cues:
        # Walk EVERY occurrence: a resolved first mention ("I needed to X, which I have now
        # done.") must not hide a genuine later dangle of the same cue ("I need to Y").
        for m in re.finditer(cue, low):
            s, e = m.span()
            # The resolved-guard is checked ONLY within the cue's OWN sentence — clipped at the
            # nearest sentence terminator on each side. It must NOT reach back into a PRIOR
            # completed sentence ("the group has been created. Now I need to allocate…") and
            # wrongly suppress a genuine LATER dangle. The cue itself is the primary signal; this
            # only kills an obvious in-clause past-tense resolution.
            sent_start = max(low.rfind(c, 0, s) for c in terminators) + 1
            following = [i for i in (low.find(c, e) for c in terminators) if i >= 0]
            sent_end = min(following) + 1 if following else len(low)
            if _RESOLVED_GUARD_RE.search(low[sent_start:sent_end]):
                continue
            quoted = scan[s:e] if one_to_one else _unfolded_span(scan, s, e)
            return quoted.strip()
    return ""
243
+
244
+
245
def classify_stop(
    ev: StopEvidence, policy: DanglingPolicy = DEFAULT_POLICY
) -> DanglingVerdict:
    """Decide whether the agent stopped right after admitting unfinished work. PURE — no I/O.

    Rungs, evaluated in this order:
      1. ABSTAIN — `results_after_turn > 0`: a real tool result landed AFTER the terminal turn,
         so the agent named a step and then ACTED. The env-authored corroborator is consulted
         first — it is the non-forgeable byte that kills the named-it-then-did-it false positive.
      2. DANGLING_INTENT — a cue fired in the terminal narration AND nothing executed after: the
         agent's own last words admit an open obligation and the run stopped. The one actionable
         rung (advisory WARN).
      3. ABSTAIN — no committed-future-intent cue in the terminal narration (or an empty cue
         set): the agent did not admit unfinished work. The fail-toward-done floor.

    Advisory: the verdict REPORTS; the consumer re-surfaces the agent's own sentence (never a
    directive, never a forced continue — the docs/143 −9 pp channel is unreachable by type).
    """
    # Rung 1 — acted-after means this cannot be a terminal dangle; the narration is not scanned.
    if ev.results_after_turn > 0:
        acted_reason = (
            f"{ev.results_after_turn} tool result(s) landed after the terminal turn — the "
            f"agent named a step and then acted, not a dangling stop"
        )
        return DanglingVerdict(
            verdict=Dangling.ABSTAIN, matched_cue="", reason=acted_reason
        )

    # A missing narration is treated as empty text, which can never carry a cue.
    narration = ev.final_turn_text or ""
    cue = _find_cue(narration, policy)

    # Rung 2 — the admission against interest, with nothing executed afterwards.
    if cue:
        fire_reason = (
            f"the terminal turn admits an open obligation ({cue!r}) and no tool ran after — the "
            f"agent stopped right after saying it still had work (an admission against interest)"
        )
        return DanglingVerdict(
            verdict=Dangling.DANGLING_INTENT, matched_cue=cue, reason=fire_reason
        )

    # Rung 3 — nothing to report.
    return DanglingVerdict(
        verdict=Dangling.ABSTAIN,
        matched_cue="",
        reason="no committed-future-intent marker in the terminal turn — clean stop (or no cues)",
    )