dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/coverage.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
r"""coverage — the cheap, NON-GIT fan-out coverage fold for a self-reporting fleet (docs/197 §7(1)).
|
|
2
|
+
|
|
3
|
+
> **An HONEST AGGREGATOR, not a label factory.** It folds N already-adjudicated
|
|
4
|
+
> `result_state` terminal-state verdicts (each minted by `verify-result`, the
|
|
5
|
+
> §7(1) keystone) against the workflow-DECLARED expected count N, into ONE coverage
|
|
6
|
+
> headline + a per-class breakdown. It mints **ZERO** new ground-truth labels —
|
|
7
|
+
> every per-worker DEAD/HEALTHY fact it counts was already decided by
|
|
8
|
+
> `result_state.classify_terminal`; this batches them into one coverage answer the
|
|
9
|
+
> synthesizer can read. That honesty is load-bearing: re-counting the N
|
|
10
|
+
> already-adjudicated verdicts as "N new labels" would be the consistency-not-
|
|
11
|
+
> grounding sin (the docs/179 design law). The data-multiplier in the docs/179 set
|
|
12
|
+
> is `firing_label` (it JOINS a firing to a git outcome to DECIDE a previously-unknown
|
|
13
|
+
> label); `coverage` is the `fleet_roll` sibling — a fold over an already-labeled set.
|
|
14
|
+
|
|
15
|
+
What it IS — the win that is real (and narrow)
|
|
16
|
+
==============================================
|
|
17
|
+
|
|
18
|
+
The dominant ultracode subagent is a **pure-text research/read worker that produces
|
|
19
|
+
no git commits**, so `completion.classify` (which folds `declared − git-ancestry-
|
|
20
|
+
verified` over an `intent_ledger`) returns INDETERMINATE for it — there is no ledger.
|
|
21
|
+
The only fossil a read-only worker leaves is its transcript's terminal record. So
|
|
22
|
+
`coverage` is the form of "is the fan-out actually done?" that works on the cheap
|
|
23
|
+
rung `result_state` already provides, and it earns its keep in two narrow, defensible
|
|
24
|
+
ways:
|
|
25
|
+
|
|
26
|
+
1. It makes the denominator **`declared` (a separate, workflow-authored integer)**,
|
|
27
|
+
NOT `len(returns)`. The pervasive laundering bug is `failed = N − survivors.length`
|
|
28
|
+
and `results.filter(Boolean)` (89/114 real scripts): a harness-synthesized death
|
|
29
|
+
returns a non-null error string that survives the filter, so a 4-of-7 fan-out is
|
|
30
|
+
silently banked as 7/7. Because `declared` is independent of the survivor list, a
|
|
31
|
+
short survivor list CANNOT read as FULL here — the laundering is structurally
|
|
32
|
+
impossible.
|
|
33
|
+
2. It **surfaces a count the prior pipeline discarded** — `unaccounted` (declared
|
|
34
|
+
slots that produced neither a HEALTHY return nor a witnessed death) — and hands
|
|
35
|
+
the whole partition to the synthesizer as legible text, instead of `log()`-ing it
|
|
36
|
+
and throwing it away (today's behavior, the follow-up #1 premise).
|
|
37
|
+
|
|
38
|
+
Both are "better denominator hygiene," not a new per-datum label. Stated honestly so
|
|
39
|
+
the module ships in agreement with docs/179, not in contradiction with it.
|
|
40
|
+
|
|
41
|
+
The fold-mints-data law (docs/179) — applied, and the honest ruling
|
|
42
|
+
===================================================================
|
|
43
|
+
|
|
44
|
+
The two facts the fold touches: **declared N** (workflow-authored) and the **multiset
|
|
45
|
+
of `result_state` terminal-states** (harness-authored, via the `model=='<synthetic>'`
|
|
46
|
+
gate). They were not compared at the fold before — but the comparison is *arithmetic*
|
|
47
|
+
(`healthy == declared?`), and it decides NO new truth value about any worker: each
|
|
48
|
+
worker's DEAD/HEALTHY was already adjudicated by `result_state`. So this is the
|
|
49
|
+
`fleet_roll` case (fold an already-labeled set → one headline + breakdown, 0 new
|
|
50
|
+
labels), NOT the `firing_label` case (join two facts to DECIDE an unknown label). The
|
|
51
|
+
`unaccounted`/`absent` surfacing is exactly what `fleet_roll.absent` does without
|
|
52
|
+
claiming to mint data. See [[project-dos-fold-mints-data-law]].
|
|
53
|
+
|
|
54
|
+
The byte-author law / advisory floor / reuse notes
|
|
55
|
+
==================================================
|
|
56
|
+
|
|
57
|
+
The `healthy` count is grounded TRANSITIVELY: it derives from `result_state`'s
|
|
58
|
+
`model=='<synthetic>'` gate, a byte the Claude Code HARNESS — not the worker —
|
|
59
|
+
authored, so a worker cannot forge its slot HEALTHY when the harness killed it
|
|
60
|
+
(the docs/138 grounding-not-consistency invariant). BUT the pure core can only be as
|
|
61
|
+
grounded as the verdicts handed in: if a caller asserts terminal-states directly
|
|
62
|
+
(the CLI `--states` path) instead of letting `coverage_from_transcripts` run
|
|
63
|
+
`result_state.verify_transcript`, the count is **workflow-asserted, not harness-
|
|
64
|
+
grounded**. The CLI stamps that distinction (`grounded: false` vs `true`) so a
|
|
65
|
+
consumer knows whether the denominator was re-grounded; the pure `classify_coverage`
|
|
66
|
+
counts whatever states it is given and never re-grounds (it is pure — no I/O).
|
|
67
|
+
|
|
68
|
+
ADVISORY (PDP, not PEP — the docs/197 §6.5 / docs/99 line): it REPORTS a coverage
|
|
69
|
+
verdict + a synthesizer-legible `prompt_line`; it never re-runs a dead worker
|
|
70
|
+
(re-dispatch of the dead slot's OWN unit is the conductor's act) and never re-prompts
|
|
71
|
+
the synthesizer mid-plan (the −9 pp DEFER derail). It also does NOT judge the
|
|
72
|
+
CORRECTNESS of a HEALTHY return — a 7/7 FULL coverage of seven WRONG answers is still
|
|
73
|
+
FULL; coverage certifies the denominator, never the values. Whether a healthy finding
|
|
74
|
+
is true is `effect_witness` / `believe_under_floor`'s job (the witness-routing rung,
|
|
75
|
+
docs/197 §7(2)).
|
|
76
|
+
|
|
77
|
+
⚓ Kernel discipline (the litmus): a PURE verdict + a boundary reader. It imports only
|
|
78
|
+
the sibling kernel module `result_state` (+ stdlib) — NOT `resume`/`intent_ledger`/
|
|
79
|
+
`scope_source` (those are `completion`'s git-ledger imports; folding them in would drag
|
|
80
|
+
git concepts into the pure-text path). Names no host, resolves nothing against
|
|
81
|
+
`__file__`, takes no lease. The transcript I/O is the caller's boundary
|
|
82
|
+
(`coverage_from_transcripts`, which delegates to `result_state.verify_transcript`),
|
|
83
|
+
exactly the `liveness.classify` over `git_delta` shape, one rung over. It mirrors
|
|
84
|
+
`completion`'s SHAPE (a `str`-enum verdict + frozen `*Verdict` + `to_dict` +
|
|
85
|
+
`fraction`-style legibility), but shares no body — a new leaf, the third sibling of
|
|
86
|
+
the "is the fan-out done, or only declared done?" family.
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
from __future__ import annotations
|
|
90
|
+
|
|
91
|
+
import enum
|
|
92
|
+
from dataclasses import dataclass
|
|
93
|
+
from typing import Optional, Sequence, Union
|
|
94
|
+
|
|
95
|
+
from dos.result_state import ResultStateVerdict, TerminalClass, TerminalState
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ───────────────────────────── the coverage verdict ───────────────────────────
|
|
99
|
+
class Coverage(str, enum.Enum):
    """Coverage verdict for a fan-out: exactly one of five exclusive states.

    Members are `str`-valued, so a member compares equal to (and serializes as)
    its own name — handy for a JSON token or an exit-code map with no lookup
    table. Each state tells the consumer what to do next:

    * FULL        — healthy == declared (declared > 0): fold everything.
    * UNDERFILLED — 0 < healthy < declared: fold, but carry a caveat and keep
                    the gap in the denominator.
    * STARVED     — healthy == 0 with declared > 0: do not synthesize, there
                    is no real material.
    * OVERFILLED  — healthy > declared: a dispatch/glob anomaly, surfaced
                    instead of being reported as FULL with a fraction above 1.
    * EMPTY       — declared <= 0: nothing was fanned out; degenerate, not an
                    error.
    """

    FULL = "FULL"
    UNDERFILLED = "UNDERFILLED"
    STARVED = "STARVED"
    OVERFILLED = "OVERFILLED"
    EMPTY = "EMPTY"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than "Coverage.FULL".
        return self.value

    @property
    def foldable(self) -> bool:
        """Whether there is real material to synthesize from.

        Only STARVED is non-foldable. OVERFILLED still folds: its results are
        real, and the caveat concerns the count mismatch, not missing material.
        """
        return not (self is Coverage.STARVED)

    @property
    def should_caveat(self) -> bool:
        """Whether the synthesis prompt must carry a coverage caveat.

        True for UNDERFILLED, STARVED and OVERFILLED; False for FULL and EMPTY.
        """
        no_caveat_needed = self is Coverage.FULL or self is Coverage.EMPTY
        return not no_caveat_needed
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
@dataclass(frozen=True)
class CoveragePolicy:
    """Immutable policy knobs for the coverage verdict (policy, not mechanism).

    The single knob, ``min_quorum``, is a legibility aid only. When it is set
    and something was declared, `classify_coverage` records
    ``quorum_met = healthy / declared >= min_quorum`` on the verdict. It never
    alters the coverage state itself: FULL remains strict equality, and whether
    a partial result is acceptable is left to the consumer.
    """

    # Advisory healthy/declared threshold; None means "report no quorum flag".
    min_quorum: Optional[float] = None


# Shared default policy: no quorum threshold configured.
DEFAULT_COVERAGE_POLICY = CoveragePolicy()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@dataclass(frozen=True)
class ReturnState:
    """A single worker slot's terminal state, wrapped for the coverage fold.

    Only ``state`` (a `result_state.TerminalState`) is counted by the fold;
    ``agent_id`` is an optional label kept for per-slot legibility. Nothing
    here speaks to whether a HEALTHY return is *correct* — the fold counts
    states, it does not judge content.
    """

    # The terminal state the fold counts.
    state: TerminalState
    # Optional slot label; empty when the caller supplies none.
    agent_id: str = ""
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@dataclass(frozen=True)
class CoverageVerdict:
    """The verdict `classify_coverage` returns, with the counted partition echoed back.

    Fields:
        state: the `Coverage` verdict.
        declared: the workflow-authored denominator, independent of how many
            returns were actually supplied.
        healthy / dead / unreadable: the partition of the witnessed returns.
        reason: a short human-readable explanation of the verdict.
        dead_classes: sorted ``(class_name, count)`` pairs for the deaths; empty
            when the counted returns carried no class detail.
        quorum_met: advisory flag from `CoveragePolicy.min_quorum`, or None when
            no quorum was configured (or nothing was declared).

    `to_dict` is the JSON-friendly shape and includes the derived
    `unaccounted`, `fraction` and `prompt_line` values.
    """

    state: Coverage
    declared: int
    healthy: int
    dead: int
    unreadable: int
    reason: str
    dead_classes: tuple[tuple[str, int], ...] = ()
    quorum_met: Optional[bool] = None

    @property
    def unaccounted(self) -> int:
        """Declared slots with no witnessed verdict at all.

        Computed as ``declared - healthy - dead - unreadable`` and floored at 0,
        so an over-fill never shows up as a negative count.
        """
        return max(0, self.declared - self.healthy - self.dead - self.unreadable)

    @property
    def fraction(self) -> Optional[float]:
        """``healthy / declared``, or None when nothing was declared.

        "Nothing declared" means ``declared <= 0`` — the same condition
        `classify_coverage` maps to EMPTY — so a negative ``declared`` can never
        produce a negative ratio. Legibility only; it may exceed 1.0 when more
        healthy returns than declared slots were counted.
        """
        if self.declared <= 0:
            return None
        return self.healthy / self.declared

    @property
    def prompt_line(self) -> str:
        """Deterministic sentence meant to be interpolated verbatim into a synthesis prompt.

        The gap clause is built from the actual ``(dead, unreadable,
        unaccounted)`` counts: only a non-zero ``dead`` produces the word
        "died"; unreadable slots are reported as "could not be read" and
        unaccounted slots as "did not return a transcript".
        """
        d = self.declared
        if self.state is Coverage.EMPTY:
            return "No workers were fanned out (declared == 0); there is nothing to fold."
        if self.state is Coverage.FULL:
            return (f"All {self.healthy} of {d} fan-out workers returned a real result; "
                    f"this is full coverage.")
        # Build the gap clause from the actual partition, never a hardcoded "died".
        parts = []
        if self.dead:
            cls = self._dead_class_phrase()
            parts.append(f"{self.dead} died on a harness-authored terminal{cls}")
        if self.unreadable:
            parts.append(f"{self.unreadable} could not be read (NOT a witnessed death)")
        if self.unaccounted:
            parts.append(f"{self.unaccounted} did not return a transcript")
        # Fallback wording for a hand-built verdict whose counts leave no gap to name.
        gap = "; ".join(parts) if parts else "the missing slots are unaccounted"
        if self.state is Coverage.STARVED:
            # Zero healthy: spell out WHY, because the operator action differs per
            # cause (re-dispatch / fix the read path / locate the transcripts).
            return (f"COVERAGE FAILURE: 0 of {d} fan-out workers returned a real result "
                    f"({gap}). There is no real material to synthesize. Do NOT fabricate "
                    f"findings; report the fan-out as failed and act on the gap above "
                    f"(re-dispatch deaths; fix the read path for unreadable; locate "
                    f"missing transcripts).")
        if self.state is Coverage.OVERFILLED:
            return (f"COVERAGE ANOMALY: {self.healthy} workers returned a real result but "
                    f"only {d} were declared — more results than expected (a re-dispatch "
                    f"double-count or a stale transcript glob). Treat the count as "
                    f"unreliable and reconcile the dispatch before trusting coverage.")
        # UNDERFILLED
        return (f"COVERAGE CAVEAT: only {self.healthy} of {d} fan-out workers returned a "
                f"real result ({gap}). Treat the findings below as a SUB-QUORUM SAMPLE "
                f"({self.healthy}/{d}), not an exhaustive survey; do not state or imply "
                f"full coverage, and flag the gap above.")

    def _dead_class_phrase(self) -> str:
        """Return ``' (a/b/...)'`` naming the death classes, or ``''`` if none were recorded.

        Class names are lower-cased with underscores turned into hyphens; the
        per-class counts are not included in the phrase.
        """
        if not self.dead_classes:
            return ""
        names = "/".join(c.lower().replace("_", "-") for c, _ in self.dead_classes)
        return f" ({names})"

    def to_dict(self) -> dict:
        """Return the JSON-friendly mapping of the verdict and its derived values.

        ``fraction`` is rounded to 4 decimal places (None stays None) and
        ``dead_classes`` becomes a list of ``[name, count]`` lists.
        """
        # Evaluate the derived property once instead of twice.
        frac = self.fraction
        return {
            "state": self.state.value,
            "declared": self.declared,
            "healthy": self.healthy,
            "dead": self.dead,
            "unreadable": self.unreadable,
            "unaccounted": self.unaccounted,
            "fraction": (round(frac, 4) if frac is not None else None),
            "foldable": self.state.foldable,
            "should_caveat": self.state.should_caveat,
            "dead_classes": [list(c) for c in self.dead_classes],
            "quorum_met": self.quorum_met,
            "prompt_line": self.prompt_line,
            "reason": self.reason,
        }
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ───────────────────────────── the pure fold ──────────────────────────────────
|
|
278
|
+
# Anything the fold accepts as "one return": a wrapper, a full verdict, or a bare state.
_Return = Union[ReturnState, ResultStateVerdict, TerminalState]


def _as_state(r: _Return) -> tuple[TerminalState, Optional[TerminalClass]]:
    """Normalize one return element to a ``(state, class-or-None)`` pair. PURE.

    * a bare `TerminalState` carries no class detail → ``(r, None)``;
    * a `ResultStateVerdict` contributes its own ``state`` and ``cls``;
    * a `ReturnState` wrapper contributes its ``state`` and no class.

    Raises:
        TypeError: for any other type, so an unexpected element is rejected
            loudly instead of being miscounted.
    """
    if isinstance(r, TerminalState):
        pair = (r, None)
    elif isinstance(r, ResultStateVerdict):
        pair = (r.state, r.cls)
    elif isinstance(r, ReturnState):
        pair = (r.state, None)
    else:
        offending = type(r).__name__
        raise TypeError(
            "coverage: a return must be a TerminalState, ResultStateVerdict, or "
            f"ReturnState, not {offending}"
        )
    return pair
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def classify_coverage(
    declared: int,
    returns: Sequence[_Return],
    policy: CoveragePolicy = DEFAULT_COVERAGE_POLICY,
) -> CoverageVerdict:
    """Fold witnessed terminal states against the declared count. PURE — no I/O.

    Each element of ``returns`` is normalized with `_as_state` and tallied:
    HEALTHY → ``healthy``; UNREADABLE → ``unreadable`` (a read fault is never
    counted as a death); every other state → ``dead``, with its class tallied
    when one is present and is not `TerminalClass.NONE`.

    The verdict is then decided from ``healthy`` versus ``declared``, checked
    in this order:

        declared <= 0            → EMPTY
        healthy  >  declared     → OVERFILLED
        healthy  == declared     → FULL
        healthy  == 0            → STARVED
        otherwise                → UNDERFILLED

    Args:
        declared: the workflow-declared number of fan-out slots.
        returns: the witnessed returns (bare states, verdicts, or wrappers).
        policy: advisory knobs; ``min_quorum`` only sets ``quorum_met``.

    Returns:
        A `CoverageVerdict` echoing the tallies, the reason text, the sorted
        death-class counts and the advisory quorum flag.

    Raises:
        TypeError: propagated from `_as_state` for an unsupported element.
    """
    n_healthy = 0
    n_dead = 0
    n_unreadable = 0
    death_tally: dict[str, int] = {}

    for item in returns:
        terminal, terminal_cls = _as_state(item)
        if terminal is TerminalState.HEALTHY:
            n_healthy += 1
            continue
        if terminal is TerminalState.UNREADABLE:
            # Fail-safe: unreadable is tallied separately, never as a death.
            n_unreadable += 1
            continue
        # Any remaining terminal state counts as a death.
        n_dead += 1
        if terminal_cls is None or terminal_cls is TerminalClass.NONE:
            continue
        key = terminal_cls.value
        death_tally[key] = death_tally.get(key, 0) + 1

    if declared <= 0:
        verdict = Coverage.EMPTY
        why = "nothing was fanned out (declared == 0)"
    elif n_healthy > declared:
        verdict = Coverage.OVERFILLED
        why = (f"{n_healthy} healthy returns but only {declared} declared — more "
               f"results than expected (a dispatch/glob bug)")
    elif n_healthy == declared:
        verdict = Coverage.FULL
        why = f"all {declared} declared worker(s) returned a real result"
    elif n_healthy == 0:
        verdict = Coverage.STARVED
        why = f"0 of {declared} declared worker(s) returned a real result — nothing to synthesize"
    else:
        verdict = Coverage.UNDERFILLED
        why = f"{n_healthy} of {declared} declared worker(s) returned a real result (sub-quorum)"

    # Advisory only: the quorum flag never feeds back into the verdict above.
    quorum_flag: Optional[bool] = (
        (n_healthy / declared) >= policy.min_quorum
        if policy.min_quorum is not None and declared > 0
        else None
    )

    return CoverageVerdict(
        state=verdict,
        declared=declared,
        healthy=n_healthy,
        dead=n_dead,
        unreadable=n_unreadable,
        reason=why,
        dead_classes=tuple(sorted(death_tally.items())),
        quorum_met=quorum_flag,
    )
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# ───────────────────────────── boundary I/O ───────────────────────────────────
|
|
370
|
+
def coverage_from_transcripts(
    declared: int,
    paths: Sequence[str],
    policy: CoveragePolicy = DEFAULT_COVERAGE_POLICY,
) -> CoverageVerdict:
    """Build a coverage verdict from subagent transcript paths. NOT pure.

    This is the I/O boundary in front of the pure fold: every path is
    stringified and handed to `result_state.verify_transcript`, and the
    resulting verdicts are folded by `classify_coverage` against ``declared``.

    NOTE(review): the module notes say a missing or garbled transcript comes
    back as UNREADABLE (counted live, not dead); that behavior lives in
    `result_state` and is not visible here — confirm there.

    Args:
        declared: the workflow-declared number of fan-out slots.
        paths: transcript locations, one per witnessed worker.
        policy: advisory knobs forwarded unchanged to `classify_coverage`.

    Returns:
        The `CoverageVerdict` produced by `classify_coverage`.
    """
    from dos import result_state

    verdicts = []
    for path in paths:
        verdicts.append(result_state.verify_transcript(str(path)))
    return classify_coverage(declared, verdicts, policy)
|
dos/dangling_intent.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""DI — the dangling-intent verdict: *did the agent stop right after admitting unfinished work?*
|
|
2
|
+
|
|
3
|
+
docs/150 (the steelman of docs/149). docs/149 measured that **~92 % of real EnterpriseOps-Gym
|
|
4
|
+
failures are "the action never happened"** — Premature Completion, the model declaring done and
|
|
5
|
+
stopping with required rows unwritten — and concluded DOS could not own it byte-cleanly, because
|
|
6
|
+
the `completion` verdict's inputs (`declared − verified`) are both forgeable here (no git ancestry,
|
|
7
|
+
no env-authored per-step checkpoint, the declared scope is the agent's own). That conclusion is
|
|
8
|
+
**cracked, not overturned**, by one observation docs/149 §3 over-generalized past:
|
|
9
|
+
|
|
10
|
+
> docs/149 wrote "`verified` = the agent's own narration → forgeable." That is true **only toward
|
|
11
|
+
> "I'm done."** A self-report of *incompleteness* — "Now I need to allocate the personnel…" right
|
|
12
|
+
> before stopping — is an **admission against interest**: no premature-completing agent benefits
|
|
13
|
+
> from falsely confessing unfinished work. It is the one self-report class DOS *already* believes
|
|
14
|
+
> — `resume` keeps a `STEP_CLAIMED`-but-unverified step IN the residual (it trusts the agent only
|
|
15
|
+
> when the agent admits *more* work, never when it claims *less*). So the against-interest axis is
|
|
16
|
+
> not forgeable-in-favor, and a verdict over it is not the §5a mirror-verifier.
|
|
17
|
+
|
|
18
|
+
The byte-clean question (over two surfaces, one with an env-authored corroborator)
|
|
19
|
+
==================================================================================
|
|
20
|
+
|
|
21
|
+
> Does the agent's LAST authored turn contain a fixed, task-INDEPENDENT
|
|
22
|
+
> committed-future-intent discourse marker ("I need to", "I still have to", "next I will",
|
|
23
|
+
> "I was unable to", "I cannot … yet") — AND did NO env-authored tool result land after it?
|
|
24
|
+
|
|
25
|
+
It is **not a planner**: the cue set is an English-discourse-marker grammar writable having read
|
|
26
|
+
*zero task prompts*. The litmus the verb→tool-table mechanisms FAILED (they had to read *this
|
|
27
|
+
task's* prose to know "allocate" → `add_group_member` not `add_new_user_group`): this verdict is
|
|
28
|
+
**invariant under task-swap**. It never reads a domain noun, never enumerates a step-list, never
|
|
29
|
+
decides what the agent *should* do next. It does not even need to know what the unfinished step
|
|
30
|
+
*is* — only that a standing-obligation envelope was emitted at the moment of stopping. Key on the
|
|
31
|
+
first-person-future-intent *envelope*, NEVER on content ("manager", "members", "allocate"); the
|
|
32
|
+
instant it keys on a domain noun it has become a planner and is dead (the load-bearing knife-edge).
|
|
33
|
+
|
|
34
|
+
It is **not a satisfaction predicate**: it reads no DB, never asks "is the work correct / enough".
|
|
35
|
+
And the agent-authored surface (the narration) is corroborated by an **env-authored absence** — the
|
|
36
|
+
gym writes a tool `result` only when a tool actually executed, so "no result after the terminal
|
|
37
|
+
turn" is a non-forgeable byte (the `precursor_gate` existence-of-a-result rung). That corroborator
|
|
38
|
+
drives the dominant false-positive (named-it-then-did-it) toward zero by construction: a marker
|
|
39
|
+
followed by a real subsequent call is ABSTAIN, not a fire.
|
|
40
|
+
|
|
41
|
+
The honest forgeable hole, named (the `arg_provenance` discipline)
|
|
42
|
+
==================================================================
|
|
43
|
+
|
|
44
|
+
This reads ONE agent-authored surface, so it is **forgeable by SUPPRESSION**: an agent trained
|
|
45
|
+
against the gate simply stops narrating "I need to…" and the signal goes quiet. Acceptable ONLY
|
|
46
|
+
because suppression degrades to **false-DONE → baseline** (the safe fail-toward-done direction),
|
|
47
|
+
and because the value is on *today's* uncoached cheap models that narrate their abandoned plans —
|
|
48
|
+
not on an adversary optimizing against the gate. It catches the **honest-but-premature** stopper;
|
|
49
|
+
it is blind to the **silent** stopper (stops with no narration — the majority of the measured 92 %)
|
|
50
|
+
and to the **adversarial** one. That is a forgeable hole stated plainly, exactly as
|
|
51
|
+
`arg_provenance` names its false-SUPPORTED miss. The recall ceiling is the *narrating* subset —
|
|
52
|
+
measured ~15-30 % of the missing-row failures (`replay_dangling.py`).
|
|
53
|
+
|
|
54
|
+
Advisory only — it never supplies the plan (DETECT, not FIX)
|
|
55
|
+
============================================================
|
|
56
|
+
|
|
57
|
+
The verdict maps to `Intervention.WARN` and nothing harder (the type has no other rung). On a fire
|
|
58
|
+
the consumer re-surfaces **the agent's own abandoned sentence** ("your final message says you still
|
|
59
|
+
needed to X, and no tool ran after — continue or confirm"). It authors **no directive and no step**
|
|
60
|
+
— so it cannot inject the foreign instruction that caused the −9 pp derailment (docs/143: a
|
|
61
|
+
verifier-authored directive on a correct path). Its worst case is replaying a sentence the agent
|
|
62
|
+
already wrote (a one-turn iteration tax), never a derailment. It does not and structurally cannot
|
|
63
|
+
tell the model *what* call to make — that is the +14-35 pp planner lever, forfeit by doctrine. So
|
|
64
|
+
the claim is exactly "DOS can byte-cleanly DETECT a slice of premature completion," never "DOS can
|
|
65
|
+
fix it."
|
|
66
|
+
|
|
67
|
+
⚓ Pure kernel, I/O on the edge (the dos idiom — mirrors `claim_extract.extract_claims`,
|
|
68
|
+
`liveness.classify`, `precursor_gate.classify_call`): `classify_stop(StopEvidence, policy) ->
|
|
69
|
+
DanglingVerdict` is a frozen datum in, a frozen verdict out. The boundary reader gathers the
|
|
70
|
+
terminal narration (`claim_extract.assistant_text_from_transcript`) and counts env-authored tool
|
|
71
|
+
results after it AT THE CALL EDGE; the kernel never reads a file, a clock, or a DB.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
from __future__ import annotations
|
|
75
|
+
|
|
76
|
+
import enum
|
|
77
|
+
import re
|
|
78
|
+
from dataclasses import dataclass
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# The typed verdict — two-valued (the EvidenceStance REFUTED/NO_SIGNAL shape).
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
class Dangling(str, enum.Enum):
    """Two-state outcome of the dangling-intent check.

    The `str` mix-in makes every member equal to (and serialisable as) its plain token, so a
    verdict survives a trip through a CLI argument or a JSON document unchanged.

    Members:
        DANGLING_INTENT: the agent's final authored turn announced a committed-future obligation
            and no env-authored tool result landed after it. This is the single actionable
            state; a consumer answers it by re-surfacing the agent's own sentence (WARN). It
            does NOT assert the work is truly incomplete, only that the agent said so and
            then stopped.
        ABSTAIN: the fail-safe zero. Either the terminal turn carries no future-intent marker,
            or a real tool result followed it (a step was named and then carried out), or the
            cue set is empty. An honest no-signal: never a block, always fail-toward-done.
    """

    DANGLING_INTENT = "DANGLING_INTENT"
    ABSTAIN = "ABSTAIN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Print the bare token instead of the default "Dangling.<NAME>" rendering.
        return str(self.value)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# The cue grammar — task-INDEPENDENT first-person-future-intent markers.
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# Each cue is a regex matched casefold against the terminal narration. They key ONLY on the
|
|
107
|
+
# first-person committed-future / unfulfilled-intent ENVELOPE — never a domain noun. This is the
|
|
108
|
+
# difference between a fixed grammar (writable having read zero task prompts) and a planner (per-task
|
|
109
|
+
# prose reasoning). A host may override/extend via `dos.toml [dangling.cues]` (config-as-data); an
|
|
110
|
+
# EMPTY set ABSTAINs everything (fail-toward-done). Kept deliberately conservative: a missed marker
|
|
111
|
+
# is a safe ABSTAIN; the bias is to under-fire (the `arg_provenance` posture).
|
|
112
|
+
# Default first-person future-intent cue regexes. They are searched against the CASEFOLDED tail of
# the terminal narration, so every literal here is lowercase. Three patterns differ from a naive
# spelling on purpose:
#   * the "'ll" contraction attaches directly to "i" ("i'll"), so the separating space belongs
#     inside the "will"/"need"/... alternatives, not before the whole group;
#   * "haven't"/"hadn't" already carry the negation, so they must not be followed by "not";
#   * both the ASCII (') and the typographic (’) apostrophe are accepted in contractions.
DEFAULT_CUES: tuple[str, ...] = (
    r"\bi (?:still )?need to\b",
    r"\bi (?:still )?have to\b",
    r"\bi(?: will|['’]ll) (?:now |then )?(?:need to |have to |proceed to )",
    r"\bnext,? i(?: will|['’]ll| need| have| should)\b",
    r"\bi should (?:now |next )?(?:proceed|continue|identify|create|add|assign|update)\b",
    r"\bi was unable to\b",
    r"\bi (?:(?:have|had) not|haven['’]t|hadn['’]t) (?:yet )?(?:been able|completed|finished|done)\b",
    r"\bi cannot .{0,40}\byet\b",
    r"\b(?:still|yet) to be (?:done|completed|added|assigned|created)\b",
    r"\bremains? to be (?:done|completed|added|assigned)\b",
    r"\bto (?:do|complete|finish) this,? i (?:need|will|have|must)\b",
    r"\bnow,? to\b.{0,40}\bi (?:need|will|must|have to)\b",
)
|
|
126
|
+
|
|
127
|
+
# Words that, when they immediately wrap a cue, mark it as a COMPLETED report, not an open one —
|
|
128
|
+
# "I needed to X, which I have now done" must NOT fire. A conservative negative guard (the cue
|
|
129
|
+
# itself is the primary signal; this only suppresses an obvious past-tense-resolved phrasing).
|
|
130
|
+
# Negative guard: an auxiliary ("have"/"has"/"already"/"now"), an optional "been", then a
# completion participle. Compiled once at import time and matched case-insensitively.
_RESOLVED_GUARD_RE = re.compile(
    r"\b(?:have|has|already|now) (?:been )?(?:done|completed|finished|created|added|assigned|set up)\b",
    flags=re.IGNORECASE,
)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@dataclass(frozen=True)
class DanglingPolicy:
    """Immutable configuration for the dangling-intent check: the cue grammar plus its knobs.

    The matching mechanism lives in the kernel; the cue list is data (the `ProvenancePolicy` /
    `StreamPolicy` seam). The defaults are generic, and a host can declare its own under
    `dos.toml [dangling]`.

    Attributes:
        cues: regexes marking a committed-future intent, matched against casefolded text. An
            EMPTY tuple makes every classification ABSTAIN (the fail-toward-done floor: with no
            cues declared, no accusation is possible).
        tail_chars: how many trailing characters of the terminal narration are scanned. An
            obligation voiced in the MIDDLE of a long turn that then goes on to act is not a
            *terminal* dangle; the signal is "ended ON the admission". 0 means the whole turn.

    Raises:
        ValueError: if `tail_chars` is negative.
    """

    cues: tuple[str, ...] = DEFAULT_CUES
    tail_chars: int = 600

    def __post_init__(self) -> None:
        window = self.tail_chars
        # A negative window is meaningless; 0 is the reserved "scan everything" value.
        if window < 0:
            raise ValueError("tail_chars must be >= 0")


# Shared, immutable policy instance built from the default cues and window.
DEFAULT_POLICY = DanglingPolicy()
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ---------------------------------------------------------------------------
|
|
160
|
+
# Frozen input — the pure datum the boundary gathers and hands in.
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
@dataclass(frozen=True)
class StopEvidence:
    """Immutable input to `classify_stop`, assembled by the CALLER at the stop event (no I/O here).

    Attributes:
        final_turn_text: the agent's LAST authored narration (the terminal `ai_message` /
            `model_response`), read at the boundary by
            `claim_extract.assistant_text_from_transcript`. It is agent-authored, but distrusted
            on the AGAINST-INTEREST axis only.
        results_after_turn: how many env-authored tool `result` entries landed AFTER the terminal
            turn. This is the env-authored corroborator: the gym writes a result only when a tool
            actually executed, so a value above 0 means the agent named a step and then ACTED,
            which yields ABSTAIN rather than a terminal dangle. Defaults to 0, the common stop
            case where the last turn is narration with nothing after it.

    Raises:
        ValueError: if `results_after_turn` is negative.
    """

    final_turn_text: str
    results_after_turn: int = 0

    def __post_init__(self) -> None:
        count = self.results_after_turn
        # A count of results cannot be below zero; reject it at construction time.
        if count < 0:
            raise ValueError("results_after_turn must be >= 0")
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
# Frozen verdict — advisory only.
|
|
187
|
+
# ---------------------------------------------------------------------------
|
|
188
|
+
@dataclass(frozen=True)
class DanglingVerdict:
    """Immutable result of `classify_stop`: the typed state plus the text needed for a WARN.

    Attributes:
        verdict: the `Dangling` state that was reached.
        matched_cue: the marker text that fired (the offending substring), so a consumer's WARN
            can quote the agent's OWN words back ("your final message says: '<…>' — and no tool
            ran after"). Empty when nothing fired.
        reason: a one-line summary for the operator.

    The verdict is advisory: it reports and never blocks the stop.
    """

    verdict: Dangling
    matched_cue: str
    reason: str

    @property
    def is_dangling(self) -> bool:
        """True only when the verdict is the actionable `DANGLING_INTENT` state."""
        return Dangling.DANGLING_INTENT is self.verdict

    def to_dict(self) -> dict:
        """Return a plain mapping with the verdict flattened to its string token."""
        return dict(
            verdict=self.verdict.value,
            matched_cue=self.matched_cue,
            reason=self.reason,
        )
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# ---------------------------------------------------------------------------
|
|
214
|
+
# The pure verdict.
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
def _find_cue(text: str, policy: DanglingPolicy) -> str:
    """Return the first unresolved committed-future-intent cue in the scanned tail of *text*.

    Cues are tried in `policy.cues` order against the casefolded tail (the last
    `policy.tail_chars` characters, or the whole text when that is 0). A match is discarded when
    a resolved-guard phrase appears in the cue's own sentence (named-it-then-did-it in one turn).

    Args:
        text: the terminal narration.
        policy: supplies `cues` (regex strings) and `tail_chars` (scan window, 0 = everything).

    Returns:
        The matched marker exactly as it appears in *text* (original casing, stripped), or ""
        when no cue survives. An empty cue set always yields "".

    A cue that is not a valid regex is skipped rather than raised: the check is advisory and
    fails toward "no signal".
    """
    if not policy.cues:
        return ""
    scan = text if policy.tail_chars == 0 else text[-policy.tail_chars:]
    low = scan.casefold()

    # casefold() can LENGTHEN text ("ß" -> "ss"), so an offset found in `low` is not always a
    # valid offset into `scan`. Folding is per character and never shortens, so equal lengths
    # mean the offsets line up; otherwise build a folded-offset -> original-offset table.
    if len(low) == len(scan):
        origin = None
    else:
        origin = [i for i, ch in enumerate(scan) for _ in ch.casefold()]
        origin.append(len(scan))  # sentinel so an offset at the very end still maps

    for cue in policy.cues:
        try:
            m = re.search(cue, low)
        except re.error:
            # A malformed host-supplied cue must not crash an advisory check.
            continue
        if not m:
            continue
        s, e = m.start(), m.end()
        if origin is not None:
            # Translate the span back into `scan` coordinates (end is exclusive).
            e = origin[e - 1] + 1 if e > s else origin[e]
            s = origin[s]
        # The resolved-guard is checked ONLY within the cue's OWN sentence — clipped at the nearest
        # sentence terminator on each side. This is the "I needed to X, which I have now done"
        # same-clause shape; it must NOT reach back into a PRIOR completed sentence ("the group has
        # been created. Now I need to allocate…") and wrongly suppress a genuine LATER dangle.
        sent_start = max(scan.rfind(c, 0, s) for c in ".!?\n") + 1
        following = [p for p in (scan.find(c, e) for c in ".!?\n") if p >= 0]
        sent_end = min(following) + 1 if following else len(scan)
        if _RESOLVED_GUARD_RE.search(scan[sent_start:sent_end]):
            continue
        return scan[s:e].strip()
    return ""
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def classify_stop(
    ev: StopEvidence, policy: DanglingPolicy = DEFAULT_POLICY
) -> DanglingVerdict:
    """Decide whether the agent stopped immediately after admitting unfinished work. PURE — no I/O.

    Checks run in this order:
      1. ABSTAIN when `ev.results_after_turn > 0`: a real tool result landed AFTER the terminal
         turn, so the agent named a step and then ACTED. The env-authored corroborator is
         consulted first because it is the non-forgeable byte that removes the
         named-it-then-did-it false positive.
      2. DANGLING_INTENT when a cue fires in the terminal narration and nothing executed after:
         the agent's own last words admit an open obligation and the run stopped. This is the
         one actionable outcome (an advisory WARN).
      3. ABSTAIN otherwise: no committed-future-intent cue was found (or the cue set is empty).
         The fail-toward-done floor.

    Args:
        ev: the evidence gathered by the caller at the stop event.
        policy: the cue grammar and scan window; defaults to `DEFAULT_POLICY`.

    Returns:
        A `DanglingVerdict`. It only REPORTS; the consumer re-surfaces the agent's own sentence
        (never a directive, never a forced continue).
    """
    # Env-authored corroborator first: acted-after means this is not a terminal dangle.
    if ev.results_after_turn > 0:
        acted_reason = (
            f"{ev.results_after_turn} tool result(s) landed after the terminal turn — the "
            f"agent named a step and then acted, not a dangling stop"
        )
        return DanglingVerdict(verdict=Dangling.ABSTAIN, matched_cue="", reason=acted_reason)

    # A missing narration is treated as empty text, which can never match a cue.
    cue = _find_cue(ev.final_turn_text or "", policy)
    if cue:
        open_reason = (
            f"the terminal turn admits an open obligation ({cue!r}) and no tool ran after — the "
            f"agent stopped right after saying it still had work (an admission against interest)"
        )
        return DanglingVerdict(
            verdict=Dangling.DANGLING_INTENT, matched_cue=cue, reason=open_reason
        )

    return DanglingVerdict(
        verdict=Dangling.ABSTAIN,
        matched_cue="",
        reason="no committed-future-intent marker in the terminal turn — clean stop (or no cues)",
    )
|