dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/loop_decide.py ADDED
@@ -0,0 +1,1746 @@
1
+ """OC2 — the /dispatch-loop control-flow decision surface (the "one small thing").
2
+
3
+ `/dispatch-loop`'s SKILL.md is ~1400 lines and ~80 steps; the load-bearing
4
+ question — *under what exact conditions does this loop stop?* — was answerable
5
+ only by reading Step 3, Step 3.5, and Step 4 together (~210 lines of prose state
6
+ transitions). OC2 pulls that loop-level decision into one pure, testable
7
+ function so a reader can hold the control flow in their head and verify the stop
8
+ conditions without the whole file.
9
+
10
+ This module is the loop-level layer **above** `gate_classify.gate_policy`:
11
+
12
+ gate_classify.classify_packet → one packet → one typed Verdict
13
+ gate_classify.gate_policy → (Verdict, --gate mode) → one GateAction
14
+ loop_decide.decide → (LoopState, IterationOutcome) → one LoopDecision
15
+ ^ THIS module — composes gate_policy, adds
16
+ the counters/streaks/cap the loop carries
17
+
18
+ `decide()` consumes one iteration's typed outcome plus the carried working-
19
+ context counters and returns exactly one decision: continue (with the next mode)
20
+ or stop (with a named reason). It is **pure** — no subprocess, no file or git
21
+ I/O — for the same reason `gate_policy` is: the loop's stop conditions can be
22
+ replay-tested in isolation, away from everything that makes a live /dispatch
23
+ iteration cost $10-40.
24
+
25
+ The five stop conditions, in one place (the whole point of this module):
26
+
27
+ 1. ITERATION_CAP — iteration count reached `max_iterations` (default 10; raised from 5).
28
+ 2. DRAINED_TWICE — a DRAIN verdict on the /dispatch immediately after a
29
+ **productive** /replan that itself followed a DRAIN.
30
+ /replan tried to refill and could not; the
31
+ lane/portfolio is genuinely exhausted. (hard gate only —
32
+ soft/drive stop on the first DRAIN.) FQ-240: an
33
+ *unproductive* /replan (0 gardening / 0 refill, e.g. the
34
+ §1.5 no-op skip) does NOT arm this trigger — it never
35
+ actually attempted a refill, so a DRAIN after it is not
36
+ "drained twice".
37
+ 3. CONSECUTIVE_UNCLEAR — `consecutive_unclear` reached `max_unclear` (default
38
+ 3). The iteration subprocess is failing systematically,
39
+ not draining a backlog.
40
+ 4. RATE_LIMITED — a usage/rate-limit rejection. Every retry would fail the
41
+ same way until the window resets; do not burn launches.
42
+ 5. LAUNCH_FAILED — the iteration subprocess never produced a valid init
43
+ envelope. A repeating launch failure would burn all
44
+ remaining slots.
45
+
46
+ Plus the soft/drive gate-policy stops (a true DRAIN or a BLOCKED under
47
+ soft/drive), which `decide()` reads straight off `gate_policy`'s GateAction
48
+ rather than re-encoding.
49
+
50
+ ⚓ Mechanical contract over prose ([[feedback_mechanical_contract_over_prose]]):
51
+ the loop's stop/continue/replan decision is now a mechanism (this function),
52
+ not ~80 steps of prose a downstream model is trusted to apply consistently.
53
+
54
+ ⚓ Typed verdict over binary gate ([[feedback_typed_verdict_over_binary_gate]]):
55
+ `decide()` composes the existing typed `gate_policy` rather than re-classifying;
56
+ the loop-level counters (drained-twice, unclear streak) are the part this layer
57
+ adds on top.
58
+
59
+ The wait-marker budget (`wait_marker_budget`) is the OC2 billing addendum: every
60
+ `claude -p` keep-alive marker is its own assistant turn that replays the full
61
+ context out of cache (~$0.03-0.10 each; session 4b4ff97c burned 252 markers /
62
+ ~$7.80 in one run). The post-hoc `keepalive_poll` flag in
63
+ `scripts/headless_telemetry.py` *names* the spend at >=5 markers; this function
64
+ is the *runtime* lever — the loop can refuse a marker that won't earn its
65
+ cache-read cost before it is emitted.
66
+ """
67
+
68
+ from __future__ import annotations
69
+
70
+ import enum
71
+ from dataclasses import dataclass, replace
72
+ from typing import Optional
73
+
74
+ from dos import breaker
75
+ from dos.gate_classify import (
76
+ GATE_HARD,
77
+ GATE_MODES,
78
+ GateAction,
79
+ ReplanProductivity,
80
+ Verdict,
81
+ gate_policy,
82
+ )
83
+ from dos.liveness import Liveness
84
+ from dos.completion import (
85
+ Completion,
86
+ CompletionVerdict,
87
+ ConvergenceVerdict,
88
+ )
89
+ from dos.pickable import Pickability
90
+ from dos.cooldown import Cooldown
91
+ from dos.tokens import blocked_reason_for_key
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Iteration outcome — the typed result of ONE /dispatch-loop iteration.
96
+ #
97
+ # This is exactly what Step 3's gate-detection grep already prints:
98
+ # `SHIPPED verdict=LIVE`, `GATE verdict=<DRAIN|STALE-STAMP|BLOCKED>`, `INTERIM`,
99
+ # `UNCLEAR`, `RATE_LIMITED`. `OutcomeKind` names those, and `IterationOutcome`
100
+ # carries the GATE verdict alongside the kind so `decide()` can route a GATE
101
+ # through `gate_policy` without re-parsing prose.
102
+ # ---------------------------------------------------------------------------
103
+
104
+
105
class OutcomeKind(str, enum.Enum):
    """Exit classification of a single /dispatch-loop iteration.

    Every member's value is the literal token Step 3's grep emits for that
    kind of exit. Being `str`-valued, a member round-trips through the grep's
    stdout token without a lookup table (the same design as
    `gate_classify.Verdict`).
    """

    # /dispatch shipped picks (child2 ran).
    SHIPPED = "SHIPPED"
    # /dispatch reached Step 9 with child2 skipped.
    GATE = "GATE"
    # A /replan iteration completed (any outcome).
    REPLAN_DONE = "REPLAN_DONE"
    # Crashed/killed before Step 9, or INTERIM.
    UNCLEAR = "UNCLEAR"
    # Usage/rate-limit rejection — not a fault.
    RATE_LIMITED = "RATE_LIMITED"
    # Transient 529 server overload — retryable with backoff.
    OVERLOADED = "OVERLOADED"
    # No valid init envelope — the iteration never started.
    LAUNCH_FAILED = "LAUNCH_FAILED"

    def __str__(self) -> str:  # pragma: no cover - trivial
        """Render as the bare token instead of ``OutcomeKind.<NAME>``."""
        return self.value
122
+
123
+
124
class DescendantProgress(str, enum.Enum):
    """FQ-509 — forward-progress verdict for a parked parent's own descendant.

    Subject: this is the loop-level companion to `liveness`, but it answers a
    different question. `liveness`/`Liveness` asks "is THIS run advancing";
    this enum asks "did the headless `-p` orchestrator launched by this
    iteration PARK while a descendant it spawned is still committing the
    registered picks?"

    Why it exists: a parent `/dispatch` that ends its turn early (the
    self-park invariant) leaves descendants working in their own (detached)
    trees, and their commits land seconds-to-minutes LATER. The driver's
    ancestry check fires the instant the parent `-p` exits, sees 0 committed
    picks, and the iteration collapses to UNCLEAR. That UNCLEAR charges the
    `consecutive_unclear` breaker, so a parent merely parked over a HEALTHY
    committing descendant is counted as a systematic FAULT: the loop
    self-stops with CONSECUTIVE_UNCLEAR after `max_unclear` such iterations,
    and it re-dispatches a fresh child each time instead of waiting for the
    live one.

    Why "PROGRESS" and not "liveness": a child-stall probe reports a child
    ALIVE whenever its log was touched inside the quiet window (~10 min), so a
    grandchild REAPED seconds ago at parent-exit still reads "alive" for ten
    minutes (a corpse). This enum's contract is FORWARD DELTA only. The host
    maps the child-stall facts onto it and MUST collapse a
    log-touched-but-no-commit "alive" to `NONE_OBSERVED`. It may map
    `ADVANCING` ONLY on a real forward delta: HEAD advanced since the
    iteration's start SHA (`new_commit`), OR the ancestry-backed CHURNING
    verdict (all registered picks already ancestors of HEAD). That
    corpse-guard is what keeps the adopt-wait from waiting on a dead child.

    Members:
        ADVANCING     — the descendant landed a forward delta (new commit
                        since start, or all picks already shipped/churning).
                        A parked-but-PRODUCTIVE child: the UNCLEAR is not a
                        fault, so adopt-wait.
        DEAD          — the descendant is genuinely dead (no log growth AND no
                        new commit). Exactly today's behavior: the honest
                        UNCLEAR stop.
        NONE_OBSERVED — no forward-progress signal (no own descendant, no
                        ancestry window, or a log-touched-but-not-committing
                        "alive" corpse). Treated identically to `None`, the
                        un-migrated default.
    """

    ADVANCING = "advancing"
    DEAD = "dead"
    NONE_OBSERVED = "none-observed"

    def __str__(self) -> str:  # pragma: no cover - trivial
        """Render as the bare value instead of ``DescendantProgress.<NAME>``."""
        return self.value
168
+
169
+
170
@dataclass(frozen=True)
class IterationOutcome:
    """Typed result of ONE /dispatch-loop iteration, as Step 3 produces it.

    The instance is frozen. `__post_init__` rejects any field that is set on
    a kind it does not belong to, so an invalid combination fails at
    construction time with `ValueError`.

    Fields:

    kind
        The Step-3 grep token.

    verdict
        Set ONLY for `kind=GATE`: the typed `Verdict` taken from the
        structural `verdict=<X>` token in /dispatch's Step 9 archive subject
        (QWB8). None for every other kind — a SHIPPED iteration has no gate
        verdict, and an UNCLEAR one never reached the gate.

    replan_productivity
        The FQ-240 signal, set ONLY for `kind=REPLAN_DONE`: the typed
        `ReplanProductivity` verdict from
        `gate_classify.classify_replan_productivity` over the /replan
        iteration's terminal result text. It keeps the drained-twice rule
        honest — a DRAIN is only "drained twice" if the /replan between the
        two DRAINs was PRODUCTIVE (a genuine refill attempt). When it is None
        on a REPLAN_DONE outcome (the caller did not classify), `decide()`
        defaults it to PRODUCTIVE, the conservative pre-FQ-240 behavior.

    packet_judge / ship_count
        `packet_judge` is the PJ2 stage-3 verdict from
        `scripts/packet_verdict.py classify` (`SHIPPED-CLEAN` /
        `SHIPPED-DIRTY` / `STALLED` / `BLOCKED-OUTCOME`), set ONLY for
        `kind=SHIPPED`. `ship_count` is the measured ship-count from the same
        classify evidence and is required whenever `packet_judge` is set. The
        pair drives the SHIPPED-DIRTY-0 breaker: a SHIPPED iteration with
        packet_judge=`SHIPPED-DIRTY` AND ship_count==0 is the
        degraded-shipping signal the breaker counts; any other SHIPPED
        outcome resets the streak.

    measurement_expected
        The FQ-420 distrust flag, valid ONLY for `kind=SHIPPED`. A SHIPPED
        token is a *self-report* ("/dispatch says it shipped picks"); the PJ2
        packet-judge is the kernel's independent measurement of that claim
        against the post-fanout commit set. When the driver INTENDED to
        measure but could not resolve the fanout run-ts (`packet_judge` came
        back None on a head==SHIPPED iteration), the measurement is MISSING,
        not absent-by-design — and a missing measurement on a claimed ship is
        exactly the lie the kernel exists to refuse. `True` asserts "a
        measurement was owed here"; `decide()` then STALLs the loop with
        `UNMEASURED_SHIPPED` rather than taking the conservative healthy
        path, so a null-on-SHIPPED can never silently pass `continue`. The
        default `False` preserves un-migrated-caller behavior: a caller with
        no PJ2 stage at all still gets the pre-FQ-420 conservative healthy
        path when it omits `packet_judge` — the kernel only distrusts a
        SHIPPED whose owner SAID it would measure it. Supplying
        `packet_judge` whenever `measurement_expected=True` AND the iteration
        is healthy is the caller's contract; the kernel reads the *absence*
        of the judge under an expectation as the STALL signal.

    blocked_cause
        The classified `dos.tokens.BlockedReason` key for a GATE BLOCKED —
        the canonical cause the driver mined from the Outcome cell (via
        `unstick_audit.classify_cause`). Valid ONLY for `kind=GATE` with
        `verdict=BLOCKED`. It lets `decide()` tell a *re-dispatch-curable*
        BLOCKED (a stale-stamp / refill drift a `/replan` clears — counts
        toward the FQ-452 spin-breaker and routes to /replan as before) from
        a *re-dispatch-INVARIANT* BLOCKED (an operator-decision, a false-ship
        oracle conflation — a reason whose
        `BLOCKED_REASONS[cause].self_heals_via` is NOT `/replan`). An
        invariant BLOCKED re-blocks identically on every re-dispatch, so
        spinning it through /replan up to the FQ-452 cap (3 iters) is pure
        churn; `decide()` honest-STOPs on the FIRST such BLOCKED instead (the
        post-run analogue of the pre-launch `PICK_HELD_INVARIANT` rung).
        None (an un-migrated caller, or a BLOCKED whose cause the driver
        could not classify) preserves today's behavior exactly — the FQ-452
        spin-breaker still bounds the churn at 3.

    Raises:
        ValueError: if a field is set on a kind it does not belong to, if a
            GATE outcome lacks a verdict, or if only one of
            `packet_judge` / `ship_count` is provided.
    """

    kind: OutcomeKind
    verdict: Optional[Verdict] = None
    replan_productivity: Optional[ReplanProductivity] = None
    packet_judge: Optional[str] = None
    ship_count: Optional[int] = None
    measurement_expected: bool = False
    blocked_cause: Optional[str] = None

    def __post_init__(self) -> None:
        """Enforce the per-kind field contract; raise `ValueError` on a breach.

        The checks run in a fixed order, so an outcome that breaks several
        rules always reports the same (first) one.
        """
        kind = self.kind
        is_gate = kind is OutcomeKind.GATE
        is_shipped = kind is OutcomeKind.SHIPPED
        has_verdict = self.verdict is not None
        has_judge = self.packet_judge is not None
        has_ship_count = self.ship_count is not None

        # A gate verdict exists if and only if the iteration is a GATE.
        if is_gate and not has_verdict:
            raise ValueError(
                "a GATE outcome must carry a typed verdict "
                "(the verdict=<X> token from /dispatch's Step 9 archive subject)"
            )
        if has_verdict and not is_gate:
            raise ValueError(
                f"a {kind} outcome must not carry a verdict "
                "(only a GATE iteration has a gate verdict)"
            )

        # Only a /replan iteration can have a productivity classification.
        if self.replan_productivity is not None and kind is not OutcomeKind.REPLAN_DONE:
            raise ValueError(
                f"a {kind} outcome must not carry a replan_productivity "
                "verdict (only a REPLAN_DONE iteration is a /replan)"
            )

        # The packet-judge pair belongs to SHIPPED only, and travels together.
        if (has_judge or has_ship_count) and not is_shipped:
            raise ValueError(
                f"a {kind} outcome must not carry packet_judge/ship_count "
                "(only a SHIPPED iteration has a packet-outcome verdict)"
            )
        if has_judge != has_ship_count:
            raise ValueError(
                "packet_judge and ship_count must be set together "
                "(both required when present on a SHIPPED outcome)"
            )

        # Only a SHIPPED iteration can owe a measurement.
        if self.measurement_expected and not is_shipped:
            raise ValueError(
                f"a {kind} outcome must not set measurement_expected "
                "(only a SHIPPED iteration owes a packet-judge measurement)"
            )

        # A blocked cause is meaningful only on a GATE whose verdict is BLOCKED.
        if self.blocked_cause is not None and not (
            is_gate and self.verdict is Verdict.BLOCKED
        ):
            raise ValueError(
                f"a {kind} outcome (verdict={self.verdict}) must not carry "
                "blocked_cause (only a GATE BLOCKED iteration has a blocked cause)"
            )
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Loop state — the carried working context.
289
+ #
290
+ # These are the per-loop counters Step 0/Step 3/Step 4 thread through working
291
+ # context. Holding them in one frozen dataclass — and transitioning them in one
292
+ # function — is what makes the loop's control flow inspectable: a reader checks
293
+ # the five stop conditions against these fields, not against scattered prose.
294
+ # ---------------------------------------------------------------------------
295
+
296
+
297
+ @dataclass(frozen=True)
298
+ class LoopState:
299
+ """The /dispatch-loop working-context counters `decide()` transitions.
300
+
301
+ Fields (the loop-level carry-over — `SCOPE`/`LANE` are not here because they
302
+ are constant for the whole loop and never drive a stop decision):
303
+
304
+ iteration — 1-based count of the iteration that just ran.
305
+ last_replan_drained — True iff the immediately-prior iteration was a
306
+ **productive** /replan that followed a DRAIN. The
307
+ drained-twice trigger: a DRAIN on the /dispatch
308
+ *after* such a /replan means /replan tried to refill
309
+ and could not. FQ-240: an unproductive /replan (0
310
+ gardening / 0 refill) leaves this False — it was not
311
+ a refill attempt.
312
+ consecutive_unclear — back-to-back UNCLEAR streak; the circuit breaker.
313
+ consecutive_dirty_zero — back-to-back SHIPPED-DIRTY iters where the
314
+ measured ship-count was 0. The breaker that pairs
315
+ with the cap-10 raise: catches a /dispatch
316
+ regression that ships apparently-successful but
317
+ actually-empty iters indefinitely (input gate
318
+ says LIVE, packet-judge says SHIPPED-DIRTY, 0
319
+ commits land). Reset on any SHIPPED-CLEAN /
320
+ GATE / REPLAN_DONE outcome.
321
+ gate_mode — the --gate policy (hard|soft|drive), constant for
322
+ the loop; passed straight to `gate_policy`.
323
+ max_iterations — the hard cap (10; no override flag in the SKILL).
324
+ Raised from 5 in the 2026-05-22 cap raise — see
325
+ the SKILL's Contract section for the named
326
+ damage-bound rationale (the degraded-shipping
327
+ scenario the SHIPPED-DIRTY-0 breaker now kills).
328
+ max_unclear — the circuit-breaker threshold (3).
329
+ max_dirty_zero — the SHIPPED-DIRTY-0 breaker threshold (3).
330
+ Sized to detect a sustained degraded-shipping
331
+ regression while tolerating one-off DIRTY-0 iters
332
+ that may recover on the next /dispatch.
333
+ consecutive_stale_stamp — FQ-452: back-to-back GATE iterations whose
334
+ verdict was STALE-STAMP or BLOCKED and routed to
335
+ /replan WITHOUT the lane recovering. The
336
+ non-converging-spin breaker: a plan-meta `remaining:`
337
+ list naming already-shipped phases makes the picker
338
+ re-derive 0-live → GATE BLOCKED → /replan → (the
339
+ §1.5 skip-gate keys on new_findings/substantive_ships,
340
+ not stale-stamp drift, so /replan exits UNPRODUCTIVE
341
+ without reconciling the list) → BLOCKED again, forever.
342
+ The streak SURVIVES the intervening REPLAN_DONE on
343
+ purpose (the /replan is the *response* to the
344
+ stale-stamp; if it didn't fix it, the next /dispatch
345
+ stale-stamps again and the streak must continue) — it
346
+ resets only on a SHIPPED iteration or a DIFFERENT gate
347
+ verdict (LIVE/DRAIN/RACE). On the Kth consecutive
348
+ instance `decide()` STOPs with
349
+ STALE_STAMP_UNRECONCILED + surface so the loop refuses
350
+ to spin a (K+1)th /replan into the same unreconciled
351
+ list; the caller (driver) names the
352
+ `plan-meta-gardening:<series>` actuation the operator
353
+ /replan must run (the kernel is pure + series-blind).
354
+ max_stale_stamp — the FQ-452 spin-breaker threshold (3). One
355
+ stale-stamp gate routes to /replan normally (the
356
+ gardening sweep usually clears it); three in a row
357
+ without recovery means /replan is structurally NOT
358
+ reconciling the list and another iteration would
359
+ just re-spin.
360
+ liveness — the OPTIONAL in-flight `Liveness` verdict
361
+ (ADVANCING/SPINNING/STALLED) the caller gathered via
362
+ `dos liveness` for THIS run over the interval since it
363
+ started (docs/99 / docs/82 Phase-3a). It lives here,
364
+ not on `IterationOutcome`, because liveness is a
365
+ property of the run *across the interval* — carried
366
+ context, like `gate_mode` — not of one iteration's
367
+ exit token. `decide()` STOPs the loop with
368
+ `StopReason.SPINNING` when this is `SPINNING`: a
369
+ ground-truth anti-spin breaker that complements the
370
+ self-report breakers (`consecutive_dirty_zero` et al.)
371
+ by reading git/journal, not the caller's outcome
372
+ token. **Opt-in**: `None` (the default) means the
373
+ caller did not gather a verdict, and `decide()` is
374
+ then BYTE-IDENTICAL to the pre-3a behavior — the same
375
+ conservative-default discipline as
376
+ `IterationOutcome.measurement_expected=False`. ADVANCING
377
+ and STALLED never stop the loop here: ADVANCING is the
378
+ benign verdict, and STALLED ("dead/hung") is the
379
+ SUPERVISOR's reap input (`supervise.py`), not a live
380
+ loop's self-stop — a loop making decisions is by
381
+ construction alive, so STALLED reaching `decide()` is
382
+ degenerate and mapping it would duplicate the
383
+ supervisor's job and blur the alive-vs-dead line.
384
+ (`Liveness` is a SIBLING kernel import — `liveness` is
385
+ `loop_decide`'s sibling per CLAUDE.md; the litmus is
386
+ "no host, no I/O", not "no sibling import", and
387
+ `loop_decide` stays pure: it READS a verdict value,
388
+ never computes one.)
389
+ completion — the OPTIONAL in-flight `CompletionVerdict` (docs/117
390
+ §5.4 / Phase 3) the caller gathered for THIS run after
391
+ the iteration: it ran `completion.classify` over the
392
+ run's `LedgerState` + freshly-read `AncestryFacts` (the
393
+ same git read `resume`'s evidence-gather does) and
394
+ handed the result in. Like `liveness` it is in-flight
395
+ EVIDENCE, not carried counter state — it lives here
396
+ because `decide()` is pure and may not read git itself.
397
+ `decide()` STOPs with `StopReason.COMPLETE` when this is
398
+ `COMPLETE` (every declared unit verified on the
399
+ non-forgeable rung → the work is *finished*, the first
400
+ non-give-up terminal) and with `StopReason.THRASHING`
401
+ when it is `UNDERDECLARED` (done-but-under-declared; a
402
+ human must reconcile → surface). INCOMPLETE and
403
+ INDETERMINATE never stop here: INCOMPLETE means the loop
404
+ should *continue* re-dispatching the residual (the
405
+ caller owns that), and INDETERMINATE means "can't tell"
406
+ — we never *assert* done on an unsound fold, so it falls
407
+ through to the existing logic. **Opt-in**: `None` (the
408
+ default) means the caller gathered no verdict and
409
+ `decide()` is BYTE-IDENTICAL to the pre-Phase-3 loop —
410
+ the same conservative default as `liveness`.
411
+ pickability — the OPTIONAL pre-dispatch `Pickability` verdict
412
+ (docs/168 §5) the caller gathered for the lane it would
413
+ dispatch NEXT: it ran `pickable.classify` over the
414
+ host-gathered unit state and handed the result in. Like
415
+ `liveness`/`completion` it is in-flight EVIDENCE, not
416
+ carried counter state — `decide()` is pure and may not
417
+ read the plan class / soak index / live claims itself.
418
+ `decide()` STOPs with `StopReason.PICK_HELD_INVARIANT`
419
+ when this verdict `is_redispatch_invariant` (the lane is
420
+ held ONLY by a reason a re-dispatch cannot change —
421
+ DRAFT_CLASS / OPERATOR_GATED / SOAK_OPEN /
422
+ DEPENDENCY_UNMET): re-dispatching it would re-block
423
+ identically, so the loop honest-STOPs and surfaces the
424
+ typed hold for routing (DRAFT→/promote, OPERATOR→
425
+ escalate a decision, SOAK→wait) instead of spinning. This
426
+ converts the per-run human "honest STOP" override
427
+ (documented across a dozen drain-trap run READMEs — ASI
428
+ #475, RTN soak, FMP #493) into a kernel rule. An
429
+ OFFERABLE verdict, or a HELD verdict whose reason is
430
+ re-dispatch-CURABLE (IN_FLIGHT / SOFT_CLAIMED_ELSEWHERE /
431
+ STALE_CLAIM / COOLDOWN / SHIPPED / UNPARSEABLE), never
432
+ stops here — those CAN clear, so the loop keeps its
433
+ existing behavior. **Opt-in**: `None` (the default)
434
+ skips the rung entirely → BYTE-IDENTICAL to the
435
+ pre-docs/168 loop, the same conservative default as
436
+ `liveness` / `completion`.
437
+ cooldown — the OPTIONAL anti-churn `Cooldown` verdict (docs/207 §3)
438
+ the caller gathered for the unit it would dispatch NEXT,
439
+ AFTER it already skipped every fresher candidate: it ran
440
+ `cooldown.cooldown_verdict` over the unit's `OP_ATTEMPT`
441
+ history and handed the result in. `decide()` STOPs with
442
+ `StopReason.PICK_COOLDOWN` when this verdict is
443
+ `RECENTLY_ATTEMPTED` — the unit was attempted-and-didn't-
444
+ move inside the window AND (by the host's pick-selection
445
+ contract) nothing fresher is offerable, so re-dispatching
446
+ it would re-storm (the ~5%-shipping re-pick storm the bare
447
+ loop hit). This is the cross-run memory `liveness` (a
448
+ single-run verdict) cannot provide. A `CLEAR` verdict
449
+ never stops — the window elapsed or nothing held it. Like
450
+ `pickability` it is in-flight EVIDENCE, not carried state
451
+ — `decide()` is pure and may not read the journal. The
452
+ host's contract: only hand a `RECENTLY_ATTEMPTED` here
453
+ once it has ALREADY skipped the offerable-and-not-cooled
454
+ units (the skip-to-next is pick-selection's job; the STOP
455
+ is the all-cooled terminal). **Opt-in**: `None` skips it
456
+ → byte-identical to the pre-docs/207 loop.
457
+ convergence — the OPTIONAL in-flight `ConvergenceVerdict` (docs/117
458
+ §5.2 / Phase 3) over the residual-size history: the
459
+ DYNAMIC companion to `completion`. `COMPLETE` is a
460
+ static fixpoint (residual empty *now*); this catches the
461
+ *won't-ever-get-there* loop (the residual churns but
462
+ never empties — the reviewer-finds-new-findings case).
463
+ `decide()` STOPs with `StopReason.THRASHING` (surface)
464
+ when this verdict `should_surface` (THRASHING or
465
+ STARVED). CONVERGING / INSUFFICIENT never stop — the
466
+ loop keeps going (no fixpoint reached *yet* is not a
467
+ stop). Checked only when `completion` did not already
468
+ stop the loop COMPLETE — a converged run is done, not
469
+ thrashing. **Opt-in**: `None` skips the rung entirely.
470
+ descendant_progress — FQ-509: the OPTIONAL `DescendantProgress` verdict for
471
+ THIS iteration's own parked descendant (the headless
472
+ `-p` child the iteration launched that PARKED while a
473
+ grandchild it spawned is still committing). In-flight
474
+ EVIDENCE the caller re-gathers each iteration from the
475
+ child-stall probe (NOT carried state — cleared up-front
476
+ like `liveness`). `decide()` reads it ONLY inside the
477
+ UNCLEAR rung: when it is `ADVANCING` (the descendant
478
+ landed a forward delta — new commit since start, or all
479
+ picks already ancestors), the UNCLEAR is a parked-but-
480
+ PRODUCTIVE child, NOT a /dispatch fault — so `decide()`
481
+ CONTINUEs (re-dispatch / adopt-wait for the live child
482
+ to land its picks) WITHOUT charging the
483
+ `consecutive_unclear` breaker, bounded by
484
+ `consecutive_adopt_wait`. `DEAD` / `NONE_OBSERVED` /
485
+ `None` all take today's exact UNCLEAR path — the host's
486
+ corpse-guard (a log-touched-but-not-committing "alive"
487
+ must map to `NONE_OBSERVED`, never `ADVANCING`) is what
488
+ keeps a reaped descendant from ever adopt-waiting.
489
+ **Opt-in**: `None` (the default) skips the pre-check →
490
+ BYTE-IDENTICAL to the pre-FQ-509 loop, the same
491
+ conservative default as `liveness`/`pickability`.
492
+ consecutive_adopt_wait — the carried bound for the `descendant_progress`
493
+ adopt-wait. Back-to-back UNCLEAR iters where the
494
+ descendant read `ADVANCING` but STILL had not landed
495
+ the registered picks. Bumped on each ADVANCING adopt-
496
+ wait continue; reset to 0 on any non-ADVANCING UNCLEAR
497
+ iter (so a flapping child cannot accrue it) AND
498
+ implicitly on any non-UNCLEAR outcome. On the Kth
499
+ (`max_adopt_wait`) the adopt-wait rung FALLS THROUGH to
500
+ today's UNCLEAR breaker path (which itself caps at
501
+ `max_unclear`) — a clock-free bound that degrades to
502
+ current behavior rather than a new terminal. UNLIKE
503
+ `consecutive_unclear`, this IS reset on the non-
504
+ advancing branch, but `consecutive_unclear` is NOT
505
+ reset there — so a flapping ALIVE/quiet child still
506
+ reaches `max_unclear` and stops. CARRIED state (it must
507
+ round-trip through the driver's next_state).
508
+ max_adopt_wait — the adopt-wait bound (default 2). Two consecutive
509
+ ADVANCING-but-uncommitted iters is enough evidence the
510
+ descendant is not actually about to land its picks
511
+ (or its "advance" is unrelated drift); fall through to
512
+ the UNCLEAR breaker rather than wait a 3rd.
513
+ consecutive_unproductive_replan_drains — FQ-509-sibling (QWD benign-drain).
514
+ Back-to-back UNPRODUCTIVE /replans, each the response
515
+ to a DRAIN, on the same lane. The drained-twice rung
516
+ (`last_replan_drained`) only arms off a PRODUCTIVE
517
+ /replan (FQ-240) — but a BENIGN genuinely-drained lane
518
+ (every phase already shipped/in-flight, nothing left to
519
+ refill) returns UNPRODUCTIVE from every /replan, so
520
+ drained-twice never arms and the loop spins
521
+ DRAIN→/replan→DRAIN→/replan to the iteration cap. This
522
+ counter catches that: incremented in 5b when a
523
+ REPLAN_DONE is UNPRODUCTIVE *and* the immediately-prior
524
+ gate was a DRAIN (`last_gate_was_drain`); reset to 0 on
525
+ any SHIPPED, any PRODUCTIVE /replan, or any non-DRAIN
526
+ gate verdict (the lane moved off the benign-drain
527
+ pattern). On the Kth, the DRAIN that would route the
528
+ (K+1)th /replan instead STOPs with
529
+ `StopReason.BENIGN_DRAIN` — the kernel reaches the
530
+ honest-STOP from typed verdicts the operator otherwise
531
+ has to eyeball (the QWD run-README override). Default 0
532
+ keeps the loop BYTE-IDENTICAL for any lane that ever
533
+ ships or has a productive /replan.
534
+ max_unproductive_replan_drains — the benign-drain breaker threshold (2). Two
535
+ UNPRODUCTIVE /replans around DRAINs without recovery
536
+ means /replan is structurally unable to refill the lane
537
+ (it is benignly drained) and a third would just re-spin.
538
+ Sized to the QWD memory's measured "2 consecutive
539
+ UNPRODUCTIVE replans around DRAINs → honest-STOP".
540
+ consecutive_unproductive_replan — #506 / docs/258: back-to-back UNPRODUCTIVE
541
+ /replans REGARDLESS of the prior gate. The BROADER
542
+ sibling of `consecutive_unproductive_replan_drains`:
543
+ that one counts only unproductive replans BRACKETED by a
544
+ DRAIN (a benignly-drained lane); this one counts EVERY
545
+ unproductive replan, because the measured pathology (#506:
546
+ /replan = 45% of loop wall-clock, 43% of replan iters
547
+ refill nothing) includes a 53-turn replan that produced 0
548
+ refill even though commits had landed — so the gate was
549
+ NOT a DRAIN and the benign-drain bracket deliberately
550
+ skips it (pinned by
551
+ `test_benign_drain_unproductive_replan_without_prior_drain_no_count`).
552
+ Bumped in 5b on an UNPRODUCTIVE REPLAN_DONE (via the
553
+ `dos.breaker` primitive — the FIRST loop_decide counter
554
+ so expressed); reset to 0 on any PRODUCTIVE replan, any
555
+ SHIPPED, or a non-stale gate (the lane moved off the
556
+ stall). On the Kth, `decide()` STOPs with
557
+ `StopReason.REPLAN_STALLED` + surface. **Opt-in**: only an
558
+ UNPRODUCTIVE `REPLAN_DONE` (`outcome.replan_productivity is
559
+ UNPRODUCTIVE`) ever bumps it, and the FQ-240 default treats
560
+ an unclassified replan as PRODUCTIVE — so a caller that
561
+ never classifies replan productivity never feeds this and
562
+ is BYTE-IDENTICAL to the pre-#506 loop, the same
563
+ conservative default as the benign-drain rung.
564
+ max_unproductive_replan — the REPLAN_STALLED threshold (2). #506: "trip on the
565
+ 2nd unproductive `REPLAN_DONE` — a sweep that refilled
566
+ nothing twice won't on a 3rd identical pass." Two
567
+ expensive (16-22min / ~$5) 0-refill replans in a row is
568
+ enough evidence /replan is structurally unproductive on
569
+ this lane right now.
570
+ last_gate_was_drain — internal one-iteration carry: True iff the gate of the
571
+ immediately-prior iteration was a DRAIN that routed to
572
+ /replan. Read+reset in 5b to know a following
573
+ REPLAN_DONE is the response to a DRAIN (the bracket that
574
+ makes an UNPRODUCTIVE /replan count toward the
575
+ benign-drain breaker). Set in 5c on a DRAIN that routes
576
+ to /replan; cleared on any non-DRAIN outcome. Not a
577
+ stop signal on its own.
578
+ """
579
+
580
+ iteration: int = 1
581
+ last_replan_drained: bool = False
582
+ consecutive_unclear: int = 0
583
+ consecutive_dirty_zero: int = 0
584
+ consecutive_overloaded: int = 0
585
+ consecutive_stale_stamp: int = 0
586
+ gate_mode: str = GATE_HARD
587
+ max_iterations: int = 10
588
+ max_unclear: int = 3
589
+ max_dirty_zero: int = 3
590
+ max_overloaded: int = 3
591
+ max_stale_stamp: int = 3
592
+ consecutive_unproductive_replan_drains: int = 0
593
+ max_unproductive_replan_drains: int = 2
594
+ consecutive_unproductive_replan: int = 0
595
+ max_unproductive_replan: int = 2
596
+ last_gate_was_drain: bool = False
597
+ liveness: Optional[Liveness] = None
598
+ completion: Optional[CompletionVerdict] = None
599
+ convergence: Optional[ConvergenceVerdict] = None
600
+ pickability: Optional[Pickability] = None
601
+ cooldown: Optional[Cooldown] = None
602
+ descendant_progress: Optional[DescendantProgress] = None
603
+ consecutive_adopt_wait: int = 0
604
+ max_adopt_wait: int = 2
605
+
606
def __post_init__(self) -> None:
    """Post-construction hook: reject an unrecognized `gate_mode`.

    Raises:
        ValueError: if `self.gate_mode` is not a member of `GATE_MODES`.
    """
    # Guard clause: a known mode needs no further work.
    if self.gate_mode in GATE_MODES:
        return
    raise ValueError(
        f"unknown gate_mode {self.gate_mode!r} — expected one of {GATE_MODES}"
    )
611
+
612
+
613
class StopReason(str, enum.Enum):
    """Named terminal conditions of the loop — one member per way it can stop.

    This enum IS the answer to "under what exact conditions does this loop
    stop?": every terminal path produces one of these members. Each member's
    value is its hyphenated wire string, and `str(member)` yields that value.
    """

    # Reached `max_iterations`.
    ITERATION_CAP = "iteration-cap"

    # DRAIN after a PRODUCTIVE /replan that still couldn't refill.
    DRAINED_TWICE = "drained-twice"

    # soft/drive: a single true DRAIN.
    DRAIN = "drain"

    # soft/drive: picks blocked (was WEDGE).
    BLOCKED = "blocked"

    # Circuit breaker on back-to-back UNCLEAR iterations.
    CONSECUTIVE_UNCLEAR = "consecutive-unclear"

    # K back-to-back SHIPPED-DIRTY+0 iterations.
    CONSECUTIVE_DIRTY_ZERO = "consecutive-dirty-zero"

    # K back-to-back 529s — an outage, not a transient.
    CONSECUTIVE_OVERLOADED = "consecutive-overloaded"

    # Usage/rate-limit window exhausted.
    RATE_LIMITED = "rate-limited"

    # The subprocess never started.
    LAUNCH_FAILED = "launch-failed"

    # FQ-420: SHIPPED was claimed, but the PJ2 measurement it owes is missing.
    UNMEASURED_SHIPPED = "unmeasured-shipped"

    # docs/99: liveness() says SPINNING — alive, 0 forward delta (the
    # ground-truth anti-spin stop).
    SPINNING = "spinning"

    # FQ-452: K consecutive STALE-STAMP/BLOCKED gates that /replan never
    # reconciled — refuse to spin another.
    STALE_STAMP_UNRECONCILED = "stale-stamp-unreconciled"

    # FQ-510: a GATE BLOCKED whose classified cause is re-dispatch-INVARIANT
    # (operator_decision / a false-ship oracle conflation — any reason whose
    # BLOCKED_REASONS[cause].self_heals_via is NOT /replan). A /replan provably
    # cannot clear it, so it re-blocks identically every iteration; honest-STOP
    # on the FIRST such BLOCKED (the post-run analogue of PICK_HELD_INVARIANT)
    # rather than spinning /replan to the FQ-452 cap (~$15-25/1.5h of churn).
    # The operator-decision sub-case is also auto-filed once by the driver's
    # emit-decision-needed actuation.
    BLOCKED_REDISPATCH_INVARIANT = "blocked-redispatch-invariant"

    # docs/117: completion.classify() says COMPLETE — every declared unit is
    # verified. The FIRST stop reason that means "finished," not "gave up"
    # (the anti-ITERATION_CAP).
    COMPLETE = "complete"

    # docs/117: completion.convergence() says THRASHING/STARVED — the residual
    # won't reach a fixpoint; surface, don't burn the cap silently.
    THRASHING = "thrashing"

    # docs/168 §5: the next lane is HELD only by a re-dispatch-invariant reason
    # (DRAFT_CLASS / OPERATOR_GATED / SOAK_OPEN / DEPENDENCY_UNMET) —
    # re-dispatch re-blocks identically; honest-STOP + surface the typed hold
    # for routing.
    PICK_HELD_INVARIANT = "pick-held-invariant"

    # docs/207 §3: the next unit was attempted-and-didn't-move inside the
    # cooldown window AND nothing fresher is offerable — re-dispatching it
    # would re-storm; honest-STOP + surface the cooled unit (the anti-churn
    # breaker; the ~5%-shipping re-pick storm).
    PICK_COOLDOWN = "pick-cooldown"

    # FQ-509-sibling (QWD): K consecutive UNPRODUCTIVE /replans, each bracketed
    # by a DRAIN, on the same lane — the lane is genuinely drained but BENIGN
    # (every phase already shipped/in-flight, nothing to refill). The
    # drained-twice rung never arms (an UNPRODUCTIVE /replan is not a refill
    # attempt, FQ-240), so without this rung the loop spins
    # DRAIN→/replan→DRAIN→/replan to the iteration cap (~$11+/55min for 0
    # refill). Stop instead + surface (re-scope or wait for the in-flight
    # phases to settle). The benign-drain analogue of DRAINED_TWICE: that one
    # is "a PRODUCTIVE /replan still couldn't refill"; this is "the /replans
    # are all UNPRODUCTIVE because there is nothing left to refill."
    BENIGN_DRAIN = "benign-drain"

    # #506 / docs/258: K consecutive UNPRODUCTIVE /replans regardless of WHY
    # (the broader sibling of BENIGN_DRAIN). MEASURED: /replan is 45% of all
    # loop wall-clock and 43% of replan iters STALL (0 refill) — a 53-turn
    # replan that refilled nothing even though commits landed (so the gate was
    # NOT a DRAIN, which is exactly the case BENIGN_DRAIN's
    # `last_gate_was_drain` bracket deliberately ignores).
    # BENIGN_DRAIN = "lane empty"; REPLAN_STALLED = "/replan keeps doing costly
    # nothing." Trips on the Kth unproductive REPLAN_DONE ITSELF (default K=2).
    # The FIRST loop_decide rung expressed through the `dos.breaker` primitive
    # rather than a hand-written inline counter.
    REPLAN_STALLED = "replan-stalled"

    # PERMANENT legacy alias. Because it repeats BLOCKED's value, Enum makes it
    # the very same member object, so any un-migrated `is StopReason.WEDGE`
    # check keeps working (mirrors GateVerdict.WEDGE).
    WEDGE = "blocked"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Print the bare wire value rather than "StopReason.NAME".
        return str(self.value)
646
+
647
+
648
@dataclass(frozen=True)
class LoopDecision:
    """The single decision `decide()` returns for one iteration.

    `action` — `"continue"`, `"retry-same-iter"` or `"stop"`. The loop branches
        on this and nothing else; everything below is detail for the path it
        picks.

    Continue fields (action == "continue"):
      next_mode — `"dispatch"` | `"replan"`: the next iteration's mode.
      reconcile — True iff the loop must run an inline stamp-reconcile pass
                  before the next iteration (a soft/drive STALE-STAMP). Read
                  straight off `gate_policy`'s GateAction.

    Retry fields (action == "retry-same-iter"):
      backoff_seconds — seconds the caller should sleep before relaunching the
                  SAME iteration number (a transient 529 OVERLOADED). 0 on
                  every other action.

    Stop fields (action == "stop"):
      stop_reason — the named StopReason.
      surface — True iff the stop needs operator attention (a BLOCKED, a
                soft/drive DRAIN). Read off `gate_policy` for gate stops.

    Always set:
      next_state — the transitioned `LoopState` to carry into the next
                   iteration (not meaningful on a stop, but always returned so
                   the caller never re-derives counters).
      reason — a one-line operator-facing summary for the tally row.
    """

    action: str  # "continue" | "retry-same-iter" | "stop"
    next_state: LoopState
    reason: str
    next_mode: str = ""
    reconcile: bool = False
    stop_reason: Optional[StopReason] = None
    surface: bool = False
    # Set on action == "retry-same-iter" (transient 529 OVERLOADED): seconds the
    # caller should sleep before relaunching the SAME iteration number. The
    # ladder is 60s → 270s → 1200s, indexed by the OVERLOADED streak; per the
    # original note the first step is well inside the prompt-cache TTL and the
    # later steps are past it. `decide()` escalates to STOP once
    # `consecutive_overloaded` reaches `max_overloaded` (an outage, not
    # transient), so with the default of 3 the third consecutive OVERLOADED is
    # the stop and only the 60s / 270s steps are handed out.
    backoff_seconds: int = 0
686
+
687
+
688
# The three values `LoopDecision.action` can take.
_CONTINUE = "continue"
_STOP = "stop"
_RETRY_SAME_ITER = "retry-same-iter"

# Backoff schedule for OVERLOADED retries — 60s, 270s, 1200s — indexed by the
# OVERLOADED streak (streak 1 → 60s, 2 → 270s, 3 and beyond → 1200s, clamped to
# the last entry). First step stays inside the prompt-cache TTL (cheap); the
# later steps pay the cache miss but are still cheaper than burning a real
# /dispatch iter under server overload.
# `decide()` consults the breaker BEFORE it picks a delay: once
# `consecutive_overloaded` reaches `max_overloaded` the loop STOPs with
# CONSECUTIVE_OVERLOADED instead of sleeping — that's not a transient capacity
# blip, it is a sustained outage and an operator should look.
# NOTE(review): with the default `max_overloaded` of 3 the third consecutive
# OVERLOADED is itself the STOP, so only the 60s and 270s steps are ever issued;
# the 1200s step is reachable only when `max_overloaded` is raised above 3 —
# confirm that is the intended ladder.
_OVERLOADED_BACKOFF = (60, 270, 1200)
699
+
700
+
701
+ # ---------------------------------------------------------------------------
702
+ # The breaker bridge (docs/258 — the loop_decide → breaker migration).
703
+ #
704
+ # Every consecutive-streak rung below — UNCLEAR / OVERLOADED / DIRTY-ZERO /
705
+ # STALE-STAMP / benign-drain / the new REPLAN_STALLED — is the SAME mechanism:
706
+ # bump a count, compare it to a max, trip if reached, reset on a clean outcome.
707
+ # `breaker.py` IS that mechanism, lifted into one pure leaf (docs/223). These two
708
+ # helpers are the only bridge `decide()` needs: they turn one of `LoopState`'s
709
+ # int counter fields + its max field into a `breaker.BreakerCounts` /
710
+ # `BreakerPolicy`, run the primitive's fold, and hand back the new count + the
711
+ # trip bit. The int fields STAY the public surface (callers construct/read them);
712
+ # the bump/compare ARITHMETIC is what moves into `breaker`. Mechanism lifted,
713
+ # policy (which field, which threshold, which outcome resets it) stays at the call
714
+ # site — exactly the split `breaker.py`'s docstring argues for.
715
+ #
716
+ # Each loop_decide rung is consecutive-only (no cumulative/flapping rung), so the
717
+ # policy is always `max_consecutive=<max>, max_total=0`.
718
+
719
+
720
+ def _breaker_fail(consecutive: int, max_consecutive: int) -> tuple[int, bool]:
721
+ """Record one failure of a consecutive-only streak. Returns (new_count, is_open).
722
+
723
+ The `breaker.record_failure` fold, specialized to a loop_decide counter:
724
+ `BreakerCounts(consecutive=…)` + `BreakerPolicy(max_consecutive=…, max_total=0)`.
725
+ Byte-identical to the inline `streak = consecutive + 1; is_open = streak >= max`
726
+ it replaces, BECAUSE `record_failure` bumps then `_classify` trips on `>=`.
727
+
728
+ The one boundary the primitive can't take: `max_consecutive == 0`. Inline,
729
+ `max=0` means "trip on the first" (`0+1 >= 0`); but `BreakerPolicy` REFUSES a
730
+ both-zero policy (a breaker that can never trip is a config error). To preserve
731
+ the degenerate exactly, `max == 0` is reproduced here (`new >= 0` is always
732
+ True → trips immediately) rather than routed through the primitive. Every real
733
+ threshold is ≥ 2, so the breaker path is the live one; this guard changes no
734
+ behavior, it only keeps the boundary byte-identical.
735
+ """
736
+ if max_consecutive <= 0:
737
+ new = consecutive + 1
738
+ return new, new >= max_consecutive
739
+ t = breaker.record_failure(
740
+ breaker.BreakerCounts(consecutive=consecutive),
741
+ breaker.BreakerPolicy(max_consecutive=max_consecutive, max_total=0),
742
+ )
743
+ return t.counts.consecutive, t.verdict.is_open
744
+
745
+
746
def _replan_stall_policy(state: LoopState) -> breaker.BreakerPolicy:
    """Build the `BreakerPolicy` for the #506 REPLAN_STALLED rung (docs/258).

    Args:
        state: the loop state whose `max_unproductive_replan` is the threshold.

    Returns:
        A consecutive-only policy (`max_total=0`) whose `max_consecutive` is
        `state.max_unproductive_replan`, floored at 1.

    The floor exists so the policy can still be constructed when a caller
    passes the degenerate `max_unproductive_replan == 0`, letting
    `breaker.record_success` be called on the success side (where, per the
    original note, the healed count is always 0 and the verdict is never
    read). The FAILURE side goes through `_breaker_fail`, which preserves the
    trip-on-first degenerate itself, so the floor does not alter when the rung
    trips.
    """
    threshold = max(state.max_unproductive_replan, 1)
    return breaker.BreakerPolicy(max_consecutive=threshold, max_total=0)
760
+
761
+
762
+ def decide(state: LoopState, outcome: IterationOutcome) -> LoopDecision:
763
+ """Decide continue/stop for one /dispatch-loop iteration. PURE — no I/O.
764
+
765
+ `state` is the working-context carry-over (the iteration that just produced
766
+ `outcome`). `outcome` is that iteration's typed result (Step 3's grep token,
767
+ plus the GATE verdict when applicable).
768
+
769
+ Returns one `LoopDecision`. The decision order is the loop's actual control
770
+ flow, top to bottom — read this function to know exactly when the loop
771
+ stops:
772
+
773
+ 1. LAUNCH_FAILED → stop (a repeating launch failure burns all slots).
774
+ 2. RATE_LIMITED / OVERLOADED → stop / retry-with-backoff; NOT a fault, so
775
+ neither counts toward the UNCLEAR breaker.
776
+ 3. COMPLETE / THRASHING → stop (docs/117 Phase 3): if `state.completion`
777
+ is COMPLETE the work is verifiably DONE — stop, no
778
+ surface (the anti-`ITERATION_CAP`). UNDERDECLARED, or a
779
+ `state.convergence` that `should_surface`
780
+ (THRASHING/STARVED), stops AND surfaces (no fixpoint /
781
+ scope in doubt). Checked AFTER the not-a-fault stops and
782
+ BEFORE SPINNING (a provably-finished run beats a
783
+ zero-delta SPINNING read — the resumed-already-done
784
+ case). Opt-in: `None` skips these rungs → byte-identical.
785
+ 4. SPINNING → stop (docs/99): if `state.liveness` is `SPINNING`, the
786
+ run is alive but landing zero forward delta — a
787
+ ground-truth anti-spin breaker. Checked AFTER the
788
+ upstream/transient breakers (an outage-induced idle is
789
+ not a spin) and BEFORE the outcome block (ground truth
790
+ overrides the SHIPPED self-report). Opt-in: `None`
791
+ liveness skips this rung entirely → byte-identical.
792
+ 4b. PICK_HELD_INVARIANT → stop (docs/168 §5): if `state.pickability` is HELD
793
+ by a re-dispatch-invariant reason (DRAFT_CLASS /
794
+ OPERATOR_GATED / SOAK_OPEN / DEPENDENCY_UNMET), the next
795
+ lane would re-block identically — honest-STOP + surface the
796
+ typed hold for routing rather than spin. Checked AFTER the
797
+ not-a-fault/COMPLETE/SPINNING stops and BEFORE the outcome
798
+ block (the gate beats the self-report). Opt-in: `None`
799
+ skips it → byte-identical.
800
+ 4c. PICK_COOLDOWN → stop (docs/207 §3): if `state.cooldown` is
801
+ RECENTLY_ATTEMPTED (the next unit was attempted-and-didn't-
802
+ move inside the window AND the host already skipped every
803
+ fresher candidate), re-dispatching it would re-storm —
804
+ honest-STOP + surface the cooled unit. The anti-churn
805
+ breaker; checked AFTER PICK_HELD_INVARIANT (an invariant
806
+ hold is more terminal than a time-bounded cooldown). Opt-in:
807
+ `None` skips it → byte-identical.
808
+ 5. UNCLEAR → increment the streak; stop if it hit max_unclear,
809
+ else retry `dispatch`.
810
+ 6. SHIPPED / REPLAN_DONE / GATE → route via the next-mode + drained-twice
811
+ + gate-policy logic, then apply the iteration cap. Within
812
+ the GATE sub-block, a BLOCKED whose `outcome.blocked_cause`
813
+ is re-dispatch-INVARIANT (FQ-510: a cause whose
814
+ `BLOCKED_REASONS[cause].self_heals_via` is NOT `/replan` —
815
+ operator_decision, a false-ship oracle conflation, …) STOPs
816
+ on the FIRST occurrence (`BLOCKED_REDISPATCH_INVARIANT`),
817
+ checked BEFORE the FQ-452 stale-stamp spin-counter so an
818
+ invariant cause never spins /replan to the cap. The
819
+ post-run analogue of rung 4b. A `/replan`-curable BLOCKED,
820
+ or one with no classified cause, falls through unchanged.
821
+
822
+ The iteration cap is applied LAST, after a continue decision is otherwise
823
+ reached, so a stop *reason* (drained-twice, breaker, rate-limit, spinning)
824
+ always wins over the bare cap — the operator wants the specific reason, not
825
+ "reached 5".
826
+ """
827
+ # The in-flight liveness verdict (docs/99) is per-iteration EVIDENCE the
828
+ # caller re-gathers each turn (via `dos liveness`), never carried state like
829
+ # `consecutive_unclear`. Read it into a local for the SPINNING rung below and
830
+ # CLEAR it from `state` up front, so it never survives into ANY returned
831
+ # `next_state` (terminal or continuing) — a stale verdict can't linger and
832
+ # fire spuriously next iteration; the caller must supply a fresh one. This is
833
+ # the evidence-in-not-state-carried discipline (the same reason `now_ms` is an
834
+ # input to `liveness.classify`, never stored), and it is also what makes the
835
+ # ADVANCING / STALLED / no-verdict paths byte-identical to the pre-3a loop:
836
+ # with the field cleared everywhere, their decisions differ in no field at all.
837
+ # The SPINNING `reason` string + `surface=True` carry the *why* for the
838
+ # operator, so dropping the verdict from `next_state` costs no legibility.
839
+ #
840
+ # The completion + convergence verdicts (docs/117 Phase 3) are gathered and
841
+ # cleared the SAME way and for the SAME reason: they are in-flight evidence the
842
+ # caller re-derives each turn (it owns the intent ledger and re-reads git
843
+ # ancestry), never carried state, so a stale verdict must not survive into the
844
+ # next iteration's `state`. With all three cleared up front, every path that
845
+ # does NOT stop on them is byte-identical to the pre-Phase-3 loop.
846
+ live = state.liveness
847
+ comp = state.completion
848
+ conv = state.convergence
849
+ pick = state.pickability
850
+ cool = state.cooldown
851
+ dprog = state.descendant_progress
852
+ state = replace(
853
+ state, liveness=None, completion=None, convergence=None, pickability=None,
854
+ cooldown=None, descendant_progress=None,
855
+ )
856
+
857
+ # 1. LAUNCH_FAILED — the subprocess never produced a valid init envelope.
858
+ # A repeating launch failure would burn every remaining slot, so stop on
859
+ # the first one (the SKILL's Step 2 init-line guard).
860
+ if outcome.kind is OutcomeKind.LAUNCH_FAILED:
861
+ return LoopDecision(
862
+ action=_STOP,
863
+ next_state=state,
864
+ stop_reason=StopReason.LAUNCH_FAILED,
865
+ surface=True,
866
+ reason="iteration subprocess failed to start (no valid init envelope)",
867
+ )
868
+
869
+ # 2. RATE_LIMITED — a hard usage/rate-limit rejection. Every retry fails the
870
+ # same way until the window resets; it is NOT a /dispatch fault, so it
871
+ # must not increment the consecutive-UNCLEAR breaker. Stop and let the
872
+ # operator re-invoke once the window resets.
873
+ if outcome.kind is OutcomeKind.RATE_LIMITED:
874
+ return LoopDecision(
875
+ action=_STOP,
876
+ next_state=state,
877
+ stop_reason=StopReason.RATE_LIMITED,
878
+ surface=True,
879
+ reason="usage/rate-limit window exhausted — not a fault; re-invoke after reset",
880
+ )
881
+
882
+ # 2b. OVERLOADED — a transient 529 / overloaded_error. Unlike a quota window,
883
+ # this clears in seconds to a couple minutes. Retry the SAME iteration
884
+ # with exponential backoff (60s → 270s → 1200s). After
885
+ # `max_overloaded` (3) consecutive OVERLOADED hits, escalate to STOP —
886
+ # that's an outage, not a capacity blip, and the operator should look.
887
+ # The breaker does NOT increment the consecutive-UNCLEAR streak (an
888
+ # OVERLOADED is upstream, not a /dispatch fault), same precedent as
889
+ # RATE_LIMITED.
890
+ if outcome.kind is OutcomeKind.OVERLOADED:
891
+ streak, tripped = _breaker_fail(
892
+ state.consecutive_overloaded, state.max_overloaded
893
+ )
894
+ bumped = replace(state, consecutive_overloaded=streak)
895
+ if tripped:
896
+ return LoopDecision(
897
+ action=_STOP,
898
+ next_state=bumped,
899
+ stop_reason=StopReason.CONSECUTIVE_OVERLOADED,
900
+ surface=True,
901
+ reason=(
902
+ f"{streak} consecutive OVERLOADED (529) hits — sustained "
903
+ f"server-side overload, not a transient blip; stop and "
904
+ f"re-invoke after the upstream incident clears"
905
+ ),
906
+ )
907
+ backoff = _OVERLOADED_BACKOFF[min(streak - 1, len(_OVERLOADED_BACKOFF) - 1)]
908
+ return LoopDecision(
909
+ action=_RETRY_SAME_ITER,
910
+ next_state=bumped,
911
+ backoff_seconds=backoff,
912
+ reason=(
913
+ f"OVERLOADED (streak {streak}/{state.max_overloaded}) — "
914
+ f"transient 529, sleep {backoff}s then retry same iter"
915
+ ),
916
+ )
917
+
918
+ # A non-OVERLOADED outcome resets the OVERLOADED streak — a clean run means
919
+ # the upstream incident cleared.
920
+ state = replace(state, consecutive_overloaded=0)
921
+
922
+ # 3. COMPLETE (docs/117 Phase 3) — the stop-on-DONE gate, the first terminal
923
+ # that means "finished," not "gave up." If the caller gathered a
924
+ # `CompletionVerdict` and it is COMPLETE, every declared unit is verified on
925
+ # the non-forgeable ancestry rung (the residual is empty): the work is done,
926
+ # so stop — cleanly, NO surface (a clean finish is not an operator decision).
927
+ # This is the anti-`ITERATION_CAP`: a healthy loop now terminates HERE, and
928
+ # the cap demotes to a backstop for genuinely pathological runs (docs/117
929
+ # §5.4 — "the critical inversion").
930
+ #
931
+ # Placement is load-bearing and was an explicit operator decision: COMPLETE is
932
+ # checked BEFORE the SPINNING rung. The two can BOTH fire for one legitimate
933
+ # case — a run resumed with nothing left to do has zero git delta since start
934
+ # (SPINNING) AND every declared unit already verified (COMPLETE). When the
935
+ # work is provably finished on the non-forgeable rung, "done" is the honest
936
+ # reason even with zero recent delta, so COMPLETE wins. (It stays AFTER the
937
+ # not-a-fault stops — LAUNCH_FAILED / RATE_LIMITED / OVERLOADED — for the same
938
+ # reason SPINNING does: a run that failed to launch or 529'd on its last turn
939
+ # has not "finished," and the specific outage is the reason the operator
940
+ # wants.)
941
+ #
942
+ # UNDERDECLARED (Phase 4, not emitted yet) → stop AND surface: the run thinks
943
+ # it is done but an external `ScopeSource` says it under-declared its extent;
944
+ # a human must reconcile. We route it through `StopReason.THRASHING` (the
945
+ # "no clean finish, look at this" terminal) with surface=True — the residual
946
+ # is empty but the *scope* is in doubt, which is exactly a surface-for-review.
947
+ # INCOMPLETE / INDETERMINATE never stop here: INCOMPLETE means "continue,
948
+ # re-dispatch the residual" (the caller owns that actuation), and INDETERMINATE
949
+ # means "can't tell from an unsound fold" — we never ASSERT done on it, so it
950
+ # falls through to the existing logic untouched.
951
+ #
952
+ # Opt-in / byte-identical: `comp is None` (the default) skips this rung
953
+ # entirely, so an un-migrated caller is unaffected.
954
+ if comp is not None:
955
+ if comp.state is Completion.COMPLETE:
956
+ return LoopDecision(
957
+ action=_STOP,
958
+ next_state=state,
959
+ stop_reason=StopReason.COMPLETE,
960
+ surface=False,
961
+ reason=(
962
+ "completion() reports COMPLETE — every declared unit is verified "
963
+ "against git ancestry; the residual is empty, so the loop stops "
964
+ "because the work is DONE (stop-on-done, not out-of-budget). "
965
+ + comp.reason
966
+ ),
967
+ )
968
+ if comp.state is Completion.UNDERDECLARED:
969
+ return LoopDecision(
970
+ action=_STOP,
971
+ next_state=state,
972
+ stop_reason=StopReason.THRASHING,
973
+ surface=True,
974
+ reason=(
975
+ "completion() reports UNDERDECLARED — the declared residual is "
976
+ "empty but an external scope check says the extent was "
977
+ "under-declared; stopping and surfacing for a human to reconcile. "
978
+ + comp.reason
979
+ ),
980
+ )
981
+
982
+ # 3b. THRASHING / STARVED (docs/117 Phase 3, §5.2) — the dynamic no-fixpoint
983
+ # gate. COMPLETE above is the STATIC fixpoint (residual empty now); this is
984
+ # its dynamic companion: the residual keeps churning but never empties (each
985
+ # pass closes some work and opens as much — the reviewer-finds-new-findings
986
+ # loop). If the caller gathered a `ConvergenceVerdict` over the residual-size
987
+ # history and it `should_surface` (THRASHING or STARVED), the loop will not
988
+ # reach a fixpoint — stop and surface rather than burn the iteration cap
989
+ # silently. Checked AFTER the COMPLETE gate (a run whose residual just reached
990
+ # 0 is CONVERGING/done, never thrashing) and, like it, before the
991
+ # UNCLEAR/SHIPPED/GATE block. CONVERGING / INSUFFICIENT never stop — "no
992
+ # fixpoint *yet*" is not a stop signal. Opt-in: `conv is None` skips it.
993
+ if conv is not None and conv.state.should_surface:
994
+ return LoopDecision(
995
+ action=_STOP,
996
+ next_state=state,
997
+ stop_reason=StopReason.THRASHING,
998
+ surface=True,
999
+ reason=(
1000
+ "convergence() reports "
1001
+ f"{conv.state.value} — the residual is not trending to empty over "
1002
+ "the recent window; the loop is productive but has no fixpoint, so "
1003
+ "stopping and surfacing rather than spending the cap. " + conv.reason
1004
+ ),
1005
+ )
1006
+
1007
+ # 4. SPINNING (docs/99 / docs/82 Phase-3a) — the ground-truth anti-spin
1008
+ # breaker. If the caller gathered an in-flight `Liveness` verdict for this
1009
+ # run and it is SPINNING (alive — fresh heartbeat — but zero commits and
1010
+ # zero state-mutating lane events since start), the loop is burning tokens
1011
+ # narrating motion it is not making. Stop on the hard evidence rather than
1012
+ # waiting for the iteration cap or a self-report streak.
1013
+ #
1014
+ # Placement is load-bearing: AFTER LAUNCH_FAILED / RATE_LIMITED / OVERLOADED
1015
+ # (a run idle only because it is backing off a 529 / quota window is NOT
1016
+ # spinning — those not-a-fault stops must win, the same precedence they get
1017
+ # over the UNCLEAR breaker), and BEFORE the UNCLEAR / SHIPPED / GATE block
1018
+ # (liveness reads ground truth, and the whole docs/82 thesis is that ground
1019
+ # truth overrides the self-report — a loop reporting SHIPPED every iteration
1020
+ # while landing 0 commits is the canonical spin, and SHIPPED's healthy path
1021
+ # must not pre-empt the verdict). This mirrors UNMEASURED_SHIPPED being
1022
+ # checked FIRST inside the SHIPPED branch: a ground-truth distrust signal
1023
+ # pre-empts the conservative continue.
1024
+ #
1025
+ # Opt-in / byte-identical: `live is None` (the default) skips this rung
1026
+ # entirely, so an un-migrated caller gets the pre-3a behavior exactly.
1027
+ # Only SPINNING stops here — ADVANCING is benign; STALLED ("dead/hung") is
1028
+ # the supervisor's reap input (`supervise.py`), not a live loop's self-stop.
1029
+ if live is Liveness.SPINNING:
1030
+ return LoopDecision(
1031
+ action=_STOP,
1032
+ next_state=state,
1033
+ stop_reason=StopReason.SPINNING,
1034
+ surface=True,
1035
+ reason=(
1036
+ "liveness() reports SPINNING — the run is alive but has landed 0 "
1037
+ "commits and 0 lane events since it started; stopping on "
1038
+ "ground-truth evidence rather than burning the iteration budget "
1039
+ "narrating motion it is not making"
1040
+ ),
1041
+ )
1042
+
1043
+ # 4b. PICK_HELD_INVARIANT (docs/168 §5) — the honest-STOP rung. If the caller
1044
+ # gathered a pre-dispatch `Pickability` verdict for the lane it would
1045
+ # dispatch next and that verdict is HELD by a reason a re-dispatch CANNOT
1046
+ # change (DRAFT_CLASS / OPERATOR_GATED / SOAK_OPEN / DEPENDENCY_UNMET), the
1047
+ # next iteration would re-block on the identical deterministic gate. This is
1048
+ # the drain-trap the host hit on three distinct lanes in 36h (ASI #475
1049
+ # operator-gated, RTN soak, FMP #493 DRAFT): the loop's `decide()` modeled
1050
+ # continue→dispatch on a DRAIN, so the operator had to OVERRIDE with an
1051
+ # "honest STOP" every time. With the hold reason typed, that override
1052
+ # becomes a kernel rule — STOP and surface the typed hold so the host can
1053
+ # route it (DRAFT→/promote, OPERATOR_GATED→escalate a decision, SOAK_OPEN→
1054
+ # wait, never /replan; DEPENDENCY_UNMET→ship the prerequisite).
1055
+ #
1056
+ # EVIDENCE-GATED: it fires ONLY when the verdict is present AND
1057
+ # `is_redispatch_invariant`. An OFFERABLE verdict, or a HELD verdict whose
1058
+ # reason is re-dispatch-CURABLE (IN_FLIGHT / SOFT_CLAIMED_ELSEWHERE /
1059
+ # STALE_CLAIM / COOLDOWN / SHIPPED / UNPARSEABLE — all CAN clear), never
1060
+ # stops here.
1061
+ #
1062
+ # Placement is load-bearing: AFTER the not-a-fault stops (LAUNCH_FAILED /
1063
+ # RATE_LIMITED / OVERLOADED — an outage is not a reason to declare the lane
1064
+ # un-pickable) and AFTER COMPLETE / SPINNING (a provably-finished or
1065
+ # ground-truth-spinning run names a more specific terminal), and BEFORE the
1066
+ # UNCLEAR / SHIPPED / GATE outcome block (an invariant hold on the next lane
1067
+ # pre-empts whatever this iteration's outcome token says — the same "the
1068
+ # gate beats the self-report" precedence the SPINNING rung has).
1069
+ #
1070
+ # Opt-in / byte-identical: `pick is None` (the default) skips this rung
1071
+ # entirely, so an un-migrated caller is unaffected.
1072
+ if pick is not None and pick.is_redispatch_invariant:
1073
+ reason = pick.reason # guaranteed non-None by is_redispatch_invariant
1074
+ return LoopDecision(
1075
+ action=_STOP,
1076
+ next_state=state,
1077
+ stop_reason=StopReason.PICK_HELD_INVARIANT,
1078
+ surface=True,
1079
+ reason=(
1080
+ f"next lane is HELD by {reason.value} — a re-dispatch-invariant "
1081
+ f"hold a re-dispatch cannot change; honest-STOP rather than "
1082
+ f"re-block on the identical gate next iteration. "
1083
+ + (pick.evidence or "")
1084
+ ).strip(),
1085
+ )
1086
+
1087
+ # 4c. PICK_COOLDOWN (docs/207 §3) — the anti-churn breaker. If the caller
1088
+ # gathered a `Cooldown` verdict for the unit it would dispatch NEXT (after
1089
+ # it ALREADY skipped every fresher offerable-and-not-cooled candidate — the
1090
+ # host's pick-selection contract) and that verdict is RECENTLY_ATTEMPTED,
1091
+ # the unit was attempted-and-didn't-move inside the window and nothing
1092
+ # fresher is left. Re-dispatching it would re-storm (the ~5%-shipping
1093
+ # re-pick loop the bare loop hit), so honest-STOP + surface the cooled unit
1094
+ # rather than burn the iteration re-confirming a known drain. This is the
1095
+ # CROSS-RUN memory `liveness` (a single-run verdict) cannot provide.
1096
+ #
1097
+ # EVIDENCE-GATED: fires ONLY when the verdict is present AND `held`
1098
+ # (RECENTLY_ATTEMPTED). A CLEAR verdict — the window elapsed, or nothing
1099
+ # held the unit — never stops; the loop keeps its existing behavior.
1100
+ #
1101
+ # Placement: AFTER the not-a-fault stops + COMPLETE/SPINNING/PICK_HELD
1102
+ # (an invariant hold names a more specific terminal than a cooldown — a
1103
+ # DRAFT lane is held forever, a cooled one only until the wall), and BEFORE
1104
+ # the outcome block (the cooldown pre-empts the iteration's self-report, the
1105
+ # same "the gate beats the self-report" precedence the sibling rungs have).
1106
+ #
1107
+ # Opt-in / byte-identical: `cool is None` (the default) skips this rung.
1108
+ if cool is not None and cool.held:
1109
+ return LoopDecision(
1110
+ action=_STOP,
1111
+ next_state=state,
1112
+ stop_reason=StopReason.PICK_COOLDOWN,
1113
+ surface=True,
1114
+ reason=(
1115
+ f"next unit {cool.unit_id!r} is in a cooldown window — "
1116
+ + (cool.reason or "attempted recently and did not move")
1117
+ + "; nothing fresher is offerable, so honest-STOP rather than "
1118
+ "re-storm a known drain (the anti-churn breaker)"
1119
+ ),
1120
+ )
1121
+
1122
+ # 4. UNCLEAR — crashed/killed before Step 9, or an INTERIM envelope. Retry
1123
+ # as `dispatch`, but increment the streak; three in a row means the
1124
+ # subprocess is failing systematically (the circuit breaker).
1125
+ if outcome.kind is OutcomeKind.UNCLEAR:
1126
+ # 4d. DESCENDANT-PROGRESS adopt-wait (FQ-509) — the pre-check that
1127
+ # distinguishes a *parked-but-PRODUCTIVE* parent from a systematic
1128
+ # failure. A headless `-p` child that PARKED its own turn while a
1129
+ # grandchild it spawned is still committing the registered picks lands
1130
+ # here as UNCLEAR (the parent's ancestry check ran the instant it
1131
+ # exited, saw 0 committed picks, and the token collapsed to UNCLEAR).
1132
+ # Charging that to the UNCLEAR breaker is WRONG: the descendant is
1133
+ # healthy and about to land its commits — counting it as a fault makes
1134
+ # the loop self-stop with CONSECUTIVE_UNCLEAR over live work AND
1135
+ # re-dispatch a fresh child each time instead of waiting for the live
1136
+ # one. When the host supplied `descendant_progress == ADVANCING` (the
1137
+ # descendant landed a forward delta — a real new commit since start, or
1138
+ # the ancestry-backed CHURNING verdict; the host's corpse-guard ensures
1139
+ # a log-touched-but-not-committing "alive" maps to NONE_OBSERVED, never
1140
+ # here), CONTINUE the loop (adopt-wait: re-dispatch so the live child
1141
+ # gets the chance to land its picks → the NEXT iteration's ancestry
1142
+ # check lifts it to SHIPPED) WITHOUT charging the UNCLEAR breaker, and
1143
+ # RESET consecutive_unclear to 0 (a live committing child means the
1144
+ # prior UNCLEARs were not a systematic fault).
1145
+ #
1146
+ # BOUNDED, clock-free: the adopt-wait is itself counted by
1147
+ # `consecutive_adopt_wait`; after `max_adopt_wait` consecutive
1148
+ # ADVANCING-but-the-picks-still-uncommitted iters it FALLS THROUGH to
1149
+ # today's UNCLEAR breaker path (which caps at max_unclear) rather than
1150
+ # a new terminal — so a descendant that keeps "advancing" but never
1151
+ # lands its registered picks can never adopt-wait forever. The continue
1152
+ # also cannot persist past death: descendant_progress is re-gathered
1153
+ # every iteration (cleared up-front), so a child that DIES flips to
1154
+ # DEAD next iter and takes the normal UNCLEAR path.
1155
+ #
1156
+ # Opt-in / byte-identical: `dprog` defaults None (cleared up-front),
1157
+ # and the guard is `dprog is DescendantProgress.ADVANCING` — DEAD,
1158
+ # NONE_OBSERVED, and None all skip it → the rung below is byte-identical
1159
+ # to the pre-FQ-509 loop.
1160
+ if dprog is DescendantProgress.ADVANCING:
1161
+ aw_streak, aw_tripped = _breaker_fail(
1162
+ state.consecutive_adopt_wait, state.max_adopt_wait
1163
+ )
1164
+ if not aw_tripped:
1165
+ # Live committing descendant — adopt-wait. Do NOT charge the
1166
+ # UNCLEAR breaker; reset it (this iter is not a fault).
1167
+ bumped = replace(
1168
+ state, consecutive_adopt_wait=aw_streak, consecutive_unclear=0
1169
+ )
1170
+ return _continue_or_cap(
1171
+ bumped,
1172
+ next_mode="dispatch",
1173
+ reason=(
1174
+ f"descendant FORWARD-PROGRESSING (adopt-wait "
1175
+ f"{aw_streak}/{state.max_adopt_wait}) — the parent parked "
1176
+ f"but a descendant it spawned is committing the registered "
1177
+ f"picks; wait for it to land them, not a /dispatch fault"
1178
+ ),
1179
+ )
1180
+ # aw_tripped: the descendant kept "advancing" but never landed its
1181
+ # picks within the bound → fall through to the normal UNCLEAR breaker
1182
+ # path below (degrade to today's behavior; not a new terminal). The
1183
+ # bumped adopt-wait count rides into next_state via the streak below.
1184
+ # Non-advancing UNCLEAR (DEAD / NONE_OBSERVED / None, or a tripped
1185
+ # adopt-wait): today's exact path. Reset consecutive_adopt_wait (a
1186
+ # non-advancing iter breaks the adopt streak) but NOT consecutive_unclear
1187
+ # (it accrues — so a flapping ALIVE/quiet child still reaches max_unclear).
1188
+ streak, tripped = _breaker_fail(state.consecutive_unclear, state.max_unclear)
1189
+ bumped = replace(state, consecutive_unclear=streak, consecutive_adopt_wait=0)
1190
+ if tripped:
1191
+ return LoopDecision(
1192
+ action=_STOP,
1193
+ next_state=bumped,
1194
+ stop_reason=StopReason.CONSECUTIVE_UNCLEAR,
1195
+ surface=True,
1196
+ reason=(
1197
+ f"{streak} consecutive UNCLEAR iterations — the /dispatch "
1198
+ f"subprocess is failing systematically, not draining a backlog"
1199
+ ),
1200
+ )
1201
+ return _continue_or_cap(
1202
+ bumped,
1203
+ next_mode="dispatch",
1204
+ reason=f"UNCLEAR (streak {streak}/{state.max_unclear}) — retrying dispatch",
1205
+ )
1206
+
1207
+ # A non-UNCLEAR, non-fault iteration completed → reset the UNCLEAR breaker.
1208
+ # The SHIPPED-DIRTY-0 breaker is reset only inside the SHIPPED branch on a
1209
+ # *healthy* SHIPPED outcome (or on a REPLAN_DONE / GATE outcome that
1210
+ # naturally interrupts a back-to-back-SHIPPED streak — handled below).
1211
+ base = replace(state, consecutive_unclear=0)
1212
+ if outcome.kind in (OutcomeKind.REPLAN_DONE, OutcomeKind.GATE):
1213
+ # A non-SHIPPED outcome breaks the back-to-back-SHIPPED-DIRTY-0 streak.
1214
+ base = replace(base, consecutive_dirty_zero=0)
1215
+
1216
+ # 5a. SHIPPED — picks landed. Backlog still has work; clear the drained flag.
1217
+ #
1218
+ # FQ-420 unmeasured-ship STALL (checked FIRST): a SHIPPED token is the
1219
+ # /dispatch child's *self-report*. The PJ2 packet-judge is the kernel's
1220
+ # independent measurement of that claim against the post-fanout commit set.
1221
+ # If the driver asserted a measurement was owed (`measurement_expected`) but
1222
+ # the judge came back None — the FQ-420 shape: head==SHIPPED yet the fanout
1223
+ # run-ts could not be resolved, so PJ2 classify never ran — the kernel has a
1224
+ # claimed ship it could NOT verify. It must not fall through to the healthy
1225
+ # path on the strength of an unverified self-report (that is the exact lie
1226
+ # the substrate exists to refuse — a manual git-log check should never be
1227
+ # what catches it). STALL and surface so the operator re-measures: resolve
1228
+ # the fanout ts from the archive, or treat the ship as unproven. This guard
1229
+ # precedes the dirty-zero / healthy classification because a missing
1230
+ # measurement makes ALL of that sub-classification untrustworthy.
1231
+ #
1232
+ # SHIPPED-DIRTY-0 breaker: a SHIPPED iter that the packet-judge classified
1233
+ # as SHIPPED-DIRTY AND measured 0 commits is the degraded-shipping signal
1234
+ # the breaker counts (input gate says LIVE, packet-judge says DIRTY, no
1235
+ # commits actually landed). K back-to-back instances → stop; this is the
1236
+ # structural defense that justifies the iteration cap raise from 5 to 10
1237
+ # — it kills the degraded-shipping damage path at iter K regardless of cap.
1238
+ # Every other SHIPPED outcome (SHIPPED-CLEAN, SHIPPED-DIRTY with ship_count>0,
1239
+ # or no packet-judge supplied AND none expected) resets the streak. Callers
1240
+ # that do not pass packet_judge/ship_count AND do not set
1241
+ # measurement_expected get pre-breaker behavior — the streak is held
1242
+ # constant rather than incremented; this matches the "treat as PRODUCTIVE
1243
+ # when unclassified" conservative-default precedent (an un-migrated caller
1244
+ # that never measures is trusted; one that SAID it would measure is not).
1245
+ if outcome.kind is OutcomeKind.SHIPPED:
1246
+ # FQ-452: a SHIPPED iteration is genuine forward progress — the lane is
1247
+ # no longer stuck on a stale-stamp gate. Reset the spin-breaker streak.
1248
+ # (Reset here, NOT in the shared `base` block above, because a
1249
+ # REPLAN_DONE must NOT reset it — the /replan is the *response* to the
1250
+ # stale-stamp and the streak has to survive it to ever reach the cap.)
1251
+ base = replace(base, consecutive_stale_stamp=0)
1252
+ # QWD benign-drain: a ship means the lane was NOT benignly drained — clear
1253
+ # the unproductive-replan-drain streak + the prior-DRAIN carry. (Same
1254
+ # reasoning as the stale-stamp reset: reset on a real ship, not in the
1255
+ # shared block, because rung 5b consumes `last_gate_was_drain`.)
1256
+ # #506: a ship also clears the REPLAN_STALLED streak — the lane produced
1257
+ # work, so /replan is not in the 0-refill stall. Like the two resets above,
1258
+ # done here (not in the shared `base` block) so a REPLAN_DONE does NOT reset
1259
+ # it: the stall streak must SURVIVE the dispatch→GATE→/replan cycle between
1260
+ # two unproductive replans to ever reach the threshold (a GATE always sits
1261
+ # between two REPLAN_DONE outcomes, exactly as `consecutive_stale_stamp`
1262
+ # survives the intervening REPLAN_DONE for the mirror reason).
1263
+ base = replace(
1264
+ base,
1265
+ consecutive_unproductive_replan_drains=0,
1266
+ consecutive_unproductive_replan=0,
1267
+ last_gate_was_drain=False,
1268
+ )
1269
+ if outcome.measurement_expected and outcome.packet_judge is None:
1270
+ return LoopDecision(
1271
+ action=_STOP,
1272
+ next_state=replace(base, last_replan_drained=False),
1273
+ stop_reason=StopReason.UNMEASURED_SHIPPED,
1274
+ surface=True,
1275
+ reason=(
1276
+ "SHIPPED claimed but the PJ2 packet-judge measurement is "
1277
+ "missing (fanout run-ts unresolved) — the ship is "
1278
+ "self-reported and unverified; STALL and re-measure rather "
1279
+ "than trust an unmeasured ship"
1280
+ ),
1281
+ )
1282
+ is_dirty_zero = (
1283
+ outcome.packet_judge == "SHIPPED-DIRTY"
1284
+ and outcome.ship_count == 0
1285
+ )
1286
+ if is_dirty_zero:
1287
+ streak, tripped = _breaker_fail(
1288
+ base.consecutive_dirty_zero, base.max_dirty_zero
1289
+ )
1290
+ bumped = replace(
1291
+ base, last_replan_drained=False, consecutive_dirty_zero=streak
1292
+ )
1293
+ if tripped:
1294
+ return LoopDecision(
1295
+ action=_STOP,
1296
+ next_state=bumped,
1297
+ stop_reason=StopReason.CONSECUTIVE_DIRTY_ZERO,
1298
+ surface=True,
1299
+ reason=(
1300
+ f"{streak} consecutive SHIPPED-DIRTY iters with 0 commits "
1301
+ f"— /dispatch is shipping apparently-successful but "
1302
+ f"actually-empty iters (degraded-shipping regression)"
1303
+ ),
1304
+ )
1305
+ return _continue_or_cap(
1306
+ bumped,
1307
+ next_mode="dispatch",
1308
+ reason=(
1309
+ f"SHIPPED-DIRTY-0 (streak {streak}/{base.max_dirty_zero}) "
1310
+ f"— continue dispatch, but watch the streak"
1311
+ ),
1312
+ )
1313
+ # Healthy SHIPPED outcome (SHIPPED-CLEAN, or SHIPPED-DIRTY with ≥1 commit,
1314
+ # or no packet-judge supplied) — reset the dirty-zero streak.
1315
+ nxt = replace(
1316
+ base, last_replan_drained=False, consecutive_dirty_zero=0
1317
+ )
1318
+ return _continue_or_cap(
1319
+ nxt, next_mode="dispatch", reason="SHIPPED — picks shipped, continue dispatch"
1320
+ )
1321
+
1322
+ # 5b. REPLAN_DONE — a /replan iteration completed. Next is `dispatch`. The
1323
+ # FQ-240 fix: arm `last_replan_drained` (the drained-twice trigger) ONLY
1324
+ # when the /replan was PRODUCTIVE — i.e. it actually refilled / gardened.
1325
+ # An UNPRODUCTIVE /replan (the §1.5 no-op skip, or a 0/0/0 sweep) is NOT
1326
+ # a refill attempt; arming the trigger off it would let a DRAIN that
1327
+ # follows a /replan-that-did-nothing false-stop the loop as DRAINED_TWICE
1328
+ # (finding #240's second shape, distinct from the QWB7 STALE-STAMP half).
1329
+ # Default to PRODUCTIVE when unclassified — the conservative pre-FQ-240
1330
+ # behavior, so this change can never make the loop run *longer*.
1331
+ if outcome.kind is OutcomeKind.REPLAN_DONE:
1332
+ productivity = outcome.replan_productivity or ReplanProductivity.PRODUCTIVE
1333
+ productive = productivity is ReplanProductivity.PRODUCTIVE
1334
+ # QWD benign-drain breaker (FQ-509-sibling). An UNPRODUCTIVE /replan whose
1335
+ # immediately-prior gate was a DRAIN (`last_gate_was_drain`) is the
1336
+ # benign-drain signal: /replan was asked to refill a drained lane and
1337
+ # produced nothing because there is nothing left. Count it. A PRODUCTIVE
1338
+ # /replan, or one not preceded by a DRAIN, resets the streak (the lane is
1339
+ # not in the benign-drain spin). The prior-DRAIN carry is consumed either
1340
+ # way (it describes only the one transition into this /replan). The count
1341
+ # is bumped via the `dos.breaker` fold (docs/258); the trip is NOT checked
1342
+ # here — the benign-drain stop fires on the NEXT DRAIN (rung 5c), so we keep
1343
+ # only the new count and discard the trip bit at this point.
1344
+ if productive:
1345
+ benign_streak = 0
1346
+ elif base.last_gate_was_drain:
1347
+ benign_streak, _ = _breaker_fail(
1348
+ base.consecutive_unproductive_replan_drains,
1349
+ base.max_unproductive_replan_drains,
1350
+ )
1351
+ else:
1352
+ benign_streak = base.consecutive_unproductive_replan_drains
1353
+
1354
+ # #506 / docs/258 — the REPLAN_STALLED breaker, the BROADER sibling, and the
1355
+ # FIRST loop_decide rung whose trip is taken straight off `dos.breaker`. An
1356
+ # UNPRODUCTIVE /replan is a failure of this class REGARDLESS of the prior
1357
+ # gate (a costly 0-refill sweep is the pathology whether or not a DRAIN
1358
+ # preceded it — the gap the benign-drain bracket leaves). A PRODUCTIVE
1359
+ # /replan is a success (the sweep refilled → the stall cleared) and heals the
1360
+ # streak. On the Kth (default 2) consecutive unproductive /replan, STOP +
1361
+ # surface rather than spend another 16-22min/~$5 sweep that the measurement
1362
+ # says will refill nothing. Opt-in/byte-identical: an unclassified /replan
1363
+ # defaults to PRODUCTIVE (FQ-240), so a caller that never classifies
1364
+ # productivity records only successes here and never trips this.
1365
+ if productive:
1366
+ stall_t = breaker.record_success(
1367
+ breaker.BreakerCounts(consecutive=base.consecutive_unproductive_replan),
1368
+ _replan_stall_policy(base),
1369
+ )
1370
+ stall_streak = stall_t.counts.consecutive # 0 — healed
1371
+ else:
1372
+ stall_streak, stall_open = _breaker_fail(
1373
+ base.consecutive_unproductive_replan, base.max_unproductive_replan
1374
+ )
1375
+ if stall_open:
1376
+ return LoopDecision(
1377
+ action=_STOP,
1378
+ next_state=replace(
1379
+ base,
1380
+ last_replan_drained=False,
1381
+ consecutive_unproductive_replan_drains=benign_streak,
1382
+ consecutive_unproductive_replan=stall_streak,
1383
+ last_gate_was_drain=False,
1384
+ ),
1385
+ stop_reason=StopReason.REPLAN_STALLED,
1386
+ surface=True,
1387
+ reason=(
1388
+ f"{stall_streak} consecutive UNPRODUCTIVE /replans — /replan "
1389
+ f"keeps refilling nothing (the measured 0-refill stall, ~45% "
1390
+ f"of loop wall-clock); stop and surface rather than spend "
1391
+ f"another ~16-22min/~$5 sweep that won't refill on a "
1392
+ f"{stall_streak + 1}th identical pass"
1393
+ ),
1394
+ )
1395
+
1396
+ nxt = replace(
1397
+ base,
1398
+ last_replan_drained=productive,
1399
+ consecutive_unproductive_replan_drains=benign_streak,
1400
+ consecutive_unproductive_replan=stall_streak,
1401
+ last_gate_was_drain=False,
1402
+ )
1403
+ if productive:
1404
+ reason = "REPLAN_DONE (productive) — backlog refilled, dispatch next (drained-twice armed)"
1405
+ else:
1406
+ reason = (
1407
+ "REPLAN_DONE (unproductive) — /replan did 0 gardening / 0 refill; "
1408
+ "drained-twice NOT armed (a DRAIN next is not drained-twice)"
1409
+ )
1410
+ if benign_streak:
1411
+ reason += (
1412
+ f"; benign-drain streak {benign_streak}/"
1413
+ f"{base.max_unproductive_replan_drains}"
1414
+ )
1415
+ if stall_streak:
1416
+ reason += (
1417
+ f"; replan-stall streak {stall_streak}/"
1418
+ f"{base.max_unproductive_replan}"
1419
+ )
1420
+ return _continue_or_cap(nxt, next_mode="dispatch", reason=reason)
1421
+
1422
+ # 5c. GATE — /dispatch reached Step 9 with child2 skipped. The typed verdict
1423
+ # + the --gate policy decide what to do (the pure `gate_policy`). The
1424
+ # loop-level part this layer adds is the drained-twice counter.
1425
+ assert outcome.kind is OutcomeKind.GATE and outcome.verdict is not None
1426
+ action: GateAction = gate_policy(outcome.verdict, base.gate_mode)
1427
+
1428
+ # FQ-510 — re-dispatch-INVARIANT BLOCKED stop (the post-run analogue of the
1429
+ # pre-launch PICK_HELD_INVARIANT rung 4b). A BLOCKED gate carries a classified
1430
+ # cause (`outcome.blocked_cause`, the dos.tokens.BlockedReason key the driver
1431
+ # mined from the Outcome cell). When that cause's catalog entry self-heals via
1432
+ # something OTHER than /replan — an `operator_decision` (`self_heals_via=""`),
1433
+ # a false-ship oracle conflation / stale-claim / lying-verdict (`/unstick`),
1434
+ # any non-`/replan` remedy — routing it to /replan is structurally wrong: the
1435
+ # next /dispatch re-derives the identical BLOCKED and the loop spins
1436
+ # BLOCKED→/replan→BLOCKED to the FQ-452 cap (3 iters, ~$15-25/1.5h) before the
1437
+ # spin-breaker catches it. So STOP on the FIRST such BLOCKED + surface, instead
1438
+ # of spinning. Checked BEFORE the FQ-452 counter so the invariant cause never
1439
+ # even increments the stale-stamp streak (it is a different, terminal class).
1440
+ # A BLOCKED whose cause IS /replan-curable (lane_soak_gated /
1441
+ # lane_all_inflight_or_deferred / data_gated_closeout — a genuine refill/stamp
1442
+ # drift) falls through to the FQ-452 path unchanged, as does a BLOCKED with no
1443
+ # classified cause (the driver could not name one) — both preserve today's
1444
+ # behavior exactly. The operator-decision sub-case is ALSO auto-filed once by
1445
+ # the driver's `emit-decision-needed` actuation, so the operator sees it in
1446
+ # the findings queue regardless of this stop.
1447
+ if outcome.verdict is Verdict.BLOCKED and outcome.blocked_cause:
1448
+ _info = blocked_reason_for_key(outcome.blocked_cause)
1449
+ if _info is not None and _info.self_heals_via != "/replan":
1450
+ _route = (
1451
+ "file the operator decision (auto-filed) and resolve it"
1452
+ if _info.operator_action_required
1453
+ else f"run {_info.self_heals_via or '/unstick'} for the structural fix"
1454
+ )
1455
+ return LoopDecision(
1456
+ action=_STOP,
1457
+ next_state=base,
1458
+ stop_reason=StopReason.BLOCKED_REDISPATCH_INVARIANT,
1459
+ surface=True,
1460
+ reason=(
1461
+ f"BLOCKED on {outcome.blocked_cause} ({_info.label}) — a "
1462
+ f"re-dispatch-invariant cause a /replan cannot clear; stop on "
1463
+ f"the first occurrence rather than spinning /replan to the "
1464
+ f"FQ-452 cap. Route: {_route}."
1465
+ ),
1466
+ )
1467
+
1468
+ # FQ-452 — the non-converging-spin breaker. A STALE-STAMP or BLOCKED gate
1469
+ # routes to /replan (under hard) or an inline reconcile (under soft/drive),
1470
+ # but when the root cause is plan-meta `remaining:`-list drift the §1.5
1471
+ # skip-gate never reconciles, /replan exits UNPRODUCTIVE and the very next
1472
+ # /dispatch re-derives the same 0-live gate — forever. Count consecutive
1473
+ # STALE-STAMP/BLOCKED gates that DON'T recover; on the Kth, refuse to spin
1474
+ # another /replan into the same unreconciled list and STOP so the operator's
1475
+ # /replan (now carrying the FQ-452 unconditional remaining-reconcile) runs
1476
+ # once and clears it. A LIVE/DRAIN/RACE verdict means the lane moved off the
1477
+ # stale-stamp cause → reset the streak. The streak deliberately SURVIVES the
1478
+ # intervening REPLAN_DONE (handled in 5b, which never touches it) — that is
1479
+ # what lets three dispatch→/replan→dispatch cycles accumulate to the cap.
1480
+ is_stale_stamp_class = outcome.verdict in (Verdict.STALE_STAMP, Verdict.BLOCKED)
1481
+ if is_stale_stamp_class:
1482
+ stale_streak, tripped = _breaker_fail(
1483
+ base.consecutive_stale_stamp, base.max_stale_stamp
1484
+ )
1485
+ base = replace(base, consecutive_stale_stamp=stale_streak)
1486
+ if tripped:
1487
+ return LoopDecision(
1488
+ action=_STOP,
1489
+ next_state=base,
1490
+ stop_reason=StopReason.STALE_STAMP_UNRECONCILED,
1491
+ surface=True,
1492
+ reason=(
1493
+ f"{stale_streak} consecutive {outcome.verdict.value} gates "
1494
+ f"that /replan did not reconcile — the picker keeps deriving "
1495
+ f"0-live from a stale plan-meta `remaining:` list; stop and "
1496
+ f"run a /replan that reconciles the list (plan-meta-gardening) "
1497
+ f"rather than spinning another /replan into the same drift"
1498
+ ),
1499
+ )
1500
+ else:
1501
+ # LIVE / DRAIN / RACE — the lane moved off the stale-stamp cause.
1502
+ base = replace(base, consecutive_stale_stamp=0)
1503
+
1504
+ # QWD benign-drain — a non-DRAIN gate verdict (LIVE / STALE_STAMP / BLOCKED /
1505
+ # RACE) means the lane is NOT in the benign genuinely-drained spin: clear the
1506
+ # unproductive-replan-drain streak + the prior-DRAIN carry. A DRAIN verdict is
1507
+ # the spin's own signal, so it must NOT reset here — its handling (count-check
1508
+ # + arm the carry) lives in the `counts_toward_drain` branch below.
1509
+ if outcome.verdict is not Verdict.DRAIN:
1510
+ base = replace(
1511
+ base,
1512
+ consecutive_unproductive_replan_drains=0,
1513
+ last_gate_was_drain=False,
1514
+ )
1515
+
1516
+ # soft/drive can return next_mode="stop" (a true DRAIN or a BLOCKED) — the
1517
+ # gate policy already decided the loop stops; name the StopReason from the
1518
+ # verdict and pass `surface` through.
1519
+ if action.next_mode == "stop":
1520
+ stop_reason = (
1521
+ StopReason.DRAIN if outcome.verdict is Verdict.DRAIN else StopReason.BLOCKED
1522
+ )
1523
+ return LoopDecision(
1524
+ action=_STOP,
1525
+ next_state=base,
1526
+ stop_reason=stop_reason,
1527
+ surface=action.surface,
1528
+ reason=action.reason,
1529
+ )
1530
+
1531
+ # reconcile=True (a soft/drive STALE-STAMP) → re-dispatch after an inline
1532
+ # stamp-reconcile pass; never counts toward drained-twice.
1533
+ if action.reconcile:
1534
+ return _continue_or_cap(
1535
+ base,
1536
+ next_mode=action.next_mode, # "dispatch"
1537
+ reconcile=True,
1538
+ reason=action.reason,
1539
+ )
1540
+
1541
+ # next_mode == "replan" (hard on any non-LIVE verdict). Now apply the
1542
+ # drained-twice rule, keyed on `action.counts_toward_drain` — QWB7's rule
1543
+ # is DRAIN-only, so STALE-STAMP/BLOCKED route to /replan but never arm a stop.
1544
+ if action.counts_toward_drain:
1545
+ # verdict was DRAIN. QWD benign-drain breaker FIRST (FQ-509-sibling): if
1546
+ # `max_unproductive_replan_drains` UNPRODUCTIVE /replans have already
1547
+ # bracketed DRAINs on this lane, /replan is structurally unable to refill
1548
+ # it — the lane is benignly drained (every phase shipped/in-flight). This
1549
+ # DRAIN is the one that would route the (K+1)th /replan; STOP instead so
1550
+ # the loop does not spin DRAIN→/replan to the iteration cap. This precedes
1551
+ # the drained-twice check because a benign-drain streak only accumulates
1552
+ # when every intervening /replan was UNPRODUCTIVE — which means
1553
+ # `last_replan_drained` is False (it arms only on a PRODUCTIVE /replan), so
1554
+ # the two stops are mutually exclusive and the benign one is the correct
1555
+ # name for the all-unproductive spin.
1556
+ if (
1557
+ base.consecutive_unproductive_replan_drains
1558
+ >= base.max_unproductive_replan_drains
1559
+ ):
1560
+ return LoopDecision(
1561
+ action=_STOP,
1562
+ next_state=base,
1563
+ stop_reason=StopReason.BENIGN_DRAIN,
1564
+ surface=True,
1565
+ reason=(
1566
+ f"DRAIN after {base.consecutive_unproductive_replan_drains} "
1567
+ f"consecutive UNPRODUCTIVE /replans — the lane is genuinely "
1568
+ f"drained but BENIGN (every phase already shipped/in-flight, "
1569
+ f"nothing to refill); /replan cannot refill it, so stop and "
1570
+ f"re-scope (or wait for the in-flight phases to settle) rather "
1571
+ f"than spinning another /replan to the iteration cap"
1572
+ ),
1573
+ )
1574
+ # If the prior iteration was a PRODUCTIVE /replan that followed a DRAIN
1575
+ # (last_replan_drained — armed by 5b only when the /replan actually
1576
+ # refilled/gardened), /replan tried and could not refill → stop early. An
1577
+ # UNPRODUCTIVE /replan never armed the flag (FQ-240), so a DRAIN after a
1578
+ # /replan-that-did-nothing falls through to a fresh /replan route below
1579
+ # rather than a false drained-twice stop.
1580
+ if base.last_replan_drained:
1581
+ return LoopDecision(
1582
+ action=_STOP,
1583
+ next_state=base,
1584
+ stop_reason=StopReason.DRAINED_TWICE,
1585
+ surface=False,
1586
+ reason=(
1587
+ "DRAIN again after a productive /replan — /replan tried but "
1588
+ "could not refill, lane/portfolio genuinely drained"
1589
+ ),
1590
+ )
1591
+ # The normal first drain: route to /replan, disarm the drained-twice flag
1592
+ # (it only becomes meaningful *after* the /replan completes — REPLAN_DONE
1593
+ # re-arms it) and ARM the benign-drain prior-DRAIN carry so an UNPRODUCTIVE
1594
+ # /replan that follows counts toward the benign-drain breaker.
1595
+ nxt = replace(base, last_replan_drained=False, last_gate_was_drain=True)
1596
+ return _continue_or_cap(
1597
+ nxt, next_mode="replan", reason=action.reason
1598
+ )
1599
+
1600
+ # STALE-STAMP / BLOCKED under `hard` — route to /replan, do NOT touch the
1601
+ # drained-twice flag. A stale-stamp/blocked gate can never arm a false stop.
1602
+ return _continue_or_cap(
1603
+ base, next_mode="replan", reason=action.reason
1604
+ )
1605
+
1606
+
1607
def _continue_or_cap(
    next_state: LoopState,
    *,
    next_mode: str,
    reason: str,
    reconcile: bool = False,
) -> LoopDecision:
    """Turn a would-be continue into a decision, enforcing the iteration cap last.

    Callers reach this helper only once they have decided the loop should keep
    going. The cap is checked here and nowhere else, so any more specific stop
    (drained-twice, a breaker, rate-limit, launch-fail) has already returned
    its own `stop` before this point and therefore outranks the bare cap.

    Args:
        next_state: The loop state to carry forward; its `iteration` and
            `max_iterations` fields drive the cap check.
        next_mode: The mode the next iteration would run in.
        reason: Human-readable explanation attached to a continue decision.
        reconcile: Passed through unchanged onto a continue decision.

    Returns:
        A stop decision with `StopReason.ITERATION_CAP` (state left as given,
        not surfaced) when `iteration` has reached `max_iterations`; otherwise
        a continue decision whose state has `iteration` advanced by one.
    """
    cap_reached = next_state.iteration >= next_state.max_iterations

    if not cap_reached:
        # A slot remains for `next_mode`: bump the counter and keep going.
        return LoopDecision(
            action=_CONTINUE,
            next_state=replace(next_state, iteration=next_state.iteration + 1),
            next_mode=next_mode,
            reconcile=reconcile,
            reason=reason,
        )

    # The iteration that just ran was the last permitted one — no slot is left,
    # so the caller's `reason` is replaced by the cap explanation.
    return LoopDecision(
        action=_STOP,
        next_state=next_state,
        stop_reason=StopReason.ITERATION_CAP,
        surface=False,
        reason=f"reached max_iterations ({next_state.max_iterations})",
    )
1638
+
1639
+
1640
+ # ---------------------------------------------------------------------------
1641
+ # Wait-marker budget (OC2 billing addendum, 2026-05-19).
1642
+ #
1643
+ # Every `claude -p` keep-alive marker is its own assistant turn that replays the
1644
+ # full system+skill+context out of cache. Session 4b4ff97c burned 252 markers /
1645
+ # ~26M cache-read tokens / ~$7.80 in one run (91% of the run's cache_read). The
1646
+ # SKILL-level prose caps (/dispatch 2-per-child, /dispatch-loop 4-per-run) are
1647
+ # prose the model must remember; this is the runtime lever — a pure decision the
1648
+ # loop can consult before emitting a marker, so a marker that won't earn its
1649
+ # cache-read cost is refused, not emitted.
1650
+ #
1651
+ # `headless_telemetry.py`'s `keepalive_poll` flag (fires at >=5 markers) is the
1652
+ # POST-HOC surface; this is its PRE-HOC decision-surface sibling. The default
1653
+ # `max_markers` here (4) matches the /dispatch-loop SKILL's 4-per-run prose cap,
1654
+ # so the runtime refusal lands one marker before the telemetry flag would fire.
1655
+ # ---------------------------------------------------------------------------
1656
+
1657
+
1658
@dataclass(frozen=True)
class WaitMarkerDecision:
    """Verdict on emitting a single keep-alive wait-marker, with its rationale.

    An immutable record: the allow/refuse bit, the marker count to hand to the
    *next* decision, and a human-readable explanation.
    """

    # True -> emit the marker; False -> refuse it.
    allow: bool
    # Count carried into the next decision (incremented iff the marker was allowed).
    markers_emitted: int
    # Operator-facing explanation of the verdict.
    reason: str
1670
+
1671
+
1672
def wait_marker_budget(
    markers_emitted: int,
    max_markers: int = 4,
) -> WaitMarkerDecision:
    """Rule on whether one more keep-alive wait-marker may be emitted.

    PURE: the caller supplies the running marker count and gets back the
    allow/refuse verdict together with the count to carry into the next call.
    Nothing is stored here.

    On a refusal the loop is expected to stop holding the turn open with no-op
    Bash calls and wait on the existing Bash `<task-notification>` instead
    (which fires on real exit regardless). OC1's deterministic orphan sweep is
    the safety net, so refusing a marker cannot silently lose a child.

    The default `max_markers` of 4 mirrors the /dispatch-loop SKILL's per-run
    prose cap, so the runtime refusal lands one marker before
    `headless_telemetry.py`'s `keepalive_poll` flag (>=5) would fire. Every
    marker past the budget is pure cache-replay cost (~$0.03-0.10) for no
    work; this budget is the cost guard the prose cap could only suggest.

    Args:
        markers_emitted: Number of wait-markers already emitted.
        max_markers: Budget; once `markers_emitted` reaches it, markers are
            refused.

    Returns:
        A `WaitMarkerDecision`. When allowed, its `markers_emitted` is the
        input plus one; when refused, the input count is returned unchanged.
    """
    exhausted = markers_emitted >= max_markers

    if not exhausted:
        # Under budget: allow this marker and carry the incremented count.
        return WaitMarkerDecision(
            allow=True,
            markers_emitted=markers_emitted + 1,
            reason=f"wait-marker {markers_emitted + 1}/{max_markers} — turn held open",
        )

    # Budget spent: refuse, leave the count as-is, and say what to do instead.
    refusal = (
        f"wait-marker budget exhausted ({markers_emitted}/{max_markers}) "
        f"— each further marker replays full context out of cache for no "
        f"work; wait on the Bash task-notification, OC1's orphan sweep "
        f"is the safety net"
    )
    return WaitMarkerDecision(
        allow=False,
        markers_emitted=markers_emitted,
        reason=refusal,
    )
1707
+
1708
+
1709
+ # The generalized verdict over this same count — `noop_streak.classify` — re-aims the
1710
+ # arithmetic above off "markers emitted" onto "no-op turns since the last forward
1711
+ # delta" (docs/259 §Follow-up 1). It is a SIBLING module, not a call from here (no new
1712
+ # import edge into loop_decide); a test pins that the two agree on the allow/refuse bit.
1713
+
1714
+
1715
def propose_tighter_budget(observed_markers: int, current_max: int = 4) -> int:
    """Suggest a tighter wait-marker cap given an OBSERVED keep-alive burst. PURE.

    This closes the audit→budget loop (docs/259 §Follow-up 3). The
    `trajectory-audit` `keepalive_poll` finding reports that one session
    emitted `observed_markers` keep-alive markers while the budget was
    `current_max`; the value returned here is a cap under which the pre-hoc
    lever (`wait_marker_budget`) would have refused sooner. It is ADVISORY
    only — a number for a human or host to act on, NEVER auto-applied. The
    kernel computes it; nothing here feeds it back into `wait_marker_budget`
    (the PDP/PEP line).

    The result is `max(1, min(current_max, observed_markers - 1))`, built from
    three steps:

    * Start from `observed_markers - 1`: refuse one marker short of the
      burst's length, so a repeat of the same wait would land under budget.
    * Cap it at `current_max`: the proposal is never LOOSER than the cap in
      force (monotone-down). This is the load-bearing clamp. A HUGE burst —
      252 markers under a 4-cap — yields `min(4, 251) = 4`, i.e. NO
      tightening, because 252 ≫ 4 does not say "4 is too loose"; it says the
      cap was not ENFORCED (the hook was unwired or bypassed). The remedy is
      to wire `dos hook marker`, not to lower a number that was never
      consulted; the caller surfaces the `observed > current_max` alarm
      separately. The clamp yields a genuinely tighter number only when the
      burst sat *within* the current cap yet still tripped the telemetry
      threshold (e.g. observed 5 under a generous current 8 → propose 4).
    * Floor it at 1: a budget of 0 would refuse the FIRST legitimate
      wait-marker outright, trapping a loop with a real reason to wait a turn.

    Net effect: monotone-down, floored at 1, and deliberately conservative —
    a cost-guard proposal never loosens, and a burst that proves
    non-enforcement produces no spurious "lower the cap" noise.

    Args:
        observed_markers: Keep-alive markers seen in the audited session.
        current_max: The wait-marker cap that was in force.

    Returns:
        The proposed cap.
    """
    # Step 1: one marker short of the observed burst.
    one_short = observed_markers - 1
    # Step 2: never looser than the cap already in force.
    capped = one_short if one_short < current_max else current_max
    # Step 3: never below a single marker.
    return capped if capped > 1 else 1