dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/health.py ADDED
@@ -0,0 +1,552 @@
1
+ """Pre-dispatch lane-health gate — query a lane's *startability* BEFORE a
2
+ child launch, and route to `/unstick` / `/replan` instead of burning a child
3
+ to rediscover a knowable-at-t0 blocker.
4
+
5
+ Motivation (the 2026-06-01 incident this module exists for): a `/dispatch-loop`
6
+ auto-picked a lane, spent ~$9 and ~40 min launching a full `/dispatch` child,
7
+ and only THEN discovered two blockers that were both knowable at second zero —
8
+ (1) the lane's last 8 dispatch runs had all failed on the *same* renderer
9
+ sidecar-drop (a recurring structural blocker that only `/unstick` resolves), and
10
+ (2) the auto-picked lane structurally overlapped a live sibling lease. The loop's
11
+ existing breakers (drained-twice, packet-judge, recurring-wedge) all fire
12
+ *after* a child has run. This gate fires *before*.
13
+
14
+ Design — mirrors `dos.gate_classify`:
15
+
16
+ * `lane_health(...)` is a **pure function**: facts in (live leases, the lane's
17
+ recent verdict history, the lane tree), a typed `HealthVerdict` out. No I/O,
18
+ so it is replay-tested in isolation.
19
+ * `collect_lane_history(...)` is the thin I/O wrapper: it shells `git log` over
20
+ recent dispatch/dispatch-loop archive commits and parses each into a
21
+ `RunRecord`. The caller (the loop's Step 0, or `dos health` CLI) composes the
22
+ two.
23
+
24
+ The gate is **advisory-but-actionable**: it never blocks acquisition itself
25
+ (that is the arbiter's job); it returns a *route* the loop acts on. A
26
+ `route_unstick` means "this lane has been failing the same way — run /unstick
27
+ first"; a `route_replan` means "this lane is soak/data-gated — /replan, not
28
+ /unstick"; `proceed` means "nothing in the history says don't start."
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import json
34
+ import re
35
+ import subprocess
36
+ import sys
37
+ from dataclasses import dataclass, field
38
+ from enum import Enum
39
+
40
+ from dos.lane_overlap import overlap_verdict
41
+
42
# Default cap on how many matched dispatch archive records feed the health
# decision for a lane (it is the default of the CLI's `--window` argument).
# 12 covers ~a day of an active fleet without walking deep history; tune via
# the CLI arg.
DEFAULT_HISTORY_WINDOW: int = 12

# A lane is "recurring-blocked" when at least this many of its recent runs are
# non-shipping failures on the SAME cause key. 3 matches the recurrence floor
# the post-hoc recurring-wedge router uses, so the pre-gate and the post-gate
# agree on what "recurring" means.
RECURRING_THRESHOLD: int = 3
51
+
52
+
53
class HealthAction(str, Enum):
    """The route the health gate hands back to the loop for a lane.

    Every member is also a `str`; its value is the lower-case token of the
    action.
    """

    # history is clean (or shipping) — launch
    PROCEED = "proceed"
    # recurring structural blocker — /unstick first
    ROUTE_UNSTICK = "route_unstick"
    # soak/data-gated — /replan, not /unstick
    ROUTE_REPLAN = "route_replan"
    # a live lease's tree collides — pick elsewhere
    OVERLAP_BLOCK = "overlap_block"
59
+
60
+
61
# Verdict vocabulary of a dispatch/dispatch-loop archive commit. Meant to stay
# in sync with dos.verdicts; it is duplicated here as frozensets purely so a
# log line can be scanned cheaply (the gate must not depend on the full
# verdict module for that).
_SHIPPING = frozenset(("LIVE", "SHIPPED", "SHIPPED-CLEAN"))
_NONSHIP_BLOCKER = frozenset(
    ("ERROR", "WEDGE", "BLOCKED", "BLOCKED-OUTCOME", "STALLED")
)
_DRAIN = frozenset(("DRAIN",))

# Cause cues that send a lane to /replan instead of /unstick: the lane is
# soak/data-gated, so there is no structural defect for /unstick to fix.
# Each cue is substring-matched against the cause text of the archive commit.
_REPLAN_CAUSE_CUES = (
    "soak",
    "soak-gated",
    "data-gated",
    "data gated",
    "awaiting a live run",
    "drain",
    "drained",
)

# Captures the verdict token of a `verdict=TOKEN` pair.
_VERDICT_RE = re.compile(r"verdict=([A-Z][A-Z-]*)")

# Captures the ordinal of a recurrence-count phrase that dispatch archives
# carry verbatim, e.g. "6th-consecutive", "8th recurrence", "5th consecutive".
# The ordinal is a strong recurring signal even within a short window.
_RECURRENCE_RE = re.compile(r"(\d+)(?:st|nd|rd|th)[ -](?:consecutive|recurrence)")

# A /dispatch-loop STOP archive halts the loop and (almost always) hands the
# operator/next-sweep a `/unstick` directive, e.g.
#   "… 0 picks; STOP recurring BLOCKED (APPLY_LANE_BLOCKED_MESH) → /unstick"
#   "… STOP override on recurring APPLY_LANE_POST_UNSTICK_STOP_RESPAWN … → /unstick"
# The STOP is an *operator-visible directive* meaning "do not re-iterate this
# lane until something lands." A loop that respawns the same lane anyway —
# before any operator action or structural commit clears the directive — is
# the POST-STOP-respawn doom-loop (`APPLY_LANE_POST_UNSTICK_STOP_RESPAWN`,
# logged 9× in 24h across the apply/tailor/CD lanes; cost-anchor ~$43 per
# /unstick cycle).
# `_STOP_RE` finds the STOP token; `_UNSTICK_ROUTE_RE` confirms the STOP routed
# to /unstick (as opposed to a self-healing /replan stamp-drift STOP, which the
# existing recurring-blocker rule already handles).
_STOP_RE = re.compile(r"\bSTOP\b")
_UNSTICK_ROUTE_RE = re.compile(r"/unstick\b")

# cause_key emitted by the POST-STOP-respawn rule. Kept verbatim equal to the
# reason_class the hand-rolled PRE-SCREEN WEDGE path has been writing into
# archive subjects, so /unstick clusters the pre-gate STOP and the in-the-wild
# respawns under one cause. (Lower-cased to match `BlockedReason` value style.)
POST_STOP_RESPAWN_CAUSE_KEY = "post_stop_respawn_no_operator_action"
102
+
103
+
104
@dataclass(frozen=True)
class RunRecord:
    """Parsed view of one recent dispatch/dispatch-loop archive commit."""

    run_ts: str
    # normalized token (ERROR/WEDGE/DRAIN/SHIPPED/…), "" if none
    verdict: str
    # free-text cause tail of the commit subject
    cause: str
    # parsed "Nth-consecutive/recurrence", 0 if absent
    recurrence_ordinal: int
    # the full commit subject (for evidence)
    subject: str

    @property
    def is_shipping(self) -> bool:
        """Whether the verdict is one of the shipping tokens."""
        return self.verdict in _SHIPPING

    @property
    def is_blocker(self) -> bool:
        """Whether the verdict is one of the non-shipping blocker tokens."""
        return self.verdict in _NONSHIP_BLOCKER

    @property
    def is_drain(self) -> bool:
        """Whether the verdict is a drain token."""
        return self.verdict in _DRAIN

    @property
    def is_stop(self) -> bool:
        """Whether this archive is a /dispatch-loop STOP (the loop halted
        itself).

        The whole subject is searched rather than the cause tail, because on
        loop archives the STOP token sits before the em-dash.
        """
        return _STOP_RE.search(self.subject) is not None

    @property
    def is_stop_with_unstick(self) -> bool:
        """Whether this is a STOP that routed to /unstick.

        That combination is the operator-visible "do not re-iterate this lane
        until something lands" directive the POST-STOP-respawn guard keys on.
        A STOP that routed only to /replan is a stamp-drift halt already
        covered by the recurring-blocker rule, so it is deliberately excluded.
        """
        if not self.is_stop:
            return False
        return _UNSTICK_ROUTE_RE.search(self.subject) is not None

    @property
    def is_operator_action(self) -> bool:
        """Whether the subject marks a deliberate operator action.

        Detected by an explicit `operator-action:` token (case-insensitive) in
        the subject. Such a record, newer than a STOP, means the directive was
        answered and the lane may respawn.
        """
        marker = "operator-action:"
        return marker in self.subject.lower()
148
+
149
+
150
@dataclass(frozen=True)
class HealthVerdict:
    """Immutable result of the pre-dispatch health decision for one lane."""

    action: HealthAction
    reason: str
    # the recurring cause, when action==ROUTE_*
    cause_key: str = ""
    runs_considered: int = 0
    # how many of those were same-cause blockers
    blocker_runs: int = 0
    # the colliding live lease's lane, when OVERLAP_BLOCK
    overlap_lane: str = ""
    evidence: tuple[str, ...] = field(default_factory=tuple)

    @property
    def should_proceed(self) -> bool:
        """True only when the action is `HealthAction.PROCEED`."""
        return self.action == HealthAction.PROCEED
163
+
164
+
165
def _normalize_cause(cause: str) -> str:
    """Reduce a free-text cause tail to a stable key for same-cause counting.

    The reduction is intentionally coarse. The text is lower-cased, the
    recurrence phrase and every digit run are removed, and any character
    outside `a-z . _ / space -` becomes a space. Known recurring defects are
    then mapped to one canonical key each, so that different wordings of the
    same defect ("renderer .prompts.json sidecar drop 6th consecutive" vs
    "renderer-sidecar-drop preflight refuse 8th recurrence") are counted
    together. Anything else falls back to the first eight words longer than
    two characters joined with underscores, or "uncategorized" when no such
    word exists.
    """
    text = _RECURRENCE_RE.sub("", cause.lower())
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^a-z._/ -]", " ", text)

    def mentions(*needles: str) -> bool:
        return any(needle in text for needle in needles)

    # Canonical synonyms: the threshold is applied per key, so the many
    # phrasings of one cause must collapse together. Order matters — the
    # most specific defect is tested first.
    if mentions("sidecar", ".prompts", "prompts.json"):
        return "renderer_sidecar_drop"
    if mentions("ship-oracle", "ship_oracle", "false-positive"):
        return "ship_oracle_false_positive"
    if "stale" in text and "claim" in text:
        return "stale_claim_false_block"
    if mentions("soak", "data-gated", "data gated"):
        return "lane_soak_or_data_gated"
    if mentions("overlap", "collision"):
        return "lane_overlap_collision"

    words = [w for w in re.split(r"[ /._-]+", text) if len(w) > 2]
    return "_".join(words[:8]) or "uncategorized"
194
+
195
+
196
def _cause_routes_replan(cause: str) -> bool:
    """Return True when the lower-cased cause text contains any /replan cue."""
    lowered = cause.lower()
    for cue in _REPLAN_CAUSE_CUES:
        if cue in lowered:
            return True
    return False
199
+
200
+
201
def lane_health(
    lane: str,
    *,
    lane_tree: list[str],
    live_leases: list[dict],
    history: list[RunRecord],
    own_lease_ts: str = "",
    recurring_threshold: int = RECURRING_THRESHOLD,
) -> HealthVerdict:
    """Pure pre-dispatch health decision for `lane`.

    Args:
      lane          — the lane about to be dispatched.
      lane_tree     — that lane's file-glob tree (for the overlap check).
      live_leases   — dicts with at least {lane, lane_kind, tree, loop_ts};
                      the loop's OWN lease (own_lease_ts) is excluded.
      history       — recent RunRecords for this lane, newest first (from
                      `collect_lane_history`).
      own_lease_ts  — this loop's own lease ts, so its own lease never
                      self-blocks the overlap check.
      recurring_threshold — same-cause blocker count that trips ROUTE_*.

    Returns:
      A `HealthVerdict` whose `action` is the first matching rule below.

    Decision order (first match wins):
      1. OVERLAP_BLOCK — a *foreign* live lease's tree collides with lane_tree
         (via the fixed `overlap_verdict`). Highest priority: starting into a
         real overlap guarantees a mutual wedge.
      2. ROUTE_UNSTICK (post-STOP respawn) — the most recent meaningful lane
         event is a STOP→/unstick directive with no shipping run or explicit
         operator action newer than it. The loop is respawning a lane the
         previous loop halted; a STOP is an operator-visible "do not re-iterate
         until something lands" directive, not a mesh-state the next iteration
         can clear. Trips on the FIRST such respawn (not the threshold) because
         one ignored STOP is already the doom-loop. See POST_STOP_RESPAWN_*.
      3. ROUTE_UNSTICK / ROUTE_REPLAN — the recent history is dominated by the
         SAME-cause non-shipping blocker at/over the threshold. Route by cause:
         soak/data-gated → /replan; structural → /unstick.
      4. PROCEED — anything else (a shipping run in the window, a clean drain,
         mixed causes below threshold, or no history at all).
    """
    # 1. foreign-lease overlap (uses the fixed exact-glob-aware overlap_verdict)
    for lease in live_leases:
        lts = str(lease.get("loop_ts", ""))
        if own_lease_ts and lts == own_lease_ts:
            continue
        llane = str(lease.get("lane", ""))
        if llane == lane:
            continue  # same-lane is the arbiter's concern, not an overlap signal
        ltree = list(lease.get("tree", []) or [])
        if not ltree or not lane_tree:
            continue  # unknown blast radius handled by the arbiter
        ov = overlap_verdict(list(lane_tree), ltree)
        if not ov.admissible:
            return HealthVerdict(
                action=HealthAction.OVERLAP_BLOCK,
                reason=(f"lane {lane!r} tree collides with live lease "
                        f"{llane!r} (loop {lts}): {ov.reason}"),
                overlap_lane=llane,
                runs_considered=len(history),
                evidence=(f"overlap:{llane}:{ov.verdict.value}",),
            )

    # 2. post-STOP respawn — the previous loop halted this lane with a /unstick
    # directive and nothing has cleared it since. Walk newest-first: the first
    # record that is a shipping run OR an explicit operator action means the
    # directive was answered (lane recovered) → fall through. The first record
    # that is a STOP→/unstick, reached before any such clearing event, means the
    # respawn is re-entering an unanswered STOP → route /unstick on this first
    # respawn rather than burning a child to rediscover the same wedge.
    for rec in history:  # newest-first
        if rec.is_shipping or rec.is_operator_action:
            break  # the STOP (if any) was cleared — not a doom-loop respawn
        if rec.is_stop_with_unstick:
            return HealthVerdict(
                action=HealthAction.ROUTE_UNSTICK,
                reason=(
                    f"lane {lane!r} was STOPped with a /unstick directive at "
                    f"{rec.run_ts or 'a recent archive'} and no shipping run or "
                    f"operator action has landed since — respawning re-enters an "
                    f"unanswered STOP. Route /unstick (or take an operator action) "
                    f"before launching a child. STOP subject: {rec.subject[:140]}"
                ),
                cause_key=POST_STOP_RESPAWN_CAUSE_KEY,
                runs_considered=len(history),
                blocker_runs=1,
                evidence=(f"stop:{rec.run_ts}:/unstick",),
            )

    # 3. recurring same-cause blocker in the recent window
    if history:
        # group blocker runs by normalized cause key
        by_cause: dict[str, list[RunRecord]] = {}
        for rec in history:
            if rec.is_blocker:
                by_cause.setdefault(_normalize_cause(rec.cause), []).append(rec)
        if by_cause:
            # dominant cause = the one with the most blocker runs
            cause_key, recs = max(by_cause.items(), key=lambda kv: len(kv[1]))
            # an explicit "Nth-consecutive" ordinal in the window is itself a
            # recurrence signal even if the window only captured a few of them
            max_ordinal = max((r.recurrence_ordinal for r in recs), default=0)
            tripped = len(recs) >= recurring_threshold or max_ordinal >= recurring_threshold
            # a shipping run more recent than every blocker means the lane
            # recovered — do NOT route (the blocker is stale history)
            newest_ship = next((i for i, r in enumerate(history) if r.is_shipping), None)
            newest_blocker = next((i for i, r in enumerate(history) if r.is_blocker), None)
            recovered = (
                newest_ship is not None
                and newest_blocker is not None
                and newest_ship < newest_blocker  # ship is newer (lower index)
            )
            if tripped and not recovered:
                # the route is decided from the newest record of the dominant
                # cause (recs preserves history's newest-first order)
                sample_cause = recs[0].cause.strip()
                action = (
                    HealthAction.ROUTE_REPLAN
                    if _cause_routes_replan(sample_cause)
                    else HealthAction.ROUTE_UNSTICK
                )
                route = "replan" if action == HealthAction.ROUTE_REPLAN else "unstick"
                # report the larger of the counted runs and the parsed ordinal
                n = max(len(recs), max_ordinal)
                return HealthVerdict(
                    action=action,
                    reason=(f"lane {lane!r} has {n} recent dispatch run(s) "
                            f"blocked on the same cause "
                            f"({cause_key}) — route to /{route} before "
                            f"spending another child launch. Sample: "
                            f"{sample_cause[:120]}"),
                    cause_key=cause_key,
                    runs_considered=len(history),
                    blocker_runs=len(recs),
                    evidence=tuple(f"{r.run_ts}:{r.verdict}" for r in recs[:5]),
                )

    # 4. nothing says don't start
    return HealthVerdict(
        action=HealthAction.PROCEED,
        reason=(f"lane {lane!r} health OK — "
                + (f"{len(history)} recent run(s), no recurring same-cause "
                   "blocker" if history else "no recent dispatch history")),
        runs_considered=len(history),
    )
341
+
342
+
343
+ # ── I/O wrapper: parse recent dispatch archive commits into RunRecords ───────
344
+
345
def parse_archive_subject(subject: str, lane: str) -> RunRecord | None:
    """Parse one `git log --oneline` subject into a RunRecord, or None if it is
    not a dispatch/dispatch-loop archive for `lane`.

    Recognized shapes (both carry `verdict=` or a bracketed outcome):
      `docs/dispatch: archive <ts> — <tag> → verdict=ERROR, child2 …`
      `docs/dispatch-loop: archive <ts> — N iters …, 0 picks shipped (<LANE> lane; … verdict=ERROR …)`

    Args:
      subject — the commit subject (short sha already removed).
      lane    — the lane to attribute to, matched case-insensitively; the
                empty string means "all lanes" and disables the lane filter
                (the per-`/dispatch` archives do not always carry the lane, so
                those are matched only in that mode — see
                `collect_lane_history`).

    Returns:
      A `RunRecord`, or None when the subject is not a dispatch archive or
      does not name `lane`.

    The lane must appear in one of the conventions dispatch archives use:
      - "<lane> lane"       the dispatch-loop archive convention
      - "scope <lane>"      a --scope <lane> hand-run / inherited child
      - "(<lane>;"          the parenthetical lane tag on some loop archives
      - "<LANE>_LANE_..."   the reason_class convention (APPLY_LANE_BLOCKED_MESH,
                            TAILOR_LANE_FOCUS_..., CD_LANE_OPERATOR_...). STOP
                            archives frequently name the lane ONLY here, so
                            without this clause the post-STOP-respawn guard
                            would fail to attribute the very respawns it
                            exists to catch.
    The lane name must not be a fragment of a longer alphanumeric word: lane
    "CD" is not named by "ABCD lane", "XCD_LANE_…" or "scope CDN". A plain
    substring test would attribute another lane's runs to this one.
    """
    # "docs/dispatch" is a prefix of "docs/dispatch-loop", so a single test
    # covers both archive kinds.
    if "archive" not in subject or "docs/dispatch" not in subject:
        return None
    if lane:
        name = re.escape(lane.lower())
        lane_named = re.search(
            # "<lane> lane" / "<lane>_lane_": no alphanumeric glued on the left
            rf"(?<![a-z0-9]){name}(?: lane|_lane_)"
            # "scope <lane>": no alphanumeric glued on the right
            rf"|scope {name}(?![a-z0-9])"
            # "(<lane>;" is already delimited on both sides
            rf"|\({name};",
            subject.lower(),
        )
        if lane_named is None:
            return None
    m_ts = re.search(r"archive\s+(\d{8}T\d{6}Z|\d{8}T\d{4}Z)", subject)
    run_ts = m_ts.group(1) if m_ts else ""
    m_v = _VERDICT_RE.search(subject)
    verdict = m_v.group(1) if m_v else ""
    # cause = the tail after the verdict token (or after the em-dash)
    cause = subject
    if m_v:
        cause = subject[m_v.end():].lstrip(" ,—-")
    elif "—" in subject:
        cause = subject.split("—", 1)[1].strip()
    m_r = _RECURRENCE_RE.search(subject)
    ordinal = int(m_r.group(1)) if m_r else 0
    return RunRecord(
        run_ts=run_ts, verdict=verdict, cause=cause,
        recurrence_ordinal=ordinal, subject=subject,
    )
399
+
400
+
401
def collect_lane_history(
    lane: str,
    *,
    git_log_lines: list[str],
    window: int = DEFAULT_HISTORY_WINDOW,
) -> list[RunRecord]:
    """Parse `git log --oneline` output into recent RunRecords for `lane`.

    Args:
      lane          — lane to filter on; pass an empty string to collect
                      across ALL lanes (the per-`/dispatch` archives that do
                      not name a lane are then included).
      git_log_lines — the raw `git log --oneline -<N> -- docs/_dispatch_loops/
                      docs/_chained_runs/` output (one `<sha> <subject>` per
                      line, newest first).
      window        — maximum number of matched records to return.

    Returns:
      At most `window` records, in the input's newest-first order. A
      non-positive `window` yields an empty list.
    """
    # The cap is only tested after a record is appended, so without this guard
    # a window of 0 (or less) would still let one record through.
    if window <= 0:
        return []
    out: list[RunRecord] = []
    for raw in git_log_lines:
        line = raw.strip()
        if not line:
            continue
        # drop the leading short-sha from `--oneline`; a line with no space
        # is passed through whole
        _sha, sep, rest = line.partition(" ")
        rec = parse_archive_subject(rest if sep else line, lane)
        if rec is None:
            continue
        out.append(rec)
        if len(out) >= window:
            break
    return out
427
+
428
+
429
+ # ── CLI layer (the I/O composition: git log + leases → health JSON) ──────────
430
+
431
# Directories that hold the archive commits; used as the git-log pathspec so
# the scan only walks commits touching them.
_ARCHIVE_PATHSPEC = ("docs/_dispatch_loops/", "docs/_chained_runs/")


def _git_log_subjects(scan_depth: int) -> list[str]:
    """Run `git log --oneline -<scan_depth> -- <archive dirs>` and return its
    non-blank output lines.

    Best-effort by design: if git cannot be run, times out (20 s), or exits
    non-zero, the result is [] so the gate degrades to "no history → proceed"
    instead of crashing the loop's Step 0.
    """
    argv = ["git", "log", "--oneline", f"-{scan_depth}", "--"]
    argv.extend(_ARCHIVE_PATHSPEC)
    try:
        completed = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=20,
        )
    except (OSError, subprocess.SubprocessError):
        return []
    if completed.returncode:
        return []
    return [line for line in completed.stdout.splitlines() if line.strip()]
452
+
453
+
454
def check(
    lane: str,
    *,
    lane_tree: list[str],
    live_leases: list[dict],
    own_lease_ts: str = "",
    window: int = DEFAULT_HISTORY_WINDOW,
    scan_depth: int | None = None,
    git_log_lines: list[str] | None = None,
) -> HealthVerdict:
    """Gather the lane's recent history and run `lane_health` in one call.

    History comes from git log unless `git_log_lines` is supplied (for
    testing). `scan_depth` is the number of commits to walk; when it is not
    given (or is 0) it falls back to the larger of 40 and 4× the window, so a
    lane-filtered scan still has a chance to find `window` matches. `window`
    caps how many matched records feed the decision.
    """
    if git_log_lines is None:
        depth = scan_depth or max(40, window * 4)
        git_log_lines = _git_log_subjects(depth)
    records = collect_lane_history(
        lane,
        git_log_lines=git_log_lines,
        window=window,
    )
    return lane_health(
        lane,
        lane_tree=lane_tree,
        live_leases=live_leases,
        history=records,
        own_lease_ts=own_lease_ts,
    )
478
+
479
+
480
def verdict_to_dict(v: HealthVerdict) -> dict:
    """Flatten a HealthVerdict into a plain dict: the action becomes its
    string value and the evidence tuple becomes a list."""
    return dict(
        action=v.action.value,
        should_proceed=v.should_proceed,
        reason=v.reason,
        cause_key=v.cause_key,
        runs_considered=v.runs_considered,
        blocker_runs=v.blocker_runs,
        overlap_lane=v.overlap_lane,
        evidence=list(v.evidence),
    )
491
+
492
+
493
def cmd_check(args: argparse.Namespace) -> int:
    """`dos health --lane TM --tree '...' --leases-json '...'` → health JSON.

    Leases + the lane tree are passed IN (the live-lease registry and the
    lane→tree resolver are host-app concerns — the job side supplies them); the
    history is gathered here via git log, or read from `--git-log-file` when
    that is given. The verdict is printed to stdout as indented, key-sorted
    JSON.

    Lease input is best-effort: `--leases-json` that is malformed, or that
    does not decode to a JSON array, is treated as "no live leases", and
    array entries that are not objects are dropped. Otherwise a stray string
    or number would crash the health check when it reads lease fields.

    Returns:
      An exit code mirroring the action so a shell caller can branch without
      parsing JSON: 0 PROCEED, 3 ROUTE_UNSTICK, 4 ROUTE_REPLAN,
      6 OVERLAP_BLOCK.
    """
    # Strip each glob so "a/**, b/**" does not yield an entry with a leading
    # space.
    lane_tree = [t.strip() for t in (args.tree or "").split(",") if t.strip()]
    live_leases: list[dict] = []
    if args.leases_json:
        try:
            parsed = json.loads(args.leases_json)
        except (ValueError, TypeError):
            parsed = []
        if isinstance(parsed, list):
            live_leases = [lease for lease in parsed if isinstance(lease, dict)]
    git_lines = None
    if args.git_log_file:
        with open(args.git_log_file, encoding="utf-8") as fh:
            git_lines = [ln for ln in fh.read().splitlines() if ln.strip()]
    v = check(
        args.lane, lane_tree=lane_tree, live_leases=live_leases,
        own_lease_ts=args.own_lease_ts or "", window=args.window,
        git_log_lines=git_lines,
    )
    print(json.dumps(verdict_to_dict(v), indent=2, sort_keys=True))
    return {
        HealthAction.PROCEED: 0,
        HealthAction.ROUTE_UNSTICK: 3,
        HealthAction.ROUTE_REPLAN: 4,
        HealthAction.OVERLAP_BLOCK: 6,
    }[v.action]
525
+
526
+
527
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the `dos-health` command line."""
    parser = argparse.ArgumentParser(
        prog="dos-health",
        description="Pre-dispatch lane-health gate — query a lane's startability.",
    )
    add = parser.add_argument
    add("--lane", required=True, help="the lane about to be dispatched")
    add(
        "--tree",
        default="",
        help="comma-separated file-glob tree for the lane (overlap check)",
    )
    add(
        "--leases-json",
        default="",
        help="JSON array of live leases [{lane,lane_kind,tree,loop_ts}]",
    )
    add(
        "--own-lease-ts",
        default="",
        help="this loop's own lease ts (never self-blocks)",
    )
    add(
        "--window",
        type=int,
        default=DEFAULT_HISTORY_WINDOW,
        help=f"matched records to consider (default {DEFAULT_HISTORY_WINDOW})",
    )
    add(
        "--git-log-file",
        default="",
        help="read git-log subjects from a file instead of running git (testing)",
    )
    return parser
544
+
545
+
546
def main(argv: list[str] | None = None) -> int:
    """Parse `argv` (the process arguments when None) and run the health
    check, returning its exit code."""
    parser = build_parser()
    return cmd_check(parser.parse_args(argv))


if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())