dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/health.py
ADDED
|
@@ -0,0 +1,552 @@
|
|
|
1
|
+
"""Pre-dispatch lane-health gate — query a lane's *startability* BEFORE a
|
|
2
|
+
child launch, and route to `/unstick` / `/replan` instead of burning a child
|
|
3
|
+
to rediscover a knowable-at-t0 blocker.
|
|
4
|
+
|
|
5
|
+
Motivation (the 2026-06-01 incident this module exists for): a `/dispatch-loop`
|
|
6
|
+
auto-picked a lane, spent ~$9 and ~40 min launching a full `/dispatch` child,
|
|
7
|
+
and only THEN discovered two blockers that were both knowable at second zero —
|
|
8
|
+
(1) the lane's last 8 dispatch runs had all failed on the *same* renderer
|
|
9
|
+
sidecar-drop (a recurring structural blocker that only `/unstick` resolves), and
|
|
10
|
+
(2) the auto-picked lane structurally overlapped a live sibling lease. The loop's
|
|
11
|
+
existing breakers (drained-twice, packet-judge, recurring-wedge) all fire
|
|
12
|
+
*after* a child has run. This gate fires *before*.
|
|
13
|
+
|
|
14
|
+
Design — mirrors `dos.gate_classify`:
|
|
15
|
+
|
|
16
|
+
* `lane_health(...)` is a **pure function**: facts in (live leases, the lane's
|
|
17
|
+
recent verdict history, the lane tree), a typed `HealthVerdict` out. No I/O,
|
|
18
|
+
so it is replay-tested in isolation.
|
|
19
|
+
* `collect_lane_history(...)` is the thin parsing wrapper: it takes `git log`
  subject lines for recent dispatch/dispatch-loop archive commits and parses
  each into a `RunRecord` (the `git log` call itself lives in
  `_git_log_subjects`). The caller (the loop's Step 0, or `dos health` CLI)
  composes the two; `check(...)` is the one-call form of that composition.
|
|
23
|
+
|
|
24
|
+
The gate is **advisory-but-actionable**: it never blocks acquisition itself
|
|
25
|
+
(that is the arbiter's job); it returns a *route* the loop acts on. A
|
|
26
|
+
`route_unstick` means "this lane has been failing the same way — run /unstick
|
|
27
|
+
first"; a `route_replan` means "this lane is soak/data-gated — /replan, not
|
|
28
|
+
/unstick"; `proceed` means "nothing in the history says don't start."
|
|
29
|
+
"""
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import argparse
|
|
33
|
+
import json
|
|
34
|
+
import re
|
|
35
|
+
import subprocess
|
|
36
|
+
import sys
|
|
37
|
+
from dataclasses import dataclass, field
|
|
38
|
+
from enum import Enum
|
|
39
|
+
|
|
40
|
+
from dos.lane_overlap import overlap_verdict
|
|
41
|
+
|
|
42
|
+
# How many recent dispatch archive commits to scan for the lane. 12 covers
# ~a day of an active fleet without walking deep history; tune via the CLI arg
# (`--window`). This is the cap on *matched* records fed to the decision, not
# the number of commits `git log` walks.
DEFAULT_HISTORY_WINDOW = 12

# A lane is "recurring-blocked" when at least this many of its recent runs are
# non-shipping failures on the SAME cause key. 3 matches the recurrence floor
# the post-hoc recurring-wedge router uses, so the pre-gate and the post-gate
# agree on what "recurring" means. Used as the default for `lane_health`'s
# `recurring_threshold` parameter.
RECURRING_THRESHOLD = 3
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HealthAction(str, Enum):
    """What the loop should do with this lane, right now.

    Members subclass `str`, so a member compares equal to its string value and
    `.value` can be written straight into JSON output.
    """
    PROCEED = "proceed"              # history is clean (or shipping) — launch
    ROUTE_UNSTICK = "route_unstick"  # recurring structural blocker — /unstick first
    ROUTE_REPLAN = "route_replan"    # soak/data-gated — /replan, not /unstick
    OVERLAP_BLOCK = "overlap_block"  # a live lease's tree collides — pick elsewhere
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Verdict tokens a dispatch/dispatch-loop archive commit can carry. Kept in
# sync with dos.verdicts; duplicated as a frozenset here only for cheap parsing
# (the gate must not depend on the full verdict module to scan a log line).
# A token that is in none of the three sets is neither shipping, blocker nor
# drain as far as `RunRecord` is concerned.
_SHIPPING = frozenset({"LIVE", "SHIPPED", "SHIPPED-CLEAN"})
_NONSHIP_BLOCKER = frozenset({"ERROR", "WEDGE", "BLOCKED", "BLOCKED-OUTCOME", "STALLED"})
_DRAIN = frozenset({"DRAIN"})

# Causes that route to /replan rather than /unstick (soak/data-gated — no
# structural defect for /unstick to fix). Substring match against the cause
# text the archive commit carries. Because matching is by substring,
# "soak-gated" and "drained" are already covered by "soak" and "drain"; they
# are listed for readability only.
_REPLAN_CAUSE_CUES = (
    "soak", "soak-gated", "data-gated", "data gated", "awaiting a live run",
    "drain", "drained",
)

# Captures the upper-case token (letters and hyphens) that follows `verdict=`.
_VERDICT_RE = re.compile(r"verdict=([A-Z][A-Z-]*)")
# A recurrence-count phrase the dispatch archives carry verbatim, e.g.
# "6th-consecutive", "8th recurrence", "5th consecutive". The ordinal is a
# strong recurring signal even within a short window. Group 1 is the number.
# The pattern is case-sensitive (lower-case suffix and keyword only).
_RECURRENCE_RE = re.compile(r"(\d+)(?:st|nd|rd|th)[ -](?:consecutive|recurrence)")
|
|
81
|
+
|
|
82
|
+
# A /dispatch-loop STOP archive halts the loop and (almost always) hands the
# operator/next-sweep a `/unstick` directive — e.g.
#   "… 0 picks; STOP recurring BLOCKED (APPLY_LANE_BLOCKED_MESH) → /unstick"
#   "… STOP override on recurring APPLY_LANE_POST_UNSTICK_STOP_RESPAWN … → /unstick"
# The STOP is an *operator-visible directive*: it says "do not re-iterate this
# lane until something lands." A loop that respawns the same lane anyway —
# before any operator action or structural commit clears the directive — is the
# POST-STOP-respawn doom-loop (`APPLY_LANE_POST_UNSTICK_STOP_RESPAWN`, logged 9×
# in 24h across the apply/tailor/CD lanes; cost-anchor ~$43 per /unstick cycle).
# `_STOP_RE` detects the STOP token; `_UNSTICK_ROUTE_RE` confirms it routed to
# /unstick (vs a self-healing /replan stamp-drift STOP, which the existing
# recurring-blocker rule already handles).
# Both patterns are case-sensitive: `_STOP_RE` needs the upper-case word STOP
# on word boundaries, `_UNSTICK_ROUTE_RE` needs the literal "/unstick" followed
# by a word boundary.
_STOP_RE = re.compile(r"\bSTOP\b")
_UNSTICK_ROUTE_RE = re.compile(r"/unstick\b")

# The cause_key the POST-STOP-respawn rule emits — kept verbatim equal to the
# reason_class the hand-rolled PRE-SCREEN WEDGE path has been writing into
# archive subjects, so /unstick clusters the pre-gate STOP and the in-the-wild
# respawns under one cause. (Lower-cased to match `BlockedReason` value style.)
POST_STOP_RESPAWN_CAUSE_KEY = "post_stop_respawn_no_operator_action"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass(frozen=True)
class RunRecord:
    """Immutable parsed view of one dispatch/dispatch-loop archive commit subject.

    The boolean properties classify the record either by its verdict token
    (shipping / blocker / drain) or by markers found in the full subject
    (STOP, /unstick routing, operator action).
    """

    run_ts: str               # archive timestamp token, "" when none was found
    verdict: str              # normalized token (ERROR/WEDGE/DRAIN/SHIPPED/…), "" if none
    cause: str                # free-text cause tail of the commit subject
    recurrence_ordinal: int   # parsed "Nth-consecutive/recurrence", 0 if absent
    subject: str              # the full commit subject (for evidence)

    @property
    def is_shipping(self) -> bool:
        """True when the verdict token is one of the shipping tokens."""
        token = self.verdict
        return token in _SHIPPING

    @property
    def is_blocker(self) -> bool:
        """True when the verdict token is a non-shipping blocker token."""
        token = self.verdict
        return token in _NONSHIP_BLOCKER

    @property
    def is_drain(self) -> bool:
        """True when the verdict token is a drain token."""
        token = self.verdict
        return token in _DRAIN

    @property
    def is_stop(self) -> bool:
        """True when the subject carries the STOP token (the loop halted itself).

        The whole subject is searched rather than the cause tail, because on
        loop archives the STOP token sits before the em-dash.
        """
        return _STOP_RE.search(self.subject) is not None

    @property
    def is_stop_with_unstick(self) -> bool:
        """True for a STOP whose subject also routes to /unstick.

        This is the operator-visible "do not re-iterate this lane until
        something lands" directive that the POST-STOP-respawn guard keys on.
        A STOP that routed only to /replan is a stamp-drift halt covered by the
        recurring-blocker rule, so it is intentionally not matched here.
        """
        if not self.is_stop:
            return False
        return _UNSTICK_ROUTE_RE.search(self.subject) is not None

    @property
    def is_operator_action(self) -> bool:
        """True when the subject carries an explicit `operator-action:` token.

        The match is case-insensitive. A record like this that is newer than a
        STOP means the directive was answered and the lane may respawn.
        """
        lowered_subject = self.subject.lower()
        return "operator-action:" in lowered_subject
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass(frozen=True)
class HealthVerdict:
    """Immutable result of the pre-dispatch health gate for one lane.

    Only `action` and `reason` are required; the remaining fields default to
    empty values and are filled in by whichever rule produced the verdict.
    """
    action: HealthAction       # the route the loop should take
    reason: str                # human-readable explanation of the decision
    cause_key: str = ""        # the recurring cause, when action==ROUTE_*
    runs_considered: int = 0   # number of history records the decision looked at
    blocker_runs: int = 0      # how many of those were same-cause blockers
    overlap_lane: str = ""     # the colliding live lease's lane, when OVERLAP_BLOCK
    evidence: tuple[str, ...] = field(default_factory=tuple)  # short evidence strings

    @property
    def should_proceed(self) -> bool:
        """True only when the action is PROCEED."""
        return self.action == HealthAction.PROCEED
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _normalize_cause(cause: str) -> str:
    """Reduce a free-text cause tail to a stable key for same-cause counting.

    The text is lower-cased, recurrence phrases and all digits are removed, and
    every character outside `a-z . _ / space -` becomes a space. A short list
    of known defects is then checked in order (most specific first) and mapped
    to a canonical key, so differently worded subjects about the same defect
    count together. Anything else falls back to the first eight tokens longer
    than two characters joined with underscores, or "uncategorized" when no
    such token exists.
    """
    text = _RECURRENCE_RE.sub("", cause.lower())
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^a-z._/ -]", " ", text)

    def mentions(*needles: str) -> bool:
        # substring test against the cleaned text
        return any(needle in text for needle in needles)

    # canonical synonyms — order matters: most-specific defect first
    if mentions("sidecar", ".prompts", "prompts.json"):
        return "renderer_sidecar_drop"
    if mentions("ship-oracle", "ship_oracle", "false-positive"):
        return "ship_oracle_false_positive"
    if mentions("stale") and mentions("claim"):
        return "stale_claim_false_block"
    if mentions("soak", "data-gated", "data gated"):
        return "lane_soak_or_data_gated"
    if mentions("overlap", "collision"):
        return "lane_overlap_collision"

    words = [word for word in re.split(r"[ /._-]+", text) if len(word) > 2]
    if not words:
        return "uncategorized"
    return "_".join(words[:8])
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _cause_routes_replan(cause: str) -> bool:
    """Return True when the cause text contains any /replan cue (case-insensitive)."""
    lowered = cause.lower()
    for cue in _REPLAN_CAUSE_CUES:
        if cue in lowered:
            return True
    return False
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def lane_health(
    lane: str,
    *,
    lane_tree: list[str],
    live_leases: list[dict],
    history: list[RunRecord],
    own_lease_ts: str = "",
    recurring_threshold: int = RECURRING_THRESHOLD,
) -> HealthVerdict:
    """Pure pre-dispatch health decision for `lane`.

    Args:
      lane            — the lane about to be dispatched.
      lane_tree       — that lane's file-glob tree (for the overlap check).
      live_leases     — dicts with at least {lane, lane_kind, tree, loop_ts};
                        the loop's OWN lease (own_lease_ts) is excluded.
      history         — recent RunRecords for this lane, newest first (from
                        `collect_lane_history`).
      own_lease_ts    — this loop's own lease ts, so its own lease never
                        self-blocks the overlap check.
      recurring_threshold — same-cause blocker count that trips ROUTE_*.

    Returns:
      A `HealthVerdict`; it is never None and the function performs no I/O.

    Decision order (first match wins):
      1. OVERLAP_BLOCK — a *foreign* live lease's tree collides with lane_tree
         (via `overlap_verdict`). Highest priority: starting into a real
         overlap guarantees a mutual wedge.
      2. ROUTE_UNSTICK (post-STOP respawn) — walking newest-first, a
         STOP→/unstick record is reached before any shipping run or explicit
         operator action. Trips on the FIRST such respawn (not the threshold)
         because one ignored STOP is already the doom-loop.
      3. ROUTE_UNSTICK / ROUTE_REPLAN — the dominant same-cause non-shipping
         blocker is at/over the threshold and no shipping run is newer than
         the newest blocker. Route by cause: soak/data-gated → /replan;
         structural → /unstick.
      4. PROCEED — anything else (a shipping run in the window, a clean drain,
         mixed causes below threshold, or no history at all).
    """
    # Each helper returns a verdict when its rule fires, else None.
    verdict = _overlap_block(lane, lane_tree, live_leases, own_lease_ts, len(history))
    if verdict is None:
        verdict = _post_stop_respawn(lane, history)
    if verdict is None:
        verdict = _recurring_blocker(lane, history, recurring_threshold)
    if verdict is not None:
        return verdict

    # 4. nothing says don't start
    if history:
        detail = f"{len(history)} recent run(s), no recurring same-cause blocker"
    else:
        detail = "no recent dispatch history"
    return HealthVerdict(
        action=HealthAction.PROCEED,
        reason=f"lane {lane!r} health OK — " + detail,
        runs_considered=len(history),
    )


def _overlap_block(
    lane: str,
    lane_tree: list[str],
    live_leases: list[dict],
    own_lease_ts: str,
    runs_considered: int,
) -> HealthVerdict | None:
    """Rule 1: OVERLAP_BLOCK for the first foreign lease whose tree collides, else None."""
    for lease in live_leases:
        lease_ts = str(lease.get("loop_ts", ""))
        if own_lease_ts and lease_ts == own_lease_ts:
            continue  # this loop's own lease never self-blocks
        lease_lane = str(lease.get("lane", ""))
        if lease_lane == lane:
            continue  # same-lane is the arbiter's concern, not an overlap signal
        lease_tree = list(lease.get("tree", []) or [])
        if not lease_tree or not lane_tree:
            continue  # unknown blast radius handled by the arbiter
        ov = overlap_verdict(list(lane_tree), lease_tree)
        if not ov.admissible:
            return HealthVerdict(
                action=HealthAction.OVERLAP_BLOCK,
                reason=(f"lane {lane!r} tree collides with live lease "
                        f"{lease_lane!r} (loop {lease_ts}): {ov.reason}"),
                overlap_lane=lease_lane,
                runs_considered=runs_considered,
                evidence=(f"overlap:{lease_lane}:{ov.verdict.value}",),
            )
    return None


def _post_stop_respawn(lane: str, history: list[RunRecord]) -> HealthVerdict | None:
    """Rule 2: ROUTE_UNSTICK when an unanswered STOP→/unstick is the newest signal.

    Walks `history` newest-first. A shipping run or an explicit operator action
    reached first means the directive was answered, so the rule does not fire.
    """
    for rec in history:  # newest-first
        if rec.is_shipping or rec.is_operator_action:
            return None  # the STOP (if any) was cleared — not a doom-loop respawn
        if rec.is_stop_with_unstick:
            return HealthVerdict(
                action=HealthAction.ROUTE_UNSTICK,
                reason=(
                    f"lane {lane!r} was STOPped with a /unstick directive at "
                    f"{rec.run_ts or 'a recent archive'} and no shipping run or "
                    f"operator action has landed since — respawning re-enters an "
                    f"unanswered STOP. Route /unstick (or take an operator action) "
                    f"before launching a child. STOP subject: {rec.subject[:140]}"
                ),
                cause_key=POST_STOP_RESPAWN_CAUSE_KEY,
                runs_considered=len(history),
                blocker_runs=1,
                evidence=(f"stop:{rec.run_ts}:/unstick",),
            )
    return None


def _recurring_blocker(
    lane: str,
    history: list[RunRecord],
    recurring_threshold: int,
) -> HealthVerdict | None:
    """Rule 3: ROUTE_* when one cause dominates the blocker runs at/over the threshold."""
    # group blocker runs by normalized cause key
    by_cause: dict[str, list[RunRecord]] = {}
    for rec in history:
        if rec.is_blocker:
            by_cause.setdefault(_normalize_cause(rec.cause), []).append(rec)
    if not by_cause:
        return None

    # dominant cause = the one with the most blocker runs (first seen wins ties)
    cause_key, recs = max(by_cause.items(), key=lambda kv: len(kv[1]))
    # an explicit "Nth-consecutive" ordinal in the window is itself a
    # recurrence signal even if the window only captured a few of them
    max_ordinal = max((r.recurrence_ordinal for r in recs), default=0)
    tripped = len(recs) >= recurring_threshold or max_ordinal >= recurring_threshold

    # a shipping run more recent than every blocker means the lane
    # recovered — do NOT route (the blocker is stale history)
    newest_ship = next((i for i, r in enumerate(history) if r.is_shipping), None)
    newest_blocker = next((i for i, r in enumerate(history) if r.is_blocker), None)
    recovered = (
        newest_ship is not None
        and newest_blocker is not None
        and newest_ship < newest_blocker  # ship is newer (lower index)
    )
    if not tripped or recovered:
        return None

    sample_cause = recs[0].cause.strip()
    if _cause_routes_replan(sample_cause):
        action, route = HealthAction.ROUTE_REPLAN, "replan"
    else:
        action, route = HealthAction.ROUTE_UNSTICK, "unstick"
    n = max(len(recs), max_ordinal)
    return HealthVerdict(
        action=action,
        reason=(f"lane {lane!r} has {n} recent dispatch run(s) "
                f"blocked on the same cause "
                f"({cause_key}) — route to /{route} before "
                f"spending another child launch. Sample: "
                f"{sample_cause[:120]}"),
        cause_key=cause_key,
        runs_considered=len(history),
        blocker_runs=len(recs),
        evidence=tuple(f"{r.run_ts}:{r.verdict}" for r in recs[:5]),
    )
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# ── I/O wrapper: parse recent dispatch archive commits into RunRecords ───────
|
|
344
|
+
|
|
345
|
+
def parse_archive_subject(subject: str, lane: str) -> RunRecord | None:
    """Turn one commit subject into a RunRecord, or None if it does not qualify.

    A subject qualifies when it contains both "archive" and "docs/dispatch"
    (which also covers "docs/dispatch-loop"). Typical shapes:
      `docs/dispatch: archive <ts> — <tag> → verdict=ERROR, child2 …`
      `docs/dispatch-loop: archive <ts> — N iters …, 0 picks shipped (<LANE> lane; … verdict=ERROR …)`

    Lane filter: with a non-empty `lane`, the lower-cased subject must contain
    at least one of these lower-cased markers, otherwise None is returned:
      - "<lane> lane"      the dispatch-loop archive convention
      - "scope <lane>"     a --scope <lane> hand-run / inherited child
      - "(<lane>;"         the parenthetical lane tag on some loop archives
      - "<lane>_lane_"     the reason_class convention (APPLY_LANE_BLOCKED_MESH,
                           …); STOP archives frequently name the lane only here
    With an empty `lane` no lane filtering is applied ("all lanes").

    Extracted fields: the `archive <ts>` timestamp ("" if absent), the
    `verdict=` token ("" if absent), the cause (text after the verdict token,
    else text after the first em-dash, else the whole subject), and the first
    recurrence ordinal (0 if absent).
    """
    # "docs/dispatch-loop" contains "docs/dispatch", so one membership test
    # covers both archive kinds.
    if "archive" not in subject or "docs/dispatch" not in subject:
        return None

    if lane:
        needle = lane.lower()
        haystack = subject.lower()
        lane_markers = (
            f"{needle} lane",
            f"scope {needle}",
            f"({needle};",
            f"{needle}_lane_",
        )
        if not any(marker in haystack for marker in lane_markers):
            return None

    ts_match = re.search(r"archive\s+(\d{8}T\d{6}Z|\d{8}T\d{4}Z)", subject)
    verdict_match = _VERDICT_RE.search(subject)
    if verdict_match:
        verdict = verdict_match.group(1)
        # cause = the tail after the verdict token, minus leading separators
        cause = subject[verdict_match.end():].lstrip(" ,—-")
    else:
        verdict = ""
        # no verdict token: fall back to the text after the first em-dash
        cause = subject.split("—", 1)[1].strip() if "—" in subject else subject

    ordinal_match = _RECURRENCE_RE.search(subject)
    return RunRecord(
        run_ts=ts_match.group(1) if ts_match else "",
        verdict=verdict,
        cause=cause,
        recurrence_ordinal=int(ordinal_match.group(1)) if ordinal_match else 0,
        subject=subject,
    )
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def collect_lane_history(
    lane: str,
    *,
    git_log_lines: list[str],
    window: int = DEFAULT_HISTORY_WINDOW,
) -> list[RunRecord]:
    """Parse `git log --oneline` output into recent RunRecords for `lane`.

    Args:
      lane          — lane to filter on; pass "" to collect across ALL lanes
                      (subjects that do not name a lane are then included).
      git_log_lines — raw `git log --oneline` lines, newest first, each a
                      short sha followed by a space and the commit subject.
      window        — maximum number of matched records to return.

    Returns:
      At most `window` records in the input (newest-first) order. A
      non-positive `window` yields an empty list.
    """
    # The cap is checked after each append, so without this guard a window
    # of 0 (or less) would still let one record through.
    if window <= 0:
        return []

    records: list[RunRecord] = []
    for raw in git_log_lines:
        line = raw.strip()
        if not line:
            continue
        # drop the leading short-sha from `--oneline`; a line with no space
        # is passed through whole
        _sha, sep, rest = line.partition(" ")
        subject = rest if sep else line
        rec = parse_archive_subject(subject, lane)
        if rec is None:
            continue
        records.append(rec)
        if len(records) >= window:
            break
    return records
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
# ── CLI layer (the I/O composition: git log + leases → health JSON) ──────────
|
|
430
|
+
|
|
431
|
+
# Archive commits live under these dirs; the git-log pathspec scopes the scan.
# Passed after `--` on the `git log` command line in `_git_log_subjects`.
_ARCHIVE_PATHSPEC = ("docs/_dispatch_loops/", "docs/_chained_runs/")
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _git_log_subjects(scan_depth: int) -> list[str]:
    """Run `git log --oneline -<scan_depth> -- <archive dirs>` and return its lines.

    Best-effort: if the process cannot be started, times out (20 s) or raises
    another subprocess error, or if git exits non-zero, the result is [] so the
    gate degrades to "no history → proceed" instead of crashing the caller.
    Blank lines are dropped; each returned line still starts with the short sha.
    """
    command = ["git", "log", "--oneline", f"-{scan_depth}", "--"]
    command.extend(_ARCHIVE_PATHSPEC)
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",  # never fail on undecodable bytes in a subject
            timeout=20,
        )
    except (OSError, subprocess.SubprocessError):
        return []
    if completed.returncode:
        return []
    return [line for line in completed.stdout.splitlines() if line.strip()]
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def check(
    lane: str,
    *,
    lane_tree: list[str],
    live_leases: list[dict],
    own_lease_ts: str = "",
    window: int = DEFAULT_HISTORY_WINDOW,
    scan_depth: int | None = None,
    git_log_lines: list[str] | None = None,
) -> HealthVerdict:
    """Gather the lane's recent history and run `lane_health` on it in one call.

    When `git_log_lines` is None the history is read via `git log`; supplying
    the lines directly (e.g. in tests) skips git entirely.

    `scan_depth` is how many commits to walk. When it is None or 0 it defaults
    to the larger of 40 and 4× `window`, so a lane-filtered scan still has a
    chance to find `window` matches. `window` caps how many matched records
    feed the decision.
    """
    lines = git_log_lines
    if lines is None:
        depth = scan_depth if scan_depth else max(40, window * 4)
        lines = _git_log_subjects(depth)
    recent_runs = collect_lane_history(lane, git_log_lines=lines, window=window)
    return lane_health(
        lane,
        lane_tree=lane_tree,
        live_leases=live_leases,
        history=recent_runs,
        own_lease_ts=own_lease_ts,
    )
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def verdict_to_dict(v: HealthVerdict) -> dict:
    """Flatten a HealthVerdict into a JSON-serializable dict.

    The action is emitted as its string value, the evidence tuple as a list,
    and the derived `should_proceed` flag is included alongside the fields.
    """
    payload = dict(
        action=v.action.value,
        should_proceed=v.should_proceed,
        reason=v.reason,
        cause_key=v.cause_key,
        runs_considered=v.runs_considered,
        blocker_runs=v.blocker_runs,
        overlap_lane=v.overlap_lane,
        evidence=list(v.evidence),
    )
    return payload
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def cmd_check(args: argparse.Namespace) -> int:
    """`dos health --lane TM --tree '...' --leases-json '...'` → health JSON.

    Leases + the lane tree are passed IN (the live-lease registry and the
    lane→tree resolver are host-app concerns — the job side supplies them); the
    history is gathered here via git log, or read from `--git-log-file` when
    that is given. The verdict is printed to stdout as indented, key-sorted
    JSON.

    Lease input is best-effort: unparseable JSON, a top-level value that is not
    an array, or array entries that are not objects are dropped (with a note
    on stderr) rather than crashing the gate.

    Returns:
      An exit code mirroring the action so a shell caller can branch without
      parsing JSON: 0 PROCEED, 3 ROUTE_UNSTICK, 4 ROUTE_REPLAN, 6 OVERLAP_BLOCK.
    """
    # Strip each glob so "a/**, b/**" does not produce a pattern with a
    # leading space.
    lane_tree = [part.strip() for part in (args.tree or "").split(",") if part.strip()]

    live_leases: list[dict] = []
    if args.leases_json:
        try:
            parsed = json.loads(args.leases_json)
        except (ValueError, TypeError):
            parsed = None
        if not isinstance(parsed, list):
            # Invalid JSON, or valid JSON of the wrong shape (object, null, …):
            # `lane_health` iterates leases and calls `.get` on each, so
            # anything but a list of objects would crash it.
            print("dos-health: --leases-json is not a JSON array; "
                  "treating as no live leases", file=sys.stderr)
        else:
            live_leases = [lease for lease in parsed if isinstance(lease, dict)]
            if len(live_leases) != len(parsed):
                print("dos-health: ignoring non-object entries in --leases-json",
                      file=sys.stderr)

    git_lines = None
    if args.git_log_file:
        with open(args.git_log_file, encoding="utf-8") as fh:
            git_lines = [ln for ln in fh.read().splitlines() if ln.strip()]

    v = check(
        args.lane, lane_tree=lane_tree, live_leases=live_leases,
        own_lease_ts=args.own_lease_ts or "", window=args.window,
        git_log_lines=git_lines,
    )
    print(json.dumps(verdict_to_dict(v), indent=2, sort_keys=True))
    return {
        HealthAction.PROCEED: 0,
        HealthAction.ROUTE_UNSTICK: 3,
        HealthAction.ROUTE_REPLAN: 4,
        HealthAction.OVERLAP_BLOCK: 6,
    }[v.action]
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the `dos-health` argument parser.

    `--lane` is the only required option; every other option has a default
    (empty string, or DEFAULT_HISTORY_WINDOW for `--window`).
    """
    parser = argparse.ArgumentParser(
        prog="dos-health",
        description="Pre-dispatch lane-health gate — query a lane's startability.",
    )
    # (flag, add_argument keyword options), registered in this order
    option_specs = (
        ("--lane", {
            "required": True,
            "help": "the lane about to be dispatched",
        }),
        ("--tree", {
            "default": "",
            "help": "comma-separated file-glob tree for the lane (overlap check)",
        }),
        ("--leases-json", {
            "default": "",
            "help": "JSON array of live leases [{lane,lane_kind,tree,loop_ts}]",
        }),
        ("--own-lease-ts", {
            "default": "",
            "help": "this loop's own lease ts (never self-blocks)",
        }),
        ("--window", {
            "type": int,
            "default": DEFAULT_HISTORY_WINDOW,
            "help": f"matched records to consider (default {DEFAULT_HISTORY_WINDOW})",
        }),
        ("--git-log-file", {
            "default": "",
            "help": "read git-log subjects from a file instead of running git (testing)",
        }),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse `argv` (or the process arguments when None) and return cmd_check's exit code."""
    parser = build_parser()
    namespace = parser.parse_args(argv)
    return cmd_check(namespace)
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
# Script entry point: exit the process with the code `main()` returns.
if __name__ == "__main__":  # pragma: no cover
    sys.exit(main())
|