dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/sibling_scan.py
ADDED
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
"""sibling_scan — the pure "is another run going to collide with me?" verdicts.
|
|
2
|
+
|
|
3
|
+
A host's dispatch loop spawns headless children and runs alongside sibling
|
|
4
|
+
loops. Three concurrency questions arise, all **domain-free pure verdicts** over
|
|
5
|
+
caller-gathered evidence — the `gate_classify.classify_packet` shape: facts in,
|
|
6
|
+
a typed verdict out, no I/O, no clock read (the clock is injected):
|
|
7
|
+
|
|
8
|
+
1. ORPHAN SWEEP (`scan_for_orphan`) — after an iteration, is any child run-dir
|
|
9
|
+
a *live process nobody is waiting on* (a parent turn ended mid-flight)? A
|
|
10
|
+
run-dir is an ORPHAN iff it has no terminal marker, its log is still
|
|
11
|
+
growing, it is not the current iteration's own child, and it is owned by
|
|
12
|
+
this loop. A live markerless child this loop did NOT spawn is FOREIGN_LIVE
|
|
13
|
+
(record, never adopt — adopting would corrupt that invocation's handoff).
|
|
14
|
+
|
|
15
|
+
2. FOREIGN COLLISION (`classify_foreign_collision`) — given the FOREIGN_LIVE
|
|
16
|
+
children, would this loop's next iteration actually *collide* with one? Only
|
|
17
|
+
if a foreign child's lane tree overlaps this loop's lane (or is unknown /
|
|
18
|
+
exclusive). Disjoint lanes run concurrently — the intended fan-out.
|
|
19
|
+
|
|
20
|
+
3. SIBLING SCAN (`classify_sibling_scan`) — at startup, after taking a lease,
|
|
21
|
+
is there an un-leased *live* sibling loop the arbiter cannot see (a bare
|
|
22
|
+
`/dispatch`, a manual run)? If so: clear (disjoint), reroute (bare loop →
|
|
23
|
+
free lane), or stop (exclusive sibling, or an explicit-scope loop that must
|
|
24
|
+
not be silently moved).
|
|
25
|
+
|
|
26
|
+
THE BOUNDARY — what is kernel vs host (so "kernel imports no host" holds):
|
|
27
|
+
|
|
28
|
+
* KERNEL (here): the three verdict ladders + the disjointness escape (via the
|
|
29
|
+
sibling-kernel `dos._tree.lane_trees_disjoint`, the same arrow `arbiter` and
|
|
30
|
+
`lane_overlap` use). Evidence is FROZEN DATA: a `RunDirState` carries a
|
|
31
|
+
precomputed `has_terminal_marker` BOOL — the host computes it from ITS stamp
|
|
32
|
+
grammar (the `Saved:` / `docs/fanout:` / `docs/dispatch: archive` markers) at
|
|
33
|
+
the boundary, so the kernel never holds a host marker literal. The
|
|
34
|
+
free-lane pool and the lane→tree lookups are caller-supplied.
|
|
35
|
+
* HOST (the caller): the dir-globbing (`docs/_chained_runs/` etc.), the
|
|
36
|
+
log-tail/mtime reads, the terminal-marker grammar that computes
|
|
37
|
+
`has_terminal_marker`, the auto-pick cluster pool, and the lane→tree map.
|
|
38
|
+
|
|
39
|
+
⚓ Evidence-over-narrative: every verdict is derived from filesystem artefacts
|
|
40
|
+
(a marker bool, a log mtime, a lane tree) the caller gathered — never from a
|
|
41
|
+
`result`-envelope prose read.
|
|
42
|
+
"""
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import enum
|
|
46
|
+
from dataclasses import dataclass, field
|
|
47
|
+
from typing import Any, Callable, Optional
|
|
48
|
+
|
|
49
|
+
from dos._tree import lane_trees_disjoint, tree_disjoint_from_all_live
|
|
50
|
+
|
|
51
|
+
# Orphan-sweep staleness window, in SECONDS. `classify_run_dir` treats a
# markerless child whose log has been idle longer than this as DEAD (the
# process is presumed gone) rather than as a live orphan.
DEFAULT_STALENESS_S: float = 90.0

# Un-leased-sibling liveness window, in MINUTES. `live_siblings_subset` drops a
# sibling whose newest log write is older than this: it is too quiet to be a
# live collision.
DEFAULT_SIBLING_LIVENESS_WINDOW_MIN: float = 30.0
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def iter_index(iter_dir_name: str) -> int:
    """Sort key for an ``iter-<n>`` dir name (MQ3X P2 lift).

    The key is the integer after the final ``-``. A tail that is not a decimal
    number sorts LAST (``10**9``, not 0 — 0 would silently bucket a malformed
    dir as iteration 0 and could mask the real highest iteration).

    Generic dir-name integer parse: no host marker grammar, no I/O — the one
    sibling-dir classifier that is genuinely kernel-pure (the README *text*
    classifiers hold the host's verdict-stamp grammar and stay host-side, per
    this module's boundary §).

    Args:
        iter_dir_name: a directory name such as ``"iter-12"``.

    Returns:
        The parsed iteration number, or ``10**9`` when the tail is empty or
        not a decimal number.
    """
    tail = iter_dir_name.split("-")[-1]
    # `str.isdigit()` is also True for characters such as superscript digits
    # ("²"), which `int()` rejects with ValueError. `isdecimal()` only accepts
    # decimal-digit characters, which `int()` can parse, so a malformed tail
    # falls through to the sort-last sentinel instead of raising.
    return int(tail) if tail.isdecimal() else 10**9
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ===========================================================================
|
|
72
|
+
# 1. Orphan sweep
|
|
73
|
+
# ===========================================================================
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class OrphanStatus(str, enum.Enum):
    """Per-run-dir verdict token produced by the orphan sweep.

    The enum mixes in `str`, so each member compares equal to its token and
    can be written straight into a tally row or log line — the same idiom the
    `gate_classify` verdicts use.
    """

    ORPHAN = "ORPHAN"
    FOREIGN_LIVE = "FOREIGN_LIVE"
    TERMINAL = "TERMINAL"
    DEAD = "DEAD"
    CLEAN = "CLEAN"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than `OrphanStatus.X`.
        return str(self.value)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass(frozen=True)
class RunDirState:
    """Frozen facts about one child run-dir, gathered by the caller.

    All I/O happens on the caller's side: it globs the run-dirs, reads each
    log's tail and mtime, and — crucially — derives `has_terminal_marker` from
    ITS OWN stamp grammar. The scan that consumes this snapshot touches no
    filesystem and knows no host marker literal.

    Fields:
      ts                   — the run-dir's UTC timestamp id (used in the
                             operator-facing string).
      has_terminal_marker  — True iff the child reached a closeout, as judged
                             by the host against its terminal markers.
      log_mtime_epoch      — last-modified time of the child log, in epoch
                             seconds.
      is_current_iteration — True iff this is the iteration the loop is
                             legitimately mid-wait on (never an orphan).
      log_present          — False iff the run-dir has no child log yet
                             (classified CLEAN).
    """

    ts: str
    has_terminal_marker: bool = False
    log_mtime_epoch: float = 0.0
    is_current_iteration: bool = False
    log_present: bool = True
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass(frozen=True)
class OrphanScanResult:
    """One run-dir's verdict together with the evidence string behind it.

    Fields:
      ts     — the run-dir's timestamp id.
      status — the `OrphanStatus` verdict for that run-dir.
      reason — operator-facing explanation of why the verdict was reached.
    """

    ts: str
    status: OrphanStatus
    reason: str

    @property
    def needs_adoption(self) -> bool:
        """Whether the loop must adopt this run-dir's still-live child.

        Only an ORPHAN verdict calls for adoption; every other status does not.
        """
        is_orphan = self.status is OrphanStatus.ORPHAN
        return is_orphan
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass(frozen=True)
class OrphanSweepResult:
    """Aggregate outcome of scanning every run-dir the loop handed in.

    Fields:
      orphans      — the results whose status is ORPHAN (to be adopted).
      foreign_live — the results whose status is FOREIGN_LIVE (record only).
      all          — every per-run-dir result, in scan order.
    """

    orphans: list[OrphanScanResult] = field(default_factory=list)
    foreign_live: list[OrphanScanResult] = field(default_factory=list)
    all: list[OrphanScanResult] = field(default_factory=list)

    @property
    def has_orphan(self) -> bool:
        """True iff at least one scanned run-dir was classified ORPHAN."""
        return len(self.orphans) > 0

    @property
    def has_foreign_live(self) -> bool:
        """True iff at least one scanned run-dir was classified FOREIGN_LIVE."""
        return len(self.foreign_live) > 0

    @property
    def summary(self) -> str:
        """One operator-facing line for the iteration's tally row.

        Shape: ``"<n> <STATUS>, ... — <tags>"`` where the per-status counts are
        listed in status-name order and the tags name the run-dirs to adopt
        and/or the foreign ones; ``clean`` replaces the tags when there are
        neither.
        """
        if not self.all:
            return "no child run-dirs to scan — clean"

        # Tally results per status token.
        tally: dict[str, int] = {}
        for result in self.all:
            token = result.status.value
            tally[token] = tally.get(token, 0) + 1
        breakdown = ", ".join(
            f"{count} {token}" for token, count in sorted(tally.items())
        )

        notes: list[str] = []
        if self.orphans:
            adopt_ids = ", ".join(o.ts for o in self.orphans)
            notes.append(f"ADOPT: {adopt_ids}")
        if self.foreign_live:
            foreign_ids = ", ".join(f.ts for f in self.foreign_live)
            notes.append(f"FOREIGN: {foreign_ids}")

        tail = "; ".join(notes) if notes else "clean"
        return f"{breakdown} — " + tail
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _coerce_run_dir(obj: Any) -> RunDirState:
    """Accept a RunDirState or a plain dict (the JSON / fixture shape).

    `ts` is the only required key. Every other key falls back to the
    `RunDirState` field default (no terminal marker, mtime 0.0, not the current
    iteration, log present), so a partial dict never reads as a false ORPHAN:
    a missing mtime is epoch 0, i.e. a log that has been idle for a very long
    time. A `log_mtime_epoch` of ``None`` (JSON ``null``) is treated the same
    as a missing key rather than crashing in `float()`.

    The legacy `log_tail` + marker-list shape is NOT supported here: the host
    computes `has_terminal_marker` at the boundary (the seam change).

    Args:
        obj: a `RunDirState` (returned unchanged) or a dict of its fields.

    Returns:
        The equivalent frozen `RunDirState`.

    Raises:
        TypeError: `obj` is neither a `RunDirState` nor a dict.
        ValueError: the dict has no (truthy) `ts`.
    """
    if isinstance(obj, RunDirState):
        return obj
    if not isinstance(obj, dict):
        raise TypeError(
            f"run-dir state must be a RunDirState or dict, got {type(obj).__name__}"
        )
    ts = obj.get("ts")
    if not ts:
        raise ValueError(f"run-dir state is missing 'ts': {obj!r}")
    # JSON fixtures may carry an explicit null for "no mtime known"; fold it
    # into the same 0.0 default a missing key gets.
    mtime = obj.get("log_mtime_epoch")
    return RunDirState(
        ts=str(ts),
        has_terminal_marker=bool(obj.get("has_terminal_marker", False)),
        log_mtime_epoch=0.0 if mtime is None else float(mtime),
        is_current_iteration=bool(obj.get("is_current_iteration", False)),
        log_present=bool(obj.get("log_present", True)),
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def classify_run_dir(
    state: Any,
    *,
    now_epoch: float,
    staleness_s: float = DEFAULT_STALENESS_S,
    loop_owned_ts: Optional[frozenset[str] | set[str]] = None,
) -> OrphanScanResult:
    """Turn ONE run-dir snapshot into an `OrphanScanResult`.

    PURE — no filesystem access and no clock read; `now_epoch` is injected by
    the caller (once for the whole sweep). The checks run in a fixed order and
    the first match wins:

      1. CLEAN        — the current iteration's own child, or no log yet.
      2. TERMINAL     — `has_terminal_marker` is set (the host saw a closeout).
      3. DEAD         — no marker AND the log has been idle > `staleness_s`.
      4. FOREIGN_LIVE — no marker AND still growing AND `loop_owned_ts` was
                        supplied AND `ts` is NOT in it (another invocation's
                        live child).
      5. ORPHAN       — no marker AND still growing AND (`loop_owned_ts` is
                        None OR `ts` is in it).

    Args:
        state: a `RunDirState` or the equivalent dict (see `_coerce_run_dir`).
        now_epoch: current time in epoch seconds.
        staleness_s: idle threshold, in seconds, beyond which a markerless
            child counts as DEAD.
        loop_owned_ts: the ts ids this loop spawned. `None` keeps the
            conservative default: every live markerless run-dir is ORPHAN
            (foreign-vs-own is left to the operator).

    Returns:
        The verdict and its operator-facing reason.
    """
    snap = _coerce_run_dir(state)

    # 1. CLEAN — nothing to adopt: either we are mid-wait on it, or it has not
    #    started writing a log.
    if snap.is_current_iteration:
        return OrphanScanResult(
            ts=snap.ts,
            status=OrphanStatus.CLEAN,
            reason="current iteration's own child — legitimately mid-wait",
        )
    if not snap.log_present:
        return OrphanScanResult(
            ts=snap.ts,
            status=OrphanStatus.CLEAN,
            reason="run-dir has no child log yet — child not started",
        )

    # 2. TERMINAL — the host already saw a closeout marker.
    if snap.has_terminal_marker:
        return OrphanScanResult(
            ts=snap.ts,
            status=OrphanStatus.TERMINAL,
            reason="child log carries a closeout marker — child reached its terminal step",
        )

    # 3. DEAD — markerless and quiet for longer than the staleness window.
    idle_s = now_epoch - snap.log_mtime_epoch
    if idle_s > staleness_s:
        return OrphanScanResult(
            ts=snap.ts,
            status=OrphanStatus.DEAD,
            reason=(
                f"no terminal marker and log idle {idle_s:.0f}s (> {staleness_s:.0f}s) "
                "— child died mid-run, treat as a crash"
            ),
        )

    # 4. FOREIGN_LIVE — live, but an ownership set was given and excludes it.
    spawned_elsewhere = loop_owned_ts is not None and snap.ts not in loop_owned_ts
    if spawned_elsewhere:
        return OrphanScanResult(
            ts=snap.ts,
            status=OrphanStatus.FOREIGN_LIVE,
            reason=(
                f"no terminal marker and log written {idle_s:.0f}s ago — a LIVE child "
                "from a different invocation (ts not in this loop's owned set); "
                "record but do NOT adopt (would corrupt that invocation's handoff)"
            ),
        )

    # 5. ORPHAN — live, markerless, and ours (or ownership unknown).
    return OrphanScanResult(
        ts=snap.ts,
        status=OrphanStatus.ORPHAN,
        reason=(
            f"no terminal marker and log written {idle_s:.0f}s ago — a LIVE headless "
            "child nobody is waiting on; adopt it (arm a Monitor, take over handoff)"
        ),
    )
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def scan_for_orphan(
    run_dirs: list[Any],
    *,
    now_epoch: float,
    staleness_s: float = DEFAULT_STALENESS_S,
    loop_owned_ts: Optional[frozenset[str] | set[str]] = None,
) -> OrphanSweepResult:
    """Classify every collected run-dir snapshot and bucket the results.

    PURE — `now_epoch` is required; the caller takes one `time.time()` at the
    boundary and reuses it for the whole sweep. Each snapshot is classified by
    `classify_run_dir` (see it for the per-dir decision order).

    Args:
        run_dirs: `RunDirState` objects or equivalent dicts, in scan order.
        now_epoch: current time in epoch seconds.
        staleness_s: idle threshold forwarded to `classify_run_dir`.
        loop_owned_ts: this loop's owned ts ids, forwarded unchanged.

    Returns:
        An `OrphanSweepResult` holding all verdicts plus the ORPHAN and
        FOREIGN_LIVE subsets. An empty `run_dirs` gives an all-clean sweep.
    """
    verdicts: list[OrphanScanResult] = []
    adoptable: list[OrphanScanResult] = []
    foreign_live: list[OrphanScanResult] = []

    for snapshot in run_dirs:
        verdict = classify_run_dir(
            snapshot,
            now_epoch=now_epoch,
            staleness_s=staleness_s,
            loop_owned_ts=loop_owned_ts,
        )
        verdicts.append(verdict)
        if verdict.status is OrphanStatus.ORPHAN:
            adoptable.append(verdict)
        elif verdict.status is OrphanStatus.FOREIGN_LIVE:
            foreign_live.append(verdict)

    return OrphanSweepResult(
        orphans=adoptable, foreign_live=foreign_live, all=verdicts
    )
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# ===========================================================================
|
|
287
|
+
# 2. Foreign-collision verdict
|
|
288
|
+
# ===========================================================================
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
class ForeignCollisionVerdict(str, enum.Enum):
    """Outcome of checking this loop's next iteration against FOREIGN_LIVE children.

    `str`-valued, so a member compares equal to (and prints as) its token.
    """

    SAFE_CONCURRENT = "SAFE-CONCURRENT"
    COLLISION = "COLLISION"
    NONE = "NONE"

    def __str__(self) -> str:  # pragma: no cover - trivial
        # Render as the bare token rather than `ForeignCollisionVerdict.X`.
        return str(self.value)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
@dataclass(frozen=True)
class ForeignCollisionResult:
    """A foreign-collision verdict plus the offender (if any) to act on.

    Fields:
      verdict        — the `ForeignCollisionVerdict`.
      colliding_lane — lane of the foreign child that drove a COLLISION;
                       empty when there is no offender.
      colliding_ts   — ts id of that foreign child; empty when there is none.
      reason         — operator-facing explanation of the verdict.
    """

    verdict: ForeignCollisionVerdict
    colliding_lane: str = ""
    colliding_ts: str = ""
    reason: str = ""
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def classify_foreign_collision(
    *,
    foreign: list[tuple[str, str]],
    my_tree: list[str],
    lane_tree_lookup: Callable[[str], Optional[list[str]]],
    exclusive_lanes: tuple[str, ...] = ("global",),
) -> ForeignCollisionResult:
    """Decide whether a next iteration collides with any FOREIGN_LIVE child.

    PURE — no I/O. The caller resolves each foreign child's lane and passes:

      foreign          — list of (ts, lane), one per FOREIGN_LIVE child.
      my_tree          — this loop's leased lane tree (file globs).
      lane_tree_lookup — callable(lane) -> list[str], a foreign lane's tree.
                         An empty/None result, or a lookup that raises, is
                         treated as an unknown tree (COLLISION).
      exclusive_lanes  — lane names that are whole-portfolio / exclusive
                         (always COLLISION). Caller-supplied (the host passes
                         its `cfg.lanes.exclusive`); defaults to the generic
                         `global` only — the kernel hardcodes no host lane.

    Verdict (most-conservative wins; the first offender drives the result):
      NONE            — `foreign` is empty.
      COLLISION       — `my_tree` is empty, or a foreign child has a blank
                        lane, an exclusive lane, an unknown/empty tree, or a
                        tree that is not disjoint from `my_tree`.
      SAFE_CONCURRENT — every foreign child's tree is known, non-empty, AND
                        disjoint from `my_tree` per `lane_trees_disjoint`.

    Both trees must be known and non-empty to clear — an unknown tree refuses
    (the same disjointness discipline as `classify_sibling_scan`).
    """

    def _collision(lane_label: str, child_ts: str, why: str) -> ForeignCollisionResult:
        # Every COLLISION outcome carries the same three evidence fields.
        return ForeignCollisionResult(
            ForeignCollisionVerdict.COLLISION,
            colliding_lane=lane_label,
            colliding_ts=child_ts,
            reason=why,
        )

    if not foreign:
        return ForeignCollisionResult(
            ForeignCollisionVerdict.NONE, reason="no FOREIGN_LIVE children"
        )

    if not my_tree:
        # Without our own tree nothing can be proven disjoint; blame the first
        # foreign child so the caller has something concrete to report.
        first_ts, first_lane = foreign[0]
        return _collision(
            first_lane,
            first_ts,
            "this loop's own lane tree is unknown — cannot prove "
            "disjointness from any foreign child; stop (conservative)",
        )

    for child_ts, raw_lane in foreign:
        lane_name = (raw_lane or "").strip()

        if not lane_name or lane_name in exclusive_lanes:
            shown = lane_name or "(unknown)"
            return _collision(
                shown,
                child_ts,
                f"foreign child {child_ts} has scope {shown!r} "
                "— unknown/whole-portfolio blast radius, not provably "
                "disjoint; stop",
            )

        # A failing lookup is folded into "unknown tree" — the conservative
        # outcome — rather than propagated.
        try:
            their_tree = list(lane_tree_lookup(lane_name) or [])
        except Exception:
            their_tree = []

        if not their_tree:
            return _collision(
                lane_name,
                child_ts,
                f"foreign child {child_ts} lane {lane_name!r} resolves to an empty "
                "tree — unknown blast radius; stop",
            )

        if not lane_trees_disjoint(list(my_tree), their_tree):
            return _collision(
                lane_name,
                child_ts,
                f"foreign child {child_ts} lane {lane_name!r} tree overlaps this "
                "loop's lane — a next iteration would race its "
                "soft-claim registry; stop",
            )

    roster = ", ".join(f"{child_ts}:{raw_lane}" for child_ts, raw_lane in foreign)
    return ForeignCollisionResult(
        ForeignCollisionVerdict.SAFE_CONCURRENT,
        reason=(
            f"all FOREIGN_LIVE children on disjoint lanes ({roster}) — "
            "safe to continue concurrently (intended parallel fan-out)"
        ),
    )
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# ===========================================================================
|
|
387
|
+
# 3. Un-leased-sibling scan
|
|
388
|
+
# ===========================================================================
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
@dataclass(frozen=True)
class SiblingScanResult:
    """Typed verdict of `classify_sibling_scan` — what a loop's Step 0 should do.

    `verdict` is one of:
      'clear'   — no un-leased live sibling (or only disjoint ones); proceed.
      'reroute' — a live un-leased cluster/keyword sibling AND this loop was
                  bare; re-acquire excluding the sibling's lane. `sibling_lane`
                  names the lane to avoid; `free_lanes` lists pickable lanes.
      'stop'    — back out: the sibling is on an exclusive lane, OR this loop
                  was invoked with an explicit scope (don't silently move it),
                  OR a bare loop has no free lane left.

    The `sibling_*` fields describe the sibling that labels the verdict and
    `reason` is the operator-facing explanation; all default to empty.
    """

    verdict: str
    sibling_ts: str = ""
    sibling_scope: str = ""
    sibling_lane: str = ""
    free_lanes: tuple[str, ...] = field(default_factory=tuple)
    reason: str = ""

    def to_dict(self) -> dict:
        """Return the result as a plain dict, with `free_lanes` as a list."""
        return dict(
            verdict=self.verdict,
            sibling_ts=self.sibling_ts,
            sibling_scope=self.sibling_scope,
            sibling_lane=self.sibling_lane,
            free_lanes=list(self.free_lanes),
            reason=self.reason,
        )
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _disjoint_from_all_live(
    *,
    requested_tree: list[str],
    live: list[dict],
    sibling_tree_lookup: Callable[[str], Optional[list[str]]],
) -> bool:
    """Safety predicate behind the disjointness escape.

    Forwards all three arguments unchanged to
    `dos._tree.tree_disjoint_from_all_live` and returns its answer: whether
    `requested_tree` is provably disjoint from EVERY live sibling. That shared
    definition is the one the lane ARBITER's selection-time filter and this
    post-acquire escape both stand on, so they cannot drift apart.

    The module-local name is kept because this module's existing call sites
    and tests reference it directly.
    """
    all_disjoint = tree_disjoint_from_all_live(
        requested_tree=requested_tree,
        live=live,
        sibling_tree_lookup=sibling_tree_lookup,
    )
    return all_disjoint
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def live_siblings_subset(
    *,
    siblings: list[dict],
    leased_ts: set[str],
    now_ts: float,
    liveness_window_min: float = DEFAULT_SIBLING_LIVENESS_WINDOW_MIN,
) -> list[dict]:
    """Filter pre-collected sibling facts down to the live, un-leased ones.

    A sibling is kept — i.e. counts as a live invisible collision — iff ALL of:
      * its `ts` is NOT in `leased_ts` (the arbiter already sees leased ones);
      * its `completed` flag is falsy;
      * its `newest_log_mtime` is not older than `liveness_window_min` minutes
        before `now_ts` (a missing/None mtime is read as 0).

    This is the single definition of "which siblings are live right now,"
    shared by `classify_sibling_scan` (the post-acquire escape) and the lane
    ARBITER's FQ-449 selection filter, which must feed
    `tree_disjoint_from_all_live` ONLY genuinely-live siblings — otherwise a
    long-finished run's stale fact would force every bare pick to fall back.

    Pure — `now_ts` is injected at the boundary. Input order is preserved and
    a new list is returned.
    """
    oldest_live_mtime = now_ts - liveness_window_min * 60

    def _is_live(fact: dict) -> bool:
        if fact.get("ts") in leased_ts:
            return False
        if fact.get("completed"):
            return False
        newest_write = fact.get("newest_log_mtime") or 0
        # Phrased as "not older than" so the comparison is exactly the
        # negation of the staleness test.
        return not (newest_write < oldest_live_mtime)

    return [fact for fact in siblings if _is_live(fact)]
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def classify_sibling_scan(
    *,
    siblings: list[dict],
    leased_ts: set[str],
    invoked_bare: bool,
    now_ts: float,
    free_lane_pool: list[str],
    requested_tree: Optional[list[str]] = None,
    sibling_tree_lookup: Optional[Callable[[str], Optional[list[str]]]] = None,
    liveness_window_min: float = DEFAULT_SIBLING_LIVENESS_WINDOW_MIN,
    exclusive_lanes: tuple[str, ...] = ("global",),
) -> SiblingScanResult:
    """PURE verdict logic for the un-leased-sibling guard. No I/O.

    `siblings` — pre-collected facts: {ts, newest_log_mtime, completed (bool),
      scope ('global'|'orchestration'|'cluster/keyword'), lane (str)}.
    `leased_ts` — loop_ts values that hold a live lease (arbiter saw them).
    `invoked_bare` — True if this loop got no explicit scope.
    `now_ts` — current epoch seconds (injected at the boundary).
    `free_lane_pool` — the caller's auto-pick lane pool a bare loop reroutes onto
      (host taxonomy — the kernel does not hardcode a cluster set).
    `requested_tree` — THIS loop's requested lane tree; enables the disjointness
      escape for BOTH scope modes (the lane's tree, known + provably disjoint
      from every live sibling). An explicit-scope loop passes its scoped tree;
      a BARE loop passes its AUTO-PICKED lane's tree (the caller resolves it
      after acquire). The escape requires disjointness from EVERY live sibling
      (`_disjoint_from_all_live`), not just `live[0]` — clearing on the first
      while colliding with the second would corrupt a real handoff.
    `sibling_tree_lookup` — callable(lane) -> tree, to evaluate disjointness.
    `liveness_window_min` — how recent a sibling's log must be to count as live.
    `exclusive_lanes` — lanes that dominate the verdict (caller-supplied).

    A sibling is a genuine invisible collision iff: not in leased_ts, not
    completed, and its newest log is within the liveness window. An exclusive
    sibling dominates (stop, regardless of trees). Otherwise the disjointness
    escape runs (clear if provably disjoint from ALL live siblings); failing
    that, a bare loop reroutes onto a free lane and an explicit-scope loop stops.
    The FIRST live sibling (exclusive-first sort) labels the verdict's evidence.

    On 'reroute', `free_lanes` is `free_lane_pool` minus the lane of EVERY live
    sibling, not only the labelling one, so the caller is never offered a lane
    another live un-leased sibling is already running on. If that leaves the
    pool empty the verdict is 'stop'.
    """
    live = live_siblings_subset(
        siblings=siblings, leased_ts=leased_ts, now_ts=now_ts,
        liveness_window_min=liveness_window_min,
    )
    if not live:
        return SiblingScanResult("clear", reason="no un-leased live sibling")

    # Exclusive siblings first (stable sort), so one of them labels the verdict
    # whenever any is present. `live` is a fresh list, so sorting in place does
    # not disturb the caller's `siblings`.
    live.sort(key=lambda s: 0 if s.get("scope") in exclusive_lanes else 1)
    sib = live[0]
    sib_ts = str(sib.get("ts") or "")
    sib_scope = str(sib.get("scope") or "cluster/keyword")
    sib_lane = str(sib.get("lane") or "")
    if sib_scope in exclusive_lanes:
        return SiblingScanResult(
            "stop", sibling_ts=sib_ts, sibling_scope=sib_scope,
            sibling_lane=sib_lane,
            reason=(f"un-leased live sibling {sib_ts} holds exclusive lane "
                    f"{sib_scope!r} — this loop must not run alongside it."))

    # The disjointness escape (both scope modes). Tree-disjointness is the SOLE
    # concurrency gate everywhere else in dos (arbiter admission, the orphan
    # sweep's SAFE-CONCURRENT verdict); the sibling scan must honour it too. The
    # escape requires this loop's `requested_tree` to be provably disjoint from
    # *every* live sibling's tree — checking only `live[0]` would clear a loop
    # that collides with `live[1]`. An explicit-scope loop already passed its
    # tree in; a BARE loop passes its AUTO-PICKED lane's tree (resolved by the
    # caller after acquire) — without that, a bare loop could never run
    # concurrently even when it provably cannot collide (the 2026-06-03
    # non-converging-reroute finding: a bare loop rerouted forever off a
    # lane-less read-only `/replan` sibling it could never collide with).
    if requested_tree and sibling_tree_lookup is not None:
        if _disjoint_from_all_live(
            requested_tree=requested_tree, live=live,
            sibling_tree_lookup=sibling_tree_lookup,
        ):
            return SiblingScanResult(
                "clear", sibling_ts=sib_ts, sibling_scope=sib_scope,
                sibling_lane=sib_lane,
                reason=(f"un-leased live sibling {sib_ts} on lane {sib_lane!r} "
                        f"(+{len(live) - 1} more) but every live sibling's tree "
                        f"is disjoint from the requested lane's tree — safe to "
                        f"run concurrently."))

    if not invoked_bare:
        return SiblingScanResult(
            "stop", sibling_ts=sib_ts, sibling_scope=sib_scope,
            sibling_lane=sib_lane,
            reason=(f"un-leased live sibling {sib_ts} collides and this loop "
                    f"named an explicit scope — not re-routing silently; pick a "
                    f"different scope or wait."))

    # Reroute pool: drop the lane of EVERY live sibling. Excluding only
    # `sib_lane` would let a bare loop re-acquire onto a lane that `live[1]`
    # (or later) occupies — the same first-sibling-only mistake the
    # disjointness escape above guards against.
    occupied_lanes = {str(s.get("lane") or "") for s in live}
    free = [c for c in free_lane_pool if c not in occupied_lanes]
    if not free:
        return SiblingScanResult(
            "stop", sibling_ts=sib_ts, sibling_scope=sib_scope,
            sibling_lane=sib_lane,
            reason=(f"un-leased live sibling {sib_ts} on lane {sib_lane!r} and "
                    f"no other lane free — nothing to re-route onto."))
    return SiblingScanResult(
        "reroute", sibling_ts=sib_ts, sibling_scope=sib_scope,
        sibling_lane=sib_lane, free_lanes=tuple(free),
        reason=(f"un-leased live sibling {sib_ts} on lane {sib_lane!r}; bare "
                f"loop re-routes — re-acquire on a free lane: {free}."))
|