dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/lane_overlap.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Lane-tree overlap policy for `/dispatch-loop` lane arbitration.
|
|
2
|
+
|
|
3
|
+
A *lane* (a `--scope` cluster, a keyword scope, or a bare plan) owns a set of
|
|
4
|
+
repo-relative path globs — its `tree`. Two lanes are safe to run concurrently
|
|
5
|
+
when their trees barely intersect; the binary "any overlap = refuse" rule was
|
|
6
|
+
provably too tight for narrow keyword lanes whose tree shares a handful of
|
|
7
|
+
incidental files with a cluster's summary glob.
|
|
8
|
+
|
|
9
|
+
Read this module as a **lock-compatibility function**, not a swim-lane rule: a
|
|
10
|
+
lane is a leased predicate-lock over a region, and the ratio threshold below is a
|
|
11
|
+
*deliberately loosened* compatibility test (strict disjointness was too
|
|
12
|
+
conservative). That reframing — and why it matters for tuning the threshold and
|
|
13
|
+
for the capability-lattice generalization — is `docs/89_the-lane-is-a-region-lock.md`.
|
|
14
|
+
|
|
15
|
+
The policy is a pure function — list-in, verdict-out — so it is replay-tested
|
|
16
|
+
in isolation (`tests/test_dispatch_lane.py::TestArbitrateSoftOverlap`), the
|
|
17
|
+
same discipline as `scripts/gate_classify.py`.
|
|
18
|
+
|
|
19
|
+
>>> overlap_verdict(["playbooks/ats/workday.yaml"], ["agents/apply_*.py"]).verdict
|
|
20
|
+
<Verdict.ADMIT_SOFT: 'admit_soft'>
|
|
21
|
+
|
|
22
|
+
>>> overlap_verdict(["agents/apply_*.py"], ["agents/apply_*.py"]).verdict
|
|
23
|
+
<Verdict.REFUSE_EXACT_GLOB: 'refuse_exact_glob'>
|
|
24
|
+
|
|
25
|
+
A lane that shares the *identical* glob with a live lease refuses as a hard
|
|
26
|
+
collision (REFUSE_EXACT_GLOB), checked before the ratio test so a real
|
|
27
|
+
write-surface overlap cannot be diluted to ADMIT by padding the requesting
|
|
28
|
+
tree with private files. This closed the 2026-06-01 TM↔tailor mutual-wedge:
|
|
29
|
+
TM (8 entries, sharing only `agents/tailor_*.py` + `agents/tailor_steps/...`
|
|
30
|
+
with the tailor cluster) scored 2/8 = 25 % ≤ 33 % and SOFT-ADMITTED under the
|
|
31
|
+
ratio alone, while the reverse direction refused — an asymmetry that
|
|
32
|
+
*guaranteed* a wedge. Exact-glob equality is symmetric and kills it.
|
|
33
|
+
"""
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
from dataclasses import dataclass
|
|
37
|
+
from enum import Enum
|
|
38
|
+
|
|
39
|
+
from dos._tree import norm_tree_prefix as _norm_tree_prefix
|
|
40
|
+
from dos._tree import prefixes_collide as _prefixes_collide
|
|
41
|
+
|
|
42
|
+
# Ratio threshold: shared/requested above this = refuse. ⅓ is NOT a calibrated
|
|
43
|
+
# soundness bound — it is a STAND-IN that admits a known hazard. The prior-art
|
|
44
|
+
# audit (`docs/114` §A1) is the load-bearing caveat: lane conflict is a *measure*
|
|
45
|
+
# here ("how much of the requested tree shares prefixes"), but 50 years of
|
|
46
|
+
# concurrency control (Gray et al. 1975, *Granularity of Locks*) make
|
|
47
|
+
# lock-compatibility a *boolean predicate* — two writers may share a contended
|
|
48
|
+
# datum ONLY under operation commutativity (O'Neil 1986, escrow), which arbitrary
|
|
49
|
+
# file overwrites lack. So any threshold > 0 (⅓ included) admits genuine write–write conflicts on the
|
|
50
|
+
# shared remainder (a silent lost-update `verify()` cannot catch — there is no
|
|
51
|
+
# over-claim against git). The value was read off two observed lanes — a narrow
|
|
52
|
+
# keyword lane that should admit (`--scope workday` at 5/16 = 31 %) vs one sharing
|
|
53
|
+
# substantial code with its cluster (`apply-heavy` at 4/10 = 40 %) — i.e. it is an
|
|
54
|
+
# empirical elbow between two examples, not a derived safe bound.
|
|
55
|
+
#
|
|
56
|
+
# Why it is NOT simply flipped to 0 (the audit's first instinct): `docs/114` §F
|
|
57
|
+
# dispositioned that ratio flip as a *detector re-tune* — it trades away the read
|
|
58
|
+
# concurrency ⅓ buys without closing the underlying hazard (two lanes still collide
|
|
59
|
+
# *under any ratio* at the unmediated write moment; DOS is a PDP with no PEP). The
|
|
60
|
+
# sound fix is a real shared/exclusive lock MODE + a glob-intersection disjointness
|
|
61
|
+
# floor enforced at a `dos`-mediated apply-gate, deferred there rather than half-built
|
|
62
|
+
# as a stricter advisory scalar. A workspace that wants the predicate today can set
|
|
63
|
+
# `dos.toml [overlap] ratio_max = 0` (tightening below ⅓ takes effect; loosening above
|
|
64
|
+
# is capped by the floor — `overlap_policy.floor_decision`).
|
|
65
|
+
OVERLAP_RATIO_MAX = 1 / 3
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Verdict(str, Enum):
    """Typed admission outcome produced by the overlap policy.

    Every member names both the decision (admit / refuse) and the reason
    category, so a consumer can render a legible refusal from the member
    itself rather than re-classifying a free-text string. Members are also
    ``str`` instances whose value is the lowercase form of the name.
    """

    # No requested entry shares a prefix with the lease.
    ADMIT_DISJOINT = "admit_disjoint"
    # Prefixes are shared, but the shared fraction is within the threshold.
    ADMIT_SOFT = "admit_soft"
    # The shared fraction is above the ratio threshold.
    REFUSE_OVERLAP = "refuse_overlap"
    # Both lanes claim an identical glob.
    REFUSE_EXACT_GLOB = "refuse_exact_glob"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(frozen=True)
class OverlapDecision:
    """Immutable outcome of a single lane-vs-lease overlap check."""

    # The typed admit/refuse outcome.
    verdict: Verdict
    # Number of requested entries counted as shared with the lease.
    shared: int
    # Size of the requested tree the shared count is measured against.
    requested: int
    # One-line human-readable explanation of the verdict.
    reason: str

    @property
    def admissible(self) -> bool:
        """Whether the verdict is one of the two ``ADMIT_*`` members."""
        outcome = self.verdict
        # Plain equality (not identity or hashing) so the comparison behaves
        # exactly like membership in a tuple of the two members.
        return outcome == Verdict.ADMIT_DISJOINT or outcome == Verdict.ADMIT_SOFT
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _exact_glob_collisions(req_tree: list[str], lease_tree: list[str]) -> list[str]:
    """Return the requested entries whose normalized prefix equals a lease entry's.

    This is the *hard*-collision detector that complements the ratio test. The
    ratio test measures how much of one lane falls under another, which suits
    the incidental case (a narrow lane's file sitting under a broad summary
    glob). Two lanes naming the **identical** glob (for example
    `agents/tailor_*.py` on both sides) claim the same write region, and that
    is a collision at any ratio. Equality is symmetric, so both directions of
    the check reach the same answer — unlike the ratio, where a lane with many
    private entries could score 2/8 = 25 % and be admitted while the reverse
    direction (2/3 = 67 %) refused, wedging the two loops against each other.

    Entries that normalize to the empty prefix (a bare ``**/*`` or ``*.py``)
    are deliberately excluded on both sides: a whole-repo glob is left to the
    ratio path, and only *named-region* matches count here.

    Args:
        req_tree: Path globs of the requesting lane.
        lease_tree: Path globs of the live lease.

    Returns:
        The original (un-normalized) requested entries that matched, in input
        order, keeping only the first entry for each distinct normalized
        prefix. Empty when either tree is empty or nothing matches.
    """
    if not (req_tree and lease_tree):
        return []

    # Named (non-empty) normalized prefixes held by the lease; literally
    # blank entries are skipped before normalization.
    lease_named: set[str] = set()
    for entry in lease_tree:
        if not entry:
            continue
        prefix = _norm_tree_prefix(entry)
        if prefix != "":
            lease_named.add(prefix)
    if not lease_named:
        return []

    matched: list[str] = []
    reported: set[str] = set()
    for entry in req_tree:
        if not entry:
            continue
        prefix = _norm_tree_prefix(entry)
        if not prefix or prefix not in lease_named or prefix in reported:
            continue
        reported.add(prefix)
        matched.append(entry)
    return matched
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _shared_count(req_tree: list[str], lease_tree: list[str]) -> int:
    """Count the requested entries that prefix-collide with any lease entry.

    A requested entry contributes at most one to the total, however many lease
    entries it collides with. Collision is decided by `_prefixes_collide` on
    the `_norm_tree_prefix` form of each entry, so this count shares its
    definition of "collide" with the rest of the tree helpers.

    A **leading-glob** entry (`**/*`, `*.py`) normalizes to the empty prefix
    ``""``, the universal prefix. Such entries are KEPT on both sides: a
    requested whole-repo glob collides with every lease entry, and a
    whole-repo lease glob is collided-with by every requested entry. Only a
    literally blank entry (falsy before normalization) is dropped, because it
    carries no path information. Dropping the normalized ``""`` instead would
    make the broadest possible tree read as "touches nothing".

    Args:
        req_tree: Path globs of the requesting lane.
        lease_tree: Path globs of the live lease.

    Returns:
        The number of non-blank requested entries that collide with at least
        one non-blank lease entry; 0 when either side has none.
    """
    if not (req_tree and lease_tree):
        return 0

    # Normalize the lease side once; the empty prefix stays in the list.
    lease_prefixes = [_norm_tree_prefix(entry) for entry in lease_tree if entry]
    if not lease_prefixes:
        return 0

    def _hits_lease(entry: str) -> bool:
        prefix = _norm_tree_prefix(entry)
        return any(_prefixes_collide(prefix, held) for held in lease_prefixes)

    return sum(1 for entry in req_tree if entry and _hits_lease(entry))
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def overlap_verdict(
    requested_tree: list[str], lease_tree: list[str],
    *, ratio_max: float = OVERLAP_RATIO_MAX,
) -> OverlapDecision:
    """Decide whether a known-tree lane can run alongside a known-tree lease.

    Empty-tree handling is the caller's job; this function is only for the
    known-vs-known case.

    Decision order:

    * Any IDENTICAL named glob on both sides → REFUSE_EXACT_GLOB. Checked
      first (see `_exact_glob_collisions`) and symmetric, so it cannot
      admit one direction and refuse the other.
    * No shared prefixes → ADMIT_DISJOINT.
    * Shared fraction ≤ ``ratio_max`` of the requested tree → ADMIT_SOFT.
    * Shared fraction > ``ratio_max`` → REFUSE_OVERLAP.

    The exact-glob floor is INDEPENDENT of ``ratio_max``: an identical glob is
    a collision at any tolerance, including 0.

    The requested-tree size used for the ratio (and reported on the decision)
    counts only non-blank entries. `_shared_count` already ignores blank
    entries when counting collisions, so counting them in the denominator
    would let a tree padded with ``""`` dilute a real overlap down to
    ADMIT_SOFT. For trees without blank entries the result is unchanged.

    Args:
        requested_tree: Path globs of the lane asking for admission.
        lease_tree: Path globs of the live lease it is checked against.
        ratio_max: Soft-overlap tolerance (keyword-only). Defaults to the
            module constant ``OVERLAP_RATIO_MAX``; it is a parameter so a
            caller can supply a different threshold while the functional
            form (a ratio compare) stays here.

    Returns:
        An `OverlapDecision` carrying the verdict, the shared count, the
        number of non-blank requested entries, and a one-line reason.
    """
    # Materialize once: both helpers and the size count read the same lists.
    req = list(requested_tree)
    held = list(lease_tree)
    requested_n = sum(1 for entry in req if entry)

    # Hard floor: two lanes naming the same glob claim the same write region.
    # Checked BEFORE the ratio so a real collision cannot be diluted to admit
    # by padding the requesting tree with private (non-shared) files.
    exact = _exact_glob_collisions(req, held)
    shared = _shared_count(req, held)
    if exact:
        preview = ", ".join(exact[:3]) + ("…" if len(exact) > 3 else "")
        return OverlapDecision(
            Verdict.REFUSE_EXACT_GLOB, shared, requested_n,
            (f"exact-glob overlap: identical glob claimed by both lanes "
             f"({len(exact)}: {preview}) — same write region, hard collision "
             "regardless of ratio"),
        )

    if shared == 0:
        return OverlapDecision(
            Verdict.ADMIT_DISJOINT, shared, requested_n,
            "no shared prefixes — fully disjoint",
        )

    # shared > 0 implies at least one non-blank requested entry, so the
    # max() is only a belt-and-braces guard against division by zero.
    ratio = shared / max(1, requested_n)
    if ratio > ratio_max:
        return OverlapDecision(
            Verdict.REFUSE_OVERLAP, shared, requested_n,
            (f"overlap too large ({shared}/{requested_n} = "
             f"{ratio:.0%} of requested tree shared, threshold "
             f"{ratio_max:.0%})"),
        )
    return OverlapDecision(
        Verdict.ADMIT_SOFT, shared, requested_n,
        (f"soft-overlap admit — {shared}/{requested_n} = "
         f"{ratio:.0%} of requested tree shared (≤{ratio_max:.0%})"),
    )
|
dos/lease_health.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""lease_health — pure verdicts over lease + child-run liveness facts.
|
|
2
|
+
|
|
3
|
+
Lifted from the job userland's ``scripts/fanout_state.py`` (MQ3X P2, docs/62).
|
|
4
|
+
Two ``classify`` verdicts in the ``liveness`` / ``health`` mold — facts in, a
|
|
5
|
+
typed verdict out, the clock injected, no I/O:
|
|
6
|
+
|
|
7
|
+
* ``classify_lease_health`` — combine a lease's heartbeat age with an
|
|
8
|
+
already-probed ``activity_state`` into a LANE-LEASE verdict
|
|
9
|
+
(LIVE / STALLED / ORPHANED_WORKING / DEAD). The host does the FS activity
|
|
10
|
+
probe and passes the resulting string in; this decides reclaim-vs-keep.
|
|
11
|
+
* ``classify_child_stall`` — the AST4 child-stall guard: given a child run's
|
|
12
|
+
log-quiet age, the HEAD-sha delta since the last check, and the archive-sha
|
|
13
|
+
set, decide ALIVE / DEAD / DOUBLE_ARCHIVE before a /dispatch-loop takeover.
|
|
14
|
+
|
|
15
|
+
Plus ``parse_iso`` — the minute-OR-second ISO stamp parser the lease stack needs
|
|
16
|
+
(both resolutions the host stamp and a journal ``replay()`` produce). Generic,
|
|
17
|
+
clock-free; lives here because ``classify_lease_health`` is its first kernel use.
|
|
18
|
+
|
|
19
|
+
Mechanism-not-policy: the TTL / stall windows are job tuning, supplied on a
|
|
20
|
+
frozen ``LeaseHealthPolicy`` (defaults reproduce the job's historical constants);
|
|
21
|
+
the child-stall quiet window is a ``classify_child_stall`` parameter. The
|
|
22
|
+
verdict STRINGS (LEASE_* / CHILD_*) are the kernel's stable vocabulary.
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import datetime as _dt
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
|
|
29
|
+
# --- lease-health verdict vocabulary --------------------------------------
|
|
30
|
+
LEASE_LIVE = "LIVE"
|
|
31
|
+
LEASE_STALLED = "STALLED"
|
|
32
|
+
LEASE_ORPHANED_WORKING = "ORPHANED_WORKING"
|
|
33
|
+
LEASE_DEAD = "DEAD"
|
|
34
|
+
|
|
35
|
+
# --- child-stall verdict vocabulary ---------------------------------------
|
|
36
|
+
CHILD_ALIVE = "alive"
|
|
37
|
+
CHILD_DEAD = "dead"
|
|
38
|
+
CHILD_DOUBLE_ARCHIVE = "double-archive"
|
|
39
|
+
# A child that is STILL ALIVE (log growing and/or HEAD advancing) but whose
|
|
40
|
+
# every registered pick is already an ancestor of HEAD — i.e. the productive
|
|
41
|
+
# work is durable in git and the continued aliveness is pure waste (the
|
|
42
|
+
# post-commit re-verify / re-commit limit-cycle). The upper skill should
|
|
43
|
+
# TaskStop it and classify the iteration from git ancestry, not keep waiting.
|
|
44
|
+
CHILD_CHURNING = "child-churning"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def parse_iso(s: str) -> _dt.datetime | None:
    """Parse a ``Z``-suffixed ISO stamp into an aware UTC datetime.

    Two resolutions are accepted, tried in this order:

    * minute — ``%Y-%m-%dT%H:%MZ`` (the host stamp, the common case);
    * second — ``%Y-%m-%dT%H:%M:%SZ`` (what a journal ``replay()`` writes into
      a reconstructed lease's ``heartbeat_at``).

    Supporting the second form matters for safety, not cosmetics: a
    second-resolution stamp that failed to parse would come back as None, and
    a None heartbeat lets the TTL backstop be skipped by a caller that treats
    "no timestamp" as "no age".

    Args:
        s: The stamp text. Any value that matches neither format, including
            a non-string, is treated as malformed.

    Returns:
        The parsed moment with ``tzinfo`` set to UTC, or None when ``s`` is
        malformed.
    """
    layouts = ("%Y-%m-%dT%H:%MZ", "%Y-%m-%dT%H:%M:%SZ")
    for layout in layouts:
        try:
            naive = _dt.datetime.strptime(s, layout)
        except (ValueError, TypeError):
            # Wrong shape for this layout (or not a string): try the next one.
            continue
        return naive.replace(tzinfo=_dt.timezone.utc)
    return None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass(frozen=True)
class LeaseHealthPolicy:
    """The TTL + stall windows that separate the lease-health verdicts — policy.

    The defaults (TTL 50 minutes, stall threshold 8 minutes) are the values a
    caller gets by passing ``DEFAULT_POLICY`` or nothing at all.

    ttl_minutes             — past this heartbeat age the lease is
                              unambiguously DEAD (the hard TTL backstop, wins
                              over activity).
    stall_threshold_minutes — at or below this, the lease is LIVE; between this
                              and the TTL the activity probe decides
                              STALLED-vs-ORPHANED_WORKING.

    Raises:
        ValueError: if either window is negative or NaN.
    """

    ttl_minutes: float = 50.0
    stall_threshold_minutes: float = 8.0

    def __post_init__(self) -> None:
        # Test `not (x >= 0)` rather than `x < 0`: NaN compares False in both
        # directions, so a `< 0` check lets it through. A NaN TTL can never be
        # exceeded by any heartbeat age, which would silently disable the
        # hard DEAD backstop. Infinity is still accepted.
        for window in (self.ttl_minutes, self.stall_threshold_minutes):
            if not window >= 0:
                raise ValueError("lease-health windows must be non-negative (minutes)")
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
DEFAULT_POLICY = LeaseHealthPolicy()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def classify_lease_health(
    lease: dict,
    *,
    now: _dt.datetime,
    activity_state: str,
    policy: LeaseHealthPolicy = DEFAULT_POLICY,
) -> str:
    """Turn a lease's heartbeat age and a pre-probed activity state into a verdict.

    No I/O happens here: the clock (``now``) and the ``activity_state`` string
    are supplied by the caller, so the function can be unit-tested in isolation.

    The heartbeat is read from ``lease["heartbeat_at"]``, falling back to
    ``lease["acquired_at"]`` when that is missing or empty, and parsed with
    `parse_iso`. Rules, applied in order:

    * Unparseable / absent stamp → age is treated as infinite, so a malformed
      lease cannot block forever.
    * age > ``policy.ttl_minutes`` → ``LEASE_DEAD`` (hard backstop).
    * age ≤ ``policy.stall_threshold_minutes`` → ``LEASE_LIVE``.
    * otherwise the activity probe decides: ``"LIVE_DOWNSTREAM"`` or
      ``"UNKNOWN"`` → ``LEASE_ORPHANED_WORKING`` (never reclaim on missing
      evidence); any other value, expected to be ``"QUIET"`` →
      ``LEASE_STALLED``.

    Args:
        lease: Mapping holding the lease's ISO stamps.
        now: Current time; subtracted from the parsed (UTC-aware) heartbeat.
        activity_state: Result of the host's activity probe.
        policy: TTL and stall windows, in minutes.

    Returns:
        One of ``LEASE_LIVE`` / ``LEASE_STALLED`` / ``LEASE_ORPHANED_WORKING``
        / ``LEASE_DEAD``.
    """
    stamp = lease.get("heartbeat_at", "") or lease.get("acquired_at", "")
    beat = parse_iso(stamp)
    minutes_old = (
        float("inf") if beat is None
        else (now - beat).total_seconds() / 60.0
    )

    # The TTL backstop is tested first so it wins over every other signal.
    if minutes_old > policy.ttl_minutes:
        return LEASE_DEAD
    if minutes_old <= policy.stall_threshold_minutes:
        return LEASE_LIVE
    # In the window between stall threshold and TTL the probe decides.
    if activity_state in ("LIVE_DOWNSTREAM", "UNKNOWN"):
        return LEASE_ORPHANED_WORKING
    # Remaining case is a quiet lease: reclaimable.
    return LEASE_STALLED
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class ChildStallResult:
    """Typed verdict of the child-stall guard, plus the facts behind it.

    ``verdict`` is one of CHILD_ALIVE / CHILD_DEAD / CHILD_DOUBLE_ARCHIVE /
    CHILD_CHURNING.
    ``log_age_seconds`` is how long the log has been quiet (None if absent).
    ``log_grew`` / ``new_commit`` record which liveness signals were reported.
    ``archive_count`` is the number of archive commits seen for the run-ts and
    ``archive_shas`` the shas themselves (None is stored as an empty list).
    ``shipped_pick_count`` / ``registered_pick_count`` are the ancestry facts
    behind the CHURNING verdict; both default to 0 for a caller that did not
    measure them.
    ``reason`` is a one-line human explanation.
    """

    verdict: str
    log_age_seconds: float | None = None
    log_grew: bool = False
    new_commit: bool = False
    archive_count: int = 0
    archive_shas: list[str] | None = None
    shipped_pick_count: int = 0
    registered_pick_count: int = 0
    reason: str = ""

    def __post_init__(self) -> None:
        # Give every instance its own list when no shas were supplied.
        if self.archive_shas is not None:
            return
        self.archive_shas = []

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order.

        Values are not copied: ``archive_shas`` is the instance's own list.
        """
        ordered_keys = (
            "verdict",
            "log_age_seconds",
            "log_grew",
            "new_commit",
            "archive_count",
            "archive_shas",
            "shipped_pick_count",
            "registered_pick_count",
            "reason",
        )
        return {key: getattr(self, key) for key in ordered_keys}
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def classify_child_stall(
    *,
    log_age_seconds: float | None,
    last_commit_sha: str | None,
    current_head_sha: str | None,
    archive_shas: list[str] | None = None,
    quiet_window_s: float = 600.0,
    registered_pick_count: int = 0,
    shipped_pick_count: int = 0,
) -> ChildStallResult:
    """Pure verdict logic of the child-stall guard: facts in, typed verdict out.

    Every argument is a fact the caller has already collected; nothing is
    read from disk or git here.

    Decision order:
      1. double-archive — two or more distinct non-empty archive shas mean the
         child already shipped its own archive, so a takeover is moot
         whatever the liveness signals say.
      2. churn — the child shows a liveness signal (log grew and/or HEAD
         advanced) but every registered pick is already shipped
         (``registered_pick_count > 0`` and ``shipped_pick_count >=
         registered_pick_count``). Tested BEFORE the alive branches because a
         churning child *is* alive; only "is the work already shipped" tells
         the two apart. With both counts left at 0 this branch never fires.
      3. log grew within ``quiet_window_s`` → alive.
      4. HEAD differs from the last-seen sha (both known) → alive.
      5. neither signal → dead; the takeover may proceed.

    The churn verdict is only as safe as ``shipped_pick_count``: the caller
    must never over-count it. While ``shipped < registered`` the verdict
    falls through to the alive branches, so a still-producing child is not
    reported as churning.

    Args:
        log_age_seconds: Seconds since the child's log last grew; None when
            there is no log.
        last_commit_sha: HEAD sha recorded at the previous check, if any.
        current_head_sha: HEAD sha now, if known.
        archive_shas: Archive commit shas seen for this run; falsy items are
            ignored.
        quiet_window_s: A log younger than this counts as still growing.
        registered_pick_count: Picks the run registered.
        shipped_pick_count: Registered picks already measured as shipped.

    Returns:
        A `ChildStallResult` with the verdict, the supplied facts, and a
        one-line reason.
    """
    shas = [sha for sha in (archive_shas or []) if sha]
    distinct = set(shas)
    n_archives = len(distinct)

    def _result(kind, reason, *, grew=False, committed=False):
        # Every verdict carries the same measured facts; only the verdict,
        # the reported liveness flags and the reason vary per branch.
        return ChildStallResult(
            kind,
            log_age_seconds=log_age_seconds,
            log_grew=grew,
            new_commit=committed,
            archive_count=n_archives,
            archive_shas=shas,
            shipped_pick_count=shipped_pick_count,
            registered_pick_count=registered_pick_count,
            reason=reason,
        )

    if n_archives >= 2:
        return _result(
            CHILD_DOUBLE_ARCHIVE,
            f"{n_archives} archive commits exist for this run-ts "
            f"({', '.join(s[:8] for s in sorted(distinct))}) — child "
            f"self-recovered and shipped its own archive; reconcile to "
            f"the child's artefacts, do NOT produce a competing one.",
        )

    # (a) log-growth signal: the log grew inside the quiet window.
    log_grew = log_age_seconds is not None and log_age_seconds < quiet_window_s
    # (b) new-commit signal: both shas known and HEAD moved since last check.
    new_commit = bool(
        current_head_sha and last_commit_sha
        and current_head_sha != last_commit_sha
    )

    # registered_pick_count > 0 keeps a no-picks iteration from ever being
    # classed as "all shipped".
    work_all_shipped = (
        registered_pick_count > 0
        and shipped_pick_count >= registered_pick_count
    )
    if work_all_shipped and (log_grew or new_commit):
        signal = "writing" if log_grew else "committing"
        return _result(
            CHILD_CHURNING,
            f"all {registered_pick_count} registered pick(s) are "
            f"ancestors of HEAD ({shipped_pick_count} shipped) yet the "
            f"child is still {signal} — post-commit churn, not progress; "
            f"TaskStop it and classify the iteration from git ancestry.",
            grew=log_grew,
            committed=new_commit,
        )

    if log_grew:
        # Only the log signal is reported on this path.
        return _result(
            CHILD_ALIVE,
            f"child log grew {log_age_seconds:.0f}s ago "
            f"(< {quiet_window_s:.0f}s quiet window) — still writing, "
            f"not stalled.",
            grew=True,
        )

    if new_commit:
        return _result(
            CHILD_ALIVE,
            f"HEAD advanced {last_commit_sha[:8]} → "
            f"{current_head_sha[:8]} since last check — child still "
            f"committing, not stalled.",
            committed=True,
        )

    # Both signals quiet: the takeover precondition holds.
    if log_age_seconds is not None:
        age_txt = f"quiet {log_age_seconds:.0f}s"
    else:
        age_txt = "log absent"
    return _result(
        CHILD_DEAD,
        f"child genuinely dead: {age_txt} (≥ {quiet_window_s:.0f}s "
        f"window) AND no new commit since last check — takeover of "
        f"Steps 8-9 may proceed.",
    )
|