dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/lane_overlap.py ADDED
@@ -0,0 +1,228 @@
1
+ """Lane-tree overlap policy for `/dispatch-loop` lane arbitration.
2
+
3
+ A *lane* (a `--scope` cluster, a keyword scope, or a bare plan) owns a set of
4
+ repo-relative path globs — its `tree`. Two lanes are safe to run concurrently
5
+ when their trees barely intersect; the binary "any overlap = refuse" rule was
6
+ provably too tight for narrow keyword lanes whose tree shares a handful of
7
+ incidental files with a cluster's summary glob.
8
+
9
+ Read this module as a **lock-compatibility function**, not a swim-lane rule: a
10
+ lane is a leased predicate-lock over a region, and the ratio threshold below is a
11
+ *deliberately loosened* compatibility test (strict disjointness was too
12
+ conservative). That reframing — and why it matters for tuning the threshold and
13
+ for the capability-lattice generalization — is `docs/89_the-lane-is-a-region-lock.md`.
14
+
15
+ The policy is a pure function — list-in, verdict-out — so it is replay-tested
16
+ in isolation (`tests/test_dispatch_lane.py::TestArbitrateSoftOverlap`), the
17
+ same discipline as `scripts/gate_classify.py`.
18
+
19
+ >>> overlap_verdict(["playbooks/ats/workday.yaml"], ["agents/apply_*.py"]).verdict
20
+ <Verdict.ADMIT_SOFT: 'admit_soft'>
21
+
22
+ >>> overlap_verdict(["agents/apply_*.py"], ["agents/apply_*.py"]).verdict
23
+ <Verdict.REFUSE_EXACT_GLOB: 'refuse_exact_glob'>
24
+
25
+ A lane that shares the *identical* glob with a live lease refuses as a hard
26
+ collision (REFUSE_EXACT_GLOB), checked before the ratio test so a real
27
+ write-surface overlap cannot be diluted to ADMIT by padding the requesting
28
+ tree with private files. This closed the 2026-06-01 TM↔tailor mutual-wedge:
29
+ TM (8 entries, sharing only `agents/tailor_*.py` + `agents/tailor_steps/...`
30
+ with the tailor cluster) scored 2/8 = 25 % ≤ 33 % and SOFT-ADMITTED under the
31
+ ratio alone, while the reverse direction refused — an asymmetry that
32
+ *guaranteed* a wedge. Exact-glob equality is symmetric and kills it.
33
+ """
34
+ from __future__ import annotations
35
+
36
+ from dataclasses import dataclass
37
+ from enum import Enum
38
+
39
+ from dos._tree import norm_tree_prefix as _norm_tree_prefix
40
+ from dos._tree import prefixes_collide as _prefixes_collide
41
+
42
+ # Ratio threshold: shared/requested above this = refuse. ⅓ is NOT a calibrated
43
+ # soundness bound — it is a STAND-IN that admits a known hazard. The prior-art
44
+ # audit (`docs/114` §A1) is the load-bearing caveat: lane conflict is a *measure*
45
+ # here ("how much of the requested tree shares prefixes"), but 50 years of
46
+ # concurrency control (Gray et al. 1975, *Granularity of Locks*) make
47
+ # lock-compatibility a *boolean predicate* — two writers may share a contended
48
+ # datum ONLY under operation commutativity (O'Neil 1986, escrow), which arbitrary
49
+ # file overwrites lack. So any threshold > 0 admits genuine write–write conflicts on the
50
+ # shared remainder (a silent lost-update `verify()` cannot catch — there is no
51
+ # over-claim against git). The value was read off two observed lanes — a narrow
52
+ # keyword lane that should admit (`--scope workday` at 5/16 = 31 %) vs one sharing
53
+ # substantial code with its cluster (`apply-heavy` at 4/10 = 40 %) — i.e. it is an
54
+ # empirical elbow between two examples, not a derived safe bound.
55
+ #
56
+ # Why it is NOT simply flipped to 0 (the audit's first instinct): `docs/114` §F
57
+ # dispositioned that ratio flip as a *detector re-tune* — it trades away the read
58
+ # concurrency ⅓ buys without closing the underlying hazard (two lanes still collide
59
+ # *under any ratio* at the unmediated write moment; DOS is a PDP with no PEP). The
60
+ # sound fix is a real shared/exclusive lock MODE + a glob-intersection disjointness
61
+ # floor enforced at a `dos`-mediated apply-gate, deferred there rather than half-built
62
+ # as a stricter advisory scalar. A workspace that wants the predicate today can set
63
+ # `dos.toml [overlap] ratio_max = 0` (tightening below ⅓ takes effect; loosening above
64
+ # is capped by the floor — `overlap_policy.floor_decision`).
65
+ OVERLAP_RATIO_MAX = 1 / 3
66
+
67
+
68
class Verdict(str, Enum):
    """Typed admission outcome that doubles as the reason category.

    The policy returns one of these members so the arbiter can render a
    legible refusal directly from the value instead of re-classifying a
    free-text string. Because of the ``str`` mixin every member also compares
    equal to its plain string value.
    """

    # --- admitting outcomes ---
    ADMIT_DISJOINT = "admit_disjoint"  # the two trees share no prefix at all
    ADMIT_SOFT = "admit_soft"  # some sharing, but within the ratio threshold
    # --- refusing outcomes ---
    REFUSE_OVERLAP = "refuse_overlap"  # sharing exceeds the ratio threshold
    REFUSE_EXACT_GLOB = "refuse_exact_glob"  # an identical glob on both sides
76
+
77
+
78
@dataclass(frozen=True)
class OverlapDecision:
    """Immutable outcome of one overlap check.

    ``verdict`` is the typed outcome, ``shared`` and ``requested`` are the two
    counts the verdict was derived from, and ``reason`` is the human-readable
    explanation meant for rendering.
    """

    verdict: Verdict
    shared: int
    requested: int
    reason: str

    @property
    def admissible(self) -> bool:
        """True exactly when the verdict is one of the two ADMIT_* outcomes."""
        outcome = self.verdict
        return outcome == Verdict.ADMIT_DISJOINT or outcome == Verdict.ADMIT_SOFT
88
+
89
+
90
def _exact_glob_collisions(req_tree: list[str], lease_tree: list[str]) -> list[str]:
    """Return the requested entries whose normalized prefix equals a lease entry's.

    This is the *hard*-collision detector, complementary to the ratio test.
    The ratio measures how much of one lane falls under another, which suits
    the incidental case (a narrow lane's specific file sitting under a broad
    summary glob). Two lanes that name the **identical** glob are instead
    claiming the same write region, and that is a collision at any ratio.

    Motivating failure: a plan-lane with 8 entries (6 of them private test
    files) shared exactly `agents/tailor_*.py` with a cluster lease, scored
    2/8 = 25 % and was soft-admitted, while the reverse direction (2/3 = 67 %)
    refused, so the two loops wedged each other. Equality of normalized
    prefixes is symmetric, so both directions now get the same answer.

    The universal empty prefix (a leading glob such as ``**/*`` or ``*.py``
    that normalizes to ``""``) never counts as an exact match here; whole-repo
    globs are left to the ratio path. Only named-region matches are reported.

    The result holds the original (un-normalized) requested entries, in
    request order, one per distinct matching prefix (the first entry seen for
    that prefix). Blank entries on either side are ignored.
    """
    if not (req_tree and lease_tree):
        return []

    # Named (non-empty) prefixes the lease claims.
    lease_named: set[str] = set()
    for entry in lease_tree:
        if not entry:
            continue
        prefix = _norm_tree_prefix(entry)
        if prefix != "":
            lease_named.add(prefix)
    if not lease_named:
        return []

    # prefix -> first requested entry carrying it; dict order keeps request order.
    first_hit: dict[str, str] = {}
    for entry in req_tree:
        if not entry:
            continue
        prefix = _norm_tree_prefix(entry)
        if prefix and prefix in lease_named and prefix not in first_hit:
            first_hit[prefix] = entry
    return list(first_hit.values())
131
+
132
+
133
def _shared_count(req_tree: list[str], lease_tree: list[str]) -> int:
    """Count the requested entries that prefix-collide with at least one lease entry.

    A requested entry contributes at most 1, however many lease entries it
    collides with. The collision test is `_tree.prefixes_collide`, applied to
    prefixes produced by `_tree.norm_tree_prefix`.

    Leading-glob entries (`**/*`, `*.py`) normalize to the empty prefix ``""``
    and are deliberately KEPT on both sides: the empty prefix is the universal
    one, so a whole-repo request collides with every lease entry and a
    whole-repo lease is collided-with by every request. Only entries that are
    literally blank (falsy before normalization) are skipped, because they
    carry no path information. Dropping the normalized ``""`` instead would
    make the broadest possible tree look like it touches nothing.
    """
    if not (req_tree and lease_tree):
        return 0

    # Filter on the raw entry (not the normalized prefix) so "" from a real
    # leading-glob entry survives.
    lease_side = [_norm_tree_prefix(entry) for entry in lease_tree if entry]
    if not lease_side:
        return 0

    def _touches_lease(entry: str) -> bool:
        prefix = _norm_tree_prefix(entry)
        return any(_prefixes_collide(prefix, other) for other in lease_side)

    return sum(1 for entry in req_tree if entry and _touches_lease(entry))
165
+
166
+
167
def overlap_verdict(
    requested_tree: list[str], lease_tree: list[str],
    *, ratio_max: float = OVERLAP_RATIO_MAX,
) -> OverlapDecision:
    """Decide whether a known-tree lane can run alongside a known-tree lease.

    This function is only for known-vs-known trees; empty-tree handling (the
    unknown-blast-radius asymmetry) is the caller's job.

    Rules, applied in this order:

    * Any IDENTICAL named glob on both sides → REFUSE_EXACT_GLOB. This hard
      floor is checked first (see `_exact_glob_collisions`), is symmetric, and
      is independent of ``ratio_max`` — an identical glob is a collision at
      any tolerance, including 0.
    * No shared prefixes → ADMIT_DISJOINT.
    * Shared > ``ratio_max`` of the requested tree → REFUSE_OVERLAP.
    * Otherwise (shared ≤ ``ratio_max``) → ADMIT_SOFT.

    ``ratio_max`` is the soft-overlap tolerance. It defaults to the module
    constant ``OVERLAP_RATIO_MAX`` so callers that do not pass it are
    unaffected; it is a parameter so a workspace-declared value can be
    threaded in, while the functional form (a ratio compare) stays here.

    Blank (falsy) requested entries are dropped before anything is counted.
    The helpers never count a blank entry as shared, so leaving blanks in the
    denominator would let a real overlap be diluted below the threshold just
    by padding the requested tree with empty strings. Consequently the
    ``requested`` field of the returned decision is the number of non-blank
    requested entries; for a tree without blank entries that is simply
    ``len(requested_tree)``.
    """
    lease_entries = list(lease_tree)
    # Only entries that carry a path take part in the decision.
    effective = [entry for entry in requested_tree if entry]
    total = len(effective)

    shared = _shared_count(effective, lease_entries)

    # Hard floor: two lanes naming the same glob claim the same write region.
    # Checked BEFORE the ratio so a real collision cannot be diluted to an
    # admit by padding the requesting tree with private (non-shared) files.
    exact = _exact_glob_collisions(effective, lease_entries)
    if exact:
        preview = ", ".join(exact[:3]) + ("…" if len(exact) > 3 else "")
        return OverlapDecision(
            Verdict.REFUSE_EXACT_GLOB, shared, total,
            (f"exact-glob overlap: identical glob claimed by both lanes "
             f"({len(exact)}: {preview}) — same write region, hard collision "
             "regardless of ratio"),
        )

    if shared == 0:
        return OverlapDecision(
            Verdict.ADMIT_DISJOINT, shared, total,
            "no shared prefixes — fully disjoint",
        )

    # shared > 0 means at least one non-blank requested entry exists, so the
    # denominator is never zero here.
    ratio = shared / total
    if ratio > ratio_max:
        return OverlapDecision(
            Verdict.REFUSE_OVERLAP, shared, total,
            (f"overlap too large ({shared}/{total} = "
             f"{ratio:.0%} of requested tree shared, threshold "
             f"{ratio_max:.0%})"),
        )
    return OverlapDecision(
        Verdict.ADMIT_SOFT, shared, total,
        (f"soft-overlap admit — {shared}/{total} = "
         f"{ratio:.0%} of requested tree shared (≤{ratio_max:.0%})"),
    )
dos/lease_health.py ADDED
@@ -0,0 +1,282 @@
1
+ """lease_health — pure verdicts over lease + child-run liveness facts.
2
+
3
+ Lifted from the job userland's ``scripts/fanout_state.py`` (MQ3X P2, docs/62).
4
+ Two ``classify`` verdicts in the ``liveness`` / ``health`` mold — facts in, a
5
+ typed verdict out, the clock injected, no I/O:
6
+
7
+ * ``classify_lease_health`` — combine a lease's heartbeat age with an
8
+ already-probed ``activity_state`` into a LANE-LEASE verdict
9
+ (LIVE / STALLED / ORPHANED_WORKING / DEAD). The host does the FS activity
10
+ probe and passes the resulting string in; this decides reclaim-vs-keep.
11
+ * ``classify_child_stall`` — the AST4 child-stall guard: given a child run's
12
+ log-quiet age, the HEAD-sha delta since the last check, and the archive-sha
13
+ set, decide ALIVE / DEAD / DOUBLE_ARCHIVE before a /dispatch-loop takeover.
14
+
15
+ Plus ``parse_iso`` — the minute-OR-second ISO stamp parser the lease stack needs
16
+ (both resolutions the host stamp and a journal ``replay()`` produce). Generic,
17
+ clock-free; lives here because ``classify_lease_health`` is its first kernel use.
18
+
19
+ Mechanism-not-policy: the TTL / stall windows are job tuning, supplied on a
20
+ frozen ``LeaseHealthPolicy`` (defaults reproduce the job's historical constants);
21
+ the child-stall quiet window is a ``classify_child_stall`` parameter. The
22
+ verdict STRINGS (LEASE_* / CHILD_*) are the kernel's stable vocabulary.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import datetime as _dt
27
+ from dataclasses import dataclass
28
+
29
+ # --- lease-health verdict vocabulary --------------------------------------
30
+ LEASE_LIVE = "LIVE"
31
+ LEASE_STALLED = "STALLED"
32
+ LEASE_ORPHANED_WORKING = "ORPHANED_WORKING"
33
+ LEASE_DEAD = "DEAD"
34
+
35
+ # --- child-stall verdict vocabulary ---------------------------------------
36
+ CHILD_ALIVE = "alive"
37
+ CHILD_DEAD = "dead"
38
+ CHILD_DOUBLE_ARCHIVE = "double-archive"
39
+ # A child that is STILL ALIVE (log growing and/or HEAD advancing) but whose
40
+ # every registered pick is already an ancestor of HEAD — i.e. the productive
41
+ # work is durable in git and the continued aliveness is pure waste (the
42
+ # post-commit re-verify / re-commit limit-cycle). The upper skill should
43
+ # TaskStop it and classify the iteration from git ancestry, not keep waiting.
44
+ CHILD_CHURNING = "child-churning"
45
+
46
+
47
def parse_iso(s: str) -> _dt.datetime | None:
    """Parse an ISO-style UTC stamp into an aware datetime, or return None.

    Two resolutions are accepted, both with a literal trailing ``Z``:

    * minute — ``%Y-%m-%dT%H:%MZ`` (the host stamp, the common case);
    * second — ``%Y-%m-%dT%H:%M:%SZ`` (what a journal ``replay()`` writes into
      a reconstructed lease's ``heartbeat_at``).

    Supporting the second form is forward-safety rather than cosmetics: a
    minute-only parser would return None for a replay-restored stamp, and a
    None heartbeat makes a TTL backstop silently skip. The minute layout is
    tried first, so inputs that already parsed keep parsing the same way.

    Anything that matches neither layout — including non-string input such as
    None — yields None instead of raising. The returned datetime has
    ``tzinfo`` set to UTC.
    """
    layouts = ("%Y-%m-%dT%H:%MZ", "%Y-%m-%dT%H:%M:%SZ")
    parsed = None
    for layout in layouts:
        try:
            parsed = _dt.datetime.strptime(s, layout)
        except (ValueError, TypeError):
            # Wrong shape for this layout (or not a string): try the next one.
            continue
        break
    if parsed is None:
        return None
    return parsed.replace(tzinfo=_dt.timezone.utc)
66
+
67
+
68
@dataclass(frozen=True)
class LeaseHealthPolicy:
    """The TTL and stall windows that separate the lease-health verdicts.

    These are policy (tuning), not mechanism. The defaults reproduce the
    job's historical constants (TTL = 50 min, stall threshold = 8 min), so
    passing ``DEFAULT_POLICY`` (or nothing) keeps the pre-lift behaviour.

    ttl_minutes              — past this heartbeat age the lease is
                               unambiguously DEAD (the hard TTL backstop,
                               which wins over activity).
    stall_threshold_minutes  — at or below this age the lease is LIVE;
                               between this and the TTL the activity probe
                               decides STALLED-vs-ORPHANED_WORKING.

    Raises ``ValueError`` at construction when either window is negative or
    NaN. Positive infinity is accepted.
    """

    ttl_minutes: float = 50.0
    stall_threshold_minutes: float = 8.0

    def __post_init__(self) -> None:
        # Phrased as "not (x >= 0)" instead of "x < 0" on purpose: every
        # ordering comparison against NaN is False, so "x < 0" would let a NaN
        # window through, and a NaN TTL can never be exceeded — the hard
        # backstop would silently stop firing.
        windows_ok = self.ttl_minutes >= 0 and self.stall_threshold_minutes >= 0
        if not windows_ok:
            raise ValueError("lease-health windows must be non-negative (minutes)")


# Shared default instance; the dataclass is frozen, so sharing it is safe.
DEFAULT_POLICY = LeaseHealthPolicy()
92
+
93
+
94
def classify_lease_health(
    lease: dict,
    *,
    now: _dt.datetime,
    activity_state: str,
    policy: LeaseHealthPolicy = DEFAULT_POLICY,
) -> str:
    """Combine a lease's heartbeat age with a probed activity state into a verdict.

    Pure: the caller supplies the clock (``now``) and the already-computed
    ``activity_state``, so no filesystem I/O happens here.

    The heartbeat is read from ``lease["heartbeat_at"]``, falling back to
    ``lease["acquired_at"]`` when the former is missing or empty, and parsed
    with ``parse_iso``.

    Returns one of ``LEASE_LIVE`` / ``LEASE_STALLED`` /
    ``LEASE_ORPHANED_WORKING`` / ``LEASE_DEAD``:

    * no parseable timestamp → age is treated as infinite, so a malformed
      lease cannot block forever;
    * age > ``policy.ttl_minutes`` → DEAD (hard backstop, checked first);
    * age ≤ ``policy.stall_threshold_minutes`` → LIVE;
    * otherwise the activity probe decides: ``"LIVE_DOWNSTREAM"`` or
      ``"UNKNOWN"`` → ORPHANED_WORKING (never reclaim on missing evidence);
      any other value (``"QUIET"`` is the expected one) → STALLED.
    """
    stamp = lease.get("heartbeat_at", "") or lease.get("acquired_at", "")
    heartbeat = parse_iso(stamp)
    age_min = (
        float("inf") if heartbeat is None
        else (now - heartbeat).total_seconds() / 60.0
    )

    if age_min > policy.ttl_minutes:
        return LEASE_DEAD
    if age_min <= policy.stall_threshold_minutes:
        return LEASE_LIVE
    # In the grey zone between the stall threshold and the TTL, activity decides.
    if activity_state in ("LIVE_DOWNSTREAM", "UNKNOWN"):
        return LEASE_ORPHANED_WORKING
    # Expected to be "QUIET" here — genuinely dead.
    return LEASE_STALLED
132
+
133
+
134
@dataclass
class ChildStallResult:
    """Typed verdict of the AST4 child-stall guard.

    Tells /dispatch-loop's upper skill what to do before it takes over a child
    /dispatch's Steps 8-9.
    """

    # One of CHILD_ALIVE / CHILD_DEAD / CHILD_DOUBLE_ARCHIVE / CHILD_CHURNING.
    verdict: str
    # How long the log has been quiet, in seconds (None if the log is absent).
    log_age_seconds: float | None = None
    # The two liveness signals, as recorded by whoever built the result.
    log_grew: bool = False
    new_commit: bool = False
    # Number of archive commits seen for the run-ts, and their shas.
    archive_count: int = 0
    archive_shas: list[str] | None = None
    # Ancestry facts behind the CHURNING verdict: how many of the run's
    # registered picks are already ancestors of HEAD vs how many it
    # registered. Both stay 0 on a path that did not measure them.
    shipped_pick_count: int = 0
    registered_pick_count: int = 0
    # One-line human explanation.
    reason: str = ""

    def __post_init__(self) -> None:
        # "Not supplied" becomes a fresh empty list for this instance.
        if self.archive_shas is None:
            self.archive_shas = []

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, keyed by field name in declaration order.

        Values are the attribute objects themselves (the sha list is not copied).
        """
        keys = (
            "verdict",
            "log_age_seconds",
            "log_grew",
            "new_commit",
            "archive_count",
            "archive_shas",
            "shipped_pick_count",
            "registered_pick_count",
            "reason",
        )
        return {key: getattr(self, key) for key in keys}
176
+
177
+
178
def classify_child_stall(
    *,
    log_age_seconds: float | None,
    last_commit_sha: str | None,
    current_head_sha: str | None,
    archive_shas: list[str] | None = None,
    quiet_window_s: float = 600.0,
    registered_pick_count: int = 0,
    shipped_pick_count: int = 0,
) -> ChildStallResult:
    """PURE verdict logic for the AST4 child-stall guard.

    No I/O: every input is a pre-collected fact and a typed
    ``ChildStallResult`` is returned.

    Decision order:
      1. double-archive — two or more distinct non-empty archive shas mean the
         child already shipped its own archive, so the takeover is moot
         regardless of liveness (the liveness signals are not evaluated on
         this path and stay False in the result).
      2. churn — the child is ALIVE (log grew and/or HEAD advanced) but every
         registered pick is already shipped, so the continued aliveness is
         waste. Checked BEFORE the alive branches because churn IS alive; the
         only thing separating it from healthy progress is "is the work
         already shipped". With both pick counts left at 0 this branch is
         inert.
      3. log grew within the quiet window → alive.
      4. HEAD differs from the last-seen sha → alive.
      5. both signals quiet → dead; takeover may proceed.

    ``log_grew`` and ``new_commit`` in the result are the measured signals:
    when the log grew AND HEAD advanced, the ALIVE result reports both as
    True (rather than only the signal that happened to be tested first),
    consistent with how the CHURNING result reports them.

    ``quiet_window_s`` defaults to 600 s; override for tests / operator
    tuning.

    Kill-safety of the churn check rests on the caller never OVER-counting
    ``shipped_pick_count``. With ``shipped < registered`` the verdict falls
    through to the alive/dead branches, so a still-producing child is never
    classified as churning.
    """
    shas = [s for s in (archive_shas or []) if s]
    distinct = set(shas)

    # Fields that every verdict carries unchanged.
    facts = {
        "log_age_seconds": log_age_seconds,
        "archive_count": len(distinct),
        "archive_shas": shas,
        "shipped_pick_count": shipped_pick_count,
        "registered_pick_count": registered_pick_count,
    }

    if len(distinct) >= 2:
        return ChildStallResult(
            CHILD_DOUBLE_ARCHIVE,
            reason=(f"{len(distinct)} archive commits exist for this run-ts "
                    f"({', '.join(s[:8] for s in sorted(distinct))}) — child "
                    f"self-recovered and shipped its own archive; reconcile to "
                    f"the child's artefacts, do NOT produce a competing one."),
            **facts)

    # (a) log-growth test: a log that grew within the quiet window → alive.
    log_grew = log_age_seconds is not None and log_age_seconds < quiet_window_s
    # (b) new-commit test: a commit since the last check → still committing.
    # Both shas must be known; a missing one is "no evidence", not a change.
    new_commit = bool(
        current_head_sha and last_commit_sha
        and current_head_sha != last_commit_sha
    )

    # Churn: alive (by either signal) AND every registered pick already shipped.
    # registered_pick_count > 0 guards against the no-picks iteration (a drain /
    # a /replan has nothing to ship, so it can never be "all shipped").
    work_all_shipped = (
        registered_pick_count > 0
        and shipped_pick_count >= registered_pick_count
    )
    if work_all_shipped and (log_grew or new_commit):
        signal = "writing" if log_grew else "committing"
        return ChildStallResult(
            CHILD_CHURNING, log_grew=log_grew, new_commit=new_commit,
            reason=(f"all {registered_pick_count} registered pick(s) are "
                    f"ancestors of HEAD ({shipped_pick_count} shipped) yet the "
                    f"child is still {signal} — post-commit churn, not progress; "
                    f"TaskStop it and classify the iteration from git ancestry."),
            **facts)

    if log_grew:
        # Report the commit signal as measured, even though the log signal is
        # what decided the verdict.
        return ChildStallResult(
            CHILD_ALIVE, log_grew=True, new_commit=new_commit,
            reason=(f"child log grew {log_age_seconds:.0f}s ago "
                    f"(< {quiet_window_s:.0f}s quiet window) — still writing, "
                    f"not stalled."),
            **facts)

    if new_commit:
        # log_grew is necessarily False here (handled above).
        return ChildStallResult(
            CHILD_ALIVE, new_commit=True,
            reason=(f"HEAD advanced {last_commit_sha[:8]} → "
                    f"{current_head_sha[:8]} since last check — child still "
                    f"committing, not stalled."),
            **facts)

    # Both signals quiet → genuinely dead; the takeover precondition holds.
    age_txt = (f"quiet {log_age_seconds:.0f}s" if log_age_seconds is not None
               else "log absent")
    return ChildStallResult(
        CHILD_DEAD,
        reason=(f"child genuinely dead: {age_txt} (≥ {quiet_window_s:.0f}s "
                f"window) AND no new commit since last check — takeover of "
                f"Steps 8-9 may proceed."),
        **facts)