dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
dos/journal_delta.py ADDED
@@ -0,0 +1,308 @@
1
+ """journal-delta — the lane-journal progress fold for the liveness verdict.
2
+
3
+ docs/82, LVN **Phase 2** — the journal + heartbeat rungs. Phase 1's heartbeat
4
+ age was caller-supplied (`--last-heartbeat-age-ms`); this grounds the heartbeat
5
+ and the lease-layer-event signal in the **lane journal** so the
6
+ SPINNING-vs-STALLED distinction comes from kernel evidence the agent can't
7
+ forge, not a passed number.
8
+
9
+ This module is `git_delta`'s sibling — the same boundary/evidence split LVN
10
+ Phase 1b established (`docs/82` 1b): the file read (`lane_journal.read_all`)
11
+ happens at the CLI boundary; the **fold here is PURE** — entries in, two numbers
12
+ out, the clock injected, no disk. It is replay-testable on frozen entry lists
13
+ exactly like `lane_journal.replay()`, which is what lets the whole liveness
14
+ ladder be tested without a live multi-minute agent run (the `loop_decide` design
15
+ value, restated for the temporal axis). `liveness.classify` stays byte-pure with
16
+ zero journal-schema awareness: Phase 2 only changes WHERE its two journal inputs
17
+ (`journal_events_since`, `last_heartbeat_age_ms`) come from.
18
+
19
+ It imports only stdlib + the lane-journal *op constants + identity helper* it
20
+ needs (the `OP_*` names + `_lease_identity`) — a one-way sibling-kernel import
21
+ (the same arrow `timeline`→`git_delta` has). It is
22
+ **never** imported BY `lane_journal` (whose job is lease correctness + replay,
23
+ not `ProgressEvidence`-shaped clock semantics).
24
+
25
+ THE HARD PROBLEM this fold resolves: a journal entry carries **no run-id** — it
26
+ is keyed only by `(loop_ts, lane)` (`lane_journal._lease_identity`). So "did
27
+ THIS run move?" cannot be answered from the journal by time alone — a busy
28
+ *neighbor* lane would otherwise manufacture a false ADVANCING for a spinning
29
+ run. The fold attributes on **two axes**:
30
+
31
+ * IDENTITY — every journal rung is scoped to THIS run's lease, passed as
32
+ `lease_key=(loop_ts, lane)`. Only entries whose `_lease_identity` matches
33
+ contribute. **Identity is REQUIRED**: with `lease_key=None` the journal
34
+ rungs do not engage at all (events forced to 0, no heartbeat) — there is no
35
+ host-wide "is *some* lane alive" guess (that signal is too ambiguous to
36
+ certify *this* run). The bare `dos liveness --run-id … --start-sha …`
37
+ North-star form still answers from the commit rung; identity only unlocks
38
+ the *journal* rungs. (Operator choice, 2026-06-01: require identity always.)
39
+ * TIME — among identity-matched entries, an entry's **own append `ts`**
40
+ (never the self-reported, copy-prone `heartbeat_at`) decides whether it
41
+ falls in the run's window.
42
+
43
+ THE ROUNDING RULE — different per rung, deliberately:
44
+
45
+ * EVENT rung (gates ADVANCING; over-counting is FORBIDDEN, docs/82 2c):
46
+ a **bounded window** `(floored start, now + slack]` AND a **lease-birth
47
+ exclusion**. The window is strictly after the run-start floored to its
48
+ containing second (journal `ts` is second-resolution, the run-start is ms) AND
49
+ no later than now plus the same one-second future slack the heartbeat rung
50
+ uses. The lease-birth exclusion drops the FIRST ACQUIRE for this lease — the
51
+ lease coming into existence is not progress on it — by IDENTITY, independent of
52
+ its timestamp (a later re-ACQUIRE after a RELEASE still counts). A same-second
53
+ *pre-start* op is NOT counted (the floor lower bound); the run's own
54
+ establishing ACQUIRE is NOT counted (the birth exclusion); an implausibly
55
+ future-dated op (clock skew / forgery / cross-host merge) is NOT counted (the
56
+ upper bound). Because events ≥1 is the *top-of-ladder* ADVANCING verdict — the
57
+ most consequential — this rung is the BEST-guarded, not the worst: every
58
+ excluded op fails toward SPINNING/STALLED (safe), never invents ADVANCING. This
59
+ fixes "a same-second pre-start op fabricates ADVANCING", "a lone boundary
60
+ ACQUIRE marks a held-but-idle lane ADVANCING forever" (now by identity, so it
61
+ holds even when the ACQUIRE lands seconds after the run-id mint — the real
62
+ dispatch timeline the old `> floor` rule missed), AND "a future-skewed event
63
+ fabricates ADVANCING on a stuck run".
64
+ * HEARTBEAT-freshness rung (alive/dead; the generous direction is safe): the
65
+ start floor does not gate freshness at all — freshness is about *now*, not
66
+ the start window. A future-dated beat (clock skew / forged stamp) beyond the
67
+ one-second slack is dropped (not clamped), failing toward STALLED.
68
+
69
+ Every degrade path fails toward STALLED/SPINNING and never raises (the ADM
70
+ fail-closed analogue): a `_CORRUPT` sentinel, an unparseable `ts`, an empty or
71
+ absent journal — none can invent progress or freshness. `saw_corrupt` is carried
72
+ for a future renderer's data-quality note (Phase 3); it does NOT flip the
73
+ verdict (the count-0/age-None degrade already fails safe) and is not threaded
74
+ into the (byte-unchanged) `ProgressEvidence`.
75
+ """
76
+
77
+ from __future__ import annotations
78
+
79
+ import datetime as dt
80
+ from typing import Iterable, NamedTuple, Optional
81
+
82
+ from dos.lane_journal import ( # sibling-kernel constants/helper (one-way import)
83
+ OP_ACQUIRE,
84
+ OP_HEARTBEAT,
85
+ OP_RECONCILE,
86
+ OP_RELEASE,
87
+ OP_SCAVENGE,
88
+ _lease_identity,
89
+ )
90
+
91
# Proof-of-life ops: a fresh ACQUIRE or HEARTBEAT for THIS lease shows the lease
# is alive. ACQUIRE stamps the lease's first beat; HEARTBEAT refreshes it
# (docs/82 line 69, liveness.py:158).
_HEARTBEAT_OPS = frozenset((OP_ACQUIRE, OP_HEARTBEAT))

# Lease-layer *work* ops (the ADVANCING event rung): a deliberate subset of
# lane_journal._STATE_MUTATING_OPS that leaves HEARTBEAT out. This is the crux
# of docs/82's ladder (lines 83-85): "fresh heartbeat … but zero …
# state-mutating journal events → SPINNING" separates the *freshness* signal
# (a heartbeat) from *progress* (state mutation). A HEARTBEAT is a keepalive —
# re-pinging a lease you already hold is narrating aliveness without moving —
# so it proves life (a beat) but is NOT forward progress (not an event).
# ACQUIRE/RELEASE/SCAVENGE/RECONCILE are real lease transitions (taking,
# dropping, evicting, or re-asserting a lease): work at the lease layer that
# the commit rung would not see. (REFUSE grants nothing and _CORRUPT is not
# work — both are already excluded.)
_EVENT_OPS = frozenset((OP_ACQUIRE, OP_RELEASE, OP_SCAVENGE, OP_RECONCILE))

# The op that BRINGS A LEASE INTO EXISTENCE. A lease is born with an ACQUIRE;
# that birth is the lease starting, NOT forward progress on it — just as a
# process's own fork is not "work the process did". The run's establishing
# ACQUIRE must therefore stay out of the EVENT (ADVANCING) count; otherwise a
# held-but-idle lane that did nothing but take its lease reads ADVANCING
# forever and SPINNING becomes unreachable (the docs/82 false-clear).
#
# The exclusion is by IDENTITY ("the first ACQUIRE for this lease"), not by
# timestamp: the earlier `> floor` rule only excluded the ACQUIRE when it
# happened to land in the run-start second, which is false in every real
# dispatch (the lease is acquired seconds after the run-id is minted, past
# preflight/snapshot/gate). A LATER ACQUIRE (a genuine re-acquire after a
# RELEASE) is real lease work and still counts — only the establishing one is
# the lease's birth. Sibling `dispatch_top._events_by_lane` makes the same
# distinction by gating on the live lease's `acquired_at`.
_LEASE_BIRTH_OP = OP_ACQUIRE

# One second of slack on the future-stamp guard: the journal `ts` is
# second-resolution while `now_ms` is millisecond, so an entry written in the
# current second can legitimately decode to up to ~999 ms *after* now. Anything
# beyond this is clock skew or a forged future stamp — not credible evidence.
_FUTURE_BEAT_SLACK_MS = 1_000
129
+
130
+
131
class JournalDelta(NamedTuple):
    """Result of folding the lane journal for one run: two numbers and a flag.

    Fields, in positional order:

    events_since_start
        How many lease-*work* ops (ACQUIRE / RELEASE / SCAVENGE / RECONCILE —
        never a keepalive HEARTBEAT, never the lease's establishing ACQUIRE)
        were attributed to THIS run's lease with a timestamp strictly after
        the run start floored to its second and no later than `now` plus the
        one-second future slack. Feeds `journal_events_since`; a value >= 1 is
        the lease-layer ADVANCING rung.
    newest_heartbeat_age_ms
        `now_ms` minus the newest credible beat timestamp for THIS lease,
        clamped at 0. ``None`` when no credible beat was found. Feeds
        `last_heartbeat_age_ms`; ``None`` reads as STALLED (the safe
        direction).
    saw_corrupt
        ``True`` when a `_CORRUPT` sentinel entry was encountered. Diagnostic
        only: it does not alter the two numbers above and is not carried into
        `ProgressEvidence`/`to_dict`; it is reserved for a later renderer's
        data-quality note.
    """

    # Count of in-window lease-work ops attributed to this run's lease.
    events_since_start: int
    # Age of the freshest credible beat in milliseconds, or None if there is none.
    newest_heartbeat_age_ms: Optional[int]
    # Whether any `_CORRUPT` sentinel was present in the folded entries.
    saw_corrupt: bool
154
+
155
+
156
+ def _parse_journal_ts(s: Optional[str]) -> Optional[int]:
157
+ """Parse a journal stamp to epoch-ms; None on any unparseable/missing input.
158
+
159
+ PURE. Accepts both the second-resolution stamp `lane_journal.append` writes
160
+ (`journal_now_iso`, ``%Y-%m-%dT%H:%M:%SZ``) and a minute-only stamp a
161
+ foreign/lease-copied field might carry (``%Y-%m-%dT%H:%MZ``) — the exact
162
+ two-format tolerance `archive_lock._parse_iso` uses. The explicit
163
+ `tzinfo=utc` is LOAD-BEARING: a naive `timestamp()` would shift by the host
164
+ UTC offset (pinned by `test_parse_journal_ts_known_epoch_ms`).
165
+
166
+ NOTE: a third tiny copy of this kernel's ISO-parse (after
167
+ `archive_lock._parse_iso` and `decisions._parse_iso`). Kept local — all are
168
+ sibling kernel modules, no layer crossing — but a tz/format fix must land in
169
+ all three; flagged for a possible future shared stdlib-only helper.
170
+ """
171
+ if not s:
172
+ return None
173
+ for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%MZ"):
174
+ try:
175
+ parsed = dt.datetime.strptime(s, fmt).replace(tzinfo=dt.timezone.utc)
176
+ except (ValueError, TypeError):
177
+ continue
178
+ return int(parsed.timestamp() * 1000)
179
+ return None
180
+
181
+
182
def fold_since(
    entries: Iterable[dict],
    *,
    run_started_ms: int,
    now_ms: int,
    lease_key: Optional[tuple[str, str]] = None,
) -> JournalDelta:
    """Fold journal entries into (events-since-start, newest-beat-age) for one run.

    PURE — entries in, numbers out, the clock injected via `now_ms`, no disk
    access. The caller reads the journal at its own boundary and hands the
    materialized entries to this function.

    Parameters
    ----------
    entries:
        Journal entries in append order; each is read with ``.get`` for the
        keys ``"op"``, ``"ts"`` and ``"heartbeat_at"``.
    run_started_ms:
        Run start in epoch milliseconds. It is floored to its containing
        second before comparison because journal stamps are
        second-resolution.
    now_ms:
        The injected "now" in epoch milliseconds.
    lease_key:
        THIS run's lease identity, ``(loop_ts, lane)``. **Identity is required
        for the journal rungs**: with ``None`` — or a blank ``('', '')`` key,
        which is treated the same way — no entry can be attributed to this
        run, so the result is ``JournalDelta(0, None, saw_corrupt)``.

    Returns
    -------
    JournalDelta
        ``events_since_start`` counts identity-matched lease-work ops (members
        of `_EVENT_OPS`, excluding the first ACQUIRE seen for the lease) whose
        timestamp lies in ``(floored start, now + slack]``.
        ``newest_heartbeat_age_ms`` is ``now_ms`` minus the newest
        identity-matched ACQUIRE/HEARTBEAT timestamp that is not beyond
        ``now + slack``, clamped at 0, or ``None`` when there is none.
        ``saw_corrupt`` reports whether any ``_CORRUPT`` entry was present,
        regardless of identity.

    Every exclusion removes evidence rather than adding it, so a dropped
    entry can only push the downstream verdict toward SPINNING/STALLED and can
    never invent ADVANCING or freshness.
    """
    # A blank identity is the "no real lease" sentinel, not a lane to match;
    # collapse it to None so stray blank-keyed entries are never attributed.
    if lease_key is not None and not (lease_key[0] or lease_key[1]):
        lease_key = None

    # Lower bound of the event window: the run start floored to its second.
    # Comparing with strict `>` drops an op stamped in the run-start second,
    # which may have happened before the run's true sub-second start.
    run_started_floor_ms = (run_started_ms // 1000) * 1000

    # Upper bound shared by both rungs: anything stamped later than this is
    # treated as clock skew or a forged future stamp and is not credible.
    latest_credible_ms = now_ms + _FUTURE_BEAT_SLACK_MS

    corrupt_seen = False
    work_ops = 0
    freshest_beat_ms: Optional[int] = None
    # Flips to True once the lease's establishing ACQUIRE has been consumed.
    birth_consumed = False

    for entry in entries:
        op = str(entry.get("op") or "")

        # Corruption is recorded even without an identity, and can only
        # reduce observed progress — it is never counted as anything.
        if op == "_CORRUPT":
            corrupt_seen = True
            continue

        # IDENTITY axis: only entries belonging to this run's lease matter.
        if lease_key is None or _lease_identity(entry) != lease_key:
            continue

        # TIME axis: trust the entry's own append `ts`; consult the
        # self-reported `heartbeat_at` only when `ts` is missing/unparseable.
        stamp_ms = _parse_journal_ts(entry.get("ts"))
        if stamp_ms is None:
            stamp_ms = _parse_journal_ts(entry.get("heartbeat_at"))
            if stamp_ms is None:
                continue  # cannot be placed in time -> drop (safe direction)

        # LEASE-BIRTH bookkeeping: the first timestamped ACQUIRE seen for this
        # lease is its birth, not progress. It is tracked here, before any
        # window check, so the exclusion does not depend on when it landed; a
        # later re-ACQUIRE is no longer the birth and may count as work.
        if op == _LEASE_BIRTH_OP:
            is_birth = not birth_consumed
            birth_consumed = True
        else:
            is_birth = False

        # Both rungs reject a stamp beyond the future-credibility bound, so a
        # future-dated entry contributes to neither of them.
        if stamp_ms > latest_credible_ms:
            continue

        # EVENT rung: a lease-work op (not a HEARTBEAT, not the birth ACQUIRE)
        # strictly after the floored start counts as lease-layer progress.
        if op in _EVENT_OPS and not is_birth and stamp_ms > run_started_floor_ms:
            work_ops += 1

        # HEARTBEAT-freshness rung: no start-window gate — freshness is about
        # now. The birth ACQUIRE still counts here: it proves the lease is
        # alive, just not that it moved.
        if op in _HEARTBEAT_OPS and (
            freshest_beat_ms is None or stamp_ms > freshest_beat_ms
        ):
            freshest_beat_ms = stamp_ms

    # A beat inside the sub-second future slack is the freshest possible beat,
    # not a negative age, so the age is clamped at 0.
    if freshest_beat_ms is None:
        beat_age_ms = None
    else:
        beat_age_ms = max(0, now_ms - freshest_beat_ms)

    return JournalDelta(
        events_since_start=work_ops,
        newest_heartbeat_age_ms=beat_age_ms,
        saw_corrupt=corrupt_seen,
    )
dos/judge_eval.py ADDED
@@ -0,0 +1,328 @@
1
+ """The judge-evaluation harness — score an adjudicator, and the rung it occupies.
2
+
3
+ A `dos.judges.Judge` is a *hook*; this module is the *instrument* that makes the hook
4
+ produce a number. It is the "researchers make their own insights" surface: bring your
5
+ own judge (a debate, a learned verifier, a build/test oracle), bring a set of labelled
6
+ claims, and get back the numbers an oversight researcher actually cares about — chiefly
7
+ **the false-clear rate**, the dangerous cell where a judge waves through a claim that is
8
+ in fact false.
9
+
10
+ Two things it computes:
11
+
12
+ 1. **`score(judge, cases)` → `JudgeReport`** — run a judge over labelled cases and
13
+ tabulate the 3×2 confusion grid (the judge's AGREE/DISAGREE/ABSTAIN against each
14
+ claim's ground-truth believable/not), plus the derived rates. This scores the judge
15
+ *in isolation* — how good is it at ruling on the claims it sees.
16
+
17
+ 2. **`compose_deterministic_first(oracle_fn, judge, cases)` → `RungReport`** — the
18
+ *system* number. It runs the trust ladder: the deterministic oracle rules first, the
19
+ judge sees only the residue the oracle abstained on, and whatever neither resolves
20
+ escalates to a human. It reports **rung occupancy** (what fraction of claims each rung
21
+ resolved — det% | judge% | human%, summing to 100%) and the false-clear rate *at each
22
+ rung*. This is the scalable-oversight headline: how much human-review load the judge
23
+ actually removes, and at what integrity cost.
24
+
25
+ Everything here is **pure**: it consumes already-built `Claim`s, calls `run_judge`
26
+ (which is itself fail-to-abstain), and counts. No I/O, no host names — it sits in the
27
+ kernel layer beside `judges`. A `case` is a `(Claim, truth)` pair where ``truth`` is the
28
+ ground-truth believability of the claim (``True`` = the claim is real/correct, ``False``
29
+ = it is a lie / unsupported). The labels are the *researcher's* ground truth — the same
30
+ honesty stance as FleetHorizon's "a lie is `git` showing no commit": the eval is only as
31
+ honest as the labels, so a caller derives them from artifacts, not from the judge.
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ from dataclasses import dataclass
37
+ from typing import Callable, Iterable, Optional
38
+
39
+ from dos.judges import Claim, Judge, JudgeVerdict, Stance, run_judge
40
+
41
+
42
+ # A labelled example: the claim to adjudicate + its ground-truth believability.
43
+ Case = tuple[Claim, bool]
44
+
45
+
46
@dataclass(frozen=True)
class JudgeReport:
    """Confusion grid and derived rates for one judge scored over labelled cases.

    The grid is 3×2 — the judge's stance (AGREE / DISAGREE / ABSTAIN) crossed
    with the claim's ground truth (true / false):

    * ``correct_clear`` — AGREE on a TRUE claim (right: a real claim cleared).
    * ``false_clear``   — AGREE on a FALSE claim. THE DANGEROUS CELL: a lie
      waved through; the one error an oversight layer must minimize.
    * ``correct_flag``  — DISAGREE on a FALSE claim (right: a lie caught).
    * ``false_flag``    — DISAGREE on a TRUE claim (wrong but SAFE: a needless
      human review, never a corruption).
    * ``abstain_true`` / ``abstain_false`` — punted to a human (safe, but it
      costs attention).

    ``n`` is the number of cases scored and ``total_cost`` the summed cost of
    the judge's verdicts. Every derived rate returns ``0.0`` when its
    denominator is zero instead of raising.
    """

    n: int
    correct_clear: int
    false_clear: int
    correct_flag: int
    false_flag: int
    abstain_true: int
    abstain_false: int
    total_cost: float

    @staticmethod
    def _ratio(numerator: float, denominator: float) -> float:
        """Return ``numerator / denominator``, or ``0.0`` for a zero denominator."""
        if not denominator:
            return 0.0
        return numerator / denominator

    # --- marginal counts ---

    @property
    def n_agree(self) -> int:
        """Number of claims the judge cleared (AGREE), whatever the truth."""
        return self.correct_clear + self.false_clear

    @property
    def n_disagree(self) -> int:
        """Number of claims the judge flagged (DISAGREE), whatever the truth."""
        return self.correct_flag + self.false_flag

    @property
    def n_abstain(self) -> int:
        """Number of claims the judge punted to a human (ABSTAIN)."""
        return self.abstain_true + self.abstain_false

    @property
    def n_false_claims(self) -> int:
        """Ground-truth FALSE claims in the set — the denominator for leak rate."""
        return self.false_clear + self.correct_flag + self.abstain_false

    # --- derived rates ---

    @property
    def false_clear_rate(self) -> float:
        """Fraction of the judge's CLEARS (agrees) that were actually false.

        The precision-of-clearing number: when this judge says "believable",
        how often is it wrong? A judge is only safe to trust on its own if
        this is near zero.
        """
        return self._ratio(self.false_clear, self.n_agree)

    @property
    def lie_leak_rate(self) -> float:
        """Fraction of all ground-truth FALSE claims the judge AGREE'd with.

        The same numerator as `false_clear_rate` over a different denominator:
        this is per-lie, that is per-clear. A lie the judge ABSTAINED on did
        not leak — it went to a human.
        """
        return self._ratio(self.false_clear, self.n_false_claims)

    @property
    def decisive_accuracy(self) -> float:
        """Accuracy over the cases where the judge COMMITTED (did not abstain).

        ``(correct_clear + correct_flag) / (agrees + disagrees)``. Abstentions
        are excluded: this measures the quality of the judge's opinions,
        separately from how often it ventures one (`abstention_rate`).
        """
        right_calls = self.correct_clear + self.correct_flag
        committed = self.n_agree + self.n_disagree
        return self._ratio(right_calls, committed)

    @property
    def abstention_rate(self) -> float:
        """Fraction of all claims the judge punted to a human.

        High abstention is SAFE but adds no leverage (the human still does the
        work); low abstention with a low false-clear rate is the valuable
        regime.
        """
        return self._ratio(self.n_abstain, self.n)

    @property
    def cost_per_claim(self) -> float:
        """Mean cost per scored claim: ``total_cost / n``."""
        return self._ratio(self.total_cost, self.n)

    def to_dict(self) -> dict:
        """Serialize as ``{"n", "grid", "rates", "total_cost"}``.

        ``grid`` holds the six raw cell counts. ``rates`` holds the derived
        rates rounded to 4 decimal places, except ``cost_per_claim`` which is
        rounded to 6. ``total_cost`` is emitted unrounded.
        """
        grid_cells = (
            "correct_clear",
            "false_clear",
            "correct_flag",
            "false_flag",
            "abstain_true",
            "abstain_false",
        )
        four_place_rates = (
            "false_clear_rate",
            "lie_leak_rate",
            "decisive_accuracy",
            "abstention_rate",
        )
        rates = {name: round(getattr(self, name), 4) for name in four_place_rates}
        rates["cost_per_claim"] = round(self.cost_per_claim, 6)
        return {
            "n": self.n,
            "grid": {name: getattr(self, name) for name in grid_cells},
            "rates": rates,
            "total_cost": self.total_cost,
        }
146
+
147
+
148
def score(judge: Judge, cases: Iterable[Case], config: object = None) -> JudgeReport:
    """Score ``judge`` against labelled ``cases`` and return its confusion grid.

    Each case is a ``(claim, truth)`` pair. The judge is invoked through
    `run_judge`, and the resulting stance is crossed with ``truth`` to pick
    one of the six grid cells; any stance other than AGREE or DISAGREE is
    tallied as an abstention. The cost of every verdict is summed into
    ``total_cost`` and ``n`` is the number of cases consumed.

    The module describes `run_judge` as fail-to-abstain, so a judge that
    errors on a case shows up as an ABSTAIN rather than aborting the
    evaluation. This function itself only reads the cases and counts.
    """
    tally = {
        "correct_clear": 0,
        "false_clear": 0,
        "correct_flag": 0,
        "false_flag": 0,
        "abstain_true": 0,
        "abstain_false": 0,
    }
    spent = 0.0
    seen = 0

    for claim, truth in cases:
        seen += 1
        verdict = run_judge(judge, claim, config)
        spent += verdict.cost

        if verdict.stance is Stance.AGREE:
            cell = "correct_clear" if truth else "false_clear"
        elif verdict.stance is Stance.DISAGREE:
            # Disagreeing with a TRUE claim is the wrong-but-safe cell.
            cell = "false_flag" if truth else "correct_flag"
        else:  # ABSTAIN
            cell = "abstain_true" if truth else "abstain_false"
        tally[cell] += 1

    return JudgeReport(n=seen, total_cost=spent, **tally)
182
+
183
+
184
+ # ---------------------------------------------------------------------------
185
+ # Deterministic-first composition — the trust-ladder / rung-occupancy report.
186
+ # ---------------------------------------------------------------------------
187
+
188
# An oracle function rules on a single claim deterministically, OR signals "I can't"
# by returning None or an ABSTAIN verdict. This is the seam to the kernel's real
# oracle (`verify` / `picker_oracle`): a caller wraps whatever deterministic check it
# has in this shape. The eval ships no oracle of its own — the deterministic rung is
# the caller's ground-truth checker, exactly the no-plan-needed discipline.
#
# Shape: takes one Claim, returns a JudgeVerdict or None.
OracleFn = Callable[[Claim], Optional[JudgeVerdict]]
194
+
195
+
196
@dataclass(frozen=True)
class RungReport:
    """Scorecard for the trust ladder: what each rung resolved, and at what cost.

    The ``*_resolved`` fields are rung-occupancy counts and are meant to add up to
    ``n`` (det + judge + human). The per-rung false-clear counts expose the
    integrity price of pushing work down to a cheaper rung: the composition exists
    to take load off the human WITHOUT the judge leaking lies, and this report
    puts both sides of that trade next to each other.
    """

    n: int
    det_resolved: int  # claims the deterministic oracle ruled on (agree/disagree)
    judge_resolved: int  # residue claims the judge ruled on (agree/disagree)
    human_resolved: int  # claims neither rung settled; escalated to a human (abstains)
    det_false_clear: int  # oracle AGREE on a FALSE claim (should be ~0 by construction)
    judge_false_clear: int  # judge AGREE on a FALSE claim; the cost of the JUDGE rung
    judge_report: JudgeReport  # the judge scored on the RESIDUE only (its true workload)

    @property
    def det_occupancy(self) -> float:
        """Fraction of all claims settled at the deterministic rung (0.0 if ``n`` is 0)."""
        if not self.n:
            return 0.0
        return self.det_resolved / self.n

    @property
    def judge_occupancy(self) -> float:
        """Fraction of all claims settled at the judge rung (0.0 if ``n`` is 0)."""
        if not self.n:
            return 0.0
        return self.judge_resolved / self.n

    @property
    def human_occupancy(self) -> float:
        """Fraction of all claims escalated to a human (0.0 if ``n`` is 0).

        This is the scalable-oversight headline and the number the JUDGE rung
        pulls DOWN: with no judge (the `abstain` baseline) every claim the oracle
        cannot rule on lands here, and a good judge shrinks it.
        """
        if not self.n:
            return 0.0
        return self.human_resolved / self.n

    def to_dict(self) -> dict:
        """Serialise as a nested dict: occupancy fractions (rounded to 4 decimals),
        raw per-rung false-clear counts, and the judge's residue report."""
        occupancy = {
            "deterministic": round(self.det_occupancy, 4),
            "judge": round(self.judge_occupancy, 4),
            "human": round(self.human_occupancy, 4),
        }
        false_clears = {
            "deterministic": self.det_false_clear,
            "judge": self.judge_false_clear,
        }
        payload = {"n": self.n}
        payload["occupancy"] = occupancy
        payload["false_clears"] = false_clears
        payload["judge_on_residue"] = self.judge_report.to_dict()
        return payload
244
+
245
+
246
+ def _is_decisive(v: Optional[JudgeVerdict]) -> bool:
247
+ """A verdict resolves a claim iff it is a non-None AGREE/DISAGREE. None or ABSTAIN
248
+ means the rung punts the claim onward."""
249
+ return v is not None and v.stance is not Stance.ABSTAIN
250
+
251
+
252
def compose_deterministic_first(
    oracle_fn: OracleFn,
    judge: Judge,
    cases: Iterable[Case],
    config: object = None,
) -> RungReport:
    """Walk every case down the trust ladder and report occupancy + false-clears.

    The ladder, in order:
      1. the **deterministic oracle** (`oracle_fn`) rules first. A decisive
         verdict resolves the claim at the DET rung and the judge never sees it
         (deterministic-first: don't spend the expensive rung on what the cheap,
         forgery-proof one can settle).
      2. the **judge** is run, through `run_judge` (fail-to-abstain), ONLY on the
         residue the oracle did not decide. A decisive verdict resolves the claim
         at the JUDGE rung.
      3. anything the judge abstains on as well **escalates to a HUMAN**.

    ``judge_report`` covers the residue alone, i.e. the judge's real workload, so
    it answers "how good is this judge on the claims it is actually asked to rule
    on"; its accuracy on oracle-settled claims is irrelevant because it never
    sees them.
    """
    total = 0
    oracle_settled = 0
    oracle_false_clears = 0
    # Confusion grid for the judge over the RESIDUE, filled from the very verdicts
    # the ladder acts on: the judge runs exactly once per residue claim, so cost is
    # counted once and a nondeterministic judge is never sampled twice.
    residue_grid = {
        "correct_clear": 0,
        "false_clear": 0,
        "correct_flag": 0,
        "false_flag": 0,
        "abstain_true": 0,
        "abstain_false": 0,
    }
    judge_spend = 0.0
    residue_size = 0
    for claim, truth in cases:
        total += 1
        oracle_verdict = oracle_fn(claim)
        if _is_decisive(oracle_verdict):
            oracle_settled += 1
            if oracle_verdict.stance is Stance.AGREE and not truth:
                oracle_false_clears += 1
            continue
        # Residue: hand the claim to the judge and tabulate that single verdict.
        residue_size += 1
        judge_verdict = run_judge(judge, claim, config)
        judge_spend += judge_verdict.cost
        if judge_verdict.stance is Stance.AGREE:
            cell = "correct_clear" if truth else "false_clear"
        elif judge_verdict.stance is Stance.DISAGREE:
            cell = "false_flag" if truth else "correct_flag"
        else:  # ABSTAIN -> escalate to a human
            cell = "abstain_true" if truth else "abstain_false"
        residue_grid[cell] += 1

    # Occupancy counts are read straight off the grid, so they cannot drift from
    # the judge report: decisive cells belong to the judge, abstain cells to the human.
    judge_settled = (
        residue_grid["correct_clear"]
        + residue_grid["false_clear"]
        + residue_grid["correct_flag"]
        + residue_grid["false_flag"]
    )
    escalated = residue_grid["abstain_true"] + residue_grid["abstain_false"]

    residue_report = JudgeReport(n=residue_size, total_cost=judge_spend, **residue_grid)
    return RungReport(
        n=total,
        det_resolved=oracle_settled,
        judge_resolved=judge_settled,
        human_resolved=escalated,
        det_false_clear=oracle_false_clears,
        judge_false_clear=residue_grid["false_clear"],  # judge AGREE-on-FALSE, residue only
        judge_report=residue_report,
    )