dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/journal_delta.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""journal-delta — the lane-journal progress fold for the liveness verdict.
|
|
2
|
+
|
|
3
|
+
docs/82, LVN **Phase 2** — the journal + heartbeat rungs. Phase 1's heartbeat
|
|
4
|
+
age was caller-supplied (`--last-heartbeat-age-ms`); this grounds the heartbeat
|
|
5
|
+
and the lease-layer-event signal in the **lane journal** so the
|
|
6
|
+
SPINNING-vs-STALLED distinction comes from kernel evidence the agent can't
|
|
7
|
+
forge, not a passed number.
|
|
8
|
+
|
|
9
|
+
This module is `git_delta`'s sibling — the same boundary/evidence split LVN
|
|
10
|
+
Phase 1b established (`docs/82` 1b): the file read (`lane_journal.read_all`)
|
|
11
|
+
happens at the CLI boundary; the **fold here is PURE** — entries in, two numbers
|
|
12
|
+
out, the clock injected, no disk. It is replay-testable on frozen entry lists
|
|
13
|
+
exactly like `lane_journal.replay()`, which is what lets the whole liveness
|
|
14
|
+
ladder be tested without a live multi-minute agent run (the `loop_decide` design
|
|
15
|
+
value, restated for the temporal axis). `liveness.classify` stays byte-pure with
|
|
16
|
+
zero journal-schema awareness: Phase 2 only changes WHERE its two journal inputs
|
|
17
|
+
(`journal_events_since`, `last_heartbeat_age_ms`) come from.
|
|
18
|
+
|
|
19
|
+
It imports only stdlib + the lane-journal *op constants + identity helper* it
|
|
20
|
+
needs (the `OP_*` names + `_lease_identity`) — a one-way sibling-kernel import
|
|
21
|
+
(the same arrow `timeline`→`git_delta` has). It is
|
|
22
|
+
**never** imported BY `lane_journal` (whose job is lease correctness + replay,
|
|
23
|
+
not `ProgressEvidence`-shaped clock semantics).
|
|
24
|
+
|
|
25
|
+
THE HARD PROBLEM this fold resolves: a journal entry carries **no run-id** — it
|
|
26
|
+
is keyed only by `(loop_ts, lane)` (`lane_journal._lease_identity`). So "did
|
|
27
|
+
THIS run move?" cannot be answered from the journal by time alone — a busy
|
|
28
|
+
*neighbor* lane would otherwise manufacture a false ADVANCING for a spinning
|
|
29
|
+
run. The fold attributes on **two axes**:
|
|
30
|
+
|
|
31
|
+
* IDENTITY — every journal rung is scoped to THIS run's lease, passed as
|
|
32
|
+
`lease_key=(loop_ts, lane)`. Only entries whose `_lease_identity` matches
|
|
33
|
+
contribute. **Identity is REQUIRED**: with `lease_key=None` the journal
|
|
34
|
+
rungs do not engage at all (events forced to 0, no heartbeat) — there is no
|
|
35
|
+
host-wide "is *some* lane alive" guess (that signal is too ambiguous to
|
|
36
|
+
certify *this* run). The bare `dos liveness --run-id … --start-sha …`
|
|
37
|
+
North-star form still answers from the commit rung; identity only unlocks
|
|
38
|
+
the *journal* rungs. (Operator choice, 2026-06-01: require identity always.)
|
|
39
|
+
* TIME — among identity-matched entries, an entry's **own append `ts`**
|
|
40
|
+
(not the self-reported, copy-prone `heartbeat_at`, which is consulted only as a last-resort fallback when `ts` is missing or unparseable) decides whether it
|
|
41
|
+
falls in the run's window.
|
|
42
|
+
|
|
43
|
+
THE ROUNDING RULE — different per rung, deliberately:
|
|
44
|
+
|
|
45
|
+
* EVENT rung (gates ADVANCING; over-counting is FORBIDDEN, docs/82 2c):
|
|
46
|
+
a **bounded window** `(floored start, now + slack]` AND a **lease-birth
|
|
47
|
+
exclusion**. The window is strictly after the run-start floored to its
|
|
48
|
+
containing second (journal `ts` is second-resolution, the run-start is ms) AND
|
|
49
|
+
no later than now plus the same one-second future slack the heartbeat rung
|
|
50
|
+
uses. The lease-birth exclusion drops the FIRST ACQUIRE for this lease — the
|
|
51
|
+
lease coming into existence is not progress on it — by IDENTITY, independent of
|
|
52
|
+
its timestamp (a later re-ACQUIRE after a RELEASE still counts). A same-second
|
|
53
|
+
*pre-start* op is NOT counted (the floor lower bound); the run's own
|
|
54
|
+
establishing ACQUIRE is NOT counted (the birth exclusion); an implausibly
|
|
55
|
+
future-dated op (clock skew / forgery / cross-host merge) is NOT counted (the
|
|
56
|
+
upper bound). Because events ≥1 is the *top-of-ladder* ADVANCING verdict — the
|
|
57
|
+
most consequential — this rung is the BEST-guarded, not the worst: every
|
|
58
|
+
excluded op fails toward SPINNING/STALLED (safe), never invents ADVANCING. This
|
|
59
|
+
fixes "a same-second pre-start op fabricates ADVANCING", "a lone boundary
|
|
60
|
+
ACQUIRE marks a held-but-idle lane ADVANCING forever" (now by identity, so it
|
|
61
|
+
holds even when the ACQUIRE lands seconds after the run-id mint — the real
|
|
62
|
+
dispatch timeline the old `> floor` rule missed), AND "a future-skewed event
|
|
63
|
+
fabricates ADVANCING on a stuck run".
|
|
64
|
+
* HEARTBEAT-freshness rung (alive/dead; the generous direction is safe): the
|
|
65
|
+
start floor does not gate freshness at all — freshness is about *now*, not
|
|
66
|
+
the start window. A future-dated beat (clock skew / forged stamp) beyond the
|
|
67
|
+
one-second slack is dropped (not clamped), failing toward STALLED.
|
|
68
|
+
|
|
69
|
+
Every degrade path fails toward STALLED/SPINNING and never raises (the ADM
|
|
70
|
+
fail-closed analogue): a `_CORRUPT` sentinel, an unparseable `ts`, an empty or
|
|
71
|
+
absent journal — none can invent progress or freshness. `saw_corrupt` is carried
|
|
72
|
+
for a future renderer's data-quality note (Phase 3); it does NOT flip the
|
|
73
|
+
verdict (the count-0/age-None degrade already fails safe) and is not threaded
|
|
74
|
+
into the (byte-unchanged) `ProgressEvidence`.
|
|
75
|
+
"""
|
|
76
|
+
|
|
77
|
+
from __future__ import annotations
|
|
78
|
+
|
|
79
|
+
import datetime as dt
|
|
80
|
+
from typing import Iterable, NamedTuple, Optional
|
|
81
|
+
|
|
82
|
+
from dos.lane_journal import ( # sibling-kernel constants/helper (one-way import)
|
|
83
|
+
OP_ACQUIRE,
|
|
84
|
+
OP_HEARTBEAT,
|
|
85
|
+
OP_RECONCILE,
|
|
86
|
+
OP_RELEASE,
|
|
87
|
+
OP_SCAVENGE,
|
|
88
|
+
_lease_identity,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Ops that prove the lease is alive — a fresh ACQUIRE or HEARTBEAT for THIS
# lease. ACQUIRE stamps the lease's first beat; HEARTBEAT refreshes it (docs/82
# line 69, liveness.py:158). Membership here feeds only the heartbeat-freshness
# rung of `fold_since`, never the event count.
_HEARTBEAT_OPS = frozenset({OP_ACQUIRE, OP_HEARTBEAT})

# Ops that count as lease-layer *work* (the ADVANCING event rung) — a deliberate
# subset of lane_journal._STATE_MUTATING_OPS that EXCLUDES HEARTBEAT. This is the
# crux of docs/82's ladder (lines 83-85): "fresh heartbeat … but zero …
# state-mutating journal events → SPINNING" explicitly separates the *freshness*
# signal (a heartbeat) from *progress* (state mutation). A HEARTBEAT is a
# keepalive — re-pinging a lease you already hold is the very definition of
# narrating-aliveness-without-moving — so it proves life (a beat) but is NOT
# forward progress (not an event). ACQUIRE/RELEASE/SCAVENGE/RECONCILE are real
# lease transitions: taking, dropping, evicting, or re-asserting a lease is work
# at the lease layer that the commit rung wouldn't see. (REFUSE grants nothing
# and _CORRUPT is not work — both already excluded.)
_EVENT_OPS = frozenset({OP_ACQUIRE, OP_RELEASE, OP_SCAVENGE, OP_RECONCILE})

# The op that BRINGS A LEASE INTO EXISTENCE. A lease is born with an ACQUIRE; that
# birth is the lease starting, NOT forward progress on it — exactly as a process's
# own fork is not "work the process did." The run's establishing ACQUIRE must
# therefore be excluded from the EVENT (ADVANCING) count, or a held-but-idle lane
# that did nothing but take its lease reads ADVANCING forever and SPINNING becomes
# unreachable (the docs/82 false-clear). The exclusion is by IDENTITY — "the first
# ACQUIRE for this lease" — not by timestamp: the prior `> floor` rule only excluded
# it when the ACQUIRE happened to land in the run-start second, which is false in
# every real dispatch (the lease is acquired seconds after the run-id is minted,
# past preflight/snapshot/gate). A LATER ACQUIRE (a genuine re-acquire after a
# RELEASE) is real lease work and still counts — only the establishing one is the
# lease's birth. Sibling `dispatch_top._events_by_lane` makes the same distinction
# by gating on the live lease's `acquired_at`.
_LEASE_BIRTH_OP = OP_ACQUIRE

# One second of slack on the future-dated guard: the journal `ts` is
# second-resolution while `now_ms` is millisecond, so a beat in the current
# second can legitimately decode to up to ~999 ms *after* now. Beyond this a
# stamp is clock-skew or a forged future stamp — not credible proof-of-life.
# Despite the "BEAT" in its name, `fold_since` applies this same upper bound to
# the EVENT rung as well as the heartbeat rung.
_FUTURE_BEAT_SLACK_MS = 1000
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class JournalDelta(NamedTuple):
    """Result of folding a lane journal for one run: two numbers and a flag.

    Fields (positional order is part of the contract):

    events_since_start
        How many lease-*work* entries (ACQUIRE / RELEASE / SCAVENGE /
        RECONCILE — a keepalive HEARTBEAT never counts) `fold_since`
        attributed to this run's lease. An entry is counted only when its
        timestamp lies strictly after the run start floored to the second
        and no later than `now` plus the one-second future slack; the first
        ACQUIRE seen for the lease (its birth) is never counted. Feeds
        `journal_events_since`, where a value >= 1 is the lease-layer
        ADVANCING rung.
    newest_heartbeat_age_ms
        `now_ms` minus the newest credible ACQUIRE/HEARTBEAT stamp for this
        lease, clamped at 0. `None` when no credible beat exists, which the
        liveness ladder reads as STALLED. Feeds `last_heartbeat_age_ms`.
    saw_corrupt
        True when at least one `_CORRUPT` sentinel entry was encountered.
        Purely diagnostic: it does not influence the other two fields and is
        not copied into `ProgressEvidence`; kept for a later renderer's
        data-quality note.
    """

    events_since_start: int
    newest_heartbeat_age_ms: Optional[int]
    saw_corrupt: bool
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _parse_journal_ts(s: Optional[str]) -> Optional[int]:
|
|
157
|
+
"""Parse a journal stamp to epoch-ms; None on any unparseable/missing input.
|
|
158
|
+
|
|
159
|
+
PURE. Accepts both the second-resolution stamp `lane_journal.append` writes
|
|
160
|
+
(`journal_now_iso`, ``%Y-%m-%dT%H:%M:%SZ``) and a minute-only stamp a
|
|
161
|
+
foreign/lease-copied field might carry (``%Y-%m-%dT%H:%MZ``) — the exact
|
|
162
|
+
two-format tolerance `archive_lock._parse_iso` uses. The explicit
|
|
163
|
+
`tzinfo=utc` is LOAD-BEARING: a naive `timestamp()` would shift by the host
|
|
164
|
+
UTC offset (pinned by `test_parse_journal_ts_known_epoch_ms`).
|
|
165
|
+
|
|
166
|
+
NOTE: a third tiny copy of this kernel's ISO-parse (after
|
|
167
|
+
`archive_lock._parse_iso` and `decisions._parse_iso`). Kept local — all are
|
|
168
|
+
sibling kernel modules, no layer crossing — but a tz/format fix must land in
|
|
169
|
+
all three; flagged for a possible future shared stdlib-only helper.
|
|
170
|
+
"""
|
|
171
|
+
if not s:
|
|
172
|
+
return None
|
|
173
|
+
for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%MZ"):
|
|
174
|
+
try:
|
|
175
|
+
parsed = dt.datetime.strptime(s, fmt).replace(tzinfo=dt.timezone.utc)
|
|
176
|
+
except (ValueError, TypeError):
|
|
177
|
+
continue
|
|
178
|
+
return int(parsed.timestamp() * 1000)
|
|
179
|
+
return None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def fold_since(
    entries: Iterable[dict],
    *,
    run_started_ms: int,
    now_ms: int,
    lease_key: Optional[tuple[str, str]] = None,
) -> JournalDelta:
    """Reduce journal entries to one run's event count and newest-beat age.

    PURE: the entries and the clock (`now_ms`) are both passed in and nothing
    is read from disk; the caller is expected to have materialized the
    journal before calling.

    Args:
        entries: journal entries (dicts) in append order. The lease-birth
            exclusion relies on that order: the *first* ACQUIRE encountered
            for the lease is treated as its birth.
        run_started_ms: run start in epoch-ms. It is floored to its
            containing second before comparison because journal stamps are
            second-resolution.
        now_ms: the injected "current time" in epoch-ms.
        lease_key: `(loop_ts, lane)` identifying THIS run's lease. When it is
            `None`, or when both of its parts are falsy, no entry can be
            attributed to the run and both rungs stay silent.

    Returns:
        `JournalDelta(events_since_start, newest_heartbeat_age_ms,
        saw_corrupt)`. Without identity this is `JournalDelta(0, None,
        saw_corrupt)`; `saw_corrupt` is still reported so a corrupt journal
        remains observable.

    Per identity-matched entry, in this order:
        1. `_CORRUPT` sentinels only set the flag (checked before identity).
        2. The entry's own `ts` places it in time; `heartbeat_at` is used
           only if `ts` is missing/unparseable; an entry with neither is
           dropped.
        3. The first ACQUIRE is the lease's birth and is never an event; the
           birth is marked consumed whether or not the entry is in-window.
        4. Anything stamped later than `now_ms + _FUTURE_BEAT_SLACK_MS`
           contributes to neither rung (dropped, not clamped).
        5. A work op strictly after the floored start counts as an event.
        6. An ACQUIRE/HEARTBEAT may become the newest beat (no start-window
           gate — freshness is about now).
    """
    corrupt_seen = False
    event_count = 0
    latest_beat_ms: Optional[int] = None

    # Identity exists only for a non-None key with at least one truthy part;
    # an all-blank key is the "no real lease" sentinel, not a lane to match.
    has_identity = lease_key is not None and bool(lease_key[0] or lease_key[1])

    # Lower bound (exclusive): the run start floored to its containing second.
    window_open_ms = 1000 * (run_started_ms // 1000)
    # Upper bound (inclusive), shared by both rungs.
    window_close_ms = now_ms + _FUTURE_BEAT_SLACK_MS

    # Becomes True once the lease's establishing ACQUIRE has been seen.
    birth_consumed = False

    for entry in entries:
        op = str(entry.get("op") or "")

        if op == "_CORRUPT":
            # Corruption can only reduce observed progress, never invent it.
            corrupt_seen = True
            continue

        # IDENTITY axis: skip everything not attributable to this run's lease.
        if not has_identity or _lease_identity(entry) != lease_key:
            continue

        # TIME axis: trust the entry's own append stamp; fall back to the
        # self-reported `heartbeat_at` only when `ts` cannot be parsed.
        stamp_ms = _parse_journal_ts(entry.get("ts"))
        if stamp_ms is None:
            stamp_ms = _parse_journal_ts(entry.get("heartbeat_at"))
            if stamp_ms is None:
                continue  # cannot be placed in time -> dropped (safe direction)

        # LEASE-BIRTH exclusion: the first ACQUIRE for this lease is its
        # birth, not work. This bookkeeping must run before the window
        # checks so that an out-of-window birth still consumes the flag.
        is_work = op in _EVENT_OPS
        if op == _LEASE_BIRTH_OP and not birth_consumed:
            birth_consumed = True
            is_work = False

        # Implausibly future-dated entries count for neither rung.
        if not stamp_ms <= window_close_ms:
            continue

        # EVENT rung: real lease work strictly after the floored start.
        if is_work and window_open_ms < stamp_ms:
            event_count += 1

        # HEARTBEAT rung: remember the newest credible proof of life.
        if op in _HEARTBEAT_OPS:
            if latest_beat_ms is None or stamp_ms > latest_beat_ms:
                latest_beat_ms = stamp_ms

    # Age is clamped at 0: a beat inside the sub-second future slack is simply
    # "as fresh as possible", never a negative age.
    if latest_beat_ms is None:
        beat_age_ms = None
    else:
        beat_age_ms = max(0, now_ms - latest_beat_ms)

    return JournalDelta(
        events_since_start=event_count,
        newest_heartbeat_age_ms=beat_age_ms,
        saw_corrupt=corrupt_seen,
    )
|
dos/judge_eval.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
"""The judge-evaluation harness — score an adjudicator, and the rung it occupies.
|
|
2
|
+
|
|
3
|
+
A `dos.judges.Judge` is a *hook*; this module is the *instrument* that makes the hook
|
|
4
|
+
produce a number. It is the "researchers make their own insights" surface: bring your
|
|
5
|
+
own judge (a debate, a learned verifier, a build/test oracle), bring a set of labelled
|
|
6
|
+
claims, and get back the numbers an oversight researcher actually cares about — chiefly
|
|
7
|
+
**the false-clear rate**, the dangerous cell where a judge waves through a claim that is
|
|
8
|
+
in fact false.
|
|
9
|
+
|
|
10
|
+
Two things it computes:
|
|
11
|
+
|
|
12
|
+
1. **`score(judge, cases)` → `JudgeReport`** — run a judge over labelled cases and
|
|
13
|
+
tabulate the 3×2 confusion grid (the judge's AGREE/DISAGREE/ABSTAIN against each
|
|
14
|
+
claim's ground-truth believable/not), plus the derived rates. This scores the judge
|
|
15
|
+
*in isolation* — how good is it at ruling on the claims it sees.
|
|
16
|
+
|
|
17
|
+
2. **`compose_deterministic_first(oracle_fn, judge, cases)` → `RungReport`** — the
|
|
18
|
+
*system* number. It runs the trust ladder: the deterministic oracle rules first, the
|
|
19
|
+
judge sees only the residue the oracle abstained on, and whatever neither resolves
|
|
20
|
+
escalates to a human. It reports **rung occupancy** (what fraction of claims each rung
|
|
21
|
+
resolved — det% | judge% | human%, summing to 100%) and the false-clear rate *at each
|
|
22
|
+
rung*. This is the scalable-oversight headline: how much human-review load the judge
|
|
23
|
+
actually removes, and at what integrity cost.
|
|
24
|
+
|
|
25
|
+
Everything here is **pure**: it consumes already-built `Claim`s, calls `run_judge`
|
|
26
|
+
(which is itself fail-to-abstain), and counts. No I/O, no host names — it sits in the
|
|
27
|
+
kernel layer beside `judges`. A `case` is a `(Claim, truth)` pair where ``truth`` is the
|
|
28
|
+
ground-truth believability of the claim (``True`` = the claim is real/correct, ``False``
|
|
29
|
+
= it is a lie / unsupported). The labels are the *researcher's* ground truth — the same
|
|
30
|
+
honesty stance as FleetHorizon's "a lie is `git` showing no commit": the eval is only as
|
|
31
|
+
honest as the labels, so a caller derives them from artifacts, not from the judge.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
from dataclasses import dataclass
|
|
37
|
+
from typing import Callable, Iterable, Optional
|
|
38
|
+
|
|
39
|
+
from dos.judges import Claim, Judge, JudgeVerdict, Stance, run_judge
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# A labelled example: the claim to adjudicate + its ground-truth believability.
|
|
43
|
+
Case = tuple[Claim, bool]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
|
|
47
|
+
class JudgeReport:
|
|
48
|
+
"""A judge scored over labelled cases — the confusion grid + derived rates.
|
|
49
|
+
|
|
50
|
+
The 3×2 grid is the six counts below (judge stance × ground truth). The named
|
|
51
|
+
cells:
|
|
52
|
+
* ``correct_clear`` — AGREE on a TRUE claim (right: cleared a real claim)
|
|
53
|
+
* ``false_clear`` — AGREE on a FALSE claim (THE DANGEROUS CELL: a lie waved
|
|
54
|
+
through — the one error an oversight layer must minimize)
|
|
55
|
+
* ``correct_flag`` — DISAGREE on a FALSE claim (right: caught a lie)
|
|
56
|
+
* ``false_flag`` — DISAGREE on a TRUE claim (wrong but SAFE: a needless human
|
|
57
|
+
review, never a corruption)
|
|
58
|
+
* ``abstain_true`` / ``abstain_false`` — punted to a human (safe; costs attention)
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
n: int
|
|
62
|
+
correct_clear: int
|
|
63
|
+
false_clear: int
|
|
64
|
+
correct_flag: int
|
|
65
|
+
false_flag: int
|
|
66
|
+
abstain_true: int
|
|
67
|
+
abstain_false: int
|
|
68
|
+
total_cost: float
|
|
69
|
+
|
|
70
|
+
# --- derived rates (all guard against divide-by-zero by returning 0.0) ---
|
|
71
|
+
|
|
72
|
+
@property
|
|
73
|
+
def n_agree(self) -> int:
|
|
74
|
+
return self.correct_clear + self.false_clear
|
|
75
|
+
|
|
76
|
+
@property
|
|
77
|
+
def n_disagree(self) -> int:
|
|
78
|
+
return self.correct_flag + self.false_flag
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def n_abstain(self) -> int:
|
|
82
|
+
return self.abstain_true + self.abstain_false
|
|
83
|
+
|
|
84
|
+
@property
|
|
85
|
+
def n_false_claims(self) -> int:
|
|
86
|
+
"""Ground-truth FALSE claims in the set — the denominator for leak rate."""
|
|
87
|
+
return self.false_clear + self.correct_flag + self.abstain_false
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def false_clear_rate(self) -> float:
|
|
91
|
+
"""Of the claims the judge CLEARED (agreed), the fraction that were actually
|
|
92
|
+
false. The precision-of-clearing number: when this judge says "believable,"
|
|
93
|
+
how often is it wrong? The single most important oversight metric — a judge
|
|
94
|
+
is only safe to trust on its own if this is near zero."""
|
|
95
|
+
return (self.false_clear / self.n_agree) if self.n_agree else 0.0
|
|
96
|
+
|
|
97
|
+
@property
|
|
98
|
+
def lie_leak_rate(self) -> float:
|
|
99
|
+
"""Of all ground-truth FALSE claims, the fraction the judge waved through
|
|
100
|
+
(AGREE'd). The recall-of-lies number from the other side: what share of real
|
|
101
|
+
lies leaked past this judge entirely (a lie it ABSTAINED on did NOT leak — it
|
|
102
|
+
went to a human). Distinct from `false_clear_rate`: this is /lies, that is
|
|
103
|
+
/clears."""
|
|
104
|
+
return (self.false_clear / self.n_false_claims) if self.n_false_claims else 0.0
|
|
105
|
+
|
|
106
|
+
@property
|
|
107
|
+
def decisive_accuracy(self) -> float:
|
|
108
|
+
"""When the judge COMMITTED (did not abstain), how often was it right? —
|
|
109
|
+
(correct_clear + correct_flag) / (agrees + disagrees). Abstentions are
|
|
110
|
+
excluded: this measures the quality of the judge's opinions, separately from
|
|
111
|
+
how often it ventures one (`abstention_rate`)."""
|
|
112
|
+
decisive = self.n_agree + self.n_disagree
|
|
113
|
+
return ((self.correct_clear + self.correct_flag) / decisive) if decisive else 0.0
|
|
114
|
+
|
|
115
|
+
@property
|
|
116
|
+
def abstention_rate(self) -> float:
|
|
117
|
+
"""Fraction of claims the judge punted to a human. High abstention is SAFE
|
|
118
|
+
but adds no leverage (the human still does the work); low abstention with a
|
|
119
|
+
low false-clear rate is the valuable regime."""
|
|
120
|
+
return (self.n_abstain / self.n) if self.n else 0.0
|
|
121
|
+
|
|
122
|
+
@property
def cost_per_claim(self) -> float:
    """Mean cost per scored claim: ``total_cost / n``, or 0.0 when ``n`` is zero."""
    return (self.total_cost / self.n) if self.n else 0.0
|
|
125
|
+
|
|
126
|
+
def to_dict(self) -> dict:
    """Return the report as a plain nested dict.

    Layout: ``n``, the six raw confusion-grid counts under ``"grid"``, the
    derived rates under ``"rates"`` (rounded to 4 decimal places, except
    ``cost_per_claim`` which is rounded to 6), and the unrounded ``total_cost``.
    """
    return {
        "n": self.n,
        "grid": {
            "correct_clear": self.correct_clear,
            "false_clear": self.false_clear,
            "correct_flag": self.correct_flag,
            "false_flag": self.false_flag,
            "abstain_true": self.abstain_true,
            "abstain_false": self.abstain_false,
        },
        "rates": {
            "false_clear_rate": round(self.false_clear_rate, 4),
            "lie_leak_rate": round(self.lie_leak_rate, 4),
            "decisive_accuracy": round(self.decisive_accuracy, 4),
            "abstention_rate": round(self.abstention_rate, 4),
            "cost_per_claim": round(self.cost_per_claim, 6),
        },
        "total_cost": self.total_cost,
    }
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def score(judge: Judge, cases: Iterable[Case], config: object = None) -> JudgeReport:
    """Run ``judge`` over labelled ``cases`` and tabulate the confusion grid.

    Uses `run_judge` (fail-to-abstain), so a judge that errors on a case contributes
    an ABSTAIN to the grid rather than crashing the eval — the report stays honest
    about a flaky judge instead of hiding it. Pure: it only reads the cases and
    counts.

    Args:
        judge: the judge under evaluation, passed through to `run_judge`.
        cases: iterable of ``(claim, truth)`` pairs; ``truth`` is used only for
            its truthiness (the ground-truth label of the claim).
        config: opaque value forwarded unchanged to `run_judge`.

    Returns:
        A `JudgeReport` holding the six grid counts, the number of cases seen,
        and the summed verdict cost.
    """
    # Grid cells: correct/false clear, correct/false flag, abstain on true/false.
    cc = fc = cf = ff = at = af = 0
    total_cost = 0.0
    n = 0
    for claim, truth in cases:
        n += 1
        v = run_judge(judge, claim, config)
        total_cost += v.cost
        if v.stance is Stance.AGREE:
            if truth:
                cc += 1
            else:
                fc += 1
        elif v.stance is Stance.DISAGREE:
            if truth:
                ff += 1
            else:
                cf += 1
        else:  # ABSTAIN
            if truth:
                at += 1
            else:
                af += 1
    return JudgeReport(
        n=n, correct_clear=cc, false_clear=fc, correct_flag=cf, false_flag=ff,
        abstain_true=at, abstain_false=af, total_cost=total_cost,
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ---------------------------------------------------------------------------
|
|
185
|
+
# Deterministic-first composition — the trust-ladder / rung-occupancy report.
|
|
186
|
+
# ---------------------------------------------------------------------------
|
|
187
|
+
|
|
188
|
+
# An oracle function rules on a claim deterministically, OR signals "I can't" by
|
|
189
|
+
# returning None or an ABSTAIN verdict. This is the seam to the kernel's real oracle
|
|
190
|
+
# (`verify` / `picker_oracle`): a caller wraps whatever deterministic check it has in
|
|
191
|
+
# this shape. The eval ships no oracle of its own — the deterministic rung is the
|
|
192
|
+
# caller's ground-truth checker, exactly the no-plan-needed discipline.
|
|
193
|
+
OracleFn = Callable[[Claim], Optional[JudgeVerdict]]  # type alias: Claim -> verdict; None (or an ABSTAIN verdict) means "cannot rule"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
@dataclass(frozen=True)
class RungReport:
    """The trust ladder scored: how much each rung resolved, and how well.

    ``*_resolved`` are the rung-occupancy counts (det + judge + human == n). The
    per-rung false-clear counts let a researcher see the integrity cost of pushing
    work down to the cheaper rung — the whole point of the composition is to move
    load off the human WITHOUT the judge leaking lies, and this report shows both
    halves of that trade at once.
    """

    n: int
    det_resolved: int  # claims the deterministic oracle ruled (agree/disagree)
    judge_resolved: int  # residue the judge ruled (agree/disagree)
    human_resolved: int  # what neither could — escalated to a human (abstains)
    det_false_clear: int  # oracle AGREE on a FALSE claim (should be ~0 by construction)
    judge_false_clear: int  # judge AGREE on a FALSE claim — the cost of the JUDGE rung
    judge_report: JudgeReport  # the judge scored on the RESIDUE only (its true workload)

    @property
    def det_occupancy(self) -> float:
        """Fraction of all claims resolved at the deterministic rung (0.0 if ``n`` is zero)."""
        return (self.det_resolved / self.n) if self.n else 0.0

    @property
    def judge_occupancy(self) -> float:
        """Fraction of all claims resolved at the judge rung (0.0 if ``n`` is zero)."""
        return (self.judge_resolved / self.n) if self.n else 0.0

    @property
    def human_occupancy(self) -> float:
        """The human-review fraction — the scalable-oversight headline. This is what
        the JUDGE rung pulls DOWN: with no judge (the `abstain` baseline) every claim
        the oracle can't rule lands here; a good judge shrinks it."""
        return (self.human_resolved / self.n) if self.n else 0.0

    def to_dict(self) -> dict:
        """Return the report as a plain nested dict.

        Occupancy fractions are rounded to 4 decimal places; false-clear counts
        are raw integers; ``"judge_on_residue"`` is ``judge_report.to_dict()``.
        """
        return {
            "n": self.n,
            "occupancy": {
                "deterministic": round(self.det_occupancy, 4),
                "judge": round(self.judge_occupancy, 4),
                "human": round(self.human_occupancy, 4),
            },
            "false_clears": {
                "deterministic": self.det_false_clear,
                "judge": self.judge_false_clear,
            },
            "judge_on_residue": self.judge_report.to_dict(),
        }
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _is_decisive(v: Optional[JudgeVerdict]) -> bool:
    """A verdict resolves a claim iff it is a non-None AGREE/DISAGREE. None or ABSTAIN
    means the rung punts the claim onward."""
    return v is not None and v.stance is not Stance.ABSTAIN
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def compose_deterministic_first(
    oracle_fn: OracleFn,
    judge: Judge,
    cases: Iterable[Case],
    config: object = None,
) -> RungReport:
    """Run the trust ladder and report rung occupancy + per-rung false-clears.

    The composition is the discipline itself, in code:
      1. the **deterministic oracle** rules first (`oracle_fn`). If decisive, the
         claim resolves at the DET rung and the judge never sees it (deterministic-
         first: never spend the expensive rung, which lacks forgery-proof evidence,
         on what the cheap forgery-proof one can settle).
      2. the **judge** sees ONLY the residue the oracle abstained on, via `run_judge`
         (fail-to-abstain). If decisive, the claim resolves at the JUDGE rung.
      3. whatever the judge also abstains on **escalates to a HUMAN**.

    The judge is scored on its *real* workload — the residue, not the full set — so
    `judge_report` answers "how good is this judge at the claims it is actually asked
    to rule on," which is the honest question (its accuracy on claims the oracle
    already settled is irrelevant; it never sees them).

    Args:
        oracle_fn: deterministic checker; returns a verdict, or None / an ABSTAIN
            verdict when it cannot rule. Called directly, so an exception it
            raises propagates to the caller.
        judge: the judge applied to the residue, via `run_judge`.
        cases: iterable of ``(claim, truth)`` pairs; ``truth`` is used only for
            its truthiness.
        config: opaque value forwarded unchanged to `run_judge`.

    Returns:
        A `RungReport` with the per-rung counts and the judge's `JudgeReport`
        over the residue.
    """
    n = 0
    det_resolved = judge_resolved = human_resolved = 0
    det_fc = 0
    # The judge's confusion grid over the RESIDUE, tabulated inline from the SAME
    # verdicts the ladder uses — the judge runs exactly once per residue claim (no
    # re-run, so cost is counted once and a nondeterministic judge is not sampled
    # twice). `judge_resolved` == cc+fc+cf+ff and `human_resolved` == at+af by
    # construction, so the rung-occupancy counts and the judge report are derived
    # from one pass and cannot drift apart.
    cc = fc = cf = ff = at = af = 0
    judge_cost = 0.0
    residue_n = 0
    for claim, truth in cases:
        n += 1
        ov = oracle_fn(claim)
        if _is_decisive(ov):
            det_resolved += 1
            if ov.stance is Stance.AGREE and not truth:
                det_fc += 1
            continue
        # residue → the judge (run ONCE; tabulate this verdict directly)
        residue_n += 1
        jv = run_judge(judge, claim, config)
        judge_cost += jv.cost
        if jv.stance is Stance.AGREE:
            if truth:
                cc += 1
            else:
                fc += 1
            judge_resolved += 1
        elif jv.stance is Stance.DISAGREE:
            if truth:
                ff += 1
            else:
                cf += 1
            judge_resolved += 1
        else:  # ABSTAIN → escalate to a human
            if truth:
                at += 1
            else:
                af += 1
            human_resolved += 1
    judge_report = JudgeReport(
        n=residue_n, correct_clear=cc, false_clear=fc, correct_flag=cf, false_flag=ff,
        abstain_true=at, abstain_false=af, total_cost=judge_cost,
    )
    return RungReport(
        n=n,
        det_resolved=det_resolved,
        judge_resolved=judge_resolved,
        human_resolved=human_resolved,
        det_false_clear=det_fc,
        judge_false_clear=fc,  # the judge's AGREE-on-FALSE count, over the residue
        judge_report=judge_report,
    )
|