dos-kernel 0.22.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dos/__init__.py +261 -0
- dos/_bin/dos-hook.exe +0 -0
- dos/_filelock.py +255 -0
- dos/_job_policy.py +97 -0
- dos/_tree.py +145 -0
- dos/admission.py +433 -0
- dos/answer_shape.py +299 -0
- dos/arbiter.py +859 -0
- dos/archive_lock.py +266 -0
- dos/arg_provenance.py +814 -0
- dos/attest.py +472 -0
- dos/breaker.py +311 -0
- dos/churn.py +226 -0
- dos/claim_extract.py +229 -0
- dos/claim_ttl.py +150 -0
- dos/cli.py +8721 -0
- dos/commit_audit.py +666 -0
- dos/completion.py +466 -0
- dos/concurrency_class.py +154 -0
- dos/config.py +1380 -0
- dos/config_lint.py +464 -0
- dos/cooldown.py +390 -0
- dos/coverage.py +387 -0
- dos/dangling_intent.py +287 -0
- dos/data_class.py +397 -0
- dos/decisions.py +1274 -0
- dos/decisions_tui.py +251 -0
- dos/dispatch_top.py +740 -0
- dos/dispatch_top_tui.py +116 -0
- dos/drivers/__init__.py +40 -0
- dos/drivers/ci_status.py +630 -0
- dos/drivers/citation_resolve.py +703 -0
- dos/drivers/decision_stop.py +98 -0
- dos/drivers/export_file.py +173 -0
- dos/drivers/export_otlp.py +275 -0
- dos/drivers/export_statsd.py +242 -0
- dos/drivers/hook_dialects.py +391 -0
- dos/drivers/job.py +47 -0
- dos/drivers/llm_judge.py +360 -0
- dos/drivers/memory_recall.py +1231 -0
- dos/drivers/notify_slack.py +373 -0
- dos/drivers/notify_webhook.py +251 -0
- dos/drivers/operator_judge.py +114 -0
- dos/drivers/os_acceptance.py +228 -0
- dos/drivers/paste_log.py +132 -0
- dos/drivers/plan_scope.py +133 -0
- dos/drivers/self_improve.py +375 -0
- dos/drivers/similarity_judge.py +249 -0
- dos/drivers/state_diff.py +274 -0
- dos/drivers/supervisor.py +347 -0
- dos/drivers/watchdog.py +363 -0
- dos/drivers/workshop.py +160 -0
- dos/durable_schema.py +344 -0
- dos/effect_witness.py +393 -0
- dos/efficiency.py +318 -0
- dos/enforce.py +414 -0
- dos/enumerate.py +776 -0
- dos/env_print.py +378 -0
- dos/event_severity.py +258 -0
- dos/evidence.py +692 -0
- dos/exec_capability.py +256 -0
- dos/export_cursor.py +143 -0
- dos/exporter.py +320 -0
- dos/firing_label.py +353 -0
- dos/fleet_roll.py +226 -0
- dos/gate_classify.py +827 -0
- dos/gh4_coverage.py +179 -0
- dos/git_delta.py +122 -0
- dos/guard.py +215 -0
- dos/health.py +552 -0
- dos/help_summary.py +519 -0
- dos/home.py +934 -0
- dos/hook_binary.py +194 -0
- dos/hook_dialect.py +271 -0
- dos/hook_exit.py +191 -0
- dos/hook_install.py +437 -0
- dos/id_alloc.py +304 -0
- dos/improve.py +499 -0
- dos/intent_ledger.py +635 -0
- dos/interpret.py +176 -0
- dos/intervention.py +769 -0
- dos/intervention_eval.py +371 -0
- dos/journal_delta.py +308 -0
- dos/judge_eval.py +328 -0
- dos/judges.py +366 -0
- dos/lane_infer.py +127 -0
- dos/lane_journal.py +1001 -0
- dos/lane_lease.py +952 -0
- dos/lane_overlap.py +228 -0
- dos/lease_health.py +282 -0
- dos/lifecycle.py +211 -0
- dos/liveness.py +352 -0
- dos/lock_modes.py +185 -0
- dos/log_source.py +395 -0
- dos/loop_decide.py +1746 -0
- dos/marker_gate.py +254 -0
- dos/marker_sensor.py +396 -0
- dos/noop_streak.py +280 -0
- dos/notify.py +479 -0
- dos/observe.py +175 -0
- dos/oracle.py +1661 -0
- dos/overlap_eval.py +214 -0
- dos/overlap_policy.py +342 -0
- dos/packet_sidecar.py +267 -0
- dos/phase_shipped.py +1985 -0
- dos/pick_priority.py +225 -0
- dos/pickable.py +369 -0
- dos/picker_oracle.py +1037 -0
- dos/plan_board.py +513 -0
- dos/plan_board_tui.py +113 -0
- dos/plan_source.py +455 -0
- dos/posttool_sensor.py +528 -0
- dos/precursor_gate.py +499 -0
- dos/precursor_gate_eval.py +239 -0
- dos/preflight.py +825 -0
- dos/pretool_sensor.py +490 -0
- dos/proc_delta.py +181 -0
- dos/productivity.py +296 -0
- dos/provider_limit.py +242 -0
- dos/py.typed +4 -0
- dos/reason_morphology.py +299 -0
- dos/reasons.py +449 -0
- dos/reconcile.py +173 -0
- dos/recurring_wedge.py +206 -0
- dos/render.py +393 -0
- dos/result_state.py +468 -0
- dos/resume.py +578 -0
- dos/resume_evidence.py +293 -0
- dos/retention.py +344 -0
- dos/reward.py +372 -0
- dos/rewind.py +587 -0
- dos/rewind_evidence.py +168 -0
- dos/rewind_tokens.py +252 -0
- dos/run_id.py +342 -0
- dos/scope.py +520 -0
- dos/scope_source.py +382 -0
- dos/scout.py +982 -0
- dos/self_modify.py +209 -0
- dos/sibling_scan.py +569 -0
- dos/skills/EXAMPLES.md +584 -0
- dos/skills/dos-class-cycle/SKILL.md +107 -0
- dos/skills/dos-dispatch/SKILL.md +177 -0
- dos/skills/dos-dispatch-loop/SKILL.md +254 -0
- dos/skills/dos-goal-gate/SKILL.md +269 -0
- dos/skills/dos-next-up/SKILL.md +231 -0
- dos/skills/dos-promote/SKILL.md +114 -0
- dos/skills/dos-replan/SKILL.md +159 -0
- dos/skills/dos-replan-loop/SKILL.md +114 -0
- dos/skills/dos-self-improve/SKILL.md +213 -0
- dos/skills/dos-supervise-loop/SKILL.md +180 -0
- dos/skills/dos-unstick/SKILL.md +108 -0
- dos/skills/dos-witness-claim/SKILL.md +251 -0
- dos/stamp.py +1002 -0
- dos/state_health.py +387 -0
- dos/status.py +114 -0
- dos/stop_policy.py +334 -0
- dos/supervise.py +1014 -0
- dos/testwitness.py +392 -0
- dos/timeline.py +1027 -0
- dos/tokens.py +485 -0
- dos/tool_stream.py +393 -0
- dos/tool_stream_eval.py +226 -0
- dos/trace.py +524 -0
- dos/verdict.py +140 -0
- dos/verdict_cli.py +189 -0
- dos/verdict_journal.py +497 -0
- dos/verdict_rollup.py +217 -0
- dos/verdicts.py +181 -0
- dos/wedge_reason.py +282 -0
- dos_kernel-0.22.0.dist-info/METADATA +859 -0
- dos_kernel-0.22.0.dist-info/RECORD +178 -0
- dos_kernel-0.22.0.dist-info/WHEEL +5 -0
- dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
- dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
- dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
- dos_mcp/__init__.py +52 -0
- dos_mcp/py.typed +2 -0
- dos_mcp/server.py +779 -0
dos/intervention_eval.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
"""The intervention-evaluation harness — score an actuation policy by its NET TASK DELTA.
|
|
2
|
+
|
|
3
|
+
docs/143 §13.2 — the missing instrument. Every other DOS axis ships an eval: `judge_eval`
|
|
4
|
+
scores a judge, `overlap_eval` scores a disjointness scorer, and `arg_provenance` shipped a
|
|
5
|
+
*detector* eval (precision/recall over minted-vs-resolved ids). The live benchmark run
|
|
6
|
+
proved the decisive number is none of those: a detector that was *sound* (0 % false-nudge,
|
|
7
|
+
83 % recall) was still **net-harmful** (−9 pp) because the *intervention* it triggered was
|
|
8
|
+
too disruptive (RESULTS.md "⚑ KEY DATA POINT"). So the number that decides deployment is
|
|
9
|
+
not "was the verdict right?" — `arg_provenance`'s eval already answers that — but **"did
|
|
10
|
+
ACTING on the verdict help or hurt the run?"**
|
|
11
|
+
|
|
12
|
+
This module is that instrument: the friendliness gauge for the PEP, the way `overlap_eval`
|
|
13
|
+
is for admission. Bring an `InterventionPolicy` (the confidence-gating knobs), bring a
|
|
14
|
+
corpus of replayed verdicts each labelled with the GROUND-TRUTH outcome of acting on it,
|
|
15
|
+
and get back the headline `net_task_delta` plus the dangerous-cell rates a PEP author
|
|
16
|
+
actually cares about — chiefly **wasted-disruption** (when this policy disrupts, how often
|
|
17
|
+
is it spent on a catch that did not matter — the exact source of the −9 pp).
|
|
18
|
+
|
|
19
|
+
The honesty stance (the same as judge_eval / overlap_eval)
|
|
20
|
+
==========================================================
|
|
21
|
+
|
|
22
|
+
The labels are the RESEARCHER's ground truth, derived from EXECUTED replay arms, never from
|
|
23
|
+
the detector. Specifically:
|
|
24
|
+
|
|
25
|
+
* `truly_minted` — was the flagged id ACTUALLY a mint? (the controlled mint-injection
|
|
26
|
+
knows; a false-flag has this False). The `overlap_eval.collided` "did it actually
|
|
27
|
+
collide" discipline.
|
|
28
|
+
* `mattered_to_score` — did this FK feed a hidden SQL verifier the run was scored on? From
|
|
29
|
+
the verifier set, not the wrapper. This is the −9 pp axis: a true catch the verifier
|
|
30
|
+
never checked buys nothing, so disrupting on it is pure cost.
|
|
31
|
+
* `recovered_if_blocked` / `recovered_if_deferred` — COUNTERFACTUAL ground truth from the
|
|
32
|
+
two EXECUTED A/B arms (a turn-preserving intervention vs a turn-spending one), NOT a
|
|
33
|
+
guessed label. The live run measured the turn-spending recovery at ~75 % (48/64,
|
|
34
|
+
RESULTS.md line 104); a turn-preserving BLOCK is expected higher (it costs no turn).
|
|
35
|
+
|
|
36
|
+
Everything here is **pure**: it consumes already-built `InterventionCase`s, runs the policy
|
|
37
|
+
through `intervention.choose_intervention` (the SAME path the consumer's PEP takes, so the
|
|
38
|
+
grid reflects what would actually be enacted — the `overlap_eval` "score under the floor"
|
|
39
|
+
discipline), and counts in one pass. No I/O, no host names — it sits in the kernel layer
|
|
40
|
+
beside `intervention`.
|
|
41
|
+
|
|
42
|
+
⚠ This is NOT a detector eval. `arg_provenance` precision/recall measures the verdict;
|
|
43
|
+
THIS measures the intervention. The two are orthogonal (the §13 thesis), so they are
|
|
44
|
+
separate instruments by design.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
from __future__ import annotations
|
|
48
|
+
|
|
49
|
+
from dataclasses import dataclass
|
|
50
|
+
from typing import Iterable
|
|
51
|
+
|
|
52
|
+
from dos.arg_provenance import ProvenanceVerdict
|
|
53
|
+
from dos.intervention import (
|
|
54
|
+
BASE_INTERVENTIONS,
|
|
55
|
+
Confidence,
|
|
56
|
+
Intervention,
|
|
57
|
+
InterventionDecision,
|
|
58
|
+
InterventionLadder,
|
|
59
|
+
InterventionPolicy,
|
|
60
|
+
choose_intervention,
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# A labelled example — one replayed verdict + the GROUND-TRUTH outcome of acting.
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
@dataclass(frozen=True)
class InterventionCase:
    """One replayed detector verdict paired with the ground-truth outcome of acting on it.

    No confidence value is stored here. `score()` derives the action from `verdict` by
    running the policy through `choose_intervention`, so the scored action cannot drift
    away from a hand-written confidence label. Every other field is a ground-truth label
    that is expected to come from an EXECUTED replay arm rather than from the detector.

    Fields:
        verdict: the `ProvenanceVerdict` the detector produced; this is what the policy
            is evaluated against.
        truly_minted: ground truth — was the flagged id actually a mint? `False` means
            the detector false-flagged a legitimate derived/resolved id.
        mattered_to_score: ground truth — did this FK feed a verifier the run was scored
            on? A true catch the verifier never checks buys nothing, so disrupting on it
            is pure cost.
        recovered_if_blocked: counterfactual ground truth from the turn-preserving arm —
            did the agent recover (resolve the id correctly) under that intervention?
        recovered_if_deferred: counterfactual ground truth from the turn-spending arm
            (DEFER) — did the agent recover after a re-prompt that cost the turn?
        label: optional human-readable handle; carried along, never scored.
    """

    verdict: ProvenanceVerdict
    truly_minted: bool
    mattered_to_score: bool
    recovered_if_blocked: bool
    recovered_if_deferred: bool
    label: str = ""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------------------------------------------------------------------------
|
|
103
|
+
# The per-case net-delta ledger — the §13.2 formula, honestly generalized.
|
|
104
|
+
# ---------------------------------------------------------------------------
|
|
105
|
+
def _case_delta(
    case: InterventionCase, action: Intervention, ladder: InterventionLadder
) -> float:
    """Return the net task-delta one `(case, action)` pair contributes, in verifier units.

    Generalizes docs/143 §13.2's `caught × recovered × (1 − disruption_cost)` to every
    cell of the ground-truth grid (the product only modelled the recovered-relevant cell).

    Cell table, where `cost = ladder.disruption_cost(action.value)`:

        any cell, DISPATCHING action   -> 0.0
            The call is let through, so nothing is prevented and no turn is withheld.
        truly_minted AND mattered, withheld:
            recovered                   -> 1.0 - cost   (verifier flips, minus the tax)
            not recovered               -> 0.0 - cost   (tax paid, nothing gained)
        truly_minted AND NOT mattered, withheld (the dangerous cell) -> -cost
        NOT truly_minted (false flag), withheld                      -> -cost

    The 0.0 for a dispatched relevant mint rests on the irreversibility premise stated in
    docs/143 §1: a minted write that lands cannot be un-committed, so only a withholding
    rung can prevent the corruption. The model is deliberately conservative (no partial
    credit for a dispatched mint) so it cannot flatter the intervention.

    Args:
        case: the labelled replay case.
        action: the intervention the policy chose; only `action.value` and its identity
            against `Intervention.DEFER` are read.
        ladder: supplies `disruption_cost(token)` and `dispatches(token)`; the cost is
            always read from here, never hardcoded per rung.

    Returns:
        The signed contribution of this case to `sum_delta`.
    """
    cost = ladder.disruption_cost(action.value)

    # A dispatching rung scores 0.0 in every cell, so settle it before looking at labels.
    if ladder.dispatches(action.value):
        return 0.0

    # Withheld on a catch that did not matter, or on a false flag: pure disruption tax.
    if not (case.truly_minted and case.mattered_to_score):
        return -cost

    # Withheld on a relevant mint: the outcome depends on the arm that matches the action.
    # NOTE(review): only DEFER reads the turn-spending label; any other withholding rung
    # (including a host-added one) falls back to `recovered_if_blocked` — confirm intended.
    recovered = (
        case.recovered_if_deferred
        if action is Intervention.DEFER
        else case.recovered_if_blocked
    )
    return (1.0 - cost) if recovered else (0.0 - cost)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ---------------------------------------------------------------------------
|
|
169
|
+
# The report — frozen, @property rates with div-guard, to_dict (mirror overlap_eval).
|
|
170
|
+
# ---------------------------------------------------------------------------
|
|
171
|
+
@dataclass(frozen=True)
class InterventionReport:
    """A policy scored over labelled cases: the net-delta ledger plus dangerous-cell rates.

    The counters fall into two groups: the ground-truth crosstab (independent of the
    chosen action) and the actuation ledger (what the policy actually did). The named
    dangerous cell is `actuated_irrelevant` — disruption spent on a true catch the
    verifier never checked.

    All derived rates are properties that return 0.0 when their denominator is zero.
    """

    n: int
    sum_delta: float
    sum_disruption_cost: float  # accumulated disruption tax over ACTUATED actions
    # ground-truth grid (independent of the chosen action):
    n_true_relevant: int  # truly_minted AND mattered_to_score
    n_true_irrelevant: int  # truly_minted AND NOT mattered (the dangerous-cell denom)
    n_false_flag: int  # NOT truly_minted
    # actuation ledger (did the chosen action WITHHOLD the turn?):
    n_actuated: int  # actions where ladder.actuates() (turn at risk)
    n_informed_only: int  # OBSERVE/WARN — turn preserved
    actuated_irrelevant: int  # actuated on a true_irrelevant case (the −9 pp cell)
    actuated_false_flag: int  # actuated on a false_flag
    n_actuated_relevant: int  # actuated on a true_relevant case
    recovered: int  # actuated true_relevant that recovered

    # --- derived rates (all guard against divide-by-zero) ---

    @staticmethod
    def _ratio(numerator: float, denominator: float) -> float:
        """Return `numerator / denominator`, or 0.0 when the denominator is zero."""
        return (numerator / denominator) if denominator else 0.0

    @property
    def net_task_delta(self) -> float:
        """The headline: mean net task-delta per case (`sum_delta / n`), in verifier-flip
        units. Negative means the policy is a net regression."""
        return self._ratio(self.sum_delta, self.n)

    @property
    def disruption_efficiency(self) -> float:
        """Of the turns the policy actuated (withheld), the fraction that bought a real
        gain (`recovered / n_actuated`). High means disruption was well spent."""
        return self._ratio(self.recovered, self.n_actuated)

    @property
    def wasted_disruption_rate(self) -> float:
        """Of the turns the policy actuated, the fraction spent on a catch that did not
        matter OR on a false flag. The dangerous-cell rate (the
        `overlap_eval.false_admit_rate` analogue): when this policy disrupts, how often
        is it for nothing?"""
        wasted = self.actuated_irrelevant + self.actuated_false_flag
        return self._ratio(wasted, self.n_actuated)

    @property
    def dangerous_cell_rate(self) -> float:
        """Of all true-but-irrelevant catches, the fraction the policy actuated on
        (`actuated_irrelevant / n_true_irrelevant`)."""
        return self._ratio(self.actuated_irrelevant, self.n_true_irrelevant)

    @property
    def coverage(self) -> float:
        """Of all true-relevant mints, the fraction the policy actuated on
        (`n_actuated_relevant / n_true_relevant`) — recall-of-action. This is the
        counterweight to `wasted_disruption_rate`: a policy that never withholds has no
        waste but also zero coverage."""
        return self._ratio(self.n_actuated_relevant, self.n_true_relevant)

    @property
    def net_harmful(self) -> bool:
        """True iff the policy is a net regression (`net_task_delta < 0`). Uses the
        unrounded value, so a tiny negative delta still counts."""
        return self.net_task_delta < 0.0

    def to_dict(self) -> dict:
        """Return a plain-dict view of the report with every rate rounded to 4 places.

        `net_task_delta` appears both at the top level and under `"rates"`; raw counters
        are grouped under `"grid"` and `"actuation"`.
        """
        net = round(self.net_task_delta, 4)
        return {
            "n": self.n,
            "net_task_delta": net,
            "grid": {
                "true_relevant": self.n_true_relevant,
                "true_irrelevant": self.n_true_irrelevant,
                "false_flag": self.n_false_flag,
            },
            "actuation": {
                "actuated": self.n_actuated,
                "informed_only": self.n_informed_only,
                "actuated_relevant": self.n_actuated_relevant,
                "actuated_irrelevant": self.actuated_irrelevant,
                "actuated_false_flag": self.actuated_false_flag,
                "recovered": self.recovered,
            },
            "rates": {
                "net_task_delta": net,
                "disruption_efficiency": round(self.disruption_efficiency, 4),
                "wasted_disruption_rate": round(self.wasted_disruption_rate, 4),
                "dangerous_cell_rate": round(self.dangerous_cell_rate, 4),
                "coverage": round(self.coverage, 4),
            },
            "sum_disruption_cost": round(self.sum_disruption_cost, 4),
            "net_harmful": self.net_harmful,
        }
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _safe_decision(
    verdict: ProvenanceVerdict, policy: InterventionPolicy, ladder: InterventionLadder
) -> InterventionDecision:
    """Run `choose_intervention`, degrading to the ladder's default rung if it raises.

    This is a fail-to-least-disruptive posture: a flaky policy contributes the ladder's
    default rung (WARN, per the original documentation of the base ladder) instead of
    crashing the evaluation, so the report still accounts for the case.

    Args:
        verdict: the detector verdict to decide on.
        policy: the policy under evaluation.
        ladder: the intervention ladder; also the source of the fallback rung and cost.

    Returns:
        The decision from `choose_intervention`, or a synthesized LOW-confidence decision
        on `ladder.default()` when the call raised.
    """
    try:
        return choose_intervention(verdict, policy, ladder)
    except Exception:
        # Deliberately broad: any failure of the policy path must degrade, not abort the
        # whole scoring pass.
        # NOTE(review): the fallback itself can still raise (e.g. if `spec.token` is not
        # a member of `Intervention`, or `verdict` lacks `.unsupported`) — confirm that
        # is acceptable for host-supplied ladders.
        spec = ladder.default()
        return InterventionDecision(
            intervention=Intervention(spec.token),
            confidence=Confidence.LOW,
            rung=spec,
            disruption_cost=ladder.disruption_cost(spec.token),
            unsupported=verdict.unsupported,
            reason="fail-safe: policy raised → ladder default",
        )
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def score(
    policy: InterventionPolicy,
    cases: Iterable[InterventionCase],
    ladder: InterventionLadder = BASE_INTERVENTIONS,
) -> InterventionReport:
    """Run `policy` over labelled `cases` and tabulate the intervention ledger.

    Each case's action is obtained through `_safe_decision` (and so through
    `choose_intervention`), which means the grid reflects what would actually be enacted,
    fail-safe fallback included. The function performs no I/O: it reads the cases, reads
    the ladder, and counts in a single pass. `cases` is iterated exactly once, so a
    generator is acceptable.

    Actuation buckets are driven by `ladder.actuates(token)` rather than a hardcoded set
    of rungs, so a host-added rung is bucketed by the ladder's own data.

    Invariant: every actuated case increments exactly one of `n_actuated_relevant`,
    `actuated_irrelevant` or `actuated_false_flag`, so
    `n_actuated == n_actuated_relevant + actuated_irrelevant + actuated_false_flag`.
    The counters are updated in the same pass as `sum_delta`, so they cannot drift apart.

    Args:
        policy: the intervention policy under evaluation.
        cases: labelled replay cases.
        ladder: the intervention ladder; defaults to `BASE_INTERVENTIONS`.

    Returns:
        An `InterventionReport` holding the raw counters and sums.
    """
    n = 0
    sum_delta = 0.0
    sum_disruption = 0.0
    n_true_relevant = n_true_irrelevant = n_false_flag = 0
    n_actuated = n_informed_only = 0
    actuated_irrelevant = actuated_false_flag = n_actuated_relevant = recovered = 0

    for case in cases:
        n += 1
        action = _safe_decision(case.verdict, policy, ladder).intervention
        actuates = ladder.actuates(action.value)
        sum_delta += _case_delta(case, action, ladder)

        # Classify the ground-truth cell once; both ledgers below reuse it.
        relevant = bool(case.truly_minted and case.mattered_to_score)
        irrelevant = bool(case.truly_minted) and not relevant

        # Ground-truth grid (action-independent).
        if relevant:
            n_true_relevant += 1
        elif irrelevant:
            n_true_irrelevant += 1
        else:
            n_false_flag += 1

        # Actuation ledger (what the policy DID).
        if not actuates:
            n_informed_only += 1
            continue

        n_actuated += 1
        sum_disruption += ladder.disruption_cost(action.value)
        if relevant:
            n_actuated_relevant += 1
            # Read the recovery label from the replay arm that matches the action:
            # DEFER uses the turn-spending arm, anything else the turn-preserving arm.
            recovered_under_action = (
                case.recovered_if_deferred
                if action is Intervention.DEFER
                else case.recovered_if_blocked
            )
            if recovered_under_action:
                recovered += 1
        elif irrelevant:
            actuated_irrelevant += 1
        else:
            actuated_false_flag += 1

    return InterventionReport(
        n=n,
        sum_delta=sum_delta,
        sum_disruption_cost=sum_disruption,
        n_true_relevant=n_true_relevant,
        n_true_irrelevant=n_true_irrelevant,
        n_false_flag=n_false_flag,
        n_actuated=n_actuated,
        n_informed_only=n_informed_only,
        actuated_irrelevant=actuated_irrelevant,
        actuated_false_flag=actuated_false_flag,
        n_actuated_relevant=n_actuated_relevant,
        recovered=recovered,
    )
|