dos-kernel 0.22.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. dos/__init__.py +261 -0
  2. dos/_bin/dos-hook.exe +0 -0
  3. dos/_filelock.py +255 -0
  4. dos/_job_policy.py +97 -0
  5. dos/_tree.py +145 -0
  6. dos/admission.py +433 -0
  7. dos/answer_shape.py +299 -0
  8. dos/arbiter.py +859 -0
  9. dos/archive_lock.py +266 -0
  10. dos/arg_provenance.py +814 -0
  11. dos/attest.py +472 -0
  12. dos/breaker.py +311 -0
  13. dos/churn.py +226 -0
  14. dos/claim_extract.py +229 -0
  15. dos/claim_ttl.py +150 -0
  16. dos/cli.py +8721 -0
  17. dos/commit_audit.py +666 -0
  18. dos/completion.py +466 -0
  19. dos/concurrency_class.py +154 -0
  20. dos/config.py +1380 -0
  21. dos/config_lint.py +464 -0
  22. dos/cooldown.py +390 -0
  23. dos/coverage.py +387 -0
  24. dos/dangling_intent.py +287 -0
  25. dos/data_class.py +397 -0
  26. dos/decisions.py +1274 -0
  27. dos/decisions_tui.py +251 -0
  28. dos/dispatch_top.py +740 -0
  29. dos/dispatch_top_tui.py +116 -0
  30. dos/drivers/__init__.py +40 -0
  31. dos/drivers/ci_status.py +630 -0
  32. dos/drivers/citation_resolve.py +703 -0
  33. dos/drivers/decision_stop.py +98 -0
  34. dos/drivers/export_file.py +173 -0
  35. dos/drivers/export_otlp.py +275 -0
  36. dos/drivers/export_statsd.py +242 -0
  37. dos/drivers/hook_dialects.py +391 -0
  38. dos/drivers/job.py +47 -0
  39. dos/drivers/llm_judge.py +360 -0
  40. dos/drivers/memory_recall.py +1231 -0
  41. dos/drivers/notify_slack.py +373 -0
  42. dos/drivers/notify_webhook.py +251 -0
  43. dos/drivers/operator_judge.py +114 -0
  44. dos/drivers/os_acceptance.py +228 -0
  45. dos/drivers/paste_log.py +132 -0
  46. dos/drivers/plan_scope.py +133 -0
  47. dos/drivers/self_improve.py +375 -0
  48. dos/drivers/similarity_judge.py +249 -0
  49. dos/drivers/state_diff.py +274 -0
  50. dos/drivers/supervisor.py +347 -0
  51. dos/drivers/watchdog.py +363 -0
  52. dos/drivers/workshop.py +160 -0
  53. dos/durable_schema.py +344 -0
  54. dos/effect_witness.py +393 -0
  55. dos/efficiency.py +318 -0
  56. dos/enforce.py +414 -0
  57. dos/enumerate.py +776 -0
  58. dos/env_print.py +378 -0
  59. dos/event_severity.py +258 -0
  60. dos/evidence.py +692 -0
  61. dos/exec_capability.py +256 -0
  62. dos/export_cursor.py +143 -0
  63. dos/exporter.py +320 -0
  64. dos/firing_label.py +353 -0
  65. dos/fleet_roll.py +226 -0
  66. dos/gate_classify.py +827 -0
  67. dos/gh4_coverage.py +179 -0
  68. dos/git_delta.py +122 -0
  69. dos/guard.py +215 -0
  70. dos/health.py +552 -0
  71. dos/help_summary.py +519 -0
  72. dos/home.py +934 -0
  73. dos/hook_binary.py +194 -0
  74. dos/hook_dialect.py +271 -0
  75. dos/hook_exit.py +191 -0
  76. dos/hook_install.py +437 -0
  77. dos/id_alloc.py +304 -0
  78. dos/improve.py +499 -0
  79. dos/intent_ledger.py +635 -0
  80. dos/interpret.py +176 -0
  81. dos/intervention.py +769 -0
  82. dos/intervention_eval.py +371 -0
  83. dos/journal_delta.py +308 -0
  84. dos/judge_eval.py +328 -0
  85. dos/judges.py +366 -0
  86. dos/lane_infer.py +127 -0
  87. dos/lane_journal.py +1001 -0
  88. dos/lane_lease.py +952 -0
  89. dos/lane_overlap.py +228 -0
  90. dos/lease_health.py +282 -0
  91. dos/lifecycle.py +211 -0
  92. dos/liveness.py +352 -0
  93. dos/lock_modes.py +185 -0
  94. dos/log_source.py +395 -0
  95. dos/loop_decide.py +1746 -0
  96. dos/marker_gate.py +254 -0
  97. dos/marker_sensor.py +396 -0
  98. dos/noop_streak.py +280 -0
  99. dos/notify.py +479 -0
  100. dos/observe.py +175 -0
  101. dos/oracle.py +1661 -0
  102. dos/overlap_eval.py +214 -0
  103. dos/overlap_policy.py +342 -0
  104. dos/packet_sidecar.py +267 -0
  105. dos/phase_shipped.py +1985 -0
  106. dos/pick_priority.py +225 -0
  107. dos/pickable.py +369 -0
  108. dos/picker_oracle.py +1037 -0
  109. dos/plan_board.py +513 -0
  110. dos/plan_board_tui.py +113 -0
  111. dos/plan_source.py +455 -0
  112. dos/posttool_sensor.py +528 -0
  113. dos/precursor_gate.py +499 -0
  114. dos/precursor_gate_eval.py +239 -0
  115. dos/preflight.py +825 -0
  116. dos/pretool_sensor.py +490 -0
  117. dos/proc_delta.py +181 -0
  118. dos/productivity.py +296 -0
  119. dos/provider_limit.py +242 -0
  120. dos/py.typed +4 -0
  121. dos/reason_morphology.py +299 -0
  122. dos/reasons.py +449 -0
  123. dos/reconcile.py +173 -0
  124. dos/recurring_wedge.py +206 -0
  125. dos/render.py +393 -0
  126. dos/result_state.py +468 -0
  127. dos/resume.py +578 -0
  128. dos/resume_evidence.py +293 -0
  129. dos/retention.py +344 -0
  130. dos/reward.py +372 -0
  131. dos/rewind.py +587 -0
  132. dos/rewind_evidence.py +168 -0
  133. dos/rewind_tokens.py +252 -0
  134. dos/run_id.py +342 -0
  135. dos/scope.py +520 -0
  136. dos/scope_source.py +382 -0
  137. dos/scout.py +982 -0
  138. dos/self_modify.py +209 -0
  139. dos/sibling_scan.py +569 -0
  140. dos/skills/EXAMPLES.md +584 -0
  141. dos/skills/dos-class-cycle/SKILL.md +107 -0
  142. dos/skills/dos-dispatch/SKILL.md +177 -0
  143. dos/skills/dos-dispatch-loop/SKILL.md +254 -0
  144. dos/skills/dos-goal-gate/SKILL.md +269 -0
  145. dos/skills/dos-next-up/SKILL.md +231 -0
  146. dos/skills/dos-promote/SKILL.md +114 -0
  147. dos/skills/dos-replan/SKILL.md +159 -0
  148. dos/skills/dos-replan-loop/SKILL.md +114 -0
  149. dos/skills/dos-self-improve/SKILL.md +213 -0
  150. dos/skills/dos-supervise-loop/SKILL.md +180 -0
  151. dos/skills/dos-unstick/SKILL.md +108 -0
  152. dos/skills/dos-witness-claim/SKILL.md +251 -0
  153. dos/stamp.py +1002 -0
  154. dos/state_health.py +387 -0
  155. dos/status.py +114 -0
  156. dos/stop_policy.py +334 -0
  157. dos/supervise.py +1014 -0
  158. dos/testwitness.py +392 -0
  159. dos/timeline.py +1027 -0
  160. dos/tokens.py +485 -0
  161. dos/tool_stream.py +393 -0
  162. dos/tool_stream_eval.py +226 -0
  163. dos/trace.py +524 -0
  164. dos/verdict.py +140 -0
  165. dos/verdict_cli.py +189 -0
  166. dos/verdict_journal.py +497 -0
  167. dos/verdict_rollup.py +217 -0
  168. dos/verdicts.py +181 -0
  169. dos/wedge_reason.py +282 -0
  170. dos_kernel-0.22.0.dist-info/METADATA +859 -0
  171. dos_kernel-0.22.0.dist-info/RECORD +178 -0
  172. dos_kernel-0.22.0.dist-info/WHEEL +5 -0
  173. dos_kernel-0.22.0.dist-info/entry_points.txt +39 -0
  174. dos_kernel-0.22.0.dist-info/licenses/LICENSE +21 -0
  175. dos_kernel-0.22.0.dist-info/top_level.txt +2 -0
  176. dos_mcp/__init__.py +52 -0
  177. dos_mcp/py.typed +2 -0
  178. dos_mcp/server.py +779 -0
@@ -0,0 +1,371 @@
1
+ """The intervention-evaluation harness — score an actuation policy by its NET TASK DELTA.
2
+
3
+ docs/143 §13.2 — the missing instrument. Every other DOS axis ships an eval: `judge_eval`
4
+ scores a judge, `overlap_eval` scores a disjointness scorer, and `arg_provenance` shipped a
5
+ *detector* eval (precision/recall over minted-vs-resolved ids). The live benchmark run
6
+ proved the decisive number is none of those: a detector that was *sound* (0 % false-nudge,
7
+ 83 % recall) was still **net-harmful** (−9 pp) because the *intervention* it triggered was
8
+ too disruptive (RESULTS.md "⚑ KEY DATA POINT"). So the number that decides deployment is
9
+ not "was the verdict right?" — `arg_provenance`'s eval already answers that — but **"did
10
+ ACTING on the verdict help or hurt the run?"**
11
+
12
+ This module is that instrument: the friendliness gauge for the PEP, the way `overlap_eval`
13
+ is for admission. Bring an `InterventionPolicy` (the confidence-gating knobs), bring a
14
+ corpus of replayed verdicts each labelled with the GROUND-TRUTH outcome of acting on it,
15
+ and get back the headline `net_task_delta` plus the dangerous-cell rates a PEP author
16
+ actually cares about — chiefly **wasted-disruption** (when this policy disrupts, how often
17
+ is it spent on a catch that did not matter — the exact source of the −9 pp).
18
+
19
+ The honesty stance (the same as judge_eval / overlap_eval)
20
+ ==========================================================
21
+
22
+ The labels are the RESEARCHER's ground truth, derived from EXECUTED replay arms, never from
23
+ the detector. Specifically:
24
+
25
+ * `truly_minted` — was the flagged id ACTUALLY a mint? (the controlled mint-injection
26
+ knows; a false-flag has this False). The `overlap_eval.collided` "did it actually
27
+ collide" discipline.
28
+ * `mattered_to_score` — did this FK feed a hidden SQL verifier the run was scored on? From
29
+ the verifier set, not the wrapper. This is the −9 pp axis: a true catch the verifier
30
+ never checked buys nothing, so disrupting on it is pure cost.
31
+ * `recovered_if_blocked` / `recovered_if_deferred` — COUNTERFACTUAL ground truth from the
32
+ two EXECUTED A/B arms (a turn-preserving intervention vs a turn-spending one), NOT a
33
+ guessed label. The live run measured the turn-spending recovery at ~75 % (48/64,
34
+ RESULTS.md line 104); a turn-preserving BLOCK is expected higher (it costs no turn).
35
+
36
+ Everything here is **pure**: it consumes already-built `InterventionCase`s, runs the policy
37
+ through `intervention.choose_intervention` (the SAME path the consumer's PEP takes, so the
38
+ grid reflects what would actually be enacted — the `overlap_eval` "score under the floor"
39
+ discipline), and counts in one pass. No I/O, no host names — it sits in the kernel layer
40
+ beside `intervention`.
41
+
42
+ ⚠ This is NOT a detector eval. `arg_provenance` precision/recall measures the verdict;
43
+ THIS measures the intervention. The two are orthogonal (the §13 thesis), so they are
44
+ separate instruments by design.
45
+ """
46
+
47
+ from __future__ import annotations
48
+
49
+ from dataclasses import dataclass
50
+ from typing import Iterable
51
+
52
+ from dos.arg_provenance import ProvenanceVerdict
53
+ from dos.intervention import (
54
+ BASE_INTERVENTIONS,
55
+ Confidence,
56
+ Intervention,
57
+ InterventionDecision,
58
+ InterventionLadder,
59
+ InterventionPolicy,
60
+ choose_intervention,
61
+ )
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # A labelled example — one replayed verdict + the GROUND-TRUTH outcome of acting.
66
+ # ---------------------------------------------------------------------------
67
@dataclass(frozen=True)
class InterventionCase:
    """A single replayed detector verdict paired with its ground-truth outcome labels.

    No confidence value lives on the case. `score()` derives the enacted action by
    running the embedded `verdict` through `choose_intervention`, which keeps the
    scored action from drifting away from a hand-written confidence label. All the
    boolean fields are ground truth taken from EXECUTED replay arms rather than
    guesses (the same honesty discipline as `overlap_eval.collided`).

    Attributes:
        verdict: The real `ProvenanceVerdict` the detector emitted. The policy is
            scored against this object through `choose_intervention`, i.e. the same
            path the consumer's PEP takes.
        truly_minted: Whether the flagged id really was a mint. False means the
            detector false-flagged a legitimate derived/resolved id.
        mattered_to_score: Whether this FK fed a verifier the run was scored on
            (the −9 pp axis: a true catch the verifier never checks buys nothing,
            so disrupting on it is pure cost).
        recovered_if_blocked: Counterfactual from the turn-PRESERVING arm
            (WARN/BLOCK): did the agent recover, i.e. resolve the id correctly?
        recovered_if_deferred: Counterfactual from the turn-SPENDING arm (DEFER):
            did the agent recover after a re-prompt that cost the turn
            (the live ~75 %)?
        label: Optional human-readable handle; carried along, never scored.
    """

    # Detector output the policy under test is evaluated against.
    verdict: ProvenanceVerdict
    # Ground truth: the flagged id was actually minted.
    truly_minted: bool
    # Ground truth: a scored verifier depended on this FK.
    mattered_to_score: bool
    # Counterfactual recovery under a turn-preserving intervention.
    recovered_if_blocked: bool
    # Counterfactual recovery under a turn-spending intervention.
    recovered_if_deferred: bool
    # Free-form tag for humans; ignored by the scorer.
    label: str = ""
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # The per-case net-delta ledger — the §13.2 formula, honestly generalized.
104
+ # ---------------------------------------------------------------------------
105
def _case_delta(
    case: InterventionCase, action: Intervention, ladder: InterventionLadder
) -> float:
    """Return what one `(case, action)` pair adds to the net task-delta.

    The unit is "one task verifier". This extends the docs/143 §13.2 product
    `caught × recovered × (1 − disruption_cost)` to every ground-truth cell; the
    product only described the recovered-relevant cell, while the −9 pp regression
    sits in a cell it cannot express.

    Accounting rules:

    * Preventing a corruption that mattered is worth `+(1 − cost)`: a verifier
      flips fail→pass, less the disruption tax.
    * `cost` is charged whenever the action withholds the turn, whether or not the
      agent recovers.
    * A dispatching action (OBSERVE/WARN) lets the possibly-minted call land, so it
      earns no prevention value and is charged nothing here.

    Irreversibility premise (load-bearing): the benchmark mutates a shared DB where
    every action is permanent (docs/143 §1). Once a dispatching action lets a
    minted, relevant write land, the scored final state is already corrupted, and a
    later "correction" is a second write rather than a repair. Only a withholding
    rung (BLOCK/DEFER) can therefore prevent the corruption.

    Cells:
        truly_minted ∧ mattered
            withholding → `1 − cost` if the agent recovered, otherwise `0 − cost`.
            dispatching → `0.0`.
        truly_minted ∧ ¬mattered (the dangerous cell) and ¬truly_minted (false flag)
            withholding → `−cost` (pure waste); dispatching → `0.0`.

    `cost` always comes from `ladder.disruption_cost(...)`, never from a per-rung
    constant, so a retuned ladder reweights the eval. The model is deliberately
    conservative: a dispatched mint gets zero, not partial credit.

    Args:
        case: The labelled replay case.
        action: The intervention the policy chose for this case.
        ladder: Supplies the disruption cost and the dispatch flag for the rung.

    Returns:
        The signed contribution of this case to `sum_delta`.
    """
    rung = action.value
    tax = ladder.disruption_cost(rung)
    lands = ladder.dispatches(rung)

    # DEFER is scored against the turn-spending arm; every other action is scored
    # against the turn-preserving arm.
    if action is Intervention.DEFER:
        agent_recovered = case.recovered_if_deferred
    else:
        agent_recovered = case.recovered_if_blocked

    relevant_mint = case.truly_minted and case.mattered_to_score
    if not relevant_mint:
        # True-but-irrelevant catch or false flag: nothing to win, so withholding
        # the turn is pure cost and dispatching is neutral.
        if lands:
            return 0.0
        return -tax

    if lands:
        # OBSERVE/WARN: the minted write landed on an irreversible DB → 0 prevention.
        return 0.0

    # DEFER/BLOCK: the mutation was withheld, so prevention was possible.
    if agent_recovered:
        return 1.0 - tax
    return 0.0 - tax
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # The report — frozen, @property rates with div-guard, to_dict (mirror overlap_eval).
170
+ # ---------------------------------------------------------------------------
171
@dataclass(frozen=True)
class InterventionReport:
    """Result of scoring one policy over labelled cases.

    The raw counts come in two groups: a ground-truth crosstab that does not depend
    on the chosen action, and an actuation ledger recording what the policy DID.
    The named dangerous cell is `actuated_irrelevant` — disruption spent on a true
    catch the verifier never checked (the −9 pp cell). Every derived rate returns
    0.0 instead of dividing by a zero denominator.
    """

    n: int  # number of cases scored
    sum_delta: float  # sum of per-case net deltas
    sum_disruption_cost: float  # accumulated disruption tax over ACTUATED actions
    # --- ground-truth grid (independent of the chosen action) ---
    n_true_relevant: int  # truly_minted AND mattered_to_score
    n_true_irrelevant: int  # truly_minted AND NOT mattered (dangerous-cell denominator)
    n_false_flag: int  # NOT truly_minted
    # --- actuation ledger (did the chosen action WITHHOLD the turn?) ---
    n_actuated: int  # actions where ladder.actuates() held (turn at risk)
    n_informed_only: int  # OBSERVE/WARN — turn preserved
    actuated_irrelevant: int  # actuated on a true_irrelevant case (the −9 pp cell)
    actuated_false_flag: int  # actuated on a false_flag case
    n_actuated_relevant: int  # actuated on a true_relevant case
    recovered: int  # actuated true_relevant cases that recovered

    @property
    def net_task_delta(self) -> float:
        """Headline number: mean net task-delta per case, in verifier-flip units.

        Directly comparable to the live −9 pp (a net regression) and the
        simulator's +11 pp.
        """
        if not self.n:
            return 0.0
        return self.sum_delta / self.n

    @property
    def disruption_efficiency(self) -> float:
        """Share of actuated (withheld) turns that bought a recovered relevant catch.

        High means the disruption was well spent.
        """
        if not self.n_actuated:
            return 0.0
        return self.recovered / self.n_actuated

    @property
    def wasted_disruption_rate(self) -> float:
        """Share of actuated turns spent on an irrelevant catch or a false flag.

        This is the dangerous-cell rate (the `overlap_eval.false_admit_rate`
        analogue): when the policy disrupts, how often is it for nothing?
        """
        if not self.n_actuated:
            return 0.0
        wasted = self.actuated_irrelevant + self.actuated_false_flag
        return wasted / self.n_actuated

    @property
    def dangerous_cell_rate(self) -> float:
        """Share of true-but-IRRELEVANT catches that the policy actuated on."""
        if not self.n_true_irrelevant:
            return 0.0
        return self.actuated_irrelevant / self.n_true_irrelevant

    @property
    def coverage(self) -> float:
        """Share of true-RELEVANT mints the policy actuated on (recall-of-action).

        An all-WARN policy never withholds and so scores ~0 here; this is the
        counterweight to `wasted_disruption_rate`.
        """
        if not self.n_true_relevant:
            return 0.0
        return self.n_actuated_relevant / self.n_true_relevant

    @property
    def net_harmful(self) -> bool:
        """True iff the policy is a net regression (`net_task_delta < 0`).

        The `dos intervention-eval` exit code rides on this boolean.
        """
        return self.net_task_delta < 0.0

    def to_dict(self) -> dict:
        """Return the report as a nested plain dict with rates rounded to 4 places.

        `net_task_delta` is emitted both at the top level and under `rates`.
        """
        headline = round(self.net_task_delta, 4)
        grid = {
            "true_relevant": self.n_true_relevant,
            "true_irrelevant": self.n_true_irrelevant,
            "false_flag": self.n_false_flag,
        }
        actuation = {
            "actuated": self.n_actuated,
            "informed_only": self.n_informed_only,
            "actuated_relevant": self.n_actuated_relevant,
            "actuated_irrelevant": self.actuated_irrelevant,
            "actuated_false_flag": self.actuated_false_flag,
            "recovered": self.recovered,
        }
        rates = {
            "net_task_delta": headline,
            "disruption_efficiency": round(self.disruption_efficiency, 4),
            "wasted_disruption_rate": round(self.wasted_disruption_rate, 4),
            "dangerous_cell_rate": round(self.dangerous_cell_rate, 4),
            "coverage": round(self.coverage, 4),
        }
        return {
            "n": self.n,
            "net_task_delta": headline,
            "grid": grid,
            "actuation": actuation,
            "rates": rates,
            "sum_disruption_cost": round(self.sum_disruption_cost, 4),
            "net_harmful": self.net_harmful,
        }
277
+
278
+
279
def _safe_decision(
    verdict: ProvenanceVerdict, policy: InterventionPolicy, ladder: InterventionLadder
) -> InterventionDecision:
    """Call `choose_intervention`, falling back to the ladder default if it raises.

    This is the fail-to-LEAST-DISRUPTIVE posture (under-intervene), parallel to
    `overlap_eval` failing closed to the floor and `judge_eval` failing to abstain.
    A policy that raises contributes the ladder's default rung (WARN, per the
    original design note) to the report instead of crashing the whole scoring run.

    Args:
        verdict: The detector verdict to decide on.
        policy: The intervention policy under evaluation.
        ladder: The intervention ladder; supplies the default rung and its cost.

    Returns:
        The policy's decision, or a LOW-confidence decision on the ladder's default
        rung when `choose_intervention` raised.
    """
    try:
        enacted = choose_intervention(verdict, policy, ladder)
    except Exception:
        # Deliberately broad: any policy failure degrades to the default rung.
        fallback = ladder.default()
        enacted = InterventionDecision(
            intervention=Intervention(fallback.token),
            confidence=Confidence.LOW,
            rung=fallback,
            disruption_cost=ladder.disruption_cost(fallback.token),
            unsupported=verdict.unsupported,
            reason="fail-safe: policy raised → ladder default",
        )
    return enacted
300
+
301
+
302
def score(
    policy: InterventionPolicy,
    cases: Iterable[InterventionCase],
    ladder: InterventionLadder = BASE_INTERVENTIONS,
) -> InterventionReport:
    """Score `policy` over labelled `cases` and return the tabulated ledger.

    Each case's action is obtained through `_safe_decision`, which wraps the same
    `choose_intervention` call the consumer's PEP uses, so the grid reflects what
    would actually be enacted, fail-safe included. The function reads the cases and
    the ladder and counts in a single pass. Whether an action counts as actuated is
    decided by `ladder.actuates()` rather than a hardcoded `{DEFER, BLOCK}` set, so
    a host-added rung is bucketed by the ladder's own data.

    Invariant (pinned by a test): `n_actuated == actuated_irrelevant +
    actuated_false_flag + n_actuated_relevant`. The counters and `sum_delta` are
    accumulated in the same loop iteration, so they cannot drift apart.

    Args:
        policy: The intervention policy to evaluate.
        cases: Labelled replay cases; consumed exactly once.
        ladder: The intervention ladder (defaults to `BASE_INTERVENTIONS`).

    Returns:
        An `InterventionReport` holding the raw counts and sums.
    """
    total = 0
    delta_total = 0.0
    tax_total = 0.0
    seen_relevant = seen_irrelevant = seen_false_flag = 0
    withheld = informed = 0
    withheld_relevant = withheld_irrelevant = withheld_false_flag = 0
    recoveries = 0

    for total, case in enumerate(cases, start=1):
        chosen = _safe_decision(case.verdict, policy, ladder).intervention
        withholds = ladder.actuates(chosen.value)
        delta_total += _case_delta(case, chosen, ladder)

        # Classify the ground-truth cell once; it does not depend on the action.
        if not case.truly_minted:
            is_relevant = is_irrelevant = False
            seen_false_flag += 1
        elif case.mattered_to_score:
            is_relevant, is_irrelevant = True, False
            seen_relevant += 1
        else:
            is_relevant, is_irrelevant = False, True
            seen_irrelevant += 1

        # Actuation ledger: what the policy DID with this case.
        if not withholds:
            informed += 1
            continue

        withheld += 1
        tax_total += ladder.disruption_cost(chosen.value)
        if is_relevant:
            withheld_relevant += 1
            # DEFER reads the turn-spending arm; other rungs read the
            # turn-preserving arm.
            if chosen is Intervention.DEFER:
                came_back = case.recovered_if_deferred
            else:
                came_back = case.recovered_if_blocked
            if came_back:
                recoveries += 1
        elif is_irrelevant:
            withheld_irrelevant += 1
        else:
            withheld_false_flag += 1

    return InterventionReport(
        n=total,
        sum_delta=delta_total,
        sum_disruption_cost=tax_total,
        n_true_relevant=seen_relevant,
        n_true_irrelevant=seen_irrelevant,
        n_false_flag=seen_false_flag,
        n_actuated=withheld,
        n_informed_only=informed,
        actuated_irrelevant=withheld_irrelevant,
        actuated_false_flag=withheld_false_flag,
        n_actuated_relevant=withheld_relevant,
        recovered=recoveries,
    )