hermeskill 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hermeskill/__init__.py ADDED
@@ -0,0 +1,57 @@
1
+ """Hermeskill SDK — apoptosis protocol core for AI agent supervision.
2
+
3
+ Framework-agnostic core: WatcherState, symptom checks, death certificates,
4
+ kill-event client, operator CLI. Install a framework adapter on top:
5
+
6
+ pip install hermeskill-hermes # Hermes Agent plugin (recommended)
7
+
8
+ The bare `hermeskill` package imports with no third-party agent-framework
9
+ dependencies.
10
+
11
+ Public exceptions:
12
+
13
+ from hermeskill import HermeskillTerminated
14
+
15
+ # Raised by framework adapters and `checkpoint()` when the agent is
16
+ # killed by Hermeskill. Catch at your top-level run loop if you need
17
+ # cleanup before exit.
18
+
19
+ `checkpoint()` is a cooperative termination point for custom run loops;
20
+ raises HermeskillTerminated if a kill directive is pending.
21
+ """
22
+
23
+ from hermeskill._version import __version__
24
+ from hermeskill.calibration import LabeledKill, build_calibration_report
25
+ from hermeskill.exceptions import HermeskillError, HermeskillTerminated
26
+ from hermeskill.supervisor import Heartbeat, ProcessSupervisor, SupervisorResult
27
+
28
+ __all__ = [
29
+ "Heartbeat",
30
+ "HermeskillError",
31
+ "HermeskillTerminated",
32
+ "LabeledKill",
33
+ "ProcessSupervisor",
34
+ "SupervisorResult",
35
+ "__version__",
36
+ "build_calibration_report",
37
+ "checkpoint",
38
+ ]
39
+
40
+
41
def checkpoint() -> None:
    """Give Hermeskill a chance to stop the agent from inside a custom run loop.

    Walks every state returned by `all_watchers()` and raises for the first
    one whose `terminate_requested` flag is truthy. When no state is flagged
    (including when there are no states at all) the call returns `None`.

    Raises:
        HermeskillTerminated: carrying the state's `terminate_reason` (or the
            literal "terminated" when that is empty) and its
            `terminate_kill_event_id` as `kill_event_id`.
    """
    # Imports are deliberately function-local, as in the original.
    # NOTE(review): presumably this avoids an import cycle with the watcher
    # module — confirm before hoisting them to module level.
    from hermeskill.exceptions import HermeskillTerminated
    from hermeskill.watcher import all_watchers

    # Lazy generator: stops at the first flagged watcher, like the loop did.
    doomed = next(
        (watcher for watcher in all_watchers() if watcher.terminate_requested),
        None,
    )
    if doomed is None:
        return
    raise HermeskillTerminated(
        doomed.terminate_reason or "terminated",
        kill_event_id=doomed.terminate_kill_event_id,
    )
hermeskill/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0a1"
@@ -0,0 +1,342 @@
1
+ """L2 forced-termination watchdog.
2
+
3
+ L1 (cooperative termination) is handled by the framework adapter — the
4
+ kill stub or checkpoint raise at tool/chain boundaries. It works as long
5
+ as the agent's event loop is alive and reaching await points. When it
6
+ isn't — agent is wedged inside a sync tool, or stubbornly ignoring the
7
+ cooperative signal — we need an out-of-band path that can cancel from
8
+ outside the loop.
9
+
10
+ That's L2: **one daemon `threading.Thread` per watched agent**, holding a
11
+ reference to the agent's asyncio loop and main `Task`. The thread sleeps
12
+ on `state._terminate_event`. When apoptosis fires, it waits the policy's
13
+ `cooperative_grace_seconds`, checks whether the task finished on its own
14
+ (L1 worked → no escalation), and if not, calls
15
+ `loop.call_soon_threadsafe(task.cancel)` — scheduling cancellation from
16
+ *outside* the loop, which is the part that defeats the wedged-loop case.
17
+
18
+ **Why a thread, not an asyncio task.** If we scheduled the L2 timer with
19
+ `asyncio.create_task(...)` in the agent's own loop, it would queue
20
+ behind whatever's blocking that loop — i.e. behind the very thing it's
21
+ trying to interrupt. Same-loop scheduling defeats the entire purpose.
22
+ Run as a thread, run outside the loop. *Do not* refactor this back into
23
+ the loop in a future cleanup pass — leave this comment as ballast.
24
+
25
+ **Honest limitation.** `task.cancel()` raises CancelledError at the next
26
+ *await point*. If an agent is wedged in pure-Python CPU code (`while
27
+ True: pass` inside a sync tool with no awaits anywhere reachable), the
28
+ cancellation will not fire — Python provides no portable way to
29
+ interrupt a thread mid-bytecode. The watchdog logs the escalation
30
+ attempt; in that case the only real recourse is killing the OS process
31
+ (operator escalation, M3 webhook fires, M5 grants document the case).
32
+ The watchdog still handles the realistic case (async tool wedged on a
33
+ slow network call ignoring cooperative shutdown) — which is what the
34
+ plan's "blocked-loop test" intends to exercise.
35
+
36
+ Public surface: `Watchdog(state, grace_seconds)`, `.arm(loop, task)`,
37
+ `.stop()`. Idempotent arming — call from `on_chain_start` every time;
38
+ the first call starts the thread, later calls just refresh the captured
39
+ loop + task in case a new invocation runs in a different task.
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import asyncio
45
+ import logging
46
+ import threading
47
+ import time
48
+ from datetime import UTC, datetime
49
+ from typing import TYPE_CHECKING
50
+
51
+ from hermeskill.types import (
52
+ DeathCertificate,
53
+ KillEventIn,
54
+ ShutdownLogEntry,
55
+ TriggerType,
56
+ )
57
+
58
+ if TYPE_CHECKING:
59
+ from hermeskill.watcher import WatcherState
60
+
61
+ logger = logging.getLogger("hermeskill.apoptosis")
62
+
63
+
64
class Watchdog:
    """L2 forced-termination thread. One per `WatcherState`.

    The watchdog owns a single daemon thread that waits for the state's
    `_terminate_event` (or its `terminate_requested` flag), gives the agent
    `grace_seconds` to finish on its own, and otherwise schedules
    `task.cancel` on the captured loop via `call_soon_threadsafe`.

    Public surface: `arm(loop, task)` and `stop()`.
    """

    # Polling cadence for the thread's main wait + grace-period loops.
    # Trades responsiveness against wakeup cost; 100ms is plenty fast for
    # human-perceptible kill latency without burning CPU on idle agents.
    _POLL_SECONDS = 0.1

    def __init__(self, state: WatcherState, *, grace_seconds: float) -> None:
        """Bind the watchdog to `state`; no thread is started until `arm()`.

        Args:
            state: The watcher state whose termination signal is observed.
            grace_seconds: How long to wait for cooperative termination
                before forcing cancellation.
        """
        self.state = state
        self.grace_seconds = grace_seconds
        self._loop: asyncio.AbstractEventLoop | None = None
        self._task: asyncio.Task[object] | None = None
        self._thread: threading.Thread | None = None
        self._stop = threading.Event()
        # Guards the loop/task slots. Cheap — only touched on arm() + on
        # transitions inside _run().
        self._lock = threading.Lock()
        # True iff we've already issued a call_soon_threadsafe(task.cancel)
        # for this kill; prevents double-cancel on long-grace policies.
        self._escalated = False

    # --- public API -------------------------------------------------------

    def arm(
        self,
        loop: asyncio.AbstractEventLoop,
        task: asyncio.Task[object],
    ) -> None:
        """Capture the loop + task to watch. Idempotent.

        On first call: starts the daemon thread.
        On later calls: refreshes the loop/task slots (a new ainvoke may
        run in a different task than the previous one).

        Safe to call from any thread, including the agent's own loop.
        """
        with self._lock:
            self._loop = loop
            self._task = task
            if self._thread is None:
                self._thread = threading.Thread(
                    target=self._run,
                    daemon=True,
                    name=f"hermeskill-watchdog-{self.state.agent_id}",
                )
                self._thread.start()

    def stop(self, *, join_timeout: float = 2.0) -> None:
        """Signal the thread to exit. Does NOT force-cancel the task.

        Called on agent unregister / process shutdown. The thread joins
        within `join_timeout`; if it doesn't, we log and move on (daemon
        threads die with the process anyway).

        Safe to call from the watchdog thread itself: in that case the join
        is skipped, because a thread cannot join itself.
        """
        self._stop.set()
        # Poke the terminate_event so a thread blocked on it wakes up to
        # observe the stop flag. (We can't `notify` a threading.Event the
        # same way as a Condition — set() is the only signal mechanism.)
        self.state._terminate_event.set()
        with self._lock:
            thread = self._thread
        if thread is None or not thread.is_alive():
            return
        if thread is threading.current_thread():
            # Thread.join() raises RuntimeError when asked to join the
            # calling thread; the stop flag is already set, so just return.
            return
        thread.join(timeout=join_timeout)
        if thread.is_alive():
            logger.warning(
                "hermeskill L2 watchdog: thread %s did not join within %.1fs",
                thread.name,
                join_timeout,
            )

    # --- thread body ------------------------------------------------------

    def _run(self) -> None:
        """The daemon thread: wait for kill, give grace, escalate.

        Loop structure:
          1. Wait on `_terminate_event` (with timeout so we can poll
             `_stop` and the flag).
          2. When triggered, wait `grace_seconds` for cooperative
             termination — checking `task.done()` periodically to bail
             out early when L1 wins. The wait is on `_stop`, so `stop()`
             ends the grace window immediately and never leads to a
             forced cancel.
          3. If task still alive after grace: escalate via
             `loop.call_soon_threadsafe(task.cancel)`.
          4. Exit. One watchdog = one kill — no re-arm on the same state.
        """
        logger.debug(
            "hermeskill L2 watchdog armed for agent %s (grace=%.1fs)",
            self.state.agent_id,
            self.grace_seconds,
        )
        try:
            # --- step 1: wait for kill signal -----------------------
            while not self._stop.is_set():
                triggered = self.state._terminate_event.wait(timeout=self._POLL_SECONDS)
                if self._stop.is_set():
                    return
                # Defensive: also check the flag in case a caller wrote
                # it directly without going through request_termination.
                if triggered or self.state.terminate_requested:
                    break
            else:
                return  # stopped before any kill

            # --- step 2: cooperative-grace window -------------------
            deadline = time.monotonic() + self.grace_seconds
            while True:
                # Checked on every pass, including the one that ends the
                # window, so a stop() issued during the final poll interval
                # is honoured instead of falling through to escalation.
                if self._stop.is_set():
                    return
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                with self._lock:
                    task = self._task
                if task is not None and task.done():
                    logger.debug(
                        "hermeskill L2 watchdog: agent %s cooperated, no escalation",
                        self.state.agent_id,
                    )
                    return
                # Event.wait doubles as an interruptible sleep and is capped
                # at the time left so short grace periods are not overshot.
                self._stop.wait(timeout=min(self._POLL_SECONDS, remaining))

            # --- step 3: escalate -----------------------------------
            self._escalate()
        except Exception:
            logger.exception("hermeskill L2 watchdog crashed for agent %s", self.state.agent_id)

    def _escalate(self) -> None:
        """Schedule `task.cancel` on the captured loop, at most once.

        Does nothing if escalation already happened, if no loop/task was
        captured, or if the task has already finished. After scheduling the
        cancel it records a "watchdog_escalated" lifecycle event and
        shutdown step on the state; failures while recording are logged,
        not raised.
        """
        with self._lock:
            loop = self._loop
            task = self._task
            already = self._escalated
            self._escalated = True

        if already:
            return
        if loop is None or task is None:
            logger.warning(
                "hermeskill L2 watchdog: no loop/task captured for agent %s; "
                "cannot escalate (this is the case the docstring's 'honest "
                "limitation' note describes — operator must kill the process)",
                self.state.agent_id,
            )
            return
        if task.done():
            return  # narrowly raced with cooperative completion

        logger.warning(
            "hermeskill L2 watchdog: cooperative grace (%.1fs) expired for "
            "agent %s; forcing task cancellation",
            self.grace_seconds,
            self.state.agent_id,
        )
        try:
            loop.call_soon_threadsafe(task.cancel)
        except RuntimeError:
            # Loop already closed — nothing left to cancel against.
            logger.debug(
                "hermeskill L2 watchdog: loop already closed for agent %s",
                self.state.agent_id,
            )
        # Record a lifecycle event AND a shutdown-log step so the death
        # cert shows the watchdog fired and audit can correlate timings.
        try:
            self.state.record_lifecycle(
                "watchdog_escalated",
                grace_seconds=self.grace_seconds,
            )
            self.state.record_shutdown_step(
                "watchdog_escalated",
                grace_seconds=self.grace_seconds,
            )
        except Exception:
            logger.exception("watchdog: failed to record escalation lifecycle")
235
+
236
+
237
+ # --- death certificate builder + posting ----------------------------------
238
+
239
+
240
def build_death_certificate(
    state: WatcherState,
    *,
    terminated_at: datetime | None = None,
) -> DeathCertificate:
    """Freeze the current contents of `state` into a `DeathCertificate`.

    Intended to run at the end of the death sequence, once cooperative
    termination has already been requested on `state`.

    Field sources:

    * `terminated_at` — the argument, or `datetime.now(UTC)` when omitted.
    * `triggered_at` — `state.terminate_requested_at`, falling back to the
      termination time when that is unset.
    * `trigger_reason` — `state.terminate_reason`, or "unknown" when empty.
    * `trigger_type` / `operator` / `operator_reason` — derived from
      `state.manual_kill` (see the inline note).
    * `symptoms_log` — a shallow copy of `state.symptoms_log`.
    * `shutdown_log` — each raw step passed through `_normalize_step`.

    `customer_id`, `policy_id` and `feedback_url` are intentionally absent:
    they are server-authoritative and filled in server-side.
    """
    death_time = terminated_at or datetime.now(UTC)
    decision_time = state.terminate_requested_at or death_time
    cause = state.terminate_reason or "unknown"

    # M4: the presence of `state.manual_kill` — not the reason string — is
    # the authoritative marker of an operator-issued kill; the poller
    # writes that dict together with the flag flip.
    manual_kill = state.manual_kill
    operator_issued = manual_kill is not None
    kind = TriggerType.MANUAL if operator_issued else TriggerType.AUTO
    who = manual_kill.get("operator") if operator_issued else None
    why = manual_kill.get("operator_reason") if operator_issued else None

    return DeathCertificate(
        agent_id=state.agent_id,
        triggered_at=decision_time,
        terminated_at=death_time,
        trigger_type=kind,
        trigger_reason=cause,
        symptoms_log=[*state.symptoms_log],
        final_state={},  # v2 / cleanup-hook hookpoint
        shutdown_log=list(map(_normalize_step, state.shutdown_log)),
        operator=who,
        operator_reason=why,
    )
296
+
297
+
298
def build_kill_event_payload(
    state: WatcherState,
    *,
    terminated_at: datetime | None = None,
) -> KillEventIn:
    """Build the `POST /agents/{id}/kill_events` body for `state`.

    A death certificate is built first; its trigger fields, timestamps and
    shutdown log are mirrored at the top level of the payload, and the
    certificate itself is embedded as `death_certificate`.
    """
    certificate = build_death_certificate(state, terminated_at=terminated_at)
    body = {
        "trigger_type": certificate.trigger_type,
        "trigger_reason": certificate.trigger_reason,
        "triggered_at": certificate.triggered_at,
        "terminated_at": certificate.terminated_at,
        "death_certificate": certificate,
        "shutdown_log": certificate.shutdown_log,
    }
    return KillEventIn(**body)
313
+
314
+
315
def _normalize_step(raw: dict[str, object]) -> ShutdownLogEntry:
    """Coerce a `record_shutdown_step()`-format dict into the typed model.

    Steps are appended to `state.shutdown_log` as plain dicts (cheap
    write path); we type-validate them only when the cert is built.

    Every field degrades to a safe default rather than raising, so one
    malformed step cannot abort the whole death certificate:

    * `at` — a `datetime` is used as-is; an ISO-8601 string is parsed;
      anything else (missing, wrong type, unparseable string) becomes
      `datetime.now(UTC)`.
    * `duration_ms` — converted with `float()`; missing or unconvertible
      values become `None`.
    * `detail` — kept only when it is a dict, otherwise `{}`.
    * `step` — stringified, or "unknown" when missing.
    """
    at_value = raw.get("at")
    at: datetime | None = None
    if isinstance(at_value, datetime):
        at = at_value
    elif isinstance(at_value, str):
        try:
            at = datetime.fromisoformat(at_value)
        except ValueError:
            # Same fallback as a missing timestamp; keep the bad value in
            # the log so the malformed writer can be found.
            logger.warning("shutdown step has unparseable 'at' value %r", at_value)
    if at is None:
        at = datetime.now(UTC)

    duration_raw = raw.get("duration_ms")
    duration_ms: float | None
    try:
        duration_ms = (
            None if duration_raw is None else float(duration_raw)  # type: ignore[arg-type]
        )
    except (TypeError, ValueError):
        logger.warning("shutdown step has non-numeric 'duration_ms' %r", duration_raw)
        duration_ms = None

    detail = raw.get("detail") or {}
    if not isinstance(detail, dict):
        detail = {}

    step_raw = raw.get("step")
    step = str(step_raw) if step_raw is not None else "unknown"
    return ShutdownLogEntry(
        step=step,
        at=at,
        duration_ms=duration_ms,
        detail=detail,
    )
@@ -0,0 +1,235 @@
1
+ """Feedback-driven threshold calibration (Phase 4).
2
+
3
+ The control plane already collects an operator's verdict on every kill via the
4
+ one-click feedback link baked into each death certificate
5
+ (`kill_events.feedback_label`). Until now those labels just sat in the database.
6
+ This module turns them into a **transparent, advisory** calibration report: per
7
+ symptom, how often did kills under a given policy get labeled false-positive,
8
+ and — if that rate is high enough on a large enough sample — what looser
9
+ threshold should a human *consider* setting.
10
+
11
+ Design constraints (these are deliberate, and they are the point):
12
+
13
+ * **Suggest-only.** We never mutate a policy. Policies are SDK-defined
14
+ constants (`hermeskill.policies`); the "suggestion" is literally "edit that
15
+ constant." Auto-tuning limits from agent-influenced feedback would be both
16
+ an overclaim and a genuine safety hole.
17
+ * **No learning / no ML.** It's a rate, a sample-size gate, and one fixed
18
+ conservative step. A reviewer can read the whole rule in a minute and
19
+ trust it precisely because there's no black box.
20
+ * **Evidence over precision.** The suggested number is a conservative nudge
21
+ (`* 1.5`, rounded to something readable). What should actually drive the
22
+ decision is the evidence we lead with: the false-positive rate and n.
23
+ * **False positives only.** See `hermeskill.types.calibration` — the data can't
24
+ speak to kills that never fired, so we never recommend tightening.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import math
30
+ from collections import Counter
31
+ from collections.abc import Iterable
32
+ from dataclasses import dataclass
33
+
34
+ from hermeskill.types import (
35
+ CalibrationReport,
36
+ FeedbackLabel,
37
+ Policy,
38
+ SymptomCalibration,
39
+ SymptomType,
40
+ )
41
+
42
+ # --- tunables (transparent, documented) ----------------------------------
43
+
44
#: Sample-size gate. A symptom with fewer labeled kills than this still gets
#: its statistics reported, but never a suggestion — one false positive out
#: of one kill is noise, not evidence.
MIN_SAMPLES_PER_SYMPTOM = 5

#: False-positive fraction at or above which loosening is suggested. Below
#: it the symptom is reported as within tolerance.
FALSE_POSITIVE_ACTION_THRESHOLD = 0.30

#: Fixed multiplier applied to the current threshold when loosening (i.e.
#: raise it by half), before rounding to a readable value. Deliberately not
#: scaled by the false-positive rate: a rate-derived number ("3 → 4.2")
#: suggests precision the data does not have, whereas a flat nudge shown
#: next to the evidence is honest about being a nudge.
LOOSEN_FACTOR = 1.5
57
+
58
+
59
+ @dataclass(frozen=True)
60
+ class _Knob:
61
+ """A symptom's single numeric threshold, and how to round a suggestion."""
62
+
63
+ field: str
64
+ kind: str # "int" | "seconds" | "usd"
65
+
66
+
67
#: Lookup from symptom to the single numeric threshold a suggestion can
#: target. Symptoms missing from this table (tool_scope_violation →
#: allowlist, heartbeat_stale → liveness, manual_kill → operator) have no
#: single knob, so they are reported with statistics only.
_SYMPTOM_KNOB: dict[SymptomType, _Knob] = {
    SymptomType.LOOP: _Knob(field="max_loop_repeats", kind="int"),
    SymptomType.TOKEN_RUNAWAY: _Knob(field="max_cost_usd", kind="usd"),
    SymptomType.WALL_CLOCK: _Knob(field="max_runtime_seconds", kind="seconds"),
}
75
+
76
+
77
@dataclass(frozen=True)
class LabeledKill:
    """A single historical kill paired with the operator's verdict.

    This is all the calibrator needs as input. The control plane derives
    these from `kill_events` rows — the symptom comes from the terminal
    entry of the death certificate's `symptoms_log`, the label from
    `feedback_label` — while tests construct them directly. Kills without
    a label are simply never passed in.
    """

    # Which symptom triggered the kill.
    symptom: SymptomType
    # The operator's feedback verdict for that kill.
    label: FeedbackLabel
89
+
90
+
91
+ def _loosen(current: float, kind: str) -> float:
92
+ """Apply the fixed conservative step and round to a readable value."""
93
+ raw = current * LOOSEN_FACTOR
94
+ if kind == "int":
95
+ return float(math.ceil(raw))
96
+ if kind == "seconds":
97
+ # Nearest minute reads better than 450.0s.
98
+ return float(round(raw / 60) * 60)
99
+ # usd
100
+ return round(raw, 2)
101
+
102
+
103
+ def _confidence_for(n: int) -> str:
104
+ """Sample-size → confidence tier. Below MIN_SAMPLES it's not called."""
105
+ if n >= 30:
106
+ return "high"
107
+ if n >= 10:
108
+ return "medium"
109
+ return "low"
110
+
111
+
112
+ def _pct(rate: float) -> str:
113
+ return f"{rate * 100:.0f}%"
114
+
115
+
116
def _calibrate_symptom(
    symptom: SymptomType, labels: list[FeedbackLabel], policy: Policy
) -> SymptomCalibration:
    """Turn one symptom's feedback labels into a calibration entry.

    Label counts and the false-positive rate (false positives divided by
    all labels, 0.0 when there are none) are always filled in. The outcome
    is then decided by the first matching rule:

    1. Fewer than `MIN_SAMPLES_PER_SYMPTOM` labels — confidence stays
       "insufficient_data" and no threshold is named.
    2. The symptom has no entry in `_SYMPTOM_KNOB` — statistics plus a
       confidence tier only.
    3. False-positive rate below `FALSE_POSITIVE_ACTION_THRESHOLD` — the
       threshold field and its current value are reported, no suggestion.
    4. Otherwise — same as 3, plus a `suggested_value` from `_loosen`.
    """
    tally = Counter(labels)
    n = len(labels)
    fp_count = tally[FeedbackLabel.FALSE_POSITIVE]
    fp_rate = fp_count / n if n else 0.0

    skeleton = SymptomCalibration(
        symptom=symptom,
        total_labeled=n,
        good_kills=tally[FeedbackLabel.GOOD_KILL],
        false_positives=fp_count,
        missed_kills=tally[FeedbackLabel.MISSED_KILL],
        other=tally[FeedbackLabel.OTHER],
        false_positive_rate=fp_rate,
        confidence="insufficient_data",
        rationale="",
    )

    def finish(**changes: object) -> SymptomCalibration:
        # Every outcome is the shared skeleton with a few fields overridden.
        return skeleton.model_copy(update=changes)

    # 1. Not enough data to say anything.
    if n < MIN_SAMPLES_PER_SYMPTOM:
        return finish(
            rationale=(
                f"n={n} labeled kill(s); need "
                f"{MIN_SAMPLES_PER_SYMPTOM}+ before suggesting a change."
            ),
        )

    tier = _confidence_for(n)

    # 2. No single numeric knob for this symptom — stats only.
    knob = _SYMPTOM_KNOB.get(symptom)
    if knob is None:
        return finish(
            confidence=tier,
            rationale=(
                f"{_pct(fp_rate)} false-positive (n={n}). No single "
                f"numeric threshold maps to {symptom.value}; review the "
                f"tool allowlist / liveness settings by hand."
            ),
        )

    current = float(getattr(policy.thresholds, knob.field))

    # 3. Well-calibrated — false-positive rate within tolerance.
    if fp_rate < FALSE_POSITIVE_ACTION_THRESHOLD:
        return finish(
            threshold_field=knob.field,
            current_value=current,
            confidence=tier,
            rationale=(
                f"{_pct(fp_rate)} false-positive (n={n}) — within "
                f"tolerance ({_pct(FALSE_POSITIVE_ACTION_THRESHOLD)}); "
                f"no change suggested."
            ),
        )

    # 4. Too many false positives — suggest loosening.
    suggested = _loosen(current, knob.kind)
    return finish(
        threshold_field=knob.field,
        current_value=current,
        suggested_value=suggested,
        confidence=tier,
        rationale=(
            f"{_pct(fp_rate)} of {symptom.value} kills under "
            f"'{policy.name}' were labeled false-positive (n={n}). "
            f"Consider raising {knob.field} "
            f"{_fmt(current, knob.kind)}→{_fmt(suggested, knob.kind)}."
        ),
    )
199
+
200
+
201
+ def _fmt(value: float, kind: str) -> str:
202
+ """Render a threshold value the way a human writes it in the policy."""
203
+ if kind == "usd":
204
+ return f"${value:g}"
205
+ if kind == "seconds":
206
+ return f"{value:g}s"
207
+ return f"{value:g}"
208
+
209
+
210
def build_calibration_report(
    policy: Policy, labeled_kills: Iterable[LabeledKill]
) -> CalibrationReport:
    """Summarise labeled kills as an advisory calibration report.

    The function performs no I/O: labels are grouped by symptom, each group
    is handed to `_calibrate_symptom`, and the results are collected in
    `SymptomType` declaration order. Symptoms with no labeled kill are left
    out entirely. `total_labeled_kills` is the sum of the per-symptom
    totals.
    """
    grouped: dict[SymptomType, list[FeedbackLabel]] = {}
    for entry in labeled_kills:
        if entry.symptom not in grouped:
            grouped[entry.symptom] = []
        grouped[entry.symptom].append(entry.label)

    per_symptom: list[SymptomCalibration] = []
    labeled_total = 0
    # Iterating the enum (not the dict) fixes the output order.
    for symptom in SymptomType:
        bucket = grouped.get(symptom)
        if bucket is None:
            continue
        entry_report = _calibrate_symptom(symptom, bucket, policy)
        per_symptom.append(entry_report)
        labeled_total += entry_report.total_labeled

    return CalibrationReport(
        policy_name=policy.name,
        total_labeled_kills=labeled_total,
        symptoms=per_symptom,
    )