hermeskill 0.1.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hermeskill-0.1.0a1/.gitignore +26 -0
- hermeskill-0.1.0a1/PKG-INFO +23 -0
- hermeskill-0.1.0a1/README.md +11 -0
- hermeskill-0.1.0a1/pyproject.toml +23 -0
- hermeskill-0.1.0a1/src/hermeskill/__init__.py +57 -0
- hermeskill-0.1.0a1/src/hermeskill/_version.py +1 -0
- hermeskill-0.1.0a1/src/hermeskill/apoptosis.py +342 -0
- hermeskill-0.1.0a1/src/hermeskill/calibration.py +235 -0
- hermeskill-0.1.0a1/src/hermeskill/certificate.py +87 -0
- hermeskill-0.1.0a1/src/hermeskill/checks.py +292 -0
- hermeskill-0.1.0a1/src/hermeskill/cli.py +769 -0
- hermeskill-0.1.0a1/src/hermeskill/client.py +433 -0
- hermeskill-0.1.0a1/src/hermeskill/config.py +120 -0
- hermeskill-0.1.0a1/src/hermeskill/exceptions.py +19 -0
- hermeskill-0.1.0a1/src/hermeskill/policies.py +128 -0
- hermeskill-0.1.0a1/src/hermeskill/pricing.py +82 -0
- hermeskill-0.1.0a1/src/hermeskill/py.typed +0 -0
- hermeskill-0.1.0a1/src/hermeskill/supervisor.py +257 -0
- hermeskill-0.1.0a1/src/hermeskill/types/__init__.py +69 -0
- hermeskill-0.1.0a1/src/hermeskill/types/agents.py +33 -0
- hermeskill-0.1.0a1/src/hermeskill/types/calibration.py +66 -0
- hermeskill-0.1.0a1/src/hermeskill/types/enums.py +70 -0
- hermeskill-0.1.0a1/src/hermeskill/types/events.py +46 -0
- hermeskill-0.1.0a1/src/hermeskill/types/feedback.py +29 -0
- hermeskill-0.1.0a1/src/hermeskill/types/grants.py +75 -0
- hermeskill-0.1.0a1/src/hermeskill/types/heartbeats.py +26 -0
- hermeskill-0.1.0a1/src/hermeskill/types/kills.py +117 -0
- hermeskill-0.1.0a1/src/hermeskill/types/policy.py +51 -0
- hermeskill-0.1.0a1/src/hermeskill/watcher.py +661 -0
- hermeskill-0.1.0a1/tests/_supervisor_targets.py +56 -0
- hermeskill-0.1.0a1/tests/conftest.py +33 -0
- hermeskill-0.1.0a1/tests/test_calibration.py +139 -0
- hermeskill-0.1.0a1/tests/test_certificate.py +59 -0
- hermeskill-0.1.0a1/tests/test_checks.py +523 -0
- hermeskill-0.1.0a1/tests/test_cli.py +733 -0
- hermeskill-0.1.0a1/tests/test_client.py +216 -0
- hermeskill-0.1.0a1/tests/test_config.py +102 -0
- hermeskill-0.1.0a1/tests/test_enable_hermes.py +72 -0
- hermeskill-0.1.0a1/tests/test_grants.py +214 -0
- hermeskill-0.1.0a1/tests/test_manual_kill.py +251 -0
- hermeskill-0.1.0a1/tests/test_policies.py +187 -0
- hermeskill-0.1.0a1/tests/test_pricing.py +39 -0
- hermeskill-0.1.0a1/tests/test_smoke.py +54 -0
- hermeskill-0.1.0a1/tests/test_supervisor.py +118 -0
- hermeskill-0.1.0a1/tests/test_watchdog.py +380 -0
- hermeskill-0.1.0a1/tests/test_watcher.py +267 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
.venv/
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
*.pyo
|
|
5
|
+
*.egg-info/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.pytest_cache/
|
|
9
|
+
.mypy_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.hypothesis/
|
|
12
|
+
.coverage
|
|
13
|
+
htmlcov/
|
|
14
|
+
*.db
|
|
15
|
+
*.sqlite
|
|
16
|
+
.env
|
|
17
|
+
.env.local
|
|
18
|
+
~/.hermeskill/
|
|
19
|
+
.idea/
|
|
20
|
+
.vscode/
|
|
21
|
+
*.log
|
|
22
|
+
.maestro/
|
|
23
|
+
.claude/settings.local.json
|
|
24
|
+
learn/
|
|
25
|
+
TODO.md
|
|
26
|
+
PUBLISH_READINESS.md
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hermeskill
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Summary: Hermeskill SDK — apoptosis protocol core: watcher state, symptom checks, death certificates, and control-plane client
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: httpx>=0.27
|
|
8
|
+
Requires-Dist: pydantic>=2.8
|
|
9
|
+
Requires-Dist: rich>=13.7
|
|
10
|
+
Requires-Dist: typer>=0.12
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# hermeskill
|
|
14
|
+
|
|
15
|
+
The Hermeskill SDK — `WatcherState`, symptom checks, death certificates,
|
|
16
|
+
control-plane client, and the `hermeskill` operator CLI.
|
|
17
|
+
|
|
18
|
+
This is the framework-agnostic core. The supported framework adapter today is
|
|
19
|
+
[`hermeskill-hermes`](../hermeskill-hermes) — install that package to supervise
|
|
20
|
+
Hermes Agent sessions with one `pip install`.
|
|
21
|
+
|
|
22
|
+
See the [repo root README](../../README.md) for product overview, install
|
|
23
|
+
walkthrough, environment variables, and operator workflows.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# hermeskill
|
|
2
|
+
|
|
3
|
+
The Hermeskill SDK — `WatcherState`, symptom checks, death certificates,
|
|
4
|
+
control-plane client, and the `hermeskill` operator CLI.
|
|
5
|
+
|
|
6
|
+
This is the framework-agnostic core. The supported framework adapter today is
|
|
7
|
+
[`hermeskill-hermes`](../hermeskill-hermes) — install that package to supervise
|
|
8
|
+
Hermes Agent sessions with one `pip install`.
|
|
9
|
+
|
|
10
|
+
See the [repo root README](../../README.md) for product overview, install
|
|
11
|
+
walkthrough, environment variables, and operator workflows.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "hermeskill"
|
|
3
|
+
version = "0.1.0a1"
|
|
4
|
+
description = "Hermeskill SDK — apoptosis protocol core: watcher state, symptom checks, death certificates, and control-plane client"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
dependencies = [
|
|
9
|
+
"pydantic>=2.8",
|
|
10
|
+
"httpx>=0.27",
|
|
11
|
+
"typer>=0.12",
|
|
12
|
+
"rich>=13.7",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
hermeskill = "hermeskill.cli:app"
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["hatchling"]
|
|
20
|
+
build-backend = "hatchling.build"
|
|
21
|
+
|
|
22
|
+
[tool.hatch.build.targets.wheel]
|
|
23
|
+
packages = ["src/hermeskill"]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Hermeskill SDK — apoptosis protocol core for AI agent supervision.
|
|
2
|
+
|
|
3
|
+
Framework-agnostic core: WatcherState, symptom checks, death certificates,
|
|
4
|
+
kill-event client, operator CLI. Install a framework adapter on top:
|
|
5
|
+
|
|
6
|
+
pip install hermeskill-hermes # Hermes Agent plugin (recommended)
|
|
7
|
+
|
|
8
|
+
The bare `hermeskill` package imports with no third-party agent-framework
|
|
9
|
+
dependencies.
|
|
10
|
+
|
|
11
|
+
Public exceptions:
|
|
12
|
+
|
|
13
|
+
from hermeskill import HermeskillTerminated
|
|
14
|
+
|
|
15
|
+
# Raised by framework adapters and `checkpoint()` when the agent is
|
|
16
|
+
# killed by Hermeskill. Catch at your top-level run loop if you need
|
|
17
|
+
# cleanup before exit.
|
|
18
|
+
|
|
19
|
+
`checkpoint()` is a cooperative termination point for custom run loops;
|
|
20
|
+
raises HermeskillTerminated if a kill directive is pending.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from hermeskill._version import __version__
|
|
24
|
+
from hermeskill.calibration import LabeledKill, build_calibration_report
|
|
25
|
+
from hermeskill.exceptions import HermeskillError, HermeskillTerminated
|
|
26
|
+
from hermeskill.supervisor import Heartbeat, ProcessSupervisor, SupervisorResult
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"Heartbeat",
|
|
30
|
+
"HermeskillError",
|
|
31
|
+
"HermeskillTerminated",
|
|
32
|
+
"LabeledKill",
|
|
33
|
+
"ProcessSupervisor",
|
|
34
|
+
"SupervisorResult",
|
|
35
|
+
"__version__",
|
|
36
|
+
"build_calibration_report",
|
|
37
|
+
"checkpoint",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def checkpoint() -> None:
    """Give Hermeskill a chance to stop the caller at a safe point.

    Intended for custom run loops and long synchronous work. Scans the
    watchers returned by `all_watchers()` in order and raises for the
    first one whose `terminate_requested` flag is set. When no watcher is
    flagged (including when none are registered) the call returns without
    doing anything.

    Raises:
        HermeskillTerminated: carrying the watcher's `terminate_reason`
            (or "terminated" when that is empty) and its
            `terminate_kill_event_id`.
    """
    # Imported at call time, not at module import time.
    from hermeskill.exceptions import HermeskillTerminated
    from hermeskill.watcher import all_watchers

    # First flagged watcher wins; later ones are never inspected.
    flagged = next(
        (watcher for watcher in all_watchers() if watcher.terminate_requested),
        None,
    )
    if flagged is None:
        return

    message = flagged.terminate_reason or "terminated"
    raise HermeskillTerminated(
        message,
        kill_event_id=flagged.terminate_kill_event_id,
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0a1"
|
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""L2 forced-termination watchdog.
|
|
2
|
+
|
|
3
|
+
L1 (cooperative termination) is handled by the framework adapter — the
|
|
4
|
+
kill stub or checkpoint raise at tool/chain boundaries. It works as long
|
|
5
|
+
as the agent's event loop is alive and reaching await points. When it
|
|
6
|
+
isn't — agent is wedged inside a sync tool, or stubbornly ignoring the
|
|
7
|
+
cooperative signal — we need an out-of-band path that can cancel from
|
|
8
|
+
outside the loop.
|
|
9
|
+
|
|
10
|
+
That's L2: **one daemon `threading.Thread` per watched agent**, holding a
|
|
11
|
+
reference to the agent's asyncio loop and main `Task`. The thread sleeps
|
|
12
|
+
on `state._terminate_event`. When apoptosis fires, it waits the policy's
|
|
13
|
+
`cooperative_grace_seconds`, checks whether the task finished on its own
|
|
14
|
+
(L1 worked → no escalation), and if not, calls
|
|
15
|
+
`loop.call_soon_threadsafe(task.cancel)` — scheduling cancellation from
|
|
16
|
+
*outside* the loop, which is the part that defeats the wedged-loop case.
|
|
17
|
+
|
|
18
|
+
**Why a thread, not an asyncio task.** If we scheduled the L2 timer with
|
|
19
|
+
`asyncio.create_task(...)` in the agent's own loop, it would queue
|
|
20
|
+
behind whatever's blocking that loop — i.e. behind the very thing it's
|
|
21
|
+
trying to interrupt. Same-loop scheduling defeats the entire purpose.
|
|
22
|
+
Run as a thread, run outside the loop. *Do not* refactor this back into
|
|
23
|
+
the loop in a future cleanup pass — leave this comment as ballast.
|
|
24
|
+
|
|
25
|
+
**Honest limitation.** `task.cancel()` raises CancelledError at the next
|
|
26
|
+
*await point*. If an agent is wedged in pure-Python CPU code (`while
|
|
27
|
+
True: pass` inside a sync tool with no awaits anywhere reachable), the
|
|
28
|
+
cancellation will not fire — Python provides no portable way to
|
|
29
|
+
interrupt a thread mid-bytecode. The watchdog logs the escalation
|
|
30
|
+
attempt; in that case the only real recourse is killing the OS process
|
|
31
|
+
(operator escalation, M3 webhook fires, M5 grants document the case).
|
|
32
|
+
The watchdog still handles the realistic case (async tool wedged on a
|
|
33
|
+
slow network call ignoring cooperative shutdown) — which is what the
|
|
34
|
+
plan's "blocked-loop test" intends to exercise.
|
|
35
|
+
|
|
36
|
+
Public surface: `Watchdog(state, grace_seconds=...)`, `.arm(loop, task)`,
|
|
37
|
+
`.stop()`. Idempotent arming — call from `on_chain_start` every time;
|
|
38
|
+
the first call starts the thread, later calls just refresh the captured
|
|
39
|
+
loop + task in case a new invocation runs in a different task.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import asyncio
|
|
45
|
+
import logging
|
|
46
|
+
import threading
|
|
47
|
+
import time
|
|
48
|
+
from datetime import UTC, datetime
|
|
49
|
+
from typing import TYPE_CHECKING
|
|
50
|
+
|
|
51
|
+
from hermeskill.types import (
|
|
52
|
+
DeathCertificate,
|
|
53
|
+
KillEventIn,
|
|
54
|
+
ShutdownLogEntry,
|
|
55
|
+
TriggerType,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
if TYPE_CHECKING:
|
|
59
|
+
from hermeskill.watcher import WatcherState
|
|
60
|
+
|
|
61
|
+
logger = logging.getLogger("hermeskill.apoptosis")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Watchdog:
    """Out-of-band (L2) forced-termination watchdog, one per `WatcherState`.

    A daemon thread waits for the state's terminate signal, then allows
    `grace_seconds` for the watched task to finish on its own. If the task
    is still running after that window, the thread schedules `task.cancel`
    on the captured loop via `call_soon_threadsafe`.
    """

    # Interval shared by the kill-signal wait and the grace-window poll.
    # 100ms keeps kill latency imperceptible to humans while costing almost
    # nothing on idle agents.
    _POLL_SECONDS = 0.1

    def __init__(self, state: WatcherState, *, grace_seconds: float) -> None:
        self.state = state
        self.grace_seconds = grace_seconds
        # Loop/task slots are filled (and refreshed) by arm().
        self._loop: asyncio.AbstractEventLoop | None = None
        self._task: asyncio.Task[object] | None = None
        self._thread: threading.Thread | None = None
        # Set by stop() to ask the thread to exit.
        self._stop = threading.Event()
        # Protects the loop/task/thread slots and the escalation flag.
        self._lock = threading.Lock()
        # Flipped the first time _escalate() runs so a second call is a no-op.
        self._escalated = False

    # --- public API -------------------------------------------------------

    def arm(
        self,
        loop: asyncio.AbstractEventLoop,
        task: asyncio.Task[object],
    ) -> None:
        """Record the loop and task to watch; start the thread if needed.

        Every call overwrites the stored loop/task pair. Only the first
        call creates and starts the daemon thread; later calls just refresh
        the slots.
        """
        with self._lock:
            self._loop, self._task = loop, task
            if self._thread is not None:
                return
            worker = threading.Thread(
                target=self._run,
                daemon=True,
                name=f"hermeskill-watchdog-{self.state.agent_id}",
            )
            self._thread = worker
            worker.start()

    def stop(self, *, join_timeout: float = 2.0) -> None:
        """Ask the thread to exit and wait up to `join_timeout` for it.

        The watched task is never cancelled from here. If the thread is
        still alive after the join, a warning is logged and the call
        returns anyway.
        """
        self._stop.set()
        # Also set the terminate event so a thread parked in its wait()
        # wakes up and notices the stop flag.
        self.state._terminate_event.set()
        with self._lock:
            worker = self._thread
        if worker is None or not worker.is_alive():
            return
        worker.join(timeout=join_timeout)
        if worker.is_alive():
            logger.warning(
                "hermeskill L2 watchdog: thread %s did not join within %.1fs",
                worker.name,
                join_timeout,
            )

    # --- thread body ------------------------------------------------------

    def _run(self) -> None:
        """Thread entry point: wait for a kill, allow grace, then escalate.

        The sequence is one-shot; the thread exits after a single pass.
        Any exception raised by the phases is logged, not propagated.
        """
        logger.debug(
            "hermeskill L2 watchdog armed for agent %s (grace=%.1fs)",
            self.state.agent_id,
            self.grace_seconds,
        )
        try:
            if self._kill_signalled() and self._grace_elapsed():
                self._escalate()
        except Exception:
            logger.exception("hermeskill L2 watchdog crashed for agent %s", self.state.agent_id)

    def _kill_signalled(self) -> bool:
        """Wait for a kill signal; return False if stop() arrives first."""
        while True:
            if self._stop.is_set():
                return False
            fired = self.state._terminate_event.wait(timeout=self._POLL_SECONDS)
            # stop() sets the terminate event too, so check the stop flag
            # before treating a wake-up as a kill.
            if self._stop.is_set():
                return False
            # The flag is checked as well as the event in case something
            # set `terminate_requested` without signalling the event.
            if fired or self.state.terminate_requested:
                return True

    def _grace_elapsed(self) -> bool:
        """Poll through the grace window; return True when escalation is due.

        Returns False if stop() is requested or the watched task finishes
        before the window closes.
        """
        expires_at = time.monotonic() + self.grace_seconds
        while time.monotonic() < expires_at:
            if self._stop.is_set():
                return False
            with self._lock:
                watched = self._task
            if watched is not None and watched.done():
                logger.debug(
                    "hermeskill L2 watchdog: agent %s cooperated, no escalation",
                    self.state.agent_id,
                )
                return False
            time.sleep(self._POLL_SECONDS)
        return True

    def _escalate(self) -> None:
        """Schedule cancellation of the watched task and record that we did.

        Does nothing on a repeated call, when no loop/task was captured
        (logs a warning), or when the task has already finished.
        """
        with self._lock:
            target_loop, target_task = self._loop, self._task
            fired_before, self._escalated = self._escalated, True

        if fired_before:
            return
        if target_loop is None or target_task is None:
            logger.warning(
                "hermeskill L2 watchdog: no loop/task captured for agent %s; "
                "cannot escalate (this is the case the docstring's 'honest "
                "limitation' note describes — operator must kill the process)",
                self.state.agent_id,
            )
            return
        if target_task.done():
            # The task finished between the grace check and this point.
            return

        logger.warning(
            "hermeskill L2 watchdog: cooperative grace (%.1fs) expired for "
            "agent %s; forcing task cancellation",
            self.grace_seconds,
            self.state.agent_id,
        )
        try:
            target_loop.call_soon_threadsafe(target_task.cancel)
        except RuntimeError:
            # Treated as "loop already closed": there is nothing left to
            # cancel against, so just note it.
            logger.debug(
                "hermeskill L2 watchdog: loop already closed for agent %s",
                self.state.agent_id,
            )
        # Record both a lifecycle event and a shutdown-log step so the
        # escalation appears in the state's records with its grace period.
        try:
            self.state.record_lifecycle(
                "watchdog_escalated",
                grace_seconds=self.grace_seconds,
            )
            self.state.record_shutdown_step(
                "watchdog_escalated",
                grace_seconds=self.grace_seconds,
            )
        except Exception:
            logger.exception("watchdog: failed to record escalation lifecycle")
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
# --- death certificate builder + posting ----------------------------------
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def build_death_certificate(
    state: WatcherState,
    *,
    terminated_at: datetime | None = None,
) -> DeathCertificate:
    """Build a `DeathCertificate` from the current contents of `state`.

    Fields read from `state`: `agent_id`, `terminate_requested_at`,
    `terminate_reason`, `manual_kill`, `symptoms_log` and `shutdown_log`.

    Args:
        state: The watcher state to snapshot.
        terminated_at: Time of death to record. Defaults to the current
            UTC time at the moment the certificate is built.

    Returns:
        A certificate whose `triggered_at` falls back to the termination
        time, and whose `trigger_reason` falls back to "unknown", when the
        state has no value for them. `final_state` is always empty.

    Customer, policy and feedback-URL fields are deliberately not set here;
    the server is the authority for those.
    """
    death_time = terminated_at if terminated_at else datetime.now(UTC)

    # The presence of `manual_kill` — not the reason text — decides whether
    # this was an operator-issued kill.
    manual_info = state.manual_kill
    was_manual = manual_info is not None

    return DeathCertificate(
        agent_id=state.agent_id,
        triggered_at=state.terminate_requested_at or death_time,
        terminated_at=death_time,
        trigger_type=TriggerType.MANUAL if was_manual else TriggerType.AUTO,
        trigger_reason=state.terminate_reason or "unknown",
        # Copy the symptoms list so the certificate holds its own list.
        symptoms_log=list(state.symptoms_log),
        final_state={},  # v2 / cleanup-hook hookpoint
        # Shutdown steps are stored as plain dicts; convert them here.
        shutdown_log=[_normalize_step(entry) for entry in state.shutdown_log],
        operator=manual_info.get("operator") if was_manual else None,
        operator_reason=manual_info.get("operator_reason") if was_manual else None,
    )
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def build_kill_event_payload(
    state: WatcherState,
    *,
    terminated_at: datetime | None = None,
) -> KillEventIn:
    """Build the `POST /agents/{id}/kill_events` body for `state`.

    A death certificate is built first via `build_death_certificate`.
    The payload then repeats the certificate's trigger fields, timestamps
    and shutdown log at the top level and embeds the certificate itself.

    Args:
        state: The watcher state to report.
        terminated_at: Forwarded unchanged to `build_death_certificate`.
    """
    certificate = build_death_certificate(state, terminated_at=terminated_at)
    payload_fields = {
        "trigger_type": certificate.trigger_type,
        "trigger_reason": certificate.trigger_reason,
        "triggered_at": certificate.triggered_at,
        "terminated_at": certificate.terminated_at,
        "death_certificate": certificate,
        "shutdown_log": certificate.shutdown_log,
    }
    return KillEventIn(**payload_fields)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _normalize_step(raw: dict[str, object]) -> ShutdownLogEntry:
    """Turn one plain-dict shutdown step into a typed `ShutdownLogEntry`.

    Keys read from `raw`:
        at: A `datetime` is used as-is and a string is parsed with
            `datetime.fromisoformat`. Anything else, including a missing
            key, becomes the current UTC time.
        duration_ms: Converted with `float()` unless it is None/missing.
        detail: Kept only when it is a non-empty dict; otherwise `{}`.
        step: Stringified; "unknown" when None/missing.

    A malformed `at` string or a non-numeric `duration_ms` raises from the
    underlying conversion, so bad entries surface when the cert is built.
    """
    stamp = raw.get("at")
    if isinstance(stamp, datetime):
        when = stamp
    elif isinstance(stamp, str):
        when = datetime.fromisoformat(stamp)
    else:
        when = datetime.now(UTC)

    elapsed = raw.get("duration_ms")
    duration_ms: float | None = (
        float(elapsed) if elapsed is not None else None  # type: ignore[arg-type]
    )

    extra = raw.get("detail")
    # Falsy values and non-dict values both collapse to a fresh empty dict.
    detail = extra if extra and isinstance(extra, dict) else {}

    label = raw.get("step")
    step = "unknown" if label is None else str(label)

    return ShutdownLogEntry(
        step=step,
        at=when,
        duration_ms=duration_ms,
        detail=detail,
    )
|