agent_hypervisor 3.4.0__tar.gz → 3.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/.gitignore +1 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/PKG-INFO +2 -2
- agent_hypervisor-3.6.0/examples/requirements.txt +1 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/notebooks/README.md +2 -1
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/pyproject.toml +2 -2
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/server.py +5 -8
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/core.py +17 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/vouching.py +5 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/event_bus.py +97 -39
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/providers.py +12 -5
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/breach_detector.py +17 -1
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/orchestrator.py +21 -5
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/state_machine.py +21 -1
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/kill_switch.py +89 -15
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/rate_limiter.py +65 -39
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/sso.py +15 -1
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_agent_manager.py +38 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_breach_detector.py +44 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_kill_switch.py +71 -0
- agent_hypervisor-3.6.0/tests/test_providers.py +44 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_rate_limiter.py +127 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_liability.py +15 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_observability.py +103 -1
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga.py +34 -0
- agent_hypervisor-3.4.0/examples/requirements.txt +0 -1
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/CHANGELOG.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/LICENSE +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/README.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/SECURITY.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/benchmarks/bench_hypervisor.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/benchmarks/results/BENCHMARKS.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/benchmarks/results/benchmarks.json +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/docs/api-reference.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/docs/joint-liability-guide.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/dashboard/README.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/dashboard/app.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/dashboard/requirements.txt +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/demo.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/Dockerfile +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/README.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/dashboard.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/sample_workflow.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/server.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/config/hypervisor.yaml +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/docker-compose.yml +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/notebooks/hypervisor-exploration.ipynb +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/models.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/commitment.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/delta.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/gc.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/cli/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/cli/formatters.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/cli/session_commands.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/constants.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/iatp_adapter.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/nexus_adapter.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/verification_adapter.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/attribution.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/ledger.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/quarantine.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/slashing.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/models.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/causal_trace.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/prometheus_collector.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/saga_span_exporter.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/py.typed +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/reversibility/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/reversibility/registry.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/classifier.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/elevation.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/enforcer.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/checkpoint.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/dsl.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/fan_out.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/schema.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/intent_locks.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/isolation.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/vector_clock.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/verification/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/verification/history.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/integration/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/integration/test_hypervisor_e2e.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/integration/test_scenarios.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_classifier.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_shapley_attribution.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/__init__.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_audit.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_cli.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_config_validation.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_liability_improvements.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_models.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_prometheus_collector.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_reversibility_registry.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_ring_improvements.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_rings.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga_improvements.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga_schema.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga_span_exporter.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_session.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_session_security.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_slashing.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_vfs_substrate.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/execution-rings-workflow/README.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/execution-rings-workflow/demo.py +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/saga-compensation/README.md +0 -0
- {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/saga-compensation/demo.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agent_hypervisor
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.6.0
|
|
4
4
|
Summary: Public Preview — Agent Hypervisor: Runtime supervisor for multi-agent Shared Sessions with Execution Rings, Joint Liability, Saga Orchestration, and hash-chained audit trails
|
|
5
5
|
Project-URL: Homepage, https://github.com/microsoft/agent-governance-toolkit
|
|
6
6
|
Project-URL: Repository, https://github.com/microsoft/agent-governance-toolkit
|
|
@@ -35,7 +35,7 @@ Requires-Dist: web3<8.0,>=6.0.0; extra == 'blockchain'
|
|
|
35
35
|
Provides-Extra: dev
|
|
36
36
|
Requires-Dist: hypothesis<7.0,>=6.0.0; extra == 'dev'
|
|
37
37
|
Requires-Dist: jsonschema<5.0,>=4.0.0; extra == 'dev'
|
|
38
|
-
Requires-Dist: mypy<
|
|
38
|
+
Requires-Dist: mypy<3.0,>=1.8.0; extra == 'dev'
|
|
39
39
|
Requires-Dist: pytest-asyncio<2.0,>=0.23.0; extra == 'dev'
|
|
40
40
|
Requires-Dist: pytest-cov<8.0,>=4.0.0; extra == 'dev'
|
|
41
41
|
Requires-Dist: pytest<10.0,>=8.0.0; extra == 'dev'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
agent-hypervisor>=3.5.0
|
|
@@ -11,7 +11,8 @@ Interactive Jupyter notebooks for exploring the **agent-hypervisor** runtime.
|
|
|
11
11
|
## Quick Start
|
|
12
12
|
|
|
13
13
|
```bash
|
|
14
|
-
# From the
|
|
14
|
+
# From the agent-hypervisor package root
|
|
15
|
+
cd agent-governance-python/agent-hypervisor
|
|
15
16
|
pip install -e ".[dev]" plotly nest-asyncio
|
|
16
17
|
jupyter notebook notebooks/
|
|
17
18
|
```
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "agent_hypervisor"
|
|
7
|
-
version = "3.
|
|
7
|
+
version = "3.6.0"
|
|
8
8
|
description = "Public Preview — Agent Hypervisor: Runtime supervisor for multi-agent Shared Sessions with Execution Rings, Joint Liability, Saga Orchestration, and hash-chained audit trails"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -58,7 +58,7 @@ dev = [
|
|
|
58
58
|
"pytest-cov>=4.0.0,<8.0",
|
|
59
59
|
"hypothesis>=6.0.0,<7.0",
|
|
60
60
|
"ruff>=0.4.0,<1.0",
|
|
61
|
-
"mypy>=1.8.0,<
|
|
61
|
+
"mypy>=1.8.0,<3.0",
|
|
62
62
|
"jsonschema>=4.0.0,<5.0",
|
|
63
63
|
]
|
|
64
64
|
blockchain = [
|
|
@@ -180,19 +180,16 @@ async def get_stats() -> StatsResponse:
|
|
|
180
180
|
"""Get overall hypervisor statistics."""
|
|
181
181
|
hv = _hv()
|
|
182
182
|
bus = _bus()
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
)
|
|
186
|
-
active_sagas = sum(
|
|
187
|
-
len(m.saga.active_sagas) for m in hv._sessions.values()
|
|
188
|
-
)
|
|
183
|
+
sessions = hv.sessions
|
|
184
|
+
total_participants = sum(m.sso.participant_count for m in sessions)
|
|
185
|
+
active_sagas = sum(len(m.saga.active_sagas) for m in sessions)
|
|
189
186
|
return StatsResponse(
|
|
190
187
|
version=__version__,
|
|
191
|
-
total_sessions=
|
|
188
|
+
total_sessions=hv.session_count,
|
|
192
189
|
active_sessions=len(hv.active_sessions),
|
|
193
190
|
total_participants=total_participants,
|
|
194
191
|
active_sagas=active_sagas,
|
|
195
|
-
total_vouches=
|
|
192
|
+
total_vouches=hv.vouching.vouch_count,
|
|
196
193
|
event_count=bus.event_count,
|
|
197
194
|
)
|
|
198
195
|
|
|
@@ -306,6 +306,23 @@ class Hypervisor:
|
|
|
306
306
|
return [self._sessions[sid] for sid in self._active_ids
|
|
307
307
|
if sid in self._sessions]
|
|
308
308
|
|
|
309
|
+
@property
|
|
310
|
+
def sessions(self) -> list[ManagedSession]:
|
|
311
|
+
"""All managed sessions, including archived/terminating ones.
|
|
312
|
+
|
|
313
|
+
``active_sessions`` filters via ``_active_ids``; this property
|
|
314
|
+
exposes the full registry for callers (admin APIs, monitoring,
|
|
315
|
+
stats) that need a count or iterator over every session the
|
|
316
|
+
Hypervisor is still tracking. Returns a snapshot list so callers
|
|
317
|
+
can iterate without holding any internal reference.
|
|
318
|
+
"""
|
|
319
|
+
return list(self._sessions.values())
|
|
320
|
+
|
|
321
|
+
@property
|
|
322
|
+
def session_count(self) -> int:
|
|
323
|
+
"""Total number of managed sessions, including archived/terminating."""
|
|
324
|
+
return len(self._sessions)
|
|
325
|
+
|
|
309
326
|
def _get_session(self, session_id: str) -> ManagedSession:
|
|
310
327
|
managed = self._sessions.get(session_id)
|
|
311
328
|
if not managed:
|
|
@@ -57,6 +57,11 @@ class VouchingEngine:
|
|
|
57
57
|
self._vouches: dict[str, VouchRecord] = {}
|
|
58
58
|
self.max_exposure = max_exposure or self.DEFAULT_MAX_EXPOSURE
|
|
59
59
|
|
|
60
|
+
@property
|
|
61
|
+
def vouch_count(self) -> int:
|
|
62
|
+
"""Total number of sponsorship records (active + released)."""
|
|
63
|
+
return len(self._vouches)
|
|
64
|
+
|
|
60
65
|
def vouch(
|
|
61
66
|
self,
|
|
62
67
|
voucher_did: str,
|
|
@@ -10,13 +10,21 @@ full replay debugging, post-mortem analysis, and real-time monitoring.
|
|
|
10
10
|
|
|
11
11
|
from __future__ import annotations
|
|
12
12
|
|
|
13
|
+
import threading
|
|
13
14
|
import uuid
|
|
15
|
+
from collections import deque
|
|
14
16
|
from collections.abc import Callable
|
|
15
17
|
from dataclasses import dataclass, field
|
|
16
18
|
from datetime import UTC, datetime
|
|
17
19
|
from enum import Enum
|
|
18
20
|
from typing import Any
|
|
19
21
|
|
|
22
|
+
# Default cap for the in-memory event store. Hypervisor deployments run for
|
|
23
|
+
# weeks; an unbounded list eventually OOMs. The cap is configurable via the
|
|
24
|
+
# ``HypervisorEventBus(max_events=...)`` constructor; ``None`` opts back into
|
|
25
|
+
# unbounded growth for tests or analysis tooling that needs full history.
|
|
26
|
+
DEFAULT_MAX_EVENTS = 100_000
|
|
27
|
+
|
|
20
28
|
|
|
21
29
|
class EventType(str, Enum):
|
|
22
30
|
"""Categorised hypervisor event types."""
|
|
@@ -119,34 +127,61 @@ class HypervisorEventBus:
|
|
|
119
127
|
- Event count and statistics
|
|
120
128
|
"""
|
|
121
129
|
|
|
122
|
-
def __init__(self) -> None:
|
|
123
|
-
|
|
130
|
+
def __init__(self, max_events: int | None = DEFAULT_MAX_EVENTS) -> None:
|
|
131
|
+
"""Create an event bus.
|
|
132
|
+
|
|
133
|
+
``max_events`` caps the in-memory store. Each per-key index list
|
|
134
|
+
(by-type, by-session, by-agent) is independently capped to the
|
|
135
|
+
same value, so a single chatty session cannot starve the
|
|
136
|
+
history of other sessions. Pass ``None`` to disable the cap
|
|
137
|
+
(testing or full-replay tooling).
|
|
138
|
+
"""
|
|
139
|
+
self._max_events = max_events
|
|
140
|
+
# `deque` with `maxlen` evicts the oldest entry on overflow in
|
|
141
|
+
# O(1), avoiding the OOM cliff of an unbounded `list`.
|
|
142
|
+
self._events: deque[HypervisorEvent] = deque(maxlen=max_events)
|
|
124
143
|
self._subscribers: dict[EventType | None, list[EventHandler]] = {}
|
|
125
|
-
self._by_type: dict[EventType,
|
|
126
|
-
self._by_session: dict[str,
|
|
127
|
-
self._by_agent: dict[str,
|
|
144
|
+
self._by_type: dict[EventType, deque[HypervisorEvent]] = {}
|
|
145
|
+
self._by_session: dict[str, deque[HypervisorEvent]] = {}
|
|
146
|
+
self._by_agent: dict[str, deque[HypervisorEvent]] = {}
|
|
147
|
+
# Use an RLock so a subscriber that re-enters the bus (e.g.
|
|
148
|
+
# emits an event in response to another event) doesn't deadlock.
|
|
149
|
+
self._lock = threading.RLock()
|
|
150
|
+
|
|
151
|
+
def _new_index_deque(self) -> deque[HypervisorEvent]:
|
|
152
|
+
return deque(maxlen=self._max_events)
|
|
128
153
|
|
|
129
154
|
def emit(self, event: HypervisorEvent) -> None:
|
|
130
155
|
"""Append an event and notify subscribers."""
|
|
131
|
-
self.
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
156
|
+
with self._lock:
|
|
157
|
+
self._events.append(event)
|
|
158
|
+
|
|
159
|
+
self._by_type.setdefault(
|
|
160
|
+
event.event_type, self._new_index_deque()
|
|
161
|
+
).append(event)
|
|
162
|
+
|
|
163
|
+
if event.session_id:
|
|
164
|
+
self._by_session.setdefault(
|
|
165
|
+
event.session_id, self._new_index_deque()
|
|
166
|
+
).append(event)
|
|
167
|
+
|
|
168
|
+
if event.agent_did:
|
|
169
|
+
self._by_agent.setdefault(
|
|
170
|
+
event.agent_did, self._new_index_deque()
|
|
171
|
+
).append(event)
|
|
172
|
+
|
|
173
|
+
# Snapshot subscriber lists while holding the lock so a
|
|
174
|
+
# subscriber that mutates the registry mid-notify doesn't
|
|
175
|
+
# invalidate iteration.
|
|
176
|
+
type_subs = list(self._subscribers.get(event.event_type, ()))
|
|
177
|
+
wildcard_subs = list(self._subscribers.get(None, ()))
|
|
178
|
+
|
|
179
|
+
# Invoke handlers outside the lock so a slow subscriber can't
|
|
180
|
+
# serialize the entire bus or, worse, deadlock with a caller
|
|
181
|
+
# that also holds an external lock.
|
|
182
|
+
for handler in type_subs:
|
|
146
183
|
handler(event)
|
|
147
|
-
|
|
148
|
-
# Notify wildcard subscribers
|
|
149
|
-
for handler in self._subscribers.get(None, []):
|
|
184
|
+
for handler in wildcard_subs:
|
|
150
185
|
handler(event)
|
|
151
186
|
|
|
152
187
|
def subscribe(
|
|
@@ -155,20 +190,25 @@ class HypervisorEventBus:
|
|
|
155
190
|
handler: EventHandler | None = None,
|
|
156
191
|
) -> None:
|
|
157
192
|
"""Subscribe to events. Use event_type=None for all events."""
|
|
158
|
-
if handler:
|
|
193
|
+
if not handler:
|
|
194
|
+
return
|
|
195
|
+
with self._lock:
|
|
159
196
|
self._subscribers.setdefault(event_type, []).append(handler)
|
|
160
197
|
|
|
161
198
|
def query_by_type(self, event_type: EventType) -> list[HypervisorEvent]:
|
|
162
199
|
"""Get all events of a specific type."""
|
|
163
|
-
|
|
200
|
+
with self._lock:
|
|
201
|
+
return list(self._by_type.get(event_type, ()))
|
|
164
202
|
|
|
165
203
|
def query_by_session(self, session_id: str) -> list[HypervisorEvent]:
|
|
166
204
|
"""Get all events for a specific session."""
|
|
167
|
-
|
|
205
|
+
with self._lock:
|
|
206
|
+
return list(self._by_session.get(session_id, ()))
|
|
168
207
|
|
|
169
208
|
def query_by_agent(self, agent_did: str) -> list[HypervisorEvent]:
|
|
170
209
|
"""Get all events involving a specific agent."""
|
|
171
|
-
|
|
210
|
+
with self._lock:
|
|
211
|
+
return list(self._by_agent.get(agent_did, ()))
|
|
172
212
|
|
|
173
213
|
def query_by_time_range(
|
|
174
214
|
self,
|
|
@@ -178,7 +218,8 @@ class HypervisorEventBus:
|
|
|
178
218
|
"""Get events within a time range."""
|
|
179
219
|
if end is None:
|
|
180
220
|
end = datetime.now(UTC)
|
|
181
|
-
|
|
221
|
+
with self._lock:
|
|
222
|
+
return [e for e in self._events if start <= e.timestamp <= end]
|
|
182
223
|
|
|
183
224
|
def query(
|
|
184
225
|
self,
|
|
@@ -188,7 +229,8 @@ class HypervisorEventBus:
|
|
|
188
229
|
limit: int | None = None,
|
|
189
230
|
) -> list[HypervisorEvent]:
|
|
190
231
|
"""Flexible query with multiple filters."""
|
|
191
|
-
|
|
232
|
+
with self._lock:
|
|
233
|
+
results: list[HypervisorEvent] = list(self._events)
|
|
192
234
|
|
|
193
235
|
if event_type is not None:
|
|
194
236
|
results = [e for e in results if e.event_type == event_type]
|
|
@@ -204,19 +246,35 @@ class HypervisorEventBus:
|
|
|
204
246
|
|
|
205
247
|
@property
|
|
206
248
|
def event_count(self) -> int:
|
|
207
|
-
|
|
249
|
+
with self._lock:
|
|
250
|
+
return len(self._events)
|
|
208
251
|
|
|
209
252
|
@property
|
|
210
253
|
def all_events(self) -> list[HypervisorEvent]:
|
|
211
|
-
|
|
254
|
+
with self._lock:
|
|
255
|
+
return list(self._events)
|
|
212
256
|
|
|
213
257
|
def type_counts(self) -> dict[str, int]:
|
|
214
258
|
"""Return count of events per type."""
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
259
|
+
with self._lock:
|
|
260
|
+
return {t.value: len(evts) for t, evts in self._by_type.items()}
|
|
261
|
+
|
|
262
|
+
def _clear(self) -> None:
|
|
263
|
+
"""Clear all events. **Test-only — do not call in production.**
|
|
264
|
+
|
|
265
|
+
The event bus is wired into the hypervisor as a long-lived,
|
|
266
|
+
process-singleton-shaped collaborator (see
|
|
267
|
+
``hypervisor.api.server._event_bus``): production calls would
|
|
268
|
+
wipe the audit trail of every running session at once.
|
|
269
|
+
|
|
270
|
+
The leading underscore makes the test-only contract visible at
|
|
271
|
+
every call site. The method is kept on the class (rather than
|
|
272
|
+
moved to a test helper) because some tests construct a fresh
|
|
273
|
+
bus and then exercise the clear path itself; it just shouldn't
|
|
274
|
+
be reached from non-test code.
|
|
275
|
+
"""
|
|
276
|
+
with self._lock:
|
|
277
|
+
self._events.clear()
|
|
278
|
+
self._by_type.clear()
|
|
279
|
+
self._by_session.clear()
|
|
280
|
+
self._by_agent.clear()
|
|
@@ -69,27 +69,34 @@ def get_liability_engine(**kwargs: Any):
|
|
|
69
69
|
"""Get the best available liability engine.
|
|
70
70
|
|
|
71
71
|
Advanced: Shapley-value fault attribution with vouch cascades.
|
|
72
|
-
Community:
|
|
72
|
+
Community: ``LiabilityMatrix`` from ``hypervisor.liability``.
|
|
73
73
|
"""
|
|
74
74
|
provider = _discover_provider(PROVIDER_GROUPS["liability"])
|
|
75
75
|
if provider is not None:
|
|
76
76
|
return provider(**kwargs)
|
|
77
77
|
|
|
78
|
-
|
|
79
|
-
|
|
78
|
+
# Community fallback. The previous import targeted
|
|
79
|
+
# ``hypervisor.liability.engine.LiabilityEngine`` which does not
|
|
80
|
+
# exist in this tree; ``LiabilityMatrix`` is the real public-
|
|
81
|
+
# edition entry point.
|
|
82
|
+
from hypervisor.liability import LiabilityMatrix
|
|
83
|
+
return LiabilityMatrix(**kwargs)
|
|
80
84
|
|
|
81
85
|
|
|
82
86
|
def get_saga_engine(**kwargs: Any):
|
|
83
87
|
"""Get the best available saga orchestration engine.
|
|
84
88
|
|
|
85
89
|
Advanced: Multi-pattern saga with parallel fan-out and escalation.
|
|
86
|
-
Community:
|
|
90
|
+
Community: ``SagaOrchestrator`` from ``hypervisor.saga.orchestrator``.
|
|
87
91
|
"""
|
|
88
92
|
provider = _discover_provider(PROVIDER_GROUPS["saga_engine"])
|
|
89
93
|
if provider is not None:
|
|
90
94
|
return provider(**kwargs)
|
|
91
95
|
|
|
92
|
-
|
|
96
|
+
# Community fallback. The previous import targeted
|
|
97
|
+
# ``hypervisor.saga.engine.SagaOrchestrator`` which does not exist
|
|
98
|
+
# in this tree; the real module is ``hypervisor.saga.orchestrator``.
|
|
99
|
+
from hypervisor.saga.orchestrator import SagaOrchestrator
|
|
93
100
|
return SagaOrchestrator(**kwargs)
|
|
94
101
|
|
|
95
102
|
|
|
@@ -127,8 +127,24 @@ class RingBreachDetector:
|
|
|
127
127
|
window.popleft()
|
|
128
128
|
|
|
129
129
|
# --- 3. Compute actual rate (calls / second) ---
|
|
130
|
+
# Dividing by the full ``window_seconds`` underestimates the rate
|
|
131
|
+
# when the window has just begun — 10 calls in the first 2s of a
|
|
132
|
+
# 60s window would read as 0.16/s instead of 5/s, missing real
|
|
133
|
+
# bursts. Use the shorter of (window, time_since_first_event)
|
|
134
|
+
# as the denominator so early bursts surface accurately. A single
|
|
135
|
+
# call has no rate to measure (need ≥2 samples for an interval),
|
|
136
|
+
# so fall back to the conservative full-window divisor.
|
|
130
137
|
call_count = len(window)
|
|
131
|
-
|
|
138
|
+
if self.window_seconds <= 0 or call_count == 0:
|
|
139
|
+
actual_rate = 0.0
|
|
140
|
+
elif call_count < 2:
|
|
141
|
+
actual_rate = call_count / self.window_seconds
|
|
142
|
+
else:
|
|
143
|
+
time_since_first = max(now - window[0], 0.0)
|
|
144
|
+
# Floor at 1ms: prevents divide-by-zero for ultra-tight
|
|
145
|
+
# bursts while still surfacing them with a high rate.
|
|
146
|
+
denominator = max(min(self.window_seconds, time_since_first), 1e-3)
|
|
147
|
+
actual_rate = call_count / denominator
|
|
132
148
|
|
|
133
149
|
# --- 4. Ring-distance amplifier ---
|
|
134
150
|
# Upward calls (low value = higher privilege) are escalations.
|
|
@@ -90,6 +90,18 @@ class SagaOrchestrator:
|
|
|
90
90
|
"""
|
|
91
91
|
Execute a single saga step with timeout and retry support.
|
|
92
92
|
|
|
93
|
+
Cancellation semantics on timeout:
|
|
94
|
+
``asyncio.wait_for`` cancels the wrapped coroutine on timeout
|
|
95
|
+
*and* awaits the cancellation before raising ``TimeoutError``,
|
|
96
|
+
so a cooperative executor (one with ``await`` points)
|
|
97
|
+
receives ``CancelledError`` and has a chance to release
|
|
98
|
+
resources before this method moves on to FAILED. An executor
|
|
99
|
+
with no ``await`` points (synchronous CPU work inside an
|
|
100
|
+
``async def``) is not cancellable by Python — the timeout
|
|
101
|
+
will only fire once the executor yields control.
|
|
102
|
+
Callers needing hard-kill semantics must run such executors
|
|
103
|
+
in a process or thread pool and arrange external termination.
|
|
104
|
+
|
|
93
105
|
Args:
|
|
94
106
|
saga_id: Saga identifier
|
|
95
107
|
step_id: Step identifier
|
|
@@ -127,9 +139,11 @@ class SagaOrchestrator:
|
|
|
127
139
|
step.error = str(last_error)
|
|
128
140
|
step.transition(StepState.FAILED)
|
|
129
141
|
if attempt < attempts - 1:
|
|
130
|
-
#
|
|
131
|
-
|
|
132
|
-
|
|
142
|
+
# Move FAILED → PENDING through the state table,
|
|
143
|
+
# not by direct mutation. Bypassing transition()
|
|
144
|
+
# would skip the validity check and the timestamp
|
|
145
|
+
# bookkeeping.
|
|
146
|
+
step.reset_for_retry()
|
|
133
147
|
await asyncio.sleep(
|
|
134
148
|
self.DEFAULT_RETRY_DELAY_SECONDS * (attempt + 1)
|
|
135
149
|
)
|
|
@@ -138,8 +152,7 @@ class SagaOrchestrator:
|
|
|
138
152
|
step.error = str(e)
|
|
139
153
|
step.transition(StepState.FAILED)
|
|
140
154
|
if attempt < attempts - 1:
|
|
141
|
-
step.
|
|
142
|
-
step.error = None
|
|
155
|
+
step.reset_for_retry()
|
|
143
156
|
await asyncio.sleep(
|
|
144
157
|
self.DEFAULT_RETRY_DELAY_SECONDS * (attempt + 1)
|
|
145
158
|
)
|
|
@@ -157,6 +170,9 @@ class SagaOrchestrator:
|
|
|
157
170
|
"""
|
|
158
171
|
Run compensation (rollback) for all committed steps in reverse order.
|
|
159
172
|
|
|
173
|
+
Cancellation semantics on timeout match ``execute_step``: see that
|
|
174
|
+
method's docstring for the cooperative-cancel contract.
|
|
175
|
+
|
|
160
176
|
Args:
|
|
161
177
|
saga_id: Saga identifier
|
|
162
178
|
compensator: Async callable that takes a SagaStep and calls its Undo_API
|
|
@@ -45,7 +45,11 @@ STEP_TRANSITIONS: dict[StepState, set[StepState]] = {
|
|
|
45
45
|
StepState.COMPENSATING: {StepState.COMPENSATED, StepState.COMPENSATION_FAILED},
|
|
46
46
|
StepState.COMPENSATED: set(),
|
|
47
47
|
StepState.COMPENSATION_FAILED: set(),
|
|
48
|
-
|
|
48
|
+
# FAILED → PENDING is allowed only via reset_for_retry, which is
|
|
49
|
+
# the documented retry path. The transition table mirrors that
|
|
50
|
+
# explicitly so reset_for_retry can call transition() instead of
|
|
51
|
+
# mutating state directly and bypassing the table.
|
|
52
|
+
StepState.FAILED: {StepState.PENDING},
|
|
49
53
|
}
|
|
50
54
|
|
|
51
55
|
SAGA_TRANSITIONS: dict[SagaState, set[SagaState]] = {
|
|
@@ -96,6 +100,22 @@ class SagaStep:
|
|
|
96
100
|
):
|
|
97
101
|
self.completed_at = now
|
|
98
102
|
|
|
103
|
+
def reset_for_retry(self) -> None:
|
|
104
|
+
"""Move a FAILED step back to PENDING for another execution attempt.
|
|
105
|
+
|
|
106
|
+
Goes through ``transition()`` rather than mutating ``state``
|
|
107
|
+
directly so the move is recorded in the state table. Also
|
|
108
|
+
clears the per-attempt error and completion timestamp so the
|
|
109
|
+
next attempt starts from a clean slate.
|
|
110
|
+
|
|
111
|
+
Raises:
|
|
112
|
+
SagaStateError: If the step is not currently in FAILED
|
|
113
|
+
state. Retries are only valid from FAILED.
|
|
114
|
+
"""
|
|
115
|
+
self.transition(StepState.PENDING)
|
|
116
|
+
self.error = None
|
|
117
|
+
self.completed_at = None
|
|
118
|
+
|
|
99
119
|
|
|
100
120
|
@dataclass
|
|
101
121
|
class Saga:
|
|
@@ -10,6 +10,7 @@ in-flight saga steps to a substitute agent when one is available.
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
12
|
import logging
|
|
13
|
+
import threading
|
|
13
14
|
import uuid
|
|
14
15
|
from collections.abc import Callable
|
|
15
16
|
from dataclasses import dataclass, field
|
|
@@ -18,6 +19,11 @@ from enum import Enum
|
|
|
18
19
|
|
|
19
20
|
_logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
22
|
+
# Maximum wall time we wait for an agent's termination callback to complete
|
|
23
|
+
# before declaring it hung. The kill switch must remain responsive — a slow
|
|
24
|
+
# callback should not block the entire kill flow.
|
|
25
|
+
DEFAULT_CALLBACK_TIMEOUT_SECONDS = 5.0
|
|
26
|
+
|
|
21
27
|
|
|
22
28
|
class KillReason(str, Enum):
|
|
23
29
|
"""Why an agent was killed."""
|
|
@@ -76,10 +82,16 @@ class KillSwitch:
|
|
|
76
82
|
callback to stop the agent process.
|
|
77
83
|
"""
|
|
78
84
|
|
|
79
|
-
def __init__(
|
|
85
|
+
def __init__(
|
|
86
|
+
self, callback_timeout: float = DEFAULT_CALLBACK_TIMEOUT_SECONDS
|
|
87
|
+
) -> None:
|
|
80
88
|
self._kill_history: list[KillResult] = []
|
|
81
89
|
self._substitutes: dict[str, list[str]] = {}
|
|
82
90
|
self._agents: dict[str, Callable[[], None]] = {}
|
|
91
|
+
self._callback_timeout = callback_timeout
|
|
92
|
+
# RLock so a callback that itself re-enters the kill switch
|
|
93
|
+
# (e.g. unregisters another agent) does not deadlock.
|
|
94
|
+
self._lock = threading.RLock()
|
|
83
95
|
|
|
84
96
|
# ── Agent process registry ─────────────────────────────────────
|
|
85
97
|
|
|
@@ -87,11 +99,13 @@ class KillSwitch:
|
|
|
87
99
|
self, agent_did: str, process_handle: Callable[[], None]
|
|
88
100
|
) -> None:
|
|
89
101
|
"""Register an agent with its termination callback."""
|
|
90
|
-
self.
|
|
102
|
+
with self._lock:
|
|
103
|
+
self._agents[agent_did] = process_handle
|
|
91
104
|
|
|
92
105
|
def unregister_agent(self, agent_did: str) -> None:
|
|
93
106
|
"""Remove an agent from the process registry."""
|
|
94
|
-
self.
|
|
107
|
+
with self._lock:
|
|
108
|
+
self._agents.pop(agent_did, None)
|
|
95
109
|
|
|
96
110
|
# ── Substitute management ──────────────────────────────────────
|
|
97
111
|
|
|
@@ -99,14 +113,16 @@ class KillSwitch:
|
|
|
99
113
|
self, session_id: str, agent_did: str
|
|
100
114
|
) -> None:
|
|
101
115
|
"""Register a substitute agent for a session."""
|
|
102
|
-
self.
|
|
116
|
+
with self._lock:
|
|
117
|
+
self._substitutes.setdefault(session_id, []).append(agent_did)
|
|
103
118
|
|
|
104
119
|
def unregister_substitute(
|
|
105
120
|
self, session_id: str, agent_did: str
|
|
106
121
|
) -> None:
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
subs
|
|
122
|
+
with self._lock:
|
|
123
|
+
subs = self._substitutes.get(session_id, [])
|
|
124
|
+
if agent_did in subs:
|
|
125
|
+
subs.remove(agent_did)
|
|
110
126
|
|
|
111
127
|
# ── Kill ───────────────────────────────────────────────────────
|
|
112
128
|
|
|
@@ -118,11 +134,25 @@ class KillSwitch:
|
|
|
118
134
|
in_flight_steps: list[dict] | None = None,
|
|
119
135
|
details: str = "",
|
|
120
136
|
) -> KillResult:
|
|
121
|
-
"""Kill an agent, handing off in-flight steps to a substitute if available.
|
|
137
|
+
"""Kill an agent, handing off in-flight steps to a substitute if available.
|
|
138
|
+
|
|
139
|
+
Registration invariant: the agent is unregistered from the
|
|
140
|
+
process registry **unconditionally** at the end of this method,
|
|
141
|
+
regardless of whether the termination callback succeeded
|
|
142
|
+
(``terminated=True``) or failed/timed out (``terminated=False``).
|
|
143
|
+
This is intentional. The kill *intent* is durably recorded in
|
|
144
|
+
``_kill_history`` and surfaced via the returned ``KillResult``;
|
|
145
|
+
leaving the callback registered would falsely advertise the
|
|
146
|
+
agent as live and re-callable when its process state is
|
|
147
|
+
actually unknown. Callers who detect ``terminated=False`` and
|
|
148
|
+
want to retry must re-register the agent (presumably with a
|
|
149
|
+
new, working callback) before issuing the second ``kill()``.
|
|
150
|
+
"""
|
|
122
151
|
in_flight = in_flight_steps or []
|
|
123
152
|
|
|
124
|
-
|
|
125
|
-
|
|
153
|
+
with self._lock:
|
|
154
|
+
substitute = self._find_substitute(session_id, agent_did)
|
|
155
|
+
callback = self._agents.get(agent_did)
|
|
126
156
|
|
|
127
157
|
handoffs: list[StepHandoff] = []
|
|
128
158
|
handoff_success_count = 0
|
|
@@ -148,12 +178,14 @@ class KillSwitch:
|
|
|
148
178
|
)
|
|
149
179
|
)
|
|
150
180
|
|
|
151
|
-
#
|
|
181
|
+
# Invoke the termination callback *outside* the lock and with a
|
|
182
|
+
# wall-clock timeout. A slow or hung callback must not freeze the
|
|
183
|
+
# kill flow — the whole point of a kill switch is responsiveness.
|
|
152
184
|
terminated = False
|
|
153
|
-
callback = self._agents.get(agent_did)
|
|
154
185
|
if callback is not None:
|
|
155
|
-
|
|
156
|
-
|
|
186
|
+
terminated = self._invoke_callback_with_timeout(
|
|
187
|
+
agent_did, callback
|
|
188
|
+
)
|
|
157
189
|
else:
|
|
158
190
|
_logger.warning(
|
|
159
191
|
"No termination callback registered for agent %s",
|
|
@@ -172,11 +204,53 @@ class KillSwitch:
|
|
|
172
204
|
terminated=terminated,
|
|
173
205
|
details=details,
|
|
174
206
|
)
|
|
175
|
-
self.
|
|
207
|
+
with self._lock:
|
|
208
|
+
self._kill_history.append(result)
|
|
176
209
|
self.unregister_substitute(session_id, agent_did)
|
|
177
210
|
self.unregister_agent(agent_did)
|
|
178
211
|
return result
|
|
179
212
|
|
|
213
|
+
def _invoke_callback_with_timeout(
|
|
214
|
+
self, agent_did: str, callback: Callable[[], None]
|
|
215
|
+
) -> bool:
|
|
216
|
+
"""Run *callback* in a daemon thread bounded by ``callback_timeout``.
|
|
217
|
+
|
|
218
|
+
Returns ``True`` if the callback completed cleanly within the
|
|
219
|
+
timeout, ``False`` if it timed out or raised. A hung callback
|
|
220
|
+
is left to its fate (daemon thread); the kill switch returns
|
|
221
|
+
and remains usable for the next kill.
|
|
222
|
+
"""
|
|
223
|
+
error_box: list[BaseException] = []
|
|
224
|
+
|
|
225
|
+
def _runner() -> None:
|
|
226
|
+
try:
|
|
227
|
+
callback()
|
|
228
|
+
except BaseException as exc: # noqa: BLE001 — surface but don't propagate
|
|
229
|
+
error_box.append(exc)
|
|
230
|
+
|
|
231
|
+
thread = threading.Thread(
|
|
232
|
+
target=_runner, name=f"kill-callback:{agent_did}", daemon=True
|
|
233
|
+
)
|
|
234
|
+
thread.start()
|
|
235
|
+
thread.join(timeout=self._callback_timeout)
|
|
236
|
+
|
|
237
|
+
if thread.is_alive():
|
|
238
|
+
_logger.error(
|
|
239
|
+
"Termination callback for %s exceeded %.2fs; leaving daemon thread to drain",
|
|
240
|
+
agent_did,
|
|
241
|
+
self._callback_timeout,
|
|
242
|
+
)
|
|
243
|
+
return False
|
|
244
|
+
if error_box:
|
|
245
|
+
_logger.error(
|
|
246
|
+
"Termination callback for %s raised %s: %s",
|
|
247
|
+
agent_did,
|
|
248
|
+
type(error_box[0]).__name__,
|
|
249
|
+
error_box[0],
|
|
250
|
+
)
|
|
251
|
+
return False
|
|
252
|
+
return True
|
|
253
|
+
|
|
180
254
|
def _find_substitute(
|
|
181
255
|
self, session_id: str, exclude_did: str
|
|
182
256
|
) -> str | None:
|