agent_hypervisor 3.4.0__tar.gz → 3.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/.gitignore +1 -0
  2. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/PKG-INFO +2 -2
  3. agent_hypervisor-3.6.0/examples/requirements.txt +1 -0
  4. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/notebooks/README.md +2 -1
  5. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/pyproject.toml +2 -2
  6. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/server.py +5 -8
  7. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/core.py +17 -0
  8. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/vouching.py +5 -0
  9. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/event_bus.py +97 -39
  10. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/providers.py +12 -5
  11. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/breach_detector.py +17 -1
  12. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/orchestrator.py +21 -5
  13. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/state_machine.py +21 -1
  14. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/kill_switch.py +89 -15
  15. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/rate_limiter.py +65 -39
  16. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/sso.py +15 -1
  17. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_agent_manager.py +38 -0
  18. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_breach_detector.py +44 -0
  19. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_kill_switch.py +71 -0
  20. agent_hypervisor-3.6.0/tests/test_providers.py +44 -0
  21. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_rate_limiter.py +127 -0
  22. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_liability.py +15 -0
  23. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_observability.py +103 -1
  24. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga.py +34 -0
  25. agent_hypervisor-3.4.0/examples/requirements.txt +0 -1
  26. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/CHANGELOG.md +0 -0
  27. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/LICENSE +0 -0
  28. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/README.md +0 -0
  29. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/SECURITY.md +0 -0
  30. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/benchmarks/bench_hypervisor.py +0 -0
  31. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/benchmarks/results/BENCHMARKS.md +0 -0
  32. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/benchmarks/results/benchmarks.json +0 -0
  33. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/docs/api-reference.md +0 -0
  34. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/docs/joint-liability-guide.md +0 -0
  35. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/dashboard/README.md +0 -0
  36. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/dashboard/app.py +0 -0
  37. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/dashboard/requirements.txt +0 -0
  38. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/demo.py +0 -0
  39. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/Dockerfile +0 -0
  40. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/README.md +0 -0
  41. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/__init__.py +0 -0
  42. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/dashboard.py +0 -0
  43. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/sample_workflow.py +0 -0
  44. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/app/server.py +0 -0
  45. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/config/hypervisor.yaml +0 -0
  46. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/examples/docker-compose/docker-compose.yml +0 -0
  47. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/notebooks/hypervisor-exploration.ipynb +0 -0
  48. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/__init__.py +0 -0
  49. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/__init__.py +0 -0
  50. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/models.py +0 -0
  51. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/__init__.py +0 -0
  52. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/commitment.py +0 -0
  53. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/delta.py +0 -0
  54. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/audit/gc.py +0 -0
  55. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/cli/__init__.py +0 -0
  56. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/cli/formatters.py +0 -0
  57. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/cli/session_commands.py +0 -0
  58. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/constants.py +0 -0
  59. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/__init__.py +0 -0
  60. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/iatp_adapter.py +0 -0
  61. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/nexus_adapter.py +0 -0
  62. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/integrations/verification_adapter.py +0 -0
  63. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/__init__.py +0 -0
  64. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/attribution.py +0 -0
  65. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/ledger.py +0 -0
  66. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/quarantine.py +0 -0
  67. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/slashing.py +0 -0
  68. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/models.py +0 -0
  69. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/__init__.py +0 -0
  70. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/causal_trace.py +0 -0
  71. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/prometheus_collector.py +0 -0
  72. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/saga_span_exporter.py +0 -0
  73. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/py.typed +0 -0
  74. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/reversibility/__init__.py +0 -0
  75. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/reversibility/registry.py +0 -0
  76. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/__init__.py +0 -0
  77. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/classifier.py +0 -0
  78. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/elevation.py +0 -0
  79. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/enforcer.py +0 -0
  80. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/__init__.py +0 -0
  81. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/checkpoint.py +0 -0
  82. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/dsl.py +0 -0
  83. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/fan_out.py +0 -0
  84. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/schema.py +0 -0
  85. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/__init__.py +0 -0
  86. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/__init__.py +0 -0
  87. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/intent_locks.py +0 -0
  88. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/isolation.py +0 -0
  89. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/session/vector_clock.py +0 -0
  90. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/verification/__init__.py +0 -0
  91. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/verification/history.py +0 -0
  92. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/__init__.py +0 -0
  93. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/integration/__init__.py +0 -0
  94. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/integration/test_hypervisor_e2e.py +0 -0
  95. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/integration/test_scenarios.py +0 -0
  96. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_classifier.py +0 -0
  97. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/test_shapley_attribution.py +0 -0
  98. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/__init__.py +0 -0
  99. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_audit.py +0 -0
  100. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_cli.py +0 -0
  101. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_config_validation.py +0 -0
  102. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_liability_improvements.py +0 -0
  103. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_models.py +0 -0
  104. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_prometheus_collector.py +0 -0
  105. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_reversibility_registry.py +0 -0
  106. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_ring_improvements.py +0 -0
  107. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_rings.py +0 -0
  108. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga_improvements.py +0 -0
  109. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga_schema.py +0 -0
  110. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_saga_span_exporter.py +0 -0
  111. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_session.py +0 -0
  112. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_session_security.py +0 -0
  113. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_slashing.py +0 -0
  114. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tests/unit/test_vfs_substrate.py +0 -0
  115. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/execution-rings-workflow/README.md +0 -0
  116. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/execution-rings-workflow/demo.py +0 -0
  117. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/saga-compensation/README.md +0 -0
  118. {agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/tutorials/saga-compensation/demo.py +0 -0
@@ -465,3 +465,4 @@ _site/
465
465
 
466
466
  # Code Security Assessment artifacts
467
467
  .security-assessment/
468
+ *.tgz
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agent_hypervisor
3
- Version: 3.4.0
3
+ Version: 3.6.0
4
4
  Summary: Public Preview — Agent Hypervisor: Runtime supervisor for multi-agent Shared Sessions with Execution Rings, Joint Liability, Saga Orchestration, and hash-chained audit trails
5
5
  Project-URL: Homepage, https://github.com/microsoft/agent-governance-toolkit
6
6
  Project-URL: Repository, https://github.com/microsoft/agent-governance-toolkit
@@ -35,7 +35,7 @@ Requires-Dist: web3<8.0,>=6.0.0; extra == 'blockchain'
35
35
  Provides-Extra: dev
36
36
  Requires-Dist: hypothesis<7.0,>=6.0.0; extra == 'dev'
37
37
  Requires-Dist: jsonschema<5.0,>=4.0.0; extra == 'dev'
38
- Requires-Dist: mypy<2.0,>=1.8.0; extra == 'dev'
38
+ Requires-Dist: mypy<3.0,>=1.8.0; extra == 'dev'
39
39
  Requires-Dist: pytest-asyncio<2.0,>=0.23.0; extra == 'dev'
40
40
  Requires-Dist: pytest-cov<8.0,>=4.0.0; extra == 'dev'
41
41
  Requires-Dist: pytest<10.0,>=8.0.0; extra == 'dev'
@@ -0,0 +1 @@
1
+ agent-hypervisor>=3.5.0
@@ -11,7 +11,8 @@ Interactive Jupyter notebooks for exploring the **agent-hypervisor** runtime.
11
11
  ## Quick Start
12
12
 
13
13
  ```bash
14
- # From the repository root
14
+ # From the agent-hypervisor package root
15
+ cd agent-governance-python/agent-hypervisor
15
16
  pip install -e ".[dev]" plotly nest-asyncio
16
17
  jupyter notebook notebooks/
17
18
  ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "agent_hypervisor"
7
- version = "3.4.0"
7
+ version = "3.6.0"
8
8
  description = "Public Preview — Agent Hypervisor: Runtime supervisor for multi-agent Shared Sessions with Execution Rings, Joint Liability, Saga Orchestration, and hash-chained audit trails"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -58,7 +58,7 @@ dev = [
58
58
  "pytest-cov>=4.0.0,<8.0",
59
59
  "hypothesis>=6.0.0,<7.0",
60
60
  "ruff>=0.4.0,<1.0",
61
- "mypy>=1.8.0,<2.0",
61
+ "mypy>=1.8.0,<3.0",
62
62
  "jsonschema>=4.0.0,<5.0",
63
63
  ]
64
64
  blockchain = [
@@ -180,19 +180,16 @@ async def get_stats() -> StatsResponse:
180
180
  """Get overall hypervisor statistics."""
181
181
  hv = _hv()
182
182
  bus = _bus()
183
- total_participants = sum(
184
- m.sso.participant_count for m in hv._sessions.values()
185
- )
186
- active_sagas = sum(
187
- len(m.saga.active_sagas) for m in hv._sessions.values()
188
- )
183
+ sessions = hv.sessions
184
+ total_participants = sum(m.sso.participant_count for m in sessions)
185
+ active_sagas = sum(len(m.saga.active_sagas) for m in sessions)
189
186
  return StatsResponse(
190
187
  version=__version__,
191
- total_sessions=len(hv._sessions),
188
+ total_sessions=hv.session_count,
192
189
  active_sessions=len(hv.active_sessions),
193
190
  total_participants=total_participants,
194
191
  active_sagas=active_sagas,
195
- total_vouches=len(hv.vouching._vouches),
192
+ total_vouches=hv.vouching.vouch_count,
196
193
  event_count=bus.event_count,
197
194
  )
198
195
 
@@ -306,6 +306,23 @@ class Hypervisor:
306
306
  return [self._sessions[sid] for sid in self._active_ids
307
307
  if sid in self._sessions]
308
308
 
309
+ @property
310
+ def sessions(self) -> list[ManagedSession]:
311
+ """All managed sessions, including archived/terminating ones.
312
+
313
+ ``active_sessions`` filters via ``_active_ids``; this property
314
+ exposes the full registry for callers (admin APIs, monitoring,
315
+ stats) that need a count or iterator over every session the
316
+ Hypervisor is still tracking. Returns a snapshot list so callers
317
+ can iterate without holding any internal reference.
318
+ """
319
+ return list(self._sessions.values())
320
+
321
+ @property
322
+ def session_count(self) -> int:
323
+ """Total number of managed sessions, including archived/terminating."""
324
+ return len(self._sessions)
325
+
309
326
  def _get_session(self, session_id: str) -> ManagedSession:
310
327
  managed = self._sessions.get(session_id)
311
328
  if not managed:
@@ -57,6 +57,11 @@ class VouchingEngine:
57
57
  self._vouches: dict[str, VouchRecord] = {}
58
58
  self.max_exposure = max_exposure or self.DEFAULT_MAX_EXPOSURE
59
59
 
60
+ @property
61
+ def vouch_count(self) -> int:
62
+ """Total number of sponsorship records (active + released)."""
63
+ return len(self._vouches)
64
+
60
65
  def vouch(
61
66
  self,
62
67
  voucher_did: str,
@@ -10,13 +10,21 @@ full replay debugging, post-mortem analysis, and real-time monitoring.
10
10
 
11
11
  from __future__ import annotations
12
12
 
13
+ import threading
13
14
  import uuid
15
+ from collections import deque
14
16
  from collections.abc import Callable
15
17
  from dataclasses import dataclass, field
16
18
  from datetime import UTC, datetime
17
19
  from enum import Enum
18
20
  from typing import Any
19
21
 
22
+ # Default cap for the in-memory event store. Hypervisor deployments run for
23
+ # weeks; an unbounded list eventually OOMs. The cap is configurable via the
24
+ # ``HypervisorEventBus(max_events=...)`` constructor; ``None`` opts back into
25
+ # unbounded growth for tests or analysis tooling that needs full history.
26
+ DEFAULT_MAX_EVENTS = 100_000
27
+
20
28
 
21
29
  class EventType(str, Enum):
22
30
  """Categorised hypervisor event types."""
@@ -119,34 +127,61 @@ class HypervisorEventBus:
119
127
  - Event count and statistics
120
128
  """
121
129
 
122
- def __init__(self) -> None:
123
- self._events: list[HypervisorEvent] = []
130
+ def __init__(self, max_events: int | None = DEFAULT_MAX_EVENTS) -> None:
131
+ """Create an event bus.
132
+
133
+ ``max_events`` caps the in-memory store. Each per-key index list
134
+ (by-type, by-session, by-agent) is independently capped to the
135
+ same value, so a single chatty session cannot starve the
136
+ history of other sessions. Pass ``None`` to disable the cap
137
+ (testing or full-replay tooling).
138
+ """
139
+ self._max_events = max_events
140
+ # `deque` with `maxlen` evicts the oldest entry on overflow in
141
+ # O(1), avoiding the OOM cliff of an unbounded `list`.
142
+ self._events: deque[HypervisorEvent] = deque(maxlen=max_events)
124
143
  self._subscribers: dict[EventType | None, list[EventHandler]] = {}
125
- self._by_type: dict[EventType, list[HypervisorEvent]] = {}
126
- self._by_session: dict[str, list[HypervisorEvent]] = {}
127
- self._by_agent: dict[str, list[HypervisorEvent]] = {}
144
+ self._by_type: dict[EventType, deque[HypervisorEvent]] = {}
145
+ self._by_session: dict[str, deque[HypervisorEvent]] = {}
146
+ self._by_agent: dict[str, deque[HypervisorEvent]] = {}
147
+ # Use an RLock so a subscriber that re-enters the bus (e.g.
148
+ # emits an event in response to another event) doesn't deadlock.
149
+ self._lock = threading.RLock()
150
+
151
+ def _new_index_deque(self) -> deque[HypervisorEvent]:
152
+ return deque(maxlen=self._max_events)
128
153
 
129
154
  def emit(self, event: HypervisorEvent) -> None:
130
155
  """Append an event and notify subscribers."""
131
- self._events.append(event)
132
-
133
- # Index by type
134
- self._by_type.setdefault(event.event_type, []).append(event)
135
-
136
- # Index by session
137
- if event.session_id:
138
- self._by_session.setdefault(event.session_id, []).append(event)
139
-
140
- # Index by agent
141
- if event.agent_did:
142
- self._by_agent.setdefault(event.agent_did, []).append(event)
143
-
144
- # Notify type-specific subscribers
145
- for handler in self._subscribers.get(event.event_type, []):
156
+ with self._lock:
157
+ self._events.append(event)
158
+
159
+ self._by_type.setdefault(
160
+ event.event_type, self._new_index_deque()
161
+ ).append(event)
162
+
163
+ if event.session_id:
164
+ self._by_session.setdefault(
165
+ event.session_id, self._new_index_deque()
166
+ ).append(event)
167
+
168
+ if event.agent_did:
169
+ self._by_agent.setdefault(
170
+ event.agent_did, self._new_index_deque()
171
+ ).append(event)
172
+
173
+ # Snapshot subscriber lists while holding the lock so a
174
+ # subscriber that mutates the registry mid-notify doesn't
175
+ # invalidate iteration.
176
+ type_subs = list(self._subscribers.get(event.event_type, ()))
177
+ wildcard_subs = list(self._subscribers.get(None, ()))
178
+
179
+ # Invoke handlers outside the lock so a slow subscriber can't
180
+ # serialize the entire bus or, worse, deadlock with a caller
181
+ # that also holds an external lock.
182
+ for handler in type_subs:
146
183
  handler(event)
147
-
148
- # Notify wildcard subscribers
149
- for handler in self._subscribers.get(None, []):
184
+ for handler in wildcard_subs:
150
185
  handler(event)
151
186
 
152
187
  def subscribe(
@@ -155,20 +190,25 @@ class HypervisorEventBus:
155
190
  handler: EventHandler | None = None,
156
191
  ) -> None:
157
192
  """Subscribe to events. Use event_type=None for all events."""
158
- if handler:
193
+ if not handler:
194
+ return
195
+ with self._lock:
159
196
  self._subscribers.setdefault(event_type, []).append(handler)
160
197
 
161
198
  def query_by_type(self, event_type: EventType) -> list[HypervisorEvent]:
162
199
  """Get all events of a specific type."""
163
- return list(self._by_type.get(event_type, []))
200
+ with self._lock:
201
+ return list(self._by_type.get(event_type, ()))
164
202
 
165
203
  def query_by_session(self, session_id: str) -> list[HypervisorEvent]:
166
204
  """Get all events for a specific session."""
167
- return list(self._by_session.get(session_id, []))
205
+ with self._lock:
206
+ return list(self._by_session.get(session_id, ()))
168
207
 
169
208
  def query_by_agent(self, agent_did: str) -> list[HypervisorEvent]:
170
209
  """Get all events involving a specific agent."""
171
- return list(self._by_agent.get(agent_did, []))
210
+ with self._lock:
211
+ return list(self._by_agent.get(agent_did, ()))
172
212
 
173
213
  def query_by_time_range(
174
214
  self,
@@ -178,7 +218,8 @@ class HypervisorEventBus:
178
218
  """Get events within a time range."""
179
219
  if end is None:
180
220
  end = datetime.now(UTC)
181
- return [e for e in self._events if start <= e.timestamp <= end]
221
+ with self._lock:
222
+ return [e for e in self._events if start <= e.timestamp <= end]
182
223
 
183
224
  def query(
184
225
  self,
@@ -188,7 +229,8 @@ class HypervisorEventBus:
188
229
  limit: int | None = None,
189
230
  ) -> list[HypervisorEvent]:
190
231
  """Flexible query with multiple filters."""
191
- results = self._events
232
+ with self._lock:
233
+ results: list[HypervisorEvent] = list(self._events)
192
234
 
193
235
  if event_type is not None:
194
236
  results = [e for e in results if e.event_type == event_type]
@@ -204,19 +246,35 @@ class HypervisorEventBus:
204
246
 
205
247
  @property
206
248
  def event_count(self) -> int:
207
- return len(self._events)
249
+ with self._lock:
250
+ return len(self._events)
208
251
 
209
252
  @property
210
253
  def all_events(self) -> list[HypervisorEvent]:
211
- return list(self._events)
254
+ with self._lock:
255
+ return list(self._events)
212
256
 
213
257
  def type_counts(self) -> dict[str, int]:
214
258
  """Return count of events per type."""
215
- return {t.value: len(evts) for t, evts in self._by_type.items()}
216
-
217
- def clear(self) -> None:
218
- """Clear all events (for testing)."""
219
- self._events.clear()
220
- self._by_type.clear()
221
- self._by_session.clear()
222
- self._by_agent.clear()
259
+ with self._lock:
260
+ return {t.value: len(evts) for t, evts in self._by_type.items()}
261
+
262
+ def _clear(self) -> None:
263
+ """Clear all events. **Test-only — do not call in production.**
264
+
265
+ The event bus is wired into the hypervisor as a long-lived,
266
+ process-singleton-shaped collaborator (see
267
+ ``hypervisor.api.server._event_bus``): production calls would
268
+ wipe the audit trail of every running session at once.
269
+
270
+ The leading underscore makes the test-only contract visible at
271
+ every call site. The method is kept on the class (rather than
272
+ moved to a test helper) because some tests construct a fresh
273
+ bus and then exercise the clear path itself; it just shouldn't
274
+ be reached from non-test code.
275
+ """
276
+ with self._lock:
277
+ self._events.clear()
278
+ self._by_type.clear()
279
+ self._by_session.clear()
280
+ self._by_agent.clear()
@@ -69,27 +69,34 @@ def get_liability_engine(**kwargs: Any):
69
69
  """Get the best available liability engine.
70
70
 
71
71
  Advanced: Shapley-value fault attribution with vouch cascades.
72
- Community: Basic vouching with linear slashing.
72
+ Community: ``LiabilityMatrix`` from ``hypervisor.liability``.
73
73
  """
74
74
  provider = _discover_provider(PROVIDER_GROUPS["liability"])
75
75
  if provider is not None:
76
76
  return provider(**kwargs)
77
77
 
78
- from hypervisor.liability.engine import LiabilityEngine
79
- return LiabilityEngine(**kwargs)
78
+ # Community fallback. The previous import targeted
79
+ # ``hypervisor.liability.engine.LiabilityEngine`` which does not
80
+ # exist in this tree; ``LiabilityMatrix`` is the real public-
81
+ # edition entry point.
82
+ from hypervisor.liability import LiabilityMatrix
83
+ return LiabilityMatrix(**kwargs)
80
84
 
81
85
 
82
86
  def get_saga_engine(**kwargs: Any):
83
87
  """Get the best available saga orchestration engine.
84
88
 
85
89
  Advanced: Multi-pattern saga with parallel fan-out and escalation.
86
- Community: Sequential saga with basic compensation.
90
+ Community: ``SagaOrchestrator`` from ``hypervisor.saga.orchestrator``.
87
91
  """
88
92
  provider = _discover_provider(PROVIDER_GROUPS["saga_engine"])
89
93
  if provider is not None:
90
94
  return provider(**kwargs)
91
95
 
92
- from hypervisor.saga.engine import SagaOrchestrator
96
+ # Community fallback. The previous import targeted
97
+ # ``hypervisor.saga.engine.SagaOrchestrator`` which does not exist
98
+ # in this tree; the real module is ``hypervisor.saga.orchestrator``.
99
+ from hypervisor.saga.orchestrator import SagaOrchestrator
93
100
  return SagaOrchestrator(**kwargs)
94
101
 
95
102
 
@@ -127,8 +127,24 @@ class RingBreachDetector:
127
127
  window.popleft()
128
128
 
129
129
  # --- 3. Compute actual rate (calls / second) ---
130
+ # Dividing by the full ``window_seconds`` underestimates the rate
131
+ # when the window has just begun — 10 calls in the first 2s of a
132
+ # 60s window would read as ~0.17/s instead of 5/s, missing real
133
+ # bursts. Use the shorter of (window, time_since_first_event)
134
+ # as the denominator so early bursts surface accurately. A single
135
+ # call has no rate to measure (need ≥2 samples for an interval),
136
+ # so fall back to the conservative full-window divisor.
130
137
  call_count = len(window)
131
- actual_rate = call_count / self.window_seconds if self.window_seconds > 0 else 0.0
138
+ if self.window_seconds <= 0 or call_count == 0:
139
+ actual_rate = 0.0
140
+ elif call_count < 2:
141
+ actual_rate = call_count / self.window_seconds
142
+ else:
143
+ time_since_first = max(now - window[0], 0.0)
144
+ # Floor at 1ms: prevents divide-by-zero for ultra-tight
145
+ # bursts while still surfacing them with a high rate.
146
+ denominator = max(min(self.window_seconds, time_since_first), 1e-3)
147
+ actual_rate = call_count / denominator
132
148
 
133
149
  # --- 4. Ring-distance amplifier ---
134
150
  # Upward calls (low value = higher privilege) are escalations.
@@ -90,6 +90,18 @@ class SagaOrchestrator:
90
90
  """
91
91
  Execute a single saga step with timeout and retry support.
92
92
 
93
+ Cancellation semantics on timeout:
94
+ ``asyncio.wait_for`` cancels the wrapped coroutine on timeout
95
+ *and* awaits the cancellation before raising ``TimeoutError``,
96
+ so a cooperative executor (one with ``await`` points)
97
+ receives ``CancelledError`` and has a chance to release
98
+ resources before this method moves on to FAILED. An executor
99
+ with no ``await`` points (synchronous CPU work inside an
100
+ ``async def``) is not cancellable by Python — the timeout
101
+ will only fire once the executor yields control.
102
+ Callers needing hard-kill semantics must run such executors
103
+ in a process or thread pool and arrange external termination.
104
+
93
105
  Args:
94
106
  saga_id: Saga identifier
95
107
  step_id: Step identifier
@@ -127,9 +139,11 @@ class SagaOrchestrator:
127
139
  step.error = str(last_error)
128
140
  step.transition(StepState.FAILED)
129
141
  if attempt < attempts - 1:
130
- # Reset to PENDING for retry
131
- step.state = StepState.PENDING
132
- step.error = None
142
+ # Move FAILED PENDING through the state table,
143
+ # not by direct mutation. Bypassing transition()
144
+ # would skip the validity check and the timestamp
145
+ # bookkeeping.
146
+ step.reset_for_retry()
133
147
  await asyncio.sleep(
134
148
  self.DEFAULT_RETRY_DELAY_SECONDS * (attempt + 1)
135
149
  )
@@ -138,8 +152,7 @@ class SagaOrchestrator:
138
152
  step.error = str(e)
139
153
  step.transition(StepState.FAILED)
140
154
  if attempt < attempts - 1:
141
- step.state = StepState.PENDING
142
- step.error = None
155
+ step.reset_for_retry()
143
156
  await asyncio.sleep(
144
157
  self.DEFAULT_RETRY_DELAY_SECONDS * (attempt + 1)
145
158
  )
@@ -157,6 +170,9 @@ class SagaOrchestrator:
157
170
  """
158
171
  Run compensation (rollback) for all committed steps in reverse order.
159
172
 
173
+ Cancellation semantics on timeout match ``execute_step``: see that
174
+ method's docstring for the cooperative-cancel contract.
175
+
160
176
  Args:
161
177
  saga_id: Saga identifier
162
178
  compensator: Async callable that takes a SagaStep and calls its Undo_API
@@ -45,7 +45,11 @@ STEP_TRANSITIONS: dict[StepState, set[StepState]] = {
45
45
  StepState.COMPENSATING: {StepState.COMPENSATED, StepState.COMPENSATION_FAILED},
46
46
  StepState.COMPENSATED: set(),
47
47
  StepState.COMPENSATION_FAILED: set(),
48
- StepState.FAILED: set(),
48
+ # FAILED → PENDING is allowed only via reset_for_retry, which is
49
+ # the documented retry path. The transition table mirrors that
50
+ # explicitly so reset_for_retry can call transition() instead of
51
+ # mutating state directly and bypassing the table.
52
+ StepState.FAILED: {StepState.PENDING},
49
53
  }
50
54
 
51
55
  SAGA_TRANSITIONS: dict[SagaState, set[SagaState]] = {
@@ -96,6 +100,22 @@ class SagaStep:
96
100
  ):
97
101
  self.completed_at = now
98
102
 
103
+ def reset_for_retry(self) -> None:
104
+ """Move a FAILED step back to PENDING for another execution attempt.
105
+
106
+ Goes through ``transition()`` rather than mutating ``state``
107
+ directly so the move is validated against the state-transition table. Also
108
+ clears the per-attempt error and completion timestamp so the
109
+ next attempt starts from a clean slate.
110
+
111
+ Raises:
112
+ SagaStateError: If the step is not currently in FAILED
113
+ state. Retries are only valid from FAILED.
114
+ """
115
+ self.transition(StepState.PENDING)
116
+ self.error = None
117
+ self.completed_at = None
118
+
99
119
 
100
120
  @dataclass
101
121
  class Saga:
@@ -10,6 +10,7 @@ in-flight saga steps to a substitute agent when one is available.
10
10
  from __future__ import annotations
11
11
 
12
12
  import logging
13
+ import threading
13
14
  import uuid
14
15
  from collections.abc import Callable
15
16
  from dataclasses import dataclass, field
@@ -18,6 +19,11 @@ from enum import Enum
18
19
 
19
20
  _logger = logging.getLogger(__name__)
20
21
 
22
+ # Maximum wall time we wait for an agent's termination callback to complete
23
+ # before declaring it hung. The kill switch must remain responsive — a slow
24
+ # callback should not block the entire kill flow.
25
+ DEFAULT_CALLBACK_TIMEOUT_SECONDS = 5.0
26
+
21
27
 
22
28
  class KillReason(str, Enum):
23
29
  """Why an agent was killed."""
@@ -76,10 +82,16 @@ class KillSwitch:
76
82
  callback to stop the agent process.
77
83
  """
78
84
 
79
- def __init__(self) -> None:
85
+ def __init__(
86
+ self, callback_timeout: float = DEFAULT_CALLBACK_TIMEOUT_SECONDS
87
+ ) -> None:
80
88
  self._kill_history: list[KillResult] = []
81
89
  self._substitutes: dict[str, list[str]] = {}
82
90
  self._agents: dict[str, Callable[[], None]] = {}
91
+ self._callback_timeout = callback_timeout
92
+ # RLock so a callback that itself re-enters the kill switch
93
+ # (e.g. unregisters another agent) does not deadlock.
94
+ self._lock = threading.RLock()
83
95
 
84
96
  # ── Agent process registry ─────────────────────────────────────
85
97
 
@@ -87,11 +99,13 @@ class KillSwitch:
87
99
  self, agent_did: str, process_handle: Callable[[], None]
88
100
  ) -> None:
89
101
  """Register an agent with its termination callback."""
90
- self._agents[agent_did] = process_handle
102
+ with self._lock:
103
+ self._agents[agent_did] = process_handle
91
104
 
92
105
  def unregister_agent(self, agent_did: str) -> None:
93
106
  """Remove an agent from the process registry."""
94
- self._agents.pop(agent_did, None)
107
+ with self._lock:
108
+ self._agents.pop(agent_did, None)
95
109
 
96
110
  # ── Substitute management ──────────────────────────────────────
97
111
 
@@ -99,14 +113,16 @@ class KillSwitch:
99
113
  self, session_id: str, agent_did: str
100
114
  ) -> None:
101
115
  """Register a substitute agent for a session."""
102
- self._substitutes.setdefault(session_id, []).append(agent_did)
116
+ with self._lock:
117
+ self._substitutes.setdefault(session_id, []).append(agent_did)
103
118
 
104
119
  def unregister_substitute(
105
120
  self, session_id: str, agent_did: str
106
121
  ) -> None:
107
- subs = self._substitutes.get(session_id, [])
108
- if agent_did in subs:
109
- subs.remove(agent_did)
122
+ with self._lock:
123
+ subs = self._substitutes.get(session_id, [])
124
+ if agent_did in subs:
125
+ subs.remove(agent_did)
110
126
 
111
127
  # ── Kill ───────────────────────────────────────────────────────
112
128
 
@@ -118,11 +134,25 @@ class KillSwitch:
118
134
  in_flight_steps: list[dict] | None = None,
119
135
  details: str = "",
120
136
  ) -> KillResult:
121
- """Kill an agent, handing off in-flight steps to a substitute if available."""
137
+ """Kill an agent, handing off in-flight steps to a substitute if available.
138
+
139
+ Registration invariant: the agent is unregistered from the
140
+ process registry **unconditionally** at the end of this method,
141
+ regardless of whether the termination callback succeeded
142
+ (``terminated=True``) or failed/timed out (``terminated=False``).
143
+ This is intentional. The kill *intent* is durably recorded in
144
+ ``_kill_history`` and surfaced via the returned ``KillResult``;
145
+ leaving the callback registered would falsely advertise the
146
+ agent as live and re-callable when its process state is
147
+ actually unknown. Callers who detect ``terminated=False`` and
148
+ want to retry must re-register the agent (presumably with a
149
+ new, working callback) before issuing the second ``kill()``.
150
+ """
122
151
  in_flight = in_flight_steps or []
123
152
 
124
- # Attempt to find a substitute for handoff
125
- substitute = self._find_substitute(session_id, agent_did)
153
+ with self._lock:
154
+ substitute = self._find_substitute(session_id, agent_did)
155
+ callback = self._agents.get(agent_did)
126
156
 
127
157
  handoffs: list[StepHandoff] = []
128
158
  handoff_success_count = 0
@@ -148,12 +178,14 @@ class KillSwitch:
148
178
  )
149
179
  )
150
180
 
151
- # Terminate the agent process
181
+ # Invoke the termination callback *outside* the lock and with a
182
+ # wall-clock timeout. A slow or hung callback must not freeze the
183
+ # kill flow — the whole point of a kill switch is responsiveness.
152
184
  terminated = False
153
- callback = self._agents.get(agent_did)
154
185
  if callback is not None:
155
- callback()
156
- terminated = True
186
+ terminated = self._invoke_callback_with_timeout(
187
+ agent_did, callback
188
+ )
157
189
  else:
158
190
  _logger.warning(
159
191
  "No termination callback registered for agent %s",
@@ -172,11 +204,53 @@ class KillSwitch:
172
204
  terminated=terminated,
173
205
  details=details,
174
206
  )
175
- self._kill_history.append(result)
207
+ with self._lock:
208
+ self._kill_history.append(result)
176
209
  self.unregister_substitute(session_id, agent_did)
177
210
  self.unregister_agent(agent_did)
178
211
  return result
179
212
 
213
def _invoke_callback_with_timeout(
    self, agent_did: str, callback: Callable[[], None]
) -> bool:
    """Run *callback* in a daemon thread bounded by ``callback_timeout``.

    A hung callback is left running on its daemon thread; this method
    returns anyway so the kill switch stays usable for the next kill.

    Args:
        agent_did: Identifier of the agent being terminated. Used for
            the worker thread name and in log messages.
        callback: Zero-argument termination callback to invoke.

    Returns:
        ``True`` if the callback returned without raising before the
        timeout elapsed. ``False`` if it timed out, raised, or the
        worker thread could not be started.
    """
    error_box: list[BaseException] = []

    def _runner() -> None:
        try:
            callback()
        except BaseException as exc:  # noqa: BLE001 — surface but don't propagate
            error_box.append(exc)

    thread = threading.Thread(
        target=_runner, name=f"kill-callback:{agent_did}", daemon=True
    )
    try:
        thread.start()
    except RuntimeError as exc:
        # The interpreter could not start a new thread. Report a failed
        # termination instead of letting the error escape to the caller,
        # which would otherwise never get a result to record.
        _logger.error(
            "Could not start termination callback thread for %s: %s",
            agent_did,
            exc,
        )
        return False
    thread.join(timeout=self._callback_timeout)

    if thread.is_alive():
        _logger.error(
            "Termination callback for %s exceeded %.2fs; leaving daemon thread to drain",
            agent_did,
            self._callback_timeout,
        )
        return False
    if error_box:
        failure = error_box[0]
        # exc_info attaches the callback's traceback to the log record;
        # the message text itself is unchanged.
        _logger.error(
            "Termination callback for %s raised %s: %s",
            agent_did,
            type(failure).__name__,
            failure,
            exc_info=failure,
        )
        return False
    return True
253
+
180
254
  def _find_substitute(
181
255
  self, session_id: str, exclude_did: str
182
256
  ) -> str | None: