PyPI - agent_hypervisor - Versions diffs - 3.4.0__tar.gz → 3.6.0__tar.gz - Mend

agent_hypervisor 3.4.0.tar.gz → 3.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (118) hide show

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/.gitignore RENAMED Viewed

@@ -465,3 +465,4 @@ _site/
 # Code Security Assessment artifacts
 .security-assessment/
+*.tgz

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agent_hypervisor
-Version: 3.4.0
+Version: 3.6.0
 Summary: Public Preview — Agent Hypervisor: Runtime supervisor for multi-agent Shared Sessions with Execution Rings, Joint Liability, Saga Orchestration, and hash-chained audit trails
 Project-URL: Homepage, https://github.com/microsoft/agent-governance-toolkit
 Project-URL: Repository, https://github.com/microsoft/agent-governance-toolkit
@@ -35,7 +35,7 @@ Requires-Dist: web3<8.0,>=6.0.0; extra == 'blockchain'
 Provides-Extra: dev
 Requires-Dist: hypothesis<7.0,>=6.0.0; extra == 'dev'
 Requires-Dist: jsonschema<5.0,>=4.0.0; extra == 'dev'
-Requires-Dist: mypy<2.0,>=1.8.0; extra == 'dev'
+Requires-Dist: mypy<3.0,>=1.8.0; extra == 'dev'
 Requires-Dist: pytest-asyncio<2.0,>=0.23.0; extra == 'dev'
 Requires-Dist: pytest-cov<8.0,>=4.0.0; extra == 'dev'
 Requires-Dist: pytest<10.0,>=8.0.0; extra == 'dev'

agent_hypervisor-3.6.0/examples/requirements.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ agent-hypervisor>=3.5.0

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/notebooks/README.md RENAMED Viewed

@@ -11,7 +11,8 @@ Interactive Jupyter notebooks for exploring the **agent-hypervisor** runtime.
 ## Quick Start
 ```bash
-# From the repository root
+# From the agent-hypervisor package root
+cd agent-governance-python/agent-hypervisor
 pip install -e ".[dev]" plotly nest-asyncio
 jupyter notebook notebooks/
 ```

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "agent_hypervisor"
-version = "3.4.0"
+version = "3.6.0"
 description = "Public Preview — Agent Hypervisor: Runtime supervisor for multi-agent Shared Sessions with Execution Rings, Joint Liability, Saga Orchestration, and hash-chained audit trails"
 readme = "README.md"
 license = {text = "MIT"}
@@ -58,7 +58,7 @@ dev = [
     "pytest-cov>=4.0.0,<8.0",
     "hypothesis>=6.0.0,<7.0",
     "ruff>=0.4.0,<1.0",
-    "mypy>=1.8.0,<2.0",
+    "mypy>=1.8.0,<3.0",
     "jsonschema>=4.0.0,<5.0",
 ]
 blockchain = [

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/api/server.py RENAMED Viewed

@@ -180,19 +180,16 @@ async def get_stats() -> StatsResponse:
     """Get overall hypervisor statistics."""
     hv = _hv()
     bus = _bus()
-    total_participants = sum(
-        m.sso.participant_count for m in hv._sessions.values()
-    )
-    active_sagas = sum(
-        len(m.saga.active_sagas) for m in hv._sessions.values()
-    )
+    sessions = hv.sessions
+    total_participants = sum(m.sso.participant_count for m in sessions)
+    active_sagas = sum(len(m.saga.active_sagas) for m in sessions)
     return StatsResponse(
         version=__version__,
-        total_sessions=len(hv._sessions),
+        total_sessions=hv.session_count,
         active_sessions=len(hv.active_sessions),
         total_participants=total_participants,
         active_sagas=active_sagas,
-        total_vouches=len(hv.vouching._vouches),
+        total_vouches=hv.vouching.vouch_count,
         event_count=bus.event_count,
     )

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/core.py RENAMED Viewed

@@ -306,6 +306,23 @@ class Hypervisor:
         return [self._sessions[sid] for sid in self._active_ids
                 if sid in self._sessions]
+    @property
+    def sessions(self) -> list[ManagedSession]:
+        """All managed sessions, including archived/terminating ones.
+        ``active_sessions`` filters via ``_active_ids``; this property
+        exposes the full registry for callers (admin APIs, monitoring,
+        stats) that need a count or iterator over every session the
+        Hypervisor is still tracking. Returns a snapshot list so callers
+        can iterate without holding any internal reference.
+        """
+        return list(self._sessions.values())
+    @property
+    def session_count(self) -> int:
+        """Total number of managed sessions, including archived/terminating."""
+        return len(self._sessions)
     def _get_session(self, session_id: str) -> ManagedSession:
         managed = self._sessions.get(session_id)
         if not managed:

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/liability/vouching.py RENAMED Viewed

@@ -57,6 +57,11 @@ class VouchingEngine:
         self._vouches: dict[str, VouchRecord] = {}
         self.max_exposure = max_exposure or self.DEFAULT_MAX_EXPOSURE
+    @property
+    def vouch_count(self) -> int:
+        """Total number of sponsorship records (active + released)."""
+        return len(self._vouches)
     def vouch(
         self,
         voucher_did: str,

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/observability/event_bus.py RENAMED Viewed

@@ -10,13 +10,21 @@ full replay debugging, post-mortem analysis, and real-time monitoring.
 from __future__ import annotations
+import threading
 import uuid
+from collections import deque
 from collections.abc import Callable
 from dataclasses import dataclass, field
 from datetime import UTC, datetime
 from enum import Enum
 from typing import Any
+# Default cap for the in-memory event store. Hypervisor deployments run for
+# weeks; an unbounded list eventually OOMs. The cap is configurable via the
+# ``HypervisorEventBus(max_events=...)`` constructor; ``None`` opts back into
+# unbounded growth for tests or analysis tooling that needs full history.
+DEFAULT_MAX_EVENTS = 100_000
 class EventType(str, Enum):
     """Categorised hypervisor event types."""
@@ -119,34 +127,61 @@ class HypervisorEventBus:
     - Event count and statistics
     """
-    def __init__(self) -> None:
-        self._events: list[HypervisorEvent] = []
+    def __init__(self, max_events: int | None = DEFAULT_MAX_EVENTS) -> None:
+        """Create an event bus.
+        ``max_events`` caps the in-memory store. Each per-key index list
+        (by-type, by-session, by-agent) is independently capped to the
+        same value, so a single chatty session cannot starve the
+        history of other sessions. Pass ``None`` to disable the cap
+        (testing or full-replay tooling).
+        """
+        self._max_events = max_events
+        # `deque` with `maxlen` evicts the oldest entry on overflow in
+        # O(1), avoiding the OOM cliff of an unbounded `list`.
+        self._events: deque[HypervisorEvent] = deque(maxlen=max_events)
         self._subscribers: dict[EventType | None, list[EventHandler]] = {}
-        self._by_type: dict[EventType, list[HypervisorEvent]] = {}
-        self._by_session: dict[str, list[HypervisorEvent]] = {}
-        self._by_agent: dict[str, list[HypervisorEvent]] = {}
+        self._by_type: dict[EventType, deque[HypervisorEvent]] = {}
+        self._by_session: dict[str, deque[HypervisorEvent]] = {}
+        self._by_agent: dict[str, deque[HypervisorEvent]] = {}
+        # Use an RLock so a subscriber that re-enters the bus (e.g.
+        # emits an event in response to another event) doesn't deadlock.
+        self._lock = threading.RLock()
+    def _new_index_deque(self) -> deque[HypervisorEvent]:
+        return deque(maxlen=self._max_events)
     def emit(self, event: HypervisorEvent) -> None:
         """Append an event and notify subscribers."""
-        self._events.append(event)
-        # Index by type
-        self._by_type.setdefault(event.event_type, []).append(event)
-        # Index by session
-        if event.session_id:
-            self._by_session.setdefault(event.session_id, []).append(event)
-        # Index by agent
-        if event.agent_did:
-            self._by_agent.setdefault(event.agent_did, []).append(event)
-        # Notify type-specific subscribers
-        for handler in self._subscribers.get(event.event_type, []):
+        with self._lock:
+            self._events.append(event)
+            self._by_type.setdefault(
+                event.event_type, self._new_index_deque()
+            ).append(event)
+            if event.session_id:
+                self._by_session.setdefault(
+                    event.session_id, self._new_index_deque()
+                ).append(event)
+            if event.agent_did:
+                self._by_agent.setdefault(
+                    event.agent_did, self._new_index_deque()
+                ).append(event)
+            # Snapshot subscriber lists while holding the lock so a
+            # subscriber that mutates the registry mid-notify doesn't
+            # invalidate iteration.
+            type_subs = list(self._subscribers.get(event.event_type, ()))
+            wildcard_subs = list(self._subscribers.get(None, ()))
+        # Invoke handlers outside the lock so a slow subscriber can't
+        # serialize the entire bus or, worse, deadlock with a caller
+        # that also holds an external lock.
+        for handler in type_subs:
             handler(event)
-        # Notify wildcard subscribers
-        for handler in self._subscribers.get(None, []):
+        for handler in wildcard_subs:
             handler(event)
     def subscribe(
@@ -155,20 +190,25 @@ class HypervisorEventBus:
         handler: EventHandler | None = None,
     ) -> None:
         """Subscribe to events. Use event_type=None for all events."""
-        if handler:
+        if not handler:
+            return
+        with self._lock:
             self._subscribers.setdefault(event_type, []).append(handler)
     def query_by_type(self, event_type: EventType) -> list[HypervisorEvent]:
         """Get all events of a specific type."""
-        return list(self._by_type.get(event_type, []))
+        with self._lock:
+            return list(self._by_type.get(event_type, ()))
     def query_by_session(self, session_id: str) -> list[HypervisorEvent]:
         """Get all events for a specific session."""
-        return list(self._by_session.get(session_id, []))
+        with self._lock:
+            return list(self._by_session.get(session_id, ()))
     def query_by_agent(self, agent_did: str) -> list[HypervisorEvent]:
         """Get all events involving a specific agent."""
-        return list(self._by_agent.get(agent_did, []))
+        with self._lock:
+            return list(self._by_agent.get(agent_did, ()))
     def query_by_time_range(
         self,
@@ -178,7 +218,8 @@ class HypervisorEventBus:
         """Get events within a time range."""
         if end is None:
             end = datetime.now(UTC)
-        return [e for e in self._events if start <= e.timestamp <= end]
+        with self._lock:
+            return [e for e in self._events if start <= e.timestamp <= end]
     def query(
         self,
@@ -188,7 +229,8 @@ class HypervisorEventBus:
         limit: int | None = None,
     ) -> list[HypervisorEvent]:
         """Flexible query with multiple filters."""
-        results = self._events
+        with self._lock:
+            results: list[HypervisorEvent] = list(self._events)
         if event_type is not None:
             results = [e for e in results if e.event_type == event_type]
@@ -204,19 +246,35 @@ class HypervisorEventBus:
     @property
     def event_count(self) -> int:
-        return len(self._events)
+        with self._lock:
+            return len(self._events)
     @property
     def all_events(self) -> list[HypervisorEvent]:
-        return list(self._events)
+        with self._lock:
+            return list(self._events)
     def type_counts(self) -> dict[str, int]:
         """Return count of events per type."""
-        return {t.value: len(evts) for t, evts in self._by_type.items()}
-    def clear(self) -> None:
-        """Clear all events (for testing)."""
-        self._events.clear()
-        self._by_type.clear()
-        self._by_session.clear()
-        self._by_agent.clear()
+        with self._lock:
+            return {t.value: len(evts) for t, evts in self._by_type.items()}
+    def _clear(self) -> None:
+        """Clear all events. **Test-only — do not call in production.**
+        The event bus is wired into the hypervisor as a long-lived,
+        process-singleton-shaped collaborator (see
+        ``hypervisor.api.server._event_bus``): production calls would
+        wipe the audit trail of every running session at once.
+        The leading underscore makes the test-only contract visible at
+        every call site. The method is kept on the class (rather than
+        moved to a test helper) because some tests construct a fresh
+        bus and then exercise the clear path itself; it just shouldn't
+        be reached from non-test code.
+        """
+        with self._lock:
+            self._events.clear()
+            self._by_type.clear()
+            self._by_session.clear()
+            self._by_agent.clear()

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/providers.py RENAMED Viewed

@@ -69,27 +69,34 @@ def get_liability_engine(**kwargs: Any):
     """Get the best available liability engine.
     Advanced: Shapley-value fault attribution with vouch cascades.
-    Community: Basic vouching with linear slashing.
+    Community: ``LiabilityMatrix`` from ``hypervisor.liability``.
     """
     provider = _discover_provider(PROVIDER_GROUPS["liability"])
     if provider is not None:
         return provider(**kwargs)
-    from hypervisor.liability.engine import LiabilityEngine
-    return LiabilityEngine(**kwargs)
+    # Community fallback. The previous import targeted
+    # ``hypervisor.liability.engine.LiabilityEngine`` which does not
+    # exist in this tree; ``LiabilityMatrix`` is the real public-
+    # edition entry point.
+    from hypervisor.liability import LiabilityMatrix
+    return LiabilityMatrix(**kwargs)
 def get_saga_engine(**kwargs: Any):
     """Get the best available saga orchestration engine.
     Advanced: Multi-pattern saga with parallel fan-out and escalation.
-    Community: Sequential saga with basic compensation.
+    Community: ``SagaOrchestrator`` from ``hypervisor.saga.orchestrator``.
     """
     provider = _discover_provider(PROVIDER_GROUPS["saga_engine"])
     if provider is not None:
         return provider(**kwargs)
-    from hypervisor.saga.engine import SagaOrchestrator
+    # Community fallback. The previous import targeted
+    # ``hypervisor.saga.engine.SagaOrchestrator`` which does not exist
+    # in this tree; the real module is ``hypervisor.saga.orchestrator``.
+    from hypervisor.saga.orchestrator import SagaOrchestrator
     return SagaOrchestrator(**kwargs)

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/rings/breach_detector.py RENAMED Viewed

@@ -127,8 +127,24 @@ class RingBreachDetector:
             window.popleft()
         # --- 3. Compute actual rate (calls / second) ---
+        # Dividing by the full ``window_seconds`` underestimates the rate
+        # when the window has just begun — 10 calls in the first 2s of a
+        # 60s window would read as ~0.17/s instead of 5/s, missing real
+        # bursts. Use the shorter of (window, time_since_first_event)
+        # as the denominator so early bursts surface accurately. A single
+        # call has no rate to measure (need ≥2 samples for an interval),
+        # so fall back to the conservative full-window divisor.
         call_count = len(window)
-        actual_rate = call_count / self.window_seconds if self.window_seconds > 0 else 0.0
+        if self.window_seconds <= 0 or call_count == 0:
+            actual_rate = 0.0
+        elif call_count < 2:
+            actual_rate = call_count / self.window_seconds
+        else:
+            time_since_first = max(now - window[0], 0.0)
+            # Floor at 1ms: prevents divide-by-zero for ultra-tight
+            # bursts while still surfacing them with a high rate.
+            denominator = max(min(self.window_seconds, time_since_first), 1e-3)
+            actual_rate = call_count / denominator
         # --- 4. Ring-distance amplifier ---
         #   Upward calls (low value = higher privilege) are escalations.

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/orchestrator.py RENAMED Viewed

@@ -90,6 +90,18 @@ class SagaOrchestrator:
         """
         Execute a single saga step with timeout and retry support.
+        Cancellation semantics on timeout:
+            ``asyncio.wait_for`` cancels the wrapped coroutine on timeout
+            *and* awaits the cancellation before raising ``TimeoutError``,
+            so a cooperative executor (one with ``await`` points)
+            receives ``CancelledError`` and has a chance to release
+            resources before this method moves on to FAILED. An executor
+            with no ``await`` points (synchronous CPU work inside an
+            ``async def``) is not cancellable by Python — the timeout
+            can only take effect if and when the executor yields control.
+            Callers needing hard-kill semantics must run such executors
+            in a process or thread pool and arrange external termination.
         Args:
             saga_id: Saga identifier
             step_id: Step identifier
@@ -127,9 +139,11 @@ class SagaOrchestrator:
                 step.error = str(last_error)
                 step.transition(StepState.FAILED)
                 if attempt < attempts - 1:
-                    # Reset to PENDING for retry
-                    step.state = StepState.PENDING
-                    step.error = None
+                    # Move FAILED → PENDING through the state table,
+                    # not by direct mutation. Bypassing transition()
+                    # would skip the validity check and the timestamp
+                    # bookkeeping.
+                    step.reset_for_retry()
                     await asyncio.sleep(
                         self.DEFAULT_RETRY_DELAY_SECONDS * (attempt + 1)
                     )
@@ -138,8 +152,7 @@ class SagaOrchestrator:
                 step.error = str(e)
                 step.transition(StepState.FAILED)
                 if attempt < attempts - 1:
-                    step.state = StepState.PENDING
-                    step.error = None
+                    step.reset_for_retry()
                     await asyncio.sleep(
                         self.DEFAULT_RETRY_DELAY_SECONDS * (attempt + 1)
                     )
@@ -157,6 +170,9 @@ class SagaOrchestrator:
         """
         Run compensation (rollback) for all committed steps in reverse order.
+        Cancellation semantics on timeout match ``execute_step``: see that
+        method's docstring for the cooperative-cancel contract.
         Args:
             saga_id: Saga identifier
             compensator: Async callable that takes a SagaStep and calls its Undo_API

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/saga/state_machine.py RENAMED Viewed

@@ -45,7 +45,11 @@ STEP_TRANSITIONS: dict[StepState, set[StepState]] = {
     StepState.COMPENSATING: {StepState.COMPENSATED, StepState.COMPENSATION_FAILED},
     StepState.COMPENSATED: set(),
     StepState.COMPENSATION_FAILED: set(),
-    StepState.FAILED: set(),
+    # FAILED → PENDING is allowed only via reset_for_retry, which is
+    # the documented retry path. The transition table mirrors that
+    # explicitly so reset_for_retry can call transition() instead of
+    # mutating state directly and bypassing the table.
+    StepState.FAILED: {StepState.PENDING},
 }
 SAGA_TRANSITIONS: dict[SagaState, set[SagaState]] = {
@@ -96,6 +100,22 @@ class SagaStep:
         ):
             self.completed_at = now
+    def reset_for_retry(self) -> None:
+        """Move a FAILED step back to PENDING for another execution attempt.
+        Goes through ``transition()`` rather than mutating ``state``
+        directly so the move is validated against ``STEP_TRANSITIONS``. Also
+        clears the per-attempt error and completion timestamp so the
+        next attempt starts from a clean slate.
+        Raises:
+            SagaStateError: If the step is not currently in FAILED
+                state. Retries are only valid from FAILED.
+        """
+        self.transition(StepState.PENDING)
+        self.error = None
+        self.completed_at = None
 @dataclass
 class Saga:

{agent_hypervisor-3.4.0 → agent_hypervisor-3.6.0}/src/hypervisor/security/kill_switch.py RENAMED Viewed

@@ -10,6 +10,7 @@ in-flight saga steps to a substitute agent when one is available.
 from __future__ import annotations
 import logging
+import threading
 import uuid
 from collections.abc import Callable
 from dataclasses import dataclass, field
@@ -18,6 +19,11 @@ from enum import Enum
 _logger = logging.getLogger(__name__)
+# Maximum wall time we wait for an agent's termination callback to complete
+# before declaring it hung. The kill switch must remain responsive — a slow
+# callback should not block the entire kill flow.
+DEFAULT_CALLBACK_TIMEOUT_SECONDS = 5.0
 class KillReason(str, Enum):
     """Why an agent was killed."""
@@ -76,10 +82,16 @@ class KillSwitch:
     callback to stop the agent process.
     """
-    def __init__(self) -> None:
+    def __init__(
+        self, callback_timeout: float = DEFAULT_CALLBACK_TIMEOUT_SECONDS
+    ) -> None:
         self._kill_history: list[KillResult] = []
         self._substitutes: dict[str, list[str]] = {}
         self._agents: dict[str, Callable[[], None]] = {}
+        self._callback_timeout = callback_timeout
+        # RLock so a callback that itself re-enters the kill switch
+        # (e.g. unregisters another agent) does not deadlock.
+        self._lock = threading.RLock()
     # ── Agent process registry ─────────────────────────────────────
@@ -87,11 +99,13 @@ class KillSwitch:
         self, agent_did: str, process_handle: Callable[[], None]
     ) -> None:
         """Register an agent with its termination callback."""
-        self._agents[agent_did] = process_handle
+        with self._lock:
+            self._agents[agent_did] = process_handle
     def unregister_agent(self, agent_did: str) -> None:
         """Remove an agent from the process registry."""
-        self._agents.pop(agent_did, None)
+        with self._lock:
+            self._agents.pop(agent_did, None)
     # ── Substitute management ──────────────────────────────────────
@@ -99,14 +113,16 @@ class KillSwitch:
         self, session_id: str, agent_did: str
     ) -> None:
         """Register a substitute agent for a session."""
-        self._substitutes.setdefault(session_id, []).append(agent_did)
+        with self._lock:
+            self._substitutes.setdefault(session_id, []).append(agent_did)
     def unregister_substitute(
         self, session_id: str, agent_did: str
     ) -> None:
-        subs = self._substitutes.get(session_id, [])
-        if agent_did in subs:
-            subs.remove(agent_did)
+        with self._lock:
+            subs = self._substitutes.get(session_id, [])
+            if agent_did in subs:
+                subs.remove(agent_did)
     # ── Kill ───────────────────────────────────────────────────────
@@ -118,11 +134,25 @@ class KillSwitch:
         in_flight_steps: list[dict] | None = None,
         details: str = "",
     ) -> KillResult:
-        """Kill an agent, handing off in-flight steps to a substitute if available."""
+        """Kill an agent, handing off in-flight steps to a substitute if available.
+        Registration invariant: the agent is unregistered from the
+        process registry **unconditionally** at the end of this method,
+        regardless of whether the termination callback succeeded
+        (``terminated=True``) or failed/timed out (``terminated=False``).
+        This is intentional. The kill *intent* is durably recorded in
+        ``_kill_history`` and surfaced via the returned ``KillResult``;
+        leaving the callback registered would falsely advertise the
+        agent as live and re-callable when its process state is
+        actually unknown. Callers who detect ``terminated=False`` and
+        want to retry must re-register the agent (presumably with a
+        new, working callback) before issuing the second ``kill()``.
+        """
         in_flight = in_flight_steps or []
-        # Attempt to find a substitute for handoff
-        substitute = self._find_substitute(session_id, agent_did)
+        with self._lock:
+            substitute = self._find_substitute(session_id, agent_did)
+            callback = self._agents.get(agent_did)
         handoffs: list[StepHandoff] = []
         handoff_success_count = 0
@@ -148,12 +178,14 @@ class KillSwitch:
                     )
                 )
-        # Terminate the agent process
+        # Invoke the termination callback *outside* the lock and with a
+        # wall-clock timeout. A slow or hung callback must not freeze the
+        # kill flow — the whole point of a kill switch is responsiveness.
         terminated = False
-        callback = self._agents.get(agent_did)
         if callback is not None:
-            callback()
-            terminated = True
+            terminated = self._invoke_callback_with_timeout(
+                agent_did, callback
+            )
         else:
             _logger.warning(
                 "No termination callback registered for agent %s",
@@ -172,11 +204,53 @@ class KillSwitch:
             terminated=terminated,
             details=details,
         )
-        self._kill_history.append(result)
+        with self._lock:
+            self._kill_history.append(result)
         self.unregister_substitute(session_id, agent_did)
         self.unregister_agent(agent_did)
         return result
+    def _invoke_callback_with_timeout(
+        self, agent_did: str, callback: Callable[[], None]
+    ) -> bool:
+        """Run *callback* in a daemon thread bounded by ``callback_timeout``.
+        Returns ``True`` if the callback completed cleanly within the
+        timeout, ``False`` if it timed out or raised. A hung callback
+        is left to its fate (daemon thread); the kill switch returns
+        and remains usable for the next kill.
+        """
+        error_box: list[BaseException] = []
+        def _runner() -> None:
+            try:
+                callback()
+            except BaseException as exc:  # noqa: BLE001 — surface but don't propagate
+                error_box.append(exc)
+        thread = threading.Thread(
+            target=_runner, name=f"kill-callback:{agent_did}", daemon=True
+        )
+        thread.start()
+        thread.join(timeout=self._callback_timeout)
+        if thread.is_alive():
+            _logger.error(
+                "Termination callback for %s exceeded %.2fs; leaving daemon thread to drain",
+                agent_did,
+                self._callback_timeout,
+            )
+            return False
+        if error_box:
+            _logger.error(
+                "Termination callback for %s raised %s: %s",
+                agent_did,
+                type(error_box[0]).__name__,
+                error_box[0],
+            )
+            return False
+        return True
     def _find_substitute(
         self, session_id: str, exclude_did: str
     ) -> str | None:

agent_hypervisor 3.4.0__tar.gz → 3.6.0__tar.gz

agent_hypervisor 3.4.0.tar.gz → 3.6.0.tar.gz