agent_hypervisor 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_hypervisor-3.1.0.dist-info/METADATA +824 -0
- agent_hypervisor-3.1.0.dist-info/RECORD +60 -0
- agent_hypervisor-3.1.0.dist-info/WHEEL +4 -0
- agent_hypervisor-3.1.0.dist-info/entry_points.txt +2 -0
- agent_hypervisor-3.1.0.dist-info/licenses/LICENSE +21 -0
- hypervisor/__init__.py +160 -0
- hypervisor/api/__init__.py +7 -0
- hypervisor/api/models.py +285 -0
- hypervisor/api/server.py +742 -0
- hypervisor/audit/__init__.py +4 -0
- hypervisor/audit/commitment.py +76 -0
- hypervisor/audit/delta.py +135 -0
- hypervisor/audit/gc.py +99 -0
- hypervisor/cli/__init__.py +3 -0
- hypervisor/cli/formatters.py +99 -0
- hypervisor/cli/session_commands.py +200 -0
- hypervisor/constants.py +106 -0
- hypervisor/core.py +352 -0
- hypervisor/integrations/__init__.py +10 -0
- hypervisor/integrations/iatp_adapter.py +142 -0
- hypervisor/integrations/nexus_adapter.py +108 -0
- hypervisor/integrations/verification_adapter.py +122 -0
- hypervisor/liability/__init__.py +142 -0
- hypervisor/liability/attribution.py +86 -0
- hypervisor/liability/ledger.py +121 -0
- hypervisor/liability/quarantine.py +119 -0
- hypervisor/liability/slashing.py +80 -0
- hypervisor/liability/vouching.py +134 -0
- hypervisor/models.py +277 -0
- hypervisor/observability/__init__.py +27 -0
- hypervisor/observability/causal_trace.py +70 -0
- hypervisor/observability/event_bus.py +222 -0
- hypervisor/observability/prometheus_collector.py +248 -0
- hypervisor/observability/saga_span_exporter.py +341 -0
- hypervisor/providers.py +121 -0
- hypervisor/py.typed +0 -0
- hypervisor/reversibility/__init__.py +3 -0
- hypervisor/reversibility/registry.py +108 -0
- hypervisor/rings/__init__.py +21 -0
- hypervisor/rings/breach_detector.py +200 -0
- hypervisor/rings/classifier.py +78 -0
- hypervisor/rings/elevation.py +219 -0
- hypervisor/rings/enforcer.py +97 -0
- hypervisor/saga/__init__.py +22 -0
- hypervisor/saga/checkpoint.py +110 -0
- hypervisor/saga/dsl.py +190 -0
- hypervisor/saga/fan_out.py +126 -0
- hypervisor/saga/orchestrator.py +229 -0
- hypervisor/saga/schema.py +244 -0
- hypervisor/saga/state_machine.py +157 -0
- hypervisor/security/__init__.py +13 -0
- hypervisor/security/kill_switch.py +200 -0
- hypervisor/security/rate_limiter.py +190 -0
- hypervisor/session/__init__.py +194 -0
- hypervisor/session/intent_locks.py +118 -0
- hypervisor/session/isolation.py +37 -0
- hypervisor/session/sso.py +169 -0
- hypervisor/session/vector_clock.py +118 -0
- hypervisor/verification/__init__.py +3 -0
- hypervisor/verification/history.py +173 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
# Public Preview — basic implementation
|
|
4
|
+
"""
|
|
5
|
+
JSON Schema validation for Saga DSL definitions.
|
|
6
|
+
|
|
7
|
+
Validates saga definitions at parse time with clear error messages
|
|
8
|
+
for missing fields, invalid types, and constraint violations.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import jsonschema
|
|
16
|
+
|
|
17
|
+
# Namespaces a step's ``action_id`` is expected to start with
# (for example "model.validate" or "deploy.k8s").
VALID_ACTION_PREFIXES = (
    "model.",
    "data.",
    "deploy.",
    "validate.",
    "notify.",
    "infra.",
    "security.",
    "monitor.",
    "config.",
    "test.",
)
|
|
22
|
+
|
|
23
|
+
# JSON Schema for a single entry of a saga definition's "steps" array.
# Only "id", "action_id" and "agent" are mandatory; keys that are not listed
# under "properties" are rejected ("additionalProperties": False).
SAGA_STEP_SCHEMA: dict[str, Any] = {
    "type": "object",
    "required": ["id", "action_id", "agent"],
    "properties": {
        # Identity fields: required, non-empty strings.
        "id": {"type": "string", "minLength": 1, "description": "Unique step identifier"},
        "action_id": {
            "type": "string",
            "minLength": 1,
            "description": "Action type (e.g. 'model.validate', 'deploy.k8s')",
        },
        "agent": {"type": "string", "minLength": 1, "description": "Agent DID or identifier"},
        # Endpoints: the undo endpoint may be given explicitly as null.
        "execute_api": {"type": "string", "description": "API endpoint for execution"},
        "undo_api": {
            "type": ["string", "null"],
            "description": "API endpoint for compensation/rollback",
        },
        # Execution limits: bounded integers.
        "timeout": {
            "type": "integer",
            "minimum": 1,
            "maximum": 86400,
            "description": "Timeout in seconds (1–86400)",
        },
        "retries": {
            "type": "integer",
            "minimum": 0,
            "maximum": 10,
            "description": "Max retries (0–10)",
        },
        "checkpoint_goal": {"type": ["string", "null"], "description": "Semantic checkpoint goal"},
        # Dependencies: a duplicate-free array of non-empty step IDs.
        "depends_on": {
            "type": "array",
            "items": {"type": "string", "minLength": 1},
            "uniqueItems": True,
            "description": "Step IDs this step depends on",
        },
    },
    "additionalProperties": False,
}
|
|
75
|
+
|
|
76
|
+
# JSON Schema (draft 2020-12) for a complete saga definition document.
# "name", "session_id" and "steps" are mandatory; keys that are not listed
# under "properties" are rejected ("additionalProperties": False).
SAGA_DEFINITION_SCHEMA: dict[str, Any] = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "SagaDefinition",
    "description": "Schema for saga DSL definitions",
    "type": "object",
    "required": ["name", "session_id", "steps"],
    "properties": {
        "name": {"type": "string", "minLength": 1, "description": "Saga name"},
        "session_id": {"type": "string", "minLength": 1, "description": "Session identifier"},
        "saga_id": {"type": "string", "description": "Optional saga identifier"},
        # At least one step is required; each one must match SAGA_STEP_SCHEMA.
        "steps": {
            "type": "array",
            "minItems": 1,
            "items": SAGA_STEP_SCHEMA,
            "description": "Ordered list of saga steps",
        },
        # Fan-out groups are only shape-checked here (policy string plus a
        # list of branch strings); neither key is required.
        "fan_out": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "policy": {"type": "string"},
                    "branches": {"type": "array", "items": {"type": "string"}},
                },
            },
            "description": "Fan-out groups (Public Preview: ignored)",
        },
        # Free-form object; its contents are not constrained by this schema.
        "metadata": {"type": "object", "description": "Arbitrary metadata"},
    },
    "additionalProperties": False,
}
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class SagaSchemaValidator:
    """Validates saga definitions against JSON schema and semantic rules.

    Validation happens in two stages: the definition is first checked against
    ``SAGA_DEFINITION_SCHEMA``; only when that produces no errors are the
    semantic rules (unique IDs, action prefixes, compensation endpoints,
    dependency references and cycles) evaluated.
    """

    def __init__(self) -> None:
        # Build the schema validator once; validate() reuses it on every call.
        self._validator = jsonschema.Draft202012Validator(SAGA_DEFINITION_SCHEMA)

    def validate(self, definition: dict[str, Any]) -> list[str]:
        """Validate definition and return list of error messages (empty = valid).

        Performs both JSON schema validation and semantic checks:
        - Required fields and types
        - Step structure constraints
        - Unique step IDs
        - Valid action type prefixes
        - Timeout and retry ranges
        - Compensation requirements
        - Step dependency references

        Args:
            definition: The parsed saga definition to check.

        Returns:
            Human-readable error messages. Schema errors are prefixed with the
            dotted path of the offending element (``(root)`` for the document
            itself). Semantic errors are only reported when there are no
            schema errors.
        """
        errors: list[str] = []

        # JSON schema validation, reported in path order for stable output.
        for error in sorted(self._validator.iter_errors(definition), key=lambda e: list(e.path)):
            path = ".".join(str(p) for p in error.absolute_path) or "(root)"
            errors.append(f"[{path}] {error.message}")

        # Semantic checks only if basic structure is valid. The ``not errors``
        # test comes first so ``.get`` is never called on a non-dict input.
        if not errors and isinstance(definition.get("steps"), list):
            errors.extend(self._check_semantic_rules(definition))

        return errors

    def validate_or_raise(self, definition: dict[str, Any]) -> None:
        """Validate and raise SagaSchemaError if invalid.

        Raises:
            SagaSchemaError: When validate() returns at least one message; the
                individual messages are attached as ``errors``.
        """
        errors = self.validate(definition)
        if errors:
            raise SagaSchemaError(
                f"Saga definition has {len(errors)} validation error(s):\n"
                + "\n".join(f"  - {e}" for e in errors),
                errors=errors,
            )

    def _check_semantic_rules(self, definition: dict[str, Any]) -> list[str]:
        """Run the checks that JSON schema cannot express.

        Expects ``definition["steps"]`` to be a list of step dicts (validate()
        only calls this after schema validation succeeded).

        Returns:
            One message per violated rule, in the order the rules are checked.
        """
        errors: list[str] = []
        steps = definition["steps"]
        step_ids: set[str] = set()

        for i, step in enumerate(steps):
            sid = step.get("id", f"<index {i}>")

            # Duplicate step IDs
            if sid in step_ids:
                errors.append(f"Duplicate step ID: '{sid}'")
            step_ids.add(sid)

            # Action type prefix validation (str.startswith accepts a tuple).
            action_id = step.get("action_id", "")
            if action_id and not action_id.startswith(VALID_ACTION_PREFIXES):
                errors.append(
                    f"Step '{sid}': action_id '{action_id}' does not start with a "
                    f"valid prefix ({', '.join(VALID_ACTION_PREFIXES)})"
                )

            # Compensation requirement: every step should have undo_api.
            # An explicit null counts as missing.
            if step.get("undo_api") is None:
                errors.append(
                    f"Step '{sid}': missing 'undo_api' — every action should have a compensation endpoint"
                )

        # Dependency validation: runs after all IDs are collected so that a
        # step may reference one that is declared later in the list.
        for step in steps:
            for dep in step.get("depends_on", []):
                if dep not in step_ids:
                    errors.append(
                        f"Step '{step['id']}': depends_on references unknown step '{dep}'"
                    )

        # Circular dependency detection
        errors.extend(self._check_circular_deps(steps))

        return errors

    def _check_circular_deps(self, steps: list[dict[str, Any]]) -> list[str]:
        """Detect circular dependencies via an iterative depth-first search.

        Each step is used as a traversal root unless an earlier traversal has
        already visited it. A traversal stops at the first dependency that
        points back onto the path currently being explored and reports it.

        The search uses an explicit stack, so long dependency chains cannot
        exhaust Python's recursion limit, and the "on current path" set is
        reset whenever a traversal is abandoned, so nodes left over from an
        earlier cycle are never mistaken for a new cycle.

        Returns:
            One message per traversal that ran into a cycle.
        """
        graph: dict[str, list[str]] = {}
        for step in steps:
            sid = step.get("id", "")
            graph[sid] = step.get("depends_on", [])

        visited: set[str] = set()
        on_path: set[str] = set()  # nodes on the path from the current root
        errors: list[str] = []

        for root in graph:
            if root in visited:
                continue

            visited.add(root)
            on_path.add(root)
            # Each frame pairs a node with an iterator over its unexplored deps.
            stack = [(root, iter(graph[root]))]

            while stack:
                node, pending = stack[-1]
                for dep in pending:
                    if dep in on_path:
                        # Back edge: dep is an ancestor of node (or node itself).
                        errors.append(f"Circular dependency detected involving step '{dep}'")
                        # Abandon this traversal and forget its path entirely.
                        stack.clear()
                        on_path.clear()
                        break
                    if dep not in visited:
                        visited.add(dep)
                        on_path.add(dep)
                        # Unknown IDs have no entry in graph; treat as leaves.
                        stack.append((dep, iter(graph.get(dep, ()))))
                        break
                else:
                    # All dependencies explored: leave the current path.
                    on_path.discard(node)
                    stack.pop()

        return errors
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
class SagaSchemaError(Exception):
    """Error signalling that a saga definition did not pass validation.

    Attributes:
        errors: The individual validation messages; an empty list when none
            (or an empty list) were supplied.
    """

    def __init__(self, message: str, errors: list[str] | None = None) -> None:
        super().__init__(message)
        # Keep the caller's list when it has entries, otherwise start fresh.
        self.errors = errors if errors else []
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""
|
|
4
|
+
Saga State Machine
|
|
5
|
+
|
|
6
|
+
Formal state tracking for individual saga steps and overall saga lifecycle,
|
|
7
|
+
with persistence support for crash recovery.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from datetime import UTC, datetime
|
|
14
|
+
from enum import Enum
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class StepState(str, Enum):
    """Lifecycle states of one saga step.

    Members are ``str`` subclasses, so each one compares equal to its value.
    """

    # Forward path.
    PENDING = "pending"
    EXECUTING = "executing"
    COMMITTED = "committed"

    # Rollback path.
    COMPENSATING = "compensating"
    COMPENSATED = "compensated"
    COMPENSATION_FAILED = "compensation_failed"

    # Forward execution did not succeed.
    FAILED = "failed"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SagaState(str, Enum):
    """Lifecycle states of a saga as a whole.

    Members are ``str`` subclasses, so each one compares equal to its value.
    """

    # In-progress states.
    RUNNING = "running"
    COMPENSATING = "compensating"

    # End states.
    COMPLETED = "completed"
    FAILED = "failed"
    ESCALATED = "escalated"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Allowed step transitions: each state maps to the set of states it may move
# to next. An empty set means the state has no outgoing transition.
STEP_TRANSITIONS: dict[StepState, set[StepState]] = {
    StepState.PENDING: {StepState.EXECUTING},
    StepState.EXECUTING: {StepState.COMMITTED, StepState.FAILED},
    StepState.COMMITTED: {StepState.COMPENSATING},
    StepState.COMPENSATING: {StepState.COMPENSATED, StepState.COMPENSATION_FAILED},
    # Terminal states; every one gets its own empty set.
    **{
        terminal: set()
        for terminal in (
            StepState.COMPENSATED,
            StepState.COMPENSATION_FAILED,
            StepState.FAILED,
        )
    },
}
|
|
50
|
+
|
|
51
|
+
# Allowed saga transitions: each state maps to the set of states it may move
# to next. An empty set means the state has no outgoing transition.
SAGA_TRANSITIONS: dict[SagaState, set[SagaState]] = {
    SagaState.RUNNING: {SagaState.COMPENSATING, SagaState.COMPLETED, SagaState.FAILED},
    SagaState.COMPENSATING: {SagaState.COMPLETED, SagaState.FAILED, SagaState.ESCALATED},
    # Terminal states; every one gets its own empty set.
    **{
        terminal: set()
        for terminal in (SagaState.COMPLETED, SagaState.FAILED, SagaState.ESCALATED)
    },
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class SagaStep:
    """One step of a saga together with its execution bookkeeping."""

    # Identity and endpoints.
    step_id: str
    action_id: str
    agent_did: str
    execute_api: str
    undo_api: str | None = None

    # Current lifecycle state; changed through transition().
    state: StepState = StepState.PENDING

    # Outcome slots; transition() itself never writes these.
    execute_result: Any | None = None
    compensation_result: Any | None = None
    error: str | None = None

    # Timestamps stamped by transition().
    started_at: datetime | None = None
    completed_at: datetime | None = None

    # Execution limits and retry counter.
    timeout_seconds: int = 300
    max_retries: int = 0
    retry_count: int = 0

    def transition(self, new_state: StepState) -> None:
        """Move the step to ``new_state`` if STEP_TRANSITIONS permits it.

        Entering EXECUTING records ``started_at``; entering COMMITTED,
        COMPENSATED, COMPENSATION_FAILED or FAILED records ``completed_at``.

        Raises:
            SagaStateError: If ``new_state`` is not reachable from the
                current state.
        """
        permitted = STEP_TRANSITIONS.get(self.state, set())
        if new_state not in permitted:
            raise SagaStateError(
                f"Invalid step transition: {self.state.value} → {new_state.value}. "
                f"Allowed: {[s.value for s in permitted]}"
            )

        self.state = new_state
        stamp = datetime.now(UTC)

        if new_state == StepState.EXECUTING:
            self.started_at = stamp
            return

        finishing_states = (
            StepState.COMMITTED,
            StepState.COMPENSATED,
            StepState.COMPENSATION_FAILED,
            StepState.FAILED,
        )
        if new_state in finishing_states:
            self.completed_at = stamp
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class Saga:
    """An ordered collection of steps plus the saga-level lifecycle state."""

    saga_id: str
    session_id: str
    steps: list[SagaStep] = field(default_factory=list)
    state: SagaState = SagaState.RUNNING
    # Timezone-aware (UTC) creation time, taken when the instance is built.
    created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
    # Set by transition() when the saga reaches an end state.
    completed_at: datetime | None = None
    error: str | None = None

    def transition(self, new_state: SagaState) -> None:
        """Move the saga to ``new_state`` if SAGA_TRANSITIONS permits it.

        Entering COMPLETED, FAILED or ESCALATED records ``completed_at``.

        Raises:
            SagaStateError: If ``new_state`` is not reachable from the
                current state.
        """
        permitted = SAGA_TRANSITIONS.get(self.state, set())
        if new_state not in permitted:
            raise SagaStateError(
                f"Invalid saga transition: {self.state.value} → {new_state.value}. "
                f"Allowed: {[s.value for s in permitted]}"
            )

        self.state = new_state
        end_states = (SagaState.COMPLETED, SagaState.FAILED, SagaState.ESCALATED)
        if new_state in end_states:
            self.completed_at = datetime.now(UTC)

    @property
    def committed_steps(self) -> list[SagaStep]:
        """Steps currently in the COMMITTED state, in declaration order."""
        selected = []
        for step in self.steps:
            if step.state == StepState.COMMITTED:
                selected.append(step)
        return selected

    @property
    def committed_steps_reversed(self) -> list[SagaStep]:
        """The committed steps, last one first (the order used for rollback)."""
        return self.committed_steps[::-1]

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of the saga and its steps.

        Timestamps are rendered as ISO-8601 strings (``completed_at`` is
        ``None`` while unset). Per step only ``step_id``, ``action_id``,
        ``agent_did``, ``state`` and ``error`` are included.
        """
        finished = self.completed_at.isoformat() if self.completed_at else None
        snapshot = {
            "saga_id": self.saga_id,
            "session_id": self.session_id,
            "state": self.state.value,
            "created_at": self.created_at.isoformat(),
            "completed_at": finished,
            "error": self.error,
        }

        step_records = []
        for step in self.steps:
            step_records.append(
                {
                    "step_id": step.step_id,
                    "action_id": step.action_id,
                    "agent_did": step.agent_did,
                    "state": step.state.value,
                    "error": step.error,
                }
            )
        # Added last so that "steps" stays the final key.
        snapshot["steps"] = step_records
        return snapshot
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class SagaStateError(Exception):
    """Signals an attempt to make a state transition that is not allowed.

    Raised by the ``transition`` methods of ``SagaStep`` and ``Saga``.
    """
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""Security module — rate limiting, kill switch, and agent protection."""
|
|
4
|
+
|
|
5
|
+
from hypervisor.security.kill_switch import KillResult, KillSwitch
|
|
6
|
+
from hypervisor.security.rate_limiter import AgentRateLimiter, RateLimitExceeded
|
|
7
|
+
|
|
8
|
+
# Names re-exported as the public API of the security package.
__all__ = ["AgentRateLimiter", "RateLimitExceeded", "KillSwitch", "KillResult"]
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""
|
|
4
|
+
Kill Switch — agent termination with optional handoff.
|
|
5
|
+
|
|
6
|
+
Terminates agent processes via registered callbacks and hands off
|
|
7
|
+
in-flight saga steps to a substitute agent when one is available.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import uuid
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from datetime import UTC, datetime
|
|
17
|
+
from enum import Enum
|
|
18
|
+
|
|
19
|
+
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class KillReason(str, Enum):
    """Reasons that can be recorded for killing an agent.

    Members are ``str`` subclasses, so each one compares equal to its value.
    """

    # Policy or behaviour triggered.
    BEHAVIORAL_DRIFT = "behavioral_drift"
    RATE_LIMIT = "rate_limit"
    RING_BREACH = "ring_breach"

    # Requested explicitly.
    MANUAL = "manual"

    # Timeouts.
    QUARANTINE_TIMEOUT = "quarantine_timeout"
    SESSION_TIMEOUT = "session_timeout"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class HandoffStatus(str, Enum):
    """Possible outcomes recorded for a saga step handoff.

    Members are ``str`` subclasses, so each one compares equal to its value.
    """

    PENDING = "pending"
    # The step was assigned to a substitute agent.
    HANDED_OFF = "handed_off"
    FAILED = "failed"
    # No substitute took the step over.
    COMPENSATED = "compensated"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class StepHandoff:
    """Record of what happened to one in-flight saga step during a kill."""

    # Which step, and which saga it belongs to.
    step_id: str
    saga_id: str
    # The agent that was killed while holding the step.
    from_agent: str
    # The substitute that received the step; None when there was none.
    to_agent: str | None = None
    # Defaults to COMPENSATED, the status used when no substitute exists.
    status: HandoffStatus = HandoffStatus.COMPENSATED
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class KillResult:
    """Outcome record of a single kill switch operation."""

    # "kill:" followed by the first 8 hex characters of a random UUID4.
    kill_id: str = field(default_factory=lambda: "kill:" + uuid.uuid4().hex[:8])
    agent_did: str = ""
    session_id: str = ""
    reason: KillReason = KillReason.MANUAL
    # Timezone-aware (UTC) time at which the record was created.
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
    # One entry per in-flight step that was processed.
    handoffs: list[StepHandoff] = field(default_factory=list)
    # Number of steps that were given to a substitute.
    handoff_success_count: int = 0
    # True when at least one handoff ended as COMPENSATED.
    compensation_triggered: bool = False
    # True when a termination callback was invoked for the agent.
    terminated: bool = False
    details: str = ""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class KillSwitch:
    """
    Kill switch with agent process registry and handoff support.

    Agents register termination callbacks via ``register_agent``. When
    ``kill`` is called the switch hands in-flight saga steps to a
    registered substitute (if any) and then invokes the termination
    callback to stop the agent process.

    Substitutes are tracked per session. An agent appears at most once in a
    session's substitute list, and unregistering it removes it completely,
    so a killed agent can never be picked as a handoff target afterwards.
    """

    def __init__(self) -> None:
        self._kill_history: list[KillResult] = []
        # session_id -> substitute agent DIDs, in registration order.
        self._substitutes: dict[str, list[str]] = {}
        # agent_did -> callback that terminates the agent.
        self._agents: dict[str, Callable[[], None]] = {}

    # ── Agent process registry ─────────────────────────────────────

    def register_agent(
        self, agent_did: str, process_handle: Callable[[], None]
    ) -> None:
        """Register an agent with its termination callback.

        Registering the same ``agent_did`` again replaces the callback.
        """
        self._agents[agent_did] = process_handle

    def unregister_agent(self, agent_did: str) -> None:
        """Remove an agent from the process registry (no-op if unknown)."""
        self._agents.pop(agent_did, None)

    # ── Substitute management ──────────────────────────────────────

    def register_substitute(
        self, session_id: str, agent_did: str
    ) -> None:
        """Register a substitute agent for a session.

        Registering an agent that is already a substitute for the session
        has no effect, so the list never contains duplicates.
        """
        subs = self._substitutes.setdefault(session_id, [])
        if agent_did not in subs:
            subs.append(agent_did)

    def unregister_substitute(
        self, session_id: str, agent_did: str
    ) -> None:
        """Remove an agent from a session's substitutes (no-op if absent).

        Every occurrence is removed, not just the first one.
        """
        subs = self._substitutes.get(session_id)
        if subs:
            # Slice assignment keeps the same list object in the registry.
            subs[:] = [sub for sub in subs if sub != agent_did]

    # ── Kill ───────────────────────────────────────────────────────

    def kill(
        self,
        agent_did: str,
        session_id: str,
        reason: KillReason,
        in_flight_steps: list[dict] | None = None,
        details: str = "",
    ) -> KillResult:
        """Kill an agent, handing off in-flight steps to a substitute if available.

        Args:
            agent_did: The agent to terminate.
            session_id: Session whose substitute list is consulted.
            reason: Why the agent is being killed.
            in_flight_steps: Dicts read for the keys ``"step_id"`` and
                ``"saga_id"`` (each defaults to ``""`` when missing).
                ``None`` is treated as an empty list.
            details: Free-form text stored on the result.

        Returns:
            The KillResult, which is also appended to the kill history.
            ``terminated`` is True only when a termination callback was
            registered for the agent (and therefore invoked).

        NOTE(review): an exception raised by the termination callback
        propagates to the caller; in that case no KillResult is recorded and
        the agent stays registered.
        """
        in_flight = in_flight_steps or []

        # One substitute (if any) receives every in-flight step.
        substitute = self._find_substitute(session_id, agent_did)
        if substitute is not None:
            status = HandoffStatus.HANDED_OFF
        else:
            status = HandoffStatus.COMPENSATED

        handoffs = [
            StepHandoff(
                step_id=step_info.get("step_id", ""),
                saga_id=step_info.get("saga_id", ""),
                from_agent=agent_did,
                to_agent=substitute,
                status=status,
            )
            for step_info in in_flight
        ]
        handoff_success_count = len(handoffs) if substitute is not None else 0

        # Terminate the agent process
        terminated = False
        callback = self._agents.get(agent_did)
        if callback is not None:
            callback()
            terminated = True
        else:
            _logger.warning(
                "No termination callback registered for agent %s",
                agent_did,
            )

        result = KillResult(
            agent_did=agent_did,
            session_id=session_id,
            reason=reason,
            handoffs=handoffs,
            handoff_success_count=handoff_success_count,
            compensation_triggered=any(
                h.status == HandoffStatus.COMPENSATED for h in handoffs
            ),
            terminated=terminated,
            details=details,
        )
        self._kill_history.append(result)
        # The killed agent must no longer act as a substitute in this session
        # or be reachable through the process registry.
        self.unregister_substitute(session_id, agent_did)
        self.unregister_agent(agent_did)
        return result

    def _find_substitute(
        self, session_id: str, exclude_did: str
    ) -> str | None:
        """Find a registered substitute for the session, excluding the given agent.

        Returns the earliest-registered match, or None when there is none.
        """
        subs = self._substitutes.get(session_id, [])
        return next((sub for sub in subs if sub != exclude_did), None)

    @property
    def kill_history(self) -> list[KillResult]:
        """A copy of the list of all recorded kill results, oldest first."""
        return list(self._kill_history)

    @property
    def total_kills(self) -> int:
        """Number of recorded kills."""
        return len(self._kill_history)

    @property
    def total_handoffs(self) -> int:
        """Total number of steps handed to substitutes across all kills."""
        return sum(r.handoff_success_count for r in self._kill_history)
|