proxilion 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proxilion/__init__.py +136 -0
- proxilion/audit/__init__.py +133 -0
- proxilion/audit/base_exporters.py +527 -0
- proxilion/audit/compliance/__init__.py +130 -0
- proxilion/audit/compliance/base.py +457 -0
- proxilion/audit/compliance/eu_ai_act.py +603 -0
- proxilion/audit/compliance/iso27001.py +544 -0
- proxilion/audit/compliance/soc2.py +491 -0
- proxilion/audit/events.py +493 -0
- proxilion/audit/explainability.py +1173 -0
- proxilion/audit/exporters/__init__.py +58 -0
- proxilion/audit/exporters/aws_s3.py +636 -0
- proxilion/audit/exporters/azure_storage.py +608 -0
- proxilion/audit/exporters/cloud_base.py +468 -0
- proxilion/audit/exporters/gcp_storage.py +570 -0
- proxilion/audit/exporters/multi_exporter.py +498 -0
- proxilion/audit/hash_chain.py +652 -0
- proxilion/audit/logger.py +543 -0
- proxilion/caching/__init__.py +49 -0
- proxilion/caching/tool_cache.py +633 -0
- proxilion/context/__init__.py +73 -0
- proxilion/context/context_window.py +556 -0
- proxilion/context/message_history.py +505 -0
- proxilion/context/session.py +735 -0
- proxilion/contrib/__init__.py +51 -0
- proxilion/contrib/anthropic.py +609 -0
- proxilion/contrib/google.py +1012 -0
- proxilion/contrib/langchain.py +641 -0
- proxilion/contrib/mcp.py +893 -0
- proxilion/contrib/openai.py +646 -0
- proxilion/core.py +3058 -0
- proxilion/decorators.py +966 -0
- proxilion/engines/__init__.py +287 -0
- proxilion/engines/base.py +266 -0
- proxilion/engines/casbin_engine.py +412 -0
- proxilion/engines/opa_engine.py +493 -0
- proxilion/engines/simple.py +437 -0
- proxilion/exceptions.py +887 -0
- proxilion/guards/__init__.py +54 -0
- proxilion/guards/input_guard.py +522 -0
- proxilion/guards/output_guard.py +634 -0
- proxilion/observability/__init__.py +198 -0
- proxilion/observability/cost_tracker.py +866 -0
- proxilion/observability/hooks.py +683 -0
- proxilion/observability/metrics.py +798 -0
- proxilion/observability/session_cost_tracker.py +1063 -0
- proxilion/policies/__init__.py +67 -0
- proxilion/policies/base.py +304 -0
- proxilion/policies/builtin.py +486 -0
- proxilion/policies/registry.py +376 -0
- proxilion/providers/__init__.py +201 -0
- proxilion/providers/adapter.py +468 -0
- proxilion/providers/anthropic_adapter.py +330 -0
- proxilion/providers/gemini_adapter.py +391 -0
- proxilion/providers/openai_adapter.py +294 -0
- proxilion/py.typed +0 -0
- proxilion/resilience/__init__.py +81 -0
- proxilion/resilience/degradation.py +615 -0
- proxilion/resilience/fallback.py +555 -0
- proxilion/resilience/retry.py +554 -0
- proxilion/scheduling/__init__.py +57 -0
- proxilion/scheduling/priority_queue.py +419 -0
- proxilion/scheduling/scheduler.py +459 -0
- proxilion/security/__init__.py +244 -0
- proxilion/security/agent_trust.py +968 -0
- proxilion/security/behavioral_drift.py +794 -0
- proxilion/security/cascade_protection.py +869 -0
- proxilion/security/circuit_breaker.py +428 -0
- proxilion/security/cost_limiter.py +690 -0
- proxilion/security/idor_protection.py +460 -0
- proxilion/security/intent_capsule.py +849 -0
- proxilion/security/intent_validator.py +495 -0
- proxilion/security/memory_integrity.py +767 -0
- proxilion/security/rate_limiter.py +509 -0
- proxilion/security/scope_enforcer.py +680 -0
- proxilion/security/sequence_validator.py +636 -0
- proxilion/security/trust_boundaries.py +784 -0
- proxilion/streaming/__init__.py +70 -0
- proxilion/streaming/detector.py +761 -0
- proxilion/streaming/transformer.py +674 -0
- proxilion/timeouts/__init__.py +55 -0
- proxilion/timeouts/decorators.py +477 -0
- proxilion/timeouts/manager.py +545 -0
- proxilion/tools/__init__.py +69 -0
- proxilion/tools/decorators.py +493 -0
- proxilion/tools/registry.py +732 -0
- proxilion/types.py +339 -0
- proxilion/validation/__init__.py +93 -0
- proxilion/validation/pydantic_schema.py +351 -0
- proxilion/validation/schema.py +651 -0
- proxilion-0.0.1.dist-info/METADATA +872 -0
- proxilion-0.0.1.dist-info/RECORD +94 -0
- proxilion-0.0.1.dist-info/WHEEL +4 -0
- proxilion-0.0.1.dist-info/licenses/LICENSE +21 -0
proxilion/security/cascade_protection.py
@@ -0,0 +1,869 @@
+"""
+Cascading failure protection for Proxilion.
+
+This module provides cascade-aware circuit breaking to prevent failures
+from propagating through dependent tools and services.
+
+Quick Start:
+    >>> from proxilion.security import (
+    ...     DependencyGraph,
+    ...     CascadeProtector,
+    ...     CircuitBreakerRegistry,
+    ... )
+    >>>
+    >>> # Build dependency graph
+    >>> graph = DependencyGraph()
+    >>> graph.add_dependency("user_service", "database")
+    >>> graph.add_dependency("order_service", "user_service")
+    >>> graph.add_dependency("order_service", "inventory")
+    >>>
+    >>> # Create cascade protector
+    >>> registry = CircuitBreakerRegistry()
+    >>> protector = CascadeProtector(graph, registry)
+    >>>
+    >>> # Check health before calling a tool
+    >>> state = protector.check_cascade_health("order_service")
+    >>> if state == CascadeState.HEALTHY:
+    ...     # Safe to call
+    ...     result = call_order_service()
+    >>> elif state == CascadeState.DEGRADED:
+    ...     # Proceed with caution, some dependencies may be failing
+    ...     result = call_order_service(retry=False)
+    >>> else:
+    ...     # FAILING or ISOLATED - use fallback
+    ...     result = fallback_response()
+
+Cascade States:
+    - HEALTHY: All dependencies are functioning normally.
+    - DEGRADED: Some dependencies have failures but the tool can still function.
+    - FAILING: Critical dependencies are failing, tool should not be called.
+    - ISOLATED: Tool has been manually isolated from the system.
+
+Integration with Circuit Breakers:
+    >>> # When a circuit breaker opens, propagate the failure
+    >>> def on_breaker_open(tool_name):
+    ...     affected = protector.propagate_failure(tool_name)
+    ...     print(f"Failure in {tool_name} affected {len(affected)} tools")
+    >>>
+    >>> # Register callback with circuit breaker
+    >>> registry = CascadeAwareCircuitBreakerRegistry(protector)
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+from collections import defaultdict
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any
+
+from proxilion.security.circuit_breaker import CircuitBreakerRegistry, CircuitState
+
+logger = logging.getLogger(__name__)
+
+
+class CascadeState(Enum):
+    """State of a tool in the cascade protection system."""
+
+    HEALTHY = "healthy"
+    """All dependencies are functioning normally."""
+
+    DEGRADED = "degraded"
+    """Some dependencies have failures but the tool can still function."""
+
+    FAILING = "failing"
+    """Critical dependencies are failing, tool should not be called."""
+
+    ISOLATED = "isolated"
+    """Tool has been manually isolated from the system."""
+
+
+@dataclass
+class DependencyInfo:
+    """Information about a dependency."""
+
+    name: str
+    """Name of the dependency."""
+
+    critical: bool = True
+    """Whether this dependency is critical for the dependent tool."""
+
+    fallback: str | None = None
+    """Optional fallback tool to use if this dependency fails."""
+
+
+@dataclass
+class CascadeEvent:
+    """Record of a cascade event."""
+
+    timestamp: datetime
+    """When the event occurred."""
+
+    source_tool: str
+    """The tool that initiated the cascade."""
+
+    affected_tools: set[str]
+    """Tools affected by the cascade."""
+
+    event_type: str
+    """Type of event (failure_propagated, recovery_started, etc.)."""
+
+    details: dict[str, Any] = field(default_factory=dict)
+    """Additional event details."""
+
+
+class DependencyGraph:
+    """
+    Directed acyclic graph (DAG) of tool dependencies.
+
+    Tracks which tools depend on other tools, enabling cascade-aware
+    failure handling.
+
+    Example:
+        >>> graph = DependencyGraph()
+        >>> graph.add_dependency("api_gateway", "auth_service")
+        >>> graph.add_dependency("api_gateway", "rate_limiter")
+        >>> graph.add_dependency("auth_service", "database")
+        >>>
+        >>> # Get direct dependencies
+        >>> graph.get_dependencies("api_gateway")
+        {'auth_service', 'rate_limiter'}
+        >>>
+        >>> # Get all transitive dependencies
+        >>> graph.get_upstream("api_gateway")
+        {'auth_service', 'rate_limiter', 'database'}
+        >>>
+        >>> # Get tools that depend on this one
+        >>> graph.get_dependents("database")
+        {'auth_service'}
+        >>>
+        >>> # Get all tools that would be affected by a failure
+        >>> graph.get_downstream("database")
+        {'auth_service', 'api_gateway'}
+    """
+
+    def __init__(self):
+        """Initialize the dependency graph."""
+        self._dependencies: dict[str, dict[str, DependencyInfo]] = defaultdict(dict)
+        self._dependents: dict[str, set[str]] = defaultdict(set)
+        self._lock = threading.RLock()
+
+    def add_dependency(
+        self,
+        tool: str,
+        depends_on: str,
+        critical: bool = True,
+        fallback: str | None = None,
+    ) -> None:
+        """
+        Add a dependency relationship.
+
+        Args:
+            tool: The tool that has the dependency.
+            depends_on: The tool it depends on.
+            critical: Whether this is a critical dependency.
+            fallback: Optional fallback tool if the dependency fails.
+
+        Raises:
+            ValueError: If adding this dependency would create a cycle.
+
+        Example:
+            >>> graph.add_dependency("order_service", "database")
+            >>> graph.add_dependency("order_service", "cache", critical=False)
+        """
+        with self._lock:
+            # Check if adding this would create a cycle
+            if self._would_create_cycle(tool, depends_on):
+                raise ValueError(
+                    f"Adding dependency {tool} -> {depends_on} would create a cycle"
+                )
+
+            self._dependencies[tool][depends_on] = DependencyInfo(
+                name=depends_on,
+                critical=critical,
+                fallback=fallback,
+            )
+            self._dependents[depends_on].add(tool)
+
+    def remove_dependency(self, tool: str, depends_on: str) -> bool:
+        """
+        Remove a dependency relationship.
+
+        Args:
+            tool: The tool that has the dependency.
+            depends_on: The dependency to remove.
+
+        Returns:
+            True if the dependency was removed, False if not found.
+        """
+        with self._lock:
+            if depends_on in self._dependencies.get(tool, {}):
+                del self._dependencies[tool][depends_on]
+                self._dependents[depends_on].discard(tool)
+                return True
+            return False
+
+    def get_dependencies(self, tool: str) -> set[str]:
+        """
+        Get direct dependencies of a tool.
+
+        Args:
+            tool: The tool to get dependencies for.
+
+        Returns:
+            Set of tool names this tool directly depends on.
+        """
+        with self._lock:
+            return set(self._dependencies.get(tool, {}).keys())
+
+    def get_dependency_info(self, tool: str, depends_on: str) -> DependencyInfo | None:
+        """
+        Get detailed info about a dependency.
+
+        Args:
+            tool: The tool that has the dependency.
+            depends_on: The dependency to get info for.
+
+        Returns:
+            DependencyInfo if found, None otherwise.
+        """
+        with self._lock:
+            return self._dependencies.get(tool, {}).get(depends_on)
+
+    def get_dependents(self, tool: str) -> set[str]:
+        """
+        Get tools that directly depend on this tool.
+
+        Args:
+            tool: The tool to get dependents for.
+
+        Returns:
+            Set of tool names that directly depend on this tool.
+        """
+        with self._lock:
+            return set(self._dependents.get(tool, set()))
+
+    def get_upstream(self, tool: str) -> set[str]:
+        """
+        Get all transitive dependencies (upstream tools).
+
+        Args:
+            tool: The tool to get upstream dependencies for.
+
+        Returns:
+            Set of all tools this tool transitively depends on.
+        """
+        with self._lock:
+            visited: set[str] = set()
+            self._collect_upstream(tool, visited)
+            return visited
+
+    def _collect_upstream(self, tool: str, visited: set[str]) -> None:
+        """Recursively collect upstream dependencies."""
+        for dep in self._dependencies.get(tool, {}):
+            if dep not in visited:
+                visited.add(dep)
+                self._collect_upstream(dep, visited)
+
+    def get_downstream(self, tool: str) -> set[str]:
+        """
+        Get all tools that would be affected by this tool's failure.
+
+        Args:
+            tool: The tool to get downstream dependents for.
+
+        Returns:
+            Set of all tools that transitively depend on this tool.
+        """
+        with self._lock:
+            visited: set[str] = set()
+            self._collect_downstream(tool, visited)
+            return visited
+
+    def _collect_downstream(self, tool: str, visited: set[str]) -> None:
+        """Recursively collect downstream dependents."""
+        for dependent in self._dependents.get(tool, set()):
+            if dependent not in visited:
+                visited.add(dependent)
+                self._collect_downstream(dependent, visited)
+
+    def has_cycle(self) -> bool:
+        """
+        Check if the graph contains any cycles.
+
+        Returns:
+            True if a cycle exists, False otherwise.
+        """
+        with self._lock:
+            visited: set[str] = set()
+            rec_stack: set[str] = set()
+
+            for tool in self._dependencies:
+                if self._has_cycle_from(tool, visited, rec_stack):
+                    return True
+            return False
+
+    def _has_cycle_from(
+        self,
+        tool: str,
+        visited: set[str],
+        rec_stack: set[str],
+    ) -> bool:
+        """Check for cycle starting from a specific tool."""
+        visited.add(tool)
+        rec_stack.add(tool)
+
+        for dep in self._dependencies.get(tool, {}):
+            if dep not in visited:
+                if self._has_cycle_from(dep, visited, rec_stack):
+                    return True
+            elif dep in rec_stack:
+                return True
+
+        rec_stack.remove(tool)
+        return False
+
+    def _would_create_cycle(self, tool: str, depends_on: str) -> bool:
+        """Check if adding a dependency would create a cycle."""
+        if tool == depends_on:
+            return True
+
+        # Check if depends_on can reach tool (which would create a cycle)
+        upstream_of_depends_on = self.get_upstream(depends_on)
+        return tool in upstream_of_depends_on or tool == depends_on
+
+    def get_all_tools(self) -> set[str]:
+        """Get all tools in the graph."""
+        with self._lock:
+            tools = set(self._dependencies.keys())
+            for deps in self._dependencies.values():
+                tools.update(deps.keys())
+            return tools
+
+    def get_critical_dependencies(self, tool: str) -> set[str]:
+        """
+        Get only critical dependencies of a tool.
+
+        Args:
+            tool: The tool to get critical dependencies for.
+
+        Returns:
+            Set of critical dependency names.
+        """
+        with self._lock:
+            return {
+                name
+                for name, info in self._dependencies.get(tool, {}).items()
+                if info.critical
+            }
+
+    def to_dict(self) -> dict[str, list[dict[str, Any]]]:
+        """Convert graph to dictionary for serialization."""
+        with self._lock:
+            return {
+                tool: [
+                    {
+                        "name": info.name,
+                        "critical": info.critical,
+                        "fallback": info.fallback,
+                    }
+                    for info in deps.values()
+                ]
+                for tool, deps in self._dependencies.items()
+            }
+
+
+class CascadeProtector:
+    """
+    Main class for cascade-aware failure protection.
+
+    Monitors the health of tools and their dependencies, propagating
+    failure information and managing recovery.
+
+    Example:
+        >>> graph = DependencyGraph()
+        >>> graph.add_dependency("api", "database")
+        >>> graph.add_dependency("api", "cache", critical=False)
+        >>>
+        >>> registry = CircuitBreakerRegistry()
+        >>> protector = CascadeProtector(graph, registry)
+        >>>
+        >>> # Check health before calling
+        >>> state = protector.check_cascade_health("api")
+        >>> if state in (CascadeState.FAILING, CascadeState.ISOLATED):
+        ...     return use_fallback()
+        >>>
+        >>> # When a failure occurs, propagate it
+        >>> affected = protector.propagate_failure("database")
+        >>> print(f"Database failure affected {len(affected)} tools")
+    """
+
+    def __init__(
+        self,
+        graph: DependencyGraph,
+        circuit_registry: CircuitBreakerRegistry | None = None,
+        degraded_threshold: int = 1,
+        failing_threshold: int = 2,
+    ):
+        """
+        Initialize the cascade protector.
+
+        Args:
+            graph: The dependency graph to use.
+            circuit_registry: Optional circuit breaker registry for integration.
+            degraded_threshold: Number of failing dependencies to mark as DEGRADED.
+            failing_threshold: Number of critical failing deps to mark as FAILING.
+        """
+        self.graph = graph
+        self.circuit_registry = circuit_registry
+        self.degraded_threshold = degraded_threshold
+        self.failing_threshold = failing_threshold
+
+        self._tool_states: dict[str, CascadeState] = {}
+        self._isolated_tools: set[str] = set()
+        self._events: list[CascadeEvent] = []
+        self._lock = threading.RLock()
+        self._state_listeners: list[Callable[[str, CascadeState, CascadeState], None]] = []
+
+    def check_cascade_health(self, tool: str) -> CascadeState:
+        """
+        Check the cascade health of a tool.
+
+        Args:
+            tool: The tool to check health for.
+
+        Returns:
+            The current cascade state of the tool.
+
+        Example:
+            >>> state = protector.check_cascade_health("user_service")
+            >>> if state == CascadeState.HEALTHY:
+            ...     # All good
+            ...     pass
+            >>> elif state == CascadeState.DEGRADED:
+            ...     # Some non-critical dependencies failing
+            ...     pass
+        """
+        with self._lock:
+            # Check if manually isolated
+            if tool in self._isolated_tools:
+                return CascadeState.ISOLATED
+
+            # Check cached state if we have one (this is set by propagate_failure)
+            if tool in self._tool_states:
+                cached_state = self._tool_states[tool]
+                # Return cached FAILING or ISOLATED states
+                if cached_state in (CascadeState.FAILING, CascadeState.ISOLATED):
+                    return cached_state
+
+            # Calculate state based on dependencies
+            return self._calculate_state(tool)
+
+    def _calculate_state(self, tool: str) -> CascadeState:
+        """Calculate the cascade state based on dependency health."""
+        dependencies = self.graph.get_dependencies(tool)
+        if not dependencies:
+            return CascadeState.HEALTHY
+
+        failing_critical = 0
+        failing_total = 0
+
+        for dep in dependencies:
+            dep_state = self._get_tool_state(dep)
+            if dep_state in (CascadeState.FAILING, CascadeState.ISOLATED):
+                failing_total += 1
+                dep_info = self.graph.get_dependency_info(tool, dep)
+                if dep_info and dep_info.critical:
+                    failing_critical += 1
+            elif dep_state == CascadeState.DEGRADED:
+                failing_total += 0.5  # Degraded contributes half
+
+        if failing_critical >= self.failing_threshold:
+            return CascadeState.FAILING
+        elif failing_total >= self.degraded_threshold:
+            return CascadeState.DEGRADED
+        else:
+            return CascadeState.HEALTHY
+
+    def _get_tool_state(self, tool: str) -> CascadeState:
+        """Get the state of a tool, checking circuit breakers if available."""
+        # Check manual isolation first
+        if tool in self._isolated_tools:
+            return CascadeState.ISOLATED
+
+        # Check circuit breaker state
+        if self.circuit_registry:
+            try:
+                breaker = self.circuit_registry.get(tool, auto_create=False)
+                if breaker.state == CircuitState.OPEN:
+                    return CascadeState.FAILING
+                elif breaker.state == CircuitState.HALF_OPEN:
+                    return CascadeState.DEGRADED
+            except KeyError:
+                pass  # No breaker registered
+
+        # Check cached state
+        return self._tool_states.get(tool, CascadeState.HEALTHY)
+
+    def propagate_failure(self, tool: str) -> set[str]:
+        """
+        Propagate a failure through the dependency graph.
+
+        When a tool fails, this method marks all dependent tools as
+        DEGRADED or FAILING based on their dependency configuration.
+
+        Args:
+            tool: The tool that failed.
+
+        Returns:
+            Set of affected tool names.
+
+        Example:
+            >>> affected = protector.propagate_failure("database")
+            >>> print(f"Affected tools: {affected}")
+        """
+        with self._lock:
+            affected: set[str] = set()
+
+            # Mark the failing tool
+            old_state = self._tool_states.get(tool, CascadeState.HEALTHY)
+            self._tool_states[tool] = CascadeState.FAILING
+            self._notify_state_change(tool, old_state, CascadeState.FAILING)
+
+            # Propagate to dependents
+            self._propagate_to_dependents(tool, affected)
+
+            # Record event
+            self._events.append(
+                CascadeEvent(
+                    timestamp=datetime.now(timezone.utc),
+                    source_tool=tool,
+                    affected_tools=affected,
+                    event_type="failure_propagated",
+                    details={"total_affected": len(affected)},
+                )
+            )
+
+            logger.warning(
+                f"Cascade failure propagated from {tool}: "
+                f"{len(affected)} tools affected"
+            )
+
+            return affected
+
+    def _propagate_to_dependents(self, tool: str, affected: set[str]) -> None:
+        """Recursively propagate failure state to dependent tools."""
+        dependents = self.graph.get_dependents(tool)
+
+        for dependent in dependents:
+            if dependent in affected:
+                continue
+
+            affected.add(dependent)
+
+            # Calculate new state for this dependent
+            new_state = self._calculate_state(dependent)
+            old_state = self._tool_states.get(dependent, CascadeState.HEALTHY)
+
+            if new_state != old_state:
+                self._tool_states[dependent] = new_state
+                self._notify_state_change(dependent, old_state, new_state)
+
+            # Continue propagation if this tool is now failing
+            if new_state in (CascadeState.FAILING, CascadeState.DEGRADED):
+                self._propagate_to_dependents(dependent, affected)
+
+    def isolate_tool(self, tool: str) -> set[str]:
+        """
+        Manually isolate a tool from the system.
+
+        Isolated tools are treated as failing and their dependents
+        are marked accordingly.
+
+        Args:
+            tool: The tool to isolate.
+
+        Returns:
+            Set of affected tool names.
+
+        Example:
+            >>> # Isolate a tool for maintenance
+            >>> affected = protector.isolate_tool("database")
+        """
+        with self._lock:
+            self._isolated_tools.add(tool)
+            old_state = self._tool_states.get(tool, CascadeState.HEALTHY)
+            self._tool_states[tool] = CascadeState.ISOLATED
+            self._notify_state_change(tool, old_state, CascadeState.ISOLATED)
+
+            affected = self.propagate_failure(tool)
+
+            self._events.append(
+                CascadeEvent(
+                    timestamp=datetime.now(timezone.utc),
+                    source_tool=tool,
+                    affected_tools=affected,
+                    event_type="tool_isolated",
+                )
+            )
+
+            logger.info(f"Tool {tool} isolated, {len(affected)} tools affected")
+            return affected
+
+    def recover_tool(self, tool: str) -> set[str]:
+        """
+        Attempt to recover a tool from failed/isolated state.
+
+        This removes the tool from isolation and recalculates states
+        for all dependents.
+
+        Args:
+            tool: The tool to recover.
+
+        Returns:
+            Set of tools that may have improved states.
+
+        Example:
+            >>> # After maintenance, recover the tool
+            >>> recovered = protector.recover_tool("database")
+        """
+        with self._lock:
+            recovered: set[str] = set()
+
+            # Remove from isolation
+            self._isolated_tools.discard(tool)
+
+            # Check if circuit breaker is still failing
+            actual_state = CascadeState.HEALTHY
+            if self.circuit_registry:
+                try:
+                    breaker = self.circuit_registry.get(tool, auto_create=False)
+                    if breaker.state == CircuitState.OPEN:
+                        actual_state = CascadeState.FAILING
+                    elif breaker.state == CircuitState.HALF_OPEN:
+                        actual_state = CascadeState.DEGRADED
+                except KeyError:
+                    pass
+
+            old_state = self._tool_states.get(tool, CascadeState.HEALTHY)
+            if actual_state != old_state:
+                self._tool_states[tool] = actual_state
+                self._notify_state_change(tool, old_state, actual_state)
+
+            # Recalculate states for all dependents
+            self._recalculate_downstream(tool, recovered)
+
+            self._events.append(
+                CascadeEvent(
+                    timestamp=datetime.now(timezone.utc),
+                    source_tool=tool,
+                    affected_tools=recovered,
+                    event_type="recovery_started",
+                )
+            )
+
+            logger.info(f"Tool {tool} recovery started, {len(recovered)} tools may recover")
+            return recovered
+
+    def _recalculate_downstream(self, tool: str, recovered: set[str]) -> None:
+        """Recalculate states for downstream tools after recovery."""
+        dependents = self.graph.get_downstream(tool)
+
+        for dependent in dependents:
+            old_state = self._tool_states.get(dependent, CascadeState.HEALTHY)
+            new_state = self._calculate_state(dependent)
+
+            if new_state != old_state:
+                self._tool_states[dependent] = new_state
+                self._notify_state_change(dependent, old_state, new_state)
+
+                # If state improved, add to recovered set
+                if self._state_value(new_state) < self._state_value(old_state):
+                    recovered.add(dependent)
+
+    def _state_value(self, state: CascadeState) -> int:
+        """Get numeric value for state comparison (higher = worse)."""
+        return {
+            CascadeState.HEALTHY: 0,
+            CascadeState.DEGRADED: 1,
+            CascadeState.FAILING: 2,
+            CascadeState.ISOLATED: 3,
+        }.get(state, 0)
+
+    def get_healthy_alternatives(self, tool: str) -> list[str]:
+        """
+        Get healthy alternatives/fallbacks for a failing tool.
+
+        Args:
+            tool: The tool to find alternatives for.
+
+        Returns:
+            List of healthy alternative tool names.
+
+        Example:
+            >>> alternatives = protector.get_healthy_alternatives("primary_db")
+            >>> if alternatives:
+            ...     use_tool(alternatives[0])
+        """
+        with self._lock:
+            alternatives: list[str] = []
+
+            # Check configured fallbacks in dependencies
+            for dependent in self.graph.get_dependents(tool):
+                dep_info = self.graph.get_dependency_info(dependent, tool)
+                if dep_info and dep_info.fallback:
+                    fallback_state = self.check_cascade_health(dep_info.fallback)
+                    if fallback_state == CascadeState.HEALTHY:
+                        if dep_info.fallback not in alternatives:
+                            alternatives.append(dep_info.fallback)
+
+            return alternatives
+
+    def add_state_listener(
+        self,
+        listener: Callable[[str, CascadeState, CascadeState], None],
+    ) -> None:
+        """
+        Add a listener for state changes.
+
+        Args:
+            listener: Callback function(tool, old_state, new_state).
+
+        Example:
+            >>> def on_state_change(tool, old, new):
+            ...     print(f"{tool}: {old.value} -> {new.value}")
+            >>> protector.add_state_listener(on_state_change)
+        """
+        self._state_listeners.append(listener)
+
+    def _notify_state_change(
+        self,
+        tool: str,
+        old_state: CascadeState,
+        new_state: CascadeState,
+    ) -> None:
+        """Notify listeners of a state change."""
+        for listener in self._state_listeners:
+            try:
+                listener(tool, old_state, new_state)
+            except Exception as e:
+                logger.error(f"Error in state listener: {e}")
+
+    def get_cascade_events(self, limit: int = 100) -> list[CascadeEvent]:
+        """
+        Get recent cascade events.
+
+        Args:
+            limit: Maximum number of events to return.
+
+        Returns:
+            List of recent cascade events, newest first.
+        """
+        with self._lock:
+            return list(reversed(self._events[-limit:]))
+
+    def get_all_states(self) -> dict[str, CascadeState]:
+        """Get the current state of all tracked tools."""
+        with self._lock:
+            # Calculate states for all tools in the graph
+            all_tools = self.graph.get_all_tools()
+            return {tool: self.check_cascade_health(tool) for tool in all_tools}
+
+    def get_failing_tools(self) -> set[str]:
+        """Get all tools currently in FAILING or ISOLATED state."""
+        with self._lock:
+            return {
+                tool
+                for tool, state in self._tool_states.items()
+                if state in (CascadeState.FAILING, CascadeState.ISOLATED)
+            }
+
+    def get_degraded_tools(self) -> set[str]:
+        """Get all tools currently in DEGRADED state."""
+        with self._lock:
+            return {
+                tool
+                for tool, state in self._tool_states.items()
+                if state == CascadeState.DEGRADED
+            }
+
+    def reset(self) -> None:
+        """Reset all cascade states."""
+        with self._lock:
+            self._tool_states.clear()
+            self._isolated_tools.clear()
+            self._events.clear()
+
+
+class CascadeAwareCircuitBreakerRegistry(CircuitBreakerRegistry):
+    """
+    Circuit breaker registry with cascade protection integration.
+
+    Automatically propagates failures through the cascade protector
+    when circuit breakers open.
+
+    Example:
+        >>> graph = DependencyGraph()
+        >>> graph.add_dependency("api", "database")
+        >>>
+        >>> protector = CascadeProtector(graph)
+        >>> registry = CascadeAwareCircuitBreakerRegistry(protector)
+        >>>
+        >>> # When the database circuit opens, cascade protector is notified
+        >>> breaker = registry.get("database")
+        >>> try:
+        ...     result = breaker.call(database_query)
+        ... except:
+        ...     # Circuit may open after enough failures
+        ...     pass
+    """
+
+    def __init__(
+        self,
+        cascade_protector: CascadeProtector,
+        default_config: dict[str, Any] | None = None,
+    ):
+        """
+        Initialize the cascade-aware registry.
+
+        Args:
+            cascade_protector: The cascade protector to notify on failures.
+            default_config: Default circuit breaker configuration.
+        """
+        super().__init__(default_config)
+        self._cascade_protector = cascade_protector
+
+    def on_circuit_open(self, name: str) -> set[str]:
+        """
+        Handle a circuit opening.
+
+        Args:
+            name: Name of the circuit that opened.
+
+        Returns:
+            Set of affected tools from cascade propagation.
+        """
+        affected = self._cascade_protector.propagate_failure(name)
+        logger.warning(
+            f"Cascade from {name}: {len(affected)} tools affected"
+        )
+        return affected
+
+    def on_circuit_close(self, name: str) -> set[str]:
+        """
+        Handle a circuit closing (recovery).
+
+        Args:
+            name: Name of the circuit that closed.
+
+        Returns:
+            Set of tools that may have recovered.
+        """
+        recovered = self._cascade_protector.recover_tool(name)
+        logger.info(
+            f"Recovery from {name}: {len(recovered)} tools may recover"
+        )
+        return recovered
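
Usage sketch (not part of the wheel): a minimal end-to-end example of the cascade-protection API added in the diff above. It assumes DependencyGraph and CascadeProtector are importable from proxilion.security, as the module's Quick Start suggests; the tool names ("api", "database", "cache", "local_cache") are illustrative, and the optional CircuitBreakerRegistry integration is omitted so the sketch relies only on code shown in this file.

# Illustrative sketch only -- not part of the published package.
from proxilion.security import CascadeProtector, DependencyGraph

graph = DependencyGraph()
graph.add_dependency("api", "database")                 # critical by default
graph.add_dependency("api", "cache", critical=False, fallback="local_cache")

# A CircuitBreakerRegistry may optionally be passed as the second argument.
protector = CascadeProtector(graph)

# Observe every cascade state transition.
protector.add_state_listener(
    lambda tool, old, new: print(f"{tool}: {old.value} -> {new.value}")
)

# Simulate a cache outage and see what it touches.
affected = protector.propagate_failure("cache")
print(affected)                                         # {'api'}
print(protector.check_cascade_health("api"))            # CascadeState.DEGRADED (default thresholds)
print(protector.get_healthy_alternatives("cache"))      # ['local_cache']

# Once the cache is back, downstream states are recalculated.
protector.recover_tool("cache")
print(protector.check_cascade_health("api"))            # CascadeState.HEALTHY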