proxilion 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. proxilion/__init__.py +136 -0
  2. proxilion/audit/__init__.py +133 -0
  3. proxilion/audit/base_exporters.py +527 -0
  4. proxilion/audit/compliance/__init__.py +130 -0
  5. proxilion/audit/compliance/base.py +457 -0
  6. proxilion/audit/compliance/eu_ai_act.py +603 -0
  7. proxilion/audit/compliance/iso27001.py +544 -0
  8. proxilion/audit/compliance/soc2.py +491 -0
  9. proxilion/audit/events.py +493 -0
  10. proxilion/audit/explainability.py +1173 -0
  11. proxilion/audit/exporters/__init__.py +58 -0
  12. proxilion/audit/exporters/aws_s3.py +636 -0
  13. proxilion/audit/exporters/azure_storage.py +608 -0
  14. proxilion/audit/exporters/cloud_base.py +468 -0
  15. proxilion/audit/exporters/gcp_storage.py +570 -0
  16. proxilion/audit/exporters/multi_exporter.py +498 -0
  17. proxilion/audit/hash_chain.py +652 -0
  18. proxilion/audit/logger.py +543 -0
  19. proxilion/caching/__init__.py +49 -0
  20. proxilion/caching/tool_cache.py +633 -0
  21. proxilion/context/__init__.py +73 -0
  22. proxilion/context/context_window.py +556 -0
  23. proxilion/context/message_history.py +505 -0
  24. proxilion/context/session.py +735 -0
  25. proxilion/contrib/__init__.py +51 -0
  26. proxilion/contrib/anthropic.py +609 -0
  27. proxilion/contrib/google.py +1012 -0
  28. proxilion/contrib/langchain.py +641 -0
  29. proxilion/contrib/mcp.py +893 -0
  30. proxilion/contrib/openai.py +646 -0
  31. proxilion/core.py +3058 -0
  32. proxilion/decorators.py +966 -0
  33. proxilion/engines/__init__.py +287 -0
  34. proxilion/engines/base.py +266 -0
  35. proxilion/engines/casbin_engine.py +412 -0
  36. proxilion/engines/opa_engine.py +493 -0
  37. proxilion/engines/simple.py +437 -0
  38. proxilion/exceptions.py +887 -0
  39. proxilion/guards/__init__.py +54 -0
  40. proxilion/guards/input_guard.py +522 -0
  41. proxilion/guards/output_guard.py +634 -0
  42. proxilion/observability/__init__.py +198 -0
  43. proxilion/observability/cost_tracker.py +866 -0
  44. proxilion/observability/hooks.py +683 -0
  45. proxilion/observability/metrics.py +798 -0
  46. proxilion/observability/session_cost_tracker.py +1063 -0
  47. proxilion/policies/__init__.py +67 -0
  48. proxilion/policies/base.py +304 -0
  49. proxilion/policies/builtin.py +486 -0
  50. proxilion/policies/registry.py +376 -0
  51. proxilion/providers/__init__.py +201 -0
  52. proxilion/providers/adapter.py +468 -0
  53. proxilion/providers/anthropic_adapter.py +330 -0
  54. proxilion/providers/gemini_adapter.py +391 -0
  55. proxilion/providers/openai_adapter.py +294 -0
  56. proxilion/py.typed +0 -0
  57. proxilion/resilience/__init__.py +81 -0
  58. proxilion/resilience/degradation.py +615 -0
  59. proxilion/resilience/fallback.py +555 -0
  60. proxilion/resilience/retry.py +554 -0
  61. proxilion/scheduling/__init__.py +57 -0
  62. proxilion/scheduling/priority_queue.py +419 -0
  63. proxilion/scheduling/scheduler.py +459 -0
  64. proxilion/security/__init__.py +244 -0
  65. proxilion/security/agent_trust.py +968 -0
  66. proxilion/security/behavioral_drift.py +794 -0
  67. proxilion/security/cascade_protection.py +869 -0
  68. proxilion/security/circuit_breaker.py +428 -0
  69. proxilion/security/cost_limiter.py +690 -0
  70. proxilion/security/idor_protection.py +460 -0
  71. proxilion/security/intent_capsule.py +849 -0
  72. proxilion/security/intent_validator.py +495 -0
  73. proxilion/security/memory_integrity.py +767 -0
  74. proxilion/security/rate_limiter.py +509 -0
  75. proxilion/security/scope_enforcer.py +680 -0
  76. proxilion/security/sequence_validator.py +636 -0
  77. proxilion/security/trust_boundaries.py +784 -0
  78. proxilion/streaming/__init__.py +70 -0
  79. proxilion/streaming/detector.py +761 -0
  80. proxilion/streaming/transformer.py +674 -0
  81. proxilion/timeouts/__init__.py +55 -0
  82. proxilion/timeouts/decorators.py +477 -0
  83. proxilion/timeouts/manager.py +545 -0
  84. proxilion/tools/__init__.py +69 -0
  85. proxilion/tools/decorators.py +493 -0
  86. proxilion/tools/registry.py +732 -0
  87. proxilion/types.py +339 -0
  88. proxilion/validation/__init__.py +93 -0
  89. proxilion/validation/pydantic_schema.py +351 -0
  90. proxilion/validation/schema.py +651 -0
  91. proxilion-0.0.1.dist-info/METADATA +872 -0
  92. proxilion-0.0.1.dist-info/RECORD +94 -0
  93. proxilion-0.0.1.dist-info/WHEEL +4 -0
  94. proxilion-0.0.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,869 @@
1
+ """
2
+ Cascading failure protection for Proxilion.
3
+
4
+ This module provides cascade-aware circuit breaking to prevent failures
5
+ from propagating through dependent tools and services.
6
+
7
+ Quick Start:
8
+ >>> from proxilion.security import (
9
+ ... DependencyGraph,
10
+ ... CascadeProtector,
11
+ ... CircuitBreakerRegistry,
12
+ ... )
13
+ >>>
14
+ >>> # Build dependency graph
15
+ >>> graph = DependencyGraph()
16
+ >>> graph.add_dependency("user_service", "database")
17
+ >>> graph.add_dependency("order_service", "user_service")
18
+ >>> graph.add_dependency("order_service", "inventory")
19
+ >>>
20
+ >>> # Create cascade protector
21
+ >>> registry = CircuitBreakerRegistry()
22
+ >>> protector = CascadeProtector(graph, registry)
23
+ >>>
24
+ >>> # Check health before calling a tool
25
+ >>> state = protector.check_cascade_health("order_service")
26
+ >>> if state == CascadeState.HEALTHY:
27
+ ... # Safe to call
28
+ ... result = call_order_service()
29
+ >>> elif state == CascadeState.DEGRADED:
30
+ ... # Proceed with caution, some dependencies may be failing
31
+ ... result = call_order_service(retry=False)
32
+ >>> else:
33
+ ... # FAILING or ISOLATED - use fallback
34
+ ... result = fallback_response()
35
+
36
+ Cascade States:
37
+ - HEALTHY: All dependencies are functioning normally.
38
+ - DEGRADED: Some dependencies have failures but the tool can still function.
39
+ - FAILING: Critical dependencies are failing, tool should not be called.
40
+ - ISOLATED: Tool has been manually isolated from the system.
41
+
42
+ Integration with Circuit Breakers:
43
+ >>> # When a circuit breaker opens, propagate the failure
44
+ >>> def on_breaker_open(tool_name):
45
+ ... affected = protector.propagate_failure(tool_name)
46
+ ... print(f"Failure in {tool_name} affected {len(affected)} tools")
47
+ >>>
48
+ >>> # Register callback with circuit breaker
49
+ >>> registry = CascadeAwareCircuitBreakerRegistry(protector)
50
+ """
51
+
52
+ from __future__ import annotations
53
+
54
+ import logging
55
+ import threading
56
+ from collections import defaultdict
57
+ from collections.abc import Callable
58
+ from dataclasses import dataclass, field
59
+ from datetime import datetime, timezone
60
+ from enum import Enum
61
+ from typing import Any
62
+
63
+ from proxilion.security.circuit_breaker import CircuitBreakerRegistry, CircuitState
64
+
65
+ logger = logging.getLogger(__name__)
66
+
67
+
68
class CascadeState(Enum):
    """Health classification of a tool in the cascade-protection system.

    Roughly ordered from best to worst:
    HEALTHY -> DEGRADED -> FAILING -> ISOLATED.
    """

    # All dependencies are functioning normally.
    HEALTHY = "healthy"

    # Some dependencies have failures but the tool can still function.
    DEGRADED = "degraded"

    # Critical dependencies are failing; the tool should not be called.
    FAILING = "failing"

    # The tool has been manually isolated from the system.
    ISOLATED = "isolated"
82
+
83
+
84
@dataclass
class DependencyInfo:
    """Metadata for a single edge in the dependency graph."""

    # Name of the tool being depended on.
    name: str

    # Whether the dependent tool cannot function without this dependency.
    critical: bool = True

    # Optional name of a tool to fall back to when this dependency fails.
    fallback: str | None = None
96
+
97
+
98
@dataclass
class CascadeEvent:
    """Record of one cascade occurrence (failure, isolation, or recovery)."""

    # When the event occurred.
    timestamp: datetime

    # The tool that initiated the cascade.
    source_tool: str

    # Every tool whose state was touched by the cascade.
    affected_tools: set[str]

    # Category tag, e.g. "failure_propagated", "tool_isolated", "recovery_started".
    event_type: str

    # Free-form extra data attached by the emitter; defaults to a fresh dict
    # per instance (default_factory avoids the shared-mutable-default trap).
    details: dict[str, Any] = field(default_factory=dict)
116
+
117
+
118
class DependencyGraph:
    """
    Directed acyclic graph (DAG) describing which tools rely on which others.

    The graph keeps forward edges (tool -> its dependencies) and reverse
    edges (tool -> its dependents) so that both upstream and downstream
    traversals are cheap. Every public method is guarded by a re-entrant
    lock, so instances can be shared between threads.

    Example:
        >>> graph = DependencyGraph()
        >>> graph.add_dependency("api_gateway", "auth_service")
        >>> graph.add_dependency("api_gateway", "rate_limiter")
        >>> graph.add_dependency("auth_service", "database")
        >>>
        >>> sorted(graph.get_dependencies("api_gateway"))
        ['auth_service', 'rate_limiter']
        >>> sorted(graph.get_upstream("api_gateway"))
        ['auth_service', 'database', 'rate_limiter']
        >>> graph.get_dependents("database")
        {'auth_service'}
        >>> sorted(graph.get_downstream("database"))
        ['api_gateway', 'auth_service']
    """

    def __init__(self):
        """Initialize an empty dependency graph."""
        # Forward edges: tool name -> {dependency name -> DependencyInfo}.
        self._dependencies: dict[str, dict[str, DependencyInfo]] = defaultdict(dict)
        # Reverse edges: dependency name -> names of tools relying on it.
        self._dependents: dict[str, set[str]] = defaultdict(set)
        # RLock because public methods call other public methods while locked.
        self._lock = threading.RLock()

    def add_dependency(
        self,
        tool: str,
        depends_on: str,
        critical: bool = True,
        fallback: str | None = None,
    ) -> None:
        """
        Register that *tool* depends on *depends_on*.

        Args:
            tool: The tool that has the dependency.
            depends_on: The tool it depends on.
            critical: Whether this is a critical dependency.
            fallback: Optional fallback tool if the dependency fails.

        Raises:
            ValueError: If the new edge would introduce a cycle.

        Example:
            >>> graph.add_dependency("order_service", "database")
            >>> graph.add_dependency("order_service", "cache", critical=False)
        """
        with self._lock:
            # Refuse edges that would make the graph cyclic.
            if self._would_create_cycle(tool, depends_on):
                raise ValueError(
                    f"Adding dependency {tool} -> {depends_on} would create a cycle"
                )

            self._dependencies[tool][depends_on] = DependencyInfo(
                name=depends_on,
                critical=critical,
                fallback=fallback,
            )
            self._dependents[depends_on].add(tool)

    def remove_dependency(self, tool: str, depends_on: str) -> bool:
        """
        Delete the edge *tool* -> *depends_on*.

        Args:
            tool: The tool that has the dependency.
            depends_on: The dependency to remove.

        Returns:
            True if the edge existed and was removed, False otherwise.
        """
        with self._lock:
            edges = self._dependencies.get(tool, {})
            if depends_on not in edges:
                return False
            del edges[depends_on]
            self._dependents[depends_on].discard(tool)
            return True

    def get_dependencies(self, tool: str) -> set[str]:
        """
        Return the names *tool* directly depends on.

        Args:
            tool: The tool to get dependencies for.

        Returns:
            Set of tool names this tool directly depends on.
        """
        with self._lock:
            return set(self._dependencies.get(tool, {}))

    def get_dependency_info(self, tool: str, depends_on: str) -> DependencyInfo | None:
        """
        Return detailed info about the edge *tool* -> *depends_on*.

        Args:
            tool: The tool that has the dependency.
            depends_on: The dependency to get info for.

        Returns:
            DependencyInfo if the edge exists, None otherwise.
        """
        with self._lock:
            return self._dependencies.get(tool, {}).get(depends_on)

    def get_dependents(self, tool: str) -> set[str]:
        """
        Return the names that directly depend on *tool*.

        Args:
            tool: The tool to get dependents for.

        Returns:
            Set of tool names that directly depend on this tool.
        """
        with self._lock:
            return set(self._dependents.get(tool, set()))

    def get_upstream(self, tool: str) -> set[str]:
        """
        Return every tool reachable by following dependency edges from *tool*.

        Args:
            tool: The tool to get upstream dependencies for.

        Returns:
            Set of all tools this tool transitively depends on.
        """
        with self._lock:
            return self._reachable(tool, self._dependencies)

    def get_downstream(self, tool: str) -> set[str]:
        """
        Return every tool that would be affected by this tool's failure.

        Args:
            tool: The tool to get downstream dependents for.

        Returns:
            Set of all tools that transitively depend on this tool.
        """
        with self._lock:
            return self._reachable(tool, self._dependents)

    def _reachable(self, start: str, edges: dict) -> set[str]:
        """Iteratively collect all nodes reachable from *start* via *edges*."""
        seen: set[str] = set()
        stack = [start]
        while stack:
            node = stack.pop()
            # Values of *edges* are either dicts or sets; both iterate names.
            for neighbor in edges.get(node, ()):
                if neighbor not in seen:
                    seen.add(neighbor)
                    stack.append(neighbor)
        return seen

    def has_cycle(self) -> bool:
        """
        Check whether the graph contains any dependency cycle.

        Returns:
            True if a cycle exists, False otherwise.
        """
        with self._lock:
            visited: set[str] = set()
            on_path: set[str] = set()
            # Launch a DFS from every tool that declares dependencies.
            return any(
                self._cycle_dfs(root, visited, on_path)
                for root in self._dependencies
            )

    def _cycle_dfs(
        self,
        node: str,
        visited: set[str],
        on_path: set[str],
    ) -> bool:
        """DFS helper: a back-edge onto the current path means a cycle."""
        visited.add(node)
        on_path.add(node)

        for child in self._dependencies.get(node, {}):
            if child not in visited:
                if self._cycle_dfs(child, visited, on_path):
                    return True
            elif child in on_path:
                return True

        on_path.remove(node)
        return False

    def _would_create_cycle(self, tool: str, depends_on: str) -> bool:
        """Return True if adding tool -> depends_on would close a loop."""
        # A self-edge is the trivial cycle.
        if tool == depends_on:
            return True
        # If `tool` is already upstream of `depends_on`, the edge closes a loop.
        return tool in self.get_upstream(depends_on)

    def get_all_tools(self) -> set[str]:
        """Return every tool name mentioned anywhere in the graph."""
        with self._lock:
            names = set(self._dependencies)
            for edge_map in self._dependencies.values():
                names.update(edge_map)
            return names

    def get_critical_dependencies(self, tool: str) -> set[str]:
        """
        Return only the critical dependencies of *tool*.

        Args:
            tool: The tool to get critical dependencies for.

        Returns:
            Set of critical dependency names.
        """
        with self._lock:
            edges = self._dependencies.get(tool, {})
            return {name for name, info in edges.items() if info.critical}

    def to_dict(self) -> dict[str, list[dict[str, Any]]]:
        """Serialize the graph to plain dictionaries."""
        with self._lock:
            result: dict[str, list[dict[str, Any]]] = {}
            for tool, edges in self._dependencies.items():
                result[tool] = [
                    {
                        "name": info.name,
                        "critical": info.critical,
                        "fallback": info.fallback,
                    }
                    for info in edges.values()
                ]
            return result
377
+
378
+
379
class CascadeProtector:
    """
    Main class for cascade-aware failure protection.

    Monitors the health of tools and their dependencies, propagating
    failure information through the dependency graph and managing recovery.

    Thread-safety: all state is guarded by a re-entrant lock, except the
    listener list (see add_state_listener).

    Example:
        >>> graph = DependencyGraph()
        >>> graph.add_dependency("api", "database")
        >>> graph.add_dependency("api", "cache", critical=False)
        >>>
        >>> registry = CircuitBreakerRegistry()
        >>> protector = CascadeProtector(graph, registry)
        >>>
        >>> # Check health before calling
        >>> state = protector.check_cascade_health("api")
        >>> if state in (CascadeState.FAILING, CascadeState.ISOLATED):
        ...     return use_fallback()
        >>>
        >>> # When a failure occurs, propagate it
        >>> affected = protector.propagate_failure("database")
        >>> print(f"Database failure affected {len(affected)} tools")
    """

    def __init__(
        self,
        graph: DependencyGraph,
        circuit_registry: CircuitBreakerRegistry | None = None,
        degraded_threshold: int = 1,
        failing_threshold: int = 2,
    ):
        """
        Initialize the cascade protector.

        Args:
            graph: The dependency graph to use.
            circuit_registry: Optional circuit breaker registry for integration.
            degraded_threshold: Number of failing dependencies to mark as DEGRADED.
            failing_threshold: Number of critical failing deps to mark as FAILING.
        """
        self.graph = graph
        self.circuit_registry = circuit_registry
        self.degraded_threshold = degraded_threshold
        self.failing_threshold = failing_threshold

        # Cached per-tool states; written by propagate/isolate/recover paths.
        self._tool_states: dict[str, CascadeState] = {}
        # Tools manually removed from service via isolate_tool().
        self._isolated_tools: set[str] = set()
        # Append-only event log. NOTE(review): grows without bound until
        # reset() is called — consider capping if long-lived.
        self._events: list[CascadeEvent] = []
        self._lock = threading.RLock()
        # Callbacks invoked as listener(tool, old_state, new_state).
        self._state_listeners: list[Callable[[str, CascadeState, CascadeState], None]] = []

    def check_cascade_health(self, tool: str) -> CascadeState:
        """
        Check the cascade health of a tool.

        Resolution order: manual isolation wins, then a cached FAILING
        state (set by propagate_failure), then a fresh calculation from
        dependency health.

        Args:
            tool: The tool to check health for.

        Returns:
            The current cascade state of the tool.

        Example:
            >>> state = protector.check_cascade_health("user_service")
            >>> if state == CascadeState.HEALTHY:
            ...     # All good
            ...     pass
            >>> elif state == CascadeState.DEGRADED:
            ...     # Some non-critical dependencies failing
            ...     pass
        """
        with self._lock:
            # Check if manually isolated
            if tool in self._isolated_tools:
                return CascadeState.ISOLATED

            # Check cached state if we have one (this is set by propagate_failure)
            if tool in self._tool_states:
                cached_state = self._tool_states[tool]
                # Return cached FAILING or ISOLATED states; DEGRADED/HEALTHY
                # are recomputed below so improvements are picked up.
                if cached_state in (CascadeState.FAILING, CascadeState.ISOLATED):
                    return cached_state

            # Calculate state based on dependencies
            return self._calculate_state(tool)

    def _calculate_state(self, tool: str) -> CascadeState:
        """Calculate the cascade state based on dependency health."""
        dependencies = self.graph.get_dependencies(tool)
        if not dependencies:
            # A leaf tool with no dependencies is healthy by definition here.
            return CascadeState.HEALTHY

        failing_critical = 0
        failing_total = 0

        for dep in dependencies:
            dep_state = self._get_tool_state(dep)
            if dep_state in (CascadeState.FAILING, CascadeState.ISOLATED):
                failing_total += 1
                dep_info = self.graph.get_dependency_info(tool, dep)
                if dep_info and dep_info.critical:
                    failing_critical += 1
            elif dep_state == CascadeState.DEGRADED:
                # NOTE: this turns failing_total into a float; comparison
                # against the int threshold below still works.
                failing_total += 0.5  # Degraded contributes half

        if failing_critical >= self.failing_threshold:
            return CascadeState.FAILING
        elif failing_total >= self.degraded_threshold:
            return CascadeState.DEGRADED
        else:
            return CascadeState.HEALTHY

    def _get_tool_state(self, tool: str) -> CascadeState:
        """Get the state of a tool, checking circuit breakers if available."""
        # Check manual isolation first
        if tool in self._isolated_tools:
            return CascadeState.ISOLATED

        # Check circuit breaker state: an OPEN breaker means the tool is
        # failing; HALF_OPEN means it is probing and treated as degraded.
        if self.circuit_registry:
            try:
                breaker = self.circuit_registry.get(tool, auto_create=False)
                if breaker.state == CircuitState.OPEN:
                    return CascadeState.FAILING
                elif breaker.state == CircuitState.HALF_OPEN:
                    return CascadeState.DEGRADED
            except KeyError:
                pass  # No breaker registered

        # Check cached state
        return self._tool_states.get(tool, CascadeState.HEALTHY)

    def propagate_failure(self, tool: str) -> set[str]:
        """
        Propagate a failure through the dependency graph.

        When a tool fails, this method marks all dependent tools as
        DEGRADED or FAILING based on their dependency configuration,
        notifies listeners of each transition, and records a
        "failure_propagated" event.

        Args:
            tool: The tool that failed.

        Returns:
            Set of affected tool names (the source tool is not included).

        Example:
            >>> affected = protector.propagate_failure("database")
            >>> print(f"Affected tools: {affected}")
        """
        with self._lock:
            affected: set[str] = set()

            # Mark the failing tool
            old_state = self._tool_states.get(tool, CascadeState.HEALTHY)
            self._tool_states[tool] = CascadeState.FAILING
            self._notify_state_change(tool, old_state, CascadeState.FAILING)

            # Propagate to dependents
            self._propagate_to_dependents(tool, affected)

            # Record event
            self._events.append(
                CascadeEvent(
                    timestamp=datetime.now(timezone.utc),
                    source_tool=tool,
                    affected_tools=affected,
                    event_type="failure_propagated",
                    details={"total_affected": len(affected)},
                )
            )

            logger.warning(
                f"Cascade failure propagated from {tool}: "
                f"{len(affected)} tools affected"
            )

            return affected

    def _propagate_to_dependents(self, tool: str, affected: set[str]) -> None:
        """Recursively propagate failure state to dependent tools."""
        dependents = self.graph.get_dependents(tool)

        for dependent in dependents:
            # `affected` doubles as the visited set, preventing re-processing.
            if dependent in affected:
                continue

            affected.add(dependent)

            # Calculate new state for this dependent
            new_state = self._calculate_state(dependent)
            old_state = self._tool_states.get(dependent, CascadeState.HEALTHY)

            if new_state != old_state:
                self._tool_states[dependent] = new_state
                self._notify_state_change(dependent, old_state, new_state)

            # Continue propagation if this tool is now failing
            if new_state in (CascadeState.FAILING, CascadeState.DEGRADED):
                self._propagate_to_dependents(dependent, affected)

    def isolate_tool(self, tool: str) -> set[str]:
        """
        Manually isolate a tool from the system.

        Isolated tools are treated as failing and their dependents
        are marked accordingly.

        NOTE(review): the propagate_failure() call below overwrites this
        tool's cached state with FAILING and emits an extra ISOLATED->FAILING
        listener notification plus a "failure_propagated" event. Health
        checks still report ISOLATED because _isolated_tools is consulted
        first, but listeners/events see both transitions — confirm intended.

        Args:
            tool: The tool to isolate.

        Returns:
            Set of affected tool names.

        Example:
            >>> # Isolate a tool for maintenance
            >>> affected = protector.isolate_tool("database")
        """
        with self._lock:
            self._isolated_tools.add(tool)
            old_state = self._tool_states.get(tool, CascadeState.HEALTHY)
            self._tool_states[tool] = CascadeState.ISOLATED
            self._notify_state_change(tool, old_state, CascadeState.ISOLATED)

            affected = self.propagate_failure(tool)

            self._events.append(
                CascadeEvent(
                    timestamp=datetime.now(timezone.utc),
                    source_tool=tool,
                    affected_tools=affected,
                    event_type="tool_isolated",
                )
            )

            logger.info(f"Tool {tool} isolated, {len(affected)} tools affected")
            return affected

    def recover_tool(self, tool: str) -> set[str]:
        """
        Attempt to recover a tool from failed/isolated state.

        This removes the tool from isolation, re-derives its state from
        the circuit breaker (if any), and recalculates states for all
        downstream dependents.

        Args:
            tool: The tool to recover.

        Returns:
            Set of tools whose state improved as a result.

        Example:
            >>> # After maintenance, recover the tool
            >>> recovered = protector.recover_tool("database")
        """
        with self._lock:
            recovered: set[str] = set()

            # Remove from isolation
            self._isolated_tools.discard(tool)

            # Check if circuit breaker is still failing; recovery is not
            # forced if the breaker says the tool is still unhealthy.
            actual_state = CascadeState.HEALTHY
            if self.circuit_registry:
                try:
                    breaker = self.circuit_registry.get(tool, auto_create=False)
                    if breaker.state == CircuitState.OPEN:
                        actual_state = CascadeState.FAILING
                    elif breaker.state == CircuitState.HALF_OPEN:
                        actual_state = CascadeState.DEGRADED
                except KeyError:
                    pass

            old_state = self._tool_states.get(tool, CascadeState.HEALTHY)
            if actual_state != old_state:
                self._tool_states[tool] = actual_state
                self._notify_state_change(tool, old_state, actual_state)

            # Recalculate states for all dependents
            self._recalculate_downstream(tool, recovered)

            self._events.append(
                CascadeEvent(
                    timestamp=datetime.now(timezone.utc),
                    source_tool=tool,
                    affected_tools=recovered,
                    event_type="recovery_started",
                )
            )

            logger.info(f"Tool {tool} recovery started, {len(recovered)} tools may recover")
            return recovered

    def _recalculate_downstream(self, tool: str, recovered: set[str]) -> None:
        """Recalculate states for downstream tools after recovery."""
        dependents = self.graph.get_downstream(tool)

        for dependent in dependents:
            old_state = self._tool_states.get(dependent, CascadeState.HEALTHY)
            new_state = self._calculate_state(dependent)

            if new_state != old_state:
                self._tool_states[dependent] = new_state
                self._notify_state_change(dependent, old_state, new_state)

                # If state improved, add to recovered set
                if self._state_value(new_state) < self._state_value(old_state):
                    recovered.add(dependent)

    def _state_value(self, state: CascadeState) -> int:
        """Get numeric value for state comparison (higher = worse)."""
        return {
            CascadeState.HEALTHY: 0,
            CascadeState.DEGRADED: 1,
            CascadeState.FAILING: 2,
            CascadeState.ISOLATED: 3,
        }.get(state, 0)

    def get_healthy_alternatives(self, tool: str) -> list[str]:
        """
        Get healthy alternatives/fallbacks for a failing tool.

        Scans the fallbacks configured on edges pointing at *tool* and
        returns those currently reporting HEALTHY, de-duplicated in
        encounter order.

        Args:
            tool: The tool to find alternatives for.

        Returns:
            List of healthy alternative tool names.

        Example:
            >>> alternatives = protector.get_healthy_alternatives("primary_db")
            >>> if alternatives:
            ...     use_tool(alternatives[0])
        """
        with self._lock:
            alternatives: list[str] = []

            # Check configured fallbacks in dependencies
            for dependent in self.graph.get_dependents(tool):
                dep_info = self.graph.get_dependency_info(dependent, tool)
                if dep_info and dep_info.fallback:
                    fallback_state = self.check_cascade_health(dep_info.fallback)
                    if fallback_state == CascadeState.HEALTHY:
                        if dep_info.fallback not in alternatives:
                            alternatives.append(dep_info.fallback)

            return alternatives

    def add_state_listener(
        self,
        listener: Callable[[str, CascadeState, CascadeState], None],
    ) -> None:
        """
        Add a listener for state changes.

        NOTE(review): the append is not performed under self._lock —
        list.append is atomic in CPython, but confirm if other runtimes
        are targeted.

        Args:
            listener: Callback function(tool, old_state, new_state).

        Example:
            >>> def on_state_change(tool, old, new):
            ...     print(f"{tool}: {old.value} -> {new.value}")
            >>> protector.add_state_listener(on_state_change)
        """
        self._state_listeners.append(listener)

    def _notify_state_change(
        self,
        tool: str,
        old_state: CascadeState,
        new_state: CascadeState,
    ) -> None:
        """Notify listeners of a state change; listener errors are logged, not raised."""
        for listener in self._state_listeners:
            try:
                listener(tool, old_state, new_state)
            except Exception as e:
                # A misbehaving listener must not break propagation.
                logger.error(f"Error in state listener: {e}")

    def get_cascade_events(self, limit: int = 100) -> list[CascadeEvent]:
        """
        Get recent cascade events.

        Args:
            limit: Maximum number of events to return.

        Returns:
            List of recent cascade events, newest first.
        """
        with self._lock:
            return list(reversed(self._events[-limit:]))

    def get_all_states(self) -> dict[str, CascadeState]:
        """Get the current state of all tools known to the graph."""
        with self._lock:
            # Calculate states for all tools in the graph
            all_tools = self.graph.get_all_tools()
            return {tool: self.check_cascade_health(tool) for tool in all_tools}

    def get_failing_tools(self) -> set[str]:
        """Get all tools currently in FAILING or ISOLATED state.

        NOTE(review): reads the cached _tool_states only — a tool whose
        breaker is OPEN but which never went through propagate_failure
        will not appear here.
        """
        with self._lock:
            return {
                tool
                for tool, state in self._tool_states.items()
                if state in (CascadeState.FAILING, CascadeState.ISOLATED)
            }

    def get_degraded_tools(self) -> set[str]:
        """Get all tools currently in DEGRADED state (cached states only)."""
        with self._lock:
            return {
                tool
                for tool, state in self._tool_states.items()
                if state == CascadeState.DEGRADED
            }

    def reset(self) -> None:
        """Reset all cascade states, isolations, and the event log."""
        with self._lock:
            self._tool_states.clear()
            self._isolated_tools.clear()
            self._events.clear()
799
+
800
+
801
class CascadeAwareCircuitBreakerRegistry(CircuitBreakerRegistry):
    """
    Circuit breaker registry wired into cascade protection.

    When a circuit opens or closes, the corresponding cascade
    propagation/recovery is triggered on the attached protector.

    Example:
        >>> graph = DependencyGraph()
        >>> graph.add_dependency("api", "database")
        >>>
        >>> protector = CascadeProtector(graph)
        >>> registry = CascadeAwareCircuitBreakerRegistry(protector)
        >>>
        >>> # When the database circuit opens, the cascade protector is notified
        >>> breaker = registry.get("database")
        >>> try:
        ...     result = breaker.call(database_query)
        ... except:
        ...     # Circuit may open after enough failures
        ...     pass
    """

    def __init__(
        self,
        cascade_protector: CascadeProtector,
        default_config: dict[str, Any] | None = None,
    ):
        """
        Initialize the cascade-aware registry.

        Args:
            cascade_protector: The cascade protector to notify on failures.
            default_config: Default circuit breaker configuration.
        """
        super().__init__(default_config)
        self._cascade_protector = cascade_protector

    def on_circuit_open(self, name: str) -> set[str]:
        """
        Handle a circuit opening by propagating the failure downstream.

        Args:
            name: Name of the circuit that opened.

        Returns:
            Set of affected tools from cascade propagation.
        """
        impacted = self._cascade_protector.propagate_failure(name)
        logger.warning(f"Cascade from {name}: {len(impacted)} tools affected")
        return impacted

    def on_circuit_close(self, name: str) -> set[str]:
        """
        Handle a circuit closing (recovery) by recalculating downstream states.

        Args:
            name: Name of the circuit that closed.

        Returns:
            Set of tools that may have recovered.
        """
        healed = self._cascade_protector.recover_tool(name)
        logger.info(f"Recovery from {name}: {len(healed)} tools may recover")
        return healed