mcp_hangar-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_hangar/__init__.py +139 -0
- mcp_hangar/application/__init__.py +1 -0
- mcp_hangar/application/commands/__init__.py +67 -0
- mcp_hangar/application/commands/auth_commands.py +118 -0
- mcp_hangar/application/commands/auth_handlers.py +296 -0
- mcp_hangar/application/commands/commands.py +59 -0
- mcp_hangar/application/commands/handlers.py +189 -0
- mcp_hangar/application/discovery/__init__.py +21 -0
- mcp_hangar/application/discovery/discovery_metrics.py +283 -0
- mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
- mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
- mcp_hangar/application/discovery/security_validator.py +414 -0
- mcp_hangar/application/event_handlers/__init__.py +50 -0
- mcp_hangar/application/event_handlers/alert_handler.py +191 -0
- mcp_hangar/application/event_handlers/audit_handler.py +203 -0
- mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
- mcp_hangar/application/event_handlers/logging_handler.py +69 -0
- mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
- mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
- mcp_hangar/application/event_handlers/security_handler.py +604 -0
- mcp_hangar/application/mcp/tooling.py +158 -0
- mcp_hangar/application/ports/__init__.py +9 -0
- mcp_hangar/application/ports/observability.py +237 -0
- mcp_hangar/application/queries/__init__.py +52 -0
- mcp_hangar/application/queries/auth_handlers.py +237 -0
- mcp_hangar/application/queries/auth_queries.py +118 -0
- mcp_hangar/application/queries/handlers.py +227 -0
- mcp_hangar/application/read_models/__init__.py +11 -0
- mcp_hangar/application/read_models/provider_views.py +139 -0
- mcp_hangar/application/sagas/__init__.py +11 -0
- mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
- mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
- mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
- mcp_hangar/application/services/__init__.py +9 -0
- mcp_hangar/application/services/provider_service.py +208 -0
- mcp_hangar/application/services/traced_provider_service.py +211 -0
- mcp_hangar/bootstrap/runtime.py +328 -0
- mcp_hangar/context.py +178 -0
- mcp_hangar/domain/__init__.py +117 -0
- mcp_hangar/domain/contracts/__init__.py +57 -0
- mcp_hangar/domain/contracts/authentication.py +225 -0
- mcp_hangar/domain/contracts/authorization.py +229 -0
- mcp_hangar/domain/contracts/event_store.py +178 -0
- mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
- mcp_hangar/domain/contracts/persistence.py +383 -0
- mcp_hangar/domain/contracts/provider_runtime.py +146 -0
- mcp_hangar/domain/discovery/__init__.py +20 -0
- mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
- mcp_hangar/domain/discovery/discovered_provider.py +185 -0
- mcp_hangar/domain/discovery/discovery_service.py +412 -0
- mcp_hangar/domain/discovery/discovery_source.py +192 -0
- mcp_hangar/domain/events.py +433 -0
- mcp_hangar/domain/exceptions.py +525 -0
- mcp_hangar/domain/model/__init__.py +70 -0
- mcp_hangar/domain/model/aggregate.py +58 -0
- mcp_hangar/domain/model/circuit_breaker.py +152 -0
- mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
- mcp_hangar/domain/model/event_sourced_provider.py +423 -0
- mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
- mcp_hangar/domain/model/health_tracker.py +183 -0
- mcp_hangar/domain/model/load_balancer.py +185 -0
- mcp_hangar/domain/model/provider.py +810 -0
- mcp_hangar/domain/model/provider_group.py +656 -0
- mcp_hangar/domain/model/tool_catalog.py +105 -0
- mcp_hangar/domain/policies/__init__.py +19 -0
- mcp_hangar/domain/policies/provider_health.py +187 -0
- mcp_hangar/domain/repository.py +249 -0
- mcp_hangar/domain/security/__init__.py +85 -0
- mcp_hangar/domain/security/input_validator.py +710 -0
- mcp_hangar/domain/security/rate_limiter.py +387 -0
- mcp_hangar/domain/security/roles.py +237 -0
- mcp_hangar/domain/security/sanitizer.py +387 -0
- mcp_hangar/domain/security/secrets.py +501 -0
- mcp_hangar/domain/services/__init__.py +20 -0
- mcp_hangar/domain/services/audit_service.py +376 -0
- mcp_hangar/domain/services/image_builder.py +328 -0
- mcp_hangar/domain/services/provider_launcher.py +1046 -0
- mcp_hangar/domain/value_objects.py +1138 -0
- mcp_hangar/errors.py +818 -0
- mcp_hangar/fastmcp_server.py +1105 -0
- mcp_hangar/gc.py +134 -0
- mcp_hangar/infrastructure/__init__.py +79 -0
- mcp_hangar/infrastructure/async_executor.py +133 -0
- mcp_hangar/infrastructure/auth/__init__.py +37 -0
- mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
- mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
- mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
- mcp_hangar/infrastructure/auth/middleware.py +340 -0
- mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
- mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
- mcp_hangar/infrastructure/auth/projections.py +366 -0
- mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
- mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
- mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
- mcp_hangar/infrastructure/command_bus.py +112 -0
- mcp_hangar/infrastructure/discovery/__init__.py +110 -0
- mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
- mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
- mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
- mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
- mcp_hangar/infrastructure/event_bus.py +260 -0
- mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
- mcp_hangar/infrastructure/event_store.py +396 -0
- mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
- mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
- mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
- mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
- mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
- mcp_hangar/infrastructure/metrics_publisher.py +36 -0
- mcp_hangar/infrastructure/observability/__init__.py +10 -0
- mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
- mcp_hangar/infrastructure/persistence/__init__.py +33 -0
- mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
- mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
- mcp_hangar/infrastructure/persistence/database.py +333 -0
- mcp_hangar/infrastructure/persistence/database_common.py +330 -0
- mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
- mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
- mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
- mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
- mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
- mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
- mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
- mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
- mcp_hangar/infrastructure/query_bus.py +153 -0
- mcp_hangar/infrastructure/saga_manager.py +401 -0
- mcp_hangar/logging_config.py +209 -0
- mcp_hangar/metrics.py +1007 -0
- mcp_hangar/models.py +31 -0
- mcp_hangar/observability/__init__.py +54 -0
- mcp_hangar/observability/health.py +487 -0
- mcp_hangar/observability/metrics.py +319 -0
- mcp_hangar/observability/tracing.py +433 -0
- mcp_hangar/progress.py +542 -0
- mcp_hangar/retry.py +613 -0
- mcp_hangar/server/__init__.py +120 -0
- mcp_hangar/server/__main__.py +6 -0
- mcp_hangar/server/auth_bootstrap.py +340 -0
- mcp_hangar/server/auth_cli.py +335 -0
- mcp_hangar/server/auth_config.py +305 -0
- mcp_hangar/server/bootstrap.py +735 -0
- mcp_hangar/server/cli.py +161 -0
- mcp_hangar/server/config.py +224 -0
- mcp_hangar/server/context.py +215 -0
- mcp_hangar/server/http_auth_middleware.py +165 -0
- mcp_hangar/server/lifecycle.py +467 -0
- mcp_hangar/server/state.py +117 -0
- mcp_hangar/server/tools/__init__.py +16 -0
- mcp_hangar/server/tools/discovery.py +186 -0
- mcp_hangar/server/tools/groups.py +75 -0
- mcp_hangar/server/tools/health.py +301 -0
- mcp_hangar/server/tools/provider.py +939 -0
- mcp_hangar/server/tools/registry.py +320 -0
- mcp_hangar/server/validation.py +113 -0
- mcp_hangar/stdio_client.py +229 -0
- mcp_hangar-0.2.0.dist-info/METADATA +347 -0
- mcp_hangar-0.2.0.dist-info/RECORD +160 -0
- mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
- mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
- mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
mcp_hangar/application/sagas/provider_failover_saga.py
@@ -0,0 +1,266 @@
```python
"""Provider Failover Saga - failover to backup providers on failure."""

from dataclasses import dataclass
import time
from typing import Dict, List, Optional, Set, Type

from ...domain.events import DomainEvent, ProviderDegraded, ProviderStarted, ProviderStopped
from ...infrastructure.saga_manager import EventTriggeredSaga
from ...logging_config import get_logger
from ..commands import Command, StartProviderCommand, StopProviderCommand

logger = get_logger(__name__)


@dataclass
class FailoverConfig:
    """Configuration for a failover pair."""

    primary_id: str
    backup_id: str
    auto_failback: bool = True  # Automatically fail back to primary when it recovers
    failback_delay_s: float = 30.0  # Delay before failing back to primary


@dataclass
class FailoverState:
    """State of an active failover."""

    primary_id: str
    backup_id: str
    failed_at: float
    backup_started_at: Optional[float] = None
    is_active: bool = True


class ProviderFailoverSaga(EventTriggeredSaga):
    """
    Saga that orchestrates failover to backup providers.

    Failover Strategy:
    1. Configure primary-backup pairs
    2. When primary is degraded/stopped, start the backup
    3. Optionally, fail back to primary when it recovers
    4. Track active failovers to prevent cycles

    Configuration:
    - Failover pairs: Define which providers are backups for others
    - Auto-failback: Whether to automatically switch back to primary
    - Failback delay: How long to wait before failing back

    Usage:
        saga = ProviderFailoverSaga()
        saga.configure_failover("primary-provider", "backup-provider")
        saga_manager.register_event_saga(saga)
    """

    def __init__(self):
        super().__init__()

        # Failover configuration: primary_id -> FailoverConfig
        self._failover_configs: Dict[str, FailoverConfig] = {}

        # Active failovers: primary_id -> FailoverState
        self._active_failovers: Dict[str, FailoverState] = {}

        # Providers currently acting as backups (to avoid cascading failovers)
        self._active_backups: Set[str] = set()

        # Providers pending failback: primary_id -> scheduled_time
        self._pending_failbacks: Dict[str, float] = {}

    @property
    def saga_type(self) -> str:
        return "provider_failover"

    @property
    def handled_events(self) -> List[Type[DomainEvent]]:
        return [ProviderDegraded, ProviderStarted, ProviderStopped]

    def configure_failover(
        self,
        primary_id: str,
        backup_id: str,
        auto_failback: bool = True,
        failback_delay_s: float = 30.0,
    ) -> None:
        """
        Configure a failover pair.

        Args:
            primary_id: Primary provider ID
            backup_id: Backup provider ID
            auto_failback: Whether to automatically fail back when primary recovers
            failback_delay_s: Delay before failing back
        """
        self._failover_configs[primary_id] = FailoverConfig(
            primary_id=primary_id,
            backup_id=backup_id,
            auto_failback=auto_failback,
            failback_delay_s=failback_delay_s,
        )
        logger.info(f"Configured failover: {primary_id} -> {backup_id}")

    def remove_failover(self, primary_id: str) -> bool:
        """Remove a failover configuration."""
        if primary_id in self._failover_configs:
            del self._failover_configs[primary_id]
            return True
        return False

    def handle(self, event: DomainEvent) -> List[Command]:
        """Handle failover-related events."""
        if isinstance(event, ProviderDegraded):
            return self._handle_degraded(event)
        elif isinstance(event, ProviderStarted):
            return self._handle_started(event)
        elif isinstance(event, ProviderStopped):
            return self._handle_stopped(event)
        return []

    def _handle_degraded(self, event: ProviderDegraded) -> List[Command]:
        """
        Handle provider degraded event.

        Initiates failover if this is a primary provider with a configured backup.
        """
        provider_id = event.provider_id
        commands = []

        # Check if this provider is a backup currently serving
        if provider_id in self._active_backups:
            logger.warning(f"Backup provider {provider_id} degraded - no further failover")
            return []

        # Check if this provider has a backup configured
        config = self._failover_configs.get(provider_id)
        if not config:
            return []

        # Check if failover is already active
        if provider_id in self._active_failovers:
            logger.debug(f"Failover already active for {provider_id}")
            return []

        # Initiate failover
        logger.info(f"Initiating failover: {provider_id} -> {config.backup_id}")

        self._active_failovers[provider_id] = FailoverState(
            primary_id=provider_id,
            backup_id=config.backup_id,
            failed_at=time.time(),
        )
        self._active_backups.add(config.backup_id)

        # Start backup provider
        commands.append(StartProviderCommand(provider_id=config.backup_id))

        return commands

    def _handle_started(self, event: ProviderStarted) -> List[Command]:
        """
        Handle provider started event.

        - If it's a backup being started, mark failover as complete
        - If it's a primary recovering, consider failback
        """
        provider_id = event.provider_id
        commands = []

        # Check if this is a backup being started for failover
        for primary_id, state in self._active_failovers.items():
            if state.backup_id == provider_id and state.backup_started_at is None:
                state.backup_started_at = time.time()
                logger.info(f"Failover complete: {primary_id} -> {provider_id}")

        # Check if this is a primary recovering
        if provider_id in self._active_failovers:
            state = self._active_failovers[provider_id]
            config = self._failover_configs.get(provider_id)

            if config and config.auto_failback:
                # Schedule failback
                failback_time = time.time() + config.failback_delay_s
                self._pending_failbacks[provider_id] = failback_time

                logger.info(f"Primary {provider_id} recovered, scheduling failback in {config.failback_delay_s}s")

                # In a real implementation, you'd use a scheduler
                # For now, immediately trigger failback commands
                commands.extend(self._execute_failback(provider_id))

        return commands

    def _handle_stopped(self, event: ProviderStopped) -> List[Command]:
        """
        Handle provider stopped event.

        Clean up failover state if a backup is stopped.
        """
        provider_id = event.provider_id
        commands = []

        # If a backup is stopped, clean up
        if provider_id in self._active_backups:
            self._active_backups.discard(provider_id)

            # Find and clean up the failover state
            for primary_id, state in list(self._active_failovers.items()):
                if state.backup_id == provider_id:
                    del self._active_failovers[primary_id]
                    self._pending_failbacks.pop(primary_id, None)
                    logger.info(f"Failover {primary_id} -> {provider_id} ended")

        return commands

    def _execute_failback(self, primary_id: str) -> List[Command]:
        """Execute failback to primary provider."""
        commands = []

        state = self._active_failovers.get(primary_id)
        config = self._failover_configs.get(primary_id)

        if not state or not config:
            return []

        logger.info(f"Executing failback: {state.backup_id} -> {primary_id}")

        # Stop the backup (primary is already running)
        commands.append(StopProviderCommand(provider_id=state.backup_id, reason="failback"))

        # Clean up failover state
        del self._active_failovers[primary_id]
        self._active_backups.discard(state.backup_id)
        self._pending_failbacks.pop(primary_id, None)

        return commands

    def get_active_failovers(self) -> Dict[str, FailoverState]:
        """Get all active failovers."""
        return dict(self._active_failovers)

    def get_failover_config(self, primary_id: str) -> Optional[FailoverConfig]:
        """Get failover configuration for a provider."""
        return self._failover_configs.get(primary_id)

    def get_all_configs(self) -> Dict[str, FailoverConfig]:
        """Get all failover configurations."""
        return dict(self._failover_configs)

    def is_backup_active(self, provider_id: str) -> bool:
        """Check if a provider is currently serving as a backup."""
        return provider_id in self._active_backups

    def force_failback(self, primary_id: str) -> List[Command]:
        """Manually force a failback to primary."""
        return self._execute_failback(primary_id)

    def cancel_failover(self, primary_id: str) -> bool:
        """Cancel an active failover (keeps backup running)."""
        if primary_id in self._active_failovers:
            state = self._active_failovers[primary_id]
            self._active_backups.discard(state.backup_id)
            del self._active_failovers[primary_id]
            self._pending_failbacks.pop(primary_id, None)
            return True
        return False
```
mcp_hangar/application/sagas/provider_recovery_saga.py
@@ -0,0 +1,172 @@
```python
"""Provider Recovery Saga - automatically recover degraded providers."""

import time
from typing import Dict, List, Optional, Type

from ...domain.events import DomainEvent, HealthCheckFailed, ProviderDegraded, ProviderStarted, ProviderStopped
from ...infrastructure.saga_manager import EventTriggeredSaga
from ...logging_config import get_logger
from ..commands import Command, StartProviderCommand, StopProviderCommand

logger = get_logger(__name__)


class ProviderRecoverySaga(EventTriggeredSaga):
    """
    Saga that orchestrates automatic provider recovery after failures.

    Recovery Strategy:
    1. When a provider is degraded, schedule a retry
    2. Apply exponential backoff between retries
    3. After max retries, give up and stop the provider
    4. Reset retry count when provider starts successfully

    Configuration:
    - max_retries: Maximum number of restart attempts (default: 3)
    - initial_backoff_s: Initial backoff duration in seconds (default: 5)
    - max_backoff_s: Maximum backoff duration (default: 60)
    - backoff_multiplier: Backoff multiplier for exponential growth (default: 2)
    """

    def __init__(
        self,
        max_retries: int = 3,
        initial_backoff_s: float = 5.0,
        max_backoff_s: float = 60.0,
        backoff_multiplier: float = 2.0,
    ):
        super().__init__()

        self._max_retries = max_retries
        self._initial_backoff_s = initial_backoff_s
        self._max_backoff_s = max_backoff_s
        self._backoff_multiplier = backoff_multiplier

        # Track retry state per provider
        # provider_id -> {"retries": int, "last_attempt": float, "next_retry": float}
        self._retry_state: Dict[str, Dict] = {}

    @property
    def saga_type(self) -> str:
        return "provider_recovery"

    @property
    def handled_events(self) -> List[Type[DomainEvent]]:
        return [ProviderDegraded, ProviderStarted, ProviderStopped, HealthCheckFailed]

    def handle(self, event: DomainEvent) -> List[Command]:
        """Handle recovery-related events."""
        if isinstance(event, ProviderDegraded):
            return self._handle_degraded(event)
        elif isinstance(event, ProviderStarted):
            return self._handle_started(event)
        elif isinstance(event, ProviderStopped):
            return self._handle_stopped(event)
        elif isinstance(event, HealthCheckFailed):
            return self._handle_health_failed(event)
        return []

    def _handle_degraded(self, event: ProviderDegraded) -> List[Command]:
        """
        Handle provider degraded event.

        Initiates recovery by scheduling a restart with backoff.
        """
        provider_id = event.provider_id

        # Initialize retry state if needed
        if provider_id not in self._retry_state:
            self._retry_state[provider_id] = {
                "retries": 0,
                "last_attempt": 0,
                "next_retry": 0,
            }

        state = self._retry_state[provider_id]
        state["retries"] += 1
        state["last_attempt"] = time.time()

        # Check if max retries exceeded
        if state["retries"] > self._max_retries:
            logger.warning(f"Provider {provider_id} exceeded max retries ({self._max_retries}), stopping recovery")
            # Stop the provider permanently
            return [StopProviderCommand(provider_id=provider_id, reason="max_retries_exceeded")]

        # Calculate backoff
        backoff = self._calculate_backoff(state["retries"])
        state["next_retry"] = time.time() + backoff

        logger.info(
            f"Provider {provider_id} degraded, scheduling retry "
            f"{state['retries']}/{self._max_retries} in {backoff:.1f}s"
        )

        # Note: In a real implementation, you would use a scheduler
        # to delay the command. For now, we return it immediately.
        # The provider's internal backoff will handle timing.
        return [StartProviderCommand(provider_id=provider_id)]

    def _handle_started(self, event: ProviderStarted) -> List[Command]:
        """
        Handle provider started event.

        Resets retry count on successful start.
        """
        provider_id = event.provider_id

        if provider_id in self._retry_state:
            old_retries = self._retry_state[provider_id]["retries"]
            self._retry_state[provider_id] = {
                "retries": 0,
                "last_attempt": 0,
                "next_retry": 0,
            }
            if old_retries > 0:
                logger.info(f"Provider {provider_id} recovered successfully after {old_retries} retries")

        return []

    def _handle_stopped(self, event: ProviderStopped) -> List[Command]:
        """
        Handle provider stopped event.

        Clears retry state for normally stopped providers.
        """
        provider_id = event.provider_id

        # Only clear state for intentional stops
        if event.reason in ("shutdown", "idle", "user_request"):
            self._retry_state.pop(provider_id, None)

        return []

    def _handle_health_failed(self, event: HealthCheckFailed) -> List[Command]:
        """
        Handle health check failed event.

        May trigger preemptive recovery for severely degraded providers.
        """
        # If failures are severe but provider not yet degraded, no action.
        # The ProviderDegraded event will handle actual recovery.
        return []

    def _calculate_backoff(self, retry_count: int) -> float:
        """Calculate backoff duration for a retry count."""
        backoff = self._initial_backoff_s * (self._backoff_multiplier ** (retry_count - 1))
        return min(backoff, self._max_backoff_s)

    def get_retry_state(self, provider_id: str) -> Optional[Dict]:
        """Get retry state for a provider (for monitoring)."""
        return self._retry_state.get(provider_id)

    def get_all_retry_states(self) -> Dict[str, Dict]:
        """Get all retry states (for monitoring)."""
        return dict(self._retry_state)

    def reset_retry_state(self, provider_id: str) -> None:
        """Manually reset retry state for a provider."""
        self._retry_state.pop(provider_id, None)

    def reset_all_retry_states(self) -> None:
        """Reset all retry states."""
        self._retry_state.clear()
```
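With the defaults above (`initial_backoff_s=5`, `backoff_multiplier=2`, `max_backoff_s=60`), the backoff sequence is 5s, 10s, 20s, 40s, and caps at 60s thereafter, though `max_retries=3` stops recovery before the cap is reached. A standalone restatement of `_calculate_backoff`, for illustration only:

```python
# Standalone restatement of ProviderRecoverySaga._calculate_backoff,
# using the saga's default constants.
INITIAL_BACKOFF_S = 5.0
BACKOFF_MULTIPLIER = 2.0
MAX_BACKOFF_S = 60.0

def calculate_backoff(retry_count: int) -> float:
    """Exponential backoff: initial * multiplier**(retries - 1), capped."""
    backoff = INITIAL_BACKOFF_S * (BACKOFF_MULTIPLIER ** (retry_count - 1))
    return min(backoff, MAX_BACKOFF_S)

print([calculate_backoff(n) for n in range(1, 6)])
# [5.0, 10.0, 20.0, 40.0, 60.0]  -- retry 5 would be 80s, capped at 60s
```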
mcp_hangar/application/services/provider_service.py
@@ -0,0 +1,208 @@
```python
"""Provider application service - orchestrates use cases."""

from typing import Any, Dict, List

from ...domain.exceptions import ProviderNotFoundError
from ...domain.model import Provider
from ...domain.repository import IProviderRepository
from ...infrastructure.event_bus import EventBus
from ...logging_config import get_logger

logger = get_logger(__name__)


class ProviderService:
    """
    Application service for provider operations.

    Orchestrates use cases by:
    - Loading providers from repository
    - Executing domain operations
    - Publishing collected domain events
    - Returning results
    """

    def __init__(
        self,
        repository: IProviderRepository,
        event_bus: EventBus,
    ):
        self._repository = repository
        self._event_bus = event_bus

    def _publish_events(self, provider: Provider) -> None:
        """Publish all collected events from provider."""
        events = provider.collect_events()
        for event in events:
            try:
                self._event_bus.publish(event)
            except Exception as e:
                logger.error(f"Failed to publish event {event.__class__.__name__}: {e}")

    def _get_provider(self, provider_id: str) -> Provider:
        """Get provider or raise ProviderNotFoundError."""
        provider = self._repository.get(provider_id)
        if provider is None:
            raise ProviderNotFoundError(provider_id)
        return provider

    # --- Use Cases ---

    def list_providers(self) -> List[Dict[str, Any]]:
        """
        Use case: List all providers with their status.

        Returns:
            List of provider status dictionaries
        """
        result = []
        for provider_id, provider in self._repository.get_all().items():
            result.append(provider.to_status_dict())
        return result

    def start_provider(self, provider_id: str) -> Dict[str, Any]:
        """
        Use case: Explicitly start a provider.

        Ensures provider is ready and returns its status.

        Args:
            provider_id: Provider identifier

        Returns:
            Dictionary with provider state and tools

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        provider = self._get_provider(provider_id)
        provider.ensure_ready()
        self._publish_events(provider)

        return {
            "provider": provider_id,
            "state": provider.state.value,
            "tools": provider.get_tool_names(),
        }

    def stop_provider(self, provider_id: str) -> Dict[str, Any]:
        """
        Use case: Explicitly stop a provider.

        Args:
            provider_id: Provider identifier

        Returns:
            Confirmation dictionary

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        provider = self._get_provider(provider_id)
        provider.shutdown()
        self._publish_events(provider)

        return {"stopped": provider_id}

    def get_provider_tools(self, provider_id: str) -> Dict[str, Any]:
        """
        Use case: Get detailed tool schemas for a provider.

        Ensures provider is ready before returning tools.

        Args:
            provider_id: Provider identifier

        Returns:
            Dictionary with provider ID and tool schemas

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        provider = self._get_provider(provider_id)
        provider.ensure_ready()
        self._publish_events(provider)

        tools_list = []
        for tool in provider.tools:
            tools_list.append(tool.to_dict())

        return {"provider": provider_id, "tools": tools_list}

    def invoke_tool(
        self,
        provider_id: str,
        tool_name: str,
        arguments: Dict[str, Any],
        timeout: float = 30.0,
    ) -> Dict[str, Any]:
        """
        Use case: Invoke a tool on a provider.

        Args:
            provider_id: Provider identifier
            tool_name: Tool name
            arguments: Tool arguments
            timeout: Timeout in seconds

        Returns:
            Tool result dictionary

        Raises:
            ProviderNotFoundError: If provider doesn't exist
            ToolNotFoundError: If tool doesn't exist
            ToolInvocationError: If invocation fails
        """
        provider = self._get_provider(provider_id)
        result = provider.invoke_tool(tool_name, arguments, timeout)
        self._publish_events(provider)

        return result

    def health_check(self, provider_id: str) -> bool:
        """
        Use case: Perform health check on a provider.

        Args:
            provider_id: Provider identifier

        Returns:
            True if healthy, False otherwise

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        provider = self._get_provider(provider_id)
        healthy = provider.health_check()
        self._publish_events(provider)

        return healthy

    def check_all_health(self) -> Dict[str, bool]:
        """
        Use case: Check health of all providers.

        Returns:
            Dictionary mapping provider_id to health status
        """
        results = {}
        for provider_id, provider in self._repository.get_all().items():
            results[provider_id] = provider.health_check()
            self._publish_events(provider)

        return results

    def shutdown_idle_providers(self) -> List[str]:
        """
        Use case: Shutdown all idle providers.

        Returns:
            List of provider IDs that were shutdown
        """
        shutdown_ids = []
        for provider_id, provider in self._repository.get_all().items():
            if provider.maybe_shutdown_idle():
                shutdown_ids.append(provider_id)
                self._publish_events(provider)

        return shutdown_ids
```