mcp-hangar 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. mcp_hangar/__init__.py +139 -0
  2. mcp_hangar/application/__init__.py +1 -0
  3. mcp_hangar/application/commands/__init__.py +67 -0
  4. mcp_hangar/application/commands/auth_commands.py +118 -0
  5. mcp_hangar/application/commands/auth_handlers.py +296 -0
  6. mcp_hangar/application/commands/commands.py +59 -0
  7. mcp_hangar/application/commands/handlers.py +189 -0
  8. mcp_hangar/application/discovery/__init__.py +21 -0
  9. mcp_hangar/application/discovery/discovery_metrics.py +283 -0
  10. mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
  11. mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
  12. mcp_hangar/application/discovery/security_validator.py +414 -0
  13. mcp_hangar/application/event_handlers/__init__.py +50 -0
  14. mcp_hangar/application/event_handlers/alert_handler.py +191 -0
  15. mcp_hangar/application/event_handlers/audit_handler.py +203 -0
  16. mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
  17. mcp_hangar/application/event_handlers/logging_handler.py +69 -0
  18. mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
  19. mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
  20. mcp_hangar/application/event_handlers/security_handler.py +604 -0
  21. mcp_hangar/application/mcp/tooling.py +158 -0
  22. mcp_hangar/application/ports/__init__.py +9 -0
  23. mcp_hangar/application/ports/observability.py +237 -0
  24. mcp_hangar/application/queries/__init__.py +52 -0
  25. mcp_hangar/application/queries/auth_handlers.py +237 -0
  26. mcp_hangar/application/queries/auth_queries.py +118 -0
  27. mcp_hangar/application/queries/handlers.py +227 -0
  28. mcp_hangar/application/read_models/__init__.py +11 -0
  29. mcp_hangar/application/read_models/provider_views.py +139 -0
  30. mcp_hangar/application/sagas/__init__.py +11 -0
  31. mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
  32. mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
  33. mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
  34. mcp_hangar/application/services/__init__.py +9 -0
  35. mcp_hangar/application/services/provider_service.py +208 -0
  36. mcp_hangar/application/services/traced_provider_service.py +211 -0
  37. mcp_hangar/bootstrap/runtime.py +328 -0
  38. mcp_hangar/context.py +178 -0
  39. mcp_hangar/domain/__init__.py +117 -0
  40. mcp_hangar/domain/contracts/__init__.py +57 -0
  41. mcp_hangar/domain/contracts/authentication.py +225 -0
  42. mcp_hangar/domain/contracts/authorization.py +229 -0
  43. mcp_hangar/domain/contracts/event_store.py +178 -0
  44. mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
  45. mcp_hangar/domain/contracts/persistence.py +383 -0
  46. mcp_hangar/domain/contracts/provider_runtime.py +146 -0
  47. mcp_hangar/domain/discovery/__init__.py +20 -0
  48. mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
  49. mcp_hangar/domain/discovery/discovered_provider.py +185 -0
  50. mcp_hangar/domain/discovery/discovery_service.py +412 -0
  51. mcp_hangar/domain/discovery/discovery_source.py +192 -0
  52. mcp_hangar/domain/events.py +433 -0
  53. mcp_hangar/domain/exceptions.py +525 -0
  54. mcp_hangar/domain/model/__init__.py +70 -0
  55. mcp_hangar/domain/model/aggregate.py +58 -0
  56. mcp_hangar/domain/model/circuit_breaker.py +152 -0
  57. mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
  58. mcp_hangar/domain/model/event_sourced_provider.py +423 -0
  59. mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
  60. mcp_hangar/domain/model/health_tracker.py +183 -0
  61. mcp_hangar/domain/model/load_balancer.py +185 -0
  62. mcp_hangar/domain/model/provider.py +810 -0
  63. mcp_hangar/domain/model/provider_group.py +656 -0
  64. mcp_hangar/domain/model/tool_catalog.py +105 -0
  65. mcp_hangar/domain/policies/__init__.py +19 -0
  66. mcp_hangar/domain/policies/provider_health.py +187 -0
  67. mcp_hangar/domain/repository.py +249 -0
  68. mcp_hangar/domain/security/__init__.py +85 -0
  69. mcp_hangar/domain/security/input_validator.py +710 -0
  70. mcp_hangar/domain/security/rate_limiter.py +387 -0
  71. mcp_hangar/domain/security/roles.py +237 -0
  72. mcp_hangar/domain/security/sanitizer.py +387 -0
  73. mcp_hangar/domain/security/secrets.py +501 -0
  74. mcp_hangar/domain/services/__init__.py +20 -0
  75. mcp_hangar/domain/services/audit_service.py +376 -0
  76. mcp_hangar/domain/services/image_builder.py +328 -0
  77. mcp_hangar/domain/services/provider_launcher.py +1046 -0
  78. mcp_hangar/domain/value_objects.py +1138 -0
  79. mcp_hangar/errors.py +818 -0
  80. mcp_hangar/fastmcp_server.py +1105 -0
  81. mcp_hangar/gc.py +134 -0
  82. mcp_hangar/infrastructure/__init__.py +79 -0
  83. mcp_hangar/infrastructure/async_executor.py +133 -0
  84. mcp_hangar/infrastructure/auth/__init__.py +37 -0
  85. mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
  86. mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
  87. mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
  88. mcp_hangar/infrastructure/auth/middleware.py +340 -0
  89. mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
  90. mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
  91. mcp_hangar/infrastructure/auth/projections.py +366 -0
  92. mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
  93. mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
  94. mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
  95. mcp_hangar/infrastructure/command_bus.py +112 -0
  96. mcp_hangar/infrastructure/discovery/__init__.py +110 -0
  97. mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
  98. mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
  99. mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
  100. mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
  101. mcp_hangar/infrastructure/event_bus.py +260 -0
  102. mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
  103. mcp_hangar/infrastructure/event_store.py +396 -0
  104. mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
  105. mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
  106. mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
  107. mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
  108. mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
  109. mcp_hangar/infrastructure/metrics_publisher.py +36 -0
  110. mcp_hangar/infrastructure/observability/__init__.py +10 -0
  111. mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
  112. mcp_hangar/infrastructure/persistence/__init__.py +33 -0
  113. mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
  114. mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
  115. mcp_hangar/infrastructure/persistence/database.py +333 -0
  116. mcp_hangar/infrastructure/persistence/database_common.py +330 -0
  117. mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
  118. mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
  119. mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
  120. mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
  121. mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
  122. mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
  123. mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
  124. mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
  125. mcp_hangar/infrastructure/query_bus.py +153 -0
  126. mcp_hangar/infrastructure/saga_manager.py +401 -0
  127. mcp_hangar/logging_config.py +209 -0
  128. mcp_hangar/metrics.py +1007 -0
  129. mcp_hangar/models.py +31 -0
  130. mcp_hangar/observability/__init__.py +54 -0
  131. mcp_hangar/observability/health.py +487 -0
  132. mcp_hangar/observability/metrics.py +319 -0
  133. mcp_hangar/observability/tracing.py +433 -0
  134. mcp_hangar/progress.py +542 -0
  135. mcp_hangar/retry.py +613 -0
  136. mcp_hangar/server/__init__.py +120 -0
  137. mcp_hangar/server/__main__.py +6 -0
  138. mcp_hangar/server/auth_bootstrap.py +340 -0
  139. mcp_hangar/server/auth_cli.py +335 -0
  140. mcp_hangar/server/auth_config.py +305 -0
  141. mcp_hangar/server/bootstrap.py +735 -0
  142. mcp_hangar/server/cli.py +161 -0
  143. mcp_hangar/server/config.py +224 -0
  144. mcp_hangar/server/context.py +215 -0
  145. mcp_hangar/server/http_auth_middleware.py +165 -0
  146. mcp_hangar/server/lifecycle.py +467 -0
  147. mcp_hangar/server/state.py +117 -0
  148. mcp_hangar/server/tools/__init__.py +16 -0
  149. mcp_hangar/server/tools/discovery.py +186 -0
  150. mcp_hangar/server/tools/groups.py +75 -0
  151. mcp_hangar/server/tools/health.py +301 -0
  152. mcp_hangar/server/tools/provider.py +939 -0
  153. mcp_hangar/server/tools/registry.py +320 -0
  154. mcp_hangar/server/validation.py +113 -0
  155. mcp_hangar/stdio_client.py +229 -0
  156. mcp_hangar-0.2.0.dist-info/METADATA +347 -0
  157. mcp_hangar-0.2.0.dist-info/RECORD +160 -0
  158. mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
  159. mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
  160. mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,266 @@
1
+ """Provider Failover Saga - failover to backup providers on failure."""
2
+
3
+ from dataclasses import dataclass
4
+ import time
5
+ from typing import Dict, List, Optional, Set, Type
6
+
7
+ from ...domain.events import DomainEvent, ProviderDegraded, ProviderStarted, ProviderStopped
8
+ from ...infrastructure.saga_manager import EventTriggeredSaga
9
+ from ...logging_config import get_logger
10
+ from ..commands import Command, StartProviderCommand, StopProviderCommand
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
@dataclass
class FailoverConfig:
    """Settings that pair a primary provider with its designated backup."""

    primary_id: str
    backup_id: str
    # When True, switch back to the primary automatically once it recovers.
    auto_failback: bool = True
    # Seconds to wait before switching back to the primary.
    failback_delay_s: float = 30.0
23
+
24
+
25
@dataclass
class FailoverState:
    """Runtime record describing one failover that is currently in effect."""

    primary_id: str
    backup_id: str
    # time.time() timestamp at which the primary was observed to fail.
    failed_at: float
    # Set once the backup reports started; None while the backup is launching.
    backup_started_at: Optional[float] = None
    is_active: bool = True
34
+
35
+
36
class ProviderFailoverSaga(EventTriggeredSaga):
    """
    Saga that orchestrates failover to backup providers.

    Failover Strategy:
    1. Configure primary-backup pairs
    2. When primary is degraded/stopped, start the backup
    3. Optionally, fail back to primary when it recovers
    4. Track active failovers to prevent cycles

    Configuration:
    - Failover pairs: Define which providers are backups for others
    - Auto-failback: Whether to automatically switch back to primary
    - Failback delay: How long to wait before failing back

    Usage:
        saga = ProviderFailoverSaga()
        saga.configure_failover("primary-provider", "backup-provider")
        saga_manager.register_event_saga(saga)

    State is kept in-memory only; it is not persisted across restarts.
    """

    def __init__(self):
        super().__init__()

        # Failover configuration: primary_id -> FailoverConfig
        self._failover_configs: Dict[str, FailoverConfig] = {}

        # Active failovers: primary_id -> FailoverState
        self._active_failovers: Dict[str, FailoverState] = {}

        # Providers currently acting as backups (to avoid cascading failovers)
        self._active_backups: Set[str] = set()

        # Providers pending failback: primary_id -> scheduled_time
        # NOTE(review): entries written here are consumed immediately by
        # _execute_failback in _handle_started, so the scheduled time is
        # currently informational only — confirm intended behavior.
        self._pending_failbacks: Dict[str, float] = {}

    @property
    def saga_type(self) -> str:
        # Stable identifier used by the saga manager for registration/lookup.
        return "provider_failover"

    @property
    def handled_events(self) -> List[Type[DomainEvent]]:
        # Event types this saga subscribes to via the saga manager.
        return [ProviderDegraded, ProviderStarted, ProviderStopped]

    def configure_failover(
        self,
        primary_id: str,
        backup_id: str,
        auto_failback: bool = True,
        failback_delay_s: float = 30.0,
    ) -> None:
        """
        Configure a failover pair.

        Re-configuring an existing primary replaces its previous config.

        Args:
            primary_id: Primary provider ID
            backup_id: Backup provider ID
            auto_failback: Whether to automatically fail back when primary recovers
            failback_delay_s: Delay before failing back
        """
        self._failover_configs[primary_id] = FailoverConfig(
            primary_id=primary_id,
            backup_id=backup_id,
            auto_failback=auto_failback,
            failback_delay_s=failback_delay_s,
        )
        logger.info(f"Configured failover: {primary_id} -> {backup_id}")

    def remove_failover(self, primary_id: str) -> bool:
        """Remove a failover configuration.

        Returns True if a configuration existed and was removed. Does not
        touch any failover that is already active for this primary.
        """
        if primary_id in self._failover_configs:
            del self._failover_configs[primary_id]
            return True
        return False

    def handle(self, event: DomainEvent) -> List[Command]:
        """Handle failover-related events.

        Dispatches to the per-event handler and returns the commands the
        saga manager should execute; unknown events yield no commands.
        """
        if isinstance(event, ProviderDegraded):
            return self._handle_degraded(event)
        elif isinstance(event, ProviderStarted):
            return self._handle_started(event)
        elif isinstance(event, ProviderStopped):
            return self._handle_stopped(event)
        return []

    def _handle_degraded(self, event: ProviderDegraded) -> List[Command]:
        """
        Handle provider degraded event.

        Initiates failover if this is a primary provider with a configured backup.
        Returns a StartProviderCommand for the backup, or [] when no action
        is needed (no config, already failed over, or the degraded provider
        is itself an active backup).
        """
        provider_id = event.provider_id
        commands = []

        # Check if this provider is a backup currently serving
        # (guards against cascading/cyclic failovers).
        if provider_id in self._active_backups:
            logger.warning(f"Backup provider {provider_id} degraded - no further failover")
            return []

        # Check if this provider has a backup configured
        config = self._failover_configs.get(provider_id)
        if not config:
            return []

        # Check if failover is already active
        if provider_id in self._active_failovers:
            logger.debug(f"Failover already active for {provider_id}")
            return []

        # Initiate failover
        logger.info(f"Initiating failover: {provider_id} -> {config.backup_id}")

        self._active_failovers[provider_id] = FailoverState(
            primary_id=provider_id,
            backup_id=config.backup_id,
            failed_at=time.time(),
        )
        self._active_backups.add(config.backup_id)

        # Start backup provider
        commands.append(StartProviderCommand(provider_id=config.backup_id))

        return commands

    def _handle_started(self, event: ProviderStarted) -> List[Command]:
        """
        Handle provider started event.

        - If it's a backup being started, mark failover as complete
        - If it's a primary recovering, consider failback
        """
        provider_id = event.provider_id
        commands = []

        # Check if this is a backup being started for failover.
        # backup_started_at is set only on the first matching start event.
        for primary_id, state in self._active_failovers.items():
            if state.backup_id == provider_id and state.backup_started_at is None:
                state.backup_started_at = time.time()
                logger.info(f"Failover complete: {primary_id} -> {provider_id}")

        # Check if this is a primary recovering
        if provider_id in self._active_failovers:
            state = self._active_failovers[provider_id]
            config = self._failover_configs.get(provider_id)

            if config and config.auto_failback:
                # Schedule failback
                failback_time = time.time() + config.failback_delay_s
                self._pending_failbacks[provider_id] = failback_time

                logger.info(f"Primary {provider_id} recovered, scheduling failback in {config.failback_delay_s}s")

                # In a real implementation, you'd use a scheduler
                # For now, immediately trigger failback commands
                # NOTE(review): this ignores failback_delay_s — the failback
                # runs right away and removes the pending entry just recorded.
                commands.extend(self._execute_failback(provider_id))

        return commands

    def _handle_stopped(self, event: ProviderStopped) -> List[Command]:
        """
        Handle provider stopped event.

        Clean up failover state if a backup is stopped.
        A stopped primary is intentionally left alone: its failover stays
        active so the backup keeps serving.
        """
        provider_id = event.provider_id
        commands = []

        # If a backup is stopped, clean up
        if provider_id in self._active_backups:
            self._active_backups.discard(provider_id)

            # Find and clean up the failover state
            # (list() copy because we delete entries while iterating).
            for primary_id, state in list(self._active_failovers.items()):
                if state.backup_id == provider_id:
                    del self._active_failovers[primary_id]
                    self._pending_failbacks.pop(primary_id, None)
                    logger.info(f"Failover {primary_id} -> {provider_id} ended")

        return commands

    def _execute_failback(self, primary_id: str) -> List[Command]:
        """Execute failback to primary provider.

        Stops the backup and clears all failover bookkeeping for this
        primary. Returns [] when no failover/config exists for primary_id.
        """
        commands = []

        state = self._active_failovers.get(primary_id)
        config = self._failover_configs.get(primary_id)

        if not state or not config:
            return []

        logger.info(f"Executing failback: {state.backup_id} -> {primary_id}")

        # Stop the backup (primary is already running)
        commands.append(StopProviderCommand(provider_id=state.backup_id, reason="failback"))

        # Clean up failover state
        del self._active_failovers[primary_id]
        self._active_backups.discard(state.backup_id)
        self._pending_failbacks.pop(primary_id, None)

        return commands

    def get_active_failovers(self) -> Dict[str, FailoverState]:
        """Get all active failovers (shallow copy; states are shared objects)."""
        return dict(self._active_failovers)

    def get_failover_config(self, primary_id: str) -> Optional[FailoverConfig]:
        """Get failover configuration for a provider."""
        return self._failover_configs.get(primary_id)

    def get_all_configs(self) -> Dict[str, FailoverConfig]:
        """Get all failover configurations (shallow copy)."""
        return dict(self._failover_configs)

    def is_backup_active(self, provider_id: str) -> bool:
        """Check if a provider is currently serving as a backup."""
        return provider_id in self._active_backups

    def force_failback(self, primary_id: str) -> List[Command]:
        """Manually force a failback to primary, bypassing any delay."""
        return self._execute_failback(primary_id)

    def cancel_failover(self, primary_id: str) -> bool:
        """Cancel an active failover (keeps backup running).

        Returns True if a failover was active and has been cleared.
        """
        if primary_id in self._active_failovers:
            state = self._active_failovers[primary_id]
            self._active_backups.discard(state.backup_id)
            del self._active_failovers[primary_id]
            self._pending_failbacks.pop(primary_id, None)
            return True
        return False
@@ -0,0 +1,172 @@
1
+ """Provider Recovery Saga - automatically recover degraded providers."""
2
+
3
+ import time
4
+ from typing import Dict, List, Optional, Type
5
+
6
+ from ...domain.events import DomainEvent, HealthCheckFailed, ProviderDegraded, ProviderStarted, ProviderStopped
7
+ from ...infrastructure.saga_manager import EventTriggeredSaga
8
+ from ...logging_config import get_logger
9
+ from ..commands import Command, StartProviderCommand, StopProviderCommand
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
class ProviderRecoverySaga(EventTriggeredSaga):
    """
    Saga that restarts degraded providers with exponential backoff.

    On each ProviderDegraded event the saga issues a restart, doubling
    (by ``backoff_multiplier``) the wait between attempts up to
    ``max_backoff_s``. A successful start resets the attempt counter; once
    ``max_retries`` is exceeded the provider is stopped for good.

    Args:
        max_retries: Maximum number of restart attempts (default: 3)
        initial_backoff_s: Initial backoff duration in seconds (default: 5)
        max_backoff_s: Maximum backoff duration (default: 60)
        backoff_multiplier: Growth factor for the exponential backoff (default: 2)
    """

    def __init__(
        self,
        max_retries: int = 3,
        initial_backoff_s: float = 5.0,
        max_backoff_s: float = 60.0,
        backoff_multiplier: float = 2.0,
    ):
        super().__init__()
        self._max_retries = max_retries
        self._initial_backoff_s = initial_backoff_s
        self._max_backoff_s = max_backoff_s
        self._backoff_multiplier = backoff_multiplier
        # Per-provider bookkeeping:
        # provider_id -> {"retries": int, "last_attempt": float, "next_retry": float}
        self._retry_state: Dict[str, Dict] = {}

    @staticmethod
    def _fresh_state() -> Dict:
        """Return a zeroed retry-state record."""
        return {"retries": 0, "last_attempt": 0, "next_retry": 0}

    @property
    def saga_type(self) -> str:
        # Stable identifier used by the saga manager.
        return "provider_recovery"

    @property
    def handled_events(self) -> List[Type[DomainEvent]]:
        return [ProviderDegraded, ProviderStarted, ProviderStopped, HealthCheckFailed]

    def handle(self, event: DomainEvent) -> List[Command]:
        """Dispatch a recovery-related event to its specific handler."""
        dispatch = (
            (ProviderDegraded, self._handle_degraded),
            (ProviderStarted, self._handle_started),
            (ProviderStopped, self._handle_stopped),
            (HealthCheckFailed, self._handle_health_failed),
        )
        for event_type, handler in dispatch:
            if isinstance(event, event_type):
                return handler(event)
        return []

    def _handle_degraded(self, event: ProviderDegraded) -> List[Command]:
        """
        React to a degraded provider: either restart it (with a computed
        backoff) or, past the retry budget, stop it permanently.
        """
        provider_id = event.provider_id
        record = self._retry_state.setdefault(provider_id, self._fresh_state())
        record["retries"] += 1
        record["last_attempt"] = time.time()

        # Out of budget: give up on this provider.
        if record["retries"] > self._max_retries:
            logger.warning(f"Provider {provider_id} exceeded max retries ({self._max_retries}), stopping recovery")
            return [StopProviderCommand(provider_id=provider_id, reason="max_retries_exceeded")]

        delay = self._calculate_backoff(record["retries"])
        record["next_retry"] = time.time() + delay

        logger.info(
            f"Provider {provider_id} degraded, scheduling retry "
            f"{record['retries']}/{self._max_retries} in {delay:.1f}s"
        )

        # Note: In a real implementation, you would use a scheduler
        # to delay the command. For now, we return it immediately.
        # The provider's internal backoff will handle timing.
        return [StartProviderCommand(provider_id=provider_id)]

    def _handle_started(self, event: ProviderStarted) -> List[Command]:
        """Reset the retry counter once a provider starts successfully."""
        provider_id = event.provider_id
        previous = self._retry_state.get(provider_id)
        if previous is not None:
            attempts = previous["retries"]
            self._retry_state[provider_id] = self._fresh_state()
            if attempts > 0:
                logger.info(f"Provider {provider_id} recovered successfully after {attempts} retries")
        return []

    def _handle_stopped(self, event: ProviderStopped) -> List[Command]:
        """Drop retry state for providers that were stopped on purpose."""
        if event.reason in ("shutdown", "idle", "user_request"):
            self._retry_state.pop(event.provider_id, None)
        return []

    def _handle_health_failed(self, event: HealthCheckFailed) -> List[Command]:
        """
        Observe a failed health check; intentionally a no-op.

        Recovery is driven by the subsequent ProviderDegraded event.
        """
        return []

    def _calculate_backoff(self, retry_count: int) -> float:
        """Exponential backoff for the given attempt number, capped at max."""
        raw = self._initial_backoff_s * (self._backoff_multiplier ** (retry_count - 1))
        return min(raw, self._max_backoff_s)

    def get_retry_state(self, provider_id: str) -> Optional[Dict]:
        """Get retry state for a provider (for monitoring)."""
        return self._retry_state.get(provider_id)

    def get_all_retry_states(self) -> Dict[str, Dict]:
        """Get all retry states (for monitoring)."""
        return dict(self._retry_state)

    def reset_retry_state(self, provider_id: str) -> None:
        """Manually reset retry state for a provider."""
        self._retry_state.pop(provider_id, None)

    def reset_all_retry_states(self) -> None:
        """Reset all retry states."""
        self._retry_state.clear()
@@ -0,0 +1,9 @@
1
+ """Application services - use case orchestration."""
2
+
3
+ from .provider_service import ProviderService
4
+ from .traced_provider_service import TracedProviderService
5
+
6
+ __all__ = [
7
+ "ProviderService",
8
+ "TracedProviderService",
9
+ ]
@@ -0,0 +1,208 @@
1
+ """Provider application service - orchestrates use cases."""
2
+
3
+ from typing import Any, Dict, List
4
+
5
+ from ...domain.exceptions import ProviderNotFoundError
6
+ from ...domain.model import Provider
7
+ from ...domain.repository import IProviderRepository
8
+ from ...infrastructure.event_bus import EventBus
9
+ from ...logging_config import get_logger
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
class ProviderService:
    """
    Application service for provider use cases.

    Each use case loads the provider aggregate from the repository, runs
    the requested domain operation, publishes whatever domain events the
    aggregate collected, and returns a plain-dict result.
    """

    def __init__(
        self,
        repository: IProviderRepository,
        event_bus: EventBus,
    ):
        self._repository = repository
        self._event_bus = event_bus

    def _publish_events(self, provider: Provider) -> None:
        """Drain the provider's collected events onto the event bus.

        Publish failures are logged and swallowed so one bad subscriber
        cannot fail the use case.
        """
        for event in provider.collect_events():
            try:
                self._event_bus.publish(event)
            except Exception as e:
                logger.error(f"Failed to publish event {event.__class__.__name__}: {e}")

    def _get_provider(self, provider_id: str) -> Provider:
        """Load a provider, raising ProviderNotFoundError when absent."""
        found = self._repository.get(provider_id)
        if found is None:
            raise ProviderNotFoundError(provider_id)
        return found

    # --- Use Cases ---

    def list_providers(self) -> List[Dict[str, Any]]:
        """
        Use case: List all providers with their status.

        Returns:
            List of provider status dictionaries
        """
        return [p.to_status_dict() for p in self._repository.get_all().values()]

    def start_provider(self, provider_id: str) -> Dict[str, Any]:
        """
        Use case: Explicitly start a provider.

        Ensures provider is ready and returns its status.

        Args:
            provider_id: Provider identifier

        Returns:
            Dictionary with provider state and tools

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        target = self._get_provider(provider_id)
        target.ensure_ready()
        self._publish_events(target)
        return {
            "provider": provider_id,
            "state": target.state.value,
            "tools": target.get_tool_names(),
        }

    def stop_provider(self, provider_id: str) -> Dict[str, Any]:
        """
        Use case: Explicitly stop a provider.

        Args:
            provider_id: Provider identifier

        Returns:
            Confirmation dictionary

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        target = self._get_provider(provider_id)
        target.shutdown()
        self._publish_events(target)
        return {"stopped": provider_id}

    def get_provider_tools(self, provider_id: str) -> Dict[str, Any]:
        """
        Use case: Get detailed tool schemas for a provider.

        Ensures provider is ready before returning tools.

        Args:
            provider_id: Provider identifier

        Returns:
            Dictionary with provider ID and tool schemas

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        target = self._get_provider(provider_id)
        target.ensure_ready()
        self._publish_events(target)
        return {"provider": provider_id, "tools": [tool.to_dict() for tool in target.tools]}

    def invoke_tool(
        self,
        provider_id: str,
        tool_name: str,
        arguments: Dict[str, Any],
        timeout: float = 30.0,
    ) -> Dict[str, Any]:
        """
        Use case: Invoke a tool on a provider.

        Args:
            provider_id: Provider identifier
            tool_name: Tool name
            arguments: Tool arguments
            timeout: Timeout in seconds

        Returns:
            Tool result dictionary

        Raises:
            ProviderNotFoundError: If provider doesn't exist
            ToolNotFoundError: If tool doesn't exist
            ToolInvocationError: If invocation fails
        """
        target = self._get_provider(provider_id)
        outcome = target.invoke_tool(tool_name, arguments, timeout)
        self._publish_events(target)
        return outcome

    def health_check(self, provider_id: str) -> bool:
        """
        Use case: Perform health check on a provider.

        Args:
            provider_id: Provider identifier

        Returns:
            True if healthy, False otherwise

        Raises:
            ProviderNotFoundError: If provider doesn't exist
        """
        target = self._get_provider(provider_id)
        is_healthy = target.health_check()
        self._publish_events(target)
        return is_healthy

    def check_all_health(self) -> Dict[str, bool]:
        """
        Use case: Check health of all providers.

        Returns:
            Dictionary mapping provider_id to health status
        """
        statuses: Dict[str, bool] = {}
        for pid, target in self._repository.get_all().items():
            statuses[pid] = target.health_check()
            self._publish_events(target)
        return statuses

    def shutdown_idle_providers(self) -> List[str]:
        """
        Use case: Shutdown all idle providers.

        Returns:
            List of provider IDs that were shutdown
        """
        stopped: List[str] = []
        for pid, target in self._repository.get_all().items():
            if target.maybe_shutdown_idle():
                stopped.append(pid)
            self._publish_events(target)
        return stopped