mcp-hangar 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. mcp_hangar/__init__.py +139 -0
  2. mcp_hangar/application/__init__.py +1 -0
  3. mcp_hangar/application/commands/__init__.py +67 -0
  4. mcp_hangar/application/commands/auth_commands.py +118 -0
  5. mcp_hangar/application/commands/auth_handlers.py +296 -0
  6. mcp_hangar/application/commands/commands.py +59 -0
  7. mcp_hangar/application/commands/handlers.py +189 -0
  8. mcp_hangar/application/discovery/__init__.py +21 -0
  9. mcp_hangar/application/discovery/discovery_metrics.py +283 -0
  10. mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
  11. mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
  12. mcp_hangar/application/discovery/security_validator.py +414 -0
  13. mcp_hangar/application/event_handlers/__init__.py +50 -0
  14. mcp_hangar/application/event_handlers/alert_handler.py +191 -0
  15. mcp_hangar/application/event_handlers/audit_handler.py +203 -0
  16. mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
  17. mcp_hangar/application/event_handlers/logging_handler.py +69 -0
  18. mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
  19. mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
  20. mcp_hangar/application/event_handlers/security_handler.py +604 -0
  21. mcp_hangar/application/mcp/tooling.py +158 -0
  22. mcp_hangar/application/ports/__init__.py +9 -0
  23. mcp_hangar/application/ports/observability.py +237 -0
  24. mcp_hangar/application/queries/__init__.py +52 -0
  25. mcp_hangar/application/queries/auth_handlers.py +237 -0
  26. mcp_hangar/application/queries/auth_queries.py +118 -0
  27. mcp_hangar/application/queries/handlers.py +227 -0
  28. mcp_hangar/application/read_models/__init__.py +11 -0
  29. mcp_hangar/application/read_models/provider_views.py +139 -0
  30. mcp_hangar/application/sagas/__init__.py +11 -0
  31. mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
  32. mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
  33. mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
  34. mcp_hangar/application/services/__init__.py +9 -0
  35. mcp_hangar/application/services/provider_service.py +208 -0
  36. mcp_hangar/application/services/traced_provider_service.py +211 -0
  37. mcp_hangar/bootstrap/runtime.py +328 -0
  38. mcp_hangar/context.py +178 -0
  39. mcp_hangar/domain/__init__.py +117 -0
  40. mcp_hangar/domain/contracts/__init__.py +57 -0
  41. mcp_hangar/domain/contracts/authentication.py +225 -0
  42. mcp_hangar/domain/contracts/authorization.py +229 -0
  43. mcp_hangar/domain/contracts/event_store.py +178 -0
  44. mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
  45. mcp_hangar/domain/contracts/persistence.py +383 -0
  46. mcp_hangar/domain/contracts/provider_runtime.py +146 -0
  47. mcp_hangar/domain/discovery/__init__.py +20 -0
  48. mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
  49. mcp_hangar/domain/discovery/discovered_provider.py +185 -0
  50. mcp_hangar/domain/discovery/discovery_service.py +412 -0
  51. mcp_hangar/domain/discovery/discovery_source.py +192 -0
  52. mcp_hangar/domain/events.py +433 -0
  53. mcp_hangar/domain/exceptions.py +525 -0
  54. mcp_hangar/domain/model/__init__.py +70 -0
  55. mcp_hangar/domain/model/aggregate.py +58 -0
  56. mcp_hangar/domain/model/circuit_breaker.py +152 -0
  57. mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
  58. mcp_hangar/domain/model/event_sourced_provider.py +423 -0
  59. mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
  60. mcp_hangar/domain/model/health_tracker.py +183 -0
  61. mcp_hangar/domain/model/load_balancer.py +185 -0
  62. mcp_hangar/domain/model/provider.py +810 -0
  63. mcp_hangar/domain/model/provider_group.py +656 -0
  64. mcp_hangar/domain/model/tool_catalog.py +105 -0
  65. mcp_hangar/domain/policies/__init__.py +19 -0
  66. mcp_hangar/domain/policies/provider_health.py +187 -0
  67. mcp_hangar/domain/repository.py +249 -0
  68. mcp_hangar/domain/security/__init__.py +85 -0
  69. mcp_hangar/domain/security/input_validator.py +710 -0
  70. mcp_hangar/domain/security/rate_limiter.py +387 -0
  71. mcp_hangar/domain/security/roles.py +237 -0
  72. mcp_hangar/domain/security/sanitizer.py +387 -0
  73. mcp_hangar/domain/security/secrets.py +501 -0
  74. mcp_hangar/domain/services/__init__.py +20 -0
  75. mcp_hangar/domain/services/audit_service.py +376 -0
  76. mcp_hangar/domain/services/image_builder.py +328 -0
  77. mcp_hangar/domain/services/provider_launcher.py +1046 -0
  78. mcp_hangar/domain/value_objects.py +1138 -0
  79. mcp_hangar/errors.py +818 -0
  80. mcp_hangar/fastmcp_server.py +1105 -0
  81. mcp_hangar/gc.py +134 -0
  82. mcp_hangar/infrastructure/__init__.py +79 -0
  83. mcp_hangar/infrastructure/async_executor.py +133 -0
  84. mcp_hangar/infrastructure/auth/__init__.py +37 -0
  85. mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
  86. mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
  87. mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
  88. mcp_hangar/infrastructure/auth/middleware.py +340 -0
  89. mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
  90. mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
  91. mcp_hangar/infrastructure/auth/projections.py +366 -0
  92. mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
  93. mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
  94. mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
  95. mcp_hangar/infrastructure/command_bus.py +112 -0
  96. mcp_hangar/infrastructure/discovery/__init__.py +110 -0
  97. mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
  98. mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
  99. mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
  100. mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
  101. mcp_hangar/infrastructure/event_bus.py +260 -0
  102. mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
  103. mcp_hangar/infrastructure/event_store.py +396 -0
  104. mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
  105. mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
  106. mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
  107. mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
  108. mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
  109. mcp_hangar/infrastructure/metrics_publisher.py +36 -0
  110. mcp_hangar/infrastructure/observability/__init__.py +10 -0
  111. mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
  112. mcp_hangar/infrastructure/persistence/__init__.py +33 -0
  113. mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
  114. mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
  115. mcp_hangar/infrastructure/persistence/database.py +333 -0
  116. mcp_hangar/infrastructure/persistence/database_common.py +330 -0
  117. mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
  118. mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
  119. mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
  120. mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
  121. mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
  122. mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
  123. mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
  124. mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
  125. mcp_hangar/infrastructure/query_bus.py +153 -0
  126. mcp_hangar/infrastructure/saga_manager.py +401 -0
  127. mcp_hangar/logging_config.py +209 -0
  128. mcp_hangar/metrics.py +1007 -0
  129. mcp_hangar/models.py +31 -0
  130. mcp_hangar/observability/__init__.py +54 -0
  131. mcp_hangar/observability/health.py +487 -0
  132. mcp_hangar/observability/metrics.py +319 -0
  133. mcp_hangar/observability/tracing.py +433 -0
  134. mcp_hangar/progress.py +542 -0
  135. mcp_hangar/retry.py +613 -0
  136. mcp_hangar/server/__init__.py +120 -0
  137. mcp_hangar/server/__main__.py +6 -0
  138. mcp_hangar/server/auth_bootstrap.py +340 -0
  139. mcp_hangar/server/auth_cli.py +335 -0
  140. mcp_hangar/server/auth_config.py +305 -0
  141. mcp_hangar/server/bootstrap.py +735 -0
  142. mcp_hangar/server/cli.py +161 -0
  143. mcp_hangar/server/config.py +224 -0
  144. mcp_hangar/server/context.py +215 -0
  145. mcp_hangar/server/http_auth_middleware.py +165 -0
  146. mcp_hangar/server/lifecycle.py +467 -0
  147. mcp_hangar/server/state.py +117 -0
  148. mcp_hangar/server/tools/__init__.py +16 -0
  149. mcp_hangar/server/tools/discovery.py +186 -0
  150. mcp_hangar/server/tools/groups.py +75 -0
  151. mcp_hangar/server/tools/health.py +301 -0
  152. mcp_hangar/server/tools/provider.py +939 -0
  153. mcp_hangar/server/tools/registry.py +320 -0
  154. mcp_hangar/server/validation.py +113 -0
  155. mcp_hangar/stdio_client.py +229 -0
  156. mcp_hangar-0.2.0.dist-info/METADATA +347 -0
  157. mcp_hangar-0.2.0.dist-info/RECORD +160 -0
  158. mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
  159. mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
  160. mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,810 @@
1
+ """Provider aggregate root - the main domain entity."""
2
+
3
+ import threading
4
+ import time
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from ...logging_config import get_logger
8
+ from ..contracts.metrics_publisher import IMetricsPublisher, NullMetricsPublisher
9
+ from ..events import (
10
+ HealthCheckFailed,
11
+ HealthCheckPassed,
12
+ ProviderDegraded,
13
+ ProviderIdleDetected,
14
+ ProviderStarted,
15
+ ProviderStateChanged,
16
+ ProviderStopped,
17
+ ToolInvocationCompleted,
18
+ ToolInvocationFailed,
19
+ ToolInvocationRequested,
20
+ )
21
+ from ..exceptions import (
22
+ CannotStartProviderError,
23
+ InvalidStateTransitionError,
24
+ ProviderStartError,
25
+ ToolInvocationError,
26
+ ToolNotFoundError,
27
+ )
28
+ from ..value_objects import CorrelationId, HealthCheckInterval, IdleTTL, ProviderId, ProviderMode, ProviderState
29
+ from .aggregate import AggregateRoot
30
+ from .health_tracker import HealthTracker
31
+ from .tool_catalog import ToolCatalog, ToolSchema
32
+
33
+ logger = get_logger(__name__)
34
+
35
+
36
# Provider lifecycle state machine: each key is a current state and its value
# is the set of states that may directly follow it. Consulted by
# Provider._transition_to; a target missing from the set is rejected there.
VALID_TRANSITIONS = {
    ProviderState.COLD: {ProviderState.INITIALIZING},
    ProviderState.INITIALIZING: {ProviderState.READY, ProviderState.DEAD, ProviderState.DEGRADED},
    ProviderState.READY: {ProviderState.COLD, ProviderState.DEAD, ProviderState.DEGRADED},
    ProviderState.DEGRADED: {ProviderState.INITIALIZING, ProviderState.COLD},
    ProviderState.DEAD: {ProviderState.INITIALIZING, ProviderState.DEGRADED},
}
52
+
53
+
54
+ class Provider(AggregateRoot):
55
+ """
56
+ Provider aggregate root.
57
+
58
+ Manages the complete lifecycle of an MCP provider including:
59
+ - State machine with valid transitions
60
+ - Health tracking and circuit breaker logic
61
+ - Tool catalog management
62
+ - Process/client management
63
+
64
+ All public operations are thread-safe using internal locking.
65
+ """
66
+
67
def __init__(
    self,
    provider_id: str,
    mode: str | ProviderMode,  # Accept both string and enum
    command: Optional[List[str]] = None,
    image: Optional[str] = None,
    endpoint: Optional[str] = None,
    env: Optional[Dict[str, str]] = None,
    idle_ttl_s: int | IdleTTL = 300,  # Accept both int and value object
    health_check_interval_s: int | HealthCheckInterval = 60,  # Accept both int and value object
    max_consecutive_failures: int = 3,
    # Container-specific options
    volumes: Optional[List[str]] = None,
    build: Optional[Dict[str, str]] = None,
    resources: Optional[Dict[str, str]] = None,
    network: str = "none",
    read_only: bool = True,
    user: Optional[str] = None,  # UID:GID or username
    description: Optional[str] = None,  # Description/preprompt for AI models
    # Pre-defined tools (allows visibility before provider starts)
    tools: Optional[List[Dict[str, Any]]] = None,
    # Dependencies
    metrics_publisher: Optional[IMetricsPublisher] = None,
):
    """
    Create a provider in the COLD state.

    Args:
        provider_id: Identifier, wrapped in a ProviderId value object.
        mode: Launch mode; passed through ProviderMode.normalize.
        command: Command line used by subprocess mode.
        image: Container image used by container-style modes.
        endpoint: Stored as given; not read elsewhere in this class.
        env: Environment passed to the launcher (empty dict when omitted).
        idle_ttl_s: Idle TTL, as seconds or an IdleTTL value object.
        health_check_interval_s: Interval, as seconds or a HealthCheckInterval.
        max_consecutive_failures: Forwarded to the HealthTracker.
        volumes: Container volume specs (empty list when omitted).
        build: Optional build settings ("dockerfile", "context", "tag").
        resources: Resource limits; defaults to 512m memory and 1.0 cpu.
        network: Container network setting.
        read_only: Container read-only flag.
        user: Container user (UID:GID or username).
        description: Free-text description of the provider.
        tools: Optional tool definitions loaded into the catalog up front.
        metrics_publisher: Metrics sink; a NullMetricsPublisher when omitted.
    """
    super().__init__()

    # Identity, mode and description
    self._id = ProviderId(provider_id)
    self._mode = ProviderMode.normalize(mode)
    self._description = description

    # Launch configuration
    self._command = command
    self._image = image
    self._endpoint = endpoint
    self._env = env or {}

    # Timing settings are always held as value objects
    self._idle_ttl = idle_ttl_s if isinstance(idle_ttl_s, IdleTTL) else IdleTTL(idle_ttl_s)
    self._health_check_interval = (
        health_check_interval_s
        if isinstance(health_check_interval_s, HealthCheckInterval)
        else HealthCheckInterval(health_check_interval_s)
    )

    # Container-specific configuration
    self._volumes = volumes or []
    self._build = build  # {"dockerfile": "...", "context": "..."}
    self._resources = resources or {"memory": "512m", "cpu": "1.0"}
    self._network = network
    self._read_only = read_only
    self._user = user

    # Injected dependency with a no-op fallback
    self._metrics_publisher = metrics_publisher or NullMetricsPublisher()

    # Runtime state
    self._state = ProviderState.COLD
    self._health = HealthTracker(max_consecutive_failures=max_consecutive_failures)
    self._tools = ToolCatalog()
    self._client: Optional[Any] = None  # StdioClient
    self._meta: Dict[str, Any] = {}
    self._last_used: float = 0.0

    # Tools supplied in configuration are visible before the first start
    if tools:
        self._tools.update_from_list(tools)
        self._tools_predefined = True
    else:
        self._tools_predefined = False

    # Re-entrant lock guarding all mutable state
    self._lock = threading.RLock()
146
+
147
+ # --- Properties ---
148
+
149
+ @property
150
+ def id(self) -> ProviderId:
151
+ """Provider identifier."""
152
+ return self._id
153
+
154
+ @property
155
+ def provider_id(self) -> str:
156
+ """Provider identifier as string (for backward compatibility)."""
157
+ return str(self._id)
158
+
159
+ @property
160
+ def mode(self) -> ProviderMode:
161
+ """Provider mode enum."""
162
+ return self._mode
163
+
164
+ @property
165
+ def mode_str(self) -> str:
166
+ """Provider mode as string (for backward compatibility)."""
167
+ return self._mode.value
168
+
169
+ @property
170
+ def description(self) -> Optional[str]:
171
+ """Provider description for AI models."""
172
+ return self._description
173
+
174
+ @property
175
+ def state(self) -> ProviderState:
176
+ """Current provider state."""
177
+ with self._lock:
178
+ return self._state
179
+
180
+ @property
181
+ def health(self) -> HealthTracker:
182
+ """Health tracker."""
183
+ return self._health
184
+
185
+ @property
186
+ def tools(self) -> ToolCatalog:
187
+ """Tool catalog."""
188
+ return self._tools
189
+
190
+ @property
191
+ def has_tools(self) -> bool:
192
+ """Check if provider has any tools registered (predefined or discovered)."""
193
+ return self._tools.count() > 0
194
+
195
+ @property
196
+ def tools_predefined(self) -> bool:
197
+ """Check if tools were predefined in configuration (no startup needed for visibility)."""
198
+ return self._tools_predefined
199
+
200
+ @property
201
+ def is_alive(self) -> bool:
202
+ """Check if provider client is alive."""
203
+ with self._lock:
204
+ return self._client is not None and self._client.is_alive()
205
+
206
+ @property
207
+ def last_used(self) -> float:
208
+ """Timestamp of last tool invocation."""
209
+ with self._lock:
210
+ return self._last_used
211
+
212
+ @property
213
+ def idle_time(self) -> float:
214
+ """Time since last use in seconds."""
215
+ with self._lock:
216
+ if self._last_used == 0:
217
+ return 0.0
218
+ return time.time() - self._last_used
219
+
220
@property
def is_idle(self) -> bool:
    """Return True when a READY, previously used provider has exceeded its idle TTL."""
    with self._lock:
        # Only a READY provider that has been used at least once can be idle.
        if self._state != ProviderState.READY or self._last_used == 0:
            return False
        ttl_seconds = self._idle_ttl.seconds
        return self.idle_time > ttl_seconds
229
+
230
+ @property
231
+ def meta(self) -> Dict[str, Any]:
232
+ """Provider metadata."""
233
+ with self._lock:
234
+ return dict(self._meta)
235
+
236
+ @property
237
+ def lock(self) -> threading.RLock:
238
+ """Get the internal lock (for backward compatibility)."""
239
+ return self._lock
240
+
241
+ # --- State Management ---
242
+
243
def _transition_to(self, new_state: ProviderState) -> None:
    """
    Move the state machine to ``new_state`` (caller must hold the lock).

    A transition to the current state is a no-op. Otherwise the target must
    be listed in VALID_TRANSITIONS for the current state; on success the
    version is incremented and a ProviderStateChanged event is recorded.

    Raises:
        InvalidStateTransitionError: If the target is not allowed from the current state.
    """
    previous = self._state
    if new_state == previous:
        return

    allowed_targets = VALID_TRANSITIONS.get(previous, set())
    if new_state not in allowed_targets:
        raise InvalidStateTransitionError(self.provider_id, str(previous.value), str(new_state.value))

    self._state = new_state
    self._increment_version()

    state_changed = ProviderStateChanged(
        provider_id=self.provider_id,
        old_state=str(previous.value),
        new_state=str(new_state.value),
    )
    self._record_event(state_changed)
267
+
268
def _can_start(self) -> tuple:
    """
    Decide whether a start attempt is allowed (caller must hold the lock).

    Returns:
        A ``(can_start, reason, time_until_retry)`` tuple. ``reason`` is
        "already_ready" for a READY provider with a live client,
        "backoff_not_elapsed" for a DEGRADED provider whose health tracker
        refuses a retry, and "" otherwise.
    """
    current = self._state

    if current == ProviderState.READY and self._client and self._client.is_alive():
        return True, "already_ready", 0

    if current == ProviderState.DEGRADED and not self._health.can_retry():
        return False, "backoff_not_elapsed", self._health.time_until_retry()

    return True, "", 0
284
+
285
+ # --- Business Operations ---
286
+
287
def ensure_ready(self) -> None:
    """
    Ensure provider is in READY state, starting if necessary.

    Thread-safe. Blocks until ready or raises exception.

    If the provider is READY but its client is no longer alive, the dead
    client is closed and dropped, the provider is marked DEAD (with a
    version bump, matching health_check) and a fresh start is attempted.

    Raises:
        CannotStartProviderError: If backoff hasn't elapsed
        ProviderStartError: If provider fails to start
    """
    with self._lock:
        if self._state == ProviderState.READY:
            # Fast path - already ready
            if self._client and self._client.is_alive():
                return

            # Client died underneath us
            logger.warning(f"provider_dead: {self.provider_id}")

            # Release the dead client now: a successful restart overwrites
            # self._client, which would otherwise leak the old handle.
            stale_client, self._client = self._client, None
            if stale_client is not None:
                try:
                    stale_client.close()
                except Exception as e:
                    logger.warning(f"shutdown_error: {self.provider_id}, error={e}")

            self._state = ProviderState.DEAD
            self._increment_version()

        # Check if we can start (only a DEGRADED provider in backoff is refused)
        can_start, _reason, time_left = self._can_start()
        if not can_start:
            raise CannotStartProviderError(
                self.provider_id,
                f"backoff not elapsed, retry in {time_left:.1f}s",
                time_left,
            )

        # Start if needed
        if self._state in (
            ProviderState.COLD,
            ProviderState.DEAD,
            ProviderState.DEGRADED,
        ):
            self._start()
322
+
323
def _start(self) -> None:
    """
    Start provider process (must hold lock).

    Transitions to INITIALIZING, launches a client for the configured mode,
    performs the MCP handshake and finalizes the start. On any failure the
    newly launched client is closed, the failure is recorded with its real
    cause, and a ProviderStartError is raised.

    Raises:
        ProviderStartError: If launching, the handshake or finalization fails.
    """
    start_time = time.time()
    self._transition_to(ProviderState.INITIALIZING)

    cold_start_time = self._begin_cold_start_tracking()

    client = None
    try:
        client = self._create_client()
        self._perform_mcp_handshake(client)
        self._finalize_start(client, start_time)
        self._end_cold_start_tracking(cold_start_time, success=True)

    except Exception as e:
        self._end_cold_start_tracking(cold_start_time, success=False)

        # The launched client is only stored on self by _finalize_start, so
        # _handle_start_failure cannot see it; close it here to avoid leaking
        # the process. Skip it if it is already self._client, which
        # _handle_start_failure closes itself.
        if client is not None and client is not self._client:
            try:
                client.close()
            except Exception as close_error:
                logger.warning(f"shutdown_error: {self.provider_id}, error={close_error}")

        # Pass the exception along so the log and the degraded event carry the cause.
        self._handle_start_failure(e)

        if isinstance(e, ProviderStartError):
            raise
        raise ProviderStartError(self.provider_id, str(e)) from e
348
+
349
+ def _begin_cold_start_tracking(self) -> Optional[float]:
350
+ """Begin tracking cold start metrics. Returns start timestamp."""
351
+ try:
352
+ self._metrics_publisher.begin_cold_start(self.provider_id)
353
+ return time.time()
354
+ except Exception:
355
+ return None
356
+
357
+ def _end_cold_start_tracking(self, start_time: Optional[float], success: bool) -> None:
358
+ """End cold start tracking and record metrics."""
359
+ if start_time is None:
360
+ return
361
+ try:
362
+ if success:
363
+ duration = time.time() - start_time
364
+ self._metrics_publisher.record_cold_start(self.provider_id, duration, self._mode.value)
365
+ self._metrics_publisher.end_cold_start(self.provider_id)
366
+ except Exception:
367
+ pass
368
+
369
def _create_client(self) -> Any:
    """Look up the launcher for the current mode and launch a client with the mode's config."""
    # Imported here rather than at module level, as in the original code.
    from ..services.provider_launcher import get_launcher

    return get_launcher(self._mode.value).launch(**self._get_launch_config())
376
+
377
def _get_launch_config(self) -> Dict[str, Any]:
    """
    Build the keyword arguments passed to the launcher for the current mode.

    Subprocess mode yields the command and environment. Docker mode uses the
    configured image as-is; "container"/"podman" modes resolve the image via
    _get_container_image (which may build it). All container-style modes
    share the same remaining options.

    Raises:
        ValueError: If the mode is none of the above.
    """
    mode = self._mode

    if mode == ProviderMode.SUBPROCESS:
        return {"command": self._command, "env": self._env}

    if mode == ProviderMode.DOCKER:
        image = self._image
    elif mode.value in ("container", "podman"):
        image = self._get_container_image()
    else:
        raise ValueError(f"unsupported_mode: {mode.value}")

    limits = self._resources
    return {
        "image": image,
        "volumes": self._volumes,
        "env": self._env,
        "memory_limit": limits.get("memory", "512m"),
        "cpu_limit": limits.get("cpu", "1.0"),
        "network": self._network,
        "read_only": self._read_only,
        "user": self._user,
    }
407
+
408
def _get_container_image(self) -> str:
    """
    Return the image to run, building it first when a dockerfile is configured.

    Raises:
        ProviderStartError: If neither a build dockerfile nor an image is configured.
    """
    from ..services.image_builder import BuildConfig, get_image_builder

    build_spec = self._build
    wants_build = build_spec and build_spec.get("dockerfile")

    if not wants_build:
        # No build requested: a preconfigured image is mandatory.
        if not self._image:
            raise ProviderStartError(
                self.provider_id,
                "Container mode requires 'image' or 'build.dockerfile'",
            )
        return self._image

    runtime = "podman" if self._mode.value == "podman" else "auto"
    builder = get_image_builder(runtime=runtime)
    image = builder.build_if_needed(
        BuildConfig(
            dockerfile=build_spec["dockerfile"],
            context=build_spec.get("context", "."),
            tag=build_spec.get("tag"),
        )
    )
    logger.info(f"Built image for {self.provider_id}: {image}")
    return image
430
+
431
+ def _perform_mcp_handshake(self, client: Any) -> None:
432
+ """Perform MCP initialize and tools/list handshake."""
433
+ # Initialize
434
+ init_resp = client.call(
435
+ "initialize",
436
+ {
437
+ "protocolVersion": "2024-11-05",
438
+ "capabilities": {},
439
+ "clientInfo": {"name": "mcp-registry", "version": "1.0.0"},
440
+ },
441
+ timeout=10.0,
442
+ )
443
+
444
+ if "error" in init_resp:
445
+ error_msg = init_resp["error"].get("message", "unknown")
446
+ self._log_client_error(client, error_msg)
447
+ raise ProviderStartError(self.provider_id, f"init_failed: {error_msg}")
448
+
449
+ # Discover tools
450
+ tools_resp = client.call("tools/list", {}, timeout=10.0)
451
+ if "error" in tools_resp:
452
+ error_msg = tools_resp["error"].get("message", "unknown")
453
+ raise ProviderStartError(self.provider_id, f"tools_list_failed: {error_msg}")
454
+
455
+ tool_list = tools_resp.get("result", {}).get("tools", [])
456
+ self._tools.update_from_list(tool_list)
457
+
458
+ def _log_client_error(self, client: Any, error_msg: str) -> None:
459
+ """Log detailed error info for debugging (especially in CI)."""
460
+ if error_msg != "reader_died":
461
+ return
462
+
463
+ proc = getattr(client, "process", None)
464
+ if not proc:
465
+ return
466
+
467
+ # Try to capture stderr
468
+ stderr = getattr(proc, "stderr", None)
469
+ if stderr:
470
+ try:
471
+ err_bytes = stderr.read()
472
+ if err_bytes:
473
+ err_text = (err_bytes if isinstance(err_bytes, str) else err_bytes.decode(errors="replace")).strip()
474
+ if err_text:
475
+ logger.error(f"provider_container_stderr: {err_text}")
476
+ except Exception:
477
+ pass
478
+
479
+ # Log exit code
480
+ try:
481
+ rc = proc.poll()
482
+ if rc is not None:
483
+ logger.error(f"provider_process_exit_code: {rc}")
484
+ except Exception:
485
+ pass
486
+
487
def _finalize_start(self, client: Any, start_time: float) -> None:
    """
    Commit a successful start.

    Stores the client and fresh metadata, transitions to READY, records a
    health success, stamps the last-used time and emits a ProviderStarted
    event carrying the startup duration measured from ``start_time``.
    """
    self._client = client
    self._meta = {
        "init_result": {},
        "tools_count": self._tools.count(),
        "started_at": time.time(),
    }
    self._transition_to(ProviderState.READY)
    self._health.record_success()
    self._last_used = time.time()

    elapsed_ms = (time.time() - start_time) * 1000
    started_event = ProviderStarted(
        provider_id=self.provider_id,
        mode=self._mode.value,
        tools_count=self._tools.count(),
        startup_duration_ms=elapsed_ms,
    )
    self._record_event(started_event)

    logger.info(f"provider_started: {self.provider_id}, mode={self._mode.value}, tools={self._tools.count()}")
510
+
511
def _handle_start_failure(self, error: Optional[Exception]) -> None:
    """
    Record a failed start and pick the resulting state (must hold lock).

    Closes and drops any client stored on the provider, records a health
    failure, then moves to DEGRADED (emitting ProviderDegraded) when the
    health tracker says so, or to DEAD otherwise.

    Args:
        error: The failure cause; "unknown error" is reported when falsy.
    """
    # Clean up client if partially started
    leftover = self._client
    if leftover:
        try:
            leftover.close()
        except Exception:
            pass
        self._client = None

    health = self._health
    health.record_failure()

    error_str = "unknown error" if not error else str(error)

    # The state is assigned directly here, bypassing _transition_to validation.
    if not health.should_degrade():
        self._state = ProviderState.DEAD
        self._increment_version()
    else:
        self._state = ProviderState.DEGRADED
        self._increment_version()

        logger.warning(f"provider_degraded: {self.provider_id}, failures={self._health.consecutive_failures}")

        degraded_event = ProviderDegraded(
            provider_id=self.provider_id,
            consecutive_failures=health.consecutive_failures,
            total_failures=health.total_failures,
            reason=error_str,
        )
        self._record_event(degraded_event)

    logger.error(f"provider_start_failed: {self.provider_id}, error={error_str}")
546
+
547
def invoke_tool(self, tool_name: str, arguments: Dict[str, Any], timeout: float = 30.0) -> Dict[str, Any]:
    """
    Call ``tool_name`` on this provider and return its result.

    Thread-safe; the lock is held for the whole call. The provider is
    brought to READY first, the tool catalog is refreshed once if the tool
    is unknown, and request/completed/failed events are recorded with a
    fresh correlation id.

    Args:
        tool_name: Name of the tool to invoke
        arguments: Tool arguments
        timeout: Timeout in seconds passed to the client call

    Returns:
        The "result" entry of the response (empty dict when absent)

    Raises:
        CannotStartProviderError: If provider cannot be started
        ToolNotFoundError: If the tool is unknown even after a refresh
        ToolInvocationError: If the response carries an error or the call raises
    """
    correlation_id = str(CorrelationId())

    with self._lock:
        self.ensure_ready()

        # Unknown tool: refresh the catalog once before giving up.
        if not self._tools.has(tool_name):
            self._refresh_tools()
            if not self._tools.has(tool_name):
                raise ToolNotFoundError(self.provider_id, tool_name)

        self._health._total_invocations += 1

        requested = ToolInvocationRequested(
            provider_id=self.provider_id,
            tool_name=tool_name,
            correlation_id=correlation_id,
            arguments=arguments,
        )
        self._record_event(requested)

        began = time.time()

        try:
            payload = {"name": tool_name, "arguments": arguments}
            response = self._client.call("tools/call", payload, timeout=timeout)

            if "error" in response:
                # Provider answered, but with a tool-level error.
                fault = response["error"]
                error_msg = fault.get("message", "unknown")
                self._health.record_invocation_failure()

                self._record_event(
                    ToolInvocationFailed(
                        provider_id=self.provider_id,
                        tool_name=tool_name,
                        correlation_id=correlation_id,
                        error_message=error_msg,
                        error_type=str(fault.get("code", "unknown")),
                    )
                )

                raise ToolInvocationError(
                    self.provider_id,
                    f"tool_error: {error_msg}",
                    {"tool_name": tool_name, "correlation_id": correlation_id},
                )

            # Success
            elapsed_ms = (time.time() - began) * 1000
            self._health.record_success()
            self._last_used = time.time()

            result = response.get("result", {})
            completed = ToolInvocationCompleted(
                provider_id=self.provider_id,
                tool_name=tool_name,
                correlation_id=correlation_id,
                duration_ms=elapsed_ms,
                result_size_bytes=len(str(result)),
            )
            self._record_event(completed)

            logger.debug(f"tool_invoked: {correlation_id}, provider={self.provider_id}, tool={tool_name}")

            return result

        except ToolInvocationError:
            # Already recorded above; do not count it a second time.
            raise
        except Exception as e:
            self._health.record_failure()

            failed = ToolInvocationFailed(
                provider_id=self.provider_id,
                tool_name=tool_name,
                correlation_id=correlation_id,
                error_message=str(e),
                error_type=type(e).__name__,
            )
            self._record_event(failed)

            logger.error(
                f"tool_invocation_failed: {correlation_id}, "
                f"provider={self.provider_id}, tool={tool_name}, error={e}"
            )

            raise ToolInvocationError(
                self.provider_id,
                str(e),
                {"tool_name": tool_name, "correlation_id": correlation_id},
            ) from e
666
+
667
+ def _refresh_tools(self) -> None:
668
+ """Refresh tool catalog from provider (must hold lock)."""
669
+ if not self._client or not self._client.is_alive():
670
+ return
671
+
672
+ try:
673
+ tools_resp = self._client.call("tools/list", {}, timeout=5.0)
674
+ if "result" in tools_resp:
675
+ tool_list = tools_resp.get("result", {}).get("tools", [])
676
+ self._tools.update_from_list(tool_list)
677
+ except Exception as e:
678
+ logger.warning(f"tool_refresh_failed: {self.provider_id}, error={e}")
679
+
680
def health_check(self) -> bool:
    """
    Probe a READY provider with a ``tools/list`` call.

    Thread-safe.

    Returns:
        True when the probe succeeds. False when the provider is not READY,
        when its client is missing or dead (the provider is then marked
        DEAD), or when the probe fails (the provider may then be marked
        DEGRADED, depending on the health tracker).
    """
    with self._lock:
        if self._state != ProviderState.READY:
            return False

        client = self._client
        if not client or not client.is_alive():
            self._state = ProviderState.DEAD
            self._increment_version()
            return False

        try:
            began = time.time()
            reply = client.call("tools/list", {}, timeout=5.0)

            # An error reply is routed through the same failure path as a raised call.
            if "error" in reply:
                raise Exception(reply["error"].get("message", "unknown"))

            elapsed_ms = (time.time() - began) * 1000
            self._health.record_success()

            self._record_event(HealthCheckPassed(provider_id=self.provider_id, duration_ms=elapsed_ms))

            return True

        except Exception as e:
            health = self._health
            health.record_failure()

            failed_event = HealthCheckFailed(
                provider_id=self.provider_id,
                consecutive_failures=health.consecutive_failures,
                error_message=str(e),
            )
            self._record_event(failed_event)

            logger.warning(f"health_check_failed: {self.provider_id}, error={e}")

            if health.should_degrade():
                self._state = ProviderState.DEGRADED
                self._increment_version()

                logger.warning(f"provider_degraded_by_health_check: {self.provider_id}")

                degraded_event = ProviderDegraded(
                    provider_id=self.provider_id,
                    consecutive_failures=health.consecutive_failures,
                    total_failures=health.total_failures,
                    reason="health_check_failures",
                )
                self._record_event(degraded_event)

            return False
738
+
739
def maybe_shutdown_idle(self) -> bool:
    """
    Shut the provider down when it is READY and idle longer than its TTL.

    Thread-safe. Records a ProviderIdleDetected event before shutting down.

    Returns:
        True if a shutdown was performed, False otherwise.
    """
    with self._lock:
        if self._state != ProviderState.READY:
            return False

        idle_for = time.time() - self._last_used
        if not (idle_for > self._idle_ttl.seconds):
            return False

        idle_event = ProviderIdleDetected(
            provider_id=self.provider_id,
            idle_duration_s=idle_for,
            last_used_at=self._last_used,
        )
        self._record_event(idle_event)

        logger.info(f"provider_idle_shutdown: {self.provider_id}, idle={idle_for:.1f}s")
        self._shutdown_internal(reason="idle")
        return True
764
+
765
def shutdown(self) -> None:
    """Shut the provider down on request (public API). Thread-safe."""
    lock = self._lock
    with lock:
        self._shutdown_internal(reason="shutdown")
769
+
770
def _shutdown_internal(self, reason: str = "shutdown") -> None:
    """
    Shutdown implementation (must hold lock).

    Closes and drops the client (close errors are logged, not raised), sets
    the state to COLD, bumps the version, clears the metadata and records a
    ProviderStopped event.

    The tool catalog is cleared only when tools were NOT predefined in
    configuration: predefined tools exist so the provider stays visible
    while it is not running, and ``tools_predefined`` keeps reporting True
    after a shutdown, so the catalog must not be emptied in that case.

    Args:
        reason: Reason recorded on the ProviderStopped event.
    """
    if self._client:
        try:
            self._client.close()
        except Exception as e:
            logger.warning(f"shutdown_error: {self.provider_id}, error={e}")
        self._client = None

    # Assigned directly (not via _transition_to) so shutdown works from any state.
    self._state = ProviderState.COLD
    self._increment_version()

    if not self._tools_predefined:
        self._tools.clear()
    self._meta.clear()

    self._record_event(ProviderStopped(provider_id=self.provider_id, reason=reason))
785
+
786
+ # --- Compatibility Methods ---
787
+
788
+ def get_tool_names(self) -> List[str]:
789
+ """Get list of available tool names."""
790
+ with self._lock:
791
+ return self._tools.list_names()
792
+
793
+ def get_tools_dict(self) -> Dict[str, ToolSchema]:
794
+ """Get tools as dictionary (for backward compatibility)."""
795
+ with self._lock:
796
+ return self._tools.to_dict()
797
+
798
+ def to_status_dict(self) -> Dict[str, Any]:
799
+ """Get status as dictionary (for registry.list)."""
800
+ with self._lock:
801
+ return {
802
+ "provider": self.provider_id,
803
+ "state": self._state.value,
804
+ "alive": self._client is not None and self._client.is_alive(),
805
+ "mode": self._mode.value,
806
+ "image_or_command": self._image or self._command,
807
+ "tools_cached": self._tools.list_names(),
808
+ "health": self._health.to_dict(),
809
+ "meta": dict(self._meta),
810
+ }