omnibase_infra-0.2.1-py3-none-any.whl → omnibase_infra-0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/adapters/adapter_onex_tool_execution.py +451 -0
  3. omnibase_infra/capabilities/__init__.py +15 -0
  4. omnibase_infra/capabilities/capability_inference_rules.py +211 -0
  5. omnibase_infra/capabilities/contract_capability_extractor.py +221 -0
  6. omnibase_infra/capabilities/intent_type_extractor.py +160 -0
  7. omnibase_infra/cli/commands.py +1 -1
  8. omnibase_infra/configs/widget_mapping.yaml +176 -0
  9. omnibase_infra/contracts/handlers/filesystem/handler_contract.yaml +5 -2
  10. omnibase_infra/contracts/handlers/mcp/handler_contract.yaml +5 -2
  11. omnibase_infra/enums/__init__.py +6 -0
  12. omnibase_infra/enums/enum_handler_error_type.py +10 -0
  13. omnibase_infra/enums/enum_handler_source_mode.py +72 -0
  14. omnibase_infra/enums/enum_kafka_acks.py +99 -0
  15. omnibase_infra/errors/error_compute_registry.py +4 -1
  16. omnibase_infra/errors/error_event_bus_registry.py +4 -1
  17. omnibase_infra/errors/error_infra.py +3 -1
  18. omnibase_infra/errors/error_policy_registry.py +4 -1
  19. omnibase_infra/event_bus/event_bus_kafka.py +1 -1
  20. omnibase_infra/event_bus/models/config/model_kafka_event_bus_config.py +59 -10
  21. omnibase_infra/handlers/__init__.py +8 -1
  22. omnibase_infra/handlers/handler_consul.py +7 -1
  23. omnibase_infra/handlers/handler_db.py +10 -3
  24. omnibase_infra/handlers/handler_graph.py +10 -5
  25. omnibase_infra/handlers/handler_http.py +8 -2
  26. omnibase_infra/handlers/handler_intent.py +387 -0
  27. omnibase_infra/handlers/handler_mcp.py +745 -63
  28. omnibase_infra/handlers/handler_vault.py +11 -5
  29. omnibase_infra/handlers/mixins/mixin_consul_kv.py +4 -3
  30. omnibase_infra/handlers/mixins/mixin_consul_service.py +2 -1
  31. omnibase_infra/handlers/registration_storage/handler_registration_storage_postgres.py +7 -0
  32. omnibase_infra/handlers/service_discovery/handler_service_discovery_consul.py +308 -4
  33. omnibase_infra/handlers/service_discovery/models/model_service_info.py +10 -0
  34. omnibase_infra/mixins/mixin_async_circuit_breaker.py +3 -2
  35. omnibase_infra/mixins/mixin_node_introspection.py +42 -7
  36. omnibase_infra/mixins/mixin_retry_execution.py +1 -1
  37. omnibase_infra/models/discovery/model_introspection_config.py +11 -0
  38. omnibase_infra/models/handlers/__init__.py +48 -5
  39. omnibase_infra/models/handlers/model_bootstrap_handler_descriptor.py +162 -0
  40. omnibase_infra/models/handlers/model_contract_discovery_result.py +6 -4
  41. omnibase_infra/models/handlers/model_handler_descriptor.py +15 -0
  42. omnibase_infra/models/handlers/model_handler_source_config.py +220 -0
  43. omnibase_infra/models/mcp/__init__.py +15 -0
  44. omnibase_infra/models/mcp/model_mcp_contract_config.py +80 -0
  45. omnibase_infra/models/mcp/model_mcp_server_config.py +67 -0
  46. omnibase_infra/models/mcp/model_mcp_tool_definition.py +73 -0
  47. omnibase_infra/models/mcp/model_mcp_tool_parameter.py +35 -0
  48. omnibase_infra/models/registration/model_node_capabilities.py +11 -0
  49. omnibase_infra/models/registration/model_node_introspection_event.py +9 -0
  50. omnibase_infra/models/runtime/model_handler_contract.py +25 -9
  51. omnibase_infra/models/runtime/model_loaded_handler.py +9 -0
  52. omnibase_infra/nodes/architecture_validator/contract_architecture_validator.yaml +0 -5
  53. omnibase_infra/nodes/architecture_validator/registry/registry_infra_architecture_validator.py +17 -10
  54. omnibase_infra/nodes/effects/contract.yaml +0 -5
  55. omnibase_infra/nodes/node_registration_orchestrator/contract.yaml +7 -0
  56. omnibase_infra/nodes/node_registration_orchestrator/handlers/handler_node_introspected.py +86 -1
  57. omnibase_infra/nodes/node_registration_orchestrator/introspection_event_router.py +3 -3
  58. omnibase_infra/nodes/node_registration_orchestrator/plugin.py +1 -1
  59. omnibase_infra/nodes/node_registration_orchestrator/registry/registry_infra_node_registration_orchestrator.py +9 -8
  60. omnibase_infra/nodes/node_registration_orchestrator/timeout_coordinator.py +4 -3
  61. omnibase_infra/nodes/node_registration_orchestrator/wiring.py +14 -13
  62. omnibase_infra/nodes/node_registration_storage_effect/contract.yaml +0 -5
  63. omnibase_infra/nodes/node_registration_storage_effect/node.py +4 -1
  64. omnibase_infra/nodes/node_registration_storage_effect/registry/registry_infra_registration_storage.py +47 -26
  65. omnibase_infra/nodes/node_registry_effect/contract.yaml +0 -5
  66. omnibase_infra/nodes/node_registry_effect/handlers/handler_partial_retry.py +2 -1
  67. omnibase_infra/nodes/node_service_discovery_effect/registry/registry_infra_service_discovery.py +28 -20
  68. omnibase_infra/plugins/examples/plugin_json_normalizer.py +2 -2
  69. omnibase_infra/plugins/examples/plugin_json_normalizer_error_handling.py +2 -2
  70. omnibase_infra/plugins/plugin_compute_base.py +16 -2
  71. omnibase_infra/protocols/__init__.py +2 -0
  72. omnibase_infra/protocols/protocol_container_aware.py +200 -0
  73. omnibase_infra/protocols/protocol_event_projector.py +1 -1
  74. omnibase_infra/runtime/__init__.py +90 -1
  75. omnibase_infra/runtime/binding_config_resolver.py +102 -37
  76. omnibase_infra/runtime/constants_notification.py +75 -0
  77. omnibase_infra/runtime/contract_handler_discovery.py +6 -1
  78. omnibase_infra/runtime/handler_bootstrap_source.py +507 -0
  79. omnibase_infra/runtime/handler_contract_config_loader.py +603 -0
  80. omnibase_infra/runtime/handler_contract_source.py +267 -186
  81. omnibase_infra/runtime/handler_identity.py +81 -0
  82. omnibase_infra/runtime/handler_plugin_loader.py +19 -2
  83. omnibase_infra/runtime/handler_registry.py +11 -3
  84. omnibase_infra/runtime/handler_source_resolver.py +326 -0
  85. omnibase_infra/runtime/mixin_semver_cache.py +25 -1
  86. omnibase_infra/runtime/mixins/__init__.py +7 -0
  87. omnibase_infra/runtime/mixins/mixin_projector_notification_publishing.py +566 -0
  88. omnibase_infra/runtime/mixins/mixin_projector_sql_operations.py +31 -10
  89. omnibase_infra/runtime/models/__init__.py +24 -0
  90. omnibase_infra/runtime/models/model_health_check_result.py +2 -1
  91. omnibase_infra/runtime/models/model_projector_notification_config.py +171 -0
  92. omnibase_infra/runtime/models/model_transition_notification_outbox_config.py +112 -0
  93. omnibase_infra/runtime/models/model_transition_notification_outbox_metrics.py +140 -0
  94. omnibase_infra/runtime/models/model_transition_notification_publisher_metrics.py +357 -0
  95. omnibase_infra/runtime/projector_plugin_loader.py +1 -1
  96. omnibase_infra/runtime/projector_shell.py +229 -1
  97. omnibase_infra/runtime/protocol_lifecycle_executor.py +6 -6
  98. omnibase_infra/runtime/protocols/__init__.py +10 -0
  99. omnibase_infra/runtime/registry/registry_protocol_binding.py +16 -15
  100. omnibase_infra/runtime/registry_contract_source.py +693 -0
  101. omnibase_infra/runtime/registry_policy.py +9 -326
  102. omnibase_infra/runtime/secret_resolver.py +4 -2
  103. omnibase_infra/runtime/service_kernel.py +11 -3
  104. omnibase_infra/runtime/service_message_dispatch_engine.py +4 -2
  105. omnibase_infra/runtime/service_runtime_host_process.py +589 -106
  106. omnibase_infra/runtime/transition_notification_outbox.py +1190 -0
  107. omnibase_infra/runtime/transition_notification_publisher.py +764 -0
  108. omnibase_infra/runtime/util_container_wiring.py +6 -5
  109. omnibase_infra/runtime/util_wiring.py +17 -4
  110. omnibase_infra/schemas/schema_transition_notification_outbox.sql +245 -0
  111. omnibase_infra/services/__init__.py +21 -0
  112. omnibase_infra/services/corpus_capture.py +7 -1
  113. omnibase_infra/services/mcp/__init__.py +31 -0
  114. omnibase_infra/services/mcp/mcp_server_lifecycle.py +449 -0
  115. omnibase_infra/services/mcp/service_mcp_tool_discovery.py +411 -0
  116. omnibase_infra/services/mcp/service_mcp_tool_registry.py +329 -0
  117. omnibase_infra/services/mcp/service_mcp_tool_sync.py +547 -0
  118. omnibase_infra/services/registry_api/__init__.py +40 -0
  119. omnibase_infra/services/registry_api/main.py +261 -0
  120. omnibase_infra/services/registry_api/models/__init__.py +66 -0
  121. omnibase_infra/services/registry_api/models/model_capability_widget_mapping.py +38 -0
  122. omnibase_infra/services/registry_api/models/model_pagination_info.py +48 -0
  123. omnibase_infra/services/registry_api/models/model_registry_discovery_response.py +73 -0
  124. omnibase_infra/services/registry_api/models/model_registry_health_response.py +49 -0
  125. omnibase_infra/services/registry_api/models/model_registry_instance_view.py +88 -0
  126. omnibase_infra/services/registry_api/models/model_registry_node_view.py +88 -0
  127. omnibase_infra/services/registry_api/models/model_registry_summary.py +60 -0
  128. omnibase_infra/services/registry_api/models/model_response_list_instances.py +43 -0
  129. omnibase_infra/services/registry_api/models/model_response_list_nodes.py +51 -0
  130. omnibase_infra/services/registry_api/models/model_warning.py +49 -0
  131. omnibase_infra/services/registry_api/models/model_widget_defaults.py +28 -0
  132. omnibase_infra/services/registry_api/models/model_widget_mapping.py +51 -0
  133. omnibase_infra/services/registry_api/routes.py +371 -0
  134. omnibase_infra/services/registry_api/service.py +837 -0
  135. omnibase_infra/services/service_capability_query.py +4 -4
  136. omnibase_infra/services/service_health.py +3 -2
  137. omnibase_infra/services/service_timeout_emitter.py +20 -3
  138. omnibase_infra/services/service_timeout_scanner.py +7 -3
  139. omnibase_infra/services/session/__init__.py +56 -0
  140. omnibase_infra/services/session/config_consumer.py +120 -0
  141. omnibase_infra/services/session/config_store.py +139 -0
  142. omnibase_infra/services/session/consumer.py +1007 -0
  143. omnibase_infra/services/session/protocol_session_aggregator.py +117 -0
  144. omnibase_infra/services/session/store.py +997 -0
  145. omnibase_infra/utils/__init__.py +19 -0
  146. omnibase_infra/utils/util_atomic_file.py +261 -0
  147. omnibase_infra/utils/util_db_transaction.py +239 -0
  148. omnibase_infra/utils/util_dsn_validation.py +1 -1
  149. omnibase_infra/utils/util_retry_optimistic.py +281 -0
  150. omnibase_infra/validation/__init__.py +3 -19
  151. omnibase_infra/validation/contracts/security.validation.yaml +114 -0
  152. omnibase_infra/validation/infra_validators.py +35 -24
  153. omnibase_infra/validation/validation_exemptions.yaml +140 -9
  154. omnibase_infra/validation/validator_chain_propagation.py +2 -2
  155. omnibase_infra/validation/validator_runtime_shape.py +1 -1
  156. omnibase_infra/validation/validator_security.py +473 -370
  157. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/METADATA +3 -3
  158. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/RECORD +161 -98
  159. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/WHEEL +0 -0
  160. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/entry_points.txt +0 -0
  161. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -13,6 +13,7 @@ Key Features:
13
13
  - Dynamic tool discovery from ONEX node registry
14
14
  - Contract-to-MCP schema generation
15
15
  - Request/response correlation for observability
16
+ - Internal uvicorn server lifecycle management (OMN-1282)
16
17
 
17
18
  Note:
18
19
  This handler requires the `mcp` package (anthropic-ai/mcp-python-sdk).
@@ -21,12 +22,18 @@ Note:
21
22
 
22
23
  from __future__ import annotations
23
24
 
25
+ import asyncio
24
26
  import logging
25
27
  import time
26
28
  from typing import TYPE_CHECKING
27
29
  from uuid import UUID, uuid4
28
30
 
31
+ import uvicorn
29
32
  from pydantic import ValidationError
33
+ from starlette.applications import Starlette
34
+ from starlette.requests import Request
35
+ from starlette.responses import JSONResponse
36
+ from starlette.routing import Route
30
37
 
31
38
  from omnibase_core.models.dispatch import ModelHandlerOutput
32
39
  from omnibase_infra.enums import (
@@ -46,11 +53,18 @@ from omnibase_infra.handlers.models.mcp import (
46
53
  ModelMcpToolResult,
47
54
  )
48
55
  from omnibase_infra.mixins import MixinAsyncCircuitBreaker, MixinEnvelopeExtraction
56
+ from omnibase_infra.services.mcp import MCPServerLifecycle, ModelMCPServerConfig
49
57
 
50
58
  if TYPE_CHECKING:
51
- from collections.abc import Sequence
59
+ from collections.abc import Callable, Coroutine, Sequence
52
60
 
53
61
  from omnibase_core.models.container.model_onex_container import ModelONEXContainer
62
+ from omnibase_infra.adapters.adapter_onex_tool_execution import (
63
+ AdapterONEXToolExecution,
64
+ )
65
+ from omnibase_infra.services.mcp.service_mcp_tool_registry import (
66
+ ServiceMCPToolRegistry,
67
+ )
54
68
  from omnibase_spi.protocols.types.protocol_mcp_tool_types import (
55
69
  ProtocolMCPToolDefinition,
56
70
  )
@@ -60,6 +74,64 @@ logger = logging.getLogger(__name__)
60
74
  # Handler ID for ModelHandlerOutput
61
75
  HANDLER_ID_MCP: str = "mcp-handler"
62
76
 
77
+ # Shutdown timeout constants (can be overridden via class attributes)
78
+ _DEFAULT_SHUTDOWN_TIMEOUT: float = 5.0
79
+ _DEFAULT_CANCEL_TIMEOUT: float = 1.0
80
+ _DEFAULT_STARTUP_TIMEOUT: float = 2.0
81
+
82
+ # Error message truncation limit for health check responses
83
+ _ERROR_MESSAGE_MAX_LENGTH: int = 200
84
+
85
+
86
+ def _require_config_value[T](
87
+ config: dict[str, object],
88
+ key: str,
89
+ expected_type: type[T],
90
+ correlation_id: UUID,
91
+ ) -> T:
92
+ """Extract required config value or raise ProtocolConfigurationError.
93
+
94
+ Per CLAUDE.md configuration rules, the `.env` file is the SINGLE SOURCE OF TRUTH.
95
+ There should be ZERO hardcoded fallbacks - all configuration must be explicitly
96
+ provided. If missing, this function raises an error rather than using defaults.
97
+
98
+ Args:
99
+ config: Configuration dictionary to extract value from.
100
+ key: Configuration key to look up.
101
+ expected_type: Expected Python type for the value.
102
+ correlation_id: Correlation ID for error context.
103
+
104
+ Returns:
105
+ The validated configuration value.
106
+
107
+ Raises:
108
+ ProtocolConfigurationError: If value is missing or has wrong type.
109
+ """
110
+ value = config.get(key)
111
+ if value is None:
112
+ raise ProtocolConfigurationError(
113
+ f"Missing required config: '{key}'. Must be set in .env or runtime config.",
114
+ context=ModelInfraErrorContext.with_correlation(
115
+ correlation_id=correlation_id,
116
+ transport_type=EnumInfraTransportType.MCP,
117
+ operation="initialize",
118
+ target_name="handler_mcp",
119
+ ),
120
+ )
121
+ if not isinstance(value, expected_type):
122
+ raise ProtocolConfigurationError(
123
+ f"Invalid config type for '{key}': expected {expected_type.__name__}, "
124
+ f"got {type(value).__name__}",
125
+ context=ModelInfraErrorContext.with_correlation(
126
+ correlation_id=correlation_id,
127
+ transport_type=EnumInfraTransportType.MCP,
128
+ operation="initialize",
129
+ target_name="handler_mcp",
130
+ ),
131
+ )
132
+ return value
133
+
134
+
63
135
  # Supported operations
64
136
  _SUPPORTED_OPERATIONS: frozenset[str] = frozenset(
65
137
  {op.value for op in EnumMcpOperationType}
@@ -108,27 +180,75 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
108
180
  - Timeout enforcement via asyncio.wait_for()
109
181
  - Full observability through the ONEX runtime
110
182
  See: TODO(OMN-1288) for dispatcher integration tracking
183
+
184
+ Class Attributes:
185
+ shutdown_timeout: Timeout for graceful server shutdown (default: 5.0s).
186
+ cancel_timeout: Timeout for forced cancellation after graceful fails (default: 1.0s).
187
+ startup_timeout: Timeout for server readiness check during startup (default: 2.0s).
111
188
  """
112
189
 
113
- def __init__(self, container: ModelONEXContainer) -> None:
114
- """Initialize HandlerMCP with ONEX container for dependency injection.
190
+ # Configurable timeout attributes (can be overridden on subclasses or instances)
191
+ shutdown_timeout: float = _DEFAULT_SHUTDOWN_TIMEOUT
192
+ cancel_timeout: float = _DEFAULT_CANCEL_TIMEOUT
193
+ startup_timeout: float = _DEFAULT_STARTUP_TIMEOUT
194
+
195
+ def __init__(
196
+ self,
197
+ container: ModelONEXContainer | None = None,
198
+ registry: ServiceMCPToolRegistry | None = None,
199
+ executor: AdapterONEXToolExecution | None = None,
200
+ ) -> None:
201
+ """Initialize HandlerMCP with optional ONEX container for dependency injection.
115
202
 
116
203
  Args:
117
- container: ONEX container providing dependency injection for
118
- services, configuration, and runtime context.
204
+ container: Optional ONEX container providing dependency injection for
205
+ services, configuration, and runtime context. When None, the handler
206
+ operates in standalone mode without container-based DI.
207
+ registry: Optional MCP tool registry for dynamic tool discovery.
208
+ If provided, tools are looked up from this registry. If not
209
+ provided, the handler uses its local _tool_registry dict.
210
+ executor: Optional tool execution adapter for dispatching to
211
+ ONEX orchestrators. If provided, tool calls are routed through
212
+ this adapter. If not provided, placeholder execution is used.
119
213
 
120
214
  Note:
121
- The container is stored for interface compliance with the standard ONEX
122
- handler pattern (def __init__(self, container: ModelONEXContainer)) and
123
- to enable future DI-based service resolution (e.g., dispatcher routing,
124
- metrics integration). Currently, the handler operates independently but
125
- the container parameter ensures API consistency across all handlers.
215
+ The container parameter is optional to support two instantiation paths:
216
+ 1. Registry-based: RuntimeHostProcess creates handlers via registry lookup
217
+ with no-argument constructor calls. Container is None in this case.
218
+ 2. DI-based: Explicit container injection for full ONEX integration.
219
+
220
+ When container is provided, it enables future DI-based service resolution
221
+ (e.g., dispatcher routing, metrics integration).
222
+
223
+ MCP Integration (OMN-1281):
224
+ When registry and executor are provided, the handler operates in
225
+ "integrated mode" with full MCP tool discovery and execution:
226
+ - Tools are discovered from Consul via ServiceMCPToolDiscovery
227
+ - Tool list is cached in ServiceMCPToolRegistry
228
+ - Tool execution routes through AdapterONEXToolExecution
229
+ - Hot reload updates are received via ServiceMCPToolSync
230
+
231
+ Server Lifecycle (OMN-1282):
232
+ The handler owns its uvicorn server lifecycle. When initialize() is
233
+ called, the handler starts a uvicorn server in a background task.
234
+ When shutdown() is called, the server is gracefully stopped.
126
235
  """
127
236
  self._container = container
128
237
  self._config: ModelMcpHandlerConfig | None = None
129
238
  self._initialized: bool = False
130
239
  self._tool_registry: dict[str, ProtocolMCPToolDefinition] = {}
131
240
 
241
+ # MCP integration components (OMN-1281)
242
+ self._mcp_registry: ServiceMCPToolRegistry | None = registry
243
+ self._mcp_executor: AdapterONEXToolExecution | None = executor
244
+
245
+ # Server lifecycle components (OMN-1282)
246
+ self._server: uvicorn.Server | None = None
247
+ self._server_task: asyncio.Task[None] | None = None
248
+ self._lifecycle: MCPServerLifecycle | None = None
249
+ self._skip_server: bool = False # Track if server was intentionally skipped
250
+ self._server_started_at: float | None = None # Timestamp for uptime tracking
251
+
132
252
  @property
133
253
  def handler_type(self) -> EnumHandlerType:
134
254
  """Return the architectural role of this handler.
@@ -158,8 +278,194 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
158
278
  """
159
279
  return EnumInfraTransportType.MCP
160
280
 
281
+ def _create_json_endpoint(
282
+ self,
283
+ response_factory: Callable[[], Coroutine[object, object, dict[str, object]]],
284
+ ) -> Callable[[Request], Coroutine[object, object, JSONResponse]]:
285
+ """Create a JSON endpoint that wraps an async response factory.
286
+
287
+ This method creates a Starlette-compatible async route handler that:
288
+ 1. Calls the provided response_factory to generate response data
289
+ 2. Wraps the data in a JSONResponse
290
+
291
+ Args:
292
+ response_factory: Async callable that returns the response data dict.
293
+ The factory is called on each request to generate fresh data.
294
+
295
+ Returns:
296
+ Async function suitable for Starlette Route.
297
+ """
298
+
299
+ async def endpoint(_request: Request) -> JSONResponse:
300
+ data = await response_factory()
301
+ return JSONResponse(data)
302
+
303
+ return endpoint
304
+
305
+ def _create_health_endpoint(
306
+ self,
307
+ ) -> Callable[[Request], Coroutine[object, object, JSONResponse]]:
308
+ """Create health endpoint with explicit handler binding.
309
+
310
+ Returns a coroutine function that closes over `self` explicitly,
311
+ avoiding fragile closure patterns with intermediate variables.
312
+
313
+ Returns:
314
+ Async function suitable for Starlette Route.
315
+ """
316
+ # Capture reference explicitly in closure scope
317
+ handler = self
318
+
319
+ async def get_health_data() -> dict[str, object]:
320
+ """Return health status data for the MCP server."""
321
+ tool_count = 0
322
+ if handler._lifecycle and handler._lifecycle.registry:
323
+ tool_count = handler._lifecycle.registry.tool_count
324
+ return {
325
+ "status": "healthy",
326
+ "tool_count": tool_count,
327
+ "initialized": handler._initialized,
328
+ }
329
+
330
+ return self._create_json_endpoint(get_health_data)
331
+
332
+ def _create_tools_list_endpoint(
333
+ self,
334
+ ) -> Callable[[Request], Coroutine[object, object, JSONResponse]]:
335
+ """Create tools list endpoint with explicit handler binding.
336
+
337
+ Returns a coroutine function that closes over `self` explicitly,
338
+ avoiding fragile closure patterns with intermediate variables.
339
+
340
+ Returns:
341
+ Async function suitable for Starlette Route.
342
+ """
343
+ # Capture reference explicitly in closure scope
344
+ handler = self
345
+
346
+ async def get_tools_data() -> dict[str, object]:
347
+ """Return list of available MCP tools."""
348
+ if handler._lifecycle and handler._lifecycle.registry:
349
+ tools = await handler._lifecycle.registry.list_tools()
350
+ return {
351
+ "tools": [
352
+ {
353
+ "name": t.name,
354
+ "description": t.description,
355
+ "endpoint": t.endpoint,
356
+ }
357
+ for t in tools
358
+ ]
359
+ }
360
+ return {"tools": []}
361
+
362
+ return self._create_json_endpoint(get_tools_data)
363
+
364
+ async def _wait_for_server_ready(
365
+ self,
366
+ host: str,
367
+ port: int,
368
+ timeout: float = 2.0,
369
+ poll_interval: float = 0.05,
370
+ ) -> None:
371
+ """Wait for server to be ready by polling TCP connect.
372
+
373
+ Args:
374
+ host: Server host
375
+ port: Server port
376
+ timeout: Maximum time to wait
377
+ poll_interval: Time between connection attempts
378
+
379
+ Raises:
380
+ ProtocolConfigurationError: If server doesn't start within timeout
381
+
382
+ Note:
383
+ Circuit Breaker Failures Are NOT Recorded Here
384
+
385
+ This method is for startup verification, not runtime health checking.
386
+ TCP connect failures during startup are expected and transient - the
387
+ server is still spinning up and will become available shortly.
388
+
389
+ Circuit breaker tracking is intentionally omitted because:
390
+
391
+ 1. Startup retries are bounded and transient - the method either succeeds
392
+ within the timeout or raises ProtocolConfigurationError, ending startup.
393
+
394
+ 2. Recording startup failures would pollute circuit breaker metrics with
395
+ expected transient failures, potentially triggering an open circuit
396
+ before the server even starts.
397
+
398
+ 3. Circuit breakers are designed for runtime fault tolerance - detecting
399
+ when a previously-healthy service becomes unhealthy. Startup behavior
400
+ is fundamentally different: we expect failures until success.
401
+
402
+ 4. If the server fails to start within timeout, we fail fast with
403
+ ProtocolConfigurationError rather than entering a degraded state.
404
+
405
+ Circuit breaker tracking should occur during runtime operations (e.g.,
406
+ tool execution, health checks) where failures indicate actual service
407
+ degradation rather than expected startup latency.
408
+ """
409
+ import socket
410
+
411
+ start_time = time.perf_counter()
412
+ last_error: Exception | None = None
413
+
414
+ while time.perf_counter() - start_time < timeout:
415
+ # Check if server task has failed
416
+ if self._server_task is not None and self._server_task.done():
417
+ exc = self._server_task.exception()
418
+ if exc:
419
+ ctx = ModelInfraErrorContext.with_correlation(
420
+ transport_type=EnumInfraTransportType.MCP,
421
+ operation="server_startup",
422
+ target_name="mcp_handler",
423
+ )
424
+ raise ProtocolConfigurationError(
425
+ f"Server failed to start: {exc}",
426
+ context=ctx,
427
+ ) from exc
428
+
429
+ # Try TCP connect
430
+ try:
431
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
432
+ sock.settimeout(poll_interval)
433
+ # TCP PROTOCOL REQUIREMENT (NOT a config fallback):
434
+ # When a server binds to 0.0.0.0 (INADDR_ANY), it listens on all
435
+ # interfaces but you cannot connect() to 0.0.0.0 - it's not a
436
+ # routable address. TCP requires connecting to a specific interface.
437
+ # Using 127.0.0.1 (loopback) is the correct way to reach a local
438
+ # server that bound to 0.0.0.0. This is standard TCP/IP behavior,
439
+ # not an environment configuration fallback.
440
+ effective_host = "127.0.0.1" if host == "0.0.0.0" else host # noqa: S104
441
+ result = sock.connect_ex((effective_host, port))
442
+ sock.close()
443
+ if result == 0:
444
+ return # Server is ready
445
+ except Exception as e:
446
+ last_error = e
447
+
448
+ await asyncio.sleep(poll_interval)
449
+
450
+ # Timeout reached
451
+ ctx = ModelInfraErrorContext.with_correlation(
452
+ transport_type=EnumInfraTransportType.MCP,
453
+ operation="server_startup",
454
+ target_name="mcp_handler",
455
+ )
456
+ raise ProtocolConfigurationError(
457
+ f"Server failed to start within {timeout}s. Last error: {last_error}",
458
+ context=ctx,
459
+ )
460
+
161
461
  async def initialize(self, config: dict[str, object]) -> None:
162
- """Initialize MCP handler with configuration.
462
+ """Initialize MCP handler with configuration and optionally start uvicorn server.
463
+
464
+ This method performs the following steps:
465
+ 1. Parse and validate handler configuration
466
+ 2. Initialize MCPServerLifecycle for tool discovery (unless skip_server=True)
467
+ 3. Create Starlette app with /health and /mcp/tools endpoints
468
+ 4. Start uvicorn server in a background task (unless skip_server=True)
163
469
 
164
470
  Args:
165
471
  config: Configuration dict containing:
@@ -170,9 +476,19 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
170
476
  - json_response: Return JSON responses (default: True)
171
477
  - timeout_seconds: Tool execution timeout (default: 30.0)
172
478
  - max_tools: Maximum tools to expose (default: 100)
479
+ - consul_host: Consul server hostname (REQUIRED - no default)
480
+ - consul_port: Consul server port (REQUIRED - no default)
481
+ - kafka_enabled: Whether to enable Kafka hot reload (REQUIRED - no default)
482
+ - dev_mode: Whether to run in development mode (REQUIRED - no default)
483
+ - contracts_dir: Directory for contract scanning in dev mode (optional)
484
+ - skip_server: Skip starting uvicorn server (default: False).
485
+ Use for unit testing to avoid port binding.
173
486
 
174
487
  Raises:
175
- ProtocolConfigurationError: If configuration is invalid.
488
+ ProtocolConfigurationError: If configuration is invalid or required
489
+ config values (consul_host, consul_port, kafka_enabled, dev_mode)
490
+ are missing. Per CLAUDE.md, .env is the single source of truth -
491
+ no hardcoded fallbacks are used.
176
492
  """
177
493
  init_correlation_id = uuid4()
178
494
 
@@ -203,22 +519,175 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
203
519
  transport_type=EnumInfraTransportType.MCP,
204
520
  )
205
521
 
206
- # Note: The MCP server is created lazily when start_server() is called
207
- # This allows the handler to be initialized before tools are registered
522
+ # Check if server startup should be skipped (for unit testing)
523
+ skip_server_val = config.get("skip_server")
524
+ skip_server: bool = (
525
+ skip_server_val if isinstance(skip_server_val, bool) else False
526
+ )
527
+ self._skip_server = skip_server
528
+
529
+ if not skip_server:
530
+ # Build MCPServerConfig from handler config (OMN-1282)
531
+ # Map handler config fields to lifecycle config fields
532
+ #
533
+ # Per CLAUDE.md: .env is the SINGLE SOURCE OF TRUTH.
534
+ # No hardcoded fallbacks - all required config must be explicit.
535
+ # The _require_config_value helper validates type, cast() is for mypy.
536
+ consul_host = _require_config_value(
537
+ config, "consul_host", str, init_correlation_id
538
+ )
539
+ consul_port = _require_config_value(
540
+ config, "consul_port", int, init_correlation_id
541
+ )
542
+ kafka_enabled = _require_config_value(
543
+ config, "kafka_enabled", bool, init_correlation_id
544
+ )
545
+ dev_mode = _require_config_value(
546
+ config, "dev_mode", bool, init_correlation_id
547
+ )
548
+ # contracts_dir is optional - only used when dev_mode=True
549
+ contracts_dir_val = config.get("contracts_dir")
550
+ contracts_dir: str | None = (
551
+ contracts_dir_val if isinstance(contracts_dir_val, str) else None
552
+ )
553
+
554
+ server_config = ModelMCPServerConfig(
555
+ consul_host=consul_host,
556
+ consul_port=consul_port,
557
+ kafka_enabled=kafka_enabled,
558
+ http_host=self._config.host,
559
+ http_port=self._config.port,
560
+ default_timeout=self._config.timeout_seconds,
561
+ dev_mode=dev_mode,
562
+ contracts_dir=contracts_dir,
563
+ )
564
+
565
+ # Wrap entire server startup in try/except to ensure cleanup
566
+ # if ANY step fails after lifecycle starts. This prevents:
567
+ # - Orphan lifecycle resources (registry, executor, sync)
568
+ # - Orphan server tasks
569
+ # - Resource leaks from partial initialization
570
+ try:
571
+ # Create and start MCPServerLifecycle for tool discovery
572
+ # Container is required for lifecycle initialization
573
+ if self._container is None:
574
+ raise ValueError(
575
+ "Container required for MCPServerLifecycle initialization"
576
+ )
577
+ self._lifecycle = MCPServerLifecycle(
578
+ container=self._container,
579
+ config=server_config,
580
+ bus=None,
581
+ )
582
+ await self._lifecycle.start()
583
+
584
+ # Update MCP registry and executor references from lifecycle
585
+ if self._lifecycle.registry is not None:
586
+ self._mcp_registry = self._lifecycle.registry
587
+ if self._lifecycle.executor is not None:
588
+ self._mcp_executor = self._lifecycle.executor
589
+
590
+ # Create Starlette app with HTTP endpoints (OMN-1282)
591
+ # Use factory methods for explicit handler reference binding
592
+ health_endpoint = self._create_health_endpoint()
593
+ tools_list_endpoint = self._create_tools_list_endpoint()
594
+
595
+ app = Starlette(
596
+ routes=[
597
+ Route("/health", health_endpoint, methods=["GET"]),
598
+ Route("/mcp/tools", tools_list_endpoint, methods=["GET"]),
599
+ ],
600
+ )
601
+
602
+ # Create uvicorn server config and server
603
+ uvicorn_config = uvicorn.Config(
604
+ app=app,
605
+ host=self._config.host,
606
+ port=self._config.port,
607
+ log_level="info",
608
+ )
609
+ self._server = uvicorn.Server(uvicorn_config)
610
+
611
+ # Start server in background task
612
+ self._server_task = asyncio.create_task(self._server.serve())
613
+
614
+ # Wait for server to be ready before marking as initialized
615
+ await self._wait_for_server_ready(
616
+ self._config.host,
617
+ self._config.port,
618
+ timeout=self.startup_timeout,
619
+ )
620
+ self._server_started_at = time.time()
621
+
622
+ except Exception as startup_error:
623
+ # Any failure during server startup - clean up all resources
624
+ # This handles failures in:
625
+ # - lifecycle.start() (Consul/contract discovery)
626
+ # - Starlette app creation
627
+ # - uvicorn config/server creation
628
+ # - server task creation
629
+ # - server readiness check
630
+ logger.exception(
631
+ "MCP server startup failed, cleaning up resources",
632
+ extra={
633
+ "host": self._config.host,
634
+ "port": self._config.port,
635
+ "lifecycle_created": self._lifecycle is not None,
636
+ "server_created": self._server is not None,
637
+ "server_task_created": self._server_task is not None,
638
+ "correlation_id": str(init_correlation_id),
639
+ },
640
+ )
641
+ # shutdown() safely handles partially initialized state:
642
+ # - Checks each component before cleanup
643
+ # - Safe to call even if components weren't created
644
+ await self.shutdown()
645
+ ctx = ModelInfraErrorContext(
646
+ transport_type=EnumInfraTransportType.MCP,
647
+ operation="initialize",
648
+ target_name="mcp_handler",
649
+ correlation_id=init_correlation_id,
650
+ )
651
+ raise ProtocolConfigurationError(
652
+ f"MCP server startup failed: {startup_error}",
653
+ context=ctx,
654
+ ) from startup_error
655
+
208
656
  self._initialized = True
209
657
 
210
- logger.info(
211
- "%s initialized successfully",
212
- self.__class__.__name__,
213
- extra={
214
- "handler": self.__class__.__name__,
215
- "host": self._config.host,
216
- "port": self._config.port,
217
- "path": self._config.path,
218
- "stateless": self._config.stateless,
219
- "correlation_id": str(init_correlation_id),
220
- },
221
- )
658
+ tool_count = 0
659
+ if self._lifecycle and self._lifecycle.registry:
660
+ tool_count = self._lifecycle.registry.tool_count
661
+
662
+ if skip_server:
663
+ logger.info(
664
+ "%s initialized successfully (server skipped)",
665
+ self.__class__.__name__,
666
+ extra={
667
+ "handler": self.__class__.__name__,
668
+ "host": self._config.host,
669
+ "port": self._config.port,
670
+ "path": self._config.path,
671
+ "stateless": self._config.stateless,
672
+ "skip_server": True,
673
+ "correlation_id": str(init_correlation_id),
674
+ },
675
+ )
676
+ else:
677
+ logger.info(
678
+ "%s initialized successfully - uvicorn server running",
679
+ self.__class__.__name__,
680
+ extra={
681
+ "handler": self.__class__.__name__,
682
+ "host": self._config.host,
683
+ "port": self._config.port,
684
+ "path": self._config.path,
685
+ "stateless": self._config.stateless,
686
+ "tool_count": tool_count,
687
+ "url": f"http://{self._config.host}:{self._config.port}",
688
+ "correlation_id": str(init_correlation_id),
689
+ },
690
+ )
222
691
 
223
692
  except ValidationError as e:
224
693
  ctx = ModelInfraErrorContext(
@@ -242,11 +711,102 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
242
711
  ) from e
243
712
 
244
713
  async def shutdown(self) -> None:
245
- """Shutdown MCP handler and release resources."""
714
+ """Shutdown MCP handler with timeout protection.
715
+
716
+ This method performs graceful shutdown with timeout protection:
717
+ 1. Signal uvicorn server to stop
718
+ 2. Wait for server task with timeout (max 5s graceful, 1s forced)
719
+ 3. Shutdown MCPServerLifecycle (registry, discovery, sync)
720
+ 4. Clear tool registry and reset state
721
+
722
+ Safe to call multiple times. Never hangs indefinitely (max ~6s with defaults).
723
+
724
+ Note:
725
+ Timeouts are configurable via class attributes:
726
+ - shutdown_timeout: Graceful shutdown timeout (default: 5.0s)
727
+ - cancel_timeout: Forced cancellation timeout (default: 1.0s)
728
+ """
729
+ shutdown_correlation_id = uuid4()
730
+
731
+ logger.info(
732
+ "Shutting down %s",
733
+ self.__class__.__name__,
734
+ extra={
735
+ "handler": self.__class__.__name__,
736
+ "correlation_id": str(shutdown_correlation_id),
737
+ },
738
+ )
739
+
740
+ # Stop uvicorn server with timeout protection (OMN-1282)
741
+ if (
742
+ self._server is not None
743
+ and self._server_task is not None
744
+ and not self._skip_server
745
+ ):
746
+ # Signal server to stop
747
+ self._server.should_exit = True
748
+
749
+ try:
750
+ # Wait for graceful shutdown with timeout
751
+ logger.debug(
752
+ "Waiting for server task to complete",
753
+ extra={
754
+ "timeout_seconds": self.shutdown_timeout,
755
+ "correlation_id": str(shutdown_correlation_id),
756
+ },
757
+ )
758
+ await asyncio.wait_for(self._server_task, timeout=self.shutdown_timeout)
759
+ except TimeoutError:
760
+ logger.warning(
761
+ "Server shutdown timed out, forcing cancellation",
762
+ extra={
763
+ "timeout_seconds": self.shutdown_timeout,
764
+ "correlation_id": str(shutdown_correlation_id),
765
+ },
766
+ )
767
+ self._server_task.cancel()
768
+ try:
769
+ await asyncio.wait_for(
770
+ self._server_task, timeout=self.cancel_timeout
771
+ )
772
+ except (TimeoutError, asyncio.CancelledError):
773
+ pass # Best effort
774
+ except asyncio.CancelledError:
775
+ logger.debug(
776
+ "Server task was cancelled",
777
+ extra={"correlation_id": str(shutdown_correlation_id)},
778
+ )
779
+
780
+ # Shutdown lifecycle (registry, discovery, sync)
781
+ if self._lifecycle is not None:
782
+ logger.debug(
783
+ "Shutting down MCPServerLifecycle",
784
+ extra={"correlation_id": str(shutdown_correlation_id)},
785
+ )
786
+ await self._lifecycle.shutdown()
787
+ self._lifecycle = None
788
+
789
+ # Clear registry and executor references
790
+ self._mcp_registry = None
791
+ self._mcp_executor = None
792
+
793
+ # Clear all state
246
794
  self._tool_registry.clear()
247
795
  self._config = None
248
796
  self._initialized = False
249
- logger.info("HandlerMCP shutdown complete")
797
+ self._server = None
798
+ self._server_task = None
799
+ self._skip_server = False
800
+ self._server_started_at = None
801
+
802
+ logger.info(
803
+ "%s shutdown complete",
804
+ self.__class__.__name__,
805
+ extra={
806
+ "handler": self.__class__.__name__,
807
+ "correlation_id": str(shutdown_correlation_id),
808
+ },
809
+ )
250
810
 
251
811
  async def execute(
252
812
  self, envelope: dict[str, object]
@@ -566,23 +1126,23 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
566
1126
  ) -> dict[str, object]:
567
1127
  """Execute a registered tool.
568
1128
 
569
- This method delegates to the ONEX node that provides this tool.
570
- The actual implementation will route through the ONEX dispatcher.
1129
+ This method delegates to the ONEX orchestrator that provides this tool.
1130
+ When operating in integrated mode (with registry and executor), the tool
1131
+ is looked up from the MCP registry and executed via the execution adapter.
571
1132
 
572
1133
  Circuit breaker protection is applied to prevent cascading failures
573
1134
  when tool execution repeatedly fails.
574
1135
 
575
- Timeout Enforcement:
576
- The tool execution timeout (config.timeout_seconds, default: 30.0s)
577
- will be enforced when dispatcher integration is complete. The timeout
578
- will be applied using asyncio.wait_for() around the dispatcher call.
1136
+ Integration Mode (OMN-1281):
1137
+ When _mcp_registry and _mcp_executor are configured:
1138
+ 1. Look up the tool definition from the MCP registry
1139
+ 2. Delegate execution to AdapterONEXToolExecution
1140
+ 3. The adapter dispatches to the orchestrator endpoint
1141
+ 4. Timeout is enforced by the adapter using the tool's timeout_seconds
579
1142
 
580
- Currently, timeout enforcement is handled at the protocol level by:
581
- - uvicorn request timeout settings
582
- - MCP SDK's internal timeout handling
583
- - HTTP client timeouts on the caller side
584
-
585
- See: TODO(OMN-1288) for dispatcher timeout integration
1143
+ Legacy Mode:
1144
+ When registry/executor are not configured, returns placeholder response
1145
+ for backward compatibility.
586
1146
 
587
1147
  Args:
588
1148
  tool_name: Name of the tool to execute.
@@ -593,26 +1153,54 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
593
1153
  Tool execution result.
594
1154
 
595
1155
  Raises:
596
- InfraUnavailableError: If tool execution fails or circuit is open.
1156
+ InfraUnavailableError: If tool not found or execution fails.
597
1157
  """
598
1158
  # Check circuit breaker before tool execution
599
1159
  async with self._circuit_breaker_lock:
600
1160
  await self._check_circuit_breaker("execute_tool", correlation_id)
601
1161
 
602
1162
  try:
603
- # TODO(OMN-1288): Implement actual tool execution via ONEX dispatcher
604
- # Integration plan:
605
- # 1. Look up the ONEX node that provides this tool from container registry
606
- # 2. Build a ModelEventEnvelope for the node with proper correlation ID
607
- # 3. Dispatch to the node via the ONEX runtime dispatcher
608
- # 4. Apply timeout enforcement via asyncio.wait_for(dispatch(), timeout)
609
- # using self._config.timeout_seconds (default: 30.0s)
610
- # 5. Transform the node response to MCP-compatible format
611
- # 6. Handle dispatcher errors (timeout, node not found, execution failure)
612
- #
613
- # For now, return a placeholder response
1163
+ # Integrated mode: use MCP registry and executor (OMN-1281)
1164
+ if self._mcp_registry is not None and self._mcp_executor is not None:
1165
+ # Look up tool from registry
1166
+ tool = await self._mcp_registry.get_tool(tool_name)
1167
+ if tool is None:
1168
+ ctx = ModelInfraErrorContext.with_correlation(
1169
+ correlation_id=correlation_id,
1170
+ transport_type=self.transport_type,
1171
+ operation="execute_tool",
1172
+ target_name=tool_name,
1173
+ )
1174
+ raise InfraUnavailableError(
1175
+ f"Tool not found: {tool_name}",
1176
+ context=ctx,
1177
+ )
1178
+
1179
+ logger.info(
1180
+ "Executing MCP tool via adapter",
1181
+ extra={
1182
+ "tool_name": tool_name,
1183
+ "argument_count": len(arguments),
1184
+ "correlation_id": str(correlation_id),
1185
+ },
1186
+ )
1187
+
1188
+ # Execute via adapter
1189
+ result = await self._mcp_executor.execute(
1190
+ tool=tool,
1191
+ arguments=arguments,
1192
+ correlation_id=correlation_id,
1193
+ )
1194
+
1195
+ # Reset circuit breaker on success
1196
+ async with self._circuit_breaker_lock:
1197
+ await self._reset_circuit_breaker()
1198
+
1199
+ return result
1200
+
1201
+ # Legacy mode: placeholder response for backward compatibility
614
1202
  logger.info(
615
- "Tool execution requested",
1203
+ "Tool execution requested (placeholder mode)",
616
1204
  extra={
617
1205
  "tool_name": tool_name,
618
1206
  "argument_count": len(arguments),
@@ -620,7 +1208,7 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
620
1208
  },
621
1209
  )
622
1210
 
623
- result: dict[str, object] = {
1211
+ placeholder_result: dict[str, object] = {
624
1212
  "message": f"Tool '{tool_name}' executed successfully",
625
1213
  "arguments_received": list(arguments.keys()),
626
1214
  }
@@ -629,7 +1217,13 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
629
1217
  async with self._circuit_breaker_lock:
630
1218
  await self._reset_circuit_breaker()
631
1219
 
632
- return result
1220
+ return placeholder_result
1221
+
1222
+ except InfraUnavailableError:
1223
+ # Record failure in circuit breaker and re-raise
1224
+ async with self._circuit_breaker_lock:
1225
+ await self._record_circuit_failure("execute_tool", correlation_id)
1226
+ raise
633
1227
 
634
1228
  except Exception:
635
1229
  # Record failure in circuit breaker
@@ -706,7 +1300,7 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
706
1300
 
707
1301
  Returns:
708
1302
  dict containing handler type, category, transport type,
709
- supported operations, configuration, and tool count.
1303
+ supported operations, configuration, tool count, and server state.
710
1304
  """
711
1305
  config_dict: dict[str, object] = {}
712
1306
  if self._config:
@@ -720,28 +1314,116 @@ class HandlerMCP(MixinEnvelopeExtraction, MixinAsyncCircuitBreaker):
720
1314
  "max_tools": self._config.max_tools,
721
1315
  }
722
1316
 
1317
+ # Include lifecycle tool count if available (OMN-1282)
1318
+ tool_count = len(self._tool_registry)
1319
+ if self._lifecycle and self._lifecycle.registry:
1320
+ tool_count = self._lifecycle.registry.tool_count
1321
+
723
1322
  return {
724
1323
  "handler_type": self.handler_type.value,
725
1324
  "handler_category": self.handler_category.value,
726
1325
  "transport_type": self.transport_type.value,
727
1326
  "supported_operations": sorted(_SUPPORTED_OPERATIONS),
728
- "tool_count": len(self._tool_registry),
1327
+ "tool_count": tool_count,
729
1328
  "config": config_dict,
730
1329
  "initialized": self._initialized,
1330
+ "server_running": self._server is not None
1331
+ and self._server_task is not None,
1332
+ "lifecycle_running": self._lifecycle is not None
1333
+ and self._lifecycle.is_running,
731
1334
  "version": "0.1.0-mvp",
732
1335
  }
733
1336
 
734
1337
  async def health_check(self) -> dict[str, object]:
735
- """Check handler health and connectivity.
1338
+ """Check handler health and server status.
736
1339
 
737
- Returns:
738
- Health status including initialization state and tool count.
1340
+ Returns unhealthy if:
1341
+ - Not initialized
1342
+ - Server task has crashed/completed unexpectedly
1343
+ - Server task was cancelled
1344
+
1345
+ Note:
1346
+ When skip_server=True was used during initialization, the handler is
1347
+ considered healthy if initialized, even without a running server.
1348
+ This enables unit testing without actual port binding.
739
1349
  """
1350
+ if not self._initialized:
1351
+ return {
1352
+ "healthy": False,
1353
+ "reason": "not_initialized",
1354
+ "transport_type": self.transport_type.value,
1355
+ }
1356
+
1357
+ if self._skip_server:
1358
+ return {
1359
+ "healthy": True,
1360
+ "skip_server": True,
1361
+ "transport_type": self.transport_type.value,
1362
+ "initialized": True,
1363
+ }
1364
+
1365
+ # Capture server task reference once to avoid TOCTOU race conditions.
1366
+ # If _server_task is reassigned (e.g., by concurrent shutdown()),
1367
+ # we work with the captured reference consistently.
1368
+ server_task = self._server_task
1369
+
1370
+ # Check server task state
1371
+ if server_task is None:
1372
+ return {
1373
+ "healthy": False,
1374
+ "reason": "server_task_missing",
1375
+ "transport_type": self.transport_type.value,
1376
+ "initialized": True,
1377
+ }
1378
+
1379
+ if server_task.done():
1380
+ # Task completed - check why
1381
+ if server_task.cancelled():
1382
+ return {
1383
+ "healthy": False,
1384
+ "reason": "server_cancelled",
1385
+ "transport_type": self.transport_type.value,
1386
+ "initialized": True,
1387
+ }
1388
+
1389
+ exc = server_task.exception()
1390
+ if exc is not None:
1391
+ return {
1392
+ "healthy": False,
1393
+ "reason": "server_crashed",
1394
+ "error": str(exc)[:_ERROR_MESSAGE_MAX_LENGTH],
1395
+ "transport_type": self.transport_type.value,
1396
+ "initialized": True,
1397
+ }
1398
+
1399
+ # Exited cleanly but unexpectedly
1400
+ return {
1401
+ "healthy": False,
1402
+ "reason": "server_exited",
1403
+ "transport_type": self.transport_type.value,
1404
+ "initialized": True,
1405
+ }
1406
+
1407
+ # Task is still running - healthy
1408
+ # Include lifecycle tool count if available (OMN-1282)
1409
+ tool_count = len(self._tool_registry)
1410
+ if self._lifecycle and self._lifecycle.registry:
1411
+ tool_count = self._lifecycle.registry.tool_count
1412
+
1413
+ lifecycle_running = self._lifecycle is not None and self._lifecycle.is_running
1414
+
740
1415
  return {
741
- "healthy": self._initialized,
742
- "initialized": self._initialized,
743
- "tool_count": len(self._tool_registry),
1416
+ "healthy": True,
1417
+ "initialized": True,
1418
+ "server_running": True,
1419
+ "tool_count": tool_count,
744
1420
  "transport_type": self.transport_type.value,
1421
+ "lifecycle_running": lifecycle_running,
1422
+ "uptime_seconds": (
1423
+ time.time() - self._server_started_at
1424
+ if self._server_started_at is not None
1425
+ else None
1426
+ ),
745
1427
  }
746
1428
 
747
1429