omnibase_infra 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/adapters/adapter_onex_tool_execution.py +446 -0
  3. omnibase_infra/cli/commands.py +1 -1
  4. omnibase_infra/configs/widget_mapping.yaml +176 -0
  5. omnibase_infra/contracts/handlers/filesystem/handler_contract.yaml +4 -1
  6. omnibase_infra/contracts/handlers/mcp/handler_contract.yaml +4 -1
  7. omnibase_infra/errors/error_compute_registry.py +4 -1
  8. omnibase_infra/errors/error_event_bus_registry.py +4 -1
  9. omnibase_infra/errors/error_infra.py +3 -1
  10. omnibase_infra/errors/error_policy_registry.py +4 -1
  11. omnibase_infra/handlers/handler_db.py +2 -1
  12. omnibase_infra/handlers/handler_graph.py +10 -5
  13. omnibase_infra/handlers/handler_mcp.py +736 -63
  14. omnibase_infra/handlers/mixins/mixin_consul_kv.py +4 -3
  15. omnibase_infra/handlers/mixins/mixin_consul_service.py +2 -1
  16. omnibase_infra/handlers/service_discovery/handler_service_discovery_consul.py +301 -4
  17. omnibase_infra/handlers/service_discovery/models/model_service_info.py +10 -0
  18. omnibase_infra/mixins/mixin_async_circuit_breaker.py +3 -2
  19. omnibase_infra/mixins/mixin_node_introspection.py +24 -7
  20. omnibase_infra/mixins/mixin_retry_execution.py +1 -1
  21. omnibase_infra/models/handlers/__init__.py +10 -0
  22. omnibase_infra/models/handlers/model_bootstrap_handler_descriptor.py +162 -0
  23. omnibase_infra/models/handlers/model_handler_descriptor.py +15 -0
  24. omnibase_infra/models/mcp/__init__.py +15 -0
  25. omnibase_infra/models/mcp/model_mcp_contract_config.py +80 -0
  26. omnibase_infra/models/mcp/model_mcp_server_config.py +67 -0
  27. omnibase_infra/models/mcp/model_mcp_tool_definition.py +73 -0
  28. omnibase_infra/models/mcp/model_mcp_tool_parameter.py +35 -0
  29. omnibase_infra/models/registration/model_node_capabilities.py +11 -0
  30. omnibase_infra/nodes/architecture_validator/contract_architecture_validator.yaml +0 -5
  31. omnibase_infra/nodes/architecture_validator/registry/registry_infra_architecture_validator.py +17 -10
  32. omnibase_infra/nodes/effects/contract.yaml +0 -5
  33. omnibase_infra/nodes/node_registration_orchestrator/contract.yaml +7 -0
  34. omnibase_infra/nodes/node_registration_orchestrator/handlers/handler_node_introspected.py +86 -1
  35. omnibase_infra/nodes/node_registration_orchestrator/introspection_event_router.py +3 -3
  36. omnibase_infra/nodes/node_registration_orchestrator/registry/registry_infra_node_registration_orchestrator.py +9 -8
  37. omnibase_infra/nodes/node_registration_orchestrator/wiring.py +14 -13
  38. omnibase_infra/nodes/node_registration_storage_effect/contract.yaml +0 -5
  39. omnibase_infra/nodes/node_registration_storage_effect/registry/registry_infra_registration_storage.py +46 -25
  40. omnibase_infra/nodes/node_registry_effect/contract.yaml +0 -5
  41. omnibase_infra/nodes/node_registry_effect/handlers/handler_partial_retry.py +2 -1
  42. omnibase_infra/nodes/node_service_discovery_effect/registry/registry_infra_service_discovery.py +24 -19
  43. omnibase_infra/plugins/examples/plugin_json_normalizer.py +2 -2
  44. omnibase_infra/plugins/examples/plugin_json_normalizer_error_handling.py +2 -2
  45. omnibase_infra/plugins/plugin_compute_base.py +16 -2
  46. omnibase_infra/protocols/protocol_event_projector.py +1 -1
  47. omnibase_infra/runtime/__init__.py +51 -1
  48. omnibase_infra/runtime/binding_config_resolver.py +102 -37
  49. omnibase_infra/runtime/constants_notification.py +75 -0
  50. omnibase_infra/runtime/contract_handler_discovery.py +6 -1
  51. omnibase_infra/runtime/handler_bootstrap_source.py +514 -0
  52. omnibase_infra/runtime/handler_contract_config_loader.py +603 -0
  53. omnibase_infra/runtime/handler_contract_source.py +289 -167
  54. omnibase_infra/runtime/handler_plugin_loader.py +4 -2
  55. omnibase_infra/runtime/mixin_semver_cache.py +25 -1
  56. omnibase_infra/runtime/mixins/__init__.py +7 -0
  57. omnibase_infra/runtime/mixins/mixin_projector_notification_publishing.py +566 -0
  58. omnibase_infra/runtime/mixins/mixin_projector_sql_operations.py +31 -10
  59. omnibase_infra/runtime/models/__init__.py +24 -0
  60. omnibase_infra/runtime/models/model_health_check_result.py +2 -1
  61. omnibase_infra/runtime/models/model_projector_notification_config.py +171 -0
  62. omnibase_infra/runtime/models/model_transition_notification_outbox_config.py +112 -0
  63. omnibase_infra/runtime/models/model_transition_notification_outbox_metrics.py +140 -0
  64. omnibase_infra/runtime/models/model_transition_notification_publisher_metrics.py +357 -0
  65. omnibase_infra/runtime/projector_plugin_loader.py +1 -1
  66. omnibase_infra/runtime/projector_shell.py +229 -1
  67. omnibase_infra/runtime/protocols/__init__.py +10 -0
  68. omnibase_infra/runtime/registry/registry_protocol_binding.py +3 -2
  69. omnibase_infra/runtime/registry_policy.py +9 -326
  70. omnibase_infra/runtime/secret_resolver.py +4 -2
  71. omnibase_infra/runtime/service_kernel.py +10 -2
  72. omnibase_infra/runtime/service_message_dispatch_engine.py +4 -2
  73. omnibase_infra/runtime/service_runtime_host_process.py +225 -15
  74. omnibase_infra/runtime/transition_notification_outbox.py +1190 -0
  75. omnibase_infra/runtime/transition_notification_publisher.py +764 -0
  76. omnibase_infra/runtime/util_container_wiring.py +6 -5
  77. omnibase_infra/runtime/util_wiring.py +5 -1
  78. omnibase_infra/schemas/schema_transition_notification_outbox.sql +245 -0
  79. omnibase_infra/services/mcp/__init__.py +31 -0
  80. omnibase_infra/services/mcp/mcp_server_lifecycle.py +443 -0
  81. omnibase_infra/services/mcp/service_mcp_tool_discovery.py +411 -0
  82. omnibase_infra/services/mcp/service_mcp_tool_registry.py +329 -0
  83. omnibase_infra/services/mcp/service_mcp_tool_sync.py +547 -0
  84. omnibase_infra/services/registry_api/__init__.py +40 -0
  85. omnibase_infra/services/registry_api/main.py +243 -0
  86. omnibase_infra/services/registry_api/models/__init__.py +66 -0
  87. omnibase_infra/services/registry_api/models/model_capability_widget_mapping.py +38 -0
  88. omnibase_infra/services/registry_api/models/model_pagination_info.py +48 -0
  89. omnibase_infra/services/registry_api/models/model_registry_discovery_response.py +73 -0
  90. omnibase_infra/services/registry_api/models/model_registry_health_response.py +49 -0
  91. omnibase_infra/services/registry_api/models/model_registry_instance_view.py +88 -0
  92. omnibase_infra/services/registry_api/models/model_registry_node_view.py +88 -0
  93. omnibase_infra/services/registry_api/models/model_registry_summary.py +60 -0
  94. omnibase_infra/services/registry_api/models/model_response_list_instances.py +43 -0
  95. omnibase_infra/services/registry_api/models/model_response_list_nodes.py +51 -0
  96. omnibase_infra/services/registry_api/models/model_warning.py +49 -0
  97. omnibase_infra/services/registry_api/models/model_widget_defaults.py +28 -0
  98. omnibase_infra/services/registry_api/models/model_widget_mapping.py +51 -0
  99. omnibase_infra/services/registry_api/routes.py +371 -0
  100. omnibase_infra/services/registry_api/service.py +846 -0
  101. omnibase_infra/services/service_capability_query.py +4 -4
  102. omnibase_infra/services/service_health.py +3 -2
  103. omnibase_infra/services/service_timeout_emitter.py +13 -2
  104. omnibase_infra/utils/util_dsn_validation.py +1 -1
  105. omnibase_infra/validation/__init__.py +3 -19
  106. omnibase_infra/validation/contracts/security.validation.yaml +114 -0
  107. omnibase_infra/validation/infra_validators.py +35 -24
  108. omnibase_infra/validation/validation_exemptions.yaml +113 -9
  109. omnibase_infra/validation/validator_chain_propagation.py +2 -2
  110. omnibase_infra/validation/validator_runtime_shape.py +1 -1
  111. omnibase_infra/validation/validator_security.py +473 -370
  112. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/METADATA +2 -2
  113. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/RECORD +116 -74
  114. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/WHEEL +0 -0
  115. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/entry_points.txt +0 -0
  116. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,764 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2025 OmniNode Team
3
+ """
4
+ Transition Notification Publisher Implementation.
5
+
6
+ Publishes state transition notifications after projection commits. This enables
7
+ orchestrators to reliably detect state transitions via the Observer pattern,
8
+ maintaining loose coupling between reducers and workflow coordinators.
9
+
10
+ Architecture Overview:
11
+ This service implements post-commit notification publishing in the ONEX
12
+ state machine architecture:
13
+
14
+ 1. Reducers commit state transitions to projections
15
+ 2. Post-commit hook creates ModelStateTransitionNotification
16
+ 3. TransitionNotificationPublisher publishes to event bus
17
+ 4. Orchestrators subscribe and coordinate downstream workflows
18
+
19
+ ```
20
+ Reducer -> Projection Commit -> Notification Publisher -> Event Bus
21
+ |
22
+ v
23
+ Orchestrators (subscribers)
24
+ ```
25
+
26
+ Design Principles:
27
+ - **Loose Coupling**: Reducers don't know about orchestrators
28
+ - **At-Least-Once Delivery**: Consumers handle idempotency via projection_version
29
+ - **Circuit Breaker**: Resilience against event bus failures
30
+ - **Correlation Tracking**: Full distributed tracing support
31
+
32
+ Concurrency Safety:
33
+ This implementation is coroutine-safe for concurrent async publishing.
34
+ Uses asyncio locks for circuit breaker state management. Note: This is
35
+ coroutine-safe, not thread-safe. For multi-threaded access, additional
36
+ synchronization would be required.
37
+
38
+ Error Handling:
39
+ All methods raise ONEX error types:
40
+ - InfraConnectionError: Event bus unavailable or connection failed
41
+ - InfraTimeoutError: Publish operation timed out
42
+ - InfraUnavailableError: Circuit breaker open
43
+
44
+ Example Usage:
45
+ ```python
46
+ from omnibase_infra.runtime import TransitionNotificationPublisher
47
+ from omnibase_core.models.notifications import ModelStateTransitionNotification
48
+
49
+ # Initialize publisher with event bus
50
+ publisher = TransitionNotificationPublisher(
51
+ event_bus=kafka_event_bus,
52
+ topic="onex.fsm.state.transitions.v1",
53
+ )
54
+
55
+ # Publish single notification
56
+ notification = ModelStateTransitionNotification(
57
+ aggregate_type="registration",
58
+ aggregate_id=entity_id,
59
+ from_state="pending",
60
+ to_state="active",
61
+ projection_version=1,
62
+ correlation_id=correlation_id,
63
+ causation_id=event_id,
64
+ timestamp=datetime.now(UTC),
65
+ )
66
+ await publisher.publish(notification)
67
+
68
+ # Batch publish
69
+ await publisher.publish_batch([notification1, notification2])
70
+
71
+ # Get metrics
72
+ metrics = publisher.get_metrics()
73
+ print(f"Published {metrics.notifications_published} notifications")
74
+ ```
75
+
76
+ Related Tickets:
77
+ - OMN-1139: Implement TransitionNotificationPublisher
78
+
79
+ See Also:
80
+ - ProtocolTransitionNotificationPublisher: Protocol definition (omnibase_core)
81
+ - ModelStateTransitionNotification: Notification model (omnibase_core)
82
+ - ProtocolEventBusLike: Event bus protocol
83
+ """
84
+
85
+ from __future__ import annotations
86
+
87
+ import asyncio
88
+ import logging
89
+ import time
90
+ from datetime import UTC, datetime
91
+ from typing import TYPE_CHECKING, NamedTuple
92
+ from uuid import UUID
93
+
94
+ from omnibase_core.models.events.model_event_envelope import ModelEventEnvelope
95
+ from omnibase_core.models.notifications import ModelStateTransitionNotification
96
+ from omnibase_core.protocols.notifications import (
97
+ ProtocolTransitionNotificationPublisher,
98
+ )
99
+ from omnibase_core.utils.util_uuid_service import UtilUUID
100
+ from omnibase_infra.enums import EnumInfraTransportType
101
+ from omnibase_infra.errors import (
102
+ InfraConnectionError,
103
+ InfraTimeoutError,
104
+ InfraUnavailableError,
105
+ ModelInfraErrorContext,
106
+ ModelTimeoutErrorContext,
107
+ )
108
+ from omnibase_infra.mixins import MixinAsyncCircuitBreaker
109
+ from omnibase_infra.models.resilience import ModelCircuitBreakerConfig
110
+ from omnibase_infra.runtime.models.model_transition_notification_publisher_metrics import (
111
+ ModelTransitionNotificationPublisherMetrics,
112
+ )
113
+ from omnibase_infra.utils.util_error_sanitization import sanitize_error_string
114
+
115
+ if TYPE_CHECKING:
116
+ from omnibase_infra.protocols import ProtocolEventBusLike
117
+
118
+ logger = logging.getLogger(__name__)
119
+
120
+
121
class FailedNotificationRecord(NamedTuple):
    """Immutable summary of one notification that could not be published.

    ``publish_batch`` collects these records while it works through a batch,
    then uses them for failure logging and for the final error message.

    Attributes:
        aggregate_type: Type of the aggregate whose notification failed
            (e.g., "registration").
        aggregate_id: Identifier of that aggregate, stored as a string so it
            can be placed directly into log entries and error messages.
        error_message: Sanitized text describing why the publish failed.
    """

    aggregate_type: str
    aggregate_id: str
    error_message: str
136
+
137
+
138
+ class TransitionNotificationPublisher(MixinAsyncCircuitBreaker):
139
+ """Publishes transition notifications after projection commits.
140
+
141
+ Implements ProtocolTransitionNotificationPublisher from omnibase_core.
142
+ Provides at-least-once delivery semantics for state transition notifications
143
+ to enable orchestrator coordination without tight coupling to reducers.
144
+
145
+ Features:
146
+ - Protocol compliant (ProtocolTransitionNotificationPublisher)
147
+ - Circuit breaker resilience (MixinAsyncCircuitBreaker)
148
+ - Metrics tracking for observability
149
+ - Batch publishing for efficiency
150
+ - Correlation ID propagation for distributed tracing
151
+
152
+ Circuit Breaker:
153
+ Uses MixinAsyncCircuitBreaker for resilience:
154
+ - Opens after consecutive failures (configurable threshold)
155
+ - Resets after timeout period (configurable)
156
+ - Raises InfraUnavailableError when open
157
+
158
+ Thread Safety:
159
+ Coroutine-safe via asyncio.Lock for circuit breaker state.
160
+ Not thread-safe - use only from async context.
161
+
162
+ Attributes:
163
+ _event_bus: Event bus for publishing notifications
164
+ _topic: Target topic for notifications
165
+ _lock: Async lock for metrics updates
166
+ _publisher_id: Unique identifier for this publisher instance
167
+
168
+ Example:
169
+ >>> publisher = TransitionNotificationPublisher(event_bus, topic="notifications.v1")
170
+ >>> await publisher.publish(notification)
171
+ >>> metrics = publisher.get_metrics()
172
+ >>> print(f"Success rate: {metrics.publish_success_rate():.2%}")
173
+ """
174
+
175
+ # Default maximum number of failures to track in memory during batch operations.
176
+ # Prevents unbounded memory growth for very large batches with many failures.
177
+ # Can be overridden via constructor parameter for large batch tuning.
178
+ DEFAULT_MAX_TRACKED_FAILURES: int = 100
179
+
180
def __init__(
    self,
    event_bus: ProtocolEventBusLike,
    topic: str,
    *,
    publisher_id: str | None = None,
    circuit_breaker_threshold: int = 5,
    circuit_breaker_reset_timeout: float = 60.0,
    max_tracked_failures: int = DEFAULT_MAX_TRACKED_FAILURES,
) -> None:
    """Set up the publisher, its metrics counters and its circuit breaker.

    Args:
        event_bus: Event bus used for publishing. ``publish()`` calls its
            ``publish_envelope(envelope, topic)`` coroutine.
        topic: Target topic for transition notifications. Required.
            Intended to come from the projector's contract or notification
            config rather than being hardcoded, for example
            "onex.fsm.state.transitions.v1" or
            "registration.state.transitions.v1".
        publisher_id: Optional identifier for this publisher instance.
            When omitted (or empty), an identifier of the form
            ``transition-publisher-<uuid>`` is generated.
        circuit_breaker_threshold: Failure threshold handed to the circuit
            breaker configuration. Default: 5
        circuit_breaker_reset_timeout: Reset timeout in seconds handed to
            the circuit breaker configuration. Default: 60.0
        max_tracked_failures: Upper bound on the number of per-notification
            failure records kept in memory during one batch operation.
            Raise it to capture more failure detail for large batches.
            Default: 100

    Example:
        >>> publisher = TransitionNotificationPublisher(
        ...     event_bus=kafka_event_bus,
        ...     topic="onex.fsm.state.transitions.v1",
        ...     circuit_breaker_threshold=3,
        ...     circuit_breaker_reset_timeout=30.0,
        ...     max_tracked_failures=200,  # Tune for large batches
        ... )
    """
    self._event_bus = event_bus
    self._topic = topic
    # A falsy publisher_id (None or "") falls back to a generated identifier.
    if publisher_id:
        self._publisher_id = publisher_id
    else:
        self._publisher_id = f"transition-publisher-{UtilUUID.generate()!s}"
    self._lock = asyncio.Lock()
    self._max_tracked_failures = max_tracked_failures

    # Per-notification counters.
    self._notifications_published = 0
    self._notifications_failed = 0

    # Batch-level counters.
    self._batch_operations = 0
    self._batch_notifications_attempted = 0
    self._batch_notifications_total = 0
    self._batch_failures_truncated = 0

    # Timing metrics.
    self._last_publish_at: datetime | None = None
    self._last_publish_duration_ms: float = 0.0
    self._total_publish_duration_ms: float = 0.0
    self._max_publish_duration_ms: float = 0.0
    self._started_at = datetime.now(UTC)

    # The circuit breaker state itself is owned by MixinAsyncCircuitBreaker;
    # here we only hand it the configuration for this publisher.
    self._init_circuit_breaker_from_config(
        ModelCircuitBreakerConfig(
            threshold=circuit_breaker_threshold,
            reset_timeout_seconds=circuit_breaker_reset_timeout,
            service_name=f"transition-notification-publisher.{topic}",
            transport_type=EnumInfraTransportType.KAFKA,
        )
    )

    init_details = {
        "publisher_id": self._publisher_id,
        "topic": self._topic,
        "circuit_breaker_threshold": circuit_breaker_threshold,
        "circuit_breaker_reset_timeout": circuit_breaker_reset_timeout,
        "max_tracked_failures": self._max_tracked_failures,
    }
    logger.info("TransitionNotificationPublisher initialized", extra=init_details)
263
+
264
@property
def topic(self) -> str:
    """Return the topic this publisher sends notifications to."""
    return self._topic
268
+
269
@property
def publisher_id(self) -> str:
    """Return the identifier assigned to this publisher instance."""
    return self._publisher_id
273
+
274
async def publish(
    self,
    notification: ModelStateTransitionNotification,
) -> None:
    """Publish one state transition notification to the configured topic.

    The notification is wrapped in a ``ModelEventEnvelope`` that carries the
    notification's correlation ID and this publisher's ID as ``source_tool``,
    then handed to the event bus. Delivery is at-least-once, so consumers
    should de-duplicate via ``projection_version``.

    On success the circuit breaker is reset and the publish metrics (count,
    last/total/max duration, last publish time) are updated. On failure the
    circuit breaker failure and the failed-notification counter are recorded
    before the error is raised.

    Args:
        notification: The state transition notification to publish.

    Raises:
        InfraConnectionError: If publishing fails for any reason other than
            a timeout or an already-typed infrastructure error.
        InfraTimeoutError: If the publish operation times out.
        InfraUnavailableError: If the circuit breaker is open.

    Example:
        >>> notification = ModelStateTransitionNotification(
        ...     aggregate_type="registration",
        ...     aggregate_id=uuid4(),
        ...     from_state="pending",
        ...     to_state="active",
        ...     projection_version=1,
        ...     correlation_id=uuid4(),
        ...     causation_id=uuid4(),
        ...     timestamp=datetime.now(UTC),
        ... )
        >>> await publisher.publish(notification)
    """
    correlation_id = notification.correlation_id
    started = time.monotonic()

    # Fail fast while the circuit is open. This guard sits outside the
    # ``try`` below, so a rejection here is not counted as a publish failure.
    async with self._circuit_breaker_lock:
        await self._check_circuit_breaker("publish", correlation_id)

    error_context = ModelInfraErrorContext.with_correlation(
        correlation_id=correlation_id,
        transport_type=EnumInfraTransportType.KAFKA,
        operation="publish_transition_notification",
        target_name=self._topic,
    )

    try:
        # The envelope is generic over its payload, so the notification model
        # is passed through as-is rather than being serialized here.
        envelope = ModelEventEnvelope[ModelStateTransitionNotification](
            payload=notification,
            correlation_id=notification.correlation_id,
            source_tool=self._publisher_id,
        )
        await self._event_bus.publish_envelope(envelope, self._topic)

        elapsed_ms = (time.monotonic() - started) * 1000

        # A successful publish closes/reset the breaker.
        async with self._circuit_breaker_lock:
            await self._reset_circuit_breaker()

        async with self._lock:
            self._notifications_published += 1
            self._last_publish_at = datetime.now(UTC)
            self._last_publish_duration_ms = elapsed_ms
            self._total_publish_duration_ms += elapsed_ms
            if elapsed_ms > self._max_publish_duration_ms:
                self._max_publish_duration_ms = elapsed_ms

        log_fields = {
            "aggregate_type": notification.aggregate_type,
            "aggregate_id": str(notification.aggregate_id),
            "from_state": notification.from_state,
            "to_state": notification.to_state,
            "projection_version": notification.projection_version,
            "correlation_id": str(correlation_id),
            "duration_ms": elapsed_ms,
        }
        logger.debug("Published transition notification", extra=log_fields)

    except (InfraUnavailableError, InfraTimeoutError):
        # Already-typed infrastructure errors keep their type; only the
        # failure bookkeeping is added before they propagate.
        await self._handle_failure("publish", correlation_id)
        raise

    except TimeoutError as exc:
        await self._handle_failure("publish", correlation_id)
        timeout_context = ModelTimeoutErrorContext(
            transport_type=EnumInfraTransportType.KAFKA,
            operation="publish_transition_notification",
            target_name=self._topic,
            correlation_id=correlation_id,
        )
        raise InfraTimeoutError(
            f"Timeout publishing transition notification for "
            f"{notification.aggregate_type}:{notification.aggregate_id}",
            context=timeout_context,
        ) from exc

    except Exception as exc:
        # Anything else is reported as a connection failure, with the
        # original exception preserved as the cause.
        await self._handle_failure("publish", correlation_id)
        raise InfraConnectionError(
            f"Failed to publish transition notification for "
            f"{notification.aggregate_type}:{notification.aggregate_id}",
            context=error_context,
        ) from exc
388
+
389
async def publish_batch(
    self,
    notifications: list[ModelStateTransitionNotification],
) -> None:
    """Publish multiple state transition notifications.

    Each notification is published sequentially through ``publish()``, in
    the order given, and the batch keeps going when an individual publish
    fails. An empty list is a no-op.

    Error Handling:
        If any notification fails, one error is raised after all
        notifications have been attempted, so partial success is possible.
        The raised type is the most severe one encountered
        (InfraUnavailableError > InfraTimeoutError > InfraConnectionError)
        and is chained from the last individual error. The message lists
        up to three tracked failures followed by "... and N more", where N
        counts every failure not listed, including failures that were not
        tracked because ``max_tracked_failures`` was reached.

    Circuit Breaker Behavior:
        The circuit breaker is checked once up front so that an already
        open circuit fails the whole batch before any work is done.
        ``publish()`` performs its own check per notification; if the
        circuit opens mid-batch, the remaining notifications fail with
        InfraUnavailableError and are recorded like any other failure.

    Correlation ID Behavior:
        Batch-level operations use the **first notification's
        correlation_id**: the up-front circuit breaker check, the
        "Batch publish completed" and "Batch publish failures - details"
        log entries, the truncation warning, and the error context of the
        raised error. Each per-notification failure warning instead carries
        **that notification's own correlation_id**, so individual
        notifications remain traceable on their own.

    Metrics:
        One batch operation is counted per call. The number of attempted
        notifications and the number of successes are accumulated, and a
        truncation counter is bumped when more failures occurred than
        ``max_tracked_failures`` allows to be recorded.

    Args:
        notifications: List of notifications to publish.

    Raises:
        InfraConnectionError: If at least one publish failed and no timeout
            or unavailable error was encountered.
        InfraTimeoutError: If at least one publish timed out and no
            unavailable error was encountered.
        InfraUnavailableError: If the circuit breaker is open at batch
            start, or was tripped mid-batch by individual publish failures.

    Example:
        >>> notifications = [notification1, notification2, notification3]
        >>> await publisher.publish_batch(notifications)
    """
    if not notifications:
        return

    correlation_id = notifications[0].correlation_id
    start_time = time.monotonic()

    # Batch-level fail-fast check. Not redundant with the check inside
    # publish(): this one avoids starting any work when the circuit is
    # already open, while the per-notification checks cover the circuit
    # opening part-way through the batch.
    async with self._circuit_breaker_lock:
        await self._check_circuit_breaker("publish_batch", correlation_id)

    success_count = 0
    last_error: Exception | None = None
    failed_notifications: list[FailedNotificationRecord] = []
    truncation_occurred = False
    # Flags used to pick the most severe error type for the final raise.
    encountered_unavailable = False
    encountered_timeout = False

    for notification in notifications:
        try:
            await self.publish(notification)
        except (
            InfraConnectionError,
            InfraTimeoutError,
            InfraUnavailableError,
        ) as e:
            last_error = e
            if isinstance(e, InfraUnavailableError):
                encountered_unavailable = True
            elif isinstance(e, InfraTimeoutError):
                encountered_timeout = True

            # Sanitize once; the text is used for both the record and the log.
            sanitized_error = sanitize_error_string(str(e))

            # Cap the number of stored records to bound memory use.
            if len(failed_notifications) < self._max_tracked_failures:
                failed_notifications.append(
                    FailedNotificationRecord(
                        aggregate_type=notification.aggregate_type,
                        aggregate_id=str(notification.aggregate_id),
                        error_message=sanitized_error,
                    )
                )
            else:
                truncation_occurred = True

            logger.warning(
                "Failed to publish notification in batch",
                extra={
                    "aggregate_type": notification.aggregate_type,
                    "aggregate_id": str(notification.aggregate_id),
                    "error": sanitized_error,
                    "correlation_id": str(notification.correlation_id),
                },
            )
            # Continue with the remaining notifications.
        else:
            success_count += 1

    duration_ms = (time.monotonic() - start_time) * 1000

    async with self._lock:
        self._batch_operations += 1
        self._batch_notifications_attempted += len(notifications)
        self._batch_notifications_total += success_count
        if truncation_occurred:
            self._batch_failures_truncated += 1

    failure_count = len(notifications) - success_count

    # When records were dropped, log an aggregate view so the untracked
    # failures are still visible to operators.
    if truncation_occurred:
        failure_summary = self._summarize_failure_types(failed_notifications)
        untracked_failures = failure_count - len(failed_notifications)
        logger.warning(
            "Batch publish failure tracking truncated",
            extra={
                "correlation_id": str(correlation_id),
                "total_failures": failure_count,
                "tracked_failures": len(failed_notifications),
                "untracked_failures": untracked_failures,
                "max_tracked_failures": self._max_tracked_failures,
                "failure_type_summary": failure_summary,
            },
        )

    logger.info(
        "Batch publish completed",
        extra={
            "total": len(notifications),
            "success": success_count,
            "failed": failure_count,
            "duration_ms": duration_ms,
            "correlation_id": str(correlation_id),
        },
    )

    if last_error is None:
        return

    ctx = ModelInfraErrorContext.with_correlation(
        correlation_id=correlation_id,
        transport_type=EnumInfraTransportType.KAFKA,
        operation="publish_batch",
        target_name=self._topic,
    )

    # Log failure details before raising. Only the first few records are
    # logged to keep the entry small; the counts stay complete.
    max_logged_failures = 10
    logged_failures = [
        {
            "aggregate_type": record.aggregate_type,
            "aggregate_id": record.aggregate_id,
            "error_message": record.error_message,
        }
        for record in failed_notifications[:max_logged_failures]
    ]
    failures_truncated = len(failed_notifications) > max_logged_failures

    logger.warning(
        "Batch publish failures - details",
        extra={
            "correlation_id": str(correlation_id),
            "topic": self._topic,
            "total_notifications": len(notifications),
            "success_count": success_count,
            "failure_count": failure_count,
            "tracked_failures": len(failed_notifications),
            "max_tracked_failures": self._max_tracked_failures,
            "logged_failures": len(logged_failures),
            "failures_truncated": failures_truncated,
            "failures": logged_failures,
        },
    )

    # Build the error message from at most three tracked failures. The
    # "more" count is derived from how many records are actually shown:
    # with max_tracked_failures < 3 fewer than three records exist, and
    # assuming three would under-report the remaining failures.
    shown_failures = failed_notifications[:3]
    failure_details = "; ".join(
        f"{record.aggregate_type}:{record.aggregate_id[:8]}... - "
        f"{record.error_message[:50]}"
        for record in shown_failures
    )
    omitted_failures = failure_count - len(shown_failures)
    if omitted_failures > 0:
        more_suffix = f"... and {omitted_failures} more"
        failure_details = (
            f"{failure_details} {more_suffix}" if failure_details else more_suffix
        )

    error_message = (
        f"Batch publish partially failed: {failure_count}/{len(notifications)} "
        f"notifications failed ({success_count} succeeded). "
        f"Failures: [{failure_details}]"
    )

    # Raise the most severe error type encountered so callers can react
    # appropriately (e.g., retry on timeout, skip on unavailable).
    if encountered_unavailable:
        raise InfraUnavailableError(
            error_message,
            context=ctx,
        ) from last_error
    if encountered_timeout:
        timeout_ctx = ModelTimeoutErrorContext(
            transport_type=EnumInfraTransportType.KAFKA,
            operation="publish_batch",
            target_name=self._topic,
            correlation_id=correlation_id,
        )
        raise InfraTimeoutError(
            error_message,
            context=timeout_ctx,
        ) from last_error
    raise InfraConnectionError(
        error_message,
        context=ctx,
    ) from last_error
643
+
644
+ async def _handle_failure(
645
+ self,
646
+ operation: str,
647
+ correlation_id: UUID,
648
+ ) -> None:
649
+ """Handle a publish failure by recording circuit breaker failure.
650
+
651
+ Args:
652
+ operation: Operation name for logging
653
+ correlation_id: Correlation ID for tracing
654
+ """
655
+ async with self._circuit_breaker_lock:
656
+ await self._record_circuit_failure(operation, correlation_id)
657
+
658
+ async with self._lock:
659
+ self._notifications_failed += 1
660
+
661
+ def _summarize_failure_types(
662
+ self, failures: list[FailedNotificationRecord]
663
+ ) -> dict[str, int]:
664
+ """Summarize failure types by grouping error messages.
665
+
666
+ Groups failures by a simplified error pattern (first 50 characters of
667
+ the error message) to help operators understand what types of errors
668
+ are occurring, even when detailed failure records are truncated.
669
+
670
+ Args:
671
+ failures: List of failed notification records to summarize.
672
+
673
+ Returns:
674
+ Dictionary mapping error pattern (truncated error message) to
675
+ the count of failures with that pattern.
676
+
677
+ Example:
678
+ >>> failures = [
679
+ ... FailedNotificationRecord("reg", "id1", "Connection refused to broker"),
680
+ ... FailedNotificationRecord("reg", "id2", "Connection refused to broker"),
681
+ ... FailedNotificationRecord("reg", "id3", "Timeout waiting for response"),
682
+ ... ]
683
+ >>> summary = publisher._summarize_failure_types(failures)
684
+ >>> # {"Connection refused to broker": 2, "Timeout waiting for response": 1}
685
+ """
686
+ summary: dict[str, int] = {}
687
+ for failure in failures:
688
+ # Use first 50 chars as the pattern key for grouping
689
+ pattern = failure.error_message[:50]
690
+ summary[pattern] = summary.get(pattern, 0) + 1
691
+ return summary
692
+
693
def get_metrics(self) -> ModelTransitionNotificationPublisherMetrics:
    """Build a snapshot of the publisher's current metrics.

    Combines the publisher's counters and timing attributes with the state
    reported by ``_get_circuit_breaker_state()`` into a single metrics model.

    Returns:
        ModelTransitionNotificationPublisherMetrics populated with the
        current values.

    Example:
        >>> metrics = publisher.get_metrics()
        >>> print(f"Published: {metrics.notifications_published}")
        >>> print(f"Success rate: {metrics.publish_success_rate():.2%}")
        >>> print(f"Healthy: {metrics.is_healthy()}")
    """
    # Circuit breaker view: open flag plus the reported failure count.
    breaker_state = self._get_circuit_breaker_state()
    breaker_is_open = breaker_state.get("state") == "open"
    reported_failures = breaker_state.get("failures", 0)
    if isinstance(reported_failures, int):
        consecutive = reported_failures
    else:
        # Anything that is not an int is reported as zero failures.
        consecutive = 0

    # Mean duration per published notification. The original author notes
    # that _total_publish_duration_ms is only updated on success, which is
    # why the published count is the divisor. Zero publishes yields 0.0.
    published = self._notifications_published
    if published > 0:
        mean_duration_ms = self._total_publish_duration_ms / published
    else:
        mean_duration_ms = 0.0

    return ModelTransitionNotificationPublisherMetrics(
        publisher_id=self._publisher_id,
        topic=self._topic,
        notifications_published=published,
        notifications_failed=self._notifications_failed,
        batch_operations=self._batch_operations,
        batch_notifications_attempted=self._batch_notifications_attempted,
        batch_notifications_total=self._batch_notifications_total,
        batch_failures_truncated=self._batch_failures_truncated,
        last_publish_at=self._last_publish_at,
        last_publish_duration_ms=self._last_publish_duration_ms,
        average_publish_duration_ms=mean_duration_ms,
        max_publish_duration_ms=self._max_publish_duration_ms,
        circuit_breaker_open=breaker_is_open,
        consecutive_failures=consecutive,
        started_at=self._started_at,
    )
739
+
740
+
741
+ # Protocol compliance check (runtime_checkable allows isinstance checks)
742
def _verify_protocol_compliance() -> None:  # pragma: no cover
    """Static check that TransitionNotificationPublisher satisfies its protocol.

    The function exists for type checkers: assigning a publisher instance to
    a variable annotated with ``ProtocolTransitionNotificationPublisher``
    makes a static analyzer verify protocol compliance. It is not meant to be
    called at runtime.
    """
    from typing import cast

    from omnibase_infra.event_bus.event_bus_inmemory import EventBusInmemory

    # An in-memory bus, cast to the event-bus protocol the publisher expects.
    in_memory_bus = cast("ProtocolEventBusLike", EventBusInmemory())
    candidate = TransitionNotificationPublisher(
        event_bus=in_memory_bus,
        topic="onex.fsm.state.transitions.v1",
    )
    # The annotated assignment is the actual compliance check.
    publisher: ProtocolTransitionNotificationPublisher = candidate
    # Reference the variable so linters do not report it as unused.
    _ = publisher
762
+
763
+
764
# Names exported by ``from <module> import *``.
__all__: list[str] = [
    "FailedNotificationRecord",
    "TransitionNotificationPublisher",
]