omnibase_infra 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/adapters/adapter_onex_tool_execution.py +446 -0
  3. omnibase_infra/cli/commands.py +1 -1
  4. omnibase_infra/configs/widget_mapping.yaml +176 -0
  5. omnibase_infra/contracts/handlers/filesystem/handler_contract.yaml +4 -1
  6. omnibase_infra/contracts/handlers/mcp/handler_contract.yaml +4 -1
  7. omnibase_infra/errors/error_compute_registry.py +4 -1
  8. omnibase_infra/errors/error_event_bus_registry.py +4 -1
  9. omnibase_infra/errors/error_infra.py +3 -1
  10. omnibase_infra/errors/error_policy_registry.py +4 -1
  11. omnibase_infra/handlers/handler_db.py +2 -1
  12. omnibase_infra/handlers/handler_graph.py +10 -5
  13. omnibase_infra/handlers/handler_mcp.py +736 -63
  14. omnibase_infra/handlers/mixins/mixin_consul_kv.py +4 -3
  15. omnibase_infra/handlers/mixins/mixin_consul_service.py +2 -1
  16. omnibase_infra/handlers/service_discovery/handler_service_discovery_consul.py +301 -4
  17. omnibase_infra/handlers/service_discovery/models/model_service_info.py +10 -0
  18. omnibase_infra/mixins/mixin_async_circuit_breaker.py +3 -2
  19. omnibase_infra/mixins/mixin_node_introspection.py +24 -7
  20. omnibase_infra/mixins/mixin_retry_execution.py +1 -1
  21. omnibase_infra/models/handlers/__init__.py +10 -0
  22. omnibase_infra/models/handlers/model_bootstrap_handler_descriptor.py +162 -0
  23. omnibase_infra/models/handlers/model_handler_descriptor.py +15 -0
  24. omnibase_infra/models/mcp/__init__.py +15 -0
  25. omnibase_infra/models/mcp/model_mcp_contract_config.py +80 -0
  26. omnibase_infra/models/mcp/model_mcp_server_config.py +67 -0
  27. omnibase_infra/models/mcp/model_mcp_tool_definition.py +73 -0
  28. omnibase_infra/models/mcp/model_mcp_tool_parameter.py +35 -0
  29. omnibase_infra/models/registration/model_node_capabilities.py +11 -0
  30. omnibase_infra/nodes/architecture_validator/contract_architecture_validator.yaml +0 -5
  31. omnibase_infra/nodes/architecture_validator/registry/registry_infra_architecture_validator.py +17 -10
  32. omnibase_infra/nodes/effects/contract.yaml +0 -5
  33. omnibase_infra/nodes/node_registration_orchestrator/contract.yaml +7 -0
  34. omnibase_infra/nodes/node_registration_orchestrator/handlers/handler_node_introspected.py +86 -1
  35. omnibase_infra/nodes/node_registration_orchestrator/introspection_event_router.py +3 -3
  36. omnibase_infra/nodes/node_registration_orchestrator/registry/registry_infra_node_registration_orchestrator.py +9 -8
  37. omnibase_infra/nodes/node_registration_orchestrator/wiring.py +14 -13
  38. omnibase_infra/nodes/node_registration_storage_effect/contract.yaml +0 -5
  39. omnibase_infra/nodes/node_registration_storage_effect/registry/registry_infra_registration_storage.py +46 -25
  40. omnibase_infra/nodes/node_registry_effect/contract.yaml +0 -5
  41. omnibase_infra/nodes/node_registry_effect/handlers/handler_partial_retry.py +2 -1
  42. omnibase_infra/nodes/node_service_discovery_effect/registry/registry_infra_service_discovery.py +24 -19
  43. omnibase_infra/plugins/examples/plugin_json_normalizer.py +2 -2
  44. omnibase_infra/plugins/examples/plugin_json_normalizer_error_handling.py +2 -2
  45. omnibase_infra/plugins/plugin_compute_base.py +16 -2
  46. omnibase_infra/protocols/protocol_event_projector.py +1 -1
  47. omnibase_infra/runtime/__init__.py +51 -1
  48. omnibase_infra/runtime/binding_config_resolver.py +102 -37
  49. omnibase_infra/runtime/constants_notification.py +75 -0
  50. omnibase_infra/runtime/contract_handler_discovery.py +6 -1
  51. omnibase_infra/runtime/handler_bootstrap_source.py +514 -0
  52. omnibase_infra/runtime/handler_contract_config_loader.py +603 -0
  53. omnibase_infra/runtime/handler_contract_source.py +289 -167
  54. omnibase_infra/runtime/handler_plugin_loader.py +4 -2
  55. omnibase_infra/runtime/mixin_semver_cache.py +25 -1
  56. omnibase_infra/runtime/mixins/__init__.py +7 -0
  57. omnibase_infra/runtime/mixins/mixin_projector_notification_publishing.py +566 -0
  58. omnibase_infra/runtime/mixins/mixin_projector_sql_operations.py +31 -10
  59. omnibase_infra/runtime/models/__init__.py +24 -0
  60. omnibase_infra/runtime/models/model_health_check_result.py +2 -1
  61. omnibase_infra/runtime/models/model_projector_notification_config.py +171 -0
  62. omnibase_infra/runtime/models/model_transition_notification_outbox_config.py +112 -0
  63. omnibase_infra/runtime/models/model_transition_notification_outbox_metrics.py +140 -0
  64. omnibase_infra/runtime/models/model_transition_notification_publisher_metrics.py +357 -0
  65. omnibase_infra/runtime/projector_plugin_loader.py +1 -1
  66. omnibase_infra/runtime/projector_shell.py +229 -1
  67. omnibase_infra/runtime/protocols/__init__.py +10 -0
  68. omnibase_infra/runtime/registry/registry_protocol_binding.py +3 -2
  69. omnibase_infra/runtime/registry_policy.py +9 -326
  70. omnibase_infra/runtime/secret_resolver.py +4 -2
  71. omnibase_infra/runtime/service_kernel.py +10 -2
  72. omnibase_infra/runtime/service_message_dispatch_engine.py +4 -2
  73. omnibase_infra/runtime/service_runtime_host_process.py +225 -15
  74. omnibase_infra/runtime/transition_notification_outbox.py +1190 -0
  75. omnibase_infra/runtime/transition_notification_publisher.py +764 -0
  76. omnibase_infra/runtime/util_container_wiring.py +6 -5
  77. omnibase_infra/runtime/util_wiring.py +5 -1
  78. omnibase_infra/schemas/schema_transition_notification_outbox.sql +245 -0
  79. omnibase_infra/services/mcp/__init__.py +31 -0
  80. omnibase_infra/services/mcp/mcp_server_lifecycle.py +443 -0
  81. omnibase_infra/services/mcp/service_mcp_tool_discovery.py +411 -0
  82. omnibase_infra/services/mcp/service_mcp_tool_registry.py +329 -0
  83. omnibase_infra/services/mcp/service_mcp_tool_sync.py +547 -0
  84. omnibase_infra/services/registry_api/__init__.py +40 -0
  85. omnibase_infra/services/registry_api/main.py +243 -0
  86. omnibase_infra/services/registry_api/models/__init__.py +66 -0
  87. omnibase_infra/services/registry_api/models/model_capability_widget_mapping.py +38 -0
  88. omnibase_infra/services/registry_api/models/model_pagination_info.py +48 -0
  89. omnibase_infra/services/registry_api/models/model_registry_discovery_response.py +73 -0
  90. omnibase_infra/services/registry_api/models/model_registry_health_response.py +49 -0
  91. omnibase_infra/services/registry_api/models/model_registry_instance_view.py +88 -0
  92. omnibase_infra/services/registry_api/models/model_registry_node_view.py +88 -0
  93. omnibase_infra/services/registry_api/models/model_registry_summary.py +60 -0
  94. omnibase_infra/services/registry_api/models/model_response_list_instances.py +43 -0
  95. omnibase_infra/services/registry_api/models/model_response_list_nodes.py +51 -0
  96. omnibase_infra/services/registry_api/models/model_warning.py +49 -0
  97. omnibase_infra/services/registry_api/models/model_widget_defaults.py +28 -0
  98. omnibase_infra/services/registry_api/models/model_widget_mapping.py +51 -0
  99. omnibase_infra/services/registry_api/routes.py +371 -0
  100. omnibase_infra/services/registry_api/service.py +846 -0
  101. omnibase_infra/services/service_capability_query.py +4 -4
  102. omnibase_infra/services/service_health.py +3 -2
  103. omnibase_infra/services/service_timeout_emitter.py +13 -2
  104. omnibase_infra/utils/util_dsn_validation.py +1 -1
  105. omnibase_infra/validation/__init__.py +3 -19
  106. omnibase_infra/validation/contracts/security.validation.yaml +114 -0
  107. omnibase_infra/validation/infra_validators.py +35 -24
  108. omnibase_infra/validation/validation_exemptions.yaml +113 -9
  109. omnibase_infra/validation/validator_chain_propagation.py +2 -2
  110. omnibase_infra/validation/validator_runtime_shape.py +1 -1
  111. omnibase_infra/validation/validator_security.py +473 -370
  112. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/METADATA +2 -2
  113. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/RECORD +116 -74
  114. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/WHEEL +0 -0
  115. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/entry_points.txt +0 -0
  116. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.2.dist-info}/licenses/LICENSE +0 -0
omnibase_infra/runtime/transition_notification_outbox.py (new file)
@@ -0,0 +1,1190 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 OmniNode Team
+"""Transition Notification Outbox for guaranteed delivery.
+
+This module implements the outbox pattern for state transition notifications.
+The outbox stores notifications in the same database transaction as projections,
+then processes them asynchronously via a background processor to ensure
+at-least-once delivery semantics.
+
+At-Least-Once Delivery Semantics:
+    This implementation guarantees that every notification will be delivered
+    **at least once**, but **duplicates are possible** during failure scenarios:
+
+    - If the publisher succeeds but the database update fails, the notification
+      will be re-published on the next processing cycle.
+    - If the processor crashes after publishing but before marking as processed,
+      the notification will be re-published when the processor restarts.
+    - Network partitions or timeouts can cause similar duplicate delivery.
+
+    **CRITICAL**: Consumers MUST implement idempotent message handling. This
+    typically means:
+
+    - Tracking processed notification IDs (using ``notification_id`` field)
+    - Using database upserts with conflict detection
+    - Designing state transitions to be idempotent (same transition twice = no-op)
+
+Database Schema (must be created before use):
+    ```sql
+    CREATE TABLE transition_notification_outbox (
+        id BIGSERIAL PRIMARY KEY,
+        notification_data JSONB NOT NULL,
+        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+        processed_at TIMESTAMPTZ,
+        retry_count INT NOT NULL DEFAULT 0,
+        last_error TEXT,
+        aggregate_type TEXT NOT NULL,
+        aggregate_id UUID NOT NULL
+    );
+
+    -- Index for efficient pending notification queries
+    CREATE INDEX idx_outbox_pending ON transition_notification_outbox (created_at)
+        WHERE processed_at IS NULL;
+
+    -- Index for aggregate-specific queries
+    CREATE INDEX idx_outbox_aggregate ON transition_notification_outbox
+        (aggregate_type, aggregate_id);
+    ```
+
+Key Features:
+    - Stores notifications in same transaction as projection writes
+    - Background processor publishes pending notifications
+    - SELECT FOR UPDATE SKIP LOCKED for safe concurrent processing
+    - Retry tracking with error recording
+    - Configurable batch size and poll interval
+    - Graceful shutdown with proper lifecycle management
+
+Concurrency Safety:
+    This implementation is coroutine-safe using asyncio primitives:
+    - Background loop protected by asyncio.Lock
+    - Shutdown signaling via asyncio.Event
+    Note: This is coroutine-safe, not thread-safe.
+
+Related Tickets:
+    - OMN-1139: TransitionNotificationOutbox implementation (Optional Enhancement)
+
+.. versionadded:: 0.8.0
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from uuid import UUID
+
+import asyncpg
+
+# Use core model and protocol
+from omnibase_core.models.notifications import ModelStateTransitionNotification
+from omnibase_core.protocols.notifications import (
+    ProtocolTransitionNotificationPublisher,
+)
+from omnibase_core.utils.util_uuid_service import UtilUUID
+from omnibase_infra.enums import EnumInfraTransportType
+from omnibase_infra.errors import (
+    InfraConnectionError,
+    InfraTimeoutError,
+    ModelInfraErrorContext,
+    ModelTimeoutErrorContext,
+    ProtocolConfigurationError,
+    RuntimeHostError,
+)
+from omnibase_infra.models.projectors.util_sql_identifiers import quote_identifier
+from omnibase_infra.runtime.models.model_transition_notification_outbox_metrics import (
+    ModelTransitionNotificationOutboxMetrics,
+)
+from omnibase_infra.utils.util_error_sanitization import sanitize_error_string
+
+logger = logging.getLogger(__name__)
+
+
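The docstring's idempotency requirement is the consumer's side of this contract. A minimal sketch of one way to satisfy it, deduplicating on the ``notification_id`` field named above; the `processed_notifications` table and the handler are illustrative assumptions, not part of this package:

```python
# Illustrative sketch only - the dedupe table and handler below are NOT
# part of omnibase_infra. Dedupes on notification_id so a re-delivered
# notification (at-least-once semantics) becomes a no-op.
import asyncpg

from omnibase_core.models.notifications import ModelStateTransitionNotification


async def handle_notification(
    conn: asyncpg.Connection,
    notification: ModelStateTransitionNotification,
) -> None:
    async with conn.transaction():
        # ON CONFLICT DO NOTHING yields status "INSERT 0 0" for a duplicate.
        status = await conn.execute(
            """
            INSERT INTO processed_notifications (notification_id)
            VALUES ($1)
            ON CONFLICT (notification_id) DO NOTHING
            """,
            notification.notification_id,
        )
        if status == "INSERT 0 0":
            return  # Duplicate delivery; side effects already applied.
        # Apply consumer side effects here, inside the same transaction
        # as the dedupe row, so they commit (or roll back) together.
```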
+class TransitionNotificationOutbox:
+    """Outbox pattern for guaranteed notification delivery.
+
+    Stores notifications in the same database transaction as projections,
+    ensuring at-least-once semantics. A background processor publishes
+    pending notifications asynchronously.
+
+    Warning:
+        **Duplicate Delivery**: This implementation provides at-least-once
+        delivery, meaning **duplicates are possible** during failures. If the
+        publisher succeeds but the subsequent database update fails (marking
+        the notification as processed), the notification will be re-published
+        on the next processing cycle. Consumers MUST implement idempotent
+        message handling to safely handle duplicate notifications.
+
+    The outbox pattern solves the dual-write problem: when you need to
+    update a database AND publish an event, either operation could fail
+    independently, leading to inconsistent state. By writing the event
+    to an outbox table in the same transaction as the data change, we
+    guarantee atomicity. A separate process then reads from the outbox
+    and publishes events.
+
+    Dead Letter Queue (DLQ) Support:
+        When configured with ``max_retries`` and ``dlq_publisher``, notifications
+        that exceed the retry threshold are moved to a dead letter queue instead
+        of being retried indefinitely. This prevents poison messages from blocking
+        the outbox and provides a way to inspect and replay failed notifications.
+
+        DLQ notifications are published with the original notification payload,
+        allowing downstream consumers to process or investigate failures.
+
+    Warning:
+        **DLQ Unavailability Risk**: If the DLQ itself becomes permanently
+        unavailable, notifications that have exceeded ``max_retries`` will
+        continue to be retried indefinitely. This occurs because ``retry_count``
+        is intentionally NOT incremented when DLQ publish fails (to preserve
+        the retry state for when the DLQ recovers).
+
+        **Monitoring Recommendation**: Monitor for notifications matching:
+        ``processed_at IS NULL AND retry_count >= max_retries``. Notifications
+        in this state indicate DLQ availability issues requiring operator
+        intervention.
+
+    Attributes:
+        table_name: Name of the outbox table (default: "transition_notification_outbox")
+        batch_size: Number of notifications to process per batch (default: 100)
+        poll_interval: Seconds between processing polls when idle (default: 1.0)
+        shutdown_timeout: Seconds to wait for graceful shutdown during stop() (default: 10.0)
+        is_running: Whether the background processor is running
+        max_retries: Maximum retry attempts before moving to DLQ (None if DLQ disabled)
+        dlq_topic: DLQ topic name for metrics/logging (None if DLQ disabled)
+
+    Concurrency Safety:
+        This implementation is coroutine-safe using asyncio primitives:
+        - Background loop protected by ``_lock`` (asyncio.Lock)
+        - Shutdown signaling via ``_shutdown_event`` (asyncio.Event)
+        Note: This is coroutine-safe, not thread-safe.
+
+    Example:
+        >>> from asyncpg import create_pool
+        >>> from omnibase_infra.runtime import TransitionNotificationOutbox
+        >>>
+        >>> # Create outbox with publisher
+        >>> pool = await create_pool(dsn)
+        >>> publisher = KafkaTransitionPublisher()
+        >>> outbox = TransitionNotificationOutbox(
+        ...     pool=pool,
+        ...     publisher=publisher,
+        ...     batch_size=50,
+        ...     poll_interval_seconds=0.5,
+        ... )
+        >>>
+        >>> # Start background processor
+        >>> await outbox.start()
+        >>>
+        >>> # In projection transaction - store notification
+        >>> async with pool.acquire() as conn:
+        ...     async with conn.transaction():
+        ...         # Update projection...
+        ...         await projector.project(event, correlation_id)
+        ...         # Store notification in same transaction
+        ...         await outbox.store(notification, conn)
+        >>>
+        >>> # Stop gracefully
+        >>> await outbox.stop()
+
+    Example with DLQ:
+        >>> # Create outbox with DLQ support
+        >>> dlq_publisher = KafkaDLQPublisher(topic="notifications-dlq")
+        >>> outbox = TransitionNotificationOutbox(
+        ...     pool=pool,
+        ...     publisher=publisher,
+        ...     max_retries=3,
+        ...     dlq_publisher=dlq_publisher,
+        ...     dlq_topic="notifications-dlq",
+        ... )
+        >>> # Notifications failing 3+ times will be moved to DLQ
+
+    Related:
+        - OMN-1139: TransitionNotificationOutbox implementation
+        - ProtocolTransitionNotificationPublisher: Publisher protocol
+        - ModelStateTransitionNotification: Notification model
+    """
+
+    # Default configuration values
+    DEFAULT_TABLE_NAME: str = "transition_notification_outbox"
+    DEFAULT_BATCH_SIZE: int = 100
+    DEFAULT_POLL_INTERVAL_SECONDS: float = 1.0
+    DEFAULT_QUERY_TIMEOUT_SECONDS: float = 30.0
+    DEFAULT_STRICT_TRANSACTION_MODE: bool = True
+    DEFAULT_SHUTDOWN_TIMEOUT_SECONDS: float = 10.0
+    MAX_ERROR_MESSAGE_LENGTH: int = 1000
+
+    def __init__(
+        self,
+        pool: asyncpg.Pool,
+        publisher: ProtocolTransitionNotificationPublisher,
+        table_name: str = DEFAULT_TABLE_NAME,
+        batch_size: int = DEFAULT_BATCH_SIZE,
+        poll_interval_seconds: float = DEFAULT_POLL_INTERVAL_SECONDS,
+        query_timeout_seconds: float = DEFAULT_QUERY_TIMEOUT_SECONDS,
+        strict_transaction_mode: bool = DEFAULT_STRICT_TRANSACTION_MODE,
+        shutdown_timeout_seconds: float = DEFAULT_SHUTDOWN_TIMEOUT_SECONDS,
+        max_retries: int | None = None,
+        dlq_publisher: ProtocolTransitionNotificationPublisher | None = None,
+        dlq_topic: str | None = None,
+    ) -> None:
+        """Initialize the TransitionNotificationOutbox.
+
+        Args:
+            pool: asyncpg connection pool for database access.
+            publisher: Publisher implementation for delivering notifications.
+            table_name: Name of the outbox table (default: "transition_notification_outbox").
+            batch_size: Maximum notifications to process per batch (default: 100).
+            poll_interval_seconds: Seconds between polls when idle (default: 1.0).
+            query_timeout_seconds: Timeout for database queries (default: 30.0).
+            strict_transaction_mode: If True (default), raises ProtocolConfigurationError
+                when store() is called outside a transaction context, providing
+                fail-fast behavior to catch misconfiguration early. If False,
+                logs a warning but continues execution (atomicity not guaranteed).
+            shutdown_timeout_seconds: Timeout in seconds for graceful shutdown
+                during stop() (default: 10.0). If the background processor does
+                not complete within this timeout, it will be cancelled.
+            max_retries: Maximum retry attempts before moving notification to DLQ.
+                Must be >= 1 if specified. If None (default), DLQ is disabled.
+            dlq_publisher: Publisher for dead letter queue. Required if max_retries
+                is specified. If None when max_retries is set, raises
+                ProtocolConfigurationError.
+            dlq_topic: Topic name for DLQ (for metrics/logging purposes).
+                Optional, used for observability.
+
+        Raises:
+            ProtocolConfigurationError: If pool or publisher is None, if
+                configuration values are invalid, if max_retries < 1, or if
+                max_retries is set but dlq_publisher is None.
+        """
+        context = ModelInfraErrorContext(
+            transport_type=EnumInfraTransportType.DATABASE,
+            operation="outbox_init",
+        )
+
+        if pool is None:
+            raise ProtocolConfigurationError(
+                "pool cannot be None",
+                context=context,
+            )
+        if publisher is None:
+            raise ProtocolConfigurationError(
+                "publisher cannot be None",
+                context=context,
+            )
+        if batch_size < 1:
+            raise ProtocolConfigurationError(
+                f"batch_size must be >= 1, got {batch_size}",
+                context=context,
+                parameter="batch_size",
+                value=batch_size,
+            )
+        if poll_interval_seconds <= 0:
+            raise ProtocolConfigurationError(
+                f"poll_interval_seconds must be > 0, got {poll_interval_seconds}",
+                context=context,
+                parameter="poll_interval_seconds",
+                value=poll_interval_seconds,
+            )
+        if shutdown_timeout_seconds <= 0:
+            raise ProtocolConfigurationError(
+                f"shutdown_timeout_seconds must be > 0, got {shutdown_timeout_seconds}",
+                context=context,
+                parameter="shutdown_timeout_seconds",
+                value=shutdown_timeout_seconds,
+            )
+
+        # DLQ validation
+        if max_retries is not None and max_retries < 1:
+            raise ProtocolConfigurationError(
+                f"max_retries must be >= 1, got {max_retries}",
+                context=context,
+                parameter="max_retries",
+                value=max_retries,
+            )
+
+        if max_retries is not None and dlq_publisher is None:
+            raise ProtocolConfigurationError(
+                "dlq_publisher is required when max_retries is configured",
+                context=context,
+                parameter="dlq_publisher",
+            )
+
+        if dlq_publisher is not None and max_retries is None:
+            logger.warning(
+                "dlq_publisher configured but max_retries is None - DLQ will never be used",
+                extra={
+                    "table_name": table_name,
+                    "dlq_topic": dlq_topic,
+                },
+            )
+
+        self._pool = pool
+        self._publisher = publisher
+        self._table_name = table_name
+        self._batch_size = batch_size
+        self._poll_interval = poll_interval_seconds
+        self._query_timeout = query_timeout_seconds
+        self._strict_transaction_mode = strict_transaction_mode
+        self._shutdown_timeout = shutdown_timeout_seconds
+
+        # State management
+        self._running = False
+        self._lock = asyncio.Lock()
+        self._shutdown_event = asyncio.Event()
+        self._processor_task: asyncio.Task[None] | None = None
+
+        # Metrics tracking
+        self._notifications_stored: int = 0
+        self._notifications_processed: int = 0
+        self._notifications_failed: int = 0
+        self._notifications_sent_to_dlq: int = 0
+        self._dlq_publish_failures: int = 0
+
+        # DLQ configuration
+        self._max_retries = max_retries
+        self._dlq_publisher = dlq_publisher
+        self._dlq_topic = dlq_topic
+
+        logger.debug(
+            "TransitionNotificationOutbox initialized",
+            extra={
+                "table_name": table_name,
+                "batch_size": batch_size,
+                "poll_interval_seconds": poll_interval_seconds,
+                "strict_transaction_mode": strict_transaction_mode,
+                "shutdown_timeout_seconds": shutdown_timeout_seconds,
+                "max_retries": max_retries,
+                "dlq_enabled": dlq_publisher is not None,
+                "dlq_topic": dlq_topic,
+            },
+        )
+
+    @property
+    def table_name(self) -> str:
+        """Return the outbox table name."""
+        return self._table_name
+
+    @property
+    def batch_size(self) -> int:
+        """Return the batch size for processing."""
+        return self._batch_size
+
+    @property
+    def poll_interval(self) -> float:
+        """Return the poll interval in seconds."""
+        return self._poll_interval
+
+    @property
+    def shutdown_timeout(self) -> float:
+        """Return the shutdown timeout in seconds."""
+        return self._shutdown_timeout
+
+    @property
+    def is_running(self) -> bool:
+        """Return whether the background processor is running."""
+        return self._running
+
+    @property
+    def notifications_stored(self) -> int:
+        """Return total notifications stored."""
+        return self._notifications_stored
+
+    @property
+    def notifications_processed(self) -> int:
+        """Return total notifications successfully processed."""
+        return self._notifications_processed
+
+    @property
+    def notifications_failed(self) -> int:
+        """Return total notifications that failed processing."""
+        return self._notifications_failed
+
+    @property
+    def strict_transaction_mode(self) -> bool:
+        """Return whether strict transaction mode is enabled.
+
+        When enabled, store() raises ProtocolConfigurationError if called
+        outside a transaction context, rather than just logging a warning.
+        """
+        return self._strict_transaction_mode
+
+    @property
+    def max_retries(self) -> int | None:
+        """Return the max retries before DLQ, or None if DLQ disabled."""
+        return self._max_retries
+
+    @property
+    def dlq_topic(self) -> str | None:
+        """Return the DLQ topic name for metrics/logging."""
+        return self._dlq_topic
+
+    @property
+    def notifications_sent_to_dlq(self) -> int:
+        """Return total notifications sent to DLQ."""
+        return self._notifications_sent_to_dlq
+
+    @property
+    def dlq_publish_failures(self) -> int:
+        """Return count of failed DLQ publish attempts.
+
+        Non-zero values indicate DLQ availability issues. Monitor this metric
+        to detect when the DLQ is unavailable, which can cause infinite retry
+        loops for notifications that have exceeded max_retries.
+        """
+        return self._dlq_publish_failures
+
+    async def store(
+        self,
+        notification: ModelStateTransitionNotification,
+        conn: asyncpg.Connection,
+    ) -> None:
+        """Store notification in outbox using the same connection/transaction.
+
+        This method MUST be called within the same transaction as the projection
+        write to ensure atomicity. The notification will be picked up by the
+        background processor and published asynchronously.
+
+        Warning:
+            If called outside a transaction (auto-commit mode), behavior depends
+            on ``strict_transaction_mode``:
+
+            - **strict_transaction_mode=True** (default): Raises ProtocolConfigurationError
+              immediately, providing fail-fast behavior to catch misconfiguration early.
+            - **strict_transaction_mode=False**: Logs a WARNING but continues execution.
+              The atomicity guarantee with projection writes will be broken in this case.
+
+        Args:
+            notification: The state transition notification to store.
+            conn: The database connection from the current transaction.
+                MUST be the same connection used for the projection write.
+
+        Raises:
+            ProtocolConfigurationError: If strict_transaction_mode is True and
+                store() is called outside a transaction context.
+            InfraConnectionError: If database connection fails.
+            InfraTimeoutError: If store operation times out.
+            RuntimeHostError: For other database errors.
+
+        Example:
+            >>> async with pool.acquire() as conn:
+            ...     async with conn.transaction():
+            ...         # Update projection in same transaction
+            ...         await projector.project(event, correlation_id)
+            ...         # Store notification - uses same transaction
+            ...         await outbox.store(notification, conn)
+        """
+        correlation_id = notification.correlation_id
+        ctx = ModelInfraErrorContext(
+            transport_type=EnumInfraTransportType.DATABASE,
+            operation="outbox_store",
+            target_name=self._table_name,
+            correlation_id=correlation_id,
+        )
+
+        # Check transaction context - behavior depends on strict_transaction_mode
+        if not conn.is_in_transaction():
+            if self._strict_transaction_mode:
+                raise ProtocolConfigurationError(
+                    "store() called outside transaction context in strict mode - "
+                    "atomicity with projection not guaranteed",
+                    context=ctx,
+                )
+            logger.warning(
+                "store() called outside transaction context - "
+                "atomicity with projection not guaranteed",
+                extra={
+                    "table_name": self._table_name,
+                    "aggregate_type": notification.aggregate_type,
+                    "aggregate_id": str(notification.aggregate_id),
+                    "correlation_id": str(correlation_id),
+                },
+            )
+
+        # Build INSERT query - table name from trusted config, quoted for safety
+        # S608: Safe - table name from constructor, quoted via quote_identifier()
+        table_quoted = quote_identifier(self._table_name)
+        query = f"""
+            INSERT INTO {table_quoted}
+            (notification_data, aggregate_type, aggregate_id)
+            VALUES ($1, $2, $3)
+        """  # noqa: S608
+
+        try:
+            await conn.execute(
+                query,
+                notification.model_dump_json(),
+                notification.aggregate_type,
+                notification.aggregate_id,
+                timeout=self._query_timeout,
+            )
+
+            self._notifications_stored += 1
+
+            logger.debug(
+                "Notification stored in outbox",
+                extra={
+                    "aggregate_type": notification.aggregate_type,
+                    "aggregate_id": str(notification.aggregate_id),
+                    "correlation_id": str(correlation_id),
+                },
+            )
+
+        except asyncpg.PostgresConnectionError as e:
+            raise InfraConnectionError(
+                f"Failed to store notification in outbox: {self._table_name}",
+                context=ctx,
+            ) from e
+
+        except asyncpg.QueryCanceledError as e:
+            timeout_ctx = ModelTimeoutErrorContext(
+                transport_type=EnumInfraTransportType.DATABASE,
+                operation="outbox_store",
+                target_name=self._table_name,
+                correlation_id=correlation_id,
+            )
+            raise InfraTimeoutError(
+                f"Timeout storing notification in outbox: {self._table_name}",
+                context=timeout_ctx,
+            ) from e
+
+        except Exception as e:
+            raise RuntimeHostError(
+                f"Failed to store notification: {type(e).__name__}",
+                context=ctx,
+            ) from e
+
+    async def process_pending(self) -> int:
+        """Process pending notifications from outbox.
+
+        Fetches pending notifications using SELECT FOR UPDATE SKIP LOCKED
+        for safe concurrent processing, publishes them via the publisher,
+        and marks them as processed.
+
+        Returns:
+            Count of successfully processed notifications.
+
+        Raises:
+            InfraConnectionError: If database connection fails.
+            InfraTimeoutError: If query times out.
+            RuntimeHostError: For other database errors.
+
+        Note:
+            Individual notification publish failures are recorded but do not
+            cause the method to raise. The failed notification's retry_count
+            and last_error are updated in the database.
+        """
+        correlation_id = UtilUUID.generate_correlation_id()
+        ctx = ModelInfraErrorContext(
+            transport_type=EnumInfraTransportType.DATABASE,
+            operation="outbox_process_pending",
+            target_name=self._table_name,
+            correlation_id=correlation_id,
+        )
+
+        # Build queries - table name from trusted config, quoted for safety
+        table_quoted = quote_identifier(self._table_name)
+
+        # SELECT query with FOR UPDATE SKIP LOCKED for concurrent safety
+        # S608: Safe - table name from constructor, quoted via quote_identifier()
+        select_query = f"""
+            SELECT id, notification_data, retry_count
+            FROM {table_quoted}
+            WHERE processed_at IS NULL
+            ORDER BY created_at
+            LIMIT $1
+            FOR UPDATE SKIP LOCKED
+        """  # noqa: S608
+
+        # UPDATE queries
+        # S608: Safe - table name from constructor, quoted via quote_identifier()
+        update_success_query = f"""
+            UPDATE {table_quoted}
+            SET processed_at = NOW()
+            WHERE id = $1
+        """  # noqa: S608
+
+        update_failure_query = f"""
+            UPDATE {table_quoted}
+            SET retry_count = retry_count + 1, last_error = $2
+            WHERE id = $1
+        """  # noqa: S608
+
+        # S608: Safe - table name from constructor, quoted via quote_identifier()
+        update_dlq_query = f"""
+            UPDATE {table_quoted}
+            SET processed_at = NOW(), last_error = $2
+            WHERE id = $1
+        """  # noqa: S608
+
+        try:
+            async with self._pool.acquire() as conn:
+                # Wrap in transaction to maintain row locks from SELECT FOR UPDATE
+                # Without explicit transaction, locks are released immediately after SELECT
+                async with conn.transaction():
+                    # Fetch pending notifications
+                    rows = await conn.fetch(
+                        select_query,
+                        self._batch_size,
+                        timeout=self._query_timeout,
+                    )
+
+                    if not rows:
+                        return 0
+
+                    processed = 0
+
+                    for row in rows:
+                        row_id: int = row["id"]
+                        notification_data = row["notification_data"]
+                        row_retry_count: int = row["retry_count"]
+
+                        try:
+                            # Parse notification - asyncpg returns dict for JSONB columns
+                            if isinstance(notification_data, dict):
+                                notification = (
+                                    ModelStateTransitionNotification.model_validate(
+                                        notification_data
+                                    )
+                                )
+                            else:
+                                notification = ModelStateTransitionNotification.model_validate_json(
+                                    notification_data
+                                )
+
+                            # Check if notification should be moved to DLQ
+                            if self._should_move_to_dlq(row_retry_count):
+                                dlq_success = await self._move_to_dlq(
+                                    row_id=row_id,
+                                    notification=notification,
+                                    retry_count=row_retry_count,
+                                    conn=conn,
+                                    update_dlq_query=update_dlq_query,
+                                    correlation_id=correlation_id,
+                                )
+                                if dlq_success:
+                                    processed += (
+                                        1  # Count as processed since it's been handled
+                                    )
+                                # Skip normal publishing regardless - DLQ failures will retry
+                                continue
+
+                            # Publish notification
+                            await self._publisher.publish(notification)
+
+                            # Mark as processed
+                            await conn.execute(
+                                update_success_query,
+                                row_id,
+                                timeout=self._query_timeout,
+                            )
+
+                            processed += 1
+                            self._notifications_processed += 1
+
+                            logger.debug(
+                                "Notification published from outbox",
+                                extra={
+                                    "outbox_id": row_id,
+                                    "aggregate_type": notification.aggregate_type,
+                                    "aggregate_id": str(notification.aggregate_id),
+                                    "correlation_id": str(notification.correlation_id),
+                                },
+                            )
+
+                        except Exception as e:
+                            # Record failure but continue processing other notifications
+                            self._notifications_failed += 1
+                            error_message = sanitize_error_string(str(e))
+
+                            try:
+                                await conn.execute(
+                                    update_failure_query,
+                                    row_id,
+                                    error_message[
+                                        : self.MAX_ERROR_MESSAGE_LENGTH
+                                    ],  # Truncate for DB column
+                                    timeout=self._query_timeout,
+                                )
+                            except (asyncpg.PostgresError, TimeoutError) as update_err:
+                                # Log but continue - the outbox row will be retried
+                                logger.warning(
+                                    "Failed to record outbox failure, row will be retried",
+                                    extra={
+                                        "outbox_id": row_id,
+                                        "original_error": error_message,
+                                        "update_error": sanitize_error_string(
+                                            str(update_err)
+                                        ),
+                                        "update_error_type": type(update_err).__name__,
+                                        "correlation_id": str(correlation_id),
+                                    },
+                                )
+
+                            logger.warning(
+                                "Failed to publish notification from outbox",
+                                extra={
+                                    "outbox_id": row_id,
+                                    "error": error_message,
+                                    "error_type": type(e).__name__,
+                                    "correlation_id": str(correlation_id),
+                                },
+                            )
+
+                    return processed
+
+        except asyncpg.PostgresConnectionError as e:
+            raise InfraConnectionError(
+                f"Failed to connect for outbox processing: {self._table_name}",
+                context=ctx,
+            ) from e
+
+        except asyncpg.QueryCanceledError as e:
+            timeout_ctx = ModelTimeoutErrorContext(
+                transport_type=EnumInfraTransportType.DATABASE,
+                operation="outbox_process_pending",
+                target_name=self._table_name,
+                correlation_id=correlation_id,
+            )
+            raise InfraTimeoutError(
+                f"Timeout processing outbox: {self._table_name}",
+                context=timeout_ctx,
+            ) from e
+
+        except Exception as e:
+            raise RuntimeHostError(
+                f"Failed to process outbox: {type(e).__name__}",
+                context=ctx,
+            ) from e
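Because the batch is claimed with ``FOR UPDATE SKIP LOCKED`` inside an explicit transaction, concurrent calls to process_pending() claim disjoint rows: each SELECT skips rows already locked by another worker, so no row is published twice in the same cycle. A minimal sketch of that property, assuming an already-constructed outbox (the driver function is illustrative, not part of this package):

```python
# Illustrative sketch only. Two concurrent processing cycles against the
# same table: SKIP LOCKED makes the claimed batches disjoint, so each
# pending row is handled by at most one of the two calls.
import asyncio


async def drain_with_two_workers(outbox) -> int:
    count_a, count_b = await asyncio.gather(
        outbox.process_pending(),
        outbox.process_pending(),
    )
    return count_a + count_b
```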
+
+    async def start(self) -> None:
+        """Start the background processor.
+
+        Starts a background task that continuously processes pending
+        notifications from the outbox. The processor polls at the configured
+        interval when idle.
+
+        Idempotency:
+            Calling start() on an already-running processor is a no-op
+            with a warning log.
+
+        Example:
+            >>> outbox = TransitionNotificationOutbox(pool, publisher)
+            >>> await outbox.start()
+            >>> # Processor now running in background
+        """
+        async with self._lock:
+            # Check both _running flag and whether task exists and is not done
+            # This prevents starting a second loop if stop() is in progress
+            if self._running or (
+                self._processor_task is not None and not self._processor_task.done()
+            ):
+                logger.warning(
+                    "Outbox processor already running or task still active, ignoring start()",
+                    extra={"table_name": self._table_name},
+                )
+                return
+
+            self._shutdown_event.clear()
+            self._running = True
+            self._processor_task = asyncio.create_task(self._processor_loop())
+
+            logger.info(
+                "Outbox processor started",
+                extra={
+                    "table_name": self._table_name,
+                    "batch_size": self._batch_size,
+                    "poll_interval_seconds": self._poll_interval,
+                },
+            )
+
+    async def stop(self) -> None:
+        """Stop the background processor gracefully.
+
+        Signals the processor to stop and waits for any in-flight processing
+        to complete. After stop() returns, no more notifications will be
+        processed until start() is called again.
+
+        Idempotency:
+            Calling stop() on an already-stopped processor is a no-op.
+
+        Concurrency Safety:
+            The shutdown event is set and processor task captured while holding
+            the lock to prevent race conditions with concurrent start() calls.
+            The task is awaited outside the lock to avoid deadlock.
+
+        Example:
+            >>> await outbox.stop()
+            >>> # Processor stopped, safe to shutdown
+        """
+        # Capture task reference while holding lock to prevent race with start()
+        async with self._lock:
+            if not self._running:
+                logger.debug(
+                    "Outbox processor already stopped, ignoring stop()",
+                    extra={"table_name": self._table_name},
+                )
+                return
+
+            self._running = False
+            # Signal shutdown INSIDE lock to prevent race with start() clearing it
+            self._shutdown_event.set()
+            # Capture task reference INSIDE lock before releasing
+            processor_task = self._processor_task
+
+        # Wait for processor task to complete OUTSIDE lock to avoid deadlock
+        if processor_task is not None:
+            try:
+                await asyncio.wait_for(processor_task, timeout=self._shutdown_timeout)
+            except TimeoutError:
+                logger.warning(
+                    "Outbox processor did not complete within timeout, cancelling",
+                    extra={"table_name": self._table_name},
+                )
+                processor_task.cancel()
+                try:
+                    await processor_task
+                except asyncio.CancelledError:
+                    pass
+            except asyncio.CancelledError:
+                pass
+
+        # Clear task reference safely - only if it's still the same task
+        async with self._lock:
+            if self._processor_task is processor_task:
+                self._processor_task = None
+
+        logger.info(
+            "Outbox processor stopped",
+            extra={
+                "table_name": self._table_name,
+                "notifications_stored": self._notifications_stored,
+                "notifications_processed": self._notifications_processed,
+                "notifications_failed": self._notifications_failed,
+            },
+        )
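start() and stop() pair naturally with an application-scoped context. A minimal sketch of such a wrapper, assuming only the start()/stop() API shown above; the wrapper itself is illustrative, not part of this package:

```python
# Illustrative sketch only: guarantees stop() runs even if the guarded
# scope raises, using the graceful-shutdown path shown above.
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager

from omnibase_infra.runtime import TransitionNotificationOutbox


@asynccontextmanager
async def running_outbox(
    outbox: TransitionNotificationOutbox,
) -> AsyncIterator[TransitionNotificationOutbox]:
    await outbox.start()
    try:
        yield outbox
    finally:
        # Waits up to shutdown_timeout, then cancels the processor task.
        await outbox.stop()
```

Used as ``async with running_outbox(outbox):``, the processor's lifetime then tracks the enclosing scope.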
+
+    async def _processor_loop(self) -> None:
+        """Background loop that processes pending notifications.
+
+        This method runs continuously until stop() is called, processing
+        pending notifications in batches. When no notifications are pending,
+        it sleeps for the configured poll interval.
+
+        Error Handling:
+            Processing errors are logged but do not crash the loop. The
+            loop continues processing after errors to maintain availability.
+        """
+        logger.debug(
+            "Outbox processor loop started",
+            extra={"table_name": self._table_name},
+        )
+
+        try:
+            while not self._shutdown_event.is_set():
+                try:
+                    # Process pending notifications
+                    processed = await self.process_pending()
+
+                    # If no notifications processed, wait before polling again
+                    if processed == 0:
+                        try:
+                            await asyncio.wait_for(
+                                self._shutdown_event.wait(),
+                                timeout=self._poll_interval,
+                            )
+                            # Shutdown event was set - exit loop
+                            break
+                        except TimeoutError:
+                            # Poll interval elapsed - continue processing
+                            pass
+
+                except Exception as e:
+                    # Log error but continue processing
+                    logger.exception(
+                        "Error in outbox processor loop, continuing",
+                        extra={
+                            "table_name": self._table_name,
+                            "error": sanitize_error_string(str(e)),
+                            "error_type": type(e).__name__,
+                        },
+                    )
+                    # Wait before retrying after error
+                    try:
+                        await asyncio.wait_for(
+                            self._shutdown_event.wait(),
+                            timeout=self._poll_interval,
+                        )
+                        break
+                    except TimeoutError:
+                        pass
+
+        except asyncio.CancelledError:
+            logger.info(
+                "Outbox processor loop cancelled",
+                extra={"table_name": self._table_name},
+            )
+            raise
+
+        finally:
+            logger.debug(
+                "Outbox processor loop exiting",
+                extra={
+                    "table_name": self._table_name,
+                    "notifications_processed": self._notifications_processed,
+                },
+            )
+
+    def _should_move_to_dlq(self, retry_count: int) -> bool:
+        """Check if notification should be moved to DLQ.
+
+        Args:
+            retry_count: Current retry count for the notification.
+
+        Returns:
+            True if the notification should be moved to DLQ, False otherwise.
+        """
+        if self._max_retries is None or self._dlq_publisher is None:
+            return False
+        return retry_count >= self._max_retries
+
+    async def _move_to_dlq(
+        self,
+        row_id: int,
+        notification: ModelStateTransitionNotification,
+        retry_count: int,
+        conn: asyncpg.Connection,
+        update_dlq_query: str,
+        correlation_id: UUID,
+    ) -> bool:
+        """Move a notification to the dead letter queue.
+
+        Publishes the notification to the DLQ via the dlq_publisher, marks
+        the original record as processed with an error message, and updates
+        metrics.
+
+        Args:
+            row_id: Database row ID of the notification.
+            notification: The parsed notification to move to DLQ.
+            retry_count: Current retry count for the notification.
+            conn: Database connection for updates.
+            update_dlq_query: SQL query to mark notification as processed.
+            correlation_id: Correlation ID for logging.
+
+        Returns:
+            True if the notification was successfully moved to DLQ, False otherwise.
+
+        Note:
+            If DLQ publish fails, the notification is NOT marked as processed
+            and will be retried on the next processing cycle. This ensures
+            no data loss even if the DLQ is temporarily unavailable. The
+            retry_count is NOT incremented on DLQ failure since it already
+            exceeds max_retries.
+
+        Warning:
+            If the DLQ is **permanently** unavailable, this creates an infinite
+            retry loop for notifications exceeding max_retries. Monitor for
+            ``processed_at IS NULL AND retry_count >= max_retries`` to detect
+            this condition.
+        """
+        if self._dlq_publisher is None:
+            # Should not happen due to _should_move_to_dlq check, but defensive
+            return False
+
+        dlq_error_message = f"Moved to DLQ after {retry_count} retries"
+
+        try:
+            # Publish to DLQ
+            await self._dlq_publisher.publish(notification)
+
+            # Mark as processed with DLQ error message
+            await conn.execute(
+                update_dlq_query,
+                row_id,
+                dlq_error_message[: self.MAX_ERROR_MESSAGE_LENGTH],
+                timeout=self._query_timeout,
+            )
+
+            self._notifications_sent_to_dlq += 1
+            self._notifications_processed += 1  # DLQ-handled counts as processed
+
+            logger.warning(
+                "Notification moved to DLQ after exceeding max retries",
+                extra={
+                    "outbox_id": row_id,
+                    "aggregate_type": notification.aggregate_type,
+                    "aggregate_id": str(notification.aggregate_id),
+                    "correlation_id": str(notification.correlation_id),
+                    "retry_count": retry_count,
+                    "max_retries": self._max_retries,
+                    "dlq_topic": self._dlq_topic,
+                    "batch_correlation_id": str(correlation_id),
+                },
+            )
+
+            return True
+
+        except Exception as e:
+            # DLQ publish failed - do NOT mark as processed
+            # Notification will be retried on next cycle without incrementing retry_count
+            # WARNING: If DLQ is permanently unavailable, this creates infinite retries.
+            # Monitor: processed_at IS NULL AND retry_count >= max_retries
+            self._dlq_publish_failures += 1
+            error_message = sanitize_error_string(str(e))
+            logger.exception(
+                "Failed to publish notification to DLQ, will retry",
+                extra={
+                    "outbox_id": row_id,
+                    "aggregate_type": notification.aggregate_type,
+                    "aggregate_id": str(notification.aggregate_id),
+                    "correlation_id": str(notification.correlation_id),
+                    "retry_count": retry_count,
+                    "error": error_message,
+                    "error_type": type(e).__name__,
+                    "dlq_topic": self._dlq_topic,
+                    "batch_correlation_id": str(correlation_id),
+                },
+            )
+            return False
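The warning above gives a concrete predicate for detecting a stuck DLQ. A minimal monitoring sketch built on that predicate and the schema from the module docstring; the helper function is illustrative, not part of this package:

```python
# Illustrative sketch only: counts unprocessed rows that have already
# exhausted their retries - the exact condition the Warning above flags
# as a DLQ-availability signal requiring operator intervention.
import asyncpg


async def count_stuck_notifications(pool: asyncpg.Pool, max_retries: int) -> int:
    async with pool.acquire() as conn:
        return await conn.fetchval(
            """
            SELECT COUNT(*)
            FROM transition_notification_outbox
            WHERE processed_at IS NULL
              AND retry_count >= $1
            """,
            max_retries,
        )
```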
+
+    def get_metrics(self) -> ModelTransitionNotificationOutboxMetrics:
+        """Return current outbox metrics for observability.
+
+        Returns:
+            Typed metrics model containing:
+            - table_name: The outbox table name
+            - is_running: Whether processor is running
+            - notifications_stored: Total notifications stored
+            - notifications_processed: Total notifications successfully processed
+            - notifications_failed: Total notifications that failed processing
+            - notifications_sent_to_dlq: Total notifications moved to DLQ
+            - dlq_publish_failures: Count of failed DLQ publish attempts
+            - batch_size: Configured batch size
+            - poll_interval_seconds: Configured poll interval
+            - max_retries: Max retries before DLQ (None if DLQ disabled)
+            - dlq_topic: DLQ topic name (None if DLQ disabled)
+
+        Example:
+            >>> metrics = outbox.get_metrics()
+            >>> print(f"Processed: {metrics.notifications_processed}")
+            >>> print(f"Sent to DLQ: {metrics.notifications_sent_to_dlq}")
+            >>> if metrics.dlq_publish_failures > 0:
+            ...     print(f"WARNING: {metrics.dlq_publish_failures} DLQ failures")
+        """
+        return ModelTransitionNotificationOutboxMetrics(
+            table_name=self._table_name,
+            is_running=self._running,
+            notifications_stored=self._notifications_stored,
+            notifications_processed=self._notifications_processed,
+            notifications_failed=self._notifications_failed,
+            notifications_sent_to_dlq=self._notifications_sent_to_dlq,
+            dlq_publish_failures=self._dlq_publish_failures,
+            batch_size=self._batch_size,
+            poll_interval_seconds=self._poll_interval,
+            max_retries=self._max_retries,
+            dlq_topic=self._dlq_topic,
+        )
+
+    async def cleanup_processed(
+        self,
+        retention_days: int = 7,
+    ) -> int:
+        """Delete old processed notifications from outbox.
+
+        Removes processed notifications older than the specified retention
+        period to prevent table bloat. Should be called periodically via
+        cron or scheduled task.
+
+        Args:
+            retention_days: Number of days to retain processed records.
+                Must be >= 0. Default: 7 days.
+
+        Returns:
+            Count of deleted records.
+
+        Raises:
+            ProtocolConfigurationError: If retention_days is negative.
+            InfraConnectionError: If database connection fails.
+            InfraTimeoutError: If query times out.
+            RuntimeHostError: For other database errors.
+
+        Example:
+            >>> # Delete records processed more than 7 days ago
+            >>> deleted = await outbox.cleanup_processed(retention_days=7)
+            >>> print(f"Cleaned up {deleted} old records")
+            >>>
+            >>> # Delete all processed records immediately
+            >>> deleted = await outbox.cleanup_processed(retention_days=0)
+        """
+        correlation_id = UtilUUID.generate_correlation_id()
+        ctx = ModelInfraErrorContext(
+            transport_type=EnumInfraTransportType.DATABASE,
+            operation="outbox_cleanup",
+            target_name=self._table_name,
+            correlation_id=correlation_id,
+        )
+
+        if retention_days < 0:
+            raise ProtocolConfigurationError(
+                f"retention_days must be >= 0, got {retention_days}",
+                context=ctx,
+                parameter="retention_days",
+                value=retention_days,
+            )
+
+        table_quoted = quote_identifier(self._table_name)
+        # S608: Safe - table name from constructor, quoted via quote_identifier()
+        # retention_days passed as $1 parameter via make_interval()
+        query = f"""
+            DELETE FROM {table_quoted}
+            WHERE processed_at IS NOT NULL
+              AND processed_at < NOW() - make_interval(days => $1)
+        """  # noqa: S608
+
+        try:
+            async with self._pool.acquire() as conn:
+                result = await conn.execute(
+                    query,
+                    retention_days,
+                    timeout=self._query_timeout,
+                )
+                # Parse result like "DELETE 42"
+                deleted_count = int(result.split()[-1]) if result else 0
+
+                logger.info(
+                    "Cleaned up processed outbox records",
+                    extra={
+                        "table_name": self._table_name,
+                        "retention_days": retention_days,
+                        "deleted_count": deleted_count,
+                        "correlation_id": str(correlation_id),
+                    },
+                )
+
+                return deleted_count
+
+        except asyncpg.PostgresConnectionError as e:
+            raise InfraConnectionError(
+                f"Failed to cleanup outbox: {self._table_name}",
+                context=ctx,
+            ) from e
+
+        except asyncpg.QueryCanceledError as e:
+            timeout_ctx = ModelTimeoutErrorContext(
+                transport_type=EnumInfraTransportType.DATABASE,
+                operation="outbox_cleanup",
+                target_name=self._table_name,
+                correlation_id=correlation_id,
+            )
+            raise InfraTimeoutError(
+                f"Timeout cleaning up outbox: {self._table_name}",
+                context=timeout_ctx,
+            ) from e
+
+        except Exception as e:
+            raise RuntimeHostError(
+                f"Failed to cleanup outbox: {type(e).__name__}",
+                context=ctx,
+            ) from e
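cleanup_processed() is designed for periodic invocation ("via cron or scheduled task", per its docstring). A minimal in-process alternative, assuming only the API above; the scheduling loop is illustrative, not part of this package:

```python
# Illustrative sketch only: invokes cleanup_processed() once per day
# until the surrounding task is cancelled.
import asyncio
import logging

logger = logging.getLogger(__name__)


async def cleanup_outbox_daily(outbox, retention_days: int = 7) -> None:
    while True:
        try:
            deleted = await outbox.cleanup_processed(retention_days=retention_days)
            logger.info("Outbox cleanup deleted %d rows", deleted)
        except Exception:
            # cleanup_processed() raises typed infra errors; log and retry
            # on the next cycle rather than killing the scheduler.
            logger.exception("Outbox cleanup failed")
        await asyncio.sleep(86_400)  # 24 hours
```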
+
+
+__all__: list[str] = [
+    "TransitionNotificationOutbox",
+]