omnibase_infra 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/errors/__init__.py +4 -0
  3. omnibase_infra/errors/error_infra.py +60 -0
  4. omnibase_infra/handlers/__init__.py +3 -0
  5. omnibase_infra/handlers/handler_slack_webhook.py +426 -0
  6. omnibase_infra/handlers/models/__init__.py +14 -0
  7. omnibase_infra/handlers/models/enum_alert_severity.py +36 -0
  8. omnibase_infra/handlers/models/model_slack_alert.py +24 -0
  9. omnibase_infra/handlers/models/model_slack_alert_payload.py +77 -0
  10. omnibase_infra/handlers/models/model_slack_alert_result.py +73 -0
  11. omnibase_infra/mixins/mixin_node_introspection.py +42 -20
  12. omnibase_infra/models/discovery/model_dependency_spec.py +1 -0
  13. omnibase_infra/models/discovery/model_discovered_capabilities.py +1 -1
  14. omnibase_infra/models/discovery/model_introspection_config.py +28 -1
  15. omnibase_infra/models/discovery/model_introspection_performance_metrics.py +1 -0
  16. omnibase_infra/models/discovery/model_introspection_task_config.py +1 -0
  17. omnibase_infra/models/runtime/__init__.py +4 -0
  18. omnibase_infra/models/runtime/model_resolved_dependencies.py +116 -0
  19. omnibase_infra/nodes/contract_registry_reducer/contract.yaml +6 -5
  20. omnibase_infra/nodes/contract_registry_reducer/reducer.py +9 -26
  21. omnibase_infra/nodes/node_contract_persistence_effect/node.py +18 -1
  22. omnibase_infra/nodes/node_contract_persistence_effect/registry/registry_infra_contract_persistence_effect.py +33 -2
  23. omnibase_infra/nodes/node_registration_orchestrator/models/model_postgres_intent_payload.py +8 -12
  24. omnibase_infra/nodes/node_slack_alerter_effect/__init__.py +33 -0
  25. omnibase_infra/nodes/node_slack_alerter_effect/contract.yaml +291 -0
  26. omnibase_infra/nodes/node_slack_alerter_effect/node.py +106 -0
  27. omnibase_infra/runtime/__init__.py +7 -0
  28. omnibase_infra/runtime/baseline_subscriptions.py +13 -6
  29. omnibase_infra/runtime/contract_dependency_resolver.py +455 -0
  30. omnibase_infra/runtime/contract_registration_event_router.py +5 -5
  31. omnibase_infra/runtime/emit_daemon/event_registry.py +34 -22
  32. omnibase_infra/runtime/event_bus_subcontract_wiring.py +63 -23
  33. omnibase_infra/runtime/publisher_topic_scoped.py +16 -11
  34. omnibase_infra/runtime/registry_policy.py +29 -15
  35. omnibase_infra/runtime/request_response_wiring.py +15 -7
  36. omnibase_infra/runtime/service_runtime_host_process.py +149 -5
  37. omnibase_infra/runtime/util_version.py +5 -1
  38. omnibase_infra/schemas/schema_latency_baseline.sql +135 -0
  39. omnibase_infra/services/contract_publisher/config.py +4 -4
  40. omnibase_infra/services/contract_publisher/service.py +8 -5
  41. omnibase_infra/services/observability/injection_effectiveness/__init__.py +67 -0
  42. omnibase_infra/services/observability/injection_effectiveness/config.py +295 -0
  43. omnibase_infra/services/observability/injection_effectiveness/consumer.py +1461 -0
  44. omnibase_infra/services/observability/injection_effectiveness/models/__init__.py +32 -0
  45. omnibase_infra/services/observability/injection_effectiveness/models/model_agent_match.py +79 -0
  46. omnibase_infra/services/observability/injection_effectiveness/models/model_context_utilization.py +118 -0
  47. omnibase_infra/services/observability/injection_effectiveness/models/model_latency_breakdown.py +107 -0
  48. omnibase_infra/services/observability/injection_effectiveness/models/model_pattern_utilization.py +46 -0
  49. omnibase_infra/services/observability/injection_effectiveness/writer_postgres.py +596 -0
  50. omnibase_infra/utils/__init__.py +7 -0
  51. omnibase_infra/utils/util_db_error_context.py +292 -0
  52. omnibase_infra/validation/validation_exemptions.yaml +11 -0
  53. {omnibase_infra-0.3.2.dist-info → omnibase_infra-0.4.0.dist-info}/METADATA +2 -2
  54. {omnibase_infra-0.3.2.dist-info → omnibase_infra-0.4.0.dist-info}/RECORD +57 -36
  55. {omnibase_infra-0.3.2.dist-info → omnibase_infra-0.4.0.dist-info}/WHEEL +0 -0
  56. {omnibase_infra-0.3.2.dist-info → omnibase_infra-0.4.0.dist-info}/entry_points.txt +0 -0
  57. {omnibase_infra-0.3.2.dist-info → omnibase_infra-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,596 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2025 OmniNode Team
3
+ """PostgreSQL Writer for Injection Effectiveness Observability.
4
+
5
+ This module provides a PostgreSQL writer for persisting injection effectiveness
6
+ events consumed from Kafka. It handles batch upserts with idempotency
7
+ guarantees and circuit breaker resilience.
8
+
9
+ Design Decisions:
10
+ - Pool injection: asyncpg.Pool is injected, not created/managed
11
+ - Batch inserts: Uses executemany for efficient batch processing
12
+ - Idempotency: ON CONFLICT DO NOTHING/UPDATE per table contract
13
+ - Circuit breaker: MixinAsyncCircuitBreaker for resilience
14
+
15
+ Idempotency Contract:
16
+ | Table | Unique Key | Conflict Action |
17
+ |------------------------|------------------------------|-----------------|
18
+ | injection_effectiveness| session_id | DO UPDATE |
19
+ | latency_breakdowns | (session_id, prompt_id) | DO NOTHING |
20
+ | pattern_hit_rates | (pattern_id, utilization_method) | DO UPDATE (rolling avg) |
21
+
22
+ Related Tickets:
23
+ - OMN-1890: Store injection metrics with corrected schema
24
+
25
+ Example:
26
+ >>> import asyncpg
27
+ >>> from omnibase_infra.services.observability.injection_effectiveness.writer_postgres import (
28
+ ... WriterInjectionEffectivenessPostgres,
29
+ ... )
30
+ >>>
31
+ >>> pool = await asyncpg.create_pool(dsn="postgresql://...")
32
+ >>> writer = WriterInjectionEffectivenessPostgres(pool)
33
+ >>>
34
+ >>> # Write batch of context utilization events
35
+ >>> count = await writer.write_context_utilization(events)
36
+ >>> print(f"Wrote {count} context utilization events")
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import logging
42
+ from uuid import UUID
43
+
44
+ import asyncpg
45
+
46
+ from omnibase_core.types import JsonType
47
+ from omnibase_infra.enums import EnumInfraTransportType
48
+ from omnibase_infra.mixins import MixinAsyncCircuitBreaker
49
+ from omnibase_infra.services.observability.injection_effectiveness.models.model_agent_match import (
50
+ ModelAgentMatchEvent,
51
+ )
52
+ from omnibase_infra.services.observability.injection_effectiveness.models.model_context_utilization import (
53
+ ModelContextUtilizationEvent,
54
+ )
55
+ from omnibase_infra.services.observability.injection_effectiveness.models.model_latency_breakdown import (
56
+ ModelLatencyBreakdownEvent,
57
+ )
58
+ from omnibase_infra.utils.util_db_error_context import db_operation_error_context
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+
63
+ class WriterInjectionEffectivenessPostgres(MixinAsyncCircuitBreaker):
64
+ """PostgreSQL writer for injection effectiveness observability events.
65
+
66
+ Provides batch write methods for injection effectiveness tables with idempotency
67
+ guarantees and circuit breaker resilience. The asyncpg.Pool is injected
68
+ and its lifecycle is managed externally.
69
+
70
+ Features:
71
+ - Batch inserts/upserts via executemany for efficiency
72
+ - Idempotent writes via ON CONFLICT clauses
73
+ - Circuit breaker for database resilience
74
+ - Correlation ID propagation for tracing
75
+
76
+ Attributes:
77
+ _pool: Injected asyncpg connection pool.
78
+ circuit_breaker_threshold: Failure threshold before opening circuit.
79
+ circuit_breaker_reset_timeout: Seconds before auto-reset.
80
+ DEFAULT_QUERY_TIMEOUT_SECONDS: Default timeout for database queries (30s).
81
+ DEFAULT_MINIMUM_SUPPORT_THRESHOLD: Default minimum sample count for confidence (20).
82
+ DEFAULT_HIT_MISS_THRESHOLD: Default threshold for hit/miss classification (0.5).
83
+
84
+ Example:
85
+ >>> pool = await asyncpg.create_pool(dsn="postgresql://...")
86
+ >>> writer = WriterInjectionEffectivenessPostgres(
87
+ ... pool,
88
+ ... circuit_breaker_threshold=5,
89
+ ... circuit_breaker_reset_timeout=60.0,
90
+ ... circuit_breaker_half_open_successes=2,
91
+ ... query_timeout=30.0,
92
+ ... minimum_support_threshold=20, # samples needed before confidence
93
+ ... hit_miss_threshold=0.5, # score threshold for hit vs miss
94
+ ... )
95
+ >>>
96
+ >>> # Write batch of context utilization events
97
+ >>> count = await writer.write_context_utilization(events)
98
+ """
99
+
100
+ DEFAULT_QUERY_TIMEOUT_SECONDS: float = 30.0
101
+ DEFAULT_MINIMUM_SUPPORT_THRESHOLD: int = 20
102
+ DEFAULT_HIT_MISS_THRESHOLD: float = 0.5
103
+
104
+ def __init__(
105
+ self,
106
+ pool: asyncpg.Pool,
107
+ circuit_breaker_threshold: int = 5,
108
+ circuit_breaker_reset_timeout: float = 60.0,
109
+ circuit_breaker_half_open_successes: int = 1,
110
+ query_timeout: float | None = None,
111
+ minimum_support_threshold: int | None = None,
112
+ hit_miss_threshold: float | None = None,
113
+ ) -> None:
114
+ """Initialize the PostgreSQL writer with an injected pool.
115
+
116
+ Args:
117
+ pool: asyncpg connection pool (lifecycle managed externally).
118
+ circuit_breaker_threshold: Failures before opening circuit (default: 5).
119
+ circuit_breaker_reset_timeout: Seconds before auto-reset (default: 60.0).
120
+ circuit_breaker_half_open_successes: Successful requests required to close
121
+ circuit from half-open state (default: 1).
122
+ query_timeout: Timeout in seconds for database queries. Applied via
123
+ PostgreSQL statement_timeout (default: DEFAULT_QUERY_TIMEOUT_SECONDS).
124
+ minimum_support_threshold: Minimum sample count required before calculating
125
+ confidence score for pattern_hit_rates. This implements statistical
126
+ minimum support gating to avoid premature confidence scores based on
127
+ insufficient data (default: DEFAULT_MINIMUM_SUPPORT_THRESHOLD = 20).
128
+ hit_miss_threshold: Threshold for classifying pattern utilization as hit
129
+ vs miss. Scores > threshold count as hits, scores <= threshold count
130
+ as misses. This heuristic determines when a pattern injection was
131
+ "useful enough" to count as a hit (default: DEFAULT_HIT_MISS_THRESHOLD = 0.5).
132
+
133
+ Raises:
134
+ ProtocolConfigurationError: If circuit breaker parameters are invalid.
135
+ """
136
+ self._pool = pool
137
+ self._query_timeout = query_timeout or self.DEFAULT_QUERY_TIMEOUT_SECONDS
138
+ self._minimum_support_threshold = (
139
+ minimum_support_threshold
140
+ if minimum_support_threshold is not None
141
+ else self.DEFAULT_MINIMUM_SUPPORT_THRESHOLD
142
+ )
143
+ self._hit_miss_threshold = (
144
+ hit_miss_threshold
145
+ if hit_miss_threshold is not None
146
+ else self.DEFAULT_HIT_MISS_THRESHOLD
147
+ )
148
+
149
+ # Initialize circuit breaker mixin
150
+ self._init_circuit_breaker(
151
+ threshold=circuit_breaker_threshold,
152
+ reset_timeout=circuit_breaker_reset_timeout,
153
+ service_name="injection-effectiveness-postgres-writer",
154
+ transport_type=EnumInfraTransportType.DATABASE,
155
+ half_open_successes=circuit_breaker_half_open_successes,
156
+ )
157
+
158
+ logger.info(
159
+ "WriterInjectionEffectivenessPostgres initialized",
160
+ extra={
161
+ "circuit_breaker_threshold": circuit_breaker_threshold,
162
+ "circuit_breaker_reset_timeout": circuit_breaker_reset_timeout,
163
+ "circuit_breaker_half_open_successes": circuit_breaker_half_open_successes,
164
+ "query_timeout": self._query_timeout,
165
+ "minimum_support_threshold": self._minimum_support_threshold,
166
+ "hit_miss_threshold": self._hit_miss_threshold,
167
+ },
168
+ )
169
+
170
+ async def write_context_utilization(
171
+ self,
172
+ events: list[ModelContextUtilizationEvent],
173
+ correlation_id: UUID,
174
+ ) -> int:
175
+ """Write batch of context utilization events to PostgreSQL.
176
+
177
+ Performs two operations:
178
+ 1. UPSERT to injection_effectiveness table (session_id is primary key)
179
+ 2. INSERT to pattern_hit_rates table for each pattern (ON CONFLICT DO NOTHING)
180
+
181
+ Args:
182
+ events: List of context utilization events to write.
183
+ correlation_id: Correlation ID for tracing (required - models auto-generate).
184
+
185
+ Returns:
186
+ Count of events in the batch (executemany doesn't return affected rows).
187
+
188
+ Raises:
189
+ InfraConnectionError: If database connection fails.
190
+ InfraTimeoutError: If operation times out.
191
+ InfraUnavailableError: If circuit breaker is open.
192
+ """
193
+ if not events:
194
+ return 0
195
+
196
+ # Check circuit breaker before entering error context
197
+ async with self._circuit_breaker_lock:
198
+ await self._check_circuit_breaker(
199
+ operation="write_context_utilization",
200
+ correlation_id=correlation_id,
201
+ )
202
+
203
+ # SQL for injection_effectiveness upsert
204
+ sql_effectiveness = """
205
+ INSERT INTO injection_effectiveness (
206
+ session_id, correlation_id, cohort, cohort_identity_type,
207
+ total_injected_tokens, patterns_injected, utilization_score,
208
+ utilization_method, injected_identifiers_count, reused_identifiers_count,
209
+ created_at, updated_at
210
+ )
211
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, NOW())
212
+ ON CONFLICT (session_id) DO UPDATE SET
213
+ correlation_id = COALESCE(EXCLUDED.correlation_id, injection_effectiveness.correlation_id),
214
+ cohort = COALESCE(EXCLUDED.cohort, injection_effectiveness.cohort),
215
+ cohort_identity_type = COALESCE(EXCLUDED.cohort_identity_type, injection_effectiveness.cohort_identity_type),
216
+ total_injected_tokens = EXCLUDED.total_injected_tokens,
217
+ patterns_injected = EXCLUDED.patterns_injected,
218
+ utilization_score = EXCLUDED.utilization_score,
219
+ utilization_method = EXCLUDED.utilization_method,
220
+ injected_identifiers_count = EXCLUDED.injected_identifiers_count,
221
+ reused_identifiers_count = EXCLUDED.reused_identifiers_count,
222
+ updated_at = NOW()
223
+ """
224
+
225
+ # SQL for pattern_hit_rates upsert with rolling average
226
+ # Aggregates per-pattern statistics across all sessions
227
+ # Note: minimum_support_threshold is formatted into SQL since executemany
228
+ # doesn't support different parameter values per-position, and this is a
229
+ # controlled integer configuration value (not user input).
230
+ # Security: int() cast guarantees numeric-only output, preventing SQL injection.
231
+ min_support_str = str(int(self._minimum_support_threshold))
232
+ sql_patterns = """
233
+ INSERT INTO pattern_hit_rates (
234
+ pattern_id, utilization_method, utilization_score,
235
+ hit_count, miss_count, sample_count, created_at, updated_at
236
+ )
237
+ VALUES ($1, $2, $3, $4, $5, 1, NOW(), NOW())
238
+ ON CONFLICT (pattern_id, utilization_method) DO UPDATE SET
239
+ -- Rolling average: new_avg = ((old_avg * old_count) + new_score) / (old_count + 1)
240
+ utilization_score = (
241
+ (pattern_hit_rates.utilization_score * pattern_hit_rates.sample_count) + EXCLUDED.utilization_score
242
+ ) / (pattern_hit_rates.sample_count + 1),
243
+ hit_count = pattern_hit_rates.hit_count + EXCLUDED.hit_count,
244
+ miss_count = pattern_hit_rates.miss_count + EXCLUDED.miss_count,
245
+ sample_count = pattern_hit_rates.sample_count + 1,
246
+ -- Set confidence when sample_count >= minimum_support_threshold
247
+ -- (minimum support gating prevents premature confidence scores)
248
+ confidence = CASE
249
+ WHEN pattern_hit_rates.sample_count + 1 >= __MIN_SUPPORT__ THEN
250
+ (pattern_hit_rates.utilization_score * pattern_hit_rates.sample_count + EXCLUDED.utilization_score) / (pattern_hit_rates.sample_count + 1)
251
+ ELSE NULL
252
+ END,
253
+ updated_at = NOW()
254
+ """.replace("__MIN_SUPPORT__", min_support_str)
255
+
256
+ # Use shared error context for consistent exception handling
257
+ async with db_operation_error_context(
258
+ operation="write_context_utilization",
259
+ target_name="injection_effectiveness",
260
+ correlation_id=correlation_id,
261
+ timeout_seconds=self._query_timeout,
262
+ circuit_breaker=self,
263
+ ):
264
+ async with self._pool.acquire() as conn:
265
+ # Apply statement_timeout for query timeout enforcement
266
+ # Convert seconds to milliseconds for PostgreSQL
267
+ # Use parameterized query for defense in depth (even though int() cast
268
+ # already guarantees numeric output, parameterized is the preferred pattern)
269
+ timeout_ms = int(self._query_timeout * 1000)
270
+ await conn.execute("SET statement_timeout = $1", str(timeout_ms))
271
+
272
+ # Wrap both writes in an explicit transaction for atomicity.
273
+ # If pattern_hit_rates write fails after injection_effectiveness succeeds,
274
+ # both are rolled back to prevent partial data.
275
+ async with conn.transaction():
276
+ # Write to injection_effectiveness
277
+ await conn.executemany(
278
+ sql_effectiveness,
279
+ [
280
+ (
281
+ e.session_id,
282
+ e.correlation_id,
283
+ e.cohort,
284
+ e.cohort_identity_type,
285
+ e.total_injected_tokens,
286
+ e.patterns_injected,
287
+ e.utilization_score,
288
+ e.utilization_method,
289
+ e.injected_identifiers_count,
290
+ e.reused_identifiers_count,
291
+ e.created_at,
292
+ )
293
+ for e in events
294
+ ],
295
+ )
296
+
297
+ # Write pattern utilizations to pattern_hit_rates (aggregated per pattern)
298
+ #
299
+ # Hit/miss classification threshold rationale:
300
+ # The default threshold of 0.5 represents a "majority utility" heuristic:
301
+ # - hit (score > 0.5): More than half the injected pattern content was
302
+ # utilized by the model, indicating the injection was net-positive.
303
+ # - miss (score <= 0.5): Half or less was utilized, indicating the
304
+ # injection added noise/tokens without proportional benefit.
305
+ #
306
+ # This threshold is configurable via hit_miss_threshold parameter to
307
+ # accommodate different utilization measurement methods and use cases.
308
+ # For example, strict environments might use 0.7, while exploratory
309
+ # injections might tolerate 0.3.
310
+ #
311
+ # Classification is binary (hit=1/miss=1) to enable simple aggregate
312
+ # hit rate calculations: hit_rate = hit_count / (hit_count + miss_count)
313
+ pattern_rows = []
314
+ for e in events:
315
+ for p in e.pattern_utilizations:
316
+ hit_count = (
317
+ 1
318
+ if p.utilization_score > self._hit_miss_threshold
319
+ else 0
320
+ )
321
+ miss_count = (
322
+ 0
323
+ if p.utilization_score > self._hit_miss_threshold
324
+ else 1
325
+ )
326
+ pattern_rows.append(
327
+ (
328
+ p.pattern_id,
329
+ p.utilization_method,
330
+ p.utilization_score,
331
+ hit_count,
332
+ miss_count,
333
+ )
334
+ )
335
+
336
+ if pattern_rows:
337
+ await conn.executemany(sql_patterns, pattern_rows)
338
+
339
+ # Record success - reset circuit breaker after successful write
340
+ async with self._circuit_breaker_lock:
341
+ await self._reset_circuit_breaker()
342
+
343
+ logger.debug(
344
+ "Wrote context utilization batch",
345
+ extra={
346
+ "count": len(events),
347
+ "pattern_count": len(pattern_rows) if pattern_rows else 0,
348
+ "correlation_id": str(correlation_id),
349
+ },
350
+ )
351
+ return len(events)
352
+
353
+ async def write_agent_match(
354
+ self,
355
+ events: list[ModelAgentMatchEvent],
356
+ correlation_id: UUID,
357
+ ) -> int:
358
+ """Write batch of agent match events to PostgreSQL.
359
+
360
+ UPSERT to injection_effectiveness table, merging with existing session data.
361
+ Only updates agent match fields (agent_match_score, expected_agent, actual_agent).
362
+
363
+ Args:
364
+ events: List of agent match events to write.
365
+ correlation_id: Correlation ID for tracing (required - models auto-generate).
366
+
367
+ Returns:
368
+ Count of events in the batch.
369
+
370
+ Raises:
371
+ InfraConnectionError: If database connection fails.
372
+ InfraTimeoutError: If operation times out.
373
+ InfraUnavailableError: If circuit breaker is open.
374
+ """
375
+ if not events:
376
+ return 0
377
+
378
+ # Check circuit breaker before entering error context
379
+ async with self._circuit_breaker_lock:
380
+ await self._check_circuit_breaker(
381
+ operation="write_agent_match",
382
+ correlation_id=correlation_id,
383
+ )
384
+
385
+ sql = """
386
+ INSERT INTO injection_effectiveness (
387
+ session_id, correlation_id, agent_match_score, expected_agent,
388
+ actual_agent, created_at, updated_at
389
+ )
390
+ VALUES ($1, $2, $3, $4, $5, $6, NOW())
391
+ ON CONFLICT (session_id) DO UPDATE SET
392
+ correlation_id = COALESCE(EXCLUDED.correlation_id, injection_effectiveness.correlation_id),
393
+ agent_match_score = EXCLUDED.agent_match_score,
394
+ expected_agent = EXCLUDED.expected_agent,
395
+ actual_agent = EXCLUDED.actual_agent,
396
+ updated_at = NOW()
397
+ """
398
+
399
+ # Use shared error context for consistent exception handling
400
+ async with db_operation_error_context(
401
+ operation="write_agent_match",
402
+ target_name="injection_effectiveness",
403
+ correlation_id=correlation_id,
404
+ timeout_seconds=self._query_timeout,
405
+ circuit_breaker=self,
406
+ ):
407
+ async with self._pool.acquire() as conn:
408
+ # Apply statement_timeout for query timeout enforcement
409
+ # Use parameterized query for defense in depth
410
+ timeout_ms = int(self._query_timeout * 1000)
411
+ await conn.execute("SET statement_timeout = $1", str(timeout_ms))
412
+
413
+ await conn.executemany(
414
+ sql,
415
+ [
416
+ (
417
+ e.session_id,
418
+ e.correlation_id,
419
+ e.agent_match_score,
420
+ e.expected_agent,
421
+ e.actual_agent,
422
+ e.created_at,
423
+ )
424
+ for e in events
425
+ ],
426
+ )
427
+
428
+ # Record success - reset circuit breaker after successful write
429
+ async with self._circuit_breaker_lock:
430
+ await self._reset_circuit_breaker()
431
+
432
+ logger.debug(
433
+ "Wrote agent match batch",
434
+ extra={
435
+ "count": len(events),
436
+ "correlation_id": str(correlation_id),
437
+ },
438
+ )
439
+ return len(events)
440
+
441
+ async def write_latency_breakdowns(
442
+ self,
443
+ events: list[ModelLatencyBreakdownEvent],
444
+ correlation_id: UUID,
445
+ ) -> int:
446
+ """Write batch of latency breakdown events to PostgreSQL.
447
+
448
+ Performs two operations (order matters for FK constraint):
449
+ 1. UPSERT to injection_effectiveness table (creates parent row if needed)
450
+ 2. INSERT to latency_breakdowns table (ON CONFLICT DO NOTHING)
451
+
452
+ Args:
453
+ events: List of latency breakdown events to write.
454
+ correlation_id: Correlation ID for tracing (required - models auto-generate).
455
+
456
+ Returns:
457
+ Count of events in the batch.
458
+
459
+ Raises:
460
+ InfraConnectionError: If database connection fails.
461
+ InfraTimeoutError: If operation times out.
462
+ InfraUnavailableError: If circuit breaker is open.
463
+ """
464
+ if not events:
465
+ return 0
466
+
467
+ # Check circuit breaker before entering error context
468
+ async with self._circuit_breaker_lock:
469
+ await self._check_circuit_breaker(
470
+ operation="write_latency_breakdowns",
471
+ correlation_id=correlation_id,
472
+ )
473
+
474
+ # SQL for latency_breakdowns insert
475
+ sql_breakdowns = """
476
+ INSERT INTO latency_breakdowns (
477
+ session_id, prompt_id, cohort, cache_hit,
478
+ routing_latency_ms, retrieval_latency_ms, injection_latency_ms,
479
+ user_latency_ms, emitted_at, created_at
480
+ )
481
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, NOW())
482
+ ON CONFLICT (session_id, prompt_id) DO NOTHING
483
+ """
484
+
485
+ # SQL for injection_effectiveness upsert (MAX aggregation for user_visible_latency_ms)
486
+ sql_effectiveness = """
487
+ INSERT INTO injection_effectiveness (
488
+ session_id, correlation_id, cohort, user_visible_latency_ms,
489
+ created_at, updated_at
490
+ )
491
+ VALUES ($1, $2, $3, $4, $5, NOW())
492
+ ON CONFLICT (session_id) DO UPDATE SET
493
+ correlation_id = COALESCE(EXCLUDED.correlation_id, injection_effectiveness.correlation_id),
494
+ cohort = COALESCE(EXCLUDED.cohort, injection_effectiveness.cohort),
495
+ user_visible_latency_ms = GREATEST(
496
+ COALESCE(injection_effectiveness.user_visible_latency_ms, 0),
497
+ EXCLUDED.user_visible_latency_ms
498
+ ),
499
+ updated_at = NOW()
500
+ """
501
+
502
+ # Use shared error context for consistent exception handling
503
+ async with db_operation_error_context(
504
+ operation="write_latency_breakdowns",
505
+ target_name="latency_breakdowns",
506
+ correlation_id=correlation_id,
507
+ timeout_seconds=self._query_timeout,
508
+ circuit_breaker=self,
509
+ ):
510
+ async with self._pool.acquire() as conn:
511
+ # Apply statement_timeout for query timeout enforcement
512
+ # Use parameterized query for defense in depth
513
+ timeout_ms = int(self._query_timeout * 1000)
514
+ await conn.execute("SET statement_timeout = $1", str(timeout_ms))
515
+
516
+ # IMPORTANT: Upsert to injection_effectiveness FIRST to satisfy FK constraint
517
+ # If latency event arrives before utilization/agent-match events, we need
518
+ # the parent row to exist before inserting the child row.
519
+
520
+ # Compute MAX user_latency_ms per session for the batch
521
+ session_latencies: dict[
522
+ UUID, tuple[int, ModelLatencyBreakdownEvent]
523
+ ] = {}
524
+ for e in events:
525
+ if e.session_id not in session_latencies:
526
+ session_latencies[e.session_id] = (e.user_latency_ms, e)
527
+ else:
528
+ existing_latency, _ = session_latencies[e.session_id]
529
+ if e.user_latency_ms > existing_latency:
530
+ session_latencies[e.session_id] = (e.user_latency_ms, e)
531
+
532
+ # Wrap both writes in an explicit transaction for atomicity.
533
+ # If latency_breakdowns insert fails after injection_effectiveness upsert,
534
+ # both are rolled back to prevent partial data.
535
+ async with conn.transaction():
536
+ # 1. First: Upsert to injection_effectiveness (creates parent row if needed)
537
+ await conn.executemany(
538
+ sql_effectiveness,
539
+ [
540
+ (
541
+ session_id,
542
+ event.correlation_id,
543
+ event.cohort,
544
+ max_latency,
545
+ event.created_at,
546
+ )
547
+ for session_id, (
548
+ max_latency,
549
+ event,
550
+ ) in session_latencies.items()
551
+ ],
552
+ )
553
+
554
+ # 2. Then: Insert to latency_breakdowns (FK now satisfied)
555
+ await conn.executemany(
556
+ sql_breakdowns,
557
+ [
558
+ (
559
+ e.session_id,
560
+ e.prompt_id,
561
+ e.cohort,
562
+ e.cache_hit,
563
+ e.routing_latency_ms,
564
+ e.retrieval_latency_ms,
565
+ e.injection_latency_ms,
566
+ e.user_latency_ms,
567
+ e.emitted_at,
568
+ )
569
+ for e in events
570
+ ],
571
+ )
572
+
573
+ # Record success - reset circuit breaker after successful write
574
+ async with self._circuit_breaker_lock:
575
+ await self._reset_circuit_breaker()
576
+
577
+ logger.debug(
578
+ "Wrote latency breakdowns batch",
579
+ extra={
580
+ "count": len(events),
581
+ "sessions_updated": len(session_latencies),
582
+ "correlation_id": str(correlation_id),
583
+ },
584
+ )
585
+ return len(events)
586
+
587
+ def get_circuit_breaker_state(self) -> dict[str, JsonType]:
588
+ """Return current circuit breaker state for health checks.
589
+
590
+ Returns:
591
+ Dict containing circuit breaker state information.
592
+ """
593
+ return self._get_circuit_breaker_state()
594
+
595
+
596
+ __all__ = ["WriterInjectionEffectivenessPostgres"]
@@ -7,6 +7,7 @@ This package provides common utilities used across the infrastructure:
7
7
  - util_atomic_file: Atomic file write primitives using temp-file-rename pattern
8
8
  - util_consumer_group: Kafka consumer group ID generation with deterministic hashing
9
9
  - util_datetime: Datetime validation and timezone normalization
10
+ - util_db_error_context: Database operation error handling context manager
10
11
  - util_db_transaction: Database transaction context manager for asyncpg
11
12
  - util_dsn_validation: PostgreSQL DSN validation and sanitization
12
13
  - util_env_parsing: Type-safe environment variable parsing with validation
@@ -38,6 +39,10 @@ from omnibase_infra.utils.util_datetime import (
38
39
  validate_timezone_aware_with_context,
39
40
  warn_if_naive_datetime,
40
41
  )
42
+
43
+ # Note: util_db_error_context is NOT imported here to avoid circular imports.
44
+ # Import directly: from omnibase_infra.utils.util_db_error_context import db_operation_error_context
45
+ # See: omnibase_infra.errors -> util_error_sanitization -> utils.__init__ -> util_db_error_context -> errors
41
46
  from omnibase_infra.utils.util_db_transaction import (
42
47
  transaction_context,
43
48
  )
@@ -80,6 +85,8 @@ __all__: list[str] = [
80
85
  "CorrelationContext",
81
86
  "KAFKA_CONSUMER_GROUP_MAX_LENGTH",
82
87
  "OptimisticConflictError",
88
+ # Note: ProtocolCircuitBreakerFailureRecorder and db_operation_error_context are NOT exported
89
+ # here to avoid circular imports. Import directly from util_db_error_context.
83
90
  "SAFE_ERROR_PATTERNS",
84
91
  "SEMVER_PATTERN",
85
92
  "SENSITIVE_PATTERNS",