policyengine-observability 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1784 @@
1
+ from __future__ import annotations
2
+
3
+ import inspect
4
+ import json
5
+ import logging
6
+ import sys
7
+ import threading
8
+ import time
9
+ import traceback
10
+ from collections.abc import AsyncIterator, Iterator
11
+ from contextlib import asynccontextmanager, contextmanager
12
+ from contextvars import ContextVar
13
+ from datetime import UTC, datetime
14
+ from enum import Enum
15
+ from functools import wraps
16
+ from typing import Any
17
+
18
+ from .config import ObservabilityConfig
19
+ from .context import (
20
+ ErrorRecord,
21
+ OperationObservabilityContext,
22
+ RequestObservabilityContext,
23
+ _metric_attrs,
24
+ )
25
+ from .logging import configure_plain_logger
26
+ from .segments import coerce_segment_name
27
+
28
# Header name constants.
OBSERVABILITY_INTERNAL_DISPATCH_HEADER = "X-PolicyEngine-Internal-Dispatch"
REQUEST_ID_HEADER = "X-PolicyEngine-Request-Id"
TRACEPARENT_HEADER = "traceparent"

# Logger names, one per family of emitted records.
REQUEST_LOGGER_NAME = "policyengine_observability.requests"
OPERATION_LOGGER_NAME = "policyengine_observability.operations"
EVENT_LOGGER_NAME = "policyengine_observability.events"
INTERNAL_LOGGER_NAME = "policyengine_observability.internal"

# Logger objects resolved once at import time, in the same order as the names.
REQUEST_LOGGER, OPERATION_LOGGER, EVENT_LOGGER, INTERNAL_LOGGER = (
    logging.getLogger(logger_name)
    for logger_name in (
        REQUEST_LOGGER_NAME,
        OPERATION_LOGGER_NAME,
        EVENT_LOGGER_NAME,
        INTERNAL_LOGGER_NAME,
    )
)

# Context-local state. Every variable defaults to None until something is bound.
_REQUEST_CONTEXT: ContextVar[RequestObservabilityContext | None] = ContextVar(
    "policyengine_request_observability_context", default=None
)
_OPERATION_CONTEXT: ContextVar[OperationObservabilityContext | None] = ContextVar(
    "policyengine_operation_observability_context", default=None
)
_TIMINGS: ContextVar[dict[str, float] | None] = ContextVar(
    "policyengine_observability_timings", default=None
)
_TURN_START: ContextVar[float | None] = ContextVar(
    "policyengine_observability_turn_start", default=None
)
60
+
61
+
62
class _NoOpInstrument:
    """Placeholder instrument whose ``add`` and ``record`` ignore every call."""

    def add(self, *_args, **_kwargs) -> None:
        """Accept any arguments and do nothing."""

    def record(self, *_args, **_kwargs) -> None:
        """Accept any arguments and do nothing."""
68
+
69
+
70
+ class ObservabilityRuntime:
71
def __init__(
    self,
    config: ObservabilityConfig,
    *,
    segment_registry: type[Enum] | None = None,
) -> None:
    """Store the configuration and start with all telemetry hooks inert.

    OpenTelemetry handles begin as ``None`` and every metric instrument
    begins as its own ``_NoOpInstrument``.
    """
    self.config = config
    self.segment_registry = segment_registry
    self.enabled = config.enabled

    # OpenTelemetry modules, classes, providers and handles: all unset.
    self.trace = self.propagate = None
    self.SpanKind = self.Status = self.StatusCode = None
    self.tracer_provider = self.meter_provider = None
    self.tracer = self.meter = None

    # Duration instruments.
    self.operation_duration = _NoOpInstrument()
    self.http_duration = _NoOpInstrument()
    self.segment_duration = _NoOpInstrument()
    self.calculate_duration = _NoOpInstrument()
    self.backend_duration = _NoOpInstrument()

    # Counting instruments.
    self.operations = _NoOpInstrument()
    self.requests = _NoOpInstrument()
    self.errors = _NoOpInstrument()
    self.rate_limited = _NoOpInstrument()
    self.failover_events = _NoOpInstrument()
    self.active_requests = _NoOpInstrument()

    self._httpx_instrumented = False
101
+
102
@classmethod
def disabled(cls) -> ObservabilityRuntime:
    """Build a runtime from a config constructed with ``enabled=False``."""
    off_config = ObservabilityConfig(enabled=False)
    return cls(off_config)
105
+
106
def configure(self) -> None:
    """Configure the loggers, then OpenTelemetry when it is switched on.

    OpenTelemetry setup runs only when both the runtime and
    ``config.otel_enabled`` are truthy; httpx instrumentation additionally
    requires ``config.instrument_httpx``.
    """
    self._configure_loggers()
    if self.enabled and self.config.otel_enabled:
        self._configure_otel()
        if self.config.instrument_httpx:
            self.instrument_httpx()
113
+
114
def current_context(self) -> RequestObservabilityContext | None:
    """Return the request context bound to the current execution context.

    Yields ``None`` when nothing is bound. If the lookup raises, the failure
    is reported through ``log_observability_failure`` and ``None`` is returned.
    """
    found = None
    try:
        found = _REQUEST_CONTEXT.get()
    except BaseException as lookup_error:
        self.log_observability_failure("context.current", lookup_error)
    return found
120
+
121
def current_operation(self) -> OperationObservabilityContext | None:
    """Return the operation context bound to the current execution context.

    Yields ``None`` when nothing is bound. If the lookup raises, the failure
    is reported through ``log_observability_failure`` and ``None`` is returned.
    """
    found = None
    try:
        found = _OPERATION_CONTEXT.get()
    except BaseException as lookup_error:
        self.log_observability_failure("operation.current", lookup_error)
    return found
129
+
130
def operation(self, name: str, *, flavor: str | None = None, **attrs: Any):
    """Return an ``_OperationManager`` bound to this runtime for ``name``.

    Extra keyword arguments are forwarded to the manager as its ``attrs``
    mapping.
    """
    manager = _OperationManager(self, name, flavor=flavor, attrs=attrs)
    return manager
138
+
139
def entrypoint(
    self,
    name: str | None = None,
    *,
    flavor: str | None = None,
    **attrs: Any,
):
    """Return a decorator that wraps a callable with ``self.operation``.

    When ``name`` is falsy the operation is named after the callable's
    ``__name__``, falling back to ``"operation"`` if it has none.
    """

    def decorator(func):
        if name:
            operation_name = name
        else:
            operation_name = getattr(func, "__name__", "operation")
        wrap = self.operation(operation_name, flavor=flavor, **attrs)
        return wrap(func)

    return decorator
155
+
156
+ def start_operation(
157
+ self,
158
+ name: str,
159
+ *,
160
+ flavor: str | None = None,
161
+ parent_context: Any = None,
162
+ timings: dict[str, float] | None = None,
163
+ emit_log: bool = True,
164
+ record_metric: bool = True,
165
+ **attrs: Any,
166
+ ) -> dict[str, Any]:
# Begin an operation and return a handle dict meant for end_operation.
# The handle always carries the five keys built below. While the runtime is
# disabled it is returned with every value still None.
# Any exception raised while starting is passed to log_observability_failure
# under "operation.start"; the handle (possibly partly filled) is returned
# regardless.
167
+ handle = {
168
+ "operation": None,
169
+ "operation_token": None,
170
+ "timings_token": None,
171
+ "start_token": None,
172
+ "context_token": None,
173
+ }
174
+ if not self.enabled:
175
+ return handle
176
+ try:
177
+ operation = OperationObservabilityContext(
178
+ config=self.config,
179
+ name=self._safe_str(name),
180
+ flavor=flavor,
# Keyword attributes whose value is None are dropped.
181
+ attributes={
182
+ key: value
183
+ for key, value in attrs.items()
184
+ if value is not None
185
+ },
186
+ timings_ms={},
187
+ emit_log=emit_log,
188
+ record_metric=record_metric,
189
+ )
# ContextVar tokens are stored in the handle; end_operation resets the
# variables from the "operation_token", "timings_token" and "start_token" keys.
190
+ operation.context_token = _OPERATION_CONTEXT.set(operation)
191
+ handle["operation"] = operation
192
+ handle["operation_token"] = operation.context_token
# NOTE(review): indentation is lost in this rendering - confirm whether the
# start_token assignment below belongs inside the "timings is not None" branch.
193
+ if timings is not None:
194
+ handle["timings_token"] = _TIMINGS.set(timings)
195
+ handle["start_token"] = _TURN_START.set(time.perf_counter())
# A parent OpenTelemetry context is attached only when a tracer exists; an
# attach failure is logged as "operation.context_attach" and start-up continues.
196
+ if parent_context is not None and self.tracer is not None:
197
+ try:
198
+ from opentelemetry import context as otel_context
199
+
200
+ handle["context_token"] = otel_context.attach(
201
+ parent_context
202
+ )
203
+ except BaseException as exc:
204
+ self.log_observability_failure(
205
+ "operation.context_attach",
206
+ exc,
207
+ )
# The span handle is kept on the operation object, not in the handle dict.
208
+ if self.tracer is not None:
209
+ operation.span_handle = self._start_span(
210
+ self._span_name(operation.name),
211
+ operation.span_attributes(),
212
+ )
213
+ except BaseException as exc:
214
+ self.log_observability_failure("operation.start", exc, name=name)
215
+ return handle
216
+
217
def end_operation(
    self,
    handle: dict[str, Any] | None,
    error: BaseException | None = None,
) -> None:
    """Finish the operation described by ``handle`` and restore context state.

    Args:
        handle: Dict produced by ``start_operation``. A falsy handle is a
            no-op.
        error: Exception that ended the operation, if any. It is stored on
            the operation as an unhandled ``ErrorRecord`` and counted in the
            error metric.

    Failures inside this method are reported through
    ``log_observability_failure`` and never raised to the caller.
    """
    if not handle:
        return
    operation = handle.get("operation")
    try:
        if operation is not None:
            try:
                if error is not None:
                    operation.error = ErrorRecord(
                        type=type(error).__name__,
                        message=self._safe_str(error),
                        handled=False,
                        stack=self._safe_traceback(error),
                    )
                    self.record_error_metric(
                        operation.metric_attributes(
                            error_type=type(error).__name__
                        )
                    )
                self.complete_operation(operation)
            except BaseException as exc:
                self.log_observability_failure("operation.end", exc)
            finally:
                # Close the span even when recording or completion failed;
                # previously such a failure skipped this call and left the
                # span open.
                self._end_span(operation.span_handle, error)
    except BaseException as exc:
        self.log_observability_failure("operation.end", exc)
    finally:
        context_token = handle.get("context_token")
        if context_token is not None:
            try:
                from opentelemetry import context as otel_context

                otel_context.detach(context_token)
            except BaseException as exc:
                self.log_observability_failure(
                    "operation.context_detach",
                    exc,
                )
        # Reset each context variable independently so one failed reset
        # does not prevent the others.
        for var, key in (
            (_TIMINGS, "timings_token"),
            (_TURN_START, "start_token"),
            (_OPERATION_CONTEXT, "operation_token"),
        ):
            token = handle.get(key)
            if token is not None:
                try:
                    var.reset(token)
                except BaseException as exc:
                    self.log_observability_failure(
                        "operation.context_reset",
                        exc,
                        token=key,
                    )
271
+
272
def complete_operation(
    self,
    operation: OperationObservabilityContext,
) -> None:
    """Record the operation's metric and log once per operation.

    ``operation.metric_recorded`` acts as the once-only guard. The metric is
    recorded only if ``operation.record_metric`` is set and the log is
    emitted only if ``operation.emit_log`` is set.
    """
    already_done = operation.metric_recorded
    if not already_done:
        operation.metric_recorded = True
        if operation.record_metric:
            elapsed = operation.duration_seconds()
            self.record_operation_metric(elapsed, operation.metric_attributes())
        if operation.emit_log:
            self.emit_operation_log(operation)
286
+
287
def begin_request(
    self,
    context: RequestObservabilityContext,
    *,
    carrier: Any = None,
) -> None:
    """Bind ``context`` as the current request and start its telemetry.

    Sets the request context variable, stores the ``endpoint`` attribute,
    begins the request's operation and span, and adds one to the
    active-request instrument. Does nothing while the runtime is disabled.
    Exceptions are reported as ``"request.begin"`` instead of being raised.
    """
    if self.enabled:
        try:
            context.context_token = _REQUEST_CONTEXT.set(context)
            context.set_attribute("endpoint", context.endpoint)
            self._begin_request_operation(context)
            self._start_request_span(context, carrier=carrier)
            self.record_active_request(1, context.metric_attributes())
        except BaseException as begin_error:
            self.log_observability_failure("request.begin", begin_error)
303
+
304
def _begin_request_operation(
    self,
    context: RequestObservabilityContext,
) -> None:
    """Create and bind the operation context that accompanies a request.

    The operation is named after the request route, uses flavor ``"http"``,
    shares the request's ``timings_ms`` mapping, and is created with
    ``emit_log`` and ``record_metric`` both off. Failures are reported as
    ``"request.operation_begin"`` together with the request id when available.
    """
    try:
        request_operation = OperationObservabilityContext(
            config=context.config,
            name=context.route,
            flavor="http",
            attributes=dict(
                route=context.route,
                method=context.method,
                endpoint=context.endpoint,
                path=context.path,
            ),
            timings_ms=context.timings_ms,
            emit_log=False,
            record_metric=False,
        )
        request_operation.context_token = _OPERATION_CONTEXT.set(
            request_operation
        )
        context.operation_context = request_operation
        context.operation_token = request_operation.context_token
    except BaseException as begin_error:
        self.log_observability_failure(
            "request.operation_begin",
            begin_error,
            request_id=getattr(context, "request_id", None),
        )
332
+
333
def finish_request(self, status_code: int) -> dict[str, str]:
    """Prepare the response headers, then complete the request.

    Returns the headers produced by ``prepare_response``.
    """
    response_headers = self.prepare_response(status_code)
    self.complete_request(status_code)
    return response_headers
337
+
338
def prepare_response(self, status_code: int) -> dict[str, str]:
    """Record the status code on the request and build response headers.

    Returns an empty dict when the runtime is disabled or no request context
    is current. Otherwise the result carries the request id header and, when
    available, the traceparent header. A 429 status also marks the request
    with ``rate_limited=True``. Exceptions are reported as
    ``"request.prepare_response"`` and whatever headers were built so far
    are returned.
    """
    if not self.enabled:
        return {}
    headers: dict[str, str] = {}
    try:
        context = self.current_context()
        if context is not None:
            context.status_code = status_code
            self._set_current_span_attributes(context.span_attributes())
            request_operation = context.operation_context
            if request_operation is not None:
                request_operation.set_attribute("status_code", str(status_code))
            headers[REQUEST_ID_HEADER] = context.request_id
            traceparent = self.traceparent_header()
            if traceparent:
                headers[TRACEPARENT_HEADER] = traceparent
            if status_code == 429:
                context.set_attribute("rate_limited", True)
    except BaseException as prepare_error:
        self.log_observability_failure(
            "request.prepare_response", prepare_error
        )
    return headers
363
+
364
+ def complete_request(self, status_code: int | None = None) -> None:
# Record the current request's metrics at most once.
# The request_metric_recorded flag on the context guards everything after it:
# the rate-limited counter (status 429), the duration/count metric, and the
# call to _close_active_request.
# Returns without doing anything when the runtime is disabled or no request
# context is current. Exceptions are passed to log_observability_failure
# under "request.complete" rather than raised.
365
+ if not self.enabled:
366
+ return
367
+ try:
368
+ context = self.current_context()
369
+ if context is None:
370
+ return
# NOTE(review): indentation is lost in this rendering - confirm whether the
# span-attribute refresh below is conditional on status_code being given.
371
+ if status_code is not None:
372
+ context.status_code = status_code
373
+ self._set_current_span_attributes(context.span_attributes())
374
+ if context.request_metric_recorded:
375
+ return
376
+ context.request_metric_recorded = True
377
+ if context.status_code == 429:
378
+ self.record_rate_limited_metric(context.metric_attributes())
379
+ self.record_request_metric(
380
+ context.duration_seconds(),
381
+ context.metric_attributes(),
382
+ )
383
+ self._close_active_request(context)
384
+ except BaseException as exc:
385
+ self.log_observability_failure("request.complete", exc)
386
+
387
def update_request_route(
    self,
    *,
    route: str | None = None,
    endpoint: str | None = None,
) -> None:
    """Update the current request's route and/or endpoint.

    The new values are copied to the request's operation context and span
    attributes, and the server span is renamed when it offers
    ``update_name``. When the route actually changes while the active-request
    count is still open, the count is moved from the old metric attributes to
    the new ones. Exceptions are reported as ``"request.update_route"``.
    """
    if not self.enabled:
        return
    try:
        context = self.current_context()
        if context is None:
            return

        # Capture the attributes the active-request count was recorded
        # under, before the route is overwritten.
        previous_attrs = None
        if route and route != context.route and not context.active_closed:
            previous_attrs = context.metric_attributes()

        if route:
            context.route = route
            request_operation = context.operation_context
            if request_operation is not None:
                request_operation.name = route
                request_operation.set_attribute("route", route)

        if endpoint:
            context.endpoint = endpoint
            context.set_attribute("endpoint", endpoint)
            request_operation = context.operation_context
            if request_operation is not None:
                request_operation.set_attribute("endpoint", endpoint)

        self._set_current_span_attributes(context.span_attributes())
        rename_span = getattr(context.server_span, "update_name", None)
        if route and rename_span is not None:
            rename_span(route)

        if previous_attrs is not None:
            self.record_active_request(-1, previous_attrs)
            self.record_active_request(1, context.metric_attributes())
    except BaseException as update_error:
        self.log_observability_failure("request.update_route", update_error)
428
+
429
def teardown_request(self, exc: BaseException | None = None) -> None:
    """Finish the current request and unbind its context.

    When ``exc`` is given it is recorded as an unhandled error, using the
    request's status code or 500 if none is set. The active-request count is
    closed and the request log emitted. The span is then closed and both
    context bindings are reset, whether or not the earlier steps failed.
    """
    context = self.current_context() if self.enabled else None
    if context is None:
        return
    try:
        if exc is not None:
            self.record_error(
                exc,
                handled=False,
                status_code=context.status_code or 500,
            )
        self._close_active_request(context)
        self.emit_request_log(context)
    except BaseException as teardown_error:
        self.log_observability_failure("request.teardown", teardown_error)
    finally:
        self._close_request_span(context, exc)
        self._reset_request_operation_context(context)
        self._reset_request_context(context)
453
+
454
def set_attribute(self, key: str, value: Any) -> None:
    """Attach ``key``/``value`` to the current request or operation.

    With a current request, the attribute is set on the request and on its
    operation context if present. Otherwise it is set on the current
    operation, if any. In both cases the current span also receives the value
    under ``policyengine.<key>``. Exceptions are reported as
    ``"request.set_attribute"``.
    """
    if not self.enabled:
        return
    try:
        context = self.current_context()
        if context is None:
            target = self.current_operation()
            if target is not None:
                target.set_attribute(key, value)
                self._set_current_span_attributes(
                    target.span_attributes(**{f"policyengine.{key}": value})
                )
        else:
            context.set_attribute(key, value)
            if context.operation_context is not None:
                context.operation_context.set_attribute(key, value)
            self._set_current_span_attributes(
                context.span_attributes(**{f"policyengine.{key}": value})
            )
    except BaseException as attribute_error:
        self.log_observability_failure(
            "request.set_attribute",
            attribute_error,
            attribute=key,
        )
479
+
480
def segment(self, name: Any, **attrs: Any) -> _SegmentManager:
    """Return a ``_SegmentManager`` for the segment ``name``.

    Extra keyword arguments are handed to the manager as one attribute
    mapping. The return annotation names the object actually returned; it
    previously claimed ``Iterator[Any]``.
    """
    return _SegmentManager(self, name, attrs)
482
+
483
@contextmanager
def _segment_context(self, name: Any, **attrs: Any) -> Iterator[Any]:
    """Time a segment inside a span, yielding the span to the caller.

    Yields ``None`` without any bookkeeping while the runtime is disabled.
    Otherwise the segment is recorded whether or not the body raised, and the
    implicit operation is then ended with the exception that escaped the
    body, or ``None``. Exceptions from the body are re-raised.
    """
    if not self.enabled:
        yield None
        return
    segment_name = self._coerce_segment(name)
    implicit_operation = self._start_implicit_operation(segment_name, attrs)
    start = self._safe_perf_counter(f"segment.{segment_name}.start")
    span_attrs = self._segment_span_attributes(attrs)
    span_name = self._span_name(segment_name)
    failure: BaseException | None = None
    with self._safe_span(span_name, span_attrs) as span:
        try:
            try:
                yield span
            except BaseException as body_error:
                failure = body_error
                raise
            finally:
                # Runs on both the success and the failure path.
                self._record_segment_safely(segment_name, start, attrs)
        finally:
            self.end_operation(implicit_operation, failure)
508
+
509
@asynccontextmanager
async def asegment(self, name: Any, **attrs: Any) -> AsyncIterator[Any]:
    """Async counterpart of the segment context: time a segment inside a span.

    Yields ``None`` without any bookkeeping while the runtime is disabled.
    Otherwise the segment is recorded whether or not the body raised, and the
    implicit operation is then ended with the exception that escaped the
    body, or ``None``. Exceptions from the body are re-raised.
    """
    if not self.enabled:
        yield None
        return
    segment_name = self._coerce_segment(name)
    implicit_operation = self._start_implicit_operation(segment_name, attrs)
    start = self._safe_perf_counter(f"segment.{segment_name}.start")
    span_attrs = self._segment_span_attributes(attrs)
    span_name = self._span_name(segment_name)
    failure: BaseException | None = None
    with self._safe_span(span_name, span_attrs) as span:
        try:
            try:
                yield span
            except BaseException as body_error:
                failure = body_error
                raise
            finally:
                # Runs on both the success and the failure path.
                self._record_segment_safely(segment_name, start, attrs)
        finally:
            self.end_operation(implicit_operation, failure)
534
+
535
@contextmanager
def collect_timings(self, name: str = "operation", **attrs: Any):
    """Open a scope around the ``with`` body and yield a fresh timings dict.

    The scope is started with ``start_scope`` and always closed with
    ``end_scope``, which receives the exception that escaped the body, or
    ``None``. Exceptions from the body are re-raised.
    """
    collected: dict[str, float] = {}
    scope = self.start_scope(collected, name=name, **attrs)
    try:
        yield collected
    except BaseException as body_error:
        self.end_scope(scope, body_error)
        raise
    else:
        self.end_scope(scope, None)
547
+
548
def start_scope(
    self,
    timings: dict[str, float],
    *,
    name: str = "operation",
    parent_context: Any = None,
    **attrs: Any,
) -> dict[str, Any]:
    """Open a timing scope and return the handle expected by ``end_scope``.

    With no current operation, a full operation is started and its handle is
    wrapped under ``"operation_handle"``. Otherwise ``timings`` and a start
    timestamp are bound to the context variables and, when a tracer exists, a
    span is started. Each step is guarded separately, so a failing step is
    logged without blocking the others.
    """
    if self.current_operation() is None:
        operation_handle = self.start_operation(
            name,
            parent_context=parent_context,
            timings=timings,
            **attrs,
        )
        return {"operation_handle": operation_handle}

    handle: dict[str, Any] = dict.fromkeys(
        (
            "operation_handle",
            "timings_token",
            "start_token",
            "context_token",
            "span",
        )
    )
    try:
        handle["timings_token"] = _TIMINGS.set(timings)
    except BaseException as step_error:
        self.log_observability_failure("scope.timings_set", step_error)
    try:
        handle["start_token"] = _TURN_START.set(time.perf_counter())
    except BaseException as step_error:
        self.log_observability_failure("scope.start_set", step_error)
    if parent_context is not None and self.tracer is not None:
        try:
            from opentelemetry import context as otel_context

            handle["context_token"] = otel_context.attach(parent_context)
        except BaseException as step_error:
            self.log_observability_failure("scope.context_attach", step_error)
    try:
        if self.tracer is not None:
            handle["span"] = self._start_span(name, attrs)
    except BaseException as step_error:
        self.log_observability_failure(
            "scope.span_start", step_error, span=name
        )
        handle["span"] = None
    return handle
594
+
595
def annotate(
    self,
    handle: dict[str, Any] | None = None,
    **attrs: Any,
) -> None:
    """Copy ``attrs`` onto the scope span, the request and the operation.

    Values that are ``None`` are skipped for the scope span only. After the
    current operation is updated, its span attributes are pushed to the
    current span. Exceptions are reported as ``"scope.annotate"``.
    """
    try:
        span_handle = handle.get("span") if handle else None
        if span_handle is not None:
            _cm, scope_span = span_handle
            for attr_name, attr_value in attrs.items():
                if attr_value is not None:
                    scope_span.set_attribute(attr_name, attr_value)

        context = self.current_context()
        if context is not None:
            for attr_name, attr_value in attrs.items():
                context.set_attribute(attr_name, attr_value)

        operation = self.current_operation()
        if operation is not None:
            for attr_name, attr_value in attrs.items():
                operation.set_attribute(attr_name, attr_value)
            self._set_current_span_attributes(operation.span_attributes())
    except BaseException as annotate_error:
        self.log_observability_failure("scope.annotate", annotate_error)
619
+
620
def end_scope(
    self,
    handle: dict[str, Any] | None,
    error: BaseException | None = None,
) -> None:
    """Close a scope opened by ``start_scope``.

    A falsy handle is ignored. A handle wrapping an operation handle is
    delegated to ``end_operation``. Otherwise the scope span is ended, any
    attached OpenTelemetry context is detached and the timing context
    variables are reset; each step logs its own failure.
    """
    if not handle:
        return
    wrapped_operation = handle.get("operation_handle")
    if wrapped_operation is not None:
        self.end_operation(wrapped_operation, error)
        return

    try:
        self._end_span(handle.get("span"), error)
    except BaseException as step_error:
        self.log_observability_failure("scope.span_end", step_error)

    context_token = handle.get("context_token")
    if context_token is not None:
        try:
            from opentelemetry import context as otel_context

            otel_context.detach(context_token)
        except BaseException as step_error:
            self.log_observability_failure("scope.context_detach", step_error)

    resets = ((_TIMINGS, "timings_token"), (_TURN_START, "start_token"))
    for context_var, token_key in resets:
        token = handle.get(token_key)
        if token is None:
            continue
        try:
            context_var.reset(token)
        except BaseException as step_error:
            self.log_observability_failure(
                "scope.context_reset",
                step_error,
                token=token_key,
            )
657
+
658
def mark(self, key: str, ms: float) -> None:
    """Store ``ms`` rounded to one decimal under ``key`` in the bound timings.

    Nothing is stored when no timings dict is bound. Exceptions are reported
    as ``"scope.mark"`` together with the key.
    """
    try:
        bound_timings = _TIMINGS.get()
        if bound_timings is None:
            return
        bound_timings[key] = round(float(ms), 1)
    except BaseException as mark_error:
        self.log_observability_failure("scope.mark", mark_error, key=key)
665
+
666
def mark_ttft(self, key: str = "ttft_ms") -> None:
    """Mark the milliseconds elapsed since the bound start timestamp.

    Nothing is marked when no start timestamp is bound. Exceptions are
    reported as ``"scope.mark_ttft"``.
    """
    try:
        started_at = _TURN_START.get()
        if started_at is not None:
            elapsed_ms = (time.perf_counter() - started_at) * 1000.0
            self.mark(key, elapsed_ms)
    except BaseException as mark_error:
        self.log_observability_failure("scope.mark_ttft", mark_error)
673
+
674
def record_error(
    self,
    exc: BaseException,
    *,
    handled: bool,
    status_code: int | None = None,
    include_stack: bool = True,
) -> None:
    """Attach ``exc`` to the current request or operation and count it.

    The current request takes precedence over the current operation; with
    neither, nothing is recorded. ``status_code`` is written to the request
    only when given. The exception is also recorded on the current span when
    one exists. Exceptions raised here are reported as
    ``"request.record_error"``.
    """
    if not self.enabled:
        return
    try:
        context = self.current_context()
        operation = self.current_operation()
        error_type = type(exc).__name__
        message = self._safe_str(exc)
        stack = self._safe_traceback(exc) if include_stack else None
        error_record = ErrorRecord(
            type=error_type,
            message=message,
            handled=handled,
            stack=stack,
        )
        if context is not None:
            if status_code is not None:
                context.status_code = status_code
            context.error = error_record
            metric_attrs = context.metric_attributes(error_type=error_type)
        elif operation is not None:
            operation.error = error_record
            metric_attrs = operation.metric_attributes(error_type=error_type)
        else:
            return
        self.record_error_metric(metric_attrs)

        span = self._current_span()
        if span is not None:
            self._record_exception_on_span(
                span,
                exc,
                handled=handled,
                status_code=status_code,
            )
    except BaseException as observability_exc:
        self.log_observability_failure(
            "request.record_error",
            observability_exc,
            original_error_type=type(exc).__name__,
        )
721
+
722
+ def record_event(self, event: str, **fields: Any) -> None:
723
+ if not self.enabled:
724
+ return
725
+ try:
726
+ context = self.current_context()
727
+ operation = self.current_operation()
728
+ base: dict[str, Any] = {
729
+ "schema_version": "policyengine.observability.event.v1",
730
+ "event": event,
731
+ "created_at": datetime.now(UTC).isoformat(),
732
+ }
733
+ if context is not None:
734
+ trace_id, span_id = self._trace_ids()
735
+ base.update(
736
+ {
737
+ "service_name": context.config.service_name,
738
+ "service_role": context.config.service_role,
739
+ "environment": context.config.environment,
740
+ "request_id": context.request_id,
741
+ "trace_id": trace_id,
742
+ "span_id": span_id,
743
+ "route": context.route,
744
+ "path": context.path,
745
+ }
746
+ )
747
+ elif operation is not None:
748
+ trace_id, span_id = self._trace_ids()
749
+ base.update(
750
+ {
751
+ "service_name": operation.config.service_name,
752
+ "service_role": operation.config.service_role,
753
+ "environment": operation.config.environment,
754
+ "operation": operation.name,
755
+ "flavor": operation.flavor,
756
+ "trace_id": trace_id,
757
+ "span_id": span_id,
758
+ }
759
+ )
760
+ clean_fields = {
761
+ key: value
762
+ for key, value in fields.items()
763
+ if value is not None
764
+ }
765
+ base.update(clean_fields)
766
+ EVENT_LOGGER.info(self._json(base))
767
+ self._add_span_event(event, clean_fields)
768
+ if event.startswith("modal_") or "fallback" in event:
769
+ attrs = (
770
+ context.metric_attributes(event=event)
771
+ if context
772
+ else operation.metric_attributes(event=event)
773
+ if operation
774
+ else _metric_attrs(
775
+ {"event": event},
776
+ self.config.metric_attribute_keys,
777
+ )
778
+ )
779
+ self.record_failover_event_metric(attrs)
780
+ except BaseException as exc:
781
+ self.log_observability_failure(
782
+ "request.record_event",
783
+ exc,
784
+ event_name=event,
785
+ )
786
+
787
def traceparent_header(self) -> str | None:
    """Return the traceparent value injected by the configured propagator.

    Returns ``None`` when the runtime is disabled, no propagator is
    configured, the propagator injects no traceparent entry, or injection
    raises (reported as ``"request.traceparent_header"``).
    """
    if self.enabled and self.propagate is not None:
        try:
            carrier: dict[str, str] = {}
            self.propagate.inject(carrier)
            return carrier.get(TRACEPARENT_HEADER)
        except BaseException as inject_error:
            self.log_observability_failure(
                "request.traceparent_header", inject_error
            )
    return None
797
+
798
def capture_context(self):
    """Return the current OpenTelemetry context, or ``None``.

    ``None`` is returned when no tracer is configured or when fetching the
    context raises (reported as ``"otel.capture_context"``).
    """
    if self.tracer is not None:
        try:
            from opentelemetry import context as otel_context

            return otel_context.get_current()
        except BaseException as capture_error:
            self.log_observability_failure(
                "otel.capture_context", capture_error
            )
    return None
808
+
809
def emit_request_log(self, context: RequestObservabilityContext) -> None:
    """Write the request's log record once.

    ``context.emitted`` guards against repeats and is set even when the
    record is then suppressed, which happens for internal-dispatch requests
    or when ``request_logs_enabled`` is off. Exceptions are reported as
    ``"request.emit_request_log"`` with the request id when available.
    """
    if not self.enabled:
        return
    try:
        if context.emitted:
            return
        context.emitted = True
        suppressed = (
            context.internal_dispatch
            or not context.config.request_logs_enabled
        )
        if suppressed:
            return
        trace_id, span_id = self._trace_ids()
        record = context.as_log_record(trace_id=trace_id, span_id=span_id)
        REQUEST_LOGGER.info(self._json(record))
    except BaseException as emit_error:
        self.log_observability_failure(
            "request.emit_request_log",
            emit_error,
            request_id=getattr(context, "request_id", None),
        )
836
+
837
def emit_operation_log(
    self,
    operation: OperationObservabilityContext,
) -> None:
    """Write the operation's log record once.

    ``operation.emitted`` guards against repeats. Does nothing while the
    runtime is disabled. Exceptions are reported as ``"operation.emit_log"``
    with the operation's name under the ``operation_name`` field, and are
    never raised to the caller.
    """
    if not self.enabled:
        return
    try:
        if operation.emitted:
            return
        operation.emitted = True
        trace_id, span_id = self._trace_ids()
        OPERATION_LOGGER.info(
            self._json(
                operation.as_log_record(
                    trace_id=trace_id,
                    span_id=span_id,
                )
            )
        )
    except BaseException as exc:
        # log_observability_failure's first positional parameter is itself
        # named ``operation``; passing ``operation=`` as a keyword raised
        # TypeError from inside this handler, so the name travels under a
        # different key.
        self.log_observability_failure(
            "operation.emit_log",
            exc,
            operation_name=getattr(operation, "name", None),
        )
862
+
863
def record_operation_metric(
    self,
    duration_seconds: float,
    attributes: dict[str, str],
) -> None:
    """Record an operation's duration and add one to the operation count.

    Exceptions are reported as ``"metrics.record_operation"``.
    """
    try:
        self.operation_duration.record(duration_seconds, attributes)
        self.operations.add(1, attributes)
    except BaseException as metric_error:
        self.log_observability_failure(
            "metrics.record_operation", metric_error
        )
873
+
874
def record_request_metric(
    self,
    duration_seconds: float,
    attributes: dict[str, str],
) -> None:
    """Record a request's duration and add one to the request count.

    Exceptions are reported as ``"metrics.record_request"``.
    """
    try:
        self.http_duration.record(duration_seconds, attributes)
        self.requests.add(1, attributes)
    except BaseException as metric_error:
        self.log_observability_failure("metrics.record_request", metric_error)
884
+
885
def record_segment_metric(
    self,
    segment: str,
    duration_seconds: float,
    attributes: dict[str, str],
    *,
    backend_segment: bool = False,
) -> None:
    """Record a segment's duration on the matching instruments.

    The segment instrument always receives the attributes plus a ``segment``
    entry. The segment named ``"calculation"`` is also recorded on the
    calculate instrument with the plain attributes, and ``backend_segment``
    adds a recording on the backend instrument. Exceptions are reported as
    ``"metrics.record_segment"`` with the segment name.
    """
    try:
        tagged_attributes = {**attributes, "segment": segment}
        self.segment_duration.record(duration_seconds, tagged_attributes)
        if segment == "calculation":
            self.calculate_duration.record(duration_seconds, attributes)
        if backend_segment:
            self.backend_duration.record(duration_seconds, tagged_attributes)
    except BaseException as metric_error:
        self.log_observability_failure(
            "metrics.record_segment",
            metric_error,
            segment=segment,
        )
909
+
910
def record_error_metric(self, attributes: dict[str, str]) -> None:
    """Add one to the error count under ``attributes``.

    Exceptions are reported as ``"metrics.record_error"``.
    """
    try:
        self.errors.add(1, attributes)
    except BaseException as metric_error:
        self.log_observability_failure("metrics.record_error", metric_error)
915
+
916
def record_rate_limited_metric(self, attributes: dict[str, str]) -> None:
    """Add one to the rate-limited count under ``attributes``.

    Exceptions are reported as ``"metrics.record_rate_limited"``.
    """
    try:
        self.rate_limited.add(1, attributes)
    except BaseException as metric_error:
        self.log_observability_failure(
            "metrics.record_rate_limited", metric_error
        )
921
+
922
def record_failover_event_metric(self, attributes: dict[str, str]) -> None:
    """Add one to the failover-event count under ``attributes``.

    Exceptions are reported as ``"metrics.record_failover_event"``.
    """
    try:
        self.failover_events.add(1, attributes)
    except BaseException as metric_error:
        self.log_observability_failure(
            "metrics.record_failover_event",
            metric_error,
        )
930
+
931
def record_active_request(
    self,
    delta: int,
    attributes: dict[str, str],
) -> None:
    """Add ``delta`` to the active-request instrument under ``attributes``.

    Exceptions are reported as ``"metrics.add_active_request"``.
    """
    try:
        self.active_requests.add(delta, attributes)
    except BaseException as metric_error:
        self.log_observability_failure(
            "metrics.add_active_request", metric_error
        )
940
+
941
def instrument_fastapi(self, app: Any) -> None:
    """Apply OpenTelemetry's FastAPI instrumentation to ``app``.

    Skipped unless both the runtime and ``config.otel_enabled`` are truthy.
    Import or instrumentation failures are reported as
    ``"fastapi.auto_instrument"``.
    """
    if self.enabled and self.config.otel_enabled:
        try:
            from opentelemetry.instrumentation.fastapi import (
                FastAPIInstrumentor,
            )

            FastAPIInstrumentor.instrument_app(app)
        except BaseException as instrument_error:
            self.log_observability_failure(
                "fastapi.auto_instrument",
                instrument_error,
            )
955
+
956
def instrument_httpx(self) -> None:
    """Apply OpenTelemetry's httpx client instrumentation once.

    Skipped when the runtime or ``config.otel_enabled`` is off, or when a
    previous call already succeeded. Import or instrumentation failures are
    reported as ``"httpx.auto_instrument"`` and leave the flag unset.
    """
    should_instrument = (
        self.enabled
        and self.config.otel_enabled
        and not self._httpx_instrumented
    )
    if not should_instrument:
        return
    try:
        from opentelemetry.instrumentation.httpx import (
            HTTPXClientInstrumentor,
        )

        HTTPXClientInstrumentor().instrument()
        self._httpx_instrumented = True
    except BaseException as instrument_error:
        self.log_observability_failure(
            "httpx.auto_instrument", instrument_error
        )
972
+
973
def shutdown(self) -> None:
    """Shut down the tracer and meter providers within a time limit.

    Providers that are ``None`` are skipped; with none left this is a no-op.
    The shutdown calls run one after another on a daemon thread, which is
    joined for ``config.shutdown_timeout_seconds``. A provider that raises is
    reported as ``"otel.<name>_shutdown"``, and a thread still running after
    the join is reported as ``"otel.shutdown_timeout"``.
    """
    candidates = (
        ("trace", self.tracer_provider),
        ("metrics", self.meter_provider),
    )
    providers = [
        (label, provider)
        for label, provider in candidates
        if provider is not None
    ]
    if not providers:
        return

    def flush() -> None:
        for label, provider in providers:
            try:
                provider.shutdown()
            except BaseException as shutdown_error:
                self.log_observability_failure(
                    f"otel.{label}_shutdown",
                    shutdown_error,
                )

    worker = threading.Thread(
        target=flush,
        name="policyengine-otel-shutdown",
        daemon=True,
    )
    worker.start()
    worker.join(timeout=self.config.shutdown_timeout_seconds)
    if worker.is_alive():
        self.log_observability_failure(
            "otel.shutdown_timeout",
            TimeoutError("OpenTelemetry shutdown timed out."),
            timeout_seconds=self.config.shutdown_timeout_seconds,
        )
1009
+
1010
def shutdown_tracing(self) -> None:
    """Alias that simply calls ``shutdown``; its result is discarded."""
    self.shutdown()
1012
+
1013
def log_observability_failure(
    self,
    operation: str,
    exc: BaseException,
    **fields: Any,
) -> None:
    """Report a failure that happened inside the observability code itself.

    Builds an ``observability_internal_error`` payload holding the failing
    operation label, the exception's type, message and stack, and every
    non-``None`` extra field. The payload is logged as JSON at error level;
    if that raises, it is handed to ``_write_stderr`` instead.
    """
    payload = {
        "schema_version": "policyengine.observability.internal_error.v1",
        "event": "observability_internal_error",
        "created_at": datetime.now(UTC).isoformat(),
        "operation": operation,
        "error": {
            "type": type(exc).__name__,
            "message": self._safe_str(exc),
            "stack": self._safe_traceback(exc),
        },
    }
    for field_name, field_value in fields.items():
        if field_value is not None:
            payload[field_name] = field_value
    try:
        INTERNAL_LOGGER.error(self._json(payload))
    except BaseException:
        self._write_stderr(payload)
1037
+
1038
def _configure_loggers(self) -> None:
    """Run ``configure_plain_logger`` on each of the four module loggers.

    Every logger is configured with ``config.log_level``.
    """
    module_loggers = (
        REQUEST_LOGGER,
        OPERATION_LOGGER,
        EVENT_LOGGER,
        INTERNAL_LOGGER,
    )
    for target_logger in module_loggers:
        configure_plain_logger(target_logger, self.config.log_level)
1046
+
1047
def _configure_otel(self) -> None:
    """Create the OpenTelemetry providers and cache the API handles on self.

    The OpenTelemetry imports are attempted first; if any fail, the failure
    is logged as ``otel.configure_imports`` and nothing else happens.
    Otherwise a tracer provider and a meter provider are built from one
    shared resource, registered globally, and the tracer, meter and
    instruments are created. Exporters are attached only when
    ``config.otlp_endpoint`` is truthy. Failures are logged, never raised.
    """
    try:
        from opentelemetry import metrics, propagate, trace
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.resources import (
            DEPLOYMENT_ENVIRONMENT,
            SERVICE_NAME,
            Resource,
        )
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.trace import SpanKind, Status, StatusCode
    except BaseException as import_failure:
        self.log_observability_failure(
            "otel.configure_imports",
            import_failure,
        )
        return

    try:
        service_resource = Resource.create(
            {
                SERVICE_NAME: self.config.service_name,
                DEPLOYMENT_ENVIRONMENT: self.config.environment,
                "service.role": self.config.service_role,
            }
        )
        provider = TracerProvider(resource=service_resource)
        readers = []
        if self.config.otlp_endpoint:
            self._add_trace_exporter(provider)
            reader = self._metric_reader()
            if reader is not None:
                readers.append(reader)

        # The provider is stored on self before the global registration is
        # attempted, so it is kept even if registration fails.
        self.tracer_provider = provider
        try:
            trace.set_tracer_provider(provider)
        except BaseException as failure:
            self.log_observability_failure(
                "otel.set_tracer_provider",
                failure,
            )

        try:
            self.meter_provider = MeterProvider(
                resource=service_resource,
                metric_readers=readers,
            )
            metrics.set_meter_provider(self.meter_provider)
        except BaseException as failure:
            self.log_observability_failure(
                "otel.set_meter_provider",
                failure,
            )

        # Cache the API objects that other methods read from self.
        self.trace = trace
        self.propagate = propagate
        self.SpanKind = SpanKind
        self.Status = Status
        self.StatusCode = StatusCode

        # Tracer and meter names fall back to the service name.
        tracer_name = self.config.tracer_name or self.config.service_name
        meter_name = self.config.meter_name or self.config.service_name
        self.tracer = trace.get_tracer(tracer_name)
        self.meter = metrics.get_meter(meter_name)
        self._configure_instruments()
    except BaseException as failure:
        self.log_observability_failure("otel.configure", failure)
1108
+
1109
def _add_trace_exporter(self, tracer_provider) -> None:
    """Attach a batching OTLP span exporter to ``tracer_provider``.

    The HTTP exporter is used when ``config.otlp_protocol`` starts with
    ``"http"``; otherwise the gRPC exporter is used. The exporter is built
    with no arguments. Any failure, including a failed import, is logged as
    ``otel.trace_exporter`` and the provider is left without a processor.
    """
    try:
        use_http = self.config.otlp_protocol.startswith("http")
        if use_http:
            from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
                OTLPSpanExporter,
            )
        else:
            from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
                OTLPSpanExporter,
            )
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        processor = BatchSpanProcessor(OTLPSpanExporter())
        tracer_provider.add_span_processor(processor)
    except BaseException as failure:
        self.log_observability_failure("otel.trace_exporter", failure)
1126
+
1127
def _metric_reader(self):
    """Build a periodic metric reader backed by an OTLP metric exporter.

    The HTTP exporter is used when ``config.otlp_protocol`` starts with
    ``"http"``; otherwise the gRPC exporter is used.

    Returns:
        The reader, or ``None`` when anything fails (the failure is logged
        as ``otel.metric_exporter``).
    """
    try:
        use_http = self.config.otlp_protocol.startswith("http")
        if use_http:
            from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
                OTLPMetricExporter,
            )
        else:
            from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
                OTLPMetricExporter,
            )
        from opentelemetry.sdk.metrics.export import (
            PeriodicExportingMetricReader,
        )

        exporter = OTLPMetricExporter()
        return PeriodicExportingMetricReader(exporter)
    except BaseException as failure:
        self.log_observability_failure("otel.metric_exporter", failure)
        return None
1145
+
1146
+ def _configure_instruments(self) -> None:
1147
+ self.operation_duration = self._instrument(
1148
+ getattr(self.meter, "create_histogram", None),
1149
+ "policyengine.operation.duration",
1150
+ unit="s",
1151
+ description="PolicyEngine operation duration.",
1152
+ )
1153
+ self.http_duration = self._instrument(
1154
+ getattr(self.meter, "create_histogram", None),
1155
+ "http.server.request.duration",
1156
+ unit="s",
1157
+ description="HTTP server request duration.",
1158
+ )
1159
+ self.segment_duration = self._instrument(
1160
+ getattr(self.meter, "create_histogram", None),
1161
+ "policyengine.segment.duration",
1162
+ unit="s",
1163
+ description="PolicyEngine operation segment duration.",
1164
+ )
1165
+ self.calculate_duration = self._instrument(
1166
+ getattr(self.meter, "create_histogram", None),
1167
+ "policyengine.calculate.duration",
1168
+ unit="s",
1169
+ description="PolicyEngine calculate operation duration.",
1170
+ )
1171
+ self.backend_duration = self._instrument(
1172
+ getattr(self.meter, "create_histogram", None),
1173
+ "policyengine.backend.duration",
1174
+ unit="s",
1175
+ description="PolicyEngine backend call duration.",
1176
+ )
1177
+ self.operations = self._instrument(
1178
+ getattr(self.meter, "create_counter", None),
1179
+ "policyengine.operations",
1180
+ description="PolicyEngine operation count.",
1181
+ )
1182
+ self.requests = self._instrument(
1183
+ getattr(self.meter, "create_counter", None),
1184
+ "policyengine.requests",
1185
+ description="PolicyEngine request count.",
1186
+ )
1187
+ self.errors = self._instrument(
1188
+ getattr(self.meter, "create_counter", None),
1189
+ "policyengine.errors",
1190
+ description="PolicyEngine error count.",
1191
+ )
1192
+ self.rate_limited = self._instrument(
1193
+ getattr(self.meter, "create_counter", None),
1194
+ "policyengine.rate_limited_requests",
1195
+ description="PolicyEngine rate-limited request count.",
1196
+ )
1197
+ self.failover_events = self._instrument(
1198
+ getattr(self.meter, "create_counter", None),
1199
+ "policyengine.failover.events",
1200
+ description="PolicyEngine failover event count.",
1201
+ )
1202
+ self.active_requests = self._instrument(
1203
+ getattr(self.meter, "create_up_down_counter", None),
1204
+ "http.server.active_requests",
1205
+ description="Active HTTP server requests.",
1206
+ )
1207
+
1208
+ def _instrument(self, factory, *args, **kwargs):
1209
+ if factory is None:
1210
+ return _NoOpInstrument()
1211
+ try:
1212
+ return factory(*args, **kwargs)
1213
+ except BaseException as exc:
1214
+ self.log_observability_failure(
1215
+ "metrics.create_instrument",
1216
+ exc,
1217
+ instrument=args[0] if args else None,
1218
+ )
1219
+ return _NoOpInstrument()
1220
+
1221
+ def _start_request_span(
1222
+ self,
1223
+ context: RequestObservabilityContext,
1224
+ *,
1225
+ carrier: Any = None,
1226
+ ) -> None:
1227
+ if self.tracer is None:
1228
+ return
1229
+ attrs = context.span_attributes()
1230
+ parent_context = self._extract_context(carrier)
1231
+ try:
1232
+ context.server_span_cm = self.tracer.start_as_current_span(
1233
+ context.route,
1234
+ context=parent_context,
1235
+ kind=self.SpanKind.SERVER if self.SpanKind else None,
1236
+ attributes=attrs,
1237
+ )
1238
+ context.server_span = context.server_span_cm.__enter__()
1239
+ except BaseException as exc:
1240
+ context.server_span_cm = None
1241
+ context.server_span = None
1242
+ self.log_observability_failure("otel.request_span_enter", exc)
1243
+
1244
+ def _close_request_span(
1245
+ self,
1246
+ context: RequestObservabilityContext,
1247
+ exc: BaseException | None,
1248
+ ) -> None:
1249
+ if context.span_closed:
1250
+ return
1251
+ context.span_closed = True
1252
+ span_cm = context.server_span_cm
1253
+ if span_cm is None:
1254
+ return
1255
+ try:
1256
+ if exc is None:
1257
+ span_cm.__exit__(None, None, None)
1258
+ else:
1259
+ span_cm.__exit__(type(exc), exc, exc.__traceback__)
1260
+ except BaseException as observability_exc:
1261
+ self.log_observability_failure(
1262
+ "otel.request_span_exit",
1263
+ observability_exc,
1264
+ request_id=context.request_id,
1265
+ )
1266
+
1267
+ @contextmanager
1268
+ def _safe_span(self, name: str, attrs: dict[str, Any]) -> Iterator[Any]:
1269
+ if self.tracer is None:
1270
+ yield None
1271
+ return
1272
+ span_handle = self._start_span(name, attrs)
1273
+ if span_handle is None:
1274
+ yield None
1275
+ return
1276
+ _cm, span = span_handle
1277
+ try:
1278
+ yield span
1279
+ except BaseException as exc:
1280
+ try:
1281
+ self._end_span(span_handle, exc)
1282
+ except BaseException as observability_exc:
1283
+ self.log_observability_failure(
1284
+ "otel.span_exit",
1285
+ observability_exc,
1286
+ span=name,
1287
+ )
1288
+ raise
1289
+ else:
1290
+ try:
1291
+ self._end_span(span_handle)
1292
+ except BaseException as exc:
1293
+ self.log_observability_failure(
1294
+ "otel.span_exit",
1295
+ exc,
1296
+ span=name,
1297
+ )
1298
+
1299
+ def _start_span(self, name: str, attrs: dict[str, Any]):
1300
+ try:
1301
+ span_cm = self.tracer.start_as_current_span(name)
1302
+ span = span_cm.__enter__()
1303
+ except BaseException as exc:
1304
+ self.log_observability_failure("otel.span_enter", exc, span=name)
1305
+ return None
1306
+ try:
1307
+ for key, value in attrs.items():
1308
+ if value is not None:
1309
+ span.set_attribute(key, value)
1310
+ except BaseException as exc:
1311
+ self.log_observability_failure(
1312
+ "otel.span_attributes",
1313
+ exc,
1314
+ span=name,
1315
+ )
1316
+ return span_cm, span
1317
+
1318
+ def _end_span(
1319
+ self,
1320
+ span_handle,
1321
+ error: BaseException | None = None,
1322
+ ) -> None:
1323
+ if span_handle is None:
1324
+ return
1325
+ span_cm, span = span_handle
1326
+ try:
1327
+ if error is not None:
1328
+ self._record_exception_on_span(
1329
+ span,
1330
+ error,
1331
+ handled=False,
1332
+ status_code=500,
1333
+ )
1334
+ except BaseException as exc:
1335
+ self.log_observability_failure("otel.span_error_status", exc)
1336
+ try:
1337
+ span_cm.__exit__(None, None, None)
1338
+ except BaseException as exc:
1339
+ self.log_observability_failure("otel.span_exit", exc)
1340
+
1341
+ def _record_segment_safely(
1342
+ self,
1343
+ name: str,
1344
+ start: float | None,
1345
+ attrs: dict[str, Any],
1346
+ ) -> None:
1347
+ if start is None:
1348
+ return
1349
+ end = self._safe_perf_counter(f"segment.{name}.end")
1350
+ if end is None:
1351
+ return
1352
+ try:
1353
+ duration = end - start
1354
+ self._record_timing(name, duration)
1355
+ context = self.current_context()
1356
+ operation = self.current_operation()
1357
+ metric_extra = {
1358
+ key: value
1359
+ for key, value in attrs.items()
1360
+ if (
1361
+ key in self.config.metric_attribute_keys
1362
+ and value is not None
1363
+ )
1364
+ }
1365
+ if context is not None:
1366
+ context.timings_ms[name] = round(duration * 1000, 3)
1367
+ if operation is not None:
1368
+ operation.timings_ms[name] = round(duration * 1000, 3)
1369
+ metric_attributes = operation.metric_attributes(
1370
+ segment=name,
1371
+ **metric_extra,
1372
+ )
1373
+ elif context is not None:
1374
+ metric_attributes = context.metric_attributes(
1375
+ segment=name,
1376
+ **metric_extra,
1377
+ )
1378
+ else:
1379
+ metric_attributes = _metric_attrs(
1380
+ {
1381
+ "service.name": self.config.service_name,
1382
+ "service.role": self.config.service_role,
1383
+ "deployment.environment": self.config.environment,
1384
+ "segment": name,
1385
+ **metric_extra,
1386
+ },
1387
+ self.config.metric_attribute_keys,
1388
+ )
1389
+ self.record_segment_metric(
1390
+ name,
1391
+ duration,
1392
+ metric_attributes,
1393
+ backend_segment="backend" in metric_extra,
1394
+ )
1395
+ except BaseException as exc:
1396
+ self.log_observability_failure(
1397
+ "request.record_segment",
1398
+ exc,
1399
+ segment=name,
1400
+ )
1401
+
1402
def _record_timing(self, name: str, duration_seconds: float) -> None:
    """Add a duration to the ``<name>_ms`` entry of the active timings map.

    The map is read from ``_TIMINGS``; when it is ``None`` nothing is
    recorded. The duration is converted to milliseconds, added to the
    existing entry (0.0 when absent), and the sum is rounded to one decimal
    place. Failures are logged as ``scope.record_timing``.
    """
    try:
        bucket = _TIMINGS.get()
        if bucket is not None:
            slot = f"{name}_ms"
            previous_ms = bucket.get(slot, 0.0)
            bucket[slot] = round(previous_ms + duration_seconds * 1000.0, 1)
    except BaseException as failure:
        self.log_observability_failure(
            "scope.record_timing",
            failure,
            segment=name,
        )
1416
+
1417
+ def _segment_span_attributes(
1418
+ self,
1419
+ attrs: dict[str, Any],
1420
+ ) -> dict[str, Any]:
1421
+ context = self.current_context()
1422
+ operation = self.current_operation()
1423
+ span_attrs = {
1424
+ key: value for key, value in attrs.items() if value is not None
1425
+ }
1426
+ if context is not None:
1427
+ span_attrs = {**context.span_attributes(), **span_attrs}
1428
+ elif operation is not None:
1429
+ span_attrs = {**operation.span_attributes(), **span_attrs}
1430
+ return span_attrs
1431
+
1432
+ def _span_name(self, segment_name: str) -> str:
1433
+ if not self.config.span_prefix:
1434
+ return segment_name
1435
+ return f"{self.config.span_prefix}.{segment_name}"
1436
+
1437
+ def _start_implicit_operation(
1438
+ self,
1439
+ segment_name: str,
1440
+ attrs: dict[str, Any],
1441
+ ) -> dict[str, Any] | None:
1442
+ if (
1443
+ self.current_operation() is not None
1444
+ or self.current_context() is not None
1445
+ ):
1446
+ return None
1447
+ operation_name = attrs.get("operation") or segment_name
1448
+ flavor = attrs.get("flavor")
1449
+ operation_attrs = {
1450
+ key: value
1451
+ for key, value in attrs.items()
1452
+ if key not in {"operation", "flavor"} and value is not None
1453
+ }
1454
+ return self.start_operation(
1455
+ self._safe_str(operation_name),
1456
+ flavor=self._safe_str(flavor) if flavor is not None else None,
1457
+ **operation_attrs,
1458
+ )
1459
+
1460
def _coerce_segment(self, name: Any) -> str:
    """Convert ``name`` to a segment string via ``coerce_segment_name``.

    When the helper reports the segment as unregistered, that is logged as
    a ``segment.coerce`` failure carrying the segment and the type name of
    the original value. The coerced segment is returned either way.
    """
    coerced, registered = coerce_segment_name(
        name,
        registry=self.segment_registry,
    )
    if registered:
        return coerced
    self.log_observability_failure(
        "segment.coerce",
        ValueError("Unregistered observability segment."),
        segment=coerced,
        segment_type=type(name).__name__,
    )
    return coerced
1473
+
1474
+ def _set_current_span_attributes(self, attrs: dict[str, Any]) -> None:
1475
+ span = self._current_span()
1476
+ if span is None:
1477
+ return
1478
+ try:
1479
+ for key, value in attrs.items():
1480
+ if value is not None:
1481
+ span.set_attribute(key, value)
1482
+ except BaseException as exc:
1483
+ self.log_observability_failure("otel.set_span_attributes", exc)
1484
+
1485
+ def _current_span(self):
1486
+ if self.trace is None:
1487
+ return None
1488
+ try:
1489
+ return self.trace.get_current_span()
1490
+ except BaseException as exc:
1491
+ self.log_observability_failure("otel.current_span", exc)
1492
+ return None
1493
+
1494
+ def _trace_ids(self) -> tuple[str | None, str | None]:
1495
+ span = self._current_span()
1496
+ if span is None:
1497
+ return None, None
1498
+ try:
1499
+ context = span.get_span_context()
1500
+ except BaseException as exc:
1501
+ self.log_observability_failure("otel.span_context", exc)
1502
+ return None, None
1503
+ if not getattr(context, "is_valid", False):
1504
+ return None, None
1505
+ return f"{context.trace_id:032x}", f"{context.span_id:016x}"
1506
+
1507
+ def _extract_context(self, carrier: Any):
1508
+ if self.propagate is None or carrier is None:
1509
+ return None
1510
+ try:
1511
+ return self.propagate.extract(carrier)
1512
+ except BaseException as exc:
1513
+ self.log_observability_failure("otel.extract_context", exc)
1514
+ return None
1515
+
1516
+ def _record_exception_on_span(
1517
+ self,
1518
+ span,
1519
+ exc: BaseException,
1520
+ *,
1521
+ handled: bool,
1522
+ status_code: int | None,
1523
+ ) -> None:
1524
+ try:
1525
+ span.record_exception(exc)
1526
+ span.set_attribute("error.type", type(exc).__name__)
1527
+ span.set_attribute("error.handled", handled)
1528
+ if (
1529
+ self.Status is not None
1530
+ and self.StatusCode is not None
1531
+ and (
1532
+ not handled
1533
+ or (status_code is not None and status_code >= 500)
1534
+ )
1535
+ ):
1536
+ span.set_status(
1537
+ self.Status(
1538
+ self.StatusCode.ERROR,
1539
+ self._safe_str(exc),
1540
+ )
1541
+ )
1542
+ except BaseException as observability_exc:
1543
+ self.log_observability_failure(
1544
+ "otel.record_exception",
1545
+ observability_exc,
1546
+ original_error_type=type(exc).__name__,
1547
+ )
1548
+
1549
+ def _add_span_event(self, event: str, fields: dict[str, Any]) -> None:
1550
+ span = self._current_span()
1551
+ if span is None:
1552
+ return
1553
+ try:
1554
+ span.add_event(
1555
+ event,
1556
+ {
1557
+ key: value
1558
+ for key, value in fields.items()
1559
+ if _is_safe_span_value(value)
1560
+ },
1561
+ )
1562
+ except BaseException as exc:
1563
+ self.log_observability_failure(
1564
+ "otel.add_event",
1565
+ exc,
1566
+ event_name=event,
1567
+ )
1568
+
1569
+ def _close_active_request(
1570
+ self,
1571
+ context: RequestObservabilityContext,
1572
+ ) -> None:
1573
+ try:
1574
+ if context.active_closed:
1575
+ return
1576
+ context.active_closed = True
1577
+ self.record_active_request(-1, context.metric_attributes())
1578
+ except BaseException as exc:
1579
+ self.log_observability_failure(
1580
+ "request.close_active",
1581
+ exc,
1582
+ request_id=getattr(context, "request_id", None),
1583
+ )
1584
+
1585
+ def _reset_request_operation_context(
1586
+ self,
1587
+ context: RequestObservabilityContext,
1588
+ ) -> None:
1589
+ token = context.operation_token
1590
+ if token is None:
1591
+ return
1592
+ try:
1593
+ _OPERATION_CONTEXT.reset(token)
1594
+ except BaseException as exc:
1595
+ self.log_observability_failure(
1596
+ "request.operation_context_reset",
1597
+ exc,
1598
+ request_id=getattr(context, "request_id", None),
1599
+ )
1600
+
1601
+ def _reset_request_context(
1602
+ self,
1603
+ context: RequestObservabilityContext,
1604
+ ) -> None:
1605
+ token = context.context_token
1606
+ if token is None:
1607
+ return
1608
+ try:
1609
+ _REQUEST_CONTEXT.reset(token)
1610
+ except BaseException as exc:
1611
+ self.log_observability_failure(
1612
+ "request.context_reset",
1613
+ exc,
1614
+ request_id=getattr(context, "request_id", None),
1615
+ )
1616
+
1617
+ def _safe_perf_counter(self, operation: str) -> float | None:
1618
+ try:
1619
+ return time.perf_counter()
1620
+ except BaseException as exc:
1621
+ self.log_observability_failure(operation, exc)
1622
+ return None
1623
+
1624
+ def _safe_str(self, value: Any) -> str:
1625
+ try:
1626
+ return str(value)
1627
+ except BaseException:
1628
+ return f"<unprintable {type(value).__name__}>"
1629
+
1630
+ def _safe_traceback(self, exc: BaseException) -> str:
1631
+ try:
1632
+ return "".join(
1633
+ traceback.format_exception(type(exc), exc, exc.__traceback__)
1634
+ )
1635
+ except BaseException:
1636
+ return ""
1637
+
1638
+ def _json(self, payload: dict[str, Any]) -> str:
1639
+ try:
1640
+ return json.dumps(payload, sort_keys=True, default=str)
1641
+ except BaseException:
1642
+ return json.dumps(
1643
+ {
1644
+ "schema_version": "policyengine.observability.internal_error.v1",
1645
+ "event": "observability_internal_error",
1646
+ "created_at": datetime.now(UTC).isoformat(),
1647
+ "operation": "observability.failure_json",
1648
+ },
1649
+ sort_keys=True,
1650
+ )
1651
+
1652
+ def _write_stderr(self, payload: dict[str, Any]) -> None:
1653
+ try:
1654
+ sys.stderr.write(self._json(payload) + "\n")
1655
+ except BaseException:
1656
+ return
1657
+
1658
+
1659
+ def _is_safe_span_value(value: Any) -> bool:
1660
+ return isinstance(value, str | bool | int | float)
1661
+
1662
+
1663
+ class _OperationManager:
1664
+ def __init__(
1665
+ self,
1666
+ runtime: ObservabilityRuntime,
1667
+ name: str,
1668
+ *,
1669
+ flavor: str | None,
1670
+ attrs: dict[str, Any],
1671
+ ) -> None:
1672
+ self.runtime = runtime
1673
+ self.name = name
1674
+ self.flavor = flavor
1675
+ self.attrs = attrs
1676
+ self.handle: dict[str, Any] | None = None
1677
+
1678
+ def __enter__(self):
1679
+ self.handle = self.runtime.start_operation(
1680
+ self.name,
1681
+ flavor=self.flavor,
1682
+ **self.attrs,
1683
+ )
1684
+ return self.runtime.current_operation()
1685
+
1686
+ def __exit__(self, exc_type, exc, _traceback) -> bool:
1687
+ self.runtime.end_operation(self.handle, exc)
1688
+ return False
1689
+
1690
+ async def __aenter__(self):
1691
+ return self.__enter__()
1692
+
1693
+ async def __aexit__(self, exc_type, exc, traceback) -> bool:
1694
+ return self.__exit__(exc_type, exc, traceback)
1695
+
1696
+ def __call__(self, func):
1697
+ if inspect.iscoroutinefunction(func):
1698
+
1699
+ @wraps(func)
1700
+ async def async_wrapper(*args, **kwargs):
1701
+ async with self.runtime.operation(
1702
+ self.name,
1703
+ flavor=self.flavor,
1704
+ **self.attrs,
1705
+ ):
1706
+ return await func(*args, **kwargs)
1707
+
1708
+ return async_wrapper
1709
+
1710
+ @wraps(func)
1711
+ def wrapper(*args, **kwargs):
1712
+ with self.runtime.operation(
1713
+ self.name,
1714
+ flavor=self.flavor,
1715
+ **self.attrs,
1716
+ ):
1717
+ return func(*args, **kwargs)
1718
+
1719
+ return wrapper
1720
+
1721
+
1722
+ class _SegmentManager:
1723
+ def __init__(
1724
+ self,
1725
+ runtime: ObservabilityRuntime,
1726
+ name: Any,
1727
+ attrs: dict[str, Any],
1728
+ ) -> None:
1729
+ self.runtime = runtime
1730
+ self.name = name
1731
+ self.attrs = attrs
1732
+ self.context_manager = None
1733
+
1734
+ def __enter__(self):
1735
+ self.context_manager = self.runtime._segment_context(
1736
+ self.name,
1737
+ **self.attrs,
1738
+ )
1739
+ return self.context_manager.__enter__()
1740
+
1741
+ def __exit__(self, exc_type, exc, traceback) -> bool:
1742
+ if self.context_manager is None:
1743
+ return False
1744
+ return bool(self.context_manager.__exit__(exc_type, exc, traceback))
1745
+
1746
+ async def __aenter__(self):
1747
+ self.context_manager = self.runtime.asegment(self.name, **self.attrs)
1748
+ return await self.context_manager.__aenter__()
1749
+
1750
+ async def __aexit__(self, exc_type, exc, traceback) -> bool:
1751
+ if self.context_manager is None:
1752
+ return False
1753
+ return bool(
1754
+ await self.context_manager.__aexit__(exc_type, exc, traceback)
1755
+ )
1756
+
1757
+ def __call__(self, func):
1758
+ if inspect.iscoroutinefunction(func):
1759
+
1760
+ @wraps(func)
1761
+ async def async_wrapper(*args, **kwargs):
1762
+ async with self.runtime.segment(self.name, **self.attrs):
1763
+ return await func(*args, **kwargs)
1764
+
1765
+ return async_wrapper
1766
+
1767
+ @wraps(func)
1768
+ def wrapper(*args, **kwargs):
1769
+ with self.runtime.segment(self.name, **self.attrs):
1770
+ return func(*args, **kwargs)
1771
+
1772
+ return wrapper
1773
+
1774
+
1775
# Module-level runtime returned by observability_runtime(). It starts out as
# a runtime built from a default-constructed ObservabilityConfig and can be
# swapped with set_observability_runtime().
_RUNTIME = ObservabilityRuntime(ObservabilityConfig())


def set_observability_runtime(runtime: ObservabilityRuntime) -> None:
    """Replace the module-level runtime with ``runtime``."""
    global _RUNTIME
    _RUNTIME = runtime


def observability_runtime() -> ObservabilityRuntime:
    """Return the module-level runtime currently in effect."""
    return _RUNTIME