policyengine-observability 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- policyengine_observability/__init__.py +166 -0
- policyengine_observability/adapters/__init__.py +1 -0
- policyengine_observability/adapters/fastapi.py +286 -0
- policyengine_observability/adapters/flask.py +132 -0
- policyengine_observability/config.py +164 -0
- policyengine_observability/context.py +238 -0
- policyengine_observability/integrations/__init__.py +1 -0
- policyengine_observability/integrations/httpx.py +8 -0
- policyengine_observability/logging.py +17 -0
- policyengine_observability/runtime.py +1784 -0
- policyengine_observability/segments.py +35 -0
- policyengine_observability-0.2.0.dist-info/METADATA +52 -0
- policyengine_observability-0.2.0.dist-info/RECORD +14 -0
- policyengine_observability-0.2.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1784 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
import traceback
|
|
10
|
+
from collections.abc import AsyncIterator, Iterator
|
|
11
|
+
from contextlib import asynccontextmanager, contextmanager
|
|
12
|
+
from contextvars import ContextVar
|
|
13
|
+
from datetime import UTC, datetime
|
|
14
|
+
from enum import Enum
|
|
15
|
+
from functools import wraps
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from .config import ObservabilityConfig
|
|
19
|
+
from .context import (
|
|
20
|
+
ErrorRecord,
|
|
21
|
+
OperationObservabilityContext,
|
|
22
|
+
RequestObservabilityContext,
|
|
23
|
+
_metric_attrs,
|
|
24
|
+
)
|
|
25
|
+
from .logging import configure_plain_logger
|
|
26
|
+
from .segments import coerce_segment_name
|
|
27
|
+
|
|
28
|
+
# HTTP header names used by the runtime.
# NOTE(review): the internal-dispatch header is not read in the visible part
# of this file; presumably the framework adapters consume it - confirm.
OBSERVABILITY_INTERNAL_DISPATCH_HEADER = "X-PolicyEngine-Internal-Dispatch"
# Added to response headers by ``prepare_response``.
REQUEST_ID_HEADER = "X-PolicyEngine-Request-Id"
# Carrier key read back after ``propagate.inject`` in ``traceparent_header``.
TRACEPARENT_HEADER = "traceparent"

# Logger names, one per kind of record the runtime emits.
REQUEST_LOGGER_NAME = "policyengine_observability.requests"
OPERATION_LOGGER_NAME = "policyengine_observability.operations"
EVENT_LOGGER_NAME = "policyengine_observability.events"
INTERNAL_LOGGER_NAME = "policyengine_observability.internal"

# Module-level logger handles: request logs, operation logs and event logs
# are written as JSON strings via ``.info(...)`` by the emit_*/record_event
# methods below.
REQUEST_LOGGER = logging.getLogger(REQUEST_LOGGER_NAME)
OPERATION_LOGGER = logging.getLogger(OPERATION_LOGGER_NAME)
EVENT_LOGGER = logging.getLogger(EVENT_LOGGER_NAME)
INTERNAL_LOGGER = logging.getLogger(INTERNAL_LOGGER_NAME)
|
|
41
|
+
|
|
42
|
+
# Request context for the current execution context; set by
# ``begin_request`` and read through ``current_context``.
_REQUEST_CONTEXT: ContextVar[RequestObservabilityContext | None] = ContextVar(
    "policyengine_request_observability_context",
    default=None,
)
# Operation context; set by ``start_operation`` and
# ``_begin_request_operation``, reset by ``end_operation``.
_OPERATION_CONTEXT: ContextVar[OperationObservabilityContext | None] = (
    ContextVar(
        "policyengine_operation_observability_context",
        default=None,
    )
)
# Mutable timings dict that ``mark`` writes into; ``None`` means no scope is
# currently collecting timings.
_TIMINGS: ContextVar[dict[str, float] | None] = ContextVar(
    "policyengine_observability_timings",
    default=None,
)
# ``time.perf_counter()`` value captured when a scope/operation starts;
# ``mark_ttft`` measures elapsed time from it.
_TURN_START: ContextVar[float | None] = ContextVar(
    "policyengine_observability_turn_start",
    default=None,
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class _NoOpInstrument:
|
|
63
|
+
def add(self, *_args, **_kwargs) -> None:
|
|
64
|
+
return None
|
|
65
|
+
|
|
66
|
+
def record(self, *_args, **_kwargs) -> None:
|
|
67
|
+
return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class ObservabilityRuntime:
|
|
71
|
+
def __init__(
    self,
    config: ObservabilityConfig,
    *,
    segment_registry: type[Enum] | None = None,
) -> None:
    """Store the configuration and start with every backend unconfigured.

    Args:
        config: Runtime configuration; ``config.enabled`` is copied to
            ``self.enabled``.
        segment_registry: Optional ``Enum`` type kept on the instance.

    All OpenTelemetry handles start as ``None`` and every metric
    instrument starts as a ``_NoOpInstrument``, so the instance is usable
    before (or without) ``configure`` being called.
    """
    self.config = config
    self.segment_registry = segment_registry
    self.enabled = config.enabled

    # OpenTelemetry handles; chained assignment binds left to right, so the
    # attributes are created in the order written.
    self.trace = self.propagate = None
    self.SpanKind = self.Status = self.StatusCode = None
    self.tracer_provider = self.meter_provider = None
    self.tracer = self.meter = None

    # Metric instruments: each attribute gets its own no-op placeholder.
    noop = _NoOpInstrument
    self.operation_duration = noop()
    self.http_duration = noop()
    self.segment_duration = noop()
    self.calculate_duration = noop()
    self.backend_duration = noop()
    self.operations = noop()
    self.requests = noop()
    self.errors = noop()
    self.rate_limited = noop()
    self.failover_events = noop()
    self.active_requests = noop()

    self._httpx_instrumented = False
|
|
101
|
+
|
|
102
|
+
@classmethod
def disabled(cls) -> ObservabilityRuntime:
    """Build a runtime whose configuration has ``enabled=False``."""
    off_config = ObservabilityConfig(enabled=False)
    return cls(off_config)
|
|
105
|
+
|
|
106
|
+
def configure(self) -> None:
|
|
107
|
+
self._configure_loggers()
|
|
108
|
+
if not self.enabled or not self.config.otel_enabled:
|
|
109
|
+
return
|
|
110
|
+
self._configure_otel()
|
|
111
|
+
if self.config.instrument_httpx:
|
|
112
|
+
self.instrument_httpx()
|
|
113
|
+
|
|
114
|
+
def current_context(self) -> RequestObservabilityContext | None:
    """Return the request context stored in ``_REQUEST_CONTEXT``.

    Any failure while reading it is reported through
    ``log_observability_failure`` and ``None`` is returned instead.
    """
    try:
        active = _REQUEST_CONTEXT.get()
    except BaseException as exc:
        self.log_observability_failure("context.current", exc)
        active = None
    return active
|
|
120
|
+
|
|
121
|
+
def current_operation(
    self,
) -> OperationObservabilityContext | None:
    """Return the operation context stored in ``_OPERATION_CONTEXT``.

    Any failure while reading it is reported through
    ``log_observability_failure`` and ``None`` is returned instead.
    """
    try:
        active = _OPERATION_CONTEXT.get()
    except BaseException as exc:
        self.log_observability_failure("operation.current", exc)
        active = None
    return active
|
|
129
|
+
|
|
130
|
+
def operation(
    self,
    name: str,
    *,
    flavor: str | None = None,
    **attrs: Any,
):
    """Create an ``_OperationManager`` bound to this runtime.

    Args:
        name: Operation name.
        flavor: Optional flavor label forwarded to the manager.
        **attrs: Extra attributes, passed to the manager as one dict.

    Returns:
        The manager object; ``entrypoint`` applies it to a function as a
        decorator.
    """
    manager = _OperationManager(self, name, flavor=flavor, attrs=attrs)
    return manager
|
|
138
|
+
|
|
139
|
+
def entrypoint(
|
|
140
|
+
self,
|
|
141
|
+
name: str | None = None,
|
|
142
|
+
*,
|
|
143
|
+
flavor: str | None = None,
|
|
144
|
+
**attrs: Any,
|
|
145
|
+
):
|
|
146
|
+
def decorator(func):
|
|
147
|
+
operation_name = name or getattr(func, "__name__", "operation")
|
|
148
|
+
return self.operation(
|
|
149
|
+
operation_name,
|
|
150
|
+
flavor=flavor,
|
|
151
|
+
**attrs,
|
|
152
|
+
)(func)
|
|
153
|
+
|
|
154
|
+
return decorator
|
|
155
|
+
|
|
156
|
+
def start_operation(
    self,
    name: str,
    *,
    flavor: str | None = None,
    parent_context: Any = None,
    timings: dict[str, float] | None = None,
    emit_log: bool = True,
    record_metric: bool = True,
    **attrs: Any,
) -> dict[str, Any]:
    """Begin a named operation and return the handle for ``end_operation``.

    Args:
        name: Operation name; stored after passing through ``_safe_str``.
        flavor: Optional flavor label stored on the operation context.
        parent_context: Optional OpenTelemetry context; attached only when
            a tracer is configured.
        timings: Optional dict published through ``_TIMINGS`` so ``mark``
            writes into it.
        emit_log: Stored on the operation; read by ``complete_operation``.
        record_metric: Stored on the operation; read by
            ``complete_operation``.
        **attrs: Operation attributes; entries whose value is ``None`` are
            dropped.

    Returns:
        A dict of the operation and the tokens needed to undo every
        context change. All entries are ``None`` when the runtime is
        disabled; entries not reached before a failure stay ``None``.
    """
    handle = {
        "operation": None,
        "operation_token": None,
        "timings_token": None,
        "start_token": None,
        "context_token": None,
    }
    if not self.enabled:
        return handle
    try:
        operation = OperationObservabilityContext(
            config=self.config,
            name=self._safe_str(name),
            flavor=flavor,
            attributes={
                key: value
                for key, value in attrs.items()
                if value is not None
            },
            timings_ms={},
            emit_log=emit_log,
            record_metric=record_metric,
        )
        # Publish the operation first so later steps (and any failure
        # cleanup in end_operation) can find its reset token in the handle.
        operation.context_token = _OPERATION_CONTEXT.set(operation)
        handle["operation"] = operation
        handle["operation_token"] = operation.context_token
        if timings is not None:
            handle["timings_token"] = _TIMINGS.set(timings)
        # NOTE(review): the diff rendering this was recovered from lost all
        # indentation; this statement is reconstructed as unconditional
        # (not nested under the ``timings`` check) - confirm against the
        # packaged source.
        handle["start_token"] = _TURN_START.set(time.perf_counter())
        if parent_context is not None and self.tracer is not None:
            try:
                from opentelemetry import context as otel_context

                handle["context_token"] = otel_context.attach(
                    parent_context
                )
            except BaseException as exc:
                # A failed attach is logged but does not stop the span
                # from being started below.
                self.log_observability_failure(
                    "operation.context_attach",
                    exc,
                )
        if self.tracer is not None:
            operation.span_handle = self._start_span(
                self._span_name(operation.name),
                operation.span_attributes(),
            )
    except BaseException as exc:
        self.log_observability_failure("operation.start", exc, name=name)
    return handle
|
|
216
|
+
|
|
217
|
+
def end_operation(
    self,
    handle: dict[str, Any] | None,
    error: BaseException | None = None,
) -> None:
    """Finish an operation started by ``start_operation``.

    Args:
        handle: The dict returned by ``start_operation``. A falsy handle
            makes the call a no-op.
        error: The exception that ended the operation, if any. It is
            stored on the operation as an unhandled ``ErrorRecord``,
            counted through ``record_error_metric`` and passed to
            ``_end_span``.

    The span is ended in its own guarded step, so a failure while
    recording the error or completing the operation cannot leave the span
    open. The OpenTelemetry context and the context variables are always
    restored afterwards. Failures are reported through
    ``log_observability_failure`` and never raised to the caller.
    """
    if not handle:
        return
    operation = handle.get("operation")
    try:
        if operation is not None:
            try:
                if error is not None:
                    operation.error = ErrorRecord(
                        type=type(error).__name__,
                        message=self._safe_str(error),
                        handled=False,
                        stack=self._safe_traceback(error),
                    )
                    self.record_error_metric(
                        operation.metric_attributes(
                            error_type=type(error).__name__
                        )
                    )
                self.complete_operation(operation)
            except BaseException as exc:
                self.log_observability_failure("operation.end", exc)
            # Separate guard: previously an exception above skipped this
            # call and the span was never ended.
            try:
                self._end_span(operation.span_handle, error)
            except BaseException as exc:
                self.log_observability_failure("operation.span_end", exc)
    finally:
        context_token = handle.get("context_token")
        if context_token is not None:
            try:
                from opentelemetry import context as otel_context

                otel_context.detach(context_token)
            except BaseException as exc:
                self.log_observability_failure(
                    "operation.context_detach",
                    exc,
                )
        # Each variable is reset independently so one failed reset does
        # not prevent the others from being restored.
        for var, key in (
            (_TIMINGS, "timings_token"),
            (_TURN_START, "start_token"),
            (_OPERATION_CONTEXT, "operation_token"),
        ):
            token = handle.get(key)
            if token is not None:
                try:
                    var.reset(token)
                except BaseException as exc:
                    self.log_observability_failure(
                        "operation.context_reset",
                        exc,
                        token=key,
                    )
|
|
271
|
+
|
|
272
|
+
def complete_operation(
|
|
273
|
+
self,
|
|
274
|
+
operation: OperationObservabilityContext,
|
|
275
|
+
) -> None:
|
|
276
|
+
if operation.metric_recorded:
|
|
277
|
+
return
|
|
278
|
+
operation.metric_recorded = True
|
|
279
|
+
if operation.record_metric:
|
|
280
|
+
self.record_operation_metric(
|
|
281
|
+
operation.duration_seconds(),
|
|
282
|
+
operation.metric_attributes(),
|
|
283
|
+
)
|
|
284
|
+
if operation.emit_log:
|
|
285
|
+
self.emit_operation_log(operation)
|
|
286
|
+
|
|
287
|
+
def begin_request(
    self,
    context: RequestObservabilityContext,
    *,
    carrier: Any = None,
) -> None:
    """Make ``context`` the current request and start tracking it.

    Args:
        context: The request context to publish in ``_REQUEST_CONTEXT``.
        carrier: Passed through to ``_start_request_span``.

    Does nothing when the runtime is disabled. Failures are reported
    through ``log_observability_failure`` and never raised.
    """
    if self.enabled:
        try:
            context.context_token = _REQUEST_CONTEXT.set(context)
            context.set_attribute("endpoint", context.endpoint)
            self._begin_request_operation(context)
            self._start_request_span(context, carrier=carrier)
            active_attrs = context.metric_attributes()
            self.record_active_request(1, active_attrs)
        except BaseException as exc:
            self.log_observability_failure("request.begin", exc)
|
|
303
|
+
|
|
304
|
+
def _begin_request_operation(
    self,
    context: RequestObservabilityContext,
) -> None:
    """Create the operation context that accompanies a request.

    The operation is named after the request route, uses flavor
    ``"http"``, shares the request's ``timings_ms`` object, and has both
    ``emit_log`` and ``record_metric`` turned off. It is published in
    ``_OPERATION_CONTEXT`` and linked from ``context``. Failures are
    reported through ``log_observability_failure`` and never raised.
    """
    try:
        operation = OperationObservabilityContext(
            config=context.config,
            name=context.route,
            flavor="http",
            attributes={
                "route": context.route,
                "method": context.method,
                "endpoint": context.endpoint,
                "path": context.path,
            },
            timings_ms=context.timings_ms,
            emit_log=False,
            record_metric=False,
        )
        operation.context_token = _OPERATION_CONTEXT.set(operation)
        # The token is kept on both objects so the request can later
        # restore the previous operation context.
        context.operation_context = operation
        context.operation_token = operation.context_token
    except BaseException as exc:
        self.log_observability_failure(
            "request.operation_begin",
            exc,
            request_id=getattr(context, "request_id", None),
        )
|
|
332
|
+
|
|
333
|
+
def finish_request(self, status_code: int) -> dict[str, str]:
|
|
334
|
+
headers = self.prepare_response(status_code)
|
|
335
|
+
self.complete_request(status_code)
|
|
336
|
+
return headers
|
|
337
|
+
|
|
338
|
+
def prepare_response(self, status_code: int) -> dict[str, str]:
|
|
339
|
+
if not self.enabled:
|
|
340
|
+
return {}
|
|
341
|
+
headers: dict[str, str] = {}
|
|
342
|
+
try:
|
|
343
|
+
context = self.current_context()
|
|
344
|
+
if context is None:
|
|
345
|
+
return headers
|
|
346
|
+
context.status_code = status_code
|
|
347
|
+
self._set_current_span_attributes(context.span_attributes())
|
|
348
|
+
if context.operation_context is not None:
|
|
349
|
+
context.operation_context.set_attribute(
|
|
350
|
+
"status_code",
|
|
351
|
+
str(status_code),
|
|
352
|
+
)
|
|
353
|
+
headers[REQUEST_ID_HEADER] = context.request_id
|
|
354
|
+
traceparent = self.traceparent_header()
|
|
355
|
+
if traceparent:
|
|
356
|
+
headers[TRACEPARENT_HEADER] = traceparent
|
|
357
|
+
if status_code == 429:
|
|
358
|
+
context.set_attribute("rate_limited", True)
|
|
359
|
+
return headers
|
|
360
|
+
except BaseException as exc:
|
|
361
|
+
self.log_observability_failure("request.prepare_response", exc)
|
|
362
|
+
return headers
|
|
363
|
+
|
|
364
|
+
def complete_request(self, status_code: int | None = None) -> None:
|
|
365
|
+
if not self.enabled:
|
|
366
|
+
return
|
|
367
|
+
try:
|
|
368
|
+
context = self.current_context()
|
|
369
|
+
if context is None:
|
|
370
|
+
return
|
|
371
|
+
if status_code is not None:
|
|
372
|
+
context.status_code = status_code
|
|
373
|
+
self._set_current_span_attributes(context.span_attributes())
|
|
374
|
+
if context.request_metric_recorded:
|
|
375
|
+
return
|
|
376
|
+
context.request_metric_recorded = True
|
|
377
|
+
if context.status_code == 429:
|
|
378
|
+
self.record_rate_limited_metric(context.metric_attributes())
|
|
379
|
+
self.record_request_metric(
|
|
380
|
+
context.duration_seconds(),
|
|
381
|
+
context.metric_attributes(),
|
|
382
|
+
)
|
|
383
|
+
self._close_active_request(context)
|
|
384
|
+
except BaseException as exc:
|
|
385
|
+
self.log_observability_failure("request.complete", exc)
|
|
386
|
+
|
|
387
|
+
def update_request_route(
    self,
    *,
    route: str | None = None,
    endpoint: str | None = None,
) -> None:
    """Update the route and/or endpoint of the current request.

    Args:
        route: New route; ignored when falsy. Also becomes the name and
            ``route`` attribute of the request's operation context and,
            when the server span supports ``update_name``, the span name.
        endpoint: New endpoint; ignored when falsy.

    When the route actually changes and the request is still counted as
    active, the active-request count is moved from the old metric
    attributes to the new ones. Does nothing when the runtime is disabled
    or no request context is current. Failures are reported through
    ``log_observability_failure``.
    """
    if not self.enabled:
        return
    try:
        context = self.current_context()
        if context is None:
            return
        route_changed = bool(route and route != context.route)
        # Snapshot the attributes BEFORE mutating the context so the
        # decrement below matches the increment made under the old route.
        old_active_attributes = (
            context.metric_attributes()
            if route_changed and not context.active_closed
            else None
        )
        if route:
            context.route = route
            if context.operation_context is not None:
                context.operation_context.name = route
                context.operation_context.set_attribute("route", route)
        if endpoint:
            context.endpoint = endpoint
            context.set_attribute("endpoint", endpoint)
            if context.operation_context is not None:
                context.operation_context.set_attribute(
                    "endpoint",
                    endpoint,
                )
        self._set_current_span_attributes(context.span_attributes())
        span = context.server_span
        # getattr with a default tolerates a missing span (None) or a span
        # object without ``update_name``.
        update_name = getattr(span, "update_name", None)
        if route and update_name is not None:
            update_name(route)
        if old_active_attributes is not None:
            self.record_active_request(-1, old_active_attributes)
            self.record_active_request(1, context.metric_attributes())
    except BaseException as exc:
        self.log_observability_failure("request.update_route", exc)
|
|
428
|
+
|
|
429
|
+
def teardown_request(self, exc: BaseException | None = None) -> None:
|
|
430
|
+
if not self.enabled:
|
|
431
|
+
return
|
|
432
|
+
context = self.current_context()
|
|
433
|
+
if context is None:
|
|
434
|
+
return
|
|
435
|
+
try:
|
|
436
|
+
if exc is not None:
|
|
437
|
+
self.record_error(
|
|
438
|
+
exc,
|
|
439
|
+
handled=False,
|
|
440
|
+
status_code=context.status_code or 500,
|
|
441
|
+
)
|
|
442
|
+
self._close_active_request(context)
|
|
443
|
+
self.emit_request_log(context)
|
|
444
|
+
except BaseException as observability_exc:
|
|
445
|
+
self.log_observability_failure(
|
|
446
|
+
"request.teardown",
|
|
447
|
+
observability_exc,
|
|
448
|
+
)
|
|
449
|
+
finally:
|
|
450
|
+
self._close_request_span(context, exc)
|
|
451
|
+
self._reset_request_operation_context(context)
|
|
452
|
+
self._reset_request_context(context)
|
|
453
|
+
|
|
454
|
+
def set_attribute(self, key: str, value: Any) -> None:
|
|
455
|
+
if not self.enabled:
|
|
456
|
+
return
|
|
457
|
+
try:
|
|
458
|
+
context = self.current_context()
|
|
459
|
+
if context is not None:
|
|
460
|
+
context.set_attribute(key, value)
|
|
461
|
+
if context.operation_context is not None:
|
|
462
|
+
context.operation_context.set_attribute(key, value)
|
|
463
|
+
self._set_current_span_attributes(
|
|
464
|
+
context.span_attributes(**{f"policyengine.{key}": value})
|
|
465
|
+
)
|
|
466
|
+
return
|
|
467
|
+
operation = self.current_operation()
|
|
468
|
+
if operation is not None:
|
|
469
|
+
operation.set_attribute(key, value)
|
|
470
|
+
self._set_current_span_attributes(
|
|
471
|
+
operation.span_attributes(**{f"policyengine.{key}": value})
|
|
472
|
+
)
|
|
473
|
+
except BaseException as exc:
|
|
474
|
+
self.log_observability_failure(
|
|
475
|
+
"request.set_attribute",
|
|
476
|
+
exc,
|
|
477
|
+
attribute=key,
|
|
478
|
+
)
|
|
479
|
+
|
|
480
|
+
def segment(self, name: Any, **attrs: Any) -> _SegmentManager:
    """Create a ``_SegmentManager`` for the named segment.

    Args:
        name: Segment name, passed to the manager unchanged.
        **attrs: Segment attributes, passed to the manager as one dict.

    Returns:
        The manager object. The previous ``Iterator[Any]`` annotation did
        not describe this return value.
    """
    return _SegmentManager(self, name, attrs)
|
|
482
|
+
|
|
483
|
+
@contextmanager
def _segment_context(self, name: Any, **attrs: Any) -> Iterator[Any]:
    """Context manager that times one segment and wraps it in a span.

    Yields the span produced by ``_safe_span`` (or ``None`` when the
    runtime is disabled). The segment is recorded through
    ``_record_segment_safely`` whether the body succeeds or raises, the
    body's exception is re-raised, and the implicit operation started for
    the segment is always ended with the captured error.
    """
    if not self.enabled:
        yield None
        return
    segment_name = self._coerce_segment(name)
    implicit_operation = self._start_implicit_operation(
        segment_name,
        attrs,
    )
    start = self._safe_perf_counter(f"segment.{segment_name}.start")
    span_attrs = self._segment_span_attributes(attrs)
    span_name = self._span_name(segment_name)
    error: BaseException | None = None
    with self._safe_span(span_name, span_attrs) as span:
        try:
            yield span
        except BaseException as exc:
            # Remember the error so end_operation can attach it below.
            error = exc
            self._record_segment_safely(segment_name, start, attrs)
            raise
        else:
            self._record_segment_safely(segment_name, start, attrs)
        finally:
            self.end_operation(implicit_operation, error)
|
|
508
|
+
|
|
509
|
+
@asynccontextmanager
async def asegment(self, name: Any, **attrs: Any) -> AsyncIterator[Any]:
    """Async context manager that times one segment and wraps it in a span.

    Performs the same steps as ``_segment_context``; the body here
    contains no awaits of its own. Yields the span from ``_safe_span`` (or
    ``None`` when the runtime is disabled), records the segment on both
    success and failure, re-raises the body's exception, and always ends
    the implicit operation with the captured error.
    """
    if not self.enabled:
        yield None
        return
    segment_name = self._coerce_segment(name)
    implicit_operation = self._start_implicit_operation(
        segment_name,
        attrs,
    )
    start = self._safe_perf_counter(f"segment.{segment_name}.start")
    span_attrs = self._segment_span_attributes(attrs)
    span_name = self._span_name(segment_name)
    error: BaseException | None = None
    with self._safe_span(span_name, span_attrs) as span:
        try:
            yield span
        except BaseException as exc:
            # Remember the error so end_operation can attach it below.
            error = exc
            self._record_segment_safely(segment_name, start, attrs)
            raise
        else:
            self._record_segment_safely(segment_name, start, attrs)
        finally:
            self.end_operation(implicit_operation, error)
|
|
534
|
+
|
|
535
|
+
@contextmanager
|
|
536
|
+
def collect_timings(self, name: str = "operation", **attrs: Any):
|
|
537
|
+
timings: dict[str, float] = {}
|
|
538
|
+
handle = self.start_scope(timings, name=name, **attrs)
|
|
539
|
+
error: BaseException | None = None
|
|
540
|
+
try:
|
|
541
|
+
yield timings
|
|
542
|
+
except BaseException as exc:
|
|
543
|
+
error = exc
|
|
544
|
+
raise
|
|
545
|
+
finally:
|
|
546
|
+
self.end_scope(handle, error)
|
|
547
|
+
|
|
548
|
+
def start_scope(
    self,
    timings: dict[str, float],
    *,
    name: str = "operation",
    parent_context: Any = None,
    **attrs: Any,
) -> dict[str, Any]:
    """Open a timing scope and return the handle for ``end_scope``.

    Args:
        timings: Dict published through ``_TIMINGS`` for ``mark`` to fill.
        name: Operation or span name.
        parent_context: Optional OpenTelemetry context; attached only when
            a tracer is configured.
        **attrs: Attributes for the operation or span.

    Returns:
        When no operation is current, ``{"operation_handle": ...}``
        wrapping a full ``start_operation`` handle. Otherwise a nested
        scope handle holding the context-variable tokens, the optional
        OpenTelemetry token and the span handle. Each step is guarded
        separately, so an entry is ``None`` if its step failed.
    """
    if self.current_operation() is None:
        # Top-level scope: delegate to a full operation.
        return {
            "operation_handle": self.start_operation(
                name,
                parent_context=parent_context,
                timings=timings,
                **attrs,
            )
        }
    handle = {
        "operation_handle": None,
        "timings_token": None,
        "start_token": None,
        "context_token": None,
        "span": None,
    }
    try:
        handle["timings_token"] = _TIMINGS.set(timings)
    except BaseException as exc:
        self.log_observability_failure("scope.timings_set", exc)
    try:
        handle["start_token"] = _TURN_START.set(time.perf_counter())
    except BaseException as exc:
        self.log_observability_failure("scope.start_set", exc)
    if parent_context is not None and self.tracer is not None:
        try:
            from opentelemetry import context as otel_context

            handle["context_token"] = otel_context.attach(parent_context)
        except BaseException as exc:
            self.log_observability_failure("scope.context_attach", exc)
    try:
        if self.tracer is not None:
            # Unlike start_operation, the span is started with the raw
            # name and attrs (no _span_name / span_attributes call).
            handle["span"] = self._start_span(name, attrs)
    except BaseException as exc:
        self.log_observability_failure("scope.span_start", exc, span=name)
        handle["span"] = None
    return handle
|
|
594
|
+
|
|
595
|
+
def annotate(
|
|
596
|
+
self,
|
|
597
|
+
handle: dict[str, Any] | None = None,
|
|
598
|
+
**attrs: Any,
|
|
599
|
+
) -> None:
|
|
600
|
+
try:
|
|
601
|
+
if handle:
|
|
602
|
+
span_handle = handle.get("span")
|
|
603
|
+
if span_handle is not None:
|
|
604
|
+
_cm, span = span_handle
|
|
605
|
+
for key, value in attrs.items():
|
|
606
|
+
if value is not None:
|
|
607
|
+
span.set_attribute(key, value)
|
|
608
|
+
context = self.current_context()
|
|
609
|
+
if context is not None:
|
|
610
|
+
for key, value in attrs.items():
|
|
611
|
+
context.set_attribute(key, value)
|
|
612
|
+
operation = self.current_operation()
|
|
613
|
+
if operation is not None:
|
|
614
|
+
for key, value in attrs.items():
|
|
615
|
+
operation.set_attribute(key, value)
|
|
616
|
+
self._set_current_span_attributes(operation.span_attributes())
|
|
617
|
+
except BaseException as exc:
|
|
618
|
+
self.log_observability_failure("scope.annotate", exc)
|
|
619
|
+
|
|
620
|
+
def end_scope(
    self,
    handle: dict[str, Any] | None,
    error: BaseException | None = None,
) -> None:
    """Close a scope opened by ``start_scope``.

    Args:
        handle: The dict returned by ``start_scope``; a falsy handle makes
            the call a no-op.
        error: The exception that ended the scope, if any.

    A handle that wraps a full operation is handed to ``end_operation``.
    Otherwise the scope span is ended, the OpenTelemetry context is
    detached and the timing context variables are reset, each in its own
    guarded step.
    """
    if not handle:
        return
    operation_handle = handle.get("operation_handle")
    if operation_handle is not None:
        # Top-level scope: end_operation performs all of the cleanup.
        self.end_operation(operation_handle, error)
        return
    try:
        self._end_span(handle.get("span"), error)
    except BaseException as exc:
        self.log_observability_failure("scope.span_end", exc)
    context_token = handle.get("context_token")
    if context_token is not None:
        try:
            from opentelemetry import context as otel_context

            otel_context.detach(context_token)
        except BaseException as exc:
            self.log_observability_failure("scope.context_detach", exc)
    # Reset each variable independently so one failure does not block
    # the other.
    for var, key in (
        (_TIMINGS, "timings_token"),
        (_TURN_START, "start_token"),
    ):
        token = handle.get(key)
        if token is not None:
            try:
                var.reset(token)
            except BaseException as exc:
                self.log_observability_failure(
                    "scope.context_reset",
                    exc,
                    token=key,
                )
|
|
657
|
+
|
|
658
|
+
def mark(self, key: str, ms: float) -> None:
    """Store ``ms`` (rounded to one decimal) under ``key`` in the timings.

    Does nothing when no timings dict is published in ``_TIMINGS``.
    Failures are reported through ``log_observability_failure``.
    """
    try:
        sink = _TIMINGS.get()
        if sink is None:
            return
        sink[key] = round(float(ms), 1)
    except BaseException as exc:
        self.log_observability_failure("scope.mark", exc, key=key)
|
|
665
|
+
|
|
666
|
+
def mark_ttft(self, key: str = "ttft_ms") -> None:
    """Mark the milliseconds elapsed since the scope start under ``key``.

    The start is the ``time.perf_counter()`` value held in
    ``_TURN_START``; nothing is marked when it is unset. Failures are
    reported through ``log_observability_failure``.
    """
    try:
        began = _TURN_START.get()
        if began is not None:
            elapsed_ms = (time.perf_counter() - began) * 1000.0
            self.mark(key, elapsed_ms)
    except BaseException as exc:
        self.log_observability_failure("scope.mark_ttft", exc)
|
|
673
|
+
|
|
674
|
+
def record_error(
    self,
    exc: BaseException,
    *,
    handled: bool,
    status_code: int | None = None,
    include_stack: bool = True,
) -> None:
    """Record an exception on the current request or operation.

    Args:
        exc: The exception to record.
        handled: Stored on the ``ErrorRecord`` and forwarded to the span.
        status_code: When given and a request context is current,
            overwrites ``context.status_code``.
        include_stack: When false, the record's ``stack`` is ``None``.

    The request context takes precedence over the operation context. With
    neither present nothing is recorded. Otherwise the error metric is
    recorded and the exception is attached to the current span, if any.
    Does nothing when the runtime is disabled; failures are reported
    through ``log_observability_failure``.
    """
    if not self.enabled:
        return
    try:
        context = self.current_context()
        operation = self.current_operation()
        error_record = ErrorRecord(
            type=type(exc).__name__,
            message=self._safe_str(exc),
            handled=handled,
            stack=(self._safe_traceback(exc) if include_stack else None),
        )
        if context is not None:
            if status_code is not None:
                context.status_code = status_code
            context.error = error_record
            self.record_error_metric(
                context.metric_attributes(error_type=type(exc).__name__)
            )
        elif operation is not None:
            operation.error = error_record
            self.record_error_metric(
                operation.metric_attributes(error_type=type(exc).__name__)
            )
        else:
            # Nothing to attach the error to.
            return
        span = self._current_span()
        if span is not None:
            self._record_exception_on_span(
                span,
                exc,
                handled=handled,
                status_code=status_code,
            )
    except BaseException as observability_exc:
        self.log_observability_failure(
            "request.record_error",
            observability_exc,
            original_error_type=type(exc).__name__,
        )
|
|
721
|
+
|
|
722
|
+
def record_event(self, event: str, **fields: Any) -> None:
    """Emit a structured event log and mirror it onto the current span.

    Args:
        event: Event name.
        **fields: Extra fields; entries whose value is ``None`` are
            dropped. Remaining fields are merged last, so they override
            base keys of the same name.

    The record always carries the schema version, event name and a UTC
    timestamp. Service, trace and request/operation identifiers are added
    from the current request context or, without one, the current
    operation. Events whose name starts with ``"modal_"`` or contains
    ``"fallback"`` also record the failover-event metric. Does nothing
    when the runtime is disabled; failures are reported through
    ``log_observability_failure``.
    """
    if not self.enabled:
        return
    try:
        context = self.current_context()
        operation = self.current_operation()
        base: dict[str, Any] = {
            "schema_version": "policyengine.observability.event.v1",
            "event": event,
            "created_at": datetime.now(UTC).isoformat(),
        }
        if context is not None:
            trace_id, span_id = self._trace_ids()
            base.update(
                {
                    "service_name": context.config.service_name,
                    "service_role": context.config.service_role,
                    "environment": context.config.environment,
                    "request_id": context.request_id,
                    "trace_id": trace_id,
                    "span_id": span_id,
                    "route": context.route,
                    "path": context.path,
                }
            )
        elif operation is not None:
            trace_id, span_id = self._trace_ids()
            base.update(
                {
                    "service_name": operation.config.service_name,
                    "service_role": operation.config.service_role,
                    "environment": operation.config.environment,
                    "operation": operation.name,
                    "flavor": operation.flavor,
                    "trace_id": trace_id,
                    "span_id": span_id,
                }
            )
        clean_fields = {
            key: value
            for key, value in fields.items()
            if value is not None
        }
        base.update(clean_fields)
        EVENT_LOGGER.info(self._json(base))
        self._add_span_event(event, clean_fields)
        if event.startswith("modal_") or "fallback" in event:
            # Choose the attribute source with the same ``is not None``
            # tests used for the base fields above, so both selections
            # always agree on which context describes the event.
            if context is not None:
                attrs = context.metric_attributes(event=event)
            elif operation is not None:
                attrs = operation.metric_attributes(event=event)
            else:
                attrs = _metric_attrs(
                    {"event": event},
                    self.config.metric_attribute_keys,
                )
            self.record_failover_event_metric(attrs)
    except BaseException as exc:
        self.log_observability_failure(
            "request.record_event",
            exc,
            event_name=event,
        )
|
|
786
|
+
|
|
787
|
+
def traceparent_header(self) -> str | None:
|
|
788
|
+
if not self.enabled or self.propagate is None:
|
|
789
|
+
return None
|
|
790
|
+
try:
|
|
791
|
+
carrier: dict[str, str] = {}
|
|
792
|
+
self.propagate.inject(carrier)
|
|
793
|
+
return carrier.get(TRACEPARENT_HEADER)
|
|
794
|
+
except BaseException as exc:
|
|
795
|
+
self.log_observability_failure("request.traceparent_header", exc)
|
|
796
|
+
return None
|
|
797
|
+
|
|
798
|
+
def capture_context(self):
|
|
799
|
+
if self.tracer is None:
|
|
800
|
+
return None
|
|
801
|
+
try:
|
|
802
|
+
from opentelemetry import context as otel_context
|
|
803
|
+
|
|
804
|
+
return otel_context.get_current()
|
|
805
|
+
except BaseException as exc:
|
|
806
|
+
self.log_observability_failure("otel.capture_context", exc)
|
|
807
|
+
return None
|
|
808
|
+
|
|
809
|
+
def emit_request_log(self, context: RequestObservabilityContext) -> None:
|
|
810
|
+
if not self.enabled:
|
|
811
|
+
return
|
|
812
|
+
try:
|
|
813
|
+
if context.emitted:
|
|
814
|
+
return
|
|
815
|
+
context.emitted = True
|
|
816
|
+
if (
|
|
817
|
+
context.internal_dispatch
|
|
818
|
+
or not context.config.request_logs_enabled
|
|
819
|
+
):
|
|
820
|
+
return
|
|
821
|
+
trace_id, span_id = self._trace_ids()
|
|
822
|
+
REQUEST_LOGGER.info(
|
|
823
|
+
self._json(
|
|
824
|
+
context.as_log_record(
|
|
825
|
+
trace_id=trace_id,
|
|
826
|
+
span_id=span_id,
|
|
827
|
+
)
|
|
828
|
+
)
|
|
829
|
+
)
|
|
830
|
+
except BaseException as exc:
|
|
831
|
+
self.log_observability_failure(
|
|
832
|
+
"request.emit_request_log",
|
|
833
|
+
exc,
|
|
834
|
+
request_id=getattr(context, "request_id", None),
|
|
835
|
+
)
|
|
836
|
+
|
|
837
|
+
def emit_operation_log(
    self,
    operation: OperationObservabilityContext,
) -> None:
    """Write the structured operation log line for ``operation`` at most once.

    Does nothing when the runtime is disabled or the operation was already
    emitted. Any failure while building or writing the record is reported
    through ``log_observability_failure`` and never propagates.
    """
    if not self.enabled:
        return
    try:
        if operation.emitted:
            return
        operation.emitted = True
        trace_id, span_id = self._trace_ids()
        OPERATION_LOGGER.info(
            self._json(
                operation.as_log_record(
                    trace_id=trace_id,
                    span_id=span_id,
                )
            )
        )
    except BaseException as exc:
        # The extra field must not be named ``operation``: that is the first
        # positional parameter of ``log_observability_failure``, and passing
        # it again as a keyword raises TypeError from inside this handler.
        self.log_observability_failure(
            "operation.emit_log",
            exc,
            operation_name=getattr(operation, "name", None),
        )
|
|
862
|
+
|
|
863
|
+
def record_operation_metric(
    self,
    duration_seconds: float,
    attributes: dict[str, str],
) -> None:
    """Record one finished operation: a duration sample, then a count of one.

    Instrument failures are logged internally and never propagate.
    """
    try:
        self.operation_duration.record(duration_seconds, attributes)
        self.operations.add(1, attributes)
    except BaseException as failure:
        self.log_observability_failure("metrics.record_operation", failure)
|
|
873
|
+
|
|
874
|
+
def record_request_metric(
    self,
    duration_seconds: float,
    attributes: dict[str, str],
) -> None:
    """Record one finished HTTP request: a duration sample, then a count of one.

    Instrument failures are logged internally and never propagate.
    """
    try:
        self.http_duration.record(duration_seconds, attributes)
        self.requests.add(1, attributes)
    except BaseException as failure:
        self.log_observability_failure("metrics.record_request", failure)
|
|
884
|
+
|
|
885
|
+
def record_segment_metric(
    self,
    segment: str,
    duration_seconds: float,
    attributes: dict[str, str],
    *,
    backend_segment: bool = False,
) -> None:
    """Record the duration of one named segment.

    The segment histogram always receives the sample, tagged with the
    segment name. The ``"calculation"`` segment is also recorded on the
    calculate histogram with the untagged attributes, and backend segments
    are also recorded on the backend histogram with the tagged attributes.
    Failures are logged internally.
    """
    try:
        tagged = dict(attributes)
        tagged["segment"] = segment
        self.segment_duration.record(duration_seconds, tagged)
        if segment == "calculation":
            self.calculate_duration.record(duration_seconds, attributes)
        if backend_segment:
            self.backend_duration.record(duration_seconds, tagged)
    except BaseException as failure:
        self.log_observability_failure(
            "metrics.record_segment",
            failure,
            segment=segment,
        )
|
|
909
|
+
|
|
910
|
+
def record_error_metric(self, attributes: dict[str, str]) -> None:
    """Increment the error counter by one; failures are logged internally."""
    try:
        self.errors.add(1, attributes)
    except BaseException as failure:
        self.log_observability_failure("metrics.record_error", failure)
|
|
915
|
+
|
|
916
|
+
def record_rate_limited_metric(self, attributes: dict[str, str]) -> None:
    """Increment the rate-limited counter by one; failures are logged internally."""
    try:
        self.rate_limited.add(1, attributes)
    except BaseException as failure:
        self.log_observability_failure("metrics.record_rate_limited", failure)
|
|
921
|
+
|
|
922
|
+
def record_failover_event_metric(self, attributes: dict[str, str]) -> None:
    """Increment the failover-event counter by one; failures are logged internally."""
    try:
        self.failover_events.add(1, attributes)
    except BaseException as failure:
        self.log_observability_failure("metrics.record_failover_event", failure)
|
|
930
|
+
|
|
931
|
+
def record_active_request(
    self,
    delta: int,
    attributes: dict[str, str],
) -> None:
    """Adjust the active-request counter by ``delta``.

    Failures raised by the instrument are logged internally.
    """
    try:
        self.active_requests.add(delta, attributes)
    except BaseException as failure:
        self.log_observability_failure("metrics.add_active_request", failure)
|
|
940
|
+
|
|
941
|
+
def instrument_fastapi(self, app: Any) -> None:
    """Attach OpenTelemetry's FastAPI auto-instrumentation to ``app``.

    Skipped when the runtime or OpenTelemetry support is disabled. Import
    or instrumentation failures are logged internally.
    """
    if not self.enabled:
        return
    if not self.config.otel_enabled:
        return
    try:
        from opentelemetry.instrumentation.fastapi import (
            FastAPIInstrumentor as instrumentor,
        )

        instrumentor.instrument_app(app)
    except BaseException as failure:
        self.log_observability_failure("fastapi.auto_instrument", failure)
|
|
955
|
+
|
|
956
|
+
def instrument_httpx(self) -> None:
    """Enable OpenTelemetry's httpx client auto-instrumentation once.

    Skipped when the runtime or OpenTelemetry support is disabled, or when
    this runtime already instrumented httpx. Failures are logged internally
    and leave the instrumented flag unset.
    """
    if not self.enabled:
        return
    if not self.config.otel_enabled:
        return
    if self._httpx_instrumented:
        return
    try:
        from opentelemetry.instrumentation.httpx import (
            HTTPXClientInstrumentor as instrumentor,
        )

        instrumentor().instrument()
        self._httpx_instrumented = True
    except BaseException as failure:
        self.log_observability_failure("httpx.auto_instrument", failure)
|
|
972
|
+
|
|
973
|
+
def shutdown(self) -> None:
    """Shut down the configured trace and metric providers, bounded by a timeout.

    The providers are shut down in order (trace, then metrics) on a daemon
    thread that is joined for ``config.shutdown_timeout_seconds``. A failure
    in one provider is logged and does not stop the next one; a join that
    times out is logged as ``otel.shutdown_timeout``.
    """
    candidates = (
        ("trace", self.tracer_provider),
        ("metrics", self.meter_provider),
    )
    active = [
        (label, provider)
        for label, provider in candidates
        if provider is not None
    ]
    if not active:
        return

    def flush() -> None:
        for label, provider in active:
            try:
                provider.shutdown()
            except BaseException as failure:
                self.log_observability_failure(
                    f"otel.{label}_shutdown",
                    failure,
                )

    worker = threading.Thread(
        target=flush,
        name="policyengine-otel-shutdown",
        daemon=True,
    )
    worker.start()
    worker.join(timeout=self.config.shutdown_timeout_seconds)
    if not worker.is_alive():
        return
    self.log_observability_failure(
        "otel.shutdown_timeout",
        TimeoutError("OpenTelemetry shutdown timed out."),
        timeout_seconds=self.config.shutdown_timeout_seconds,
    )
|
|
1009
|
+
|
|
1010
|
+
def shutdown_tracing(self) -> None:
    """Alias that delegates to :meth:`shutdown`."""
    self.shutdown()
|
|
1012
|
+
|
|
1013
|
+
def log_observability_failure(
|
|
1014
|
+
self,
|
|
1015
|
+
operation: str,
|
|
1016
|
+
exc: BaseException,
|
|
1017
|
+
**fields: Any,
|
|
1018
|
+
) -> None:
|
|
1019
|
+
payload = {
|
|
1020
|
+
"schema_version": "policyengine.observability.internal_error.v1",
|
|
1021
|
+
"event": "observability_internal_error",
|
|
1022
|
+
"created_at": datetime.now(UTC).isoformat(),
|
|
1023
|
+
"operation": operation,
|
|
1024
|
+
"error": {
|
|
1025
|
+
"type": type(exc).__name__,
|
|
1026
|
+
"message": self._safe_str(exc),
|
|
1027
|
+
"stack": self._safe_traceback(exc),
|
|
1028
|
+
},
|
|
1029
|
+
}
|
|
1030
|
+
payload.update(
|
|
1031
|
+
{key: value for key, value in fields.items() if value is not None}
|
|
1032
|
+
)
|
|
1033
|
+
try:
|
|
1034
|
+
INTERNAL_LOGGER.error(self._json(payload))
|
|
1035
|
+
except BaseException:
|
|
1036
|
+
self._write_stderr(payload)
|
|
1037
|
+
|
|
1038
|
+
def _configure_loggers(self) -> None:
    """Apply the plain-logger setup and configured level to each package logger."""
    level = self.config.log_level
    package_loggers = (
        REQUEST_LOGGER,
        OPERATION_LOGGER,
        EVENT_LOGGER,
        INTERNAL_LOGGER,
    )
    for package_logger in package_loggers:
        configure_plain_logger(package_logger, level)
|
|
1046
|
+
|
|
1047
|
+
def _configure_otel(self) -> None:
    """Create the OpenTelemetry providers, tracer, meter and instruments.

    Import failures are logged as ``otel.configure_imports`` and abort the
    setup. Any later failure is logged as ``otel.configure``; failures to
    register a provider globally are logged separately and do not abort.
    """
    # NOTE(review): nesting below was reconstructed from a flattened listing;
    # confirm the metric reader is meant to be gated on ``otlp_endpoint``.
    try:
        # Imported lazily so a missing OpenTelemetry install is logged
        # instead of raising from this method.
        from opentelemetry import metrics, propagate, trace
        from opentelemetry.sdk.metrics import MeterProvider
        from opentelemetry.sdk.resources import (
            DEPLOYMENT_ENVIRONMENT,
            SERVICE_NAME,
            Resource,
        )
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.trace import SpanKind, Status, StatusCode
    except BaseException as exc:
        self.log_observability_failure("otel.configure_imports", exc)
        return

    try:
        # Resource attributes shared by the tracer and meter providers.
        resource = Resource.create(
            {
                SERVICE_NAME: self.config.service_name,
                DEPLOYMENT_ENVIRONMENT: self.config.environment,
                "service.role": self.config.service_role,
            }
        )
        tracer_provider = TracerProvider(resource=resource)
        metric_readers = []
        # Exporters are only attached when an OTLP endpoint is configured.
        if self.config.otlp_endpoint:
            self._add_trace_exporter(tracer_provider)
            metric_reader = self._metric_reader()
            if metric_reader is not None:
                metric_readers.append(metric_reader)
        # Stored before global registration so shutdown() can reach it even
        # if registration fails.
        self.tracer_provider = tracer_provider
        try:
            trace.set_tracer_provider(tracer_provider)
        except BaseException as exc:
            self.log_observability_failure(
                "otel.set_tracer_provider",
                exc,
            )
        try:
            self.meter_provider = MeterProvider(
                resource=resource,
                metric_readers=metric_readers,
            )
            metrics.set_meter_provider(self.meter_provider)
        except BaseException as exc:
            self.log_observability_failure(
                "otel.set_meter_provider",
                exc,
            )
        # Keep references to the lazily imported API objects for the other
        # methods of this runtime.
        self.trace = trace
        self.propagate = propagate
        self.SpanKind = SpanKind
        self.Status = Status
        self.StatusCode = StatusCode
        # Tracer and meter names fall back to the service name.
        tracer_name = self.config.tracer_name or self.config.service_name
        meter_name = self.config.meter_name or self.config.service_name
        self.tracer = trace.get_tracer(tracer_name)
        self.meter = metrics.get_meter(meter_name)
        self._configure_instruments()
    except BaseException as exc:
        self.log_observability_failure("otel.configure", exc)
|
|
1108
|
+
|
|
1109
|
+
def _add_trace_exporter(self, tracer_provider) -> None:
    """Attach a batching OTLP span exporter to ``tracer_provider``.

    The HTTP exporter is used when ``config.otlp_protocol`` starts with
    ``"http"``, otherwise the gRPC exporter. Failures are logged internally.
    """
    try:
        use_http = self.config.otlp_protocol.startswith("http")
        if use_http:
            from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
                OTLPSpanExporter as exporter_cls,
            )
        else:
            from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
                OTLPSpanExporter as exporter_cls,
            )
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        processor = BatchSpanProcessor(exporter_cls())
        tracer_provider.add_span_processor(processor)
    except BaseException as failure:
        self.log_observability_failure("otel.trace_exporter", failure)
|
|
1126
|
+
|
|
1127
|
+
def _metric_reader(self):
    """Build a periodic OTLP metric reader, or return ``None`` on failure.

    The HTTP exporter is used when ``config.otlp_protocol`` starts with
    ``"http"``, otherwise the gRPC exporter. Failures are logged internally.
    """
    try:
        use_http = self.config.otlp_protocol.startswith("http")
        if use_http:
            from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
                OTLPMetricExporter as exporter_cls,
            )
        else:
            from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
                OTLPMetricExporter as exporter_cls,
            )
        from opentelemetry.sdk.metrics.export import (
            PeriodicExportingMetricReader,
        )

        reader = PeriodicExportingMetricReader(exporter_cls())
    except BaseException as failure:
        self.log_observability_failure("otel.metric_exporter", failure)
        return None
    return reader
|
|
1145
|
+
|
|
1146
|
+
def _configure_instruments(self) -> None:
    """Create every metric instrument and store it on the runtime.

    Each row names the attribute to set, the meter factory method to use,
    the metric name, and the keyword options for the factory. Creation goes
    through ``_instrument`` for each row, in the order listed.
    """
    specs = (
        (
            "operation_duration",
            "create_histogram",
            "policyengine.operation.duration",
            {"unit": "s", "description": "PolicyEngine operation duration."},
        ),
        (
            "http_duration",
            "create_histogram",
            "http.server.request.duration",
            {"unit": "s", "description": "HTTP server request duration."},
        ),
        (
            "segment_duration",
            "create_histogram",
            "policyengine.segment.duration",
            {
                "unit": "s",
                "description": "PolicyEngine operation segment duration.",
            },
        ),
        (
            "calculate_duration",
            "create_histogram",
            "policyengine.calculate.duration",
            {
                "unit": "s",
                "description": "PolicyEngine calculate operation duration.",
            },
        ),
        (
            "backend_duration",
            "create_histogram",
            "policyengine.backend.duration",
            {"unit": "s", "description": "PolicyEngine backend call duration."},
        ),
        (
            "operations",
            "create_counter",
            "policyengine.operations",
            {"description": "PolicyEngine operation count."},
        ),
        (
            "requests",
            "create_counter",
            "policyengine.requests",
            {"description": "PolicyEngine request count."},
        ),
        (
            "errors",
            "create_counter",
            "policyengine.errors",
            {"description": "PolicyEngine error count."},
        ),
        (
            "rate_limited",
            "create_counter",
            "policyengine.rate_limited_requests",
            {"description": "PolicyEngine rate-limited request count."},
        ),
        (
            "failover_events",
            "create_counter",
            "policyengine.failover.events",
            {"description": "PolicyEngine failover event count."},
        ),
        (
            "active_requests",
            "create_up_down_counter",
            "http.server.active_requests",
            {"description": "Active HTTP server requests."},
        ),
    )
    for attribute, factory_name, metric_name, options in specs:
        factory = getattr(self.meter, factory_name, None)
        setattr(self, attribute, self._instrument(factory, metric_name, **options))
|
|
1207
|
+
|
|
1208
|
+
def _instrument(self, factory, *args, **kwargs):
    """Create an instrument via ``factory``, falling back to a no-op.

    A ``None`` factory or a factory that raises yields ``_NoOpInstrument()``;
    the raising case is also logged with the first positional argument as
    the instrument name.
    """
    if factory is None:
        return _NoOpInstrument()
    try:
        created = factory(*args, **kwargs)
    except BaseException as failure:
        instrument_name = args[0] if args else None
        self.log_observability_failure(
            "metrics.create_instrument",
            failure,
            instrument=instrument_name,
        )
        return _NoOpInstrument()
    return created
|
|
1220
|
+
|
|
1221
|
+
def _start_request_span(
    self,
    context: RequestObservabilityContext,
    *,
    carrier: Any = None,
) -> None:
    """Open the server span for a request and store it on ``context``.

    Does nothing without a tracer. The span is named after the request
    route and parented on whatever context is extracted from ``carrier``.
    On failure both span fields on the context are reset to ``None`` and
    the failure is logged internally.
    """
    if self.tracer is None:
        return
    span_attributes = context.span_attributes()
    parent = self._extract_context(carrier)
    try:
        if self.SpanKind:
            span_kind = self.SpanKind.SERVER
        else:
            span_kind = None
        context.server_span_cm = self.tracer.start_as_current_span(
            context.route,
            context=parent,
            kind=span_kind,
            attributes=span_attributes,
        )
        # Entered manually; the matching exit happens when the span is closed.
        context.server_span = context.server_span_cm.__enter__()
    except BaseException as failure:
        context.server_span_cm = None
        context.server_span = None
        self.log_observability_failure("otel.request_span_enter", failure)
|
|
1243
|
+
|
|
1244
|
+
def _close_request_span(
    self,
    context: RequestObservabilityContext,
    exc: BaseException | None,
) -> None:
    """Exit the request's server span exactly once.

    The context is marked closed before the exit is attempted. When ``exc``
    is given, its type, value and traceback are passed to the span's
    ``__exit__``. Failures while exiting are logged internally.
    """
    if context.span_closed:
        return
    context.span_closed = True
    server_cm = context.server_span_cm
    if server_cm is None:
        return
    try:
        if exc is not None:
            exit_args = (type(exc), exc, exc.__traceback__)
        else:
            exit_args = (None, None, None)
        server_cm.__exit__(*exit_args)
    except BaseException as failure:
        self.log_observability_failure(
            "otel.request_span_exit",
            failure,
            request_id=context.request_id,
        )
|
|
1266
|
+
|
|
1267
|
+
@contextmanager
def _safe_span(self, name: str, attrs: dict[str, Any]) -> Iterator[Any]:
    """Context manager that runs its body inside a best-effort span.

    Yields the span, or ``None`` when there is no tracer or the span could
    not be started. An exception from the body is handed to ``_end_span``
    and then re-raised unchanged; failures while ending the span are logged
    and never replace the body's outcome.
    """
    handle = None if self.tracer is None else self._start_span(name, attrs)
    if handle is None:
        yield None
        return
    _, active_span = handle
    try:
        yield active_span
    except BaseException as body_error:
        try:
            self._end_span(handle, body_error)
        except BaseException as failure:
            self.log_observability_failure(
                "otel.span_exit",
                failure,
                span=name,
            )
        # Always surface the caller's own exception.
        raise
    else:
        try:
            self._end_span(handle)
        except BaseException as failure:
            self.log_observability_failure(
                "otel.span_exit",
                failure,
                span=name,
            )
|
|
1298
|
+
|
|
1299
|
+
def _start_span(self, name: str, attrs: dict[str, Any]):
    """Start and enter a span, returning ``(context_manager, span)``.

    Returns ``None`` when the span cannot be started. Attributes with a
    ``None`` value are skipped; a failure while setting attributes is
    logged but the entered span is still returned.
    """
    try:
        manager = self.tracer.start_as_current_span(name)
        entered = manager.__enter__()
    except BaseException as failure:
        self.log_observability_failure("otel.span_enter", failure, span=name)
        return None
    try:
        for attr_name, attr_value in attrs.items():
            if attr_value is None:
                continue
            entered.set_attribute(attr_name, attr_value)
    except BaseException as failure:
        self.log_observability_failure(
            "otel.span_attributes",
            failure,
            span=name,
        )
    return manager, entered
|
|
1317
|
+
|
|
1318
|
+
def _end_span(
|
|
1319
|
+
self,
|
|
1320
|
+
span_handle,
|
|
1321
|
+
error: BaseException | None = None,
|
|
1322
|
+
) -> None:
|
|
1323
|
+
if span_handle is None:
|
|
1324
|
+
return
|
|
1325
|
+
span_cm, span = span_handle
|
|
1326
|
+
try:
|
|
1327
|
+
if error is not None:
|
|
1328
|
+
self._record_exception_on_span(
|
|
1329
|
+
span,
|
|
1330
|
+
error,
|
|
1331
|
+
handled=False,
|
|
1332
|
+
status_code=500,
|
|
1333
|
+
)
|
|
1334
|
+
except BaseException as exc:
|
|
1335
|
+
self.log_observability_failure("otel.span_error_status", exc)
|
|
1336
|
+
try:
|
|
1337
|
+
span_cm.__exit__(None, None, None)
|
|
1338
|
+
except BaseException as exc:
|
|
1339
|
+
self.log_observability_failure("otel.span_exit", exc)
|
|
1340
|
+
|
|
1341
|
+
def _record_segment_safely(
|
|
1342
|
+
self,
|
|
1343
|
+
name: str,
|
|
1344
|
+
start: float | None,
|
|
1345
|
+
attrs: dict[str, Any],
|
|
1346
|
+
) -> None:
|
|
1347
|
+
if start is None:
|
|
1348
|
+
return
|
|
1349
|
+
end = self._safe_perf_counter(f"segment.{name}.end")
|
|
1350
|
+
if end is None:
|
|
1351
|
+
return
|
|
1352
|
+
try:
|
|
1353
|
+
duration = end - start
|
|
1354
|
+
self._record_timing(name, duration)
|
|
1355
|
+
context = self.current_context()
|
|
1356
|
+
operation = self.current_operation()
|
|
1357
|
+
metric_extra = {
|
|
1358
|
+
key: value
|
|
1359
|
+
for key, value in attrs.items()
|
|
1360
|
+
if (
|
|
1361
|
+
key in self.config.metric_attribute_keys
|
|
1362
|
+
and value is not None
|
|
1363
|
+
)
|
|
1364
|
+
}
|
|
1365
|
+
if context is not None:
|
|
1366
|
+
context.timings_ms[name] = round(duration * 1000, 3)
|
|
1367
|
+
if operation is not None:
|
|
1368
|
+
operation.timings_ms[name] = round(duration * 1000, 3)
|
|
1369
|
+
metric_attributes = operation.metric_attributes(
|
|
1370
|
+
segment=name,
|
|
1371
|
+
**metric_extra,
|
|
1372
|
+
)
|
|
1373
|
+
elif context is not None:
|
|
1374
|
+
metric_attributes = context.metric_attributes(
|
|
1375
|
+
segment=name,
|
|
1376
|
+
**metric_extra,
|
|
1377
|
+
)
|
|
1378
|
+
else:
|
|
1379
|
+
metric_attributes = _metric_attrs(
|
|
1380
|
+
{
|
|
1381
|
+
"service.name": self.config.service_name,
|
|
1382
|
+
"service.role": self.config.service_role,
|
|
1383
|
+
"deployment.environment": self.config.environment,
|
|
1384
|
+
"segment": name,
|
|
1385
|
+
**metric_extra,
|
|
1386
|
+
},
|
|
1387
|
+
self.config.metric_attribute_keys,
|
|
1388
|
+
)
|
|
1389
|
+
self.record_segment_metric(
|
|
1390
|
+
name,
|
|
1391
|
+
duration,
|
|
1392
|
+
metric_attributes,
|
|
1393
|
+
backend_segment="backend" in metric_extra,
|
|
1394
|
+
)
|
|
1395
|
+
except BaseException as exc:
|
|
1396
|
+
self.log_observability_failure(
|
|
1397
|
+
"request.record_segment",
|
|
1398
|
+
exc,
|
|
1399
|
+
segment=name,
|
|
1400
|
+
)
|
|
1401
|
+
|
|
1402
|
+
def _record_timing(self, name: str, duration_seconds: float) -> None:
    """Add a segment's duration to the scoped timings mapping, if one is set.

    The value is accumulated under ``"<name>_ms"`` in milliseconds and the
    running total is rounded to one decimal place. Failures are logged
    internally.
    """
    try:
        scoped = _TIMINGS.get()
        if scoped is None:
            return
        slot = f"{name}_ms"
        elapsed_ms = duration_seconds * 1000.0
        previous_ms = scoped.get(slot, 0.0)
        scoped[slot] = round(previous_ms + elapsed_ms, 1)
    except BaseException as failure:
        self.log_observability_failure(
            "scope.record_timing",
            failure,
            segment=name,
        )
|
|
1416
|
+
|
|
1417
|
+
def _segment_span_attributes(
    self,
    attrs: dict[str, Any],
) -> dict[str, Any]:
    """Merge segment attributes over the ambient span attributes.

    ``None``-valued entries of ``attrs`` are dropped. The base attributes
    come from the current request context if present, otherwise from the
    current operation; explicit segment attributes win on key conflicts.
    """
    request_ctx = self.current_context()
    operation_ctx = self.current_operation()
    explicit = {
        attr_key: attr_value
        for attr_key, attr_value in attrs.items()
        if attr_value is not None
    }
    source = request_ctx if request_ctx is not None else operation_ctx
    if source is None:
        return explicit
    return {**source.span_attributes(), **explicit}
|
|
1431
|
+
|
|
1432
|
+
def _span_name(self, segment_name: str) -> str:
    """Return the span name for a segment, prefixed when a prefix is configured."""
    prefix = self.config.span_prefix
    return f"{prefix}.{segment_name}" if prefix else segment_name
|
|
1436
|
+
|
|
1437
|
+
def _start_implicit_operation(
|
|
1438
|
+
self,
|
|
1439
|
+
segment_name: str,
|
|
1440
|
+
attrs: dict[str, Any],
|
|
1441
|
+
) -> dict[str, Any] | None:
|
|
1442
|
+
if (
|
|
1443
|
+
self.current_operation() is not None
|
|
1444
|
+
or self.current_context() is not None
|
|
1445
|
+
):
|
|
1446
|
+
return None
|
|
1447
|
+
operation_name = attrs.get("operation") or segment_name
|
|
1448
|
+
flavor = attrs.get("flavor")
|
|
1449
|
+
operation_attrs = {
|
|
1450
|
+
key: value
|
|
1451
|
+
for key, value in attrs.items()
|
|
1452
|
+
if key not in {"operation", "flavor"} and value is not None
|
|
1453
|
+
}
|
|
1454
|
+
return self.start_operation(
|
|
1455
|
+
self._safe_str(operation_name),
|
|
1456
|
+
flavor=self._safe_str(flavor) if flavor is not None else None,
|
|
1457
|
+
**operation_attrs,
|
|
1458
|
+
)
|
|
1459
|
+
|
|
1460
|
+
def _coerce_segment(self, name: Any) -> str:
    """Coerce ``name`` to a segment string, logging unregistered segments.

    The coerced segment is returned either way; an unregistered one is
    reported through ``log_observability_failure`` with the original type.
    """
    coerced, known = coerce_segment_name(
        name,
        registry=self.segment_registry,
    )
    if known:
        return coerced
    self.log_observability_failure(
        "segment.coerce",
        ValueError("Unregistered observability segment."),
        segment=coerced,
        segment_type=type(name).__name__,
    )
    return coerced
|
|
1473
|
+
|
|
1474
|
+
def _set_current_span_attributes(self, attrs: dict[str, Any]) -> None:
    """Set every non-``None`` attribute on the current span, if there is one.

    A failure while setting attributes is logged internally and stops the
    remaining attributes from being set.
    """
    active_span = self._current_span()
    if active_span is None:
        return
    try:
        for attr_name, attr_value in attrs.items():
            if attr_value is None:
                continue
            active_span.set_attribute(attr_name, attr_value)
    except BaseException as failure:
        self.log_observability_failure("otel.set_span_attributes", failure)
|
|
1484
|
+
|
|
1485
|
+
def _current_span(self):
    """Return the current span, or ``None`` if unavailable.

    ``None`` is returned when the trace API is not configured or when the
    lookup fails; lookup failures are logged internally.
    """
    trace_api = self.trace
    if trace_api is None:
        return None
    try:
        return trace_api.get_current_span()
    except BaseException as failure:
        self.log_observability_failure("otel.current_span", failure)
    return None
|
|
1493
|
+
|
|
1494
|
+
def _trace_ids(self) -> tuple[str | None, str | None]:
|
|
1495
|
+
span = self._current_span()
|
|
1496
|
+
if span is None:
|
|
1497
|
+
return None, None
|
|
1498
|
+
try:
|
|
1499
|
+
context = span.get_span_context()
|
|
1500
|
+
except BaseException as exc:
|
|
1501
|
+
self.log_observability_failure("otel.span_context", exc)
|
|
1502
|
+
return None, None
|
|
1503
|
+
if not getattr(context, "is_valid", False):
|
|
1504
|
+
return None, None
|
|
1505
|
+
return f"{context.trace_id:032x}", f"{context.span_id:016x}"
|
|
1506
|
+
|
|
1507
|
+
def _extract_context(self, carrier: Any):
    """Extract a parent trace context from ``carrier``.

    Returns ``None`` when no propagator or carrier is available, or when
    extraction fails (the failure is logged internally).
    """
    if self.propagate is None:
        return None
    if carrier is None:
        return None
    try:
        extracted = self.propagate.extract(carrier)
    except BaseException as failure:
        self.log_observability_failure("otel.extract_context", failure)
        return None
    return extracted
|
|
1515
|
+
|
|
1516
|
+
def _record_exception_on_span(
|
|
1517
|
+
self,
|
|
1518
|
+
span,
|
|
1519
|
+
exc: BaseException,
|
|
1520
|
+
*,
|
|
1521
|
+
handled: bool,
|
|
1522
|
+
status_code: int | None,
|
|
1523
|
+
) -> None:
|
|
1524
|
+
try:
|
|
1525
|
+
span.record_exception(exc)
|
|
1526
|
+
span.set_attribute("error.type", type(exc).__name__)
|
|
1527
|
+
span.set_attribute("error.handled", handled)
|
|
1528
|
+
if (
|
|
1529
|
+
self.Status is not None
|
|
1530
|
+
and self.StatusCode is not None
|
|
1531
|
+
and (
|
|
1532
|
+
not handled
|
|
1533
|
+
or (status_code is not None and status_code >= 500)
|
|
1534
|
+
)
|
|
1535
|
+
):
|
|
1536
|
+
span.set_status(
|
|
1537
|
+
self.Status(
|
|
1538
|
+
self.StatusCode.ERROR,
|
|
1539
|
+
self._safe_str(exc),
|
|
1540
|
+
)
|
|
1541
|
+
)
|
|
1542
|
+
except BaseException as observability_exc:
|
|
1543
|
+
self.log_observability_failure(
|
|
1544
|
+
"otel.record_exception",
|
|
1545
|
+
observability_exc,
|
|
1546
|
+
original_error_type=type(exc).__name__,
|
|
1547
|
+
)
|
|
1548
|
+
|
|
1549
|
+
def _add_span_event(self, event: str, fields: dict[str, Any]) -> None:
    """Add ``event`` to the current span with its span-safe fields.

    Fields rejected by ``_is_safe_span_value`` are left out. Does nothing
    without a current span; failures are logged internally.
    """
    active_span = self._current_span()
    if active_span is None:
        return
    try:
        safe_fields = {
            field_name: field_value
            for field_name, field_value in fields.items()
            if _is_safe_span_value(field_value)
        }
        active_span.add_event(event, safe_fields)
    except BaseException as failure:
        self.log_observability_failure(
            "otel.add_event",
            failure,
            event_name=event,
        )
|
|
1568
|
+
|
|
1569
|
+
def _close_active_request(
    self,
    context: RequestObservabilityContext,
) -> None:
    """Decrement the active-request gauge for ``context`` exactly once.

    The context is flagged as closed before the decrement is recorded.
    Failures are logged internally with the request id when available.
    """
    try:
        already_closed = context.active_closed
        if already_closed:
            return
        context.active_closed = True
        self.record_active_request(-1, context.metric_attributes())
    except BaseException as failure:
        self.log_observability_failure(
            "request.close_active",
            failure,
            request_id=getattr(context, "request_id", None),
        )
|
|
1584
|
+
|
|
1585
|
+
def _reset_request_operation_context(
    self,
    context: RequestObservabilityContext,
) -> None:
    """Reset the operation context variable using the request's stored token.

    Does nothing when the request holds no token. A failed reset is logged
    internally with the request id when available.
    """
    stored_token = context.operation_token
    if stored_token is None:
        return
    try:
        _OPERATION_CONTEXT.reset(stored_token)
    except BaseException as failure:
        self.log_observability_failure(
            "request.operation_context_reset",
            failure,
            request_id=getattr(context, "request_id", None),
        )
|
|
1600
|
+
|
|
1601
|
+
def _reset_request_context(
    self,
    context: RequestObservabilityContext,
) -> None:
    """Reset the request context variable using the request's stored token.

    Does nothing when the request holds no token. A failed reset is logged
    internally with the request id when available.
    """
    stored_token = context.context_token
    if stored_token is None:
        return
    try:
        _REQUEST_CONTEXT.reset(stored_token)
    except BaseException as failure:
        self.log_observability_failure(
            "request.context_reset",
            failure,
            request_id=getattr(context, "request_id", None),
        )
|
|
1616
|
+
|
|
1617
|
+
def _safe_perf_counter(self, operation: str) -> float | None:
|
|
1618
|
+
try:
|
|
1619
|
+
return time.perf_counter()
|
|
1620
|
+
except BaseException as exc:
|
|
1621
|
+
self.log_observability_failure(operation, exc)
|
|
1622
|
+
return None
|
|
1623
|
+
|
|
1624
|
+
def _safe_str(self, value: Any) -> str:
    """Return ``str(value)``, or a placeholder naming the type if that raises."""
    try:
        text = str(value)
    except BaseException:
        return f"<unprintable {type(value).__name__}>"
    return text
|
|
1629
|
+
|
|
1630
|
+
def _safe_traceback(self, exc: BaseException) -> str:
    """Return the formatted traceback of ``exc``, or ``""`` if formatting fails."""
    try:
        rendered = traceback.format_exception(type(exc), exc, exc.__traceback__)
        return "".join(rendered)
    except BaseException:
        return ""
|
|
1637
|
+
|
|
1638
|
+
def _json(self, payload: dict[str, Any]) -> str:
|
|
1639
|
+
try:
|
|
1640
|
+
return json.dumps(payload, sort_keys=True, default=str)
|
|
1641
|
+
except BaseException:
|
|
1642
|
+
return json.dumps(
|
|
1643
|
+
{
|
|
1644
|
+
"schema_version": "policyengine.observability.internal_error.v1",
|
|
1645
|
+
"event": "observability_internal_error",
|
|
1646
|
+
"created_at": datetime.now(UTC).isoformat(),
|
|
1647
|
+
"operation": "observability.failure_json",
|
|
1648
|
+
},
|
|
1649
|
+
sort_keys=True,
|
|
1650
|
+
)
|
|
1651
|
+
|
|
1652
|
+
def _write_stderr(self, payload: dict[str, Any]) -> None:
    """Write ``payload`` as one JSON line to stderr, ignoring any failure."""
    try:
        line = self._json(payload) + "\n"
        sys.stderr.write(line)
    except BaseException:
        # Last-resort sink: there is nowhere left to report a failure.
        return
|
|
1657
|
+
|
|
1658
|
+
|
|
1659
|
+
def _is_safe_span_value(value: Any) -> bool:
    """Return whether ``value`` is a ``str``, ``bool``, ``int`` or ``float``."""
    scalar_types = (str, bool, int, float)
    return isinstance(value, scalar_types)
|
|
1661
|
+
|
|
1662
|
+
|
|
1663
|
+
class _OperationManager:
    """Operation scope usable as a sync or async context manager or a decorator."""

    def __init__(
        self,
        runtime: ObservabilityRuntime,
        name: str,
        *,
        flavor: str | None,
        attrs: dict[str, Any],
    ) -> None:
        """Store the runtime and operation description; nothing starts yet."""
        self.runtime = runtime
        self.name = name
        self.flavor = flavor
        self.attrs = attrs
        # Handle returned by ``start_operation``; set on entry.
        self.handle: dict[str, Any] | None = None

    def __enter__(self):
        """Start the operation and return the runtime's current operation."""
        runtime = self.runtime
        self.handle = runtime.start_operation(
            self.name,
            flavor=self.flavor,
            **self.attrs,
        )
        return runtime.current_operation()

    def __exit__(self, exc_type, exc, _traceback) -> bool:
        """End the operation; exceptions from the body are never suppressed."""
        self.runtime.end_operation(self.handle, exc)
        return False

    async def __aenter__(self):
        """Async entry; delegates to the synchronous ``__enter__``."""
        return self.__enter__()

    async def __aexit__(self, exc_type, exc, traceback) -> bool:
        """Async exit; delegates to the synchronous ``__exit__``."""
        return self.__exit__(exc_type, exc, traceback)

    def __call__(self, func):
        """Decorate ``func`` so each call runs inside a fresh operation scope."""
        if not inspect.iscoroutinefunction(func):

            @wraps(func)
            def wrapper(*args, **kwargs):
                scope = self.runtime.operation(
                    self.name,
                    flavor=self.flavor,
                    **self.attrs,
                )
                with scope:
                    return func(*args, **kwargs)

            return wrapper

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            scope = self.runtime.operation(
                self.name,
                flavor=self.flavor,
                **self.attrs,
            )
            async with scope:
                return await func(*args, **kwargs)

        return async_wrapper
|
|
1720
|
+
|
|
1721
|
+
|
|
1722
|
+
class _SegmentManager:
|
|
1723
|
+
def __init__(
|
|
1724
|
+
self,
|
|
1725
|
+
runtime: ObservabilityRuntime,
|
|
1726
|
+
name: Any,
|
|
1727
|
+
attrs: dict[str, Any],
|
|
1728
|
+
) -> None:
|
|
1729
|
+
self.runtime = runtime
|
|
1730
|
+
self.name = name
|
|
1731
|
+
self.attrs = attrs
|
|
1732
|
+
self.context_manager = None
|
|
1733
|
+
|
|
1734
|
+
def __enter__(self):
|
|
1735
|
+
self.context_manager = self.runtime._segment_context(
|
|
1736
|
+
self.name,
|
|
1737
|
+
**self.attrs,
|
|
1738
|
+
)
|
|
1739
|
+
return self.context_manager.__enter__()
|
|
1740
|
+
|
|
1741
|
+
def __exit__(self, exc_type, exc, traceback) -> bool:
|
|
1742
|
+
if self.context_manager is None:
|
|
1743
|
+
return False
|
|
1744
|
+
return bool(self.context_manager.__exit__(exc_type, exc, traceback))
|
|
1745
|
+
|
|
1746
|
+
async def __aenter__(self):
|
|
1747
|
+
self.context_manager = self.runtime.asegment(self.name, **self.attrs)
|
|
1748
|
+
return await self.context_manager.__aenter__()
|
|
1749
|
+
|
|
1750
|
+
async def __aexit__(self, exc_type, exc, traceback) -> bool:
|
|
1751
|
+
if self.context_manager is None:
|
|
1752
|
+
return False
|
|
1753
|
+
return bool(
|
|
1754
|
+
await self.context_manager.__aexit__(exc_type, exc, traceback)
|
|
1755
|
+
)
|
|
1756
|
+
|
|
1757
|
+
def __call__(self, func):
|
|
1758
|
+
if inspect.iscoroutinefunction(func):
|
|
1759
|
+
|
|
1760
|
+
@wraps(func)
|
|
1761
|
+
async def async_wrapper(*args, **kwargs):
|
|
1762
|
+
async with self.runtime.segment(self.name, **self.attrs):
|
|
1763
|
+
return await func(*args, **kwargs)
|
|
1764
|
+
|
|
1765
|
+
return async_wrapper
|
|
1766
|
+
|
|
1767
|
+
@wraps(func)
|
|
1768
|
+
def wrapper(*args, **kwargs):
|
|
1769
|
+
with self.runtime.segment(self.name, **self.attrs):
|
|
1770
|
+
return func(*args, **kwargs)
|
|
1771
|
+
|
|
1772
|
+
return wrapper
|
|
1773
|
+
|
|
1774
|
+
|
|
1775
|
+
# Module-level runtime instance, created at import time from a
# default-constructed ObservabilityConfig. set_observability_runtime()
# rebinds this name and observability_runtime() returns it.
_RUNTIME = ObservabilityRuntime(ObservabilityConfig())
|
|
1776
|
+
|
|
1777
|
+
|
|
1778
|
+
def set_observability_runtime(runtime: ObservabilityRuntime) -> None:
    """Replace the module-level runtime with ``runtime``.

    Rebinds the module global ``_RUNTIME``; the previous object is simply
    dropped from that name, nothing is called on it.
    """
    global _RUNTIME
    _RUNTIME = runtime
|
|
1781
|
+
|
|
1782
|
+
|
|
1783
|
+
def observability_runtime() -> ObservabilityRuntime:
    """Return the object currently bound to the module global ``_RUNTIME``.

    The global is read on every call, so a rebinding of ``_RUNTIME`` is
    visible to the next call.
    """
    return _RUNTIME
|