splunk-otel-util-genai 0.1.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. opentelemetry/util/genai/__init__.py +17 -0
  2. opentelemetry/util/genai/_fsspec_upload/__init__.py +39 -0
  3. opentelemetry/util/genai/_fsspec_upload/fsspec_hook.py +184 -0
  4. opentelemetry/util/genai/attributes.py +60 -0
  5. opentelemetry/util/genai/callbacks.py +24 -0
  6. opentelemetry/util/genai/config.py +184 -0
  7. opentelemetry/util/genai/debug.py +183 -0
  8. opentelemetry/util/genai/emitters/__init__.py +25 -0
  9. opentelemetry/util/genai/emitters/composite.py +186 -0
  10. opentelemetry/util/genai/emitters/configuration.py +324 -0
  11. opentelemetry/util/genai/emitters/content_events.py +153 -0
  12. opentelemetry/util/genai/emitters/evaluation.py +519 -0
  13. opentelemetry/util/genai/emitters/metrics.py +308 -0
  14. opentelemetry/util/genai/emitters/span.py +774 -0
  15. opentelemetry/util/genai/emitters/spec.py +48 -0
  16. opentelemetry/util/genai/emitters/utils.py +961 -0
  17. opentelemetry/util/genai/environment_variables.py +200 -0
  18. opentelemetry/util/genai/handler.py +1002 -0
  19. opentelemetry/util/genai/instruments.py +44 -0
  20. opentelemetry/util/genai/interfaces.py +58 -0
  21. opentelemetry/util/genai/plugins.py +114 -0
  22. opentelemetry/util/genai/span_context.py +80 -0
  23. opentelemetry/util/genai/types.py +440 -0
  24. opentelemetry/util/genai/upload_hook.py +119 -0
  25. opentelemetry/util/genai/utils.py +182 -0
  26. opentelemetry/util/genai/version.py +15 -0
  27. splunk_otel_util_genai-0.1.3.dist-info/METADATA +70 -0
  28. splunk_otel_util_genai-0.1.3.dist-info/RECORD +31 -0
  29. splunk_otel_util_genai-0.1.3.dist-info/WHEEL +4 -0
  30. splunk_otel_util_genai-0.1.3.dist-info/entry_points.txt +5 -0
  31. splunk_otel_util_genai-0.1.3.dist-info/licenses/LICENSE +201 -0
opentelemetry/util/genai/emitters/evaluation.py
@@ -0,0 +1,519 @@
+"""Emitters responsible for emitting telemetry derived from evaluation results."""
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import Any, Dict, Optional, Sequence
+
+from opentelemetry._logs import Logger, get_logger
+from opentelemetry.semconv._incubating.attributes import (
+    error_attributes as ErrorAttributes,
+)
+
+from ..attributes import (
+    GEN_AI_EVALUATION_ATTRIBUTES_PREFIX,
+    GEN_AI_EVALUATION_EXPLANATION,
+    GEN_AI_EVALUATION_NAME,
+    GEN_AI_EVALUATION_SCORE_LABEL,
+    GEN_AI_EVALUATION_SCORE_VALUE,
+    GEN_AI_OPERATION_NAME,
+    GEN_AI_PROVIDER_NAME,
+    GEN_AI_REQUEST_MODEL,
+    GEN_AI_RESPONSE_ID,
+)
+from ..interfaces import EmitterMeta
+from ..span_context import (
+    build_otel_context,
+    extract_span_context,
+    store_span_context,
+)
+from ..types import EvaluationResult, GenAI
+from .utils import _evaluation_to_log_record
+
+
+def _get_request_model(invocation: GenAI) -> str | None:
+    return getattr(invocation, "request_model", None) or getattr(
+        invocation, "model", None
+    )
+
+
+def _get_response_id(invocation: GenAI) -> str | None:  # best-effort
+    return getattr(invocation, "response_id", None)
+
+
+class _EvaluationEmitterBase(EmitterMeta):
+    role = "evaluation"
+
+    def on_start(self, obj: Any) -> None:  # pragma: no cover - default no-op
+        return None
+
+    def on_end(self, obj: Any) -> None:  # pragma: no cover - default no-op
+        return None
+
+    def on_error(
+        self, error, obj: Any
+    ) -> None:  # pragma: no cover - default no-op
+        return None
+
+
+def _canonicalize_metric_name(raw_name: str) -> Optional[str]:
+    """Map raw evaluator metric names (possibly noisy) to canonical names.
+
+    Handles legacy / provider-specific variants and formatting differences:
+    - answer relevancy / answer_relevancy / answer relevance -> relevance
+    - faithfulness -> hallucination (legacy synonym)
+    - hallucination [geval] / hallucination_geval / hallucination-* -> hallucination
+    - direct passthrough for: hallucination, sentiment, toxicity, bias
+    Returns None if the metric is unsupported (ignored by emitters).
+    """
+    if not raw_name:
+        return None
+    lowered = raw_name.strip().lower()
+    # Fast path exact matches first
+    if lowered in {"bias", "toxicity", "sentiment", "hallucination"}:
+        return lowered
+    if lowered == "faithfulness":
+        return "hallucination"
+    # Normalize punctuation/whitespace to underscores for pattern matching
+    import re as _re  # local import to avoid global cost
+
+    normalized = _re.sub(r"[^a-z0-9]+", "_", lowered).strip("_")
+    if normalized in {"answer_relevancy", "answer_relevance", "relevance"}:
+        return "relevance"
+    if normalized.startswith("hallucination"):
+        return "hallucination"
+    if normalized.startswith("sentiment"):
+        # Allow variants like sentiment_geval, sentiment[geval], sentiment-geval
+        return "sentiment"
+    return None
+
+
+# Debug logging configuration:
+# OTEL_GENAI_EVAL_DEBUG_SKIPS=1|true|yes -> one-time logs when a measurement is skipped (already implemented)
+# OTEL_GENAI_EVAL_DEBUG_EACH=1|true|yes -> verbose log line for every evaluation result processed (attempted measurement)
+_EVAL_DEBUG_SKIPS = os.getenv("OTEL_GENAI_EVAL_DEBUG_SKIPS", "").lower() in {
+    "1",
+    "true",
+    "yes",
+    "on",
+}
+_EVAL_DEBUG_EACH = os.getenv("OTEL_GENAI_EVAL_DEBUG_EACH", "").lower() in {
+    "1",
+    "true",
+    "yes",
+    "on",
+}
+
+
+class EvaluationMetricsEmitter(_EvaluationEmitterBase):
+    """Records evaluation scores to metric-specific histograms.
+
+    Instead of a single shared histogram (gen_ai.evaluation.score), we emit to
+    gen_ai.evaluation.score.<metric_name>. This improves downstream aggregation
+    clarity at the cost of additional instruments. A callable factory provided
+    by the handler supplies (and caches) histogram instances.
+    """
+
+    role = "evaluation_metrics"
+    name = "EvaluationMetrics"
+
+    def __init__(
+        self, histogram_factory
+    ) -> None:  # callable(metric_name)->Histogram|None OR direct histogram
+        # Backward-compatible: tests may pass a histogram instance directly.
+        if hasattr(histogram_factory, "record") and not callable(  # type: ignore[arg-type]
+            getattr(histogram_factory, "__call__", None)
+        ):
+            direct_hist = histogram_factory
+
+            def _direct_factory(_name: str):  # ignore metric name, single hist
+                return direct_hist
+
+            self._hist_factory = _direct_factory
+        else:
+            self._hist_factory = histogram_factory
+
+    def on_evaluation_results(  # type: ignore[override]
+        self,
+        results: Sequence[EvaluationResult],
+        obj: Any | None = None,
+    ) -> None:
+        invocation = obj if isinstance(obj, GenAI) else None
+        if invocation is None:
+            if _EVAL_DEBUG_SKIPS:
+                logging.getLogger(__name__).debug(
+                    "EvaluationMetricsEmitter: skipping all results (no GenAI invocation provided)"
+                )
+            return
+        # Per-emitter set of (reason, key) we have already logged to avoid noise.
+        if not hasattr(self, "_logged_skip_keys"):
+            self._logged_skip_keys = set()  # type: ignore[attr-defined]
+
+        span_context = getattr(invocation, "span_context", None)
+        if (
+            span_context is None
+            and getattr(invocation, "span", None) is not None
+        ):
+            span_context = extract_span_context(invocation.span)
+            store_span_context(invocation, span_context)
+        otel_context = build_otel_context(
+            getattr(invocation, "span", None),
+            span_context,
+        )
+
+        def _log_skip(
+            reason: str,
+            metric_raw: Any,
+            extra: Optional[Dict[str, Any]] = None,
+        ):
+            if not _EVAL_DEBUG_SKIPS:
+                return
+            key = (reason, str(metric_raw))
+            try:
+                if key in self._logged_skip_keys:  # type: ignore[attr-defined]
+                    return
+                self._logged_skip_keys.add(key)  # type: ignore[attr-defined]
+            except Exception:  # pragma: no cover - defensive
+                pass
+            msg = f"EvaluationMetricsEmitter: skipped metric '{metric_raw}' reason={reason}"
+            if extra:
+                try:
+                    msg += " " + " ".join(
+                        f"{k}={v!r}" for k, v in extra.items() if v is not None
+                    )
+                except Exception:  # pragma: no cover - defensive
+                    pass
+            logging.getLogger(__name__).debug(msg)
+
+        for res in results:
+            canonical = _canonicalize_metric_name(
+                getattr(res, "metric_name", "") or ""
+            )
+            raw_name = getattr(res, "metric_name", None)
+            if _EVAL_DEBUG_EACH:
+                logging.getLogger(__name__).debug(
+                    "EvaluationMetricsEmitter: processing metric raw=%r canonical=%r score=%r type=%s label=%r",
+                    raw_name,
+                    canonical,
+                    getattr(res, "score", None),
+                    type(getattr(res, "score", None)).__name__,
+                    getattr(res, "label", None),
+                )
+            if canonical is None:
+                _log_skip("unsupported_metric_name", raw_name)
+                continue
+            if not isinstance(res.score, (int, float)):
+                _log_skip(
+                    "non_numeric_score",
+                    raw_name,
+                    {
+                        "score_type": type(res.score).__name__,
+                        "score_value": getattr(res, "score", None),
+                    },
+                )
+                continue
+            try:
+                histogram = (
+                    self._hist_factory(canonical)
+                    if self._hist_factory
+                    else None
+                )  # type: ignore[attr-defined]
+            except Exception as exc:  # pragma: no cover - defensive
+                histogram = None
+                _log_skip(
+                    "histogram_factory_error", raw_name, {"error": repr(exc)}
+                )
+            if histogram is None:
+                # Log once per metric name if histogram factory did not provide an instrument.
+                try:
+                    _once_key = f"_genai_eval_hist_missing_{canonical}"
+                    if not getattr(self, _once_key, False):
+                        logging.getLogger(__name__).debug(
+                            "EvaluationMetricsEmitter: no histogram for canonical metric '%s' (factory returned None)",
+                            canonical,
+                        )
+                        setattr(self, _once_key, True)
+                except Exception:
+                    pass
+                _log_skip(
+                    "no_histogram_instrument",
+                    raw_name,
+                    {"canonical": canonical},
+                )
+                continue
+            elif _EVAL_DEBUG_EACH:
+                logging.getLogger(__name__).debug(
+                    "EvaluationMetricsEmitter: recording metric canonical=%r score=%r instrument=%s",
+                    canonical,
+                    getattr(res, "score", None),
+                    type(histogram).__name__,
+                )
+            attrs: Dict[str, Any] = {
+                GEN_AI_OPERATION_NAME: "evaluation",
+                GEN_AI_EVALUATION_NAME: canonical,
+            }
+            # If the source invocation carried agent identity, propagate
+            agent_name = getattr(invocation, "agent_name", None)
+            agent_id = getattr(invocation, "agent_id", None)
+            # Fallbacks: if instrumentation didn't populate agent_name/id fields explicitly but
+            # the invocation is an AgentInvocation, derive them from core fields to preserve identity.
+            try:
+                from opentelemetry.util.genai.types import (
+                    AgentInvocation as _AI,  # local import to avoid cycle
+                )
+
+                if agent_name is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
+                    agent_name = getattr(invocation, "name", None)
+                if agent_id is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
+                    agent_id = str(getattr(invocation, "run_id", "")) or None
+            except Exception:  # pragma: no cover - defensive
+                pass
+            workflow_id = getattr(invocation, "workflow_id", None)
+            if agent_name:
+                attrs["gen_ai.agent.name"] = agent_name
+            if agent_id:
+                attrs["gen_ai.agent.id"] = agent_id
+            if workflow_id:
+                attrs["gen_ai.workflow.id"] = workflow_id
+            req_model = _get_request_model(invocation)
+            if req_model:
+                attrs[GEN_AI_REQUEST_MODEL] = req_model
+            provider = getattr(invocation, "provider", None)
+            if provider:
+                attrs[GEN_AI_PROVIDER_NAME] = provider
+            if res.label is not None:
+                attrs[GEN_AI_EVALUATION_SCORE_LABEL] = res.label
+            # Propagate evaluator-derived pass boolean if present
+            passed = None
+            try:
+                if isinstance(getattr(res, "attributes", None), dict):
+                    passed = res.attributes.get("gen_ai.evaluation.passed")
+            except Exception:  # pragma: no cover - defensive
+                passed = None
+            if passed is None and res.label is not None:
+                label_text = str(res.label).strip().lower()
+                if label_text in {"pass", "passed"}:
+                    passed = True
+                elif label_text in {"fail", "failed"}:
+                    passed = False
+            if isinstance(passed, bool):
+                attrs["gen_ai.evaluation.passed"] = passed
+            attrs["gen_ai.evaluation.score.units"] = "score"
+            if res.error is not None:
+                if getattr(res.error, "message", None):
+                    attrs[ErrorAttributes.ERROR_MESSAGE] = res.error.message
+                if getattr(res.error, "type", None):
+                    attrs[ErrorAttributes.ERROR_TYPE] = (
+                        res.error.type.__qualname__
+                    )
+            try:
+                if otel_context is not None:
+                    histogram.record(  # type: ignore[attr-defined]
+                        res.score,
+                        attributes=attrs,
+                        context=otel_context,
+                    )
+                else:
+                    histogram.record(res.score, attributes=attrs)  # type: ignore[attr-defined]
+            except Exception as exc:  # pragma: no cover - defensive
+                _log_skip(
+                    "histogram_record_error", raw_name, {"error": repr(exc)}
+                )
+                if _EVAL_DEBUG_EACH:
+                    logging.getLogger(__name__).debug(
+                        "EvaluationMetricsEmitter: record failed canonical=%r score=%r error=%r",
+                        canonical,
+                        getattr(res, "score", None),
+                        exc,
+                    )
+                pass
+
+
+class EvaluationEventsEmitter(_EvaluationEmitterBase):
+    """Emits one event per evaluation result."""
+
+    role = "evaluation_events"
+    name = "EvaluationEvents"
+
+    def __init__(
+        self,
+        logger: Optional[Logger] = None,
+        *,
+        emit_legacy_event: bool = False,
+    ) -> None:
+        self._logger: Logger = logger or get_logger(__name__)
+        self._emit_legacy_event = emit_legacy_event
+        self._primary_event_name = "gen_ai.evaluation.result"
+        self._legacy_event_name = "gen_ai.evaluation"
+        self._py_logger = logging.getLogger(
+            f"{__name__}.EvaluationEventsEmitter"
+        )
+
+    def on_evaluation_results(  # type: ignore[override]
+        self,
+        results: Sequence[EvaluationResult],
+        obj: Any | None = None,
+    ) -> None:
+        invocation = obj if isinstance(obj, GenAI) else None
+        if invocation is None or not results:
+            return
+
+        req_model = _get_request_model(invocation)
+        provider = getattr(invocation, "provider", None)
+        response_id = _get_response_id(invocation)
+
+        for res in results:
+            canonical = _canonicalize_metric_name(
+                getattr(res, "metric_name", "") or ""
+            )
+            if canonical is None:
+                continue
+            base_attrs: Dict[str, Any] = {
+                GEN_AI_OPERATION_NAME: "evaluation",
+                GEN_AI_EVALUATION_NAME: canonical,
+            }
+            agent_name = getattr(invocation, "agent_name", None)
+            agent_id = getattr(invocation, "agent_id", None)
+            try:
+                from opentelemetry.util.genai.types import (
+                    AgentInvocation as _AI,  # local import to avoid cycle
+                )
+
+                if agent_name is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
+                    agent_name = getattr(invocation, "name", None)
+                if agent_id is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
+                    agent_id = str(getattr(invocation, "run_id", "")) or None
+            except Exception:  # pragma: no cover - defensive
+                pass
+            workflow_id = getattr(invocation, "workflow_id", None)
+            if agent_name:
+                base_attrs["gen_ai.agent.name"] = agent_name
+            if agent_id:
+                base_attrs["gen_ai.agent.id"] = agent_id
+            if workflow_id:
+                base_attrs["gen_ai.workflow.id"] = workflow_id
+            if req_model:
+                base_attrs[GEN_AI_REQUEST_MODEL] = req_model
+            if provider:
+                base_attrs[GEN_AI_PROVIDER_NAME] = provider
+            if response_id:
+                base_attrs[GEN_AI_RESPONSE_ID] = response_id
+            if isinstance(res.score, (int, float)):
+                base_attrs[GEN_AI_EVALUATION_SCORE_VALUE] = res.score
+            if res.label is not None:
+                base_attrs[GEN_AI_EVALUATION_SCORE_LABEL] = res.label
+            else:
+                base_attrs[GEN_AI_EVALUATION_SCORE_LABEL] = "unknown"
+            # Propagate pass boolean if available
+            passed = None
+            try:
+                if isinstance(getattr(res, "attributes", None), dict):
+                    passed = res.attributes.get("gen_ai.evaluation.passed")
+            except Exception:  # pragma: no cover - defensive
+                passed = None
+            if isinstance(passed, bool):
+                base_attrs["gen_ai.evaluation.passed"] = passed
+            if isinstance(res.score, (int, float)):
+                base_attrs["gen_ai.evaluation.score.units"] = "score"
+            if res.error is not None:
+                if getattr(res.error, "message", None):
+                    base_attrs[ErrorAttributes.ERROR_MESSAGE] = (
+                        res.error.message
+                    )
+                if getattr(res.error, "type", None):
+                    base_attrs[ErrorAttributes.ERROR_TYPE] = (
+                        res.error.type.__qualname__
+                    )
+
+            spec_attrs = dict(base_attrs)
+            if res.explanation:
+                spec_attrs[GEN_AI_EVALUATION_EXPLANATION] = res.explanation
+            if res.attributes:
+                for key, value in dict(res.attributes).items():
+                    key_str = str(key)
+                    spec_attrs[
+                        f"{GEN_AI_EVALUATION_ATTRIBUTES_PREFIX}{key_str}"
+                    ] = value
+            if res.error is not None:
+                if getattr(res.error, "message", None):
+                    spec_attrs[ErrorAttributes.ERROR_MESSAGE] = (
+                        res.error.message
+                    )
+                if getattr(res.error, "type", None):
+                    spec_attrs[ErrorAttributes.ERROR_TYPE] = (
+                        res.error.type.__qualname__
+                    )
+
+            primary_body: Dict[str, Any] = {}
+            if isinstance(res.score, (int, float)):
+                primary_body["score"] = res.score
+            elif res.score is not None:
+                primary_body["score"] = res.score
+            if res.label is not None:
+                primary_body["label"] = res.label
+            if res.explanation:
+                primary_body["explanation"] = res.explanation
+            if res.attributes:
+                primary_body["attributes"] = dict(res.attributes)
+            if res.error is not None:
+                primary_body["error"] = {
+                    "type": res.error.type.__qualname__,
+                    "message": getattr(res.error, "message", None),
+                }
+
+            try:
+                record = _evaluation_to_log_record(
+                    invocation,
+                    self._primary_event_name,
+                    spec_attrs,
+                    body=primary_body or None,
+                )
+                self._logger.emit(record)
+                if self._py_logger.isEnabledFor(logging.DEBUG):
+                    self._py_logger.debug(
+                        "Emitted evaluation log event metric=%s trace_id=%s span_id=%s",
+                        canonical,
+                        getattr(invocation, "trace_id", None),
+                        getattr(invocation, "span_id", None),
+                    )
+            except Exception:  # pragma: no cover - defensive
+                if self._py_logger.isEnabledFor(logging.DEBUG):
+                    self._py_logger.debug(
+                        "Failed to emit evaluation log event", exc_info=True
+                    )
+
+            if not self._emit_legacy_event:
+                continue
+
+            legacy_attrs = dict(base_attrs)
+            legacy_body: Dict[str, Any] = {}
+            if res.explanation:
+                legacy_body["gen_ai.evaluation.explanation"] = res.explanation
+            if res.attributes:
+                legacy_body["gen_ai.evaluation.attributes"] = dict(
+                    res.attributes
+                )
+            if res.error is not None and getattr(res.error, "message", None):
+                legacy_attrs["error.message"] = res.error.message
+
+            try:
+                legacy_record = _evaluation_to_log_record(
+                    invocation,
+                    self._legacy_event_name,
+                    legacy_attrs,
+                    body=legacy_body or None,
+                )
+                self._logger.emit(legacy_record)
+            except Exception:  # pragma: no cover - defensive
+                if self._py_logger.isEnabledFor(logging.DEBUG):
+                    self._py_logger.debug(
+                        "Failed to emit legacy evaluation log event",
+                        exc_info=True,
+                    )
+
+
+__all__ = [
+    "EvaluationMetricsEmitter",
+    "EvaluationEventsEmitter",
+]
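
Illustrative example (not part of the diff above): the EvaluationMetricsEmitter docstring describes a handler-supplied callable that creates and caches one histogram per canonical metric under gen_ai.evaluation.score.<metric_name>, and _canonicalize_metric_name maps noisy evaluator names onto that canonical set (for example "answer_relevancy" -> "relevance", "faithfulness" -> "hallucination"). A minimal factory along those lines can be sketched with the standard OpenTelemetry metrics API; the meter name, instrument description, and the commented-out wiring are assumptions for illustration only, not code shipped in this wheel.

# Hypothetical sketch -- not shipped in splunk-otel-util-genai 0.1.3.
from functools import lru_cache

from opentelemetry import metrics

_meter = metrics.get_meter("example.genai.evaluation")


@lru_cache(maxsize=None)
def histogram_factory(metric_name: str):
    # One cached histogram per canonical metric, e.g.
    # gen_ai.evaluation.score.relevance or gen_ai.evaluation.score.hallucination.
    return _meter.create_histogram(
        name=f"gen_ai.evaluation.score.{metric_name}",
        unit="1",
        description=f"GenAI evaluation scores for '{metric_name}'",
    )


# Wiring shape suggested by __init__ and on_evaluation_results in the diff
# above (the second argument must be a GenAI invocation instance):
# emitter = EvaluationMetricsEmitter(histogram_factory)
# emitter.on_evaluation_results(results, invocation)

Skipped and processed results can be traced by setting the OTEL_GENAI_EVAL_DEBUG_SKIPS and OTEL_GENAI_EVAL_DEBUG_EACH environment variables (any of 1/true/yes/on), which the module reads at import time.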