splunk_otel_util_genai-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opentelemetry/util/genai/__init__.py +17 -0
- opentelemetry/util/genai/_fsspec_upload/__init__.py +39 -0
- opentelemetry/util/genai/_fsspec_upload/fsspec_hook.py +184 -0
- opentelemetry/util/genai/attributes.py +60 -0
- opentelemetry/util/genai/callbacks.py +24 -0
- opentelemetry/util/genai/config.py +184 -0
- opentelemetry/util/genai/debug.py +183 -0
- opentelemetry/util/genai/emitters/__init__.py +25 -0
- opentelemetry/util/genai/emitters/composite.py +186 -0
- opentelemetry/util/genai/emitters/configuration.py +324 -0
- opentelemetry/util/genai/emitters/content_events.py +153 -0
- opentelemetry/util/genai/emitters/evaluation.py +519 -0
- opentelemetry/util/genai/emitters/metrics.py +308 -0
- opentelemetry/util/genai/emitters/span.py +774 -0
- opentelemetry/util/genai/emitters/spec.py +48 -0
- opentelemetry/util/genai/emitters/utils.py +961 -0
- opentelemetry/util/genai/environment_variables.py +200 -0
- opentelemetry/util/genai/handler.py +1002 -0
- opentelemetry/util/genai/instruments.py +44 -0
- opentelemetry/util/genai/interfaces.py +58 -0
- opentelemetry/util/genai/plugins.py +114 -0
- opentelemetry/util/genai/span_context.py +80 -0
- opentelemetry/util/genai/types.py +440 -0
- opentelemetry/util/genai/upload_hook.py +119 -0
- opentelemetry/util/genai/utils.py +182 -0
- opentelemetry/util/genai/version.py +15 -0
- splunk_otel_util_genai-0.1.3.dist-info/METADATA +70 -0
- splunk_otel_util_genai-0.1.3.dist-info/RECORD +31 -0
- splunk_otel_util_genai-0.1.3.dist-info/WHEEL +4 -0
- splunk_otel_util_genai-0.1.3.dist-info/entry_points.txt +5 -0
- splunk_otel_util_genai-0.1.3.dist-info/licenses/LICENSE +201 -0
opentelemetry/util/genai/emitters/evaluation.py
@@ -0,0 +1,519 @@
"""Emitters responsible for emitting telemetry derived from evaluation results."""

from __future__ import annotations

import logging
import os
from typing import Any, Dict, Optional, Sequence

from opentelemetry._logs import Logger, get_logger
from opentelemetry.semconv._incubating.attributes import (
    error_attributes as ErrorAttributes,
)

from ..attributes import (
    GEN_AI_EVALUATION_ATTRIBUTES_PREFIX,
    GEN_AI_EVALUATION_EXPLANATION,
    GEN_AI_EVALUATION_NAME,
    GEN_AI_EVALUATION_SCORE_LABEL,
    GEN_AI_EVALUATION_SCORE_VALUE,
    GEN_AI_OPERATION_NAME,
    GEN_AI_PROVIDER_NAME,
    GEN_AI_REQUEST_MODEL,
    GEN_AI_RESPONSE_ID,
)
from ..interfaces import EmitterMeta
from ..span_context import (
    build_otel_context,
    extract_span_context,
    store_span_context,
)
from ..types import EvaluationResult, GenAI
from .utils import _evaluation_to_log_record


def _get_request_model(invocation: GenAI) -> str | None:
    return getattr(invocation, "request_model", None) or getattr(
        invocation, "model", None
    )


def _get_response_id(invocation: GenAI) -> str | None:  # best-effort
    return getattr(invocation, "response_id", None)


class _EvaluationEmitterBase(EmitterMeta):
    role = "evaluation"

    def on_start(self, obj: Any) -> None:  # pragma: no cover - default no-op
        return None

    def on_end(self, obj: Any) -> None:  # pragma: no cover - default no-op
        return None

    def on_error(
        self, error, obj: Any
    ) -> None:  # pragma: no cover - default no-op
        return None


def _canonicalize_metric_name(raw_name: str) -> Optional[str]:
    """Map raw evaluator metric names (possibly noisy) to canonical names.

    Handles legacy / provider-specific variants and formatting differences:
    - answer relevancy / answer_relevancy / answer relevance -> relevance
    - faithfulness -> hallucination (legacy synonym)
    - hallucination [geval] / hallucination_geval / hallucination-* -> hallucination
    - direct passthrough for: hallucination, sentiment, toxicity, bias
    Returns None if the metric is unsupported (ignored by emitters).
    """
    if not raw_name:
        return None
    lowered = raw_name.strip().lower()
    # Fast path exact matches first
    if lowered in {"bias", "toxicity", "sentiment", "hallucination"}:
        return lowered
    if lowered == "faithfulness":
        return "hallucination"
    # Normalize punctuation/whitespace to underscores for pattern matching
    import re as _re  # local import to avoid global cost

    normalized = _re.sub(r"[^a-z0-9]+", "_", lowered).strip("_")
    if normalized in {"answer_relevancy", "answer_relevance", "relevance"}:
        return "relevance"
    if normalized.startswith("hallucination"):
        return "hallucination"
    if normalized.startswith("sentiment"):
        # Allow variants like sentiment_geval, sentiment[geval], sentiment-geval
        return "sentiment"
    return None
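
# Illustrative examples (not part of the packaged module), tracing a few
# inputs through the rules above:
#
#     _canonicalize_metric_name("Answer Relevancy")       -> "relevance"
#     _canonicalize_metric_name("faithfulness")           -> "hallucination"
#     _canonicalize_metric_name("Hallucination [GEval]")  -> "hallucination"
#     _canonicalize_metric_name("sentiment-geval")        -> "sentiment"
#     _canonicalize_metric_name("coherence")              -> None  (ignored)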


# Debug logging configuration:
# OTEL_GENAI_EVAL_DEBUG_SKIPS=1|true|yes -> one-time logs when a measurement is skipped (already implemented)
# OTEL_GENAI_EVAL_DEBUG_EACH=1|true|yes -> verbose log line for every evaluation result processed (attempted measurement)
_EVAL_DEBUG_SKIPS = os.getenv("OTEL_GENAI_EVAL_DEBUG_SKIPS", "").lower() in {
    "1",
    "true",
    "yes",
    "on",
}
_EVAL_DEBUG_EACH = os.getenv("OTEL_GENAI_EVAL_DEBUG_EACH", "").lower() in {
    "1",
    "true",
    "yes",
    "on",
}
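
# Illustrative usage (not part of the packaged module): both flags are read
# once at import time, so they must be set before this module is first
# imported, e.g.
#
#     export OTEL_GENAI_EVAL_DEBUG_SKIPS=1
#     export OTEL_GENAI_EVAL_DEBUG_EACH=true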


class EvaluationMetricsEmitter(_EvaluationEmitterBase):
    """Records evaluation scores to metric-specific histograms.

    Instead of a single shared histogram (gen_ai.evaluation.score), we emit to
    gen_ai.evaluation.score.<metric_name>. This improves downstream aggregation
    clarity at the cost of additional instruments. A callable factory provided
    by the handler supplies (and caches) histogram instances.
    """

    role = "evaluation_metrics"
    name = "EvaluationMetrics"

    def __init__(
        self, histogram_factory
    ) -> None:  # callable(metric_name)->Histogram|None OR direct histogram
        # Backward-compatible: tests may pass a histogram instance directly.
        if hasattr(histogram_factory, "record") and not callable(  # type: ignore[arg-type]
            getattr(histogram_factory, "__call__", None)
        ):
            direct_hist = histogram_factory

            def _direct_factory(_name: str):  # ignore metric name, single hist
                return direct_hist

            self._hist_factory = _direct_factory
        else:
            self._hist_factory = histogram_factory

    def on_evaluation_results(  # type: ignore[override]
        self,
        results: Sequence[EvaluationResult],
        obj: Any | None = None,
    ) -> None:
        invocation = obj if isinstance(obj, GenAI) else None
        if invocation is None:
            if _EVAL_DEBUG_SKIPS:
                logging.getLogger(__name__).debug(
                    "EvaluationMetricsEmitter: skipping all results (no GenAI invocation provided)"
                )
            return
        # Per-emitter set of (reason, key) we have already logged to avoid noise.
        if not hasattr(self, "_logged_skip_keys"):
            self._logged_skip_keys = set()  # type: ignore[attr-defined]

        span_context = getattr(invocation, "span_context", None)
        if (
            span_context is None
            and getattr(invocation, "span", None) is not None
        ):
            span_context = extract_span_context(invocation.span)
            store_span_context(invocation, span_context)
        otel_context = build_otel_context(
            getattr(invocation, "span", None),
            span_context,
        )

        def _log_skip(
            reason: str,
            metric_raw: Any,
            extra: Optional[Dict[str, Any]] = None,
        ):
            if not _EVAL_DEBUG_SKIPS:
                return
            key = (reason, str(metric_raw))
            try:
                if key in self._logged_skip_keys:  # type: ignore[attr-defined]
                    return
                self._logged_skip_keys.add(key)  # type: ignore[attr-defined]
            except Exception:  # pragma: no cover - defensive
                pass
            msg = f"EvaluationMetricsEmitter: skipped metric '{metric_raw}' reason={reason}"
            if extra:
                try:
                    msg += " " + " ".join(
                        f"{k}={v!r}" for k, v in extra.items() if v is not None
                    )
                except Exception:  # pragma: no cover - defensive
                    pass
            logging.getLogger(__name__).debug(msg)

        for res in results:
            canonical = _canonicalize_metric_name(
                getattr(res, "metric_name", "") or ""
            )
            raw_name = getattr(res, "metric_name", None)
            if _EVAL_DEBUG_EACH:
                logging.getLogger(__name__).debug(
                    "EvaluationMetricsEmitter: processing metric raw=%r canonical=%r score=%r type=%s label=%r",
                    raw_name,
                    canonical,
                    getattr(res, "score", None),
                    type(getattr(res, "score", None)).__name__,
                    getattr(res, "label", None),
                )
            if canonical is None:
                _log_skip("unsupported_metric_name", raw_name)
                continue
            if not isinstance(res.score, (int, float)):
                _log_skip(
                    "non_numeric_score",
                    raw_name,
                    {
                        "score_type": type(res.score).__name__,
                        "score_value": getattr(res, "score", None),
                    },
                )
                continue
            try:
                histogram = (
                    self._hist_factory(canonical)
                    if self._hist_factory
                    else None
                )  # type: ignore[attr-defined]
            except Exception as exc:  # pragma: no cover - defensive
                histogram = None
                _log_skip(
                    "histogram_factory_error", raw_name, {"error": repr(exc)}
                )
            if histogram is None:
                # Log once per metric name if histogram factory did not provide an instrument.
                try:
                    _once_key = f"_genai_eval_hist_missing_{canonical}"
                    if not getattr(self, _once_key, False):
                        logging.getLogger(__name__).debug(
                            "EvaluationMetricsEmitter: no histogram for canonical metric '%s' (factory returned None)",
                            canonical,
                        )
                        setattr(self, _once_key, True)
                except Exception:
                    pass
                _log_skip(
                    "no_histogram_instrument",
                    raw_name,
                    {"canonical": canonical},
                )
                continue
            elif _EVAL_DEBUG_EACH:
                logging.getLogger(__name__).debug(
                    "EvaluationMetricsEmitter: recording metric canonical=%r score=%r instrument=%s",
                    canonical,
                    getattr(res, "score", None),
                    type(histogram).__name__,
                )
            attrs: Dict[str, Any] = {
                GEN_AI_OPERATION_NAME: "evaluation",
                GEN_AI_EVALUATION_NAME: canonical,
            }
            # If the source invocation carried agent identity, propagate
            agent_name = getattr(invocation, "agent_name", None)
            agent_id = getattr(invocation, "agent_id", None)
            # Fallbacks: if instrumentation didn't populate agent_name/id fields explicitly but
            # the invocation is an AgentInvocation, derive them from core fields to preserve identity.
            try:
                from opentelemetry.util.genai.types import (
                    AgentInvocation as _AI,  # local import to avoid cycle
                )

                if agent_name is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
                    agent_name = getattr(invocation, "name", None)
                if agent_id is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
                    agent_id = str(getattr(invocation, "run_id", "")) or None
            except Exception:  # pragma: no cover - defensive
                pass
            workflow_id = getattr(invocation, "workflow_id", None)
            if agent_name:
                attrs["gen_ai.agent.name"] = agent_name
            if agent_id:
                attrs["gen_ai.agent.id"] = agent_id
            if workflow_id:
                attrs["gen_ai.workflow.id"] = workflow_id
            req_model = _get_request_model(invocation)
            if req_model:
                attrs[GEN_AI_REQUEST_MODEL] = req_model
            provider = getattr(invocation, "provider", None)
            if provider:
                attrs[GEN_AI_PROVIDER_NAME] = provider
            if res.label is not None:
                attrs[GEN_AI_EVALUATION_SCORE_LABEL] = res.label
            # Propagate evaluator-derived pass boolean if present
            passed = None
            try:
                if isinstance(getattr(res, "attributes", None), dict):
                    passed = res.attributes.get("gen_ai.evaluation.passed")
            except Exception:  # pragma: no cover - defensive
                passed = None
            if passed is None and res.label is not None:
                label_text = str(res.label).strip().lower()
                if label_text in {"pass", "passed"}:
                    passed = True
                elif label_text in {"fail", "failed"}:
                    passed = False
            if isinstance(passed, bool):
                attrs["gen_ai.evaluation.passed"] = passed
            attrs["gen_ai.evaluation.score.units"] = "score"
            if res.error is not None:
                if getattr(res.error, "message", None):
                    attrs[ErrorAttributes.ERROR_MESSAGE] = res.error.message
                if getattr(res.error, "type", None):
                    attrs[ErrorAttributes.ERROR_TYPE] = (
                        res.error.type.__qualname__
                    )
            try:
                if otel_context is not None:
                    histogram.record(  # type: ignore[attr-defined]
                        res.score,
                        attributes=attrs,
                        context=otel_context,
                    )
                else:
                    histogram.record(res.score, attributes=attrs)  # type: ignore[attr-defined]
            except Exception as exc:  # pragma: no cover - defensive
                _log_skip(
                    "histogram_record_error", raw_name, {"error": repr(exc)}
                )
                if _EVAL_DEBUG_EACH:
                    logging.getLogger(__name__).debug(
                        "EvaluationMetricsEmitter: record failed canonical=%r score=%r error=%r",
                        canonical,
                        getattr(res, "score", None),
                        exc,
                    )
                pass
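
# Illustrative sketch (not part of the packaged module) of a histogram factory
# matching the callable(metric_name) -> Histogram contract expected by
# EvaluationMetricsEmitter, assuming an already-configured MeterProvider:
#
#     from opentelemetry import metrics
#
#     _meter = metrics.get_meter(__name__)
#     _hist_cache = {}
#
#     def histogram_factory(metric_name):
#         if metric_name not in _hist_cache:
#             _hist_cache[metric_name] = _meter.create_histogram(
#                 f"gen_ai.evaluation.score.{metric_name}"
#             )
#         return _hist_cache[metric_name]
#
#     emitter = EvaluationMetricsEmitter(histogram_factory)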


class EvaluationEventsEmitter(_EvaluationEmitterBase):
    """Emits one event per evaluation result."""

    role = "evaluation_events"
    name = "EvaluationEvents"

    def __init__(
        self,
        logger: Optional[Logger] = None,
        *,
        emit_legacy_event: bool = False,
    ) -> None:
        self._logger: Logger = logger or get_logger(__name__)
        self._emit_legacy_event = emit_legacy_event
        self._primary_event_name = "gen_ai.evaluation.result"
        self._legacy_event_name = "gen_ai.evaluation"
        self._py_logger = logging.getLogger(
            f"{__name__}.EvaluationEventsEmitter"
        )

    def on_evaluation_results(  # type: ignore[override]
        self,
        results: Sequence[EvaluationResult],
        obj: Any | None = None,
    ) -> None:
        invocation = obj if isinstance(obj, GenAI) else None
        if invocation is None or not results:
            return

        req_model = _get_request_model(invocation)
        provider = getattr(invocation, "provider", None)
        response_id = _get_response_id(invocation)

        for res in results:
            canonical = _canonicalize_metric_name(
                getattr(res, "metric_name", "") or ""
            )
            if canonical is None:
                continue
            base_attrs: Dict[str, Any] = {
                GEN_AI_OPERATION_NAME: "evaluation",
                GEN_AI_EVALUATION_NAME: canonical,
            }
            agent_name = getattr(invocation, "agent_name", None)
            agent_id = getattr(invocation, "agent_id", None)
            try:
                from opentelemetry.util.genai.types import (
                    AgentInvocation as _AI,  # local import to avoid cycle
                )

                if agent_name is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
                    agent_name = getattr(invocation, "name", None)
                if agent_id is None and isinstance(invocation, _AI):  # type: ignore[attr-defined]
                    agent_id = str(getattr(invocation, "run_id", "")) or None
            except Exception:  # pragma: no cover - defensive
                pass
            workflow_id = getattr(invocation, "workflow_id", None)
            if agent_name:
                base_attrs["gen_ai.agent.name"] = agent_name
            if agent_id:
                base_attrs["gen_ai.agent.id"] = agent_id
            if workflow_id:
                base_attrs["gen_ai.workflow.id"] = workflow_id
            if req_model:
                base_attrs[GEN_AI_REQUEST_MODEL] = req_model
            if provider:
                base_attrs[GEN_AI_PROVIDER_NAME] = provider
            if response_id:
                base_attrs[GEN_AI_RESPONSE_ID] = response_id
            if isinstance(res.score, (int, float)):
                base_attrs[GEN_AI_EVALUATION_SCORE_VALUE] = res.score
            if res.label is not None:
                base_attrs[GEN_AI_EVALUATION_SCORE_LABEL] = res.label
            else:
                base_attrs[GEN_AI_EVALUATION_SCORE_LABEL] = "unknown"
            # Propagate pass boolean if available
            passed = None
            try:
                if isinstance(getattr(res, "attributes", None), dict):
                    passed = res.attributes.get("gen_ai.evaluation.passed")
            except Exception:  # pragma: no cover - defensive
                passed = None
            if isinstance(passed, bool):
                base_attrs["gen_ai.evaluation.passed"] = passed
            if isinstance(res.score, (int, float)):
                base_attrs["gen_ai.evaluation.score.units"] = "score"
            if res.error is not None:
                if getattr(res.error, "message", None):
                    base_attrs[ErrorAttributes.ERROR_MESSAGE] = (
                        res.error.message
                    )
                if getattr(res.error, "type", None):
                    base_attrs[ErrorAttributes.ERROR_TYPE] = (
                        res.error.type.__qualname__
                    )

            spec_attrs = dict(base_attrs)
            if res.explanation:
                spec_attrs[GEN_AI_EVALUATION_EXPLANATION] = res.explanation
            if res.attributes:
                for key, value in dict(res.attributes).items():
                    key_str = str(key)
                    spec_attrs[
                        f"{GEN_AI_EVALUATION_ATTRIBUTES_PREFIX}{key_str}"
                    ] = value
            if res.error is not None:
                if getattr(res.error, "message", None):
                    spec_attrs[ErrorAttributes.ERROR_MESSAGE] = (
                        res.error.message
                    )
                if getattr(res.error, "type", None):
                    spec_attrs[ErrorAttributes.ERROR_TYPE] = (
                        res.error.type.__qualname__
                    )

            primary_body: Dict[str, Any] = {}
            if isinstance(res.score, (int, float)):
                primary_body["score"] = res.score
            elif res.score is not None:
                primary_body["score"] = res.score
            if res.label is not None:
                primary_body["label"] = res.label
            if res.explanation:
                primary_body["explanation"] = res.explanation
            if res.attributes:
                primary_body["attributes"] = dict(res.attributes)
            if res.error is not None:
                primary_body["error"] = {
                    "type": res.error.type.__qualname__,
                    "message": getattr(res.error, "message", None),
                }

            try:
                record = _evaluation_to_log_record(
                    invocation,
                    self._primary_event_name,
                    spec_attrs,
                    body=primary_body or None,
                )
                self._logger.emit(record)
                if self._py_logger.isEnabledFor(logging.DEBUG):
                    self._py_logger.debug(
                        "Emitted evaluation log event metric=%s trace_id=%s span_id=%s",
                        canonical,
                        getattr(invocation, "trace_id", None),
                        getattr(invocation, "span_id", None),
                    )
            except Exception:  # pragma: no cover - defensive
                if self._py_logger.isEnabledFor(logging.DEBUG):
                    self._py_logger.debug(
                        "Failed to emit evaluation log event", exc_info=True
                    )

            if not self._emit_legacy_event:
                continue

            legacy_attrs = dict(base_attrs)
            legacy_body: Dict[str, Any] = {}
            if res.explanation:
                legacy_body["gen_ai.evaluation.explanation"] = res.explanation
            if res.attributes:
                legacy_body["gen_ai.evaluation.attributes"] = dict(
                    res.attributes
                )
            if res.error is not None and getattr(res.error, "message", None):
                legacy_attrs["error.message"] = res.error.message

            try:
                legacy_record = _evaluation_to_log_record(
                    invocation,
                    self._legacy_event_name,
                    legacy_attrs,
                    body=legacy_body or None,
                )
                self._logger.emit(legacy_record)
            except Exception:  # pragma: no cover - defensive
                if self._py_logger.isEnabledFor(logging.DEBUG):
                    self._py_logger.debug(
                        "Failed to emit legacy evaluation log event",
                        exc_info=True,
                    )


__all__ = [
    "EvaluationMetricsEmitter",
    "EvaluationEventsEmitter",
]
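
A minimal wiring sketch (not part of the package), assuming a handler that
already produces EvaluationResult objects and a GenAI invocation:

    metrics_emitter = EvaluationMetricsEmitter(histogram_factory)
    events_emitter = EvaluationEventsEmitter(emit_legacy_event=False)
    metrics_emitter.on_evaluation_results(results, obj=invocation)
    events_emitter.on_evaluation_results(results, obj=invocation)

Both emitters silently skip results whose metric names do not canonicalize,
and both require a GenAI invocation to be passed as obj; the events emitter
additionally returns early when the result sequence is empty.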