evalvault 1.75.0__py3-none-any.whl → 1.76.0__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
- evalvault/adapters/inbound/api/adapter.py +99 -63
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/cli/commands/method.py +2 -2
- evalvault/adapters/inbound/cli/commands/run.py +146 -28
- evalvault/adapters/inbound/cli/commands/run_helpers.py +157 -55
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/settings.py +40 -4
- evalvault/domain/services/evaluator.py +2 -0
- {evalvault-1.75.0.dist-info → evalvault-1.76.0.dist-info}/METADATA +1 -1
- {evalvault-1.75.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +18 -18
- {evalvault-1.75.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/outbound/tracker/mlflow_adapter.py

@@ -5,7 +5,7 @@ import tempfile
 from typing import Any

 from evalvault.adapters.outbound.tracker.log_sanitizer import MAX_LOG_CHARS, sanitize_payload
-from evalvault.domain.entities import EvaluationRun
+from evalvault.domain.entities import EvaluationRun, TestCaseResult
 from evalvault.ports.outbound.tracker_port import TrackerPort


@@ -29,6 +29,17 @@ class MLflowAdapter(TrackerPort):
             tracking_uri: MLflow tracking server URI
             experiment_name: MLflow experiment name
         """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            torch = None  # type: ignore
+        if torch is not None and not hasattr(torch, "Tensor"):
+
+            class _TorchTensor:  # pragma: no cover - guard for namespace package
+                pass
+
+            torch.Tensor = _TorchTensor  # type: ignore[attr-defined]
+
         import mlflow

         mlflow.set_tracking_uri(tracking_uri)
@@ -36,6 +47,21 @@ class MLflowAdapter(TrackerPort):
         self._mlflow = mlflow
         self._active_runs: dict[str, Any] = {}  # trace_id -> mlflow run

+    def _enable_system_metrics(self) -> None:
+        try:
+            enable_fn = getattr(self._mlflow, "enable_system_metrics_logging", None)
+            if callable(enable_fn):
+                enable_fn()
+        except Exception:  # pragma: no cover - optional dependency
+            return
+
+    def _start_mlflow_run(self, name: str) -> Any:
+        try:
+            return self._mlflow.start_run(run_name=name, log_system_metrics=True)
+        except TypeError:
+            self._enable_system_metrics()
+            return self._mlflow.start_run(run_name=name)
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """
         Start a new MLflow run (mapped to trace).
@@ -47,7 +73,7 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: MLflow run ID
         """
-        run = self.
+        run = self._start_mlflow_run(name)
         trace_id = run.info.run_id

         # Log metadata as MLflow parameters (only primitive types)
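Note on the `_start_mlflow_run` helper above: newer MLflow releases accept `log_system_metrics=True` on `mlflow.start_run`, while older ones raise `TypeError`, so the adapter retries after flipping the module-level toggle. A minimal standalone sketch of the same fallback (assumes `mlflow` is installed; not taken verbatim from the package):

```python
import mlflow


def start_run_with_system_metrics(name: str):
    """Start an MLflow run, enabling system metrics where the installed version supports it."""
    try:
        # Recent MLflow versions accept the keyword argument directly.
        return mlflow.start_run(run_name=name, log_system_metrics=True)
    except TypeError:
        # Older versions: fall back to the global toggle when available, then start normally.
        enable = getattr(mlflow, "enable_system_metrics_logging", None)
        if callable(enable):
            enable()
        return mlflow.start_run(run_name=name)
```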
@@ -59,6 +85,12 @@ class MLflowAdapter(TrackerPort):
         self._active_runs[trace_id] = run
         return trace_id

+    def _write_temp_file(self, suffix: str, content: str) -> str:
+        with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as f:
+            f.write(content)
+            f.flush()
+            return f.name
+
     def add_span(
         self,
         trace_id: str,
@@ -89,10 +121,9 @@ class MLflowAdapter(TrackerPort):
             "input": sanitize_payload(input_data, max_chars=MAX_LOG_CHARS),
             "output": sanitize_payload(output_data, max_chars=MAX_LOG_CHARS),
         }
- … (3 removed lines not shown in this view)
-            self._mlflow.log_artifact(f.name, f"spans/{name}")
+        payload = json.dumps(span_data, default=str)
+        path = self._write_temp_file(".json", payload)
+        self._mlflow.log_artifact(path, f"spans/{name}")

     def log_score(
         self,
@@ -145,9 +176,15 @@ class MLflowAdapter(TrackerPort):
             raise ValueError(f"Run not found: {trace_id}")

         if artifact_type == "json":
- … (3 removed lines not shown in this view)
+            payload = json.dumps(data, default=str)
+            path = self._write_temp_file(".json", payload)
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        elif artifact_type == "text":
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        else:
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")

     def end_trace(self, trace_id: str) -> None:
         """
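With the changes above, `add_span` and `save_artifact` both go through `_write_temp_file`: the payload is serialized, written to a `NamedTemporaryFile` created with `delete=False` (so the file still exists when MLflow copies it), and the resulting path is handed to `log_artifact`. A hedged sketch of the same flow outside the adapter (the artifact name is illustrative):

```python
import json
import tempfile

import mlflow


def log_json_artifact(data: dict, name: str) -> None:
    """Serialize data to a temporary JSON file and attach it to the active MLflow run."""
    payload = json.dumps(data, default=str)
    # delete=False keeps the file on disk until MLflow has copied it into the artifact store.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
        f.write(payload)
        f.flush()
        path = f.name
    mlflow.log_artifact(path, f"artifacts/{name}")
```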
@@ -180,53 +217,171 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: ID of the created MLflow run
         """
- … (33 removed lines not shown in this view)
+
+        def _log_run() -> str:
+            trace_id = self.start_trace(
+                name=f"evaluation-{run.run_id[:8]}",
+                metadata={
+                    "dataset_name": run.dataset_name,
+                    "dataset_version": run.dataset_version,
+                    "model_name": run.model_name,
+                    "total_test_cases": run.total_test_cases,
+                },
+            )
+
+            self._mlflow.set_tag("run_id", run.run_id)
+            self._mlflow.set_tag("model_name", run.model_name)
+            self._mlflow.set_tag("dataset", f"{run.dataset_name}:{run.dataset_version}")
+            if run.tracker_metadata:
+                project_name = run.tracker_metadata.get("project_name")
+                if project_name:
+                    self._mlflow.set_tag("project_name", project_name)
+
+            for metric_name in run.metrics_evaluated:
+                avg_score = run.get_avg_score(metric_name)
+                if avg_score is not None:
+                    self.log_score(trace_id, f"avg_{metric_name}", avg_score)
+
+            self.log_score(trace_id, "pass_rate", run.pass_rate)
+            self._mlflow.log_metric("total_tokens", run.total_tokens)
+            if run.duration_seconds:
+                self._mlflow.log_metric("duration_seconds", run.duration_seconds)
+            if run.total_cost_usd is not None:
+                self._mlflow.log_metric("total_cost_usd", run.total_cost_usd)
+
+            results_data = []
+            for result in run.results:
+                result_dict = {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "tokens_used": result.tokens_used,
+                    "metrics": [
+                        {"name": m.name, "score": m.score, "passed": m.passed}
+                        for m in result.metrics
+                    ],
+                }
+                results_data.append(result_dict)
+                self._trace_test_case(result)
+
+            self.save_artifact(trace_id, "test_results", results_data)
+            self.save_artifact(
+                trace_id,
+                "custom_metric_snapshot",
+                (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            )
+            if run.tracker_metadata:
+                self.save_artifact(trace_id, "tracker_metadata", run.tracker_metadata)
+                self._register_prompts(run)
+
+            self.end_trace(trace_id)
+            return trace_id
+
+        trace_name = f"evaluation-{run.run_id[:8]}"
+        trace_attrs = {
+            "dataset_name": run.dataset_name,
+            "dataset_version": run.dataset_version,
+            "model_name": run.model_name,
+        }
+        try:
+            traced = self._mlflow.trace(
+                name=trace_name, span_type="EVALUATION", attributes=trace_attrs
+            )
+            return traced(_log_run)()
+        except Exception:
+            return _log_run()
+
+    def _register_prompts(self, run: EvaluationRun) -> None:
+        genai = getattr(self._mlflow, "genai", None)
+        if genai is None:
+            return
+        register_fn = getattr(genai, "register_prompt", None)
+        if not callable(register_fn):
+            return
+
+        prompt_entries = self._extract_prompt_entries(run)
+        if not prompt_entries:
+            return
+
+        for entry in prompt_entries:
+            name = entry.get("name") or entry.get("role") or "prompt"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            tags = {
+                "kind": str(entry.get("kind") or "custom"),
+                "role": str(entry.get("role") or ""),
+                "checksum": str(entry.get("checksum") or ""),
+                "run_id": run.run_id,
+            }
+            prompt_set_name = entry.get("prompt_set_name")
+            if prompt_set_name:
+                tags["prompt_set"] = str(prompt_set_name)
+            register_fn(
+                name=name,
+                template=content,
+                commit_message=entry.get("checksum"),
+                tags=tags,
+                model_config={
+                    "model_name": run.model_name,
+                },
+            )
+
+    def _extract_prompt_entries(self, run: EvaluationRun) -> list[dict[str, Any]]:
+        entries: list[dict[str, Any]] = []
+        metadata = run.tracker_metadata or {}
+        prompt_set_detail = metadata.get("prompt_set_detail")
+        if isinstance(prompt_set_detail, dict):
+            prompt_set_name = prompt_set_detail.get("name")
+            for item in prompt_set_detail.get("items", []):
+                prompt = item.get("prompt") or {}
+                if not isinstance(prompt, dict):
+                    continue
+                entries.append(
+                    {
+                        "name": prompt.get("name"),
+                        "role": item.get("role"),
+                        "kind": prompt.get("kind"),
+                        "checksum": prompt.get("checksum"),
+                        "content": prompt.get("content"),
+                        "prompt_set_name": prompt_set_name,
+                    }
+                )
+
+        phoenix_meta = metadata.get("phoenix") or {}
+        if isinstance(phoenix_meta, dict):
+            for entry in phoenix_meta.get("prompts", []) or []:
+                if not isinstance(entry, dict):
+                    continue
+                entries.append(entry)
+        return entries
+
+    def _trace_test_case(self, result: TestCaseResult) -> None:
+        trace_fn = getattr(self._mlflow, "trace", None)
+        if not callable(trace_fn):
+            return
+
+        attrs = {
+            "test_case_id": result.test_case_id,
+            "all_passed": result.all_passed,
+            "tokens_used": result.tokens_used,
+            "latency_ms": result.latency_ms,
+        }
+
+        def _emit() -> dict[str, Any]:
+            return {
                 "metrics": [
                     {"name": m.name, "score": m.score, "passed": m.passed} for m in result.metrics
                 ],
+                "tokens_used": result.tokens_used,
+                "latency_ms": result.latency_ms,
             }
-            results_data.append(result_dict)
-
-        self.save_artifact(trace_id, "test_results", results_data)
-        self.save_artifact(
-            trace_id,
-            "custom_metric_snapshot",
-            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
-        )

- … (4 removed lines not shown in this view)
+        try:
+            wrapped = trace_fn(
+                name=f"test_case_{result.test_case_id}",
+                span_type="EVALUATION",
+                attributes=attrs,
+            )
+            wrapped(_emit)()
+        except Exception:
+            return
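`_extract_prompt_entries` pulls prompt entries from two optional places in `run.tracker_metadata`: a `prompt_set_detail` block and a `phoenix.prompts` list. A hypothetical payload in the shape the method consumes (all values are illustrative; only the key names come from the diff):

```python
tracker_metadata = {
    "project_name": "example-project",
    "prompt_set_detail": {
        "name": "baseline-prompts",  # surfaces as the "prompt_set" tag
        "items": [
            {
                "role": "system",
                "prompt": {
                    "name": "system-prompt",
                    "kind": "system",
                    "checksum": "abc123",  # reused as the register_prompt commit message
                    "content": "You are a helpful assistant.",
                },
            },
        ],
    },
    "phoenix": {
        "prompts": [
            {"name": "judge-prompt", "role": "judge", "content": "Score the answer..."},
        ],
    },
}
```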
evalvault/adapters/outbound/tracker/phoenix_adapter.py

@@ -52,6 +52,8 @@ class PhoenixAdapter(TrackerPort):
         self,
         endpoint: str = "http://localhost:6006/v1/traces",
         service_name: str = "evalvault",
+        project_name: str | None = None,
+        annotations_enabled: bool = True,
     ):
         """Initialize Phoenix adapter with OpenTelemetry.

@@ -61,11 +63,14 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
+        self._project_name = project_name
+        self._annotations_enabled = annotations_enabled
         self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
         self._active_spans: dict[str, Any] = {}
         self._tracer_any: Any | None = None
         self._initialized = False
+        self._annotations_client: Any | None = None

     def _ensure_initialized(self) -> None:
         """Lazy initialization of OpenTelemetry tracer."""
@@ -96,7 +101,10 @@ class PhoenixAdapter(TrackerPort):
            return

        # Create resource with service name
- … (1 removed line not shown in this view)
+        resource_attributes = {"service.name": self._service_name}
+        if self._project_name:
+            resource_attributes["project.name"] = self._project_name
+        resource = Resource.create(resource_attributes)

        # Create tracer provider
        self._tracer_provider = TracerProvider(resource=resource)
@@ -123,6 +131,50 @@ class PhoenixAdapter(TrackerPort):
                "Failed to initialize Phoenix tracer. Check endpoint configuration and dependencies."
            ) from e

+    def _phoenix_base_url(self) -> str:
+        if "/v1/traces" in self._endpoint:
+            return self._endpoint.split("/v1/traces")[0]
+        return self._endpoint.rstrip("/")
+
+    def _get_annotations_client(self) -> Any | None:
+        if not self._annotations_enabled:
+            return None
+        if self._annotations_client is not None:
+            return self._annotations_client
+        try:
+            from phoenix.client import Client
+        except Exception:
+            return None
+        self._annotations_client = Client(base_url=self._phoenix_base_url())
+        return self._annotations_client
+
+    def _annotate_span(
+        self,
+        *,
+        span: Any,
+        name: str,
+        label: str,
+        score: float | None = None,
+        explanation: str | None = None,
+    ) -> None:
+        client = self._get_annotations_client()
+        if client is None or span is None:
+            return
+        try:
+            from opentelemetry.trace import format_span_id
+
+            span_id = format_span_id(span.get_span_context().span_id)
+            client.annotations.add_span_annotation(
+                annotation_name=name,
+                annotator_kind="CODE",
+                span_id=span_id,
+                label=label,
+                score=score,
+                explanation=explanation,
+            )
+        except Exception:
+            return
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """Start a new trace.

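`_phoenix_base_url` derives the Phoenix REST base URL by stripping the `/v1/traces` suffix from the OTLP endpoint, and `_get_annotations_client` lazily builds a `phoenix.client.Client` against it (returning `None` when annotations are disabled or the client package is missing). A small sketch of just the URL derivation; the second endpoint below is an illustrative host:

```python
def phoenix_base_url(endpoint: str) -> str:
    """Strip the OTLP traces path so the Phoenix REST client can use the base URL."""
    if "/v1/traces" in endpoint:
        return endpoint.split("/v1/traces")[0]
    return endpoint.rstrip("/")


assert phoenix_base_url("http://localhost:6006/v1/traces") == "http://localhost:6006"
assert phoenix_base_url("http://phoenix.internal:6006/") == "http://phoenix.internal:6006"
```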
@@ -328,8 +380,17 @@ class PhoenixAdapter(TrackerPort):

         # Set evaluation-specific attributes
         span = self._active_spans[trace_id]
+        span.set_attribute("openinference.span.kind", "EVALUATOR")
         span.set_attribute("evaluation.metrics", json.dumps(run.metrics_evaluated))
         span.set_attribute("evaluation.thresholds", json.dumps(run.thresholds))
+        span.set_attribute("evaluation.status", "pass" if run.pass_rate >= 1.0 else "fail")
+        if run.tracker_metadata:
+            project_name = run.tracker_metadata.get("project_name")
+            if project_name:
+                span.set_attribute("project.name", project_name)
+            project_kind = run.tracker_metadata.get("evaluation_task") or "evaluation"
+            span.set_attribute("project.kind", project_kind)
+            span.set_attribute("project.status", "pass" if run.pass_rate >= 1.0 else "fail")

         # Log average scores for each metric
         for metric_name, summary in metric_summary.items():
@@ -369,6 +430,8 @@ class PhoenixAdapter(TrackerPort):
            },
            "metrics": metric_summary,
            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "prompt_metadata": (run.tracker_metadata or {}).get("phoenix", {}).get("prompts"),
+            "tracker_metadata": run.tracker_metadata,
            "test_cases": [
                {
                    "test_case_id": result.test_case_id,
@@ -420,6 +483,23 @@ class PhoenixAdapter(TrackerPort):
            f"test-case-{result.test_case_id}",
            context=context,
        ) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK if result.all_passed else StatusCode.ERROR))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "EVALUATOR")
+            span.set_attribute("evaluation.status", "pass" if result.all_passed else "fail")
+            self._annotate_span(
+                span=span,
+                name="evaluation_result",
+                label="pass" if result.all_passed else "fail",
+                score=1.0 if result.all_passed else 0.0,
+                explanation="All metrics passed"
+                if result.all_passed
+                else "One or more metrics failed",
+            )
            # Input data
            safe_question = sanitize_text(result.question, max_chars=MAX_LOG_CHARS) or ""
            safe_answer = sanitize_text(result.answer, max_chars=MAX_LOG_CHARS) or ""
@@ -439,6 +519,10 @@ class PhoenixAdapter(TrackerPort):
            # Metrics
            span.set_attribute("output.all_passed", result.all_passed)
            span.set_attribute("output.tokens_used", result.tokens_used)
+            if result.tokens_used:
+                span.set_attribute("llm.token_count.total", result.tokens_used)
+            if result.cost_usd is not None:
+                span.set_attribute("llm.cost.total", result.cost_usd)

            for metric in result.metrics:
                span.set_attribute(f"metric.{metric.name}.score", metric.score)
@@ -486,6 +570,7 @@ class PhoenixAdapter(TrackerPort):
                )
            if result.latency_ms:
                span.set_attribute("timing.latency_ms", result.latency_ms)
+                span.set_attribute("evaluation.latency_ms", result.latency_ms)

     def log_retrieval(
         self,
@@ -528,6 +613,13 @@ class PhoenixAdapter(TrackerPort):
        if tracer is None:
            raise RuntimeError("Phoenix tracer is not initialized")
        with tracer.start_span("retrieval", context=context) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "RETRIEVER")
            # Set retrieval attributes
            for key, value in data.to_span_attributes().items():
                span.set_attribute(key, value)
@@ -541,14 +633,24 @@ class PhoenixAdapter(TrackerPort):

            span.set_attribute("spec.version", "0.1")
            span.set_attribute("rag.module", "retrieve")
+            if data.retrieval_time_ms:
+                span.set_attribute("retrieval.latency_ms", data.retrieval_time_ms)

            documents_payload = _build_retrieval_payload(data.candidates)
            span.set_attribute("custom.retrieval.doc_count", len(documents_payload))
            if documents_payload:
                span.set_attribute("retrieval.documents_json", serialize_json(documents_payload))
- … (3 removed lines not shown in this view)
+                previews = [
+                    item.get("content_preview")
+                    for item in documents_payload
+                    if item.get("content_preview")
+                ]
+                if previews:
+                    span.set_attribute("output.value", previews)
+                else:
+                    doc_ids = _extract_doc_ids(documents_payload)
+                    if doc_ids:
+                        span.set_attribute("output.value", doc_ids)

            # Log each retrieved document as an event
            for i, doc in enumerate(data.candidates):
@@ -615,10 +717,31 @@ class PhoenixAdapter(TrackerPort):
        if tracer is None:
            raise RuntimeError("Phoenix tracer is not initialized")
        with tracer.start_span("generation", context=context) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "LLM")
            # Set generation attributes
            for key, value in data.to_span_attributes().items():
                span.set_attribute(key, value)

+            if data.model:
+                span.set_attribute("llm.model_name", data.model)
+                provider = data.model.split("/")[0] if "/" in data.model else ""
+                if provider:
+                    span.set_attribute("llm.provider", provider)
+            if data.input_tokens:
+                span.set_attribute("llm.token_count.prompt", data.input_tokens)
+            if data.output_tokens:
+                span.set_attribute("llm.token_count.completion", data.output_tokens)
+            if data.total_tokens:
+                span.set_attribute("llm.token_count.total", data.total_tokens)
+            if data.cost_usd is not None:
+                span.set_attribute("llm.cost.total", data.cost_usd)
+
            # Set prompt/response (truncate if too long)
            prompt = sanitize_text(data.prompt, max_chars=MAX_LOG_CHARS) or ""
            response = sanitize_text(data.response, max_chars=MAX_LOG_CHARS) or ""
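The generation span now also carries OpenInference-style `llm.*` attributes; the provider is inferred from a `provider/model` style identifier by splitting on the first `/`. A quick sketch of that inference (model names here are illustrative):

```python
def infer_provider(model: str) -> str:
    """Return the provider prefix of a 'provider/model' identifier, or '' when absent."""
    return model.split("/")[0] if "/" in model else ""


assert infer_provider("ollama/llama3") == "ollama"
assert infer_provider("gpt-4o") == ""
```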
@@ -637,6 +760,13 @@ class PhoenixAdapter(TrackerPort):
            safe_template = sanitize_text(data.prompt_template, max_chars=MAX_LOG_CHARS)
            if safe_template:
                span.set_attribute("generation.prompt_template", safe_template)
+                span.set_attribute("llm.prompt_template.template", safe_template)
+                span.set_attribute("llm.prompt_template.version", "v1")
+            prompt_vars = data.metadata.get("prompt_variables") if data.metadata else None
+            if prompt_vars:
+                span.set_attribute(
+                    "llm.prompt_template.variables", json.dumps(prompt_vars, default=str)
+                )

     def log_rag_trace(self, data: RAGTraceData) -> str:
         """Log a full RAG trace (retrieval + generation) to Phoenix."""
@@ -660,6 +790,8 @@ class PhoenixAdapter(TrackerPort):
         span = self._active_spans[trace_id]
         should_end = True

+        span.set_attribute("openinference.span.kind", "CHAIN")
+
         for key, value in data.to_span_attributes().items():
             span.set_attribute(key, value)

@@ -667,11 +799,21 @@ class PhoenixAdapter(TrackerPort):
            self.log_retrieval(trace_id, data.retrieval)
        if data.generation:
            self.log_generation(trace_id, data.generation)
+        output_preview = ""
        if data.final_answer:
- … (4 removed lines not shown in this view)
+            output_preview = sanitize_text(data.final_answer, max_chars=MAX_LOG_CHARS)
+        if not output_preview and data.generation and data.generation.response:
+            output_preview = sanitize_text(data.generation.response, max_chars=MAX_LOG_CHARS)
+        if not output_preview and data.retrieval:
+            previews = [
+                sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS)
+                for doc in data.retrieval.candidates
+                if doc.content
+            ]
+            output_preview = "\n".join(previews[:3])
+        if output_preview:
+            span.set_attribute("rag.final_answer", output_preview)
+            span.set_attribute("output.value", output_preview)

        if safe_query:
            span.set_attribute("input.value", safe_query)
@@ -697,7 +839,14 @@ def _build_retrieval_payload(
    payload: list[dict[str, Any]] = []
    for index, doc in enumerate(documents, start=1):
        doc_id = doc.chunk_id or doc.source or doc.metadata.get("doc_id") or f"doc_{index}"
- … (1 removed line not shown in this view)
+        preview = ""
+        if doc.content:
+            preview = sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS)
+        item: dict[str, Any] = {
+            "doc_id": doc_id,
+            "score": doc.score,
+            "content_preview": preview,
+        }
        if doc.source:
            item["source"] = doc.source
        if doc.rerank_score is not None:
evalvault/config/instrumentation.py

@@ -26,6 +26,7 @@ _tracer_provider: TracerProvider | None = None
 def setup_phoenix_instrumentation(
     endpoint: str = "http://localhost:6006/v1/traces",
     service_name: str = "evalvault",
+    project_name: str | None = None,
     enable_langchain: bool = True,
     enable_openai: bool = True,
     sample_rate: float = 1.0,
@@ -73,12 +74,13 @@ def setup_phoenix_instrumentation(
        return None

    # Create resource with service name
- … (6 removed lines not shown in this view)
+    resource_attributes = {
+        "service.name": service_name,
+        "service.version": "0.1.0",
+    }
+    if project_name:
+        resource_attributes["project.name"] = project_name
+    resource = Resource.create(resource_attributes)

    # Clamp sample rate between 0 and 1
    ratio = max(0.0, min(sample_rate, 1.0))
evalvault/config/phoenix_support.py

@@ -59,10 +59,15 @@ def ensure_phoenix_instrumentation(
    if api_token:
        headers = {"api-key": api_token}

+    project_name = getattr(settings, "phoenix_project_name", None)
+    if project_name is not None and not isinstance(project_name, str):
+        project_name = None
+
    try:
        setup_phoenix_instrumentation(
            endpoint=endpoint,
            service_name="evalvault",
+            project_name=project_name,
            sample_rate=sample_rate,
            headers=headers,
        )