evalvault-1.74.0-py3-none-any.whl → evalvault-1.76.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. evalvault/adapters/inbound/api/adapter.py +127 -80
  2. evalvault/adapters/inbound/api/routers/calibration.py +9 -9
  3. evalvault/adapters/inbound/api/routers/chat.py +303 -17
  4. evalvault/adapters/inbound/api/routers/config.py +3 -1
  5. evalvault/adapters/inbound/api/routers/domain.py +10 -5
  6. evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
  7. evalvault/adapters/inbound/api/routers/runs.py +23 -4
  8. evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
  9. evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
  10. evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
  11. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
  12. evalvault/adapters/inbound/cli/commands/compare.py +2 -7
  13. evalvault/adapters/inbound/cli/commands/debug.py +3 -2
  14. evalvault/adapters/inbound/cli/commands/domain.py +12 -12
  15. evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
  16. evalvault/adapters/inbound/cli/commands/gate.py +3 -2
  17. evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
  18. evalvault/adapters/inbound/cli/commands/history.py +3 -12
  19. evalvault/adapters/inbound/cli/commands/method.py +3 -4
  20. evalvault/adapters/inbound/cli/commands/ops.py +2 -2
  21. evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
  22. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
  23. evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
  24. evalvault/adapters/inbound/cli/commands/regress.py +5 -4
  25. evalvault/adapters/inbound/cli/commands/run.py +188 -59
  26. evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
  27. evalvault/adapters/inbound/cli/commands/stage.py +6 -25
  28. evalvault/adapters/inbound/cli/utils/options.py +10 -4
  29. evalvault/adapters/inbound/mcp/tools.py +11 -8
  30. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
  31. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
  32. evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
  33. evalvault/adapters/outbound/domain_memory/factory.py +68 -0
  34. evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
  35. evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
  36. evalvault/adapters/outbound/llm/factory.py +1 -1
  37. evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
  38. evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
  39. evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
  40. evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
  41. evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
  42. evalvault/adapters/outbound/storage/base_sql.py +3 -2
  43. evalvault/adapters/outbound/storage/factory.py +53 -0
  44. evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
  45. evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
  46. evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
  47. evalvault/config/instrumentation.py +8 -6
  48. evalvault/config/phoenix_support.py +5 -0
  49. evalvault/config/settings.py +71 -11
  50. evalvault/domain/services/domain_learning_hook.py +2 -1
  51. evalvault/domain/services/evaluator.py +2 -0
  52. evalvault/ports/inbound/web_port.py +3 -1
  53. evalvault/ports/outbound/storage_port.py +2 -0
  54. evalvault-1.76.0.dist-info/METADATA +221 -0
  55. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
  56. evalvault-1.74.0.dist-info/METADATA +0 -585
  57. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
  58. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
  59. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/adapters/outbound/tracker/mlflow_adapter.py
@@ -5,7 +5,7 @@ import tempfile
 from typing import Any
 
 from evalvault.adapters.outbound.tracker.log_sanitizer import MAX_LOG_CHARS, sanitize_payload
-from evalvault.domain.entities import EvaluationRun
+from evalvault.domain.entities import EvaluationRun, TestCaseResult
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
 
@@ -29,6 +29,17 @@ class MLflowAdapter(TrackerPort):
             tracking_uri: MLflow tracking server URI
             experiment_name: MLflow experiment name
         """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            torch = None  # type: ignore
+        if torch is not None and not hasattr(torch, "Tensor"):
+
+            class _TorchTensor:  # pragma: no cover - guard for namespace package
+                pass
+
+            torch.Tensor = _TorchTensor  # type: ignore[attr-defined]
+
         import mlflow
 
         mlflow.set_tracking_uri(tracking_uri)
@@ -36,6 +47,21 @@ class MLflowAdapter(TrackerPort):
         self._mlflow = mlflow
         self._active_runs: dict[str, Any] = {}  # trace_id -> mlflow run
 
+    def _enable_system_metrics(self) -> None:
+        try:
+            enable_fn = getattr(self._mlflow, "enable_system_metrics_logging", None)
+            if callable(enable_fn):
+                enable_fn()
+        except Exception:  # pragma: no cover - optional dependency
+            return
+
+    def _start_mlflow_run(self, name: str) -> Any:
+        try:
+            return self._mlflow.start_run(run_name=name, log_system_metrics=True)
+        except TypeError:
+            self._enable_system_metrics()
+            return self._mlflow.start_run(run_name=name)
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """
         Start a new MLflow run (mapped to trace).
@@ -47,7 +73,7 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: MLflow run ID
         """
-        run = self._mlflow.start_run(run_name=name)
+        run = self._start_mlflow_run(name)
         trace_id = run.info.run_id
 
         # Log metadata as MLflow parameters (only primitive types)
@@ -59,6 +85,12 @@ class MLflowAdapter(TrackerPort):
         self._active_runs[trace_id] = run
         return trace_id
 
+    def _write_temp_file(self, suffix: str, content: str) -> str:
+        with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as f:
+            f.write(content)
+            f.flush()
+        return f.name
+
     def add_span(
         self,
         trace_id: str,
@@ -89,10 +121,9 @@ class MLflowAdapter(TrackerPort):
             "input": sanitize_payload(input_data, max_chars=MAX_LOG_CHARS),
             "output": sanitize_payload(output_data, max_chars=MAX_LOG_CHARS),
         }
-
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-            json.dump(span_data, f, default=str)
-            self._mlflow.log_artifact(f.name, f"spans/{name}")
+        payload = json.dumps(span_data, default=str)
+        path = self._write_temp_file(".json", payload)
+        self._mlflow.log_artifact(path, f"spans/{name}")
 
     def log_score(
         self,
@@ -145,9 +176,15 @@ class MLflowAdapter(TrackerPort):
             raise ValueError(f"Run not found: {trace_id}")
 
         if artifact_type == "json":
-            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-                json.dump(data, f, default=str)
-                self._mlflow.log_artifact(f.name, f"artifacts/{name}")
+            payload = json.dumps(data, default=str)
+            path = self._write_temp_file(".json", payload)
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        elif artifact_type == "text":
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        else:
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
 
     def end_trace(self, trace_id: str) -> None:
         """
@@ -180,53 +217,171 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: ID of the created MLflow run
         """
-        # 1. Start MLflow run
-        trace_id = self.start_trace(
-            name=f"evaluation-{run.run_id[:8]}",
-            metadata={
-                "dataset_name": run.dataset_name,
-                "dataset_version": run.dataset_version,
-                "model_name": run.model_name,
-                "total_test_cases": run.total_test_cases,
-            },
-        )
-
-        # 2. Log average metric scores
-        for metric_name in run.metrics_evaluated:
-            avg_score = run.get_avg_score(metric_name)
-            if avg_score is not None:
-                self.log_score(trace_id, f"avg_{metric_name}", avg_score)
-
-        # 3. Log overall pass rate
-        self.log_score(trace_id, "pass_rate", run.pass_rate)
-
-        # 4. Log resource usage
-        self._mlflow.log_metric("total_tokens", run.total_tokens)
-
-        if run.duration_seconds:
-            self._mlflow.log_metric("duration_seconds", run.duration_seconds)
-
-        # 5. Save individual test results as artifact
-        results_data = []
-        for result in run.results:
-            result_dict = {
-                "test_case_id": result.test_case_id,
-                "all_passed": result.all_passed,
-                "tokens_used": result.tokens_used,
+
+        def _log_run() -> str:
+            trace_id = self.start_trace(
+                name=f"evaluation-{run.run_id[:8]}",
+                metadata={
+                    "dataset_name": run.dataset_name,
+                    "dataset_version": run.dataset_version,
+                    "model_name": run.model_name,
+                    "total_test_cases": run.total_test_cases,
+                },
+            )
+
+            self._mlflow.set_tag("run_id", run.run_id)
+            self._mlflow.set_tag("model_name", run.model_name)
+            self._mlflow.set_tag("dataset", f"{run.dataset_name}:{run.dataset_version}")
+            if run.tracker_metadata:
+                project_name = run.tracker_metadata.get("project_name")
+                if project_name:
+                    self._mlflow.set_tag("project_name", project_name)
+
+            for metric_name in run.metrics_evaluated:
+                avg_score = run.get_avg_score(metric_name)
+                if avg_score is not None:
+                    self.log_score(trace_id, f"avg_{metric_name}", avg_score)
+
+            self.log_score(trace_id, "pass_rate", run.pass_rate)
+            self._mlflow.log_metric("total_tokens", run.total_tokens)
+            if run.duration_seconds:
+                self._mlflow.log_metric("duration_seconds", run.duration_seconds)
+            if run.total_cost_usd is not None:
+                self._mlflow.log_metric("total_cost_usd", run.total_cost_usd)
+
+            results_data = []
+            for result in run.results:
+                result_dict = {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "tokens_used": result.tokens_used,
+                    "metrics": [
+                        {"name": m.name, "score": m.score, "passed": m.passed}
+                        for m in result.metrics
+                    ],
+                }
+                results_data.append(result_dict)
+                self._trace_test_case(result)
+
+            self.save_artifact(trace_id, "test_results", results_data)
+            self.save_artifact(
+                trace_id,
+                "custom_metric_snapshot",
+                (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            )
+            if run.tracker_metadata:
+                self.save_artifact(trace_id, "tracker_metadata", run.tracker_metadata)
+            self._register_prompts(run)
+
+            self.end_trace(trace_id)
+            return trace_id
+
+        trace_name = f"evaluation-{run.run_id[:8]}"
+        trace_attrs = {
+            "dataset_name": run.dataset_name,
+            "dataset_version": run.dataset_version,
+            "model_name": run.model_name,
+        }
+        try:
+            traced = self._mlflow.trace(
+                name=trace_name, span_type="EVALUATION", attributes=trace_attrs
+            )
+            return traced(_log_run)()
+        except Exception:
+            return _log_run()
+
+    def _register_prompts(self, run: EvaluationRun) -> None:
+        genai = getattr(self._mlflow, "genai", None)
+        if genai is None:
+            return
+        register_fn = getattr(genai, "register_prompt", None)
+        if not callable(register_fn):
+            return
+
+        prompt_entries = self._extract_prompt_entries(run)
+        if not prompt_entries:
+            return
+
+        for entry in prompt_entries:
+            name = entry.get("name") or entry.get("role") or "prompt"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            tags = {
+                "kind": str(entry.get("kind") or "custom"),
+                "role": str(entry.get("role") or ""),
+                "checksum": str(entry.get("checksum") or ""),
+                "run_id": run.run_id,
+            }
+            prompt_set_name = entry.get("prompt_set_name")
+            if prompt_set_name:
+                tags["prompt_set"] = str(prompt_set_name)
+            register_fn(
+                name=name,
+                template=content,
+                commit_message=entry.get("checksum"),
+                tags=tags,
+                model_config={
+                    "model_name": run.model_name,
+                },
+            )
+
+    def _extract_prompt_entries(self, run: EvaluationRun) -> list[dict[str, Any]]:
+        entries: list[dict[str, Any]] = []
+        metadata = run.tracker_metadata or {}
+        prompt_set_detail = metadata.get("prompt_set_detail")
+        if isinstance(prompt_set_detail, dict):
+            prompt_set_name = prompt_set_detail.get("name")
+            for item in prompt_set_detail.get("items", []):
+                prompt = item.get("prompt") or {}
+                if not isinstance(prompt, dict):
+                    continue
+                entries.append(
+                    {
+                        "name": prompt.get("name"),
+                        "role": item.get("role"),
+                        "kind": prompt.get("kind"),
+                        "checksum": prompt.get("checksum"),
+                        "content": prompt.get("content"),
+                        "prompt_set_name": prompt_set_name,
+                    }
+                )
+
+        phoenix_meta = metadata.get("phoenix") or {}
+        if isinstance(phoenix_meta, dict):
+            for entry in phoenix_meta.get("prompts", []) or []:
+                if not isinstance(entry, dict):
+                    continue
+                entries.append(entry)
+        return entries
+
+    def _trace_test_case(self, result: TestCaseResult) -> None:
+        trace_fn = getattr(self._mlflow, "trace", None)
+        if not callable(trace_fn):
+            return
+
+        attrs = {
+            "test_case_id": result.test_case_id,
+            "all_passed": result.all_passed,
+            "tokens_used": result.tokens_used,
+            "latency_ms": result.latency_ms,
+        }
+
+        def _emit() -> dict[str, Any]:
+            return {
                 "metrics": [
                     {"name": m.name, "score": m.score, "passed": m.passed} for m in result.metrics
                 ],
+                "tokens_used": result.tokens_used,
+                "latency_ms": result.latency_ms,
             }
-            results_data.append(result_dict)
-
-        self.save_artifact(trace_id, "test_results", results_data)
-        self.save_artifact(
-            trace_id,
-            "custom_metric_snapshot",
-            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
-        )
 
-        # 6. End MLflow run
-        self.end_trace(trace_id)
-
-        return trace_id
+        try:
+            wrapped = trace_fn(
+                name=f"test_case_{result.test_case_id}",
+                span_type="EVALUATION",
+                attributes=attrs,
+            )
+            wrapped(_emit)()
+        except Exception:
+            return
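
Note: the new _start_mlflow_run helper degrades gracefully across MLflow versions. It first tries the log_system_metrics keyword on start_run() and, when the installed MLflow does not accept it, enables the module-level system-metrics toggle (if present) and starts the run without the keyword. A minimal standalone sketch of the same pattern, with an illustrative run name:

    import mlflow

    def start_run_with_system_metrics(run_name: str):
        """Start an MLflow run with system metrics, tolerating older MLflow releases."""
        try:
            # Newer MLflow releases accept log_system_metrics directly on start_run().
            return mlflow.start_run(run_name=run_name, log_system_metrics=True)
        except TypeError:
            # Older releases: enable system metrics globally if the toggle exists,
            # then start the run without the keyword.
            enable = getattr(mlflow, "enable_system_metrics_logging", None)
            if callable(enable):
                enable()
            return mlflow.start_run(run_name=run_name)

    run = start_run_with_system_metrics("evaluation-demo")  # illustrative run name
    mlflow.end_run()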

evalvault/adapters/outbound/tracker/phoenix_adapter.py
@@ -52,6 +52,8 @@ class PhoenixAdapter(TrackerPort):
         self,
         endpoint: str = "http://localhost:6006/v1/traces",
         service_name: str = "evalvault",
+        project_name: str | None = None,
+        annotations_enabled: bool = True,
     ):
         """Initialize Phoenix adapter with OpenTelemetry.
 
@@ -61,11 +63,14 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
+        self._project_name = project_name
+        self._annotations_enabled = annotations_enabled
         self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
         self._active_spans: dict[str, Any] = {}
         self._tracer_any: Any | None = None
         self._initialized = False
+        self._annotations_client: Any | None = None
 
     def _ensure_initialized(self) -> None:
         """Lazy initialization of OpenTelemetry tracer."""
@@ -96,7 +101,10 @@ class PhoenixAdapter(TrackerPort):
             return
 
         # Create resource with service name
-        resource = Resource.create({"service.name": self._service_name})
+        resource_attributes = {"service.name": self._service_name}
+        if self._project_name:
+            resource_attributes["project.name"] = self._project_name
+        resource = Resource.create(resource_attributes)
 
         # Create tracer provider
         self._tracer_provider = TracerProvider(resource=resource)
@@ -123,6 +131,50 @@ class PhoenixAdapter(TrackerPort):
                 "Failed to initialize Phoenix tracer. Check endpoint configuration and dependencies."
             ) from e
 
+    def _phoenix_base_url(self) -> str:
+        if "/v1/traces" in self._endpoint:
+            return self._endpoint.split("/v1/traces")[0]
+        return self._endpoint.rstrip("/")
+
+    def _get_annotations_client(self) -> Any | None:
+        if not self._annotations_enabled:
+            return None
+        if self._annotations_client is not None:
+            return self._annotations_client
+        try:
+            from phoenix.client import Client
+        except Exception:
+            return None
+        self._annotations_client = Client(base_url=self._phoenix_base_url())
+        return self._annotations_client
+
+    def _annotate_span(
+        self,
+        *,
+        span: Any,
+        name: str,
+        label: str,
+        score: float | None = None,
+        explanation: str | None = None,
+    ) -> None:
+        client = self._get_annotations_client()
+        if client is None or span is None:
+            return
+        try:
+            from opentelemetry.trace import format_span_id
+
+            span_id = format_span_id(span.get_span_context().span_id)
+            client.annotations.add_span_annotation(
+                annotation_name=name,
+                annotator_kind="CODE",
+                span_id=span_id,
+                label=label,
+                score=score,
+                explanation=explanation,
+            )
+        except Exception:
+            return
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """Start a new trace.
 
@@ -328,8 +380,17 @@ class PhoenixAdapter(TrackerPort):
 
         # Set evaluation-specific attributes
         span = self._active_spans[trace_id]
+        span.set_attribute("openinference.span.kind", "EVALUATOR")
         span.set_attribute("evaluation.metrics", json.dumps(run.metrics_evaluated))
         span.set_attribute("evaluation.thresholds", json.dumps(run.thresholds))
+        span.set_attribute("evaluation.status", "pass" if run.pass_rate >= 1.0 else "fail")
+        if run.tracker_metadata:
+            project_name = run.tracker_metadata.get("project_name")
+            if project_name:
+                span.set_attribute("project.name", project_name)
+            project_kind = run.tracker_metadata.get("evaluation_task") or "evaluation"
+            span.set_attribute("project.kind", project_kind)
+            span.set_attribute("project.status", "pass" if run.pass_rate >= 1.0 else "fail")
 
         # Log average scores for each metric
         for metric_name, summary in metric_summary.items():
@@ -369,6 +430,8 @@ class PhoenixAdapter(TrackerPort):
             },
             "metrics": metric_summary,
             "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "prompt_metadata": (run.tracker_metadata or {}).get("phoenix", {}).get("prompts"),
+            "tracker_metadata": run.tracker_metadata,
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
@@ -420,6 +483,23 @@ class PhoenixAdapter(TrackerPort):
             f"test-case-{result.test_case_id}",
             context=context,
         ) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK if result.all_passed else StatusCode.ERROR))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "EVALUATOR")
+            span.set_attribute("evaluation.status", "pass" if result.all_passed else "fail")
+            self._annotate_span(
+                span=span,
+                name="evaluation_result",
+                label="pass" if result.all_passed else "fail",
+                score=1.0 if result.all_passed else 0.0,
+                explanation="All metrics passed"
+                if result.all_passed
+                else "One or more metrics failed",
+            )
             # Input data
             safe_question = sanitize_text(result.question, max_chars=MAX_LOG_CHARS) or ""
             safe_answer = sanitize_text(result.answer, max_chars=MAX_LOG_CHARS) or ""
@@ -439,6 +519,10 @@ class PhoenixAdapter(TrackerPort):
             # Metrics
             span.set_attribute("output.all_passed", result.all_passed)
             span.set_attribute("output.tokens_used", result.tokens_used)
+            if result.tokens_used:
+                span.set_attribute("llm.token_count.total", result.tokens_used)
+            if result.cost_usd is not None:
+                span.set_attribute("llm.cost.total", result.cost_usd)
 
             for metric in result.metrics:
                 span.set_attribute(f"metric.{metric.name}.score", metric.score)
@@ -486,6 +570,7 @@ class PhoenixAdapter(TrackerPort):
             )
             if result.latency_ms:
                 span.set_attribute("timing.latency_ms", result.latency_ms)
+                span.set_attribute("evaluation.latency_ms", result.latency_ms)
 
     def log_retrieval(
         self,
@@ -528,6 +613,13 @@ class PhoenixAdapter(TrackerPort):
         if tracer is None:
             raise RuntimeError("Phoenix tracer is not initialized")
         with tracer.start_span("retrieval", context=context) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "RETRIEVER")
             # Set retrieval attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
@@ -541,14 +633,24 @@ class PhoenixAdapter(TrackerPort):
 
             span.set_attribute("spec.version", "0.1")
             span.set_attribute("rag.module", "retrieve")
+            if data.retrieval_time_ms:
+                span.set_attribute("retrieval.latency_ms", data.retrieval_time_ms)
 
             documents_payload = _build_retrieval_payload(data.candidates)
             span.set_attribute("custom.retrieval.doc_count", len(documents_payload))
             if documents_payload:
                 span.set_attribute("retrieval.documents_json", serialize_json(documents_payload))
-                doc_ids = _extract_doc_ids(documents_payload)
-                if doc_ids:
-                    span.set_attribute("output.value", doc_ids)
+                previews = [
+                    item.get("content_preview")
+                    for item in documents_payload
+                    if item.get("content_preview")
+                ]
+                if previews:
+                    span.set_attribute("output.value", previews)
+                else:
+                    doc_ids = _extract_doc_ids(documents_payload)
+                    if doc_ids:
+                        span.set_attribute("output.value", doc_ids)
 
             # Log each retrieved document as an event
             for i, doc in enumerate(data.candidates):
@@ -615,10 +717,31 @@ class PhoenixAdapter(TrackerPort):
         if tracer is None:
             raise RuntimeError("Phoenix tracer is not initialized")
         with tracer.start_span("generation", context=context) as span:
+            try:
+                from opentelemetry.trace import Status, StatusCode
+
+                span.set_status(Status(StatusCode.OK))
+            except Exception:
+                pass
+            span.set_attribute("openinference.span.kind", "LLM")
             # Set generation attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
 
+            if data.model:
+                span.set_attribute("llm.model_name", data.model)
+                provider = data.model.split("/")[0] if "/" in data.model else ""
+                if provider:
+                    span.set_attribute("llm.provider", provider)
+            if data.input_tokens:
+                span.set_attribute("llm.token_count.prompt", data.input_tokens)
+            if data.output_tokens:
+                span.set_attribute("llm.token_count.completion", data.output_tokens)
+            if data.total_tokens:
+                span.set_attribute("llm.token_count.total", data.total_tokens)
+            if data.cost_usd is not None:
+                span.set_attribute("llm.cost.total", data.cost_usd)
+
             # Set prompt/response (truncate if too long)
             prompt = sanitize_text(data.prompt, max_chars=MAX_LOG_CHARS) or ""
             response = sanitize_text(data.response, max_chars=MAX_LOG_CHARS) or ""
@@ -637,6 +760,13 @@ class PhoenixAdapter(TrackerPort):
             safe_template = sanitize_text(data.prompt_template, max_chars=MAX_LOG_CHARS)
             if safe_template:
                 span.set_attribute("generation.prompt_template", safe_template)
+                span.set_attribute("llm.prompt_template.template", safe_template)
+                span.set_attribute("llm.prompt_template.version", "v1")
+                prompt_vars = data.metadata.get("prompt_variables") if data.metadata else None
+                if prompt_vars:
+                    span.set_attribute(
+                        "llm.prompt_template.variables", json.dumps(prompt_vars, default=str)
+                    )
 
     def log_rag_trace(self, data: RAGTraceData) -> str:
         """Log a full RAG trace (retrieval + generation) to Phoenix."""
@@ -660,6 +790,8 @@ class PhoenixAdapter(TrackerPort):
         span = self._active_spans[trace_id]
         should_end = True
 
+        span.set_attribute("openinference.span.kind", "CHAIN")
+
         for key, value in data.to_span_attributes().items():
             span.set_attribute(key, value)
 
@@ -667,11 +799,21 @@ class PhoenixAdapter(TrackerPort):
             self.log_retrieval(trace_id, data.retrieval)
         if data.generation:
             self.log_generation(trace_id, data.generation)
+        output_preview = ""
         if data.final_answer:
-            preview = sanitize_text(data.final_answer, max_chars=MAX_LOG_CHARS)
-            if preview:
-                span.set_attribute("rag.final_answer", preview)
-                span.set_attribute("output.value", preview)
+            output_preview = sanitize_text(data.final_answer, max_chars=MAX_LOG_CHARS)
+        if not output_preview and data.generation and data.generation.response:
+            output_preview = sanitize_text(data.generation.response, max_chars=MAX_LOG_CHARS)
+        if not output_preview and data.retrieval:
+            previews = [
+                sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS)
+                for doc in data.retrieval.candidates
+                if doc.content
+            ]
+            output_preview = "\n".join(previews[:3])
+        if output_preview:
+            span.set_attribute("rag.final_answer", output_preview)
+            span.set_attribute("output.value", output_preview)
 
         if safe_query:
             span.set_attribute("input.value", safe_query)
@@ -697,7 +839,14 @@ def _build_retrieval_payload(
     payload: list[dict[str, Any]] = []
     for index, doc in enumerate(documents, start=1):
         doc_id = doc.chunk_id or doc.source or doc.metadata.get("doc_id") or f"doc_{index}"
-        item: dict[str, Any] = {"doc_id": doc_id, "score": doc.score}
+        preview = ""
+        if doc.content:
+            preview = sanitize_text(doc.content, max_chars=MAX_CONTEXT_CHARS)
+        item: dict[str, Any] = {
+            "doc_id": doc_id,
+            "score": doc.score,
+            "content_preview": preview,
+        }
         if doc.source:
             item["source"] = doc.source
         if doc.rerank_score is not None:
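
Note: taken together, the phoenix_adapter.py changes tag spans with OpenInference span kinds, optionally attach a project name to the OpenTelemetry resource, and record pass/fail span annotations through the optional phoenix.client dependency (silently skipped when it is not installed). A minimal construction sketch using the new parameters; the endpoint and project name are illustrative values:

    from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter

    # project_name is added to the OTel resource as project.name;
    # annotations_enabled gates the phoenix.client span-annotation path.
    adapter = PhoenixAdapter(
        endpoint="http://localhost:6006/v1/traces",
        service_name="evalvault",
        project_name="insurance-rag-eval",  # illustrative
        annotations_enabled=True,
    )
    trace_id = adapter.start_trace("evaluation-demo", metadata={"dataset_name": "demo"})
    adapter.end_trace(trace_id)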

evalvault/config/instrumentation.py
@@ -26,6 +26,7 @@ _tracer_provider: TracerProvider | None = None
 def setup_phoenix_instrumentation(
     endpoint: str = "http://localhost:6006/v1/traces",
     service_name: str = "evalvault",
+    project_name: str | None = None,
     enable_langchain: bool = True,
     enable_openai: bool = True,
     sample_rate: float = 1.0,
@@ -73,12 +74,13 @@ def setup_phoenix_instrumentation(
         return None
 
     # Create resource with service name
-    resource = Resource.create(
-        {
-            "service.name": service_name,
-            "service.version": "0.1.0",
-        }
-    )
+    resource_attributes = {
+        "service.name": service_name,
+        "service.version": "0.1.0",
+    }
+    if project_name:
+        resource_attributes["project.name"] = project_name
+    resource = Resource.create(resource_attributes)
 
     # Clamp sample rate between 0 and 1
     ratio = max(0.0, min(sample_rate, 1.0))

evalvault/config/phoenix_support.py
@@ -59,10 +59,15 @@ def ensure_phoenix_instrumentation(
     if api_token:
         headers = {"api-key": api_token}
 
+    project_name = getattr(settings, "phoenix_project_name", None)
+    if project_name is not None and not isinstance(project_name, str):
+        project_name = None
+
     try:
         setup_phoenix_instrumentation(
            endpoint=endpoint,
            service_name="evalvault",
+           project_name=project_name,
            sample_rate=sample_rate,
            headers=headers,
        )
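
Note: with the phoenix_support.py change, a phoenix_project_name setting (when present and a string) now flows into the OpenTelemetry resource as project.name. A sketch of the equivalent direct call, with illustrative values:

    from evalvault.config.instrumentation import setup_phoenix_instrumentation

    # project.name is only attached to the resource when a project name is supplied.
    setup_phoenix_instrumentation(
        endpoint="http://localhost:6006/v1/traces",
        service_name="evalvault",
        project_name="insurance-rag-eval",  # illustrative
        sample_rate=1.0,
    )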