evalvault 1.75.0__py3-none-any.whl → 1.77.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -97,7 +97,7 @@ def _resolve_faithfulness_fallback_config(
 
 def _default_faithfulness_fallback_model(provider: str) -> str | None:
     if provider == "ollama":
-        return "gpt-oss-safeguard:20b"
+        return "qwen3:8b"
     if provider == "vllm":
         return "gpt-oss-120b"
     return None
@@ -15,7 +15,7 @@ class PhoenixSyncError(RuntimeError):
 def _normalize_base_url(endpoint: str) -> str:
     """Convert OTLP endpoint (…/v1/traces) to Phoenix REST base URL."""
 
-    if not endpoint:
+    if not isinstance(endpoint, str) or not endpoint:
         return "http://localhost:6006"
     base = endpoint.strip()
     suffix = "/v1/traces"
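
For reference, the tightened guard now short-circuits on non-string values (e.g. None from an unset setting) as well as empty strings. A minimal standalone sketch of the normalization, assuming the suffix handling that follows in the unchanged part of the function:

    def normalize_base_url_sketch(endpoint: object) -> str:
        # Mirrors the new guard: any non-string or empty value falls back
        # to the local Phoenix default.
        if not isinstance(endpoint, str) or not endpoint:
            return "http://localhost:6006"
        base = endpoint.strip()
        suffix = "/v1/traces"
        # Assumed: the real helper strips a trailing OTLP path like this.
        if base.endswith(suffix):
            base = base[: -len(suffix)]
        return base or "http://localhost:6006"

    # normalize_base_url_sketch(None)                            -> "http://localhost:6006"
    # normalize_base_url_sketch("http://phoenix:6006/v1/traces") -> "http://phoenix:6006"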
@@ -104,6 +104,22 @@ class PhoenixSyncService:
                 dataset_description=description,
             )
         except Exception as exc:  # pragma: no cover - HTTP/serialization errors
+            message = str(exc)
+            if "already exists" in message:
+                existing = self._find_dataset_by_name(dataset_name)
+                if existing:
+                    dataset_obj = self._client.datasets.get_dataset(dataset=existing["id"])
+                    dataset_url = self._client.experiments.get_dataset_experiments_url(
+                        dataset_obj.id
+                    )
+                    return PhoenixDatasetInfo(
+                        dataset_id=dataset_obj.id,
+                        dataset_name=dataset_obj.name,
+                        dataset_version_id=dataset_obj.version_id,
+                        url=dataset_url,
+                        description=description,
+                        example_count=getattr(dataset_obj, "examples", None),
+                    )
             raise PhoenixSyncError(f"Dataset upload failed: {exc}") from exc
 
         dataset_url = self._client.experiments.get_dataset_experiments_url(phoenix_dataset.id)
@@ -173,6 +189,74 @@ class PhoenixSyncService:
             )
         return examples
 
+    def _find_dataset_by_name(self, dataset_name: str) -> dict[str, Any] | None:
+        try:
+            datasets = self._client.datasets.list()
+        except Exception:
+            return None
+        for entry in datasets:
+            if entry.get("name") == dataset_name:
+                return entry
+        return None
+
+    def sync_prompts(
+        self,
+        *,
+        prompt_entries: list[dict[str, Any]],
+        model_name: str,
+        model_provider: str,
+        prompt_set_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Create prompt versions in Phoenix Prompt Management."""
+
+        if not prompt_entries:
+            return []
+
+        try:
+            from phoenix.client.resources.prompts import PromptVersion
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise PhoenixSyncError("Phoenix prompt client unavailable") from exc
+
+        synced: list[dict[str, Any]] = []
+        for index, entry in enumerate(prompt_entries, start=1):
+            name = entry.get("name") or entry.get("role") or f"prompt_{index}"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            prompt_version = PromptVersion(
+                [{"role": "system", "content": content}],
+                model_name=model_name,
+                model_provider=model_provider,
+                template_format="NONE",
+            )
+            prompt_metadata = {
+                "kind": entry.get("kind"),
+                "role": entry.get("role"),
+                "checksum": entry.get("checksum"),
+                "status": entry.get("status"),
+                "source": entry.get("source") or entry.get("path"),
+                "order": index,
+            }
+            if prompt_set_name:
+                prompt_metadata["prompt_set"] = prompt_set_name
+            try:
+                version = self._client.prompts.create(
+                    version=prompt_version,
+                    name=name,
+                    prompt_description=entry.get("notes"),
+                    prompt_metadata=_as_serializable(prompt_metadata),
+                )
+                synced.append(
+                    {
+                        **entry,
+                        "phoenix_prompt_version_id": getattr(version, "id", None),
+                    }
+                )
+            except Exception as exc:  # pragma: no cover - HTTP errors
+                raise PhoenixSyncError(f"Prompt sync failed: {exc}") from exc
+
+        return synced
+
     def _build_input_payload(self, test_case: TestCase) -> dict[str, Any]:
         return {
             "question": test_case.question,
@@ -258,6 +342,21 @@ def build_experiment_metadata(
         "total_test_cases": run.total_test_cases,
         "metrics": metrics,
     }
+    if run.results:
+        latencies = [r.latency_ms for r in run.results if r.latency_ms]
+        tokens = [r.tokens_used for r in run.results if r.tokens_used]
+        costs = [r.cost_usd for r in run.results if r.cost_usd is not None]
+        if latencies:
+            payload["avg_latency_ms"] = round(sum(latencies) / len(latencies), 2)
+        if tokens:
+            payload["avg_tokens"] = round(sum(tokens) / len(tokens), 2)
+        if costs:
+            payload["avg_cost_usd"] = round(sum(costs) / len(costs), 6)
+        if run.total_tokens:
+            payload["total_tokens"] = run.total_tokens
+        if run.total_cost_usd is not None:
+            payload["total_cost_usd"] = run.total_cost_usd
+        payload["error_rate"] = round(1 - run.pass_rate, 4)
     if reliability_snapshot:
         payload["reliability_snapshot"] = reliability_snapshot
     if dataset.metadata:
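
To make the new aggregate fields concrete, a small worked example with hypothetical per-result values (the attribute names match the diff; the numbers do not come from it):

    # Three results: (latency_ms, tokens_used, cost_usd)
    #   (120.0, 500, 0.0010), (80.0, 300, 0.0006), (100.0, None, None)
    latencies = [120.0, 80.0, 100.0]   # falsy values are filtered out
    tokens = [500, 300]
    costs = [0.0010, 0.0006]

    avg_latency_ms = round(sum(latencies) / len(latencies), 2)  # 100.0
    avg_tokens = round(sum(tokens) / len(tokens), 2)            # 400.0
    avg_cost_usd = round(sum(costs) / len(costs), 6)            # 0.0008
    error_rate = round(1 - 0.9, 4)                              # pass_rate 0.9 -> 0.1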
@@ -50,6 +50,15 @@ class MarkdownReportAdapter:
         # 통계 분석
         if bundle.statistical:
             sections.append(self._generate_statistical_section(bundle.statistical))
+            sections.append(self._generate_reason_section(bundle.statistical))
+            sections.append(self._generate_meaning_section(bundle.statistical))
+            sections.append(self._generate_dataset_delta_section(bundle.statistical))
+            sections.append(self._generate_improvement_plan_section(bundle.statistical))
+        else:
+            sections.append(self._generate_reason_section(None))
+            sections.append(self._generate_meaning_section(None))
+            sections.append(self._generate_dataset_delta_section(None))
+            sections.append(self._generate_improvement_plan_section(None))
 
         # NLP 분석
         if include_nlp and bundle.has_nlp and bundle.nlp:
@@ -208,6 +217,89 @@ class MarkdownReportAdapter:
 
         return "\n".join(lines)
 
+    def _generate_reason_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 원인/근거"]
+        if stat is None:
+            lines.append(
+                "- 통계 분석 결과가 없어 원인/근거를 도출할 수 없습니다. (추가 데이터 필요)"
+            )
+            return "\n".join(lines)
+
+        if stat.low_performers:
+            for lp in stat.low_performers[:5]:
+                lines.append(
+                    f"- {lp.test_case_id}: {lp.metric_name} {lp.score:.2f} < {lp.threshold:.2f}"
+                )
+        elif stat.insights:
+            for insight in stat.insights[:5]:
+                lines.append(f"- {insight}")
+        else:
+            lines.append("- 추가 데이터 필요")
+        return "\n".join(lines)
+
+    def _generate_meaning_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 결과 의미"]
+        if stat is None:
+            lines.append("- 통계 분석 결과가 없어 의미를 해석할 수 없습니다. (추가 데이터 필요)")
+            return "\n".join(lines)
+
+        if stat.overall_pass_rate < 0.7:
+            lines.append("- 전체 통과율이 낮아 사용자 신뢰/정확성 리스크가 큽니다.")
+        else:
+            lines.append("- 전체 통과율이 기준 이상으로 기본 품질은 유지됩니다.")
+
+        low_metrics = [
+            metric for metric, rate in (stat.metric_pass_rates or {}).items() if rate < 0.7
+        ]
+        if low_metrics:
+            metrics_str = ", ".join(sorted(low_metrics)[:6])
+            lines.append(f"- 기준 미달 메트릭: {metrics_str}")
+        return "\n".join(lines)
+
+    def _generate_dataset_delta_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 데이터셋 차이"]
+        if stat is None:
+            lines.append("- 데이터셋 기준 차이를 판단할 수 없습니다. (추가 데이터 필요)")
+            return "\n".join(lines)
+
+        low_metrics = [
+            metric for metric, rate in (stat.metric_pass_rates or {}).items() if rate < 0.7
+        ]
+        if low_metrics:
+            lines.append("- 데이터셋 기준 미달 지표: " + ", ".join(sorted(low_metrics)[:6]))
+        else:
+            lines.append("- 데이터셋 기준 미달 지표가 없습니다.")
+        return "\n".join(lines)
+
+    def _generate_improvement_plan_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 개선 방향"]
+        if stat is None:
+            lines.append("- 개선 방향 도출을 위한 분석 결과가 부족합니다. (추가 데이터 필요)")
+            return "\n".join(lines)
+
+        action_map = {
+            "context_precision": "랭커/리랭커 도입 및 상위 문서 필터링 강화",
+            "context_recall": "검색 범위 확장 또는 하드 네거티브 추가",
+            "mrr": "상위 K 재정렬 및 쿼리 재작성 적용",
+            "ndcg": "랭킹 품질 지표 최적화(리랭킹/하이브리드 검색)",
+            "hit_rate": "검색 후보군 확대 또는 인덱싱 개선",
+            "answer_relevancy": "답변 포맷/질문 의도 정렬 프롬프트 강화",
+            "faithfulness": "근거 인용/검증 단계 추가",
+            "factual_correctness": "정답 검증 규칙 강화 및 근거 필터링",
+            "semantic_similarity": "정답 기준 문장 재정의 및 평가셋 보강",
+        }
+
+        low_metrics = [
+            metric for metric, rate in (stat.metric_pass_rates or {}).items() if rate < 0.7
+        ]
+        if low_metrics:
+            for metric in sorted(low_metrics)[:5]:
+                action = action_map.get(metric, "실험을 통해 개선 방향을 재검증")
+                lines.append(f"- {metric}: {action}")
+        else:
+            lines.append("- 개선 대상 지표가 명확하지 않습니다. (추가 데이터 필요)")
+        return "\n".join(lines)
+
     def _generate_nlp_section(self, nlp: NLPAnalysis) -> str:
         """NLP 분석 섹션 생성."""
         lines = ["## NLP 분석"]
@@ -19,12 +19,9 @@ def build_storage_adapter(
 ) -> StoragePort:
     resolved_settings = settings or Settings()
 
-    if db_path is not None:
-        return SQLiteStorageAdapter(db_path=db_path)
-
     backend = getattr(resolved_settings, "db_backend", "postgres")
     if backend == "sqlite":
-        resolved_db_path = resolved_settings.evalvault_db_path
+        resolved_db_path = db_path or resolved_settings.evalvault_db_path
         if resolved_db_path is None:
            raise RuntimeError("SQLite backend selected but evalvault_db_path is not set.")
         return SQLiteStorageAdapter(db_path=resolved_db_path)
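
The net effect of this hunk is that an explicit db_path no longer forces SQLite by itself; it only overrides the configured path when the backend is already sqlite. A hedged sketch, assuming Settings accepts these fields as constructor arguments:

    # Illustrative only; field names are taken from the diff context.
    settings = Settings(db_backend="sqlite", evalvault_db_path="evalvault.db")

    # db_path wins over evalvault_db_path because the backend is "sqlite".
    adapter = build_storage_adapter(db_path="override.db", settings=settings)

    # With db_backend="postgres", the same db_path argument is ignored
    # instead of silently switching the storage backend to SQLite.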
@@ -5,7 +5,7 @@ import tempfile
 from typing import Any
 
 from evalvault.adapters.outbound.tracker.log_sanitizer import MAX_LOG_CHARS, sanitize_payload
-from evalvault.domain.entities import EvaluationRun
+from evalvault.domain.entities import EvaluationRun, TestCaseResult
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
 
@@ -29,6 +29,17 @@ class MLflowAdapter(TrackerPort):
             tracking_uri: MLflow tracking server URI
             experiment_name: MLflow experiment name
         """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            torch = None  # type: ignore
+        if torch is not None and not hasattr(torch, "Tensor"):
+
+            class _TorchTensor:  # pragma: no cover - guard for namespace package
+                pass
+
+            torch.Tensor = _TorchTensor  # type: ignore[attr-defined]
+
         import mlflow
 
         mlflow.set_tracking_uri(tracking_uri)
@@ -36,6 +47,21 @@ class MLflowAdapter(TrackerPort):
         self._mlflow = mlflow
         self._active_runs: dict[str, Any] = {}  # trace_id -> mlflow run
 
+    def _enable_system_metrics(self) -> None:
+        try:
+            enable_fn = getattr(self._mlflow, "enable_system_metrics_logging", None)
+            if callable(enable_fn):
+                enable_fn()
+        except Exception:  # pragma: no cover - optional dependency
+            return
+
+    def _start_mlflow_run(self, name: str) -> Any:
+        try:
+            return self._mlflow.start_run(run_name=name, log_system_metrics=True)
+        except TypeError:
+            self._enable_system_metrics()
+            return self._mlflow.start_run(run_name=name)
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """
         Start a new MLflow run (mapped to trace).
@@ -47,7 +73,7 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: MLflow run ID
         """
-        run = self._mlflow.start_run(run_name=name)
+        run = self._start_mlflow_run(name)
         trace_id = run.info.run_id
 
         # Log metadata as MLflow parameters (only primitive types)
@@ -59,6 +85,12 @@ class MLflowAdapter(TrackerPort):
         self._active_runs[trace_id] = run
         return trace_id
 
+    def _write_temp_file(self, suffix: str, content: str) -> str:
+        with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as f:
+            f.write(content)
+            f.flush()
+            return f.name
+
     def add_span(
         self,
         trace_id: str,
@@ -89,10 +121,9 @@ class MLflowAdapter(TrackerPort):
             "input": sanitize_payload(input_data, max_chars=MAX_LOG_CHARS),
             "output": sanitize_payload(output_data, max_chars=MAX_LOG_CHARS),
         }
-
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-            json.dump(span_data, f, default=str)
-            self._mlflow.log_artifact(f.name, f"spans/{name}")
+        payload = json.dumps(span_data, default=str)
+        path = self._write_temp_file(".json", payload)
+        self._mlflow.log_artifact(path, f"spans/{name}")
 
     def log_score(
         self,
@@ -145,9 +176,15 @@ class MLflowAdapter(TrackerPort):
             raise ValueError(f"Run not found: {trace_id}")
 
         if artifact_type == "json":
-            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-                json.dump(data, f, default=str)
-                self._mlflow.log_artifact(f.name, f"artifacts/{name}")
+            payload = json.dumps(data, default=str)
+            path = self._write_temp_file(".json", payload)
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        elif artifact_type == "text":
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        else:
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
 
     def end_trace(self, trace_id: str) -> None:
         """
@@ -180,53 +217,171 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: ID of the created MLflow run
         """
-        # 1. Start MLflow run
-        trace_id = self.start_trace(
-            name=f"evaluation-{run.run_id[:8]}",
-            metadata={
-                "dataset_name": run.dataset_name,
-                "dataset_version": run.dataset_version,
-                "model_name": run.model_name,
-                "total_test_cases": run.total_test_cases,
-            },
-        )
-
-        # 2. Log average metric scores
-        for metric_name in run.metrics_evaluated:
-            avg_score = run.get_avg_score(metric_name)
-            if avg_score is not None:
-                self.log_score(trace_id, f"avg_{metric_name}", avg_score)
-
-        # 3. Log overall pass rate
-        self.log_score(trace_id, "pass_rate", run.pass_rate)
-
-        # 4. Log resource usage
-        self._mlflow.log_metric("total_tokens", run.total_tokens)
-
-        if run.duration_seconds:
-            self._mlflow.log_metric("duration_seconds", run.duration_seconds)
-
-        # 5. Save individual test results as artifact
-        results_data = []
-        for result in run.results:
-            result_dict = {
-                "test_case_id": result.test_case_id,
-                "all_passed": result.all_passed,
-                "tokens_used": result.tokens_used,
+
+        def _log_run() -> str:
+            trace_id = self.start_trace(
+                name=f"evaluation-{run.run_id[:8]}",
+                metadata={
+                    "dataset_name": run.dataset_name,
+                    "dataset_version": run.dataset_version,
+                    "model_name": run.model_name,
+                    "total_test_cases": run.total_test_cases,
+                },
+            )
+
+            self._mlflow.set_tag("run_id", run.run_id)
+            self._mlflow.set_tag("model_name", run.model_name)
+            self._mlflow.set_tag("dataset", f"{run.dataset_name}:{run.dataset_version}")
+            if run.tracker_metadata:
+                project_name = run.tracker_metadata.get("project_name")
+                if project_name:
+                    self._mlflow.set_tag("project_name", project_name)
+
+            for metric_name in run.metrics_evaluated:
+                avg_score = run.get_avg_score(metric_name)
+                if avg_score is not None:
+                    self.log_score(trace_id, f"avg_{metric_name}", avg_score)
+
+            self.log_score(trace_id, "pass_rate", run.pass_rate)
+            self._mlflow.log_metric("total_tokens", run.total_tokens)
+            if run.duration_seconds:
+                self._mlflow.log_metric("duration_seconds", run.duration_seconds)
+            if run.total_cost_usd is not None:
+                self._mlflow.log_metric("total_cost_usd", run.total_cost_usd)
+
+            results_data = []
+            for result in run.results:
+                result_dict = {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "tokens_used": result.tokens_used,
+                    "metrics": [
+                        {"name": m.name, "score": m.score, "passed": m.passed}
+                        for m in result.metrics
+                    ],
+                }
+                results_data.append(result_dict)
+                self._trace_test_case(result)
+
+            self.save_artifact(trace_id, "test_results", results_data)
+            self.save_artifact(
+                trace_id,
+                "custom_metric_snapshot",
+                (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            )
+            if run.tracker_metadata:
+                self.save_artifact(trace_id, "tracker_metadata", run.tracker_metadata)
+            self._register_prompts(run)
+
+            self.end_trace(trace_id)
+            return trace_id
+
+        trace_name = f"evaluation-{run.run_id[:8]}"
+        trace_attrs = {
+            "dataset_name": run.dataset_name,
+            "dataset_version": run.dataset_version,
+            "model_name": run.model_name,
+        }
+        try:
+            traced = self._mlflow.trace(
+                name=trace_name, span_type="EVALUATION", attributes=trace_attrs
+            )
+            return traced(_log_run)()
+        except Exception:
+            return _log_run()
+
+    def _register_prompts(self, run: EvaluationRun) -> None:
+        genai = getattr(self._mlflow, "genai", None)
+        if genai is None:
+            return
+        register_fn = getattr(genai, "register_prompt", None)
+        if not callable(register_fn):
+            return
+
+        prompt_entries = self._extract_prompt_entries(run)
+        if not prompt_entries:
+            return
+
+        for entry in prompt_entries:
+            name = entry.get("name") or entry.get("role") or "prompt"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            tags = {
+                "kind": str(entry.get("kind") or "custom"),
+                "role": str(entry.get("role") or ""),
+                "checksum": str(entry.get("checksum") or ""),
+                "run_id": run.run_id,
+            }
+            prompt_set_name = entry.get("prompt_set_name")
+            if prompt_set_name:
+                tags["prompt_set"] = str(prompt_set_name)
+            register_fn(
+                name=name,
+                template=content,
+                commit_message=entry.get("checksum"),
+                tags=tags,
+                model_config={
+                    "model_name": run.model_name,
+                },
+            )
+
+    def _extract_prompt_entries(self, run: EvaluationRun) -> list[dict[str, Any]]:
+        entries: list[dict[str, Any]] = []
+        metadata = run.tracker_metadata or {}
+        prompt_set_detail = metadata.get("prompt_set_detail")
+        if isinstance(prompt_set_detail, dict):
+            prompt_set_name = prompt_set_detail.get("name")
+            for item in prompt_set_detail.get("items", []):
+                prompt = item.get("prompt") or {}
+                if not isinstance(prompt, dict):
+                    continue
+                entries.append(
+                    {
+                        "name": prompt.get("name"),
+                        "role": item.get("role"),
+                        "kind": prompt.get("kind"),
+                        "checksum": prompt.get("checksum"),
+                        "content": prompt.get("content"),
+                        "prompt_set_name": prompt_set_name,
+                    }
+                )
+
+        phoenix_meta = metadata.get("phoenix") or {}
+        if isinstance(phoenix_meta, dict):
+            for entry in phoenix_meta.get("prompts", []) or []:
+                if not isinstance(entry, dict):
+                    continue
+                entries.append(entry)
+        return entries
+
+    def _trace_test_case(self, result: TestCaseResult) -> None:
+        trace_fn = getattr(self._mlflow, "trace", None)
+        if not callable(trace_fn):
+            return
+
+        attrs = {
+            "test_case_id": result.test_case_id,
+            "all_passed": result.all_passed,
+            "tokens_used": result.tokens_used,
+            "latency_ms": result.latency_ms,
+        }
+
+        def _emit() -> dict[str, Any]:
+            return {
                 "metrics": [
                     {"name": m.name, "score": m.score, "passed": m.passed} for m in result.metrics
                 ],
+                "tokens_used": result.tokens_used,
+                "latency_ms": result.latency_ms,
             }
-            results_data.append(result_dict)
-
-        self.save_artifact(trace_id, "test_results", results_data)
-        self.save_artifact(
-            trace_id,
-            "custom_metric_snapshot",
-            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
-        )
 
-        # 6. End MLflow run
-        self.end_trace(trace_id)
-
-        return trace_id
+        try:
+            wrapped = trace_fn(
+                name=f"test_case_{result.test_case_id}",
+                span_type="EVALUATION",
+                attributes=attrs,
+            )
+            wrapped(_emit)()
+        except Exception:
+            return