evalvault 1.75.0-py3-none-any.whl → 1.77.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +123 -64
- evalvault/adapters/inbound/api/main.py +2 -0
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/cli/app.py +3 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +6 -1
- evalvault/adapters/inbound/cli/commands/method.py +3 -3
- evalvault/adapters/inbound/cli/commands/run.py +153 -30
- evalvault/adapters/inbound/cli/commands/run_helpers.py +166 -62
- evalvault/adapters/outbound/analysis/llm_report_module.py +515 -33
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/phoenix/sync_service.py +100 -1
- evalvault/adapters/outbound/report/markdown_adapter.py +92 -0
- evalvault/adapters/outbound/storage/factory.py +1 -4
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +178 -12
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/runtime_services.py +122 -0
- evalvault/config/settings.py +40 -4
- evalvault/domain/services/evaluator.py +2 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/METADATA +2 -1
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/RECORD +25 -24
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/WHEEL +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -97,7 +97,7 @@ def _resolve_faithfulness_fallback_config(

 def _default_faithfulness_fallback_model(provider: str) -> str | None:
     if provider == "ollama":
-        return "
+        return "qwen3:8b"
     if provider == "vllm":
         return "gpt-oss-120b"
     return None
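The only change here is the Ollama default; the old value is truncated in the diff and not recoverable. A minimal sketch of the resulting behavior, re-declaring the helper locally for illustration (the real function lives in the evalvault module this hunk touches):

    # Sketch: mirrors the 1.77.0 defaults shown in the hunk above.
    def _default_faithfulness_fallback_model(provider: str) -> str | None:
        if provider == "ollama":
            return "qwen3:8b"      # new Ollama default in 1.77.0
        if provider == "vllm":
            return "gpt-oss-120b"  # unchanged
        return None                # other providers: no fallback model

    assert _default_faithfulness_fallback_model("ollama") == "qwen3:8b"
    assert _default_faithfulness_fallback_model("openai") is None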
@@ -15,7 +15,7 @@ class PhoenixSyncError(RuntimeError):
 def _normalize_base_url(endpoint: str) -> str:
     """Convert OTLP endpoint (…/v1/traces) to Phoenix REST base URL."""

-    if not endpoint:
+    if not isinstance(endpoint, str) or not endpoint:
         return "http://localhost:6006"
     base = endpoint.strip()
     suffix = "/v1/traces"
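What the tightened guard buys: any non-string endpoint (for example None from an unset setting) or an empty string now falls back to the local Phoenix default instead of failing on `.strip()`. In the sketch below, only the guard is confirmed by this hunk; the suffix-stripping tail is an assumption based on the docstring and the `suffix` variable:

    def _normalize_base_url(endpoint) -> str:
        if not isinstance(endpoint, str) or not endpoint:
            return "http://localhost:6006"
        base = endpoint.strip()
        suffix = "/v1/traces"
        # Assumed remainder: drop the OTLP path to get the REST base URL.
        return base[: -len(suffix)] if base.endswith(suffix) else base

    assert _normalize_base_url(None) == "http://localhost:6006"
    assert _normalize_base_url("http://phoenix:6006/v1/traces") == "http://phoenix:6006"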
@@ -104,6 +104,22 @@ class PhoenixSyncService:
                 dataset_description=description,
             )
         except Exception as exc:  # pragma: no cover - HTTP/serialization errors
+            message = str(exc)
+            if "already exists" in message:
+                existing = self._find_dataset_by_name(dataset_name)
+                if existing:
+                    dataset_obj = self._client.datasets.get_dataset(dataset=existing["id"])
+                    dataset_url = self._client.experiments.get_dataset_experiments_url(
+                        dataset_obj.id
+                    )
+                    return PhoenixDatasetInfo(
+                        dataset_id=dataset_obj.id,
+                        dataset_name=dataset_obj.name,
+                        dataset_version_id=dataset_obj.version_id,
+                        url=dataset_url,
+                        description=description,
+                        example_count=getattr(dataset_obj, "examples", None),
+                    )
             raise PhoenixSyncError(f"Dataset upload failed: {exc}") from exc

         dataset_url = self._client.experiments.get_dataset_experiments_url(phoenix_dataset.id)
@@ -173,6 +189,74 @@ class PhoenixSyncService:
             )
         return examples

+    def _find_dataset_by_name(self, dataset_name: str) -> dict[str, Any] | None:
+        try:
+            datasets = self._client.datasets.list()
+        except Exception:
+            return None
+        for entry in datasets:
+            if entry.get("name") == dataset_name:
+                return entry
+        return None
+
+    def sync_prompts(
+        self,
+        *,
+        prompt_entries: list[dict[str, Any]],
+        model_name: str,
+        model_provider: str,
+        prompt_set_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Create prompt versions in Phoenix Prompt Management."""
+
+        if not prompt_entries:
+            return []
+
+        try:
+            from phoenix.client.resources.prompts import PromptVersion
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise PhoenixSyncError("Phoenix prompt client unavailable") from exc
+
+        synced: list[dict[str, Any]] = []
+        for index, entry in enumerate(prompt_entries, start=1):
+            name = entry.get("name") or entry.get("role") or f"prompt_{index}"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            prompt_version = PromptVersion(
+                [{"role": "system", "content": content}],
+                model_name=model_name,
+                model_provider=model_provider,
+                template_format="NONE",
+            )
+            prompt_metadata = {
+                "kind": entry.get("kind"),
+                "role": entry.get("role"),
+                "checksum": entry.get("checksum"),
+                "status": entry.get("status"),
+                "source": entry.get("source") or entry.get("path"),
+                "order": index,
+            }
+            if prompt_set_name:
+                prompt_metadata["prompt_set"] = prompt_set_name
+            try:
+                version = self._client.prompts.create(
+                    version=prompt_version,
+                    name=name,
+                    prompt_description=entry.get("notes"),
+                    prompt_metadata=_as_serializable(prompt_metadata),
+                )
+                synced.append(
+                    {
+                        **entry,
+                        "phoenix_prompt_version_id": getattr(version, "id", None),
+                    }
+                )
+            except Exception as exc:  # pragma: no cover - HTTP errors
+                raise PhoenixSyncError(f"Prompt sync failed: {exc}") from exc
+
+        return synced
+
     def _build_input_payload(self, test_case: TestCase) -> dict[str, Any]:
         return {
             "question": test_case.question,
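A hypothetical call site for the new sync_prompts() method; `sync_service` is assumed to be an already-configured PhoenixSyncService instance, and the entry fields are the ones the method reads above:

    entries = [
        {
            "name": "rag-system-prompt",
            "role": "system",
            "content": "You are a careful RAG assistant. Cite your contexts.",
            "checksum": "abc123",
        }
    ]
    synced = sync_service.sync_prompts(
        prompt_entries=entries,
        model_name="gpt-4o-mini",
        model_provider="openai",
        prompt_set_name="default",
    )
    # Each returned dict is the original entry plus "phoenix_prompt_version_id";
    # entries with empty content are skipped rather than synced.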
@@ -258,6 +342,21 @@ def build_experiment_metadata(
         "total_test_cases": run.total_test_cases,
         "metrics": metrics,
     }
+    if run.results:
+        latencies = [r.latency_ms for r in run.results if r.latency_ms]
+        tokens = [r.tokens_used for r in run.results if r.tokens_used]
+        costs = [r.cost_usd for r in run.results if r.cost_usd is not None]
+        if latencies:
+            payload["avg_latency_ms"] = round(sum(latencies) / len(latencies), 2)
+        if tokens:
+            payload["avg_tokens"] = round(sum(tokens) / len(tokens), 2)
+        if costs:
+            payload["avg_cost_usd"] = round(sum(costs) / len(costs), 6)
+    if run.total_tokens:
+        payload["total_tokens"] = run.total_tokens
+    if run.total_cost_usd is not None:
+        payload["total_cost_usd"] = run.total_cost_usd
+    payload["error_rate"] = round(1 - run.pass_rate, 4)
     if reliability_snapshot:
         payload["reliability_snapshot"] = reliability_snapshot
     if dataset.metadata:
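A worked example of the new aggregate fields, using the same formulas as the hunk above. Note the filters: latency_ms and tokens_used are kept only when truthy (so 0 is skipped), while cost_usd keeps 0.0 and skips only None.

    latencies = [120.0, 80.0, 100.0]   # r.latency_ms values
    tokens = [900, 1100]               # r.tokens_used values
    costs = [0.0012, 0.0018]           # r.cost_usd values
    pass_rate = 0.85

    avg_latency_ms = round(sum(latencies) / len(latencies), 2)  # 100.0
    avg_tokens = round(sum(tokens) / len(tokens), 2)            # 1000.0
    avg_cost_usd = round(sum(costs) / len(costs), 6)            # 0.0015
    error_rate = round(1 - pass_rate, 4)                        # 0.15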
@@ -50,6 +50,15 @@ class MarkdownReportAdapter:
         # 통계 분석
         if bundle.statistical:
             sections.append(self._generate_statistical_section(bundle.statistical))
+            sections.append(self._generate_reason_section(bundle.statistical))
+            sections.append(self._generate_meaning_section(bundle.statistical))
+            sections.append(self._generate_dataset_delta_section(bundle.statistical))
+            sections.append(self._generate_improvement_plan_section(bundle.statistical))
+        else:
+            sections.append(self._generate_reason_section(None))
+            sections.append(self._generate_meaning_section(None))
+            sections.append(self._generate_dataset_delta_section(None))
+            sections.append(self._generate_improvement_plan_section(None))

         # NLP 분석
         if include_nlp and bundle.has_nlp and bundle.nlp:
@@ -208,6 +217,89 @@ class MarkdownReportAdapter:

         return "\n".join(lines)

+    def _generate_reason_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 원인/근거"]
+        if stat is None:
+            lines.append(
+                "- 통계 분석 결과가 없어 원인/근거를 도출할 수 없습니다. (추가 데이터 필요)"
+            )
+            return "\n".join(lines)
+
+        if stat.low_performers:
+            for lp in stat.low_performers[:5]:
+                lines.append(
+                    f"- {lp.test_case_id}: {lp.metric_name} {lp.score:.2f} < {lp.threshold:.2f}"
+                )
+        elif stat.insights:
+            for insight in stat.insights[:5]:
+                lines.append(f"- {insight}")
+        else:
+            lines.append("- 추가 데이터 필요")
+        return "\n".join(lines)
+
+    def _generate_meaning_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 결과 의미"]
+        if stat is None:
+            lines.append("- 통계 분석 결과가 없어 의미를 해석할 수 없습니다. (추가 데이터 필요)")
+            return "\n".join(lines)
+
+        if stat.overall_pass_rate < 0.7:
+            lines.append("- 전체 통과율이 낮아 사용자 신뢰/정확성 리스크가 큽니다.")
+        else:
+            lines.append("- 전체 통과율이 기준 이상으로 기본 품질은 유지됩니다.")
+
+        low_metrics = [
+            metric for metric, rate in (stat.metric_pass_rates or {}).items() if rate < 0.7
+        ]
+        if low_metrics:
+            metrics_str = ", ".join(sorted(low_metrics)[:6])
+            lines.append(f"- 기준 미달 메트릭: {metrics_str}")
+        return "\n".join(lines)
+
+    def _generate_dataset_delta_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 데이터셋 차이"]
+        if stat is None:
+            lines.append("- 데이터셋 기준 차이를 판단할 수 없습니다. (추가 데이터 필요)")
+            return "\n".join(lines)
+
+        low_metrics = [
+            metric for metric, rate in (stat.metric_pass_rates or {}).items() if rate < 0.7
+        ]
+        if low_metrics:
+            lines.append("- 데이터셋 기준 미달 지표: " + ", ".join(sorted(low_metrics)[:6]))
+        else:
+            lines.append("- 데이터셋 기준 미달 지표가 없습니다.")
+        return "\n".join(lines)
+
+    def _generate_improvement_plan_section(self, stat: StatisticalAnalysis | None) -> str:
+        lines = ["## 개선 방향"]
+        if stat is None:
+            lines.append("- 개선 방향 도출을 위한 분석 결과가 부족합니다. (추가 데이터 필요)")
+            return "\n".join(lines)
+
+        action_map = {
+            "context_precision": "랭커/리랭커 도입 및 상위 문서 필터링 강화",
+            "context_recall": "검색 범위 확장 또는 하드 네거티브 추가",
+            "mrr": "상위 K 재정렬 및 쿼리 재작성 적용",
+            "ndcg": "랭킹 품질 지표 최적화(리랭킹/하이브리드 검색)",
+            "hit_rate": "검색 후보군 확대 또는 인덱싱 개선",
+            "answer_relevancy": "답변 포맷/질문 의도 정렬 프롬프트 강화",
+            "faithfulness": "근거 인용/검증 단계 추가",
+            "factual_correctness": "정답 검증 규칙 강화 및 근거 필터링",
+            "semantic_similarity": "정답 기준 문장 재정의 및 평가셋 보강",
+        }
+
+        low_metrics = [
+            metric for metric, rate in (stat.metric_pass_rates or {}).items() if rate < 0.7
+        ]
+        if low_metrics:
+            for metric in sorted(low_metrics)[:5]:
+                action = action_map.get(metric, "실험을 통해 개선 방향을 재검증")
+                lines.append(f"- {metric}: {action}")
+        else:
+            lines.append("- 개선 대상 지표가 명확하지 않습니다. (추가 데이터 필요)")
+        return "\n".join(lines)
+
     def _generate_nlp_section(self, nlp: NLPAnalysis) -> str:
         """NLP 분석 섹션 생성."""
         lines = ["## NLP 분석"]
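For orientation, the four new sections render as Markdown blocks like the following (illustrative output, assuming a StatisticalAnalysis whose metric_pass_rates puts only faithfulness below the 0.7 threshold and that lists one low performer):

    ## 원인/근거
    - tc-012: faithfulness 0.55 < 0.70

    ## 결과 의미
    - 전체 통과율이 기준 이상으로 기본 품질은 유지됩니다.
    - 기준 미달 메트릭: faithfulness

    ## 데이터셋 차이
    - 데이터셋 기준 미달 지표: faithfulness

    ## 개선 방향
    - faithfulness: 근거 인용/검증 단계 추가

When no statistical analysis is available, each section still appears but degrades to a single "추가 데이터 필요" style bullet.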
@@ -19,12 +19,9 @@ def build_storage_adapter(
 ) -> StoragePort:
     resolved_settings = settings or Settings()

-    if db_path is not None:
-        return SQLiteStorageAdapter(db_path=db_path)
-
     backend = getattr(resolved_settings, "db_backend", "postgres")
     if backend == "sqlite":
-        resolved_db_path = resolved_settings.evalvault_db_path
+        resolved_db_path = db_path or resolved_settings.evalvault_db_path
         if resolved_db_path is None:
             raise RuntimeError("SQLite backend selected but evalvault_db_path is not set.")
         return SQLiteStorageAdapter(db_path=resolved_db_path)
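The behavioral change, sketched as a call site. In 1.75.0 an explicit db_path always forced SQLite; in 1.77.0 it is only honored when the configured backend is "sqlite". Constructing Settings with a keyword override here is an assumption for illustration:

    # Assumed construction; only db_backend and evalvault_db_path are confirmed by this hunk.
    adapter = build_storage_adapter(db_path="eval.db", settings=Settings(db_backend="sqlite"))
    # -> SQLiteStorageAdapter(db_path="eval.db"); without db_path it falls back to
    #    settings.evalvault_db_path, or raises RuntimeError if that is unset.
    # With db_backend="postgres", db_path is now ignored and the factory follows the
    # configured backend instead (the non-SQLite branch is not shown in this hunk).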
@@ -5,7 +5,7 @@ import tempfile
 from typing import Any

 from evalvault.adapters.outbound.tracker.log_sanitizer import MAX_LOG_CHARS, sanitize_payload
-from evalvault.domain.entities import EvaluationRun
+from evalvault.domain.entities import EvaluationRun, TestCaseResult
 from evalvault.ports.outbound.tracker_port import TrackerPort


@@ -29,6 +29,17 @@ class MLflowAdapter(TrackerPort):
             tracking_uri: MLflow tracking server URI
             experiment_name: MLflow experiment name
         """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            torch = None  # type: ignore
+        if torch is not None and not hasattr(torch, "Tensor"):
+
+            class _TorchTensor:  # pragma: no cover - guard for namespace package
+                pass
+
+            torch.Tensor = _TorchTensor  # type: ignore[attr-defined]
+
         import mlflow

         mlflow.set_tracking_uri(tracking_uri)
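Per the pragma comment, the guard appears to protect the later `import mlflow` when a namespace-only or broken `torch` package is importable but exposes no Tensor attribute. Standalone, the pattern is:

    try:
        import torch  # type: ignore
    except Exception:
        torch = None  # type: ignore

    if torch is not None and not hasattr(torch, "Tensor"):
        class _TorchTensor:  # placeholder type so attribute/isinstance checks do not crash
            pass

        torch.Tensor = _TorchTensor  # type: ignore[attr-defined]

    import mlflow  # imported only after the guard, as in the adapter's __init__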
@@ -36,6 +47,21 @@ class MLflowAdapter(TrackerPort):
         self._mlflow = mlflow
         self._active_runs: dict[str, Any] = {}  # trace_id -> mlflow run

+    def _enable_system_metrics(self) -> None:
+        try:
+            enable_fn = getattr(self._mlflow, "enable_system_metrics_logging", None)
+            if callable(enable_fn):
+                enable_fn()
+        except Exception:  # pragma: no cover - optional dependency
+            return
+
+    def _start_mlflow_run(self, name: str) -> Any:
+        try:
+            return self._mlflow.start_run(run_name=name, log_system_metrics=True)
+        except TypeError:
+            self._enable_system_metrics()
+            return self._mlflow.start_run(run_name=name)
+
     def start_trace(self, name: str, metadata: dict[str, Any] | None = None) -> str:
         """
         Start a new MLflow run (mapped to trace).
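The two helpers implement keyword feature detection: newer MLflow releases accept log_system_metrics on start_run, older ones raise TypeError, in which case system-metrics logging is enabled globally (when that API exists) before starting a plain run. Generically, the same pattern looks like this:

    def start_run_with_system_metrics(mlflow_module, name: str):
        """Prefer the per-run flag; fall back to the global switch on older MLflow."""
        try:
            return mlflow_module.start_run(run_name=name, log_system_metrics=True)
        except TypeError:  # start_run() has no log_system_metrics parameter
            enable = getattr(mlflow_module, "enable_system_metrics_logging", None)
            if callable(enable):
                enable()
            return mlflow_module.start_run(run_name=name)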
@@ -47,7 +73,7 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: MLflow run ID
         """
-        run = self.
+        run = self._start_mlflow_run(name)
         trace_id = run.info.run_id

         # Log metadata as MLflow parameters (only primitive types)
@@ -59,6 +85,12 @@ class MLflowAdapter(TrackerPort):
         self._active_runs[trace_id] = run
         return trace_id

+    def _write_temp_file(self, suffix: str, content: str) -> str:
+        with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as f:
+            f.write(content)
+            f.flush()
+        return f.name
+
     def add_span(
         self,
         trace_id: str,
@@ -89,10 +121,9 @@ class MLflowAdapter(TrackerPort):
             "input": sanitize_payload(input_data, max_chars=MAX_LOG_CHARS),
             "output": sanitize_payload(output_data, max_chars=MAX_LOG_CHARS),
         }
-
-
-
-            self._mlflow.log_artifact(f.name, f"spans/{name}")
+        payload = json.dumps(span_data, default=str)
+        path = self._write_temp_file(".json", payload)
+        self._mlflow.log_artifact(path, f"spans/{name}")

     def log_score(
         self,
@@ -145,9 +176,15 @@ class MLflowAdapter(TrackerPort):
             raise ValueError(f"Run not found: {trace_id}")

         if artifact_type == "json":
-
-
-
+            payload = json.dumps(data, default=str)
+            path = self._write_temp_file(".json", payload)
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        elif artifact_type == "text":
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")
+        else:
+            path = self._write_temp_file(".txt", str(data))
+            self._mlflow.log_artifact(path, f"artifacts/{name}")

     def end_trace(self, trace_id: str) -> None:
         """
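Assumed call shapes for the updated save_artifact (the positional order is inferred from the calls elsewhere in this diff, and artifact_type presumably defaults to "json"):

    adapter.save_artifact(trace_id, "test_results", results_data)  # json.dumps(..., default=str) -> .json artifact
    adapter.save_artifact(trace_id, "notes", "free text", artifact_type="text")  # str(data) -> .txt artifact
    # Any other artifact_type value now takes the same stringified .txt path as "text".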
@@ -180,53 +217,171 @@ class MLflowAdapter(TrackerPort):
         Returns:
             trace_id: ID of the created MLflow run
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-"
-
-
+
+        def _log_run() -> str:
+            trace_id = self.start_trace(
+                name=f"evaluation-{run.run_id[:8]}",
+                metadata={
+                    "dataset_name": run.dataset_name,
+                    "dataset_version": run.dataset_version,
+                    "model_name": run.model_name,
+                    "total_test_cases": run.total_test_cases,
+                },
+            )
+
+            self._mlflow.set_tag("run_id", run.run_id)
+            self._mlflow.set_tag("model_name", run.model_name)
+            self._mlflow.set_tag("dataset", f"{run.dataset_name}:{run.dataset_version}")
+            if run.tracker_metadata:
+                project_name = run.tracker_metadata.get("project_name")
+                if project_name:
+                    self._mlflow.set_tag("project_name", project_name)
+
+            for metric_name in run.metrics_evaluated:
+                avg_score = run.get_avg_score(metric_name)
+                if avg_score is not None:
+                    self.log_score(trace_id, f"avg_{metric_name}", avg_score)
+
+            self.log_score(trace_id, "pass_rate", run.pass_rate)
+            self._mlflow.log_metric("total_tokens", run.total_tokens)
+            if run.duration_seconds:
+                self._mlflow.log_metric("duration_seconds", run.duration_seconds)
+            if run.total_cost_usd is not None:
+                self._mlflow.log_metric("total_cost_usd", run.total_cost_usd)
+
+            results_data = []
+            for result in run.results:
+                result_dict = {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "tokens_used": result.tokens_used,
+                    "metrics": [
+                        {"name": m.name, "score": m.score, "passed": m.passed}
+                        for m in result.metrics
+                    ],
+                }
+                results_data.append(result_dict)
+                self._trace_test_case(result)
+
+            self.save_artifact(trace_id, "test_results", results_data)
+            self.save_artifact(
+                trace_id,
+                "custom_metric_snapshot",
+                (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            )
+            if run.tracker_metadata:
+                self.save_artifact(trace_id, "tracker_metadata", run.tracker_metadata)
+            self._register_prompts(run)
+
+            self.end_trace(trace_id)
+            return trace_id
+
+        trace_name = f"evaluation-{run.run_id[:8]}"
+        trace_attrs = {
+            "dataset_name": run.dataset_name,
+            "dataset_version": run.dataset_version,
+            "model_name": run.model_name,
+        }
+        try:
+            traced = self._mlflow.trace(
+                name=trace_name, span_type="EVALUATION", attributes=trace_attrs
+            )
+            return traced(_log_run)()
+        except Exception:
+            return _log_run()
+
+    def _register_prompts(self, run: EvaluationRun) -> None:
+        genai = getattr(self._mlflow, "genai", None)
+        if genai is None:
+            return
+        register_fn = getattr(genai, "register_prompt", None)
+        if not callable(register_fn):
+            return
+
+        prompt_entries = self._extract_prompt_entries(run)
+        if not prompt_entries:
+            return
+
+        for entry in prompt_entries:
+            name = entry.get("name") or entry.get("role") or "prompt"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            tags = {
+                "kind": str(entry.get("kind") or "custom"),
+                "role": str(entry.get("role") or ""),
+                "checksum": str(entry.get("checksum") or ""),
+                "run_id": run.run_id,
+            }
+            prompt_set_name = entry.get("prompt_set_name")
+            if prompt_set_name:
+                tags["prompt_set"] = str(prompt_set_name)
+            register_fn(
+                name=name,
+                template=content,
+                commit_message=entry.get("checksum"),
+                tags=tags,
+                model_config={
+                    "model_name": run.model_name,
+                },
+            )
+
+    def _extract_prompt_entries(self, run: EvaluationRun) -> list[dict[str, Any]]:
+        entries: list[dict[str, Any]] = []
+        metadata = run.tracker_metadata or {}
+        prompt_set_detail = metadata.get("prompt_set_detail")
+        if isinstance(prompt_set_detail, dict):
+            prompt_set_name = prompt_set_detail.get("name")
+            for item in prompt_set_detail.get("items", []):
+                prompt = item.get("prompt") or {}
+                if not isinstance(prompt, dict):
+                    continue
+                entries.append(
+                    {
+                        "name": prompt.get("name"),
+                        "role": item.get("role"),
+                        "kind": prompt.get("kind"),
+                        "checksum": prompt.get("checksum"),
+                        "content": prompt.get("content"),
+                        "prompt_set_name": prompt_set_name,
+                    }
+                )
+
+        phoenix_meta = metadata.get("phoenix") or {}
+        if isinstance(phoenix_meta, dict):
+            for entry in phoenix_meta.get("prompts", []) or []:
+                if not isinstance(entry, dict):
+                    continue
+                entries.append(entry)
+        return entries
+
+    def _trace_test_case(self, result: TestCaseResult) -> None:
+        trace_fn = getattr(self._mlflow, "trace", None)
+        if not callable(trace_fn):
+            return
+
+        attrs = {
+            "test_case_id": result.test_case_id,
+            "all_passed": result.all_passed,
+            "tokens_used": result.tokens_used,
+            "latency_ms": result.latency_ms,
+        }
+
+        def _emit() -> dict[str, Any]:
+            return {
                 "metrics": [
                     {"name": m.name, "score": m.score, "passed": m.passed} for m in result.metrics
                 ],
+                "tokens_used": result.tokens_used,
+                "latency_ms": result.latency_ms,
             }
-            results_data.append(result_dict)
-
-            self.save_artifact(trace_id, "test_results", results_data)
-            self.save_artifact(
-                trace_id,
-                "custom_metric_snapshot",
-                (run.tracker_metadata or {}).get("custom_metric_snapshot"),
-            )

-
-
-
-
+        try:
+            wrapped = trace_fn(
+                name=f"test_case_{result.test_case_id}",
+                span_type="EVALUATION",
+                attributes=attrs,
+            )
+            wrapped(_emit)()
+        except Exception:
+            return