evalvault 1.64.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/adapters/inbound/cli/commands/run_helpers.py
CHANGED
@@ -88,7 +88,25 @@ RUN_MODE_PRESETS: dict[str, RunModePreset] = {
     ),
 }
 
-SUMMARY_METRIC_ORDER = (
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)
+
+SUMMARY_METRIC_SOURCE = {
+    "summary_faithfulness": "LLM",
+    "summary_score": "LLM",
+    "entity_preservation": "Rule",
+    "summary_accuracy": "Rule",
+    "summary_risk_coverage": "Rule",
+    "summary_non_definitive": "Rule",
+    "summary_needs_followup": "Rule",
+}
 
 
 def _display_results(result, console: Console, verbose: bool = False) -> None:
@@ -180,8 +198,9 @@ def _display_summary_guidance(result, console: Console) -> None:
         if score is None:
             continue
         recommended = SUMMARY_RECOMMENDED_THRESHOLDS[metric]
+        source = SUMMARY_METRIC_SOURCE.get(metric, "Rule")
         if score < recommended:
-            warnings.append(f"- {metric}: {score:.3f} < {recommended:.2f}")
+            warnings.append(f"- {metric} ({source}): {score:.3f} < {recommended:.2f}")
 
     if warnings:
         header = "[bold red]사용자 노출 기준 미달[/bold red]"
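
For reference, a minimal sketch (not shipped in the package) of how the new source tag surfaces in the guidance warnings; the metric names, scores, and thresholds below are illustrative values only:

SUMMARY_METRIC_SOURCE = {"summary_accuracy": "Rule", "summary_faithfulness": "LLM"}
SUMMARY_RECOMMENDED_THRESHOLDS = {"summary_accuracy": 0.90, "summary_faithfulness": 0.90}
scores = {"summary_accuracy": 0.82, "summary_faithfulness": 0.95}  # made-up run output

warnings = []
for metric, score in scores.items():
    recommended = SUMMARY_RECOMMENDED_THRESHOLDS[metric]
    source = SUMMARY_METRIC_SOURCE.get(metric, "Rule")
    if score < recommended:
        warnings.append(f"- {metric} ({source}): {score:.3f} < {recommended:.2f}")

print("\n".join(warnings))  # -> "- summary_accuracy (Rule): 0.820 < 0.90"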

evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py
ADDED
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import asyncio
+
+from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+
+
+class ComparisonPipelineAdapter(ComparisonPipelinePort):
+    def __init__(self, service: AnalysisPipelineService) -> None:
+        self._service = service
+
+    def run_comparison(
+        self,
+        *,
+        run_ids: list[str],
+        compare_metrics: list[str] | None,
+        test_type: str,
+        parallel: bool,
+        concurrency: int | None,
+        report_type: str,
+        use_llm_report: bool,
+    ) -> PipelineResult:
+        params = {
+            "run_ids": run_ids,
+            "compare_metrics": compare_metrics,
+            "test_type": test_type,
+            "report_type": report_type,
+            "use_llm_report": use_llm_report,
+        }
+        if parallel:
+            if concurrency is not None:
+                params["max_concurrency"] = concurrency
+            return asyncio.run(
+                self._service.analyze_intent_async(
+                    AnalysisIntent.GENERATE_COMPARISON,
+                    run_id=run_ids[0] if run_ids else None,
+                    **params,
+                )
+            )
+        return self._service.analyze_intent(
+            AnalysisIntent.GENERATE_COMPARISON,
+            run_id=run_ids[0] if run_ids else None,
+            **params,
+        )
+
+
+__all__ = ["ComparisonPipelineAdapter"]
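
A hypothetical usage sketch, not part of the wheel: the adapter only calls analyze_intent / analyze_intent_async on whatever service it is given, so a duck-typed stub is enough to show the call shape. FakePipelineService and the test_type / report_type values are placeholders, not real EvalVault names.

from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import (
    ComparisonPipelineAdapter,
)


class FakePipelineService:
    # Stand-in for AnalysisPipelineService; only the two methods the adapter uses.
    def analyze_intent(self, intent, run_id=None, **params):
        return {"intent": intent, "run_id": run_id, "params": params}

    async def analyze_intent_async(self, intent, run_id=None, **params):
        return self.analyze_intent(intent, run_id=run_id, **params)


adapter = ComparisonPipelineAdapter(FakePipelineService())  # type: ignore[arg-type]
result = adapter.run_comparison(
    run_ids=["run-a", "run-b"],
    compare_metrics=["faithfulness"],
    test_type="t-test",          # placeholder value
    parallel=False,
    concurrency=None,
    report_type="markdown",      # placeholder value
    use_llm_report=False,
)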

evalvault/adapters/outbound/artifact_fs.py
ADDED
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+
+class LocalArtifactFileSystemAdapter(ArtifactFileSystemPort):
+    def exists(self, path: Path) -> bool:
+        return path.exists()
+
+    def is_dir(self, path: Path) -> bool:
+        return path.is_dir()
+
+    def read_text(self, path: Path) -> str:
+        return path.read_text(encoding="utf-8")

evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py
ADDED
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileWriter(DifficultyProfileWriterPort):
+    def write_profile(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+        breakdown_path = artifacts_dir / "difficulty_breakdown.json"
+        cases_path = artifacts_dir / "difficulty_cases.json"
+        breakdown_payload = artifacts.get("breakdown")
+        cases_payload = artifacts.get("cases")
+        write_json(
+            breakdown_path,
+            breakdown_payload if isinstance(breakdown_payload, dict) else {},
+        )
+        write_json(
+            cases_path,
+            {"cases": cases_payload} if isinstance(cases_payload, list) else {"cases": []},
+        )
+
+        index_payload = {
+            "files": {
+                "breakdown": str(breakdown_path),
+                "cases": str(cases_path),
+            }
+        }
+        index_path = artifacts_dir / "index.json"
+        write_json(index_path, index_payload)
+
+        artifacts_index = {
+            "dir": str(artifacts_dir),
+            "index": str(index_path),
+        }
+        envelope["artifacts"] = artifacts_index
+        write_json(output_path, envelope)
+
+        return artifacts_index
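
An illustrative call to the new writer (not from the package); the envelope and artifacts payloads here are made-up placeholders rather than real difficulty-profiling output:

from pathlib import Path

from evalvault.adapters.outbound.filesystem.difficulty_profile_writer import (
    DifficultyProfileWriter,
)

writer = DifficultyProfileWriter()
index = writer.write_profile(
    output_path=Path("out/difficulty_profile.json"),
    artifacts_dir=Path("out/artifacts"),
    envelope={"run_id": "run-a", "summary": {"hard_cases": 3}},  # placeholder payload
    artifacts={"breakdown": {"easy": 10, "hard": 3}, "cases": [{"id": "tc-1"}]},
)
# index -> {"dir": "out/artifacts", "index": "out/artifacts/index.json"}
# On disk: difficulty_breakdown.json, difficulty_cases.json and index.json under
# out/artifacts/, plus the envelope (now carrying "artifacts") at the output path.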

evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py
ADDED
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+
+
+class OpsSnapshotWriter(OpsSnapshotWriterPort):
+    def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        write_json(path, payload)

evalvault/adapters/outbound/judge_calibration_adapter.py
ADDED
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+
+class JudgeCalibrationAdapter(JudgeCalibrationPort):
+    def __init__(self) -> None:
+        self._service = JudgeCalibrationService()
+
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult:
+        return self._service.calibrate(
+            run,
+            feedbacks,
+            labels_source=labels_source,
+            method=method,
+            metrics=metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            parallel=parallel,
+            concurrency=concurrency,
+        )

evalvault/adapters/outbound/judge_calibration_reporter.py
ADDED
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+class JudgeCalibrationReporter:
+    def render_json(self, result: JudgeCalibrationResult) -> dict[str, Any]:
+        return {
+            "summary": asdict(result.summary),
+            "metrics": [asdict(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [asdict(entry) for entry in entries]
+                for metric, entries in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        }
+
+    def write_artifacts(
+        self,
+        *,
+        result: JudgeCalibrationResult,
+        artifacts_dir: Path,
+    ) -> dict[str, str]:
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+        index_path = artifacts_dir / "index.json"
+        payload = {
+            "run_id": result.summary.run_id,
+            "metrics": [metric.metric for metric in result.metrics],
+            "cases": {},
+        }
+        for metric, cases in result.case_results.items():
+            case_path = artifacts_dir / f"{metric}.json"
+            case_payload = [
+                {
+                    "test_case_id": case.test_case_id,
+                    "raw_score": case.raw_score,
+                    "calibrated_score": case.calibrated_score,
+                    "label": case.label,
+                    "label_source": case.label_source,
+                }
+                for case in cases
+            ]
+            case_path.write_text(
+                json.dumps(case_payload, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+            payload["cases"][metric] = str(case_path)
+        index_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        return {"dir": str(artifacts_dir), "index": str(index_path)}

evalvault/adapters/outbound/report/llm_report_generator.py
CHANGED
@@ -499,8 +499,20 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
-SUMMARY_METRIC_ORDER = (
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)
 
 
 @dataclass

evalvault/adapters/outbound/storage/base_sql.py
CHANGED
@@ -664,6 +664,8 @@ class BaseSQLStorageAdapter(ABC):
     def export_run_to_excel(self, run_id: str, output_path) -> Path:
         from openpyxl import Workbook
 
+        from evalvault.domain.metrics.registry import get_metric_spec_map
+
         output = Path(output_path)
         output.parent.mkdir(parents=True, exist_ok=True)
 
@@ -837,6 +839,23 @@ class BaseSQLStorageAdapter(ABC):
 
         summary_rows: list[dict[str, Any]] = []
         run_payload = run_rows[0] if run_rows else {}
+        custom_metric_rows: list[dict[str, Any]] = []
+        run_metadata = self._deserialize_json(run_payload.get("metadata")) if run_payload else None
+        if isinstance(run_metadata, dict):
+            custom_snapshot = run_metadata.get("custom_metric_snapshot")
+            if isinstance(custom_snapshot, dict):
+                entries = custom_snapshot.get("metrics")
+                if isinstance(entries, list):
+                    for entry in entries:
+                        if isinstance(entry, dict):
+                            row = dict(entry)
+                            row["schema_version"] = custom_snapshot.get("schema_version")
+                            custom_metric_rows.append(row)
+        if custom_metric_rows:
+            custom_metric_rows = self._normalize_rows(
+                custom_metric_rows,
+                json_columns={"inputs", "rules"},
+            )
         prompt_set_id = None
         prompt_set_name = None
         if run_prompt_payloads:
@@ -878,14 +897,17 @@ class BaseSQLStorageAdapter(ABC):
             if isinstance(threshold, (int, float)) and score >= threshold:
                 entry["pass_count"] += 1
 
+        metric_spec_map = get_metric_spec_map()
         for entry in metrics_index.values():
             count = entry["count"] or 0
+            spec = metric_spec_map.get(entry["metric_name"])
             metric_summary_rows.append(
                 {
                     "metric_name": entry["metric_name"],
                     "avg_score": (entry["score_sum"] / count) if count else None,
                     "pass_rate": (entry["pass_count"] / count) if count else None,
                     "samples": count,
+                    "source": spec.source if spec else None,
                 }
             )
 
@@ -956,7 +978,25 @@ class BaseSQLStorageAdapter(ABC):
             (
                 "MetricsSummary",
                 metric_summary_rows,
-                ["metric_name", "avg_score", "pass_rate", "samples"],
+                ["metric_name", "avg_score", "pass_rate", "samples", "source"],
+            ),
+            (
+                "CustomMetrics",
+                custom_metric_rows,
+                [
+                    "schema_version",
+                    "metric_name",
+                    "source",
+                    "description",
+                    "evaluation_method",
+                    "inputs",
+                    "output",
+                    "evaluation_process",
+                    "rules",
+                    "notes",
+                    "implementation_path",
+                    "implementation_hash",
+                ],
             ),
             (
                 "RunPromptSets",
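
To make the new export path concrete, here is a sketch of the run-metadata shape it consumes; only the key names come from the diff above, and every value is illustrative rather than taken from a real run:

# Assumed-example payload stored under the run's metadata; values are made up.
custom_metric_snapshot = {
    "schema_version": 1,
    "metrics": [
        {
            "metric_name": "summary_accuracy",
            "source": "Rule",
            "description": "Checks summary claims against the source record.",
            "evaluation_method": "rule",
            "inputs": ["question", "answer", "contexts"],
            "output": "score in [0, 1]",
            "evaluation_process": "match claims, then aggregate",
            "rules": {"min_claims": 1},
            "notes": None,
            "implementation_path": "evalvault/domain/metrics/summary_accuracy.py",
            "implementation_hash": "<hash>",
        }
    ],
}
# export_run_to_excel() copies each entry into a "CustomMetrics" sheet row, stamps
# it with the snapshot's schema_version, and serializes "inputs" and "rules" as
# JSON columns via _normalize_rows().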

evalvault/adapters/outbound/tracker/langfuse_adapter.py
CHANGED
@@ -63,13 +63,15 @@ class LangfuseAdapter(TrackerPort):
             span.update_trace(name=name, metadata=metadata)
             self._traces[trace_id] = span
         else:
-
-
+            trace_fn: Any = getattr(self._client, "trace", None)
+            if trace_fn is None:
+                raise RuntimeError("Langfuse client does not expose trace API")
+            trace_obj = trace_fn(
                 name=name,
                 metadata=metadata,
             )
-            trace_id =
-            self._traces[trace_id] =
+            trace_id = trace_obj.id
+            self._traces[trace_id] = trace_obj
         return trace_id
 
     def add_span(
@@ -240,7 +242,7 @@ class LangfuseAdapter(TrackerPort):
         passed_count = sum(
             1
             for r in run.results
-            if r.get_metric(metric_name) and
+            if (metric := r.get_metric(metric_name)) and metric.passed is True
         )
         avg_score = run.get_avg_score(metric_name)
         threshold = run.thresholds.get(metric_name, 0.7)
@@ -358,6 +360,7 @@ class LangfuseAdapter(TrackerPort):
             "summary": trace_output["summary"],
             "metrics": metric_summary,
             "phoenix_links": phoenix_links or {},
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
@@ -421,12 +424,15 @@ class LangfuseAdapter(TrackerPort):
         }
 
         # Span metadata: additional info
-        span_metadata = {
+        span_metadata: dict[str, float | int] = {
             "tokens_used": result.tokens_used,
             "latency_ms": result.latency_ms,
         }
         if result.cost_usd:
-            span_metadata
+            span_metadata = {
+                **span_metadata,
+                "cost_usd": float(result.cost_usd),
+            }
 
         if hasattr(root_span, "start_span"):
             child_span = root_span.start_span(

evalvault/adapters/outbound/tracker/mlflow_adapter.py
CHANGED
@@ -220,6 +220,11 @@ class MLflowAdapter(TrackerPort):
             results_data.append(result_dict)
 
         self.save_artifact(trace_id, "test_results", results_data)
+        self.save_artifact(
+            trace_id,
+            "custom_metric_snapshot",
+            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+        )
 
         # 6. End MLflow run
         self.end_trace(trace_id)

evalvault/adapters/outbound/tracker/phoenix_adapter.py
CHANGED
@@ -26,8 +26,7 @@ from evalvault.domain.entities import (
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
 if TYPE_CHECKING:
-    from opentelemetry.sdk.trace import
-    from opentelemetry.trace import Tracer
+    from opentelemetry.sdk.trace import TracerProvider
 
 
 class PhoenixAdapter(TrackerPort):
@@ -62,9 +61,10 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
-        self._tracer:
+        self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
-        self._active_spans: dict[str,
+        self._active_spans: dict[str, Any] = {}
+        self._tracer_any: Any | None = None
         self._initialized = False
 
     def _ensure_initialized(self) -> None:
@@ -90,7 +90,8 @@ class PhoenixAdapter(TrackerPort):
             provider = get_tracer_provider()
             if provider:
                 self._tracer_provider = provider
-                self.
+                self._tracer_any = trace.get_tracer(__name__)
+                self._tracer = self._tracer_any
                 self._initialized = True
                 return
 
@@ -109,7 +110,8 @@ class PhoenixAdapter(TrackerPort):
             trace.set_tracer_provider(self._tracer_provider)
 
             # Get tracer
-            self.
+            self._tracer_any = trace.get_tracer(__name__)
+            self._tracer = self._tracer_any
             self._initialized = True
 
         except ImportError as e:
@@ -134,7 +136,12 @@ class PhoenixAdapter(TrackerPort):
         self._ensure_initialized()
 
         # Start a new span as root
-
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        span = tracer.start_span(name)
         trace_id = str(uuid.uuid4())
 
         # Set metadata as span attributes
@@ -173,10 +180,15 @@ class PhoenixAdapter(TrackerPort):
 
         from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with
+        with tracer.start_span(name, context=context) as span:
             if input_data is not None:
                 safe_input = sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
                 span.set_attribute("input", json.dumps(safe_input, default=str))
@@ -279,7 +291,7 @@ class PhoenixAdapter(TrackerPort):
         passed_count = sum(
             1
             for r in run.results
-            if r.get_metric(metric_name) and
+            if (metric := r.get_metric(metric_name)) and metric.passed is True
         )
         avg_score = run.get_avg_score(metric_name)
         threshold = run.thresholds.get(metric_name, 0.7)
@@ -340,13 +352,40 @@ class PhoenixAdapter(TrackerPort):
                 "version": run.dataset_version,
                 "total_test_cases": run.total_test_cases,
             },
+            "evaluation_config": {
+                "model": run.model_name,
+                "metrics": run.metrics_evaluated,
+                "thresholds": run.thresholds,
+            },
             "summary": {
-                "
+                "total_test_cases": run.total_test_cases,
+                "passed": run.passed_test_cases,
+                "failed": run.total_test_cases - run.passed_test_cases,
+                "pass_rate": round(run.pass_rate, 4),
+                "duration_seconds": round(run.duration_seconds, 2)
+                if run.duration_seconds
+                else None,
                 "total_tokens": run.total_tokens,
-                "duration_seconds": run.duration_seconds,
             },
             "metrics": metric_summary,
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "test_cases": [
+                {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "metrics": {
+                        metric.name: {
+                            "score": metric.score,
+                            "threshold": metric.threshold,
+                            "passed": metric.passed,
+                        }
+                        for metric in result.metrics
+                    },
+                }
+                for result in run.results
+            ],
         }
+
         self.save_artifact(trace_id, "ragas_evaluation", structured_artifact)
 
         # End the trace
@@ -369,10 +408,15 @@ class PhoenixAdapter(TrackerPort):
         """
        from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with
+        with tracer.start_span(
             f"test-case-{result.test_case_id}",
             context=context,
         ) as span:
@@ -478,7 +522,12 @@ class PhoenixAdapter(TrackerPort):
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        with tracer.start_span("retrieval", context=context) as span:
             # Set retrieval attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)
@@ -560,7 +609,12 @@ class PhoenixAdapter(TrackerPort):
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        with tracer.start_span("generation", context=context) as span:
             # Set generation attributes
             for key, value in data.to_span_attributes().items():
                 span.set_attribute(key, value)

evalvault/config/settings.py
CHANGED
@@ -321,6 +321,27 @@ class Settings(BaseSettings):
         default="https://cloud.langfuse.com", description="Langfuse host URL"
     )
 
+    mcp_enabled: bool = Field(
+        default=False,
+        description="Enable MCP JSON-RPC endpoint over HTTP.",
+    )
+    mcp_protocol_version: str = Field(
+        default="2025-11-25",
+        description="MCP protocol version to advertise.",
+    )
+    mcp_server_version: str = Field(
+        default="0.1.0",
+        description="EvalVault MCP server version.",
+    )
+    mcp_auth_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated bearer tokens for MCP endpoint (required).",
+    )
+    mcp_allowed_tools: str | None = Field(
+        default=None,
+        description="Comma-separated allowlist of MCP tool names.",
+    )
+
     # MLflow Configuration (optional)
     mlflow_tracking_uri: str | None = Field(default=None, description="MLflow tracking server URI")
     mlflow_experiment_name: str = Field(default="evalvault", description="MLflow experiment name")
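
A hypothetical way to switch the new MCP settings on, shown via direct construction so nothing is assumed about environment-variable prefixes; the token and tool names are fake, and this presumes the remaining Settings fields have defaults or are supplied by the environment:

from evalvault.config.settings import Settings

settings = Settings(
    mcp_enabled=True,
    mcp_auth_tokens="token-a,token-b",           # fake comma-separated bearer tokens
    mcp_allowed_tools="list_runs,compare_runs",  # placeholder tool names
)
assert settings.mcp_protocol_version == "2025-11-25"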

evalvault/domain/entities/__init__.py
CHANGED
@@ -34,6 +34,12 @@ from evalvault.domain.entities.improvement import (
     RAGComponent,
     RAGImprovementGuide,
 )
+from evalvault.domain.entities.judge_calibration import (
+    JudgeCalibrationCase,
+    JudgeCalibrationMetric,
+    JudgeCalibrationResult,
+    JudgeCalibrationSummary,
+)
 from evalvault.domain.entities.kg import EntityModel, RelationModel
 from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
 from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
@@ -104,6 +110,10 @@ __all__ = [
     "PatternType",
     "RAGComponent",
     "RAGImprovementGuide",
+    "JudgeCalibrationCase",
+    "JudgeCalibrationMetric",
+    "JudgeCalibrationResult",
+    "JudgeCalibrationSummary",
     # KG
     "EntityModel",
     "RelationModel",