evalvault 1.64.0__py3-none-any.whl → 1.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/run_helpers.py
@@ -88,7 +88,25 @@ RUN_MODE_PRESETS: dict[str, RunModePreset] = {
     ),
 }
 
-SUMMARY_METRIC_ORDER = ("summary_faithfulness", "summary_score", "entity_preservation")
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)
+
+SUMMARY_METRIC_SOURCE = {
+    "summary_faithfulness": "LLM",
+    "summary_score": "LLM",
+    "entity_preservation": "Rule",
+    "summary_accuracy": "Rule",
+    "summary_risk_coverage": "Rule",
+    "summary_non_definitive": "Rule",
+    "summary_needs_followup": "Rule",
+}
 
 
 def _display_results(result, console: Console, verbose: bool = False) -> None:
@@ -180,8 +198,9 @@ def _display_summary_guidance(result, console: Console) -> None:
         if score is None:
             continue
         recommended = SUMMARY_RECOMMENDED_THRESHOLDS[metric]
+        source = SUMMARY_METRIC_SOURCE.get(metric, "Rule")
         if score < recommended:
-            warnings.append(f"- {metric}: {score:.3f} < {recommended:.2f}")
+            warnings.append(f"- {metric} ({source}): {score:.3f} < {recommended:.2f}")
 
     if warnings:
         header = "[bold red]사용자 노출 기준 미달[/bold red]"
evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import asyncio
+
+from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
+from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+
+
+class ComparisonPipelineAdapter(ComparisonPipelinePort):
+    def __init__(self, service: AnalysisPipelineService) -> None:
+        self._service = service
+
+    def run_comparison(
+        self,
+        *,
+        run_ids: list[str],
+        compare_metrics: list[str] | None,
+        test_type: str,
+        parallel: bool,
+        concurrency: int | None,
+        report_type: str,
+        use_llm_report: bool,
+    ) -> PipelineResult:
+        params = {
+            "run_ids": run_ids,
+            "compare_metrics": compare_metrics,
+            "test_type": test_type,
+            "report_type": report_type,
+            "use_llm_report": use_llm_report,
+        }
+        if parallel:
+            if concurrency is not None:
+                params["max_concurrency"] = concurrency
+            return asyncio.run(
+                self._service.analyze_intent_async(
+                    AnalysisIntent.GENERATE_COMPARISON,
+                    run_id=run_ids[0] if run_ids else None,
+                    **params,
+                )
+            )
+        return self._service.analyze_intent(
+            AnalysisIntent.GENERATE_COMPARISON,
+            run_id=run_ids[0] if run_ids else None,
+            **params,
+        )
+
+
+__all__ = ["ComparisonPipelineAdapter"]
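Note: a minimal usage sketch of the new adapter, not part of the package. How an AnalysisPipelineService instance is built is not shown in this diff, and the test_type/report_type values below are illustrative only.

# Sketch only: wire an existing AnalysisPipelineService into the new adapter
# and request a comparison report for two runs.
service = build_analysis_pipeline_service()  # hypothetical factory, not in the diff
adapter = ComparisonPipelineAdapter(service)
result = adapter.run_comparison(
    run_ids=["run-a", "run-b"],
    compare_metrics=None,        # None -> let the pipeline pick the metrics
    test_type="paired",          # illustrative value
    parallel=False,
    concurrency=None,
    report_type="markdown",      # illustrative value
    use_llm_report=False,
)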
evalvault/adapters/outbound/artifact_fs.py
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+
+class LocalArtifactFileSystemAdapter(ArtifactFileSystemPort):
+    def exists(self, path: Path) -> bool:
+        return path.exists()
+
+    def is_dir(self, path: Path) -> bool:
+        return path.is_dir()
+
+    def read_text(self, path: Path) -> str:
+        return path.read_text(encoding="utf-8")
evalvault/adapters/outbound/filesystem/__init__.py
@@ -0,0 +1,3 @@
+from evalvault.adapters.outbound.filesystem.ops_snapshot_writer import OpsSnapshotWriter
+
+__all__ = ["OpsSnapshotWriter"]
evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+class DifficultyProfileWriter(DifficultyProfileWriterPort):
+    def write_profile(
+        self,
+        *,
+        output_path: Path,
+        artifacts_dir: Path,
+        envelope: dict[str, object],
+        artifacts: dict[str, object],
+    ) -> dict[str, object]:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+        breakdown_path = artifacts_dir / "difficulty_breakdown.json"
+        cases_path = artifacts_dir / "difficulty_cases.json"
+        breakdown_payload = artifacts.get("breakdown")
+        cases_payload = artifacts.get("cases")
+        write_json(
+            breakdown_path,
+            breakdown_payload if isinstance(breakdown_payload, dict) else {},
+        )
+        write_json(
+            cases_path,
+            {"cases": cases_payload} if isinstance(cases_payload, list) else {"cases": []},
+        )
+
+        index_payload = {
+            "files": {
+                "breakdown": str(breakdown_path),
+                "cases": str(cases_path),
+            }
+        }
+        index_path = artifacts_dir / "index.json"
+        write_json(index_path, index_payload)
+
+        artifacts_index = {
+            "dir": str(artifacts_dir),
+            "index": str(index_path),
+        }
+        envelope["artifacts"] = artifacts_index
+        write_json(output_path, envelope)
+
+        return artifacts_index
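Note: a minimal call sketch, not part of the package; paths and the envelope content are illustrative. It shows the three files the writer produces and the index it embeds back into the envelope.

# Sketch only: writes difficulty_breakdown.json, difficulty_cases.json and
# index.json under artifacts_dir, then records their locations in the envelope.
from pathlib import Path

writer = DifficultyProfileWriter()
index = writer.write_profile(
    output_path=Path("reports/difficulty_profile.json"),
    artifacts_dir=Path("reports/difficulty_artifacts"),
    envelope={"run_id": "run-a"},              # illustrative envelope payload
    artifacts={"breakdown": {}, "cases": []},  # payloads produced elsewhere
)
# index == {"dir": "reports/difficulty_artifacts",
#           "index": "reports/difficulty_artifacts/index.json"}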
evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+
+
+class OpsSnapshotWriter(OpsSnapshotWriterPort):
+    def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        write_json(path, payload)
evalvault/adapters/outbound/judge_calibration_adapter.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+
+class JudgeCalibrationAdapter(JudgeCalibrationPort):
+    def __init__(self) -> None:
+        self._service = JudgeCalibrationService()
+
+    def calibrate(
+        self,
+        run: EvaluationRun,
+        feedbacks: list[SatisfactionFeedback],
+        *,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool = False,
+        concurrency: int = 8,
+    ) -> JudgeCalibrationResult:
+        return self._service.calibrate(
+            run,
+            feedbacks,
+            labels_source=labels_source,
+            method=method,
+            metrics=metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            parallel=parallel,
+            concurrency=concurrency,
+        )
evalvault/adapters/outbound/judge_calibration_reporter.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+class JudgeCalibrationReporter:
+    def render_json(self, result: JudgeCalibrationResult) -> dict[str, Any]:
+        return {
+            "summary": asdict(result.summary),
+            "metrics": [asdict(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [asdict(entry) for entry in entries]
+                for metric, entries in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        }
+
+    def write_artifacts(
+        self,
+        *,
+        result: JudgeCalibrationResult,
+        artifacts_dir: Path,
+    ) -> dict[str, str]:
+        artifacts_dir.mkdir(parents=True, exist_ok=True)
+        index_path = artifacts_dir / "index.json"
+        payload = {
+            "run_id": result.summary.run_id,
+            "metrics": [metric.metric for metric in result.metrics],
+            "cases": {},
+        }
+        for metric, cases in result.case_results.items():
+            case_path = artifacts_dir / f"{metric}.json"
+            case_payload = [
+                {
+                    "test_case_id": case.test_case_id,
+                    "raw_score": case.raw_score,
+                    "calibrated_score": case.calibrated_score,
+                    "label": case.label,
+                    "label_source": case.label_source,
+                }
+                for case in cases
+            ]
+            case_path.write_text(
+                json.dumps(case_payload, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+            payload["cases"][metric] = str(case_path)
+        index_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+        return {"dir": str(artifacts_dir), "index": str(index_path)}
evalvault/adapters/outbound/report/llm_report_generator.py
@@ -499,8 +499,20 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
-SUMMARY_METRIC_ORDER = ("summary_faithfulness", "summary_score", "entity_preservation")
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)
 
 
 @dataclass
evalvault/adapters/outbound/storage/base_sql.py
@@ -664,6 +664,8 @@ class BaseSQLStorageAdapter(ABC):
     def export_run_to_excel(self, run_id: str, output_path) -> Path:
         from openpyxl import Workbook
 
+        from evalvault.domain.metrics.registry import get_metric_spec_map
+
         output = Path(output_path)
         output.parent.mkdir(parents=True, exist_ok=True)
 
@@ -837,6 +839,23 @@ class BaseSQLStorageAdapter(ABC):
 
         summary_rows: list[dict[str, Any]] = []
         run_payload = run_rows[0] if run_rows else {}
+        custom_metric_rows: list[dict[str, Any]] = []
+        run_metadata = self._deserialize_json(run_payload.get("metadata")) if run_payload else None
+        if isinstance(run_metadata, dict):
+            custom_snapshot = run_metadata.get("custom_metric_snapshot")
+            if isinstance(custom_snapshot, dict):
+                entries = custom_snapshot.get("metrics")
+                if isinstance(entries, list):
+                    for entry in entries:
+                        if isinstance(entry, dict):
+                            row = dict(entry)
+                            row["schema_version"] = custom_snapshot.get("schema_version")
+                            custom_metric_rows.append(row)
+        if custom_metric_rows:
+            custom_metric_rows = self._normalize_rows(
+                custom_metric_rows,
+                json_columns={"inputs", "rules"},
+            )
         prompt_set_id = None
         prompt_set_name = None
         if run_prompt_payloads:
@@ -878,14 +897,17 @@ class BaseSQLStorageAdapter(ABC):
                 if isinstance(threshold, (int, float)) and score >= threshold:
                     entry["pass_count"] += 1
 
+        metric_spec_map = get_metric_spec_map()
         for entry in metrics_index.values():
            count = entry["count"] or 0
+            spec = metric_spec_map.get(entry["metric_name"])
            metric_summary_rows.append(
                {
                    "metric_name": entry["metric_name"],
                    "avg_score": (entry["score_sum"] / count) if count else None,
                    "pass_rate": (entry["pass_count"] / count) if count else None,
                    "samples": count,
+                    "source": spec.source if spec else None,
                }
            )
 
@@ -956,7 +978,25 @@ class BaseSQLStorageAdapter(ABC):
            (
                "MetricsSummary",
                metric_summary_rows,
-               ["metric_name", "avg_score", "pass_rate", "samples"],
+               ["metric_name", "avg_score", "pass_rate", "samples", "source"],
+           ),
+           (
+               "CustomMetrics",
+               custom_metric_rows,
+               [
+                   "schema_version",
+                   "metric_name",
+                   "source",
+                   "description",
+                   "evaluation_method",
+                   "inputs",
+                   "output",
+                   "evaluation_process",
+                   "rules",
+                   "notes",
+                   "implementation_path",
+                   "implementation_hash",
+               ],
            ),
            (
                "RunPromptSets",
evalvault/adapters/outbound/tracker/langfuse_adapter.py
@@ -63,13 +63,15 @@ class LangfuseAdapter(TrackerPort):
             span.update_trace(name=name, metadata=metadata)
             self._traces[trace_id] = span
         else:
-            # Langfuse 2.x: use trace method
-            trace = self._client.trace(
+            trace_fn: Any = getattr(self._client, "trace", None)
+            if trace_fn is None:
+                raise RuntimeError("Langfuse client does not expose trace API")
+            trace_obj = trace_fn(
                 name=name,
                 metadata=metadata,
             )
-            trace_id = trace.id
-            self._traces[trace_id] = trace
+            trace_id = trace_obj.id
+            self._traces[trace_id] = trace_obj
         return trace_id
 
     def add_span(
@@ -240,7 +242,7 @@ class LangfuseAdapter(TrackerPort):
         passed_count = sum(
             1
             for r in run.results
-            if r.get_metric(metric_name) and r.get_metric(metric_name).passed
+            if (metric := r.get_metric(metric_name)) and metric.passed is True
         )
         avg_score = run.get_avg_score(metric_name)
         threshold = run.thresholds.get(metric_name, 0.7)
@@ -358,6 +360,7 @@ class LangfuseAdapter(TrackerPort):
             "summary": trace_output["summary"],
             "metrics": metric_summary,
             "phoenix_links": phoenix_links or {},
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
@@ -421,12 +424,15 @@ class LangfuseAdapter(TrackerPort):
         }
 
         # Span metadata: additional info
-        span_metadata = {
+        span_metadata: dict[str, float | int] = {
            "tokens_used": result.tokens_used,
            "latency_ms": result.latency_ms,
        }
        if result.cost_usd:
-           span_metadata["cost_usd"] = result.cost_usd
+           span_metadata = {
+               **span_metadata,
+               "cost_usd": float(result.cost_usd),
+           }
 
        if hasattr(root_span, "start_span"):
            child_span = root_span.start_span(
evalvault/adapters/outbound/tracker/mlflow_adapter.py
@@ -220,6 +220,11 @@ class MLflowAdapter(TrackerPort):
             results_data.append(result_dict)
 
         self.save_artifact(trace_id, "test_results", results_data)
+        self.save_artifact(
+            trace_id,
+            "custom_metric_snapshot",
+            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+        )
 
         # 6. End MLflow run
         self.end_trace(trace_id)
evalvault/adapters/outbound/tracker/phoenix_adapter.py
@@ -26,8 +26,7 @@ from evalvault.domain.entities import (
 from evalvault.ports.outbound.tracker_port import TrackerPort
 
 if TYPE_CHECKING:
-    from opentelemetry.sdk.trace import Span, TracerProvider
-    from opentelemetry.trace import Tracer
+    from opentelemetry.sdk.trace import TracerProvider
 
 
 class PhoenixAdapter(TrackerPort):
@@ -62,9 +61,10 @@ class PhoenixAdapter(TrackerPort):
         """
         self._endpoint = endpoint
         self._service_name = service_name
-        self._tracer: Tracer | None = None
+        self._tracer: Any | None = None
         self._tracer_provider: TracerProvider | None = None
-        self._active_spans: dict[str, Span] = {}
+        self._active_spans: dict[str, Any] = {}
+        self._tracer_any: Any | None = None
         self._initialized = False
 
     def _ensure_initialized(self) -> None:
@@ -90,7 +90,8 @@ class PhoenixAdapter(TrackerPort):
             provider = get_tracer_provider()
             if provider:
                 self._tracer_provider = provider
-                self._tracer = trace.get_tracer(__name__)
+                self._tracer_any = trace.get_tracer(__name__)
+                self._tracer = self._tracer_any
                 self._initialized = True
                 return
 
@@ -109,7 +110,8 @@ class PhoenixAdapter(TrackerPort):
             trace.set_tracer_provider(self._tracer_provider)
 
             # Get tracer
-            self._tracer = trace.get_tracer(__name__)
+            self._tracer_any = trace.get_tracer(__name__)
+            self._tracer = self._tracer_any
             self._initialized = True
 
         except ImportError as e:
@@ -134,7 +136,12 @@ class PhoenixAdapter(TrackerPort):
         self._ensure_initialized()
 
         # Start a new span as root
-        span = self._tracer.start_span(name)
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
+        span = tracer.start_span(name)
         trace_id = str(uuid.uuid4())
 
         # Set metadata as span attributes
@@ -173,10 +180,15 @@ class PhoenixAdapter(TrackerPort):
 
         from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
         parent_span = self._active_spans[trace_id]
         context = trace.set_span_in_context(parent_span)
 
-        with self._tracer.start_span(name, context=context) as span:
+        with tracer.start_span(name, context=context) as span:
             if input_data is not None:
                 safe_input = sanitize_payload(input_data, max_chars=MAX_LOG_CHARS)
                 span.set_attribute("input", json.dumps(safe_input, default=str))
@@ -279,7 +291,7 @@ class PhoenixAdapter(TrackerPort):
         passed_count = sum(
             1
             for r in run.results
-            if r.get_metric(metric_name) and r.get_metric(metric_name).passed
+            if (metric := r.get_metric(metric_name)) and metric.passed is True
         )
         avg_score = run.get_avg_score(metric_name)
         threshold = run.thresholds.get(metric_name, 0.7)
@@ -340,13 +352,40 @@ class PhoenixAdapter(TrackerPort):
                 "version": run.dataset_version,
                 "total_test_cases": run.total_test_cases,
             },
+            "evaluation_config": {
+                "model": run.model_name,
+                "metrics": run.metrics_evaluated,
+                "thresholds": run.thresholds,
+            },
             "summary": {
-                "pass_rate": run.pass_rate,
+                "total_test_cases": run.total_test_cases,
+                "passed": run.passed_test_cases,
+                "failed": run.total_test_cases - run.passed_test_cases,
+                "pass_rate": round(run.pass_rate, 4),
+                "duration_seconds": round(run.duration_seconds, 2)
+                if run.duration_seconds
+                else None,
                 "total_tokens": run.total_tokens,
-                "duration_seconds": run.duration_seconds,
             },
             "metrics": metric_summary,
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "test_cases": [
+                {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "metrics": {
+                        metric.name: {
+                            "score": metric.score,
+                            "threshold": metric.threshold,
+                            "passed": metric.passed,
+                        }
+                        for metric in result.metrics
+                    },
+                }
+                for result in run.results
+            ],
         }
+
         self.save_artifact(trace_id, "ragas_evaluation", structured_artifact)
 
         # End the trace
@@ -369,10 +408,15 @@ class PhoenixAdapter(TrackerPort):
         """
         from opentelemetry import trace
 
+        tracer = self._tracer_any
+        if tracer is None:
+            tracer = self._tracer
+        if tracer is None:
+            raise RuntimeError("Phoenix tracer is not initialized")
        parent_span = self._active_spans[trace_id]
        context = trace.set_span_in_context(parent_span)
 
-       with self._tracer.start_span(
+       with tracer.start_span(
            f"test-case-{result.test_case_id}",
            context=context,
        ) as span:
@@ -478,7 +522,12 @@ class PhoenixAdapter(TrackerPort):
        parent_span = self._active_spans[trace_id]
        context = trace.set_span_in_context(parent_span)
 
-       with self._tracer.start_span("retrieval", context=context) as span:
+       tracer = self._tracer_any
+       if tracer is None:
+           tracer = self._tracer
+       if tracer is None:
+           raise RuntimeError("Phoenix tracer is not initialized")
+       with tracer.start_span("retrieval", context=context) as span:
            # Set retrieval attributes
            for key, value in data.to_span_attributes().items():
                span.set_attribute(key, value)
@@ -560,7 +609,12 @@ class PhoenixAdapter(TrackerPort):
        parent_span = self._active_spans[trace_id]
        context = trace.set_span_in_context(parent_span)
 
-       with self._tracer.start_span("generation", context=context) as span:
+       tracer = self._tracer_any
+       if tracer is None:
+           tracer = self._tracer
+       if tracer is None:
+           raise RuntimeError("Phoenix tracer is not initialized")
+       with tracer.start_span("generation", context=context) as span:
            # Set generation attributes
            for key, value in data.to_span_attributes().items():
                span.set_attribute(key, value)
evalvault/config/settings.py
@@ -321,6 +321,27 @@ class Settings(BaseSettings):
         default="https://cloud.langfuse.com", description="Langfuse host URL"
     )
 
+    mcp_enabled: bool = Field(
+        default=False,
+        description="Enable MCP JSON-RPC endpoint over HTTP.",
+    )
+    mcp_protocol_version: str = Field(
+        default="2025-11-25",
+        description="MCP protocol version to advertise.",
+    )
+    mcp_server_version: str = Field(
+        default="0.1.0",
+        description="EvalVault MCP server version.",
+    )
+    mcp_auth_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated bearer tokens for MCP endpoint (required).",
+    )
+    mcp_allowed_tools: str | None = Field(
+        default=None,
+        description="Comma-separated allowlist of MCP tool names.",
+    )
+
     # MLflow Configuration (optional)
     mlflow_tracking_uri: str | None = Field(default=None, description="MLflow tracking server URI")
     mlflow_experiment_name: str = Field(default="evalvault", description="MLflow experiment name")
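Note: mcp_auth_tokens and mcp_allowed_tools are stored as plain comma-separated strings. How the package parses them is not shown in this diff; the helper below is a hypothetical sketch of the obvious normalization.

# Hypothetical helper, not in the package: split a comma-separated setting
# such as settings.mcp_auth_tokens into a clean list of values.
def split_csv_setting(raw: str | None) -> list[str]:
    if not raw:
        return []
    return [item.strip() for item in raw.split(",") if item.strip()]

# e.g. split_csv_setting("tok-a, tok-b") == ["tok-a", "tok-b"]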
evalvault/domain/entities/__init__.py
@@ -34,6 +34,12 @@ from evalvault.domain.entities.improvement import (
     RAGComponent,
     RAGImprovementGuide,
 )
+from evalvault.domain.entities.judge_calibration import (
+    JudgeCalibrationCase,
+    JudgeCalibrationMetric,
+    JudgeCalibrationResult,
+    JudgeCalibrationSummary,
+)
 from evalvault.domain.entities.kg import EntityModel, RelationModel
 from evalvault.domain.entities.method import MethodInput, MethodInputDataset, MethodOutput
 from evalvault.domain.entities.prompt import Prompt, PromptSet, PromptSetBundle, PromptSetItem
@@ -104,6 +110,10 @@ __all__ = [
     "PatternType",
     "RAGComponent",
     "RAGImprovementGuide",
+    "JudgeCalibrationCase",
+    "JudgeCalibrationMetric",
+    "JudgeCalibrationResult",
+    "JudgeCalibrationSummary",
     # KG
     "EntityModel",
     "RelationModel",