evalvault 1.60.0__py3-none-any.whl → 1.62.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. evalvault/adapters/inbound/__init__.py +2 -1
  2. evalvault/adapters/inbound/api/adapter.py +29 -0
  3. evalvault/adapters/inbound/api/routers/runs.py +129 -6
  4. evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
  5. evalvault/adapters/inbound/cli/commands/calibrate.py +111 -0
  6. evalvault/adapters/inbound/cli/commands/run.py +2 -2
  7. evalvault/adapters/inbound/mcp/__init__.py +51 -0
  8. evalvault/adapters/inbound/mcp/schemas.py +159 -0
  9. evalvault/adapters/inbound/mcp/tools.py +710 -0
  10. evalvault/adapters/outbound/analysis/llm_report_module.py +605 -62
  11. evalvault/adapters/outbound/analysis/nlp_adapter.py +46 -2
  12. evalvault/adapters/outbound/analysis/nlp_analyzer_module.py +1 -1
  13. evalvault/adapters/outbound/storage/base_sql.py +91 -0
  14. evalvault/adapters/outbound/storage/postgres_adapter.py +22 -0
  15. evalvault/adapters/outbound/storage/postgres_schema.sql +14 -0
  16. evalvault/adapters/outbound/storage/schema.sql +15 -0
  17. evalvault/adapters/outbound/storage/sqlite_adapter.py +25 -0
  18. evalvault/config/settings.py +1 -1
  19. evalvault/domain/entities/__init__.py +12 -0
  20. evalvault/domain/entities/feedback.py +58 -0
  21. evalvault/domain/services/satisfaction_calibration_service.py +328 -0
  22. evalvault/ports/inbound/web_port.py +1 -1
  23. evalvault/ports/outbound/storage_port.py +10 -0
  24. {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/METADATA +4 -2
  25. {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/RECORD +28 -22
  26. {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/WHEEL +0 -0
  27. {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/entry_points.txt +0 -0
  28. {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/__init__.py
@@ -1,5 +1,6 @@
  """Inbound adapters."""

  from evalvault.adapters.inbound.cli import app
+ from evalvault.adapters.inbound.mcp import tools as mcp_tools

- __all__ = ["app"]
+ __all__ = ["app", "mcp_tools"]
evalvault/adapters/inbound/api/adapter.py
@@ -14,6 +14,11 @@ from urllib.request import urlopen

  from evalvault.config.phoenix_support import PhoenixExperimentResolver
  from evalvault.config.settings import Settings
+ from evalvault.domain.entities import (
+     CalibrationResult,
+     FeedbackSummary,
+     SatisfactionFeedback,
+ )
  from evalvault.domain.entities.prompt import PromptSetBundle
  from evalvault.domain.metrics.registry import (
      get_metric_descriptions as registry_metric_descriptions,
@@ -29,6 +34,9 @@ from evalvault.domain.services.prompt_registry import (
      build_prompt_summary,
  )
  from evalvault.domain.services.prompt_status import extract_prompt_entries
+ from evalvault.domain.services.satisfaction_calibration_service import (
+     SatisfactionCalibrationService,
+ )
  from evalvault.domain.services.stage_event_builder import StageEventBuilder
  from evalvault.domain.services.stage_metric_service import StageMetricService
  from evalvault.domain.services.threshold_profiles import apply_threshold_profile
@@ -893,6 +901,27 @@ class WebUIAdapter:
              raise RuntimeError("Storage not configured")
          return self._storage.delete_run_cluster_map(run_id, map_id)

+     def save_feedback(self, feedback: SatisfactionFeedback) -> str:
+         if self._storage is None or not hasattr(self._storage, "save_feedback"):
+             raise RuntimeError("Storage not configured")
+         return self._storage.save_feedback(feedback)
+
+     def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
+         if self._storage is None or not hasattr(self._storage, "list_feedback"):
+             raise RuntimeError("Storage not configured")
+         return self._storage.list_feedback(run_id)
+
+     def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
+         if self._storage is None or not hasattr(self._storage, "get_feedback_summary"):
+             raise RuntimeError("Storage not configured")
+         return self._storage.get_feedback_summary(run_id)
+
+     def build_calibration(self, run_id: str, *, model: str = "both") -> CalibrationResult:
+         run = self.get_run_details(run_id)
+         feedbacks = self.list_feedback(run_id)
+         service = SatisfactionCalibrationService()
+         return service.build_calibration(run, feedbacks, model=model)
+
      def list_stage_events(self, run_id: str, *, stage_type: str | None = None) -> list[StageEvent]:
          """List stage events for a run."""
          if self._storage is None or not hasattr(self._storage, "list_stage_events"):
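Taken together, these adapter methods give the web layer one path for collecting feedback and recomputing calibrated satisfaction. A minimal sketch (not part of the diff) of how they compose; `adapter` is assumed to be an already-configured WebUIAdapter, and the run/case/rater IDs are placeholders:

# Sketch only: `adapter` is assumed to be a WebUIAdapter wired to storage;
# "run-1234", "case-001", and "reviewer-a" are placeholder IDs.
from datetime import datetime

from evalvault.domain.entities import CalibrationResult, SatisfactionFeedback


def record_and_calibrate(adapter, run_id: str = "run-1234") -> CalibrationResult:
    feedback = SatisfactionFeedback(
        feedback_id="",                  # assigned by storage on save
        run_id=run_id,
        test_case_id="case-001",
        satisfaction_score=4.0,
        thumb_feedback="up",
        comment="Accurate and concise answer.",
        rater_id="reviewer-a",
        created_at=datetime.now(),
    )
    adapter.save_feedback(feedback)
    # Recalibrate satisfaction across all feedback collected for this run.
    return adapter.build_calibration(run_id, model="both")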
evalvault/adapters/inbound/api/routers/runs.py
@@ -21,7 +21,11 @@ from evalvault.adapters.outbound.dataset.templates import (
  )
  from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
  from evalvault.config.settings import get_settings
- from evalvault.domain.entities import EvaluationRun
+ from evalvault.domain.entities import (
+     CalibrationResult,
+     EvaluationRun,
+     SatisfactionFeedback,
+ )
  from evalvault.domain.services.domain_learning_hook import DomainLearningHook
  from evalvault.domain.services.ragas_prompt_overrides import (
      PromptOverrideError,
@@ -178,6 +182,31 @@ class ClusterMapDeleteResponse(BaseModel):
      deleted_count: int


+ class FeedbackSaveRequest(BaseModel):
+     test_case_id: str
+     satisfaction_score: float | None = None
+     thumb_feedback: Literal["up", "down", "none"] | None = None
+     comment: str | None = None
+     rater_id: str | None = None
+
+
+ class FeedbackResponse(BaseModel):
+     feedback_id: str
+     run_id: str
+     test_case_id: str
+     satisfaction_score: float | None = None
+     thumb_feedback: str | None = None
+     comment: str | None = None
+     rater_id: str | None = None
+     created_at: str | None = None
+
+
+ class FeedbackSummaryResponse(BaseModel):
+     avg_satisfaction_score: float | None = None
+     thumb_up_rate: float | None = None
+     total_feedback: int
+
+
  class VisualSpaceRequest(BaseModel):
      granularity: Literal["run", "case", "cluster"] = "case"
      base_run_id: str | None = None
@@ -188,9 +217,22 @@ class VisualSpaceRequest(BaseModel):
      cluster_map: dict[str, str] | None = None


- def _serialize_run_details(run: EvaluationRun) -> dict[str, Any]:
+ def _serialize_run_details(
+     run: EvaluationRun,
+     *,
+     calibration: CalibrationResult | None = None,
+ ) -> dict[str, Any]:
+     summary = run.to_summary_dict()
+     if calibration is not None:
+         summary.update(
+             {
+                 "avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
+                 "thumb_up_rate": calibration.summary.thumb_up_rate,
+                 "imputed_ratio": calibration.summary.imputed_ratio,
+             }
+         )
      payload = {
-         "summary": run.to_summary_dict(),
+         "summary": summary,
          "results": [
              {
                  "test_case_id": result.test_case_id,
@@ -207,6 +249,21 @@ def _serialize_run_details(run: EvaluationRun) -> dict[str, Any]:
                      }
                      for metric in result.metrics
                  ],
+                 "calibrated_satisfaction": (
+                     calibration.cases[result.test_case_id].calibrated_satisfaction
+                     if calibration and result.test_case_id in calibration.cases
+                     else None
+                 ),
+                 "imputed": (
+                     calibration.cases[result.test_case_id].imputed
+                     if calibration and result.test_case_id in calibration.cases
+                     else False
+                 ),
+                 "imputation_source": (
+                     calibration.cases[result.test_case_id].imputation_source
+                     if calibration and result.test_case_id in calibration.cases
+                     else None
+                 ),
              }
              for result in run.results
          ],
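For reference, the enriched run-details payload now looks roughly like the following sketch (not part of the diff; all values are placeholders and existing fields are elided):

# Approximate shape of the serialized run details after this change.
{
    "summary": {
        # ...existing summary fields...
        "avg_satisfaction_score": 4.1,
        "thumb_up_rate": 0.78,
        "imputed_ratio": 0.35,
    },
    "results": [
        {
            "test_case_id": "case-001",
            "metrics": [...],                 # unchanged per-metric entries
            "calibrated_satisfaction": 3.9,
            "imputed": True,
            "imputation_source": "linear",    # placeholder; source label comes from the calibration service
        },
    ],
}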
@@ -719,9 +776,12 @@ def compare_runs(
          }
      )

+     base_calibration = adapter.build_calibration(base_id)
+     target_calibration = adapter.build_calibration(target_id)
+
      return {
-         "base": _serialize_run_details(base_run),
-         "target": _serialize_run_details(target_run),
+         "base": _serialize_run_details(base_run, calibration=base_calibration),
+         "target": _serialize_run_details(target_run, calibration=target_calibration),
          "metric_deltas": metric_deltas,
          "case_counts": _build_case_counts(base_run, target_run),
          "pass_rate_delta": target_run.pass_rate - base_run.pass_rate,
@@ -898,7 +958,70 @@ def get_run_details(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
      """Get detailed information for a specific run."""
      try:
          run: EvaluationRun = adapter.get_run_details(run_id)
-         return _serialize_run_details(run)
+         calibration = adapter.build_calibration(run_id)
+         return _serialize_run_details(run, calibration=calibration)
+     except KeyError:
+         raise HTTPException(status_code=404, detail="Run not found")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post("/{run_id}/feedback", response_model=FeedbackResponse)
+ def save_feedback(
+     run_id: str,
+     request: FeedbackSaveRequest,
+     adapter: AdapterDep,
+ ) -> dict[str, Any]:
+     try:
+         adapter.get_run_details(run_id)
+         thumb_feedback = request.thumb_feedback
+         if thumb_feedback == "none":
+             thumb_feedback = None
+         satisfaction_score = request.satisfaction_score
+         if satisfaction_score is not None:
+             satisfaction_score = max(1.0, min(5.0, satisfaction_score))
+         feedback = SatisfactionFeedback(
+             feedback_id="",
+             run_id=run_id,
+             test_case_id=request.test_case_id,
+             satisfaction_score=satisfaction_score,
+             thumb_feedback=thumb_feedback,
+             comment=request.comment,
+             rater_id=request.rater_id,
+             created_at=datetime.now(),
+         )
+         feedback_id = adapter.save_feedback(feedback)
+         saved = feedback.to_dict()
+         saved["feedback_id"] = feedback_id
+         return saved
+     except KeyError:
+         raise HTTPException(status_code=404, detail="Run not found")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/{run_id}/feedback", response_model=list[FeedbackResponse])
+ def list_feedback(run_id: str, adapter: AdapterDep) -> list[dict[str, Any]]:
+     try:
+         adapter.get_run_details(run_id)
+         feedbacks = adapter.list_feedback(run_id)
+         return [feedback.to_dict() for feedback in feedbacks]
+     except KeyError:
+         raise HTTPException(status_code=404, detail="Run not found")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.get("/{run_id}/feedback/summary", response_model=FeedbackSummaryResponse)
+ def get_feedback_summary(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
+     try:
+         adapter.get_run_details(run_id)
+         summary = adapter.get_feedback_summary(run_id)
+         return {
+             "avg_satisfaction_score": summary.avg_satisfaction_score,
+             "thumb_up_rate": summary.thumb_up_rate,
+             "total_feedback": summary.total_feedback,
+         }
      except KeyError:
          raise HTTPException(status_code=404, detail="Run not found")
      except Exception as e:
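The new feedback endpoints can be exercised once the API is up. A minimal client sketch (not part of the diff); the base URL, router prefix, port, and IDs are assumptions not shown in this diff:

# Sketch only: base URL, router prefix, and the run/test-case IDs are assumptions.
import httpx

BASE = "http://localhost:8000/api/runs"
run_id = "run-1234"

# Record a satisfaction rating for one test case.
httpx.post(
    f"{BASE}/{run_id}/feedback",
    json={
        "test_case_id": "case-001",
        "satisfaction_score": 4.5,   # clamped server-side to the 1.0-5.0 range
        "thumb_feedback": "up",
        "comment": "Grounded answer, minor formatting issue.",
    },
)

# Read back individual feedback entries and the aggregate summary.
feedback = httpx.get(f"{BASE}/{run_id}/feedback").json()
summary = httpx.get(f"{BASE}/{run_id}/feedback/summary").json()
print(summary["avg_satisfaction_score"], summary["thumb_up_rate"], summary["total_feedback"])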
evalvault/adapters/inbound/cli/commands/__init__.py
@@ -13,6 +13,7 @@ from .agent import register_agent_commands
  from .analyze import register_analyze_commands
  from .api import register_api_command
  from .benchmark import create_benchmark_app
+ from .calibrate import register_calibrate_commands
  from .config import register_config_commands
  from .debug import create_debug_app
  from .domain import create_domain_app
@@ -61,6 +62,7 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
      CommandModule(register_pipeline_commands),
      CommandModule(register_history_commands),
      CommandModule(register_analyze_commands),
+     CommandModule(register_calibrate_commands),
      CommandModule(register_generate_commands),
      CommandModule(register_gate_commands),
      CommandModule(register_agent_commands),
evalvault/adapters/inbound/cli/commands/calibrate.py (new file)
@@ -0,0 +1,111 @@
+ from __future__ import annotations
+
+ from datetime import datetime
+ from pathlib import Path
+
+ import typer
+ from rich.console import Console
+ from rich.table import Table
+
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+ from evalvault.config.settings import Settings
+ from evalvault.domain.services.satisfaction_calibration_service import (
+     SatisfactionCalibrationService,
+ )
+
+ from ..utils.options import db_option
+
+ _console = Console()
+
+
+ def register_calibrate_commands(app: typer.Typer, console: Console) -> None:
+     global _console
+     _console = console
+
+     @app.command()
+     def calibrate(
+         run_id: str = typer.Argument(..., help="Run ID to calibrate"),
+         model: str = typer.Option(
+             "both", "--model", help="Model selection (linear|xgb|both)", show_default=True
+         ),
+         write_back: bool = typer.Option(
+             False,
+             "--write-back",
+             help="Write calibration results back to run metadata",
+             show_default=True,
+         ),
+         db_path: Path | None = db_option(help_text="DB path"),
+     ) -> None:
+         resolved_db_path = db_path or Settings().evalvault_db_path
+         if resolved_db_path is None:
+             _console.print("[red]Error: DB path is not configured.[/red]")
+             raise typer.Exit(1)
+
+         storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+         try:
+             run = storage.get_run(run_id)
+         except KeyError:
+             _console.print("[red]Error: run not found.[/red]")
+             raise typer.Exit(1)
+
+         normalized_model = model.lower()
+         if normalized_model not in {"linear", "xgb", "both"}:
+             _console.print("[red]Error: model must be one of linear|xgb|both.[/red]")
+             raise typer.Exit(1)
+
+         feedbacks = storage.list_feedback(run_id)
+         service = SatisfactionCalibrationService()
+         calibration = service.build_calibration(run, feedbacks, model=normalized_model)
+
+         table = Table(title="Calibration model performance summary")
+         table.add_column("Model")
+         table.add_column("MAE", justify="right")
+         table.add_column("Pearson", justify="right")
+         table.add_column("Spearman", justify="right")
+
+         if calibration.summary.model_metrics:
+             for model_name, metrics in calibration.summary.model_metrics.items():
+                 table.add_row(
+                     model_name,
+                     _format_metric(metrics.get("mae")),
+                     _format_metric(metrics.get("pearson")),
+                     _format_metric(metrics.get("spearman")),
+                 )
+         else:
+             table.add_row("N/A", "-", "-", "-")
+
+         _console.print(table)
+         _console.print(
+             f"Average satisfaction: {calibration.summary.avg_satisfaction_score} | "
+             f"Thumb-up rate: {calibration.summary.thumb_up_rate} | "
+             f"Imputed ratio: {calibration.summary.imputed_ratio}"
+         )
+
+         if write_back:
+             metadata = run.tracker_metadata or {}
+             metadata["calibration"] = {
+                 "updated_at": datetime.now().isoformat(),
+                 "model": model,
+                 "summary": {
+                     "avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
+                     "thumb_up_rate": calibration.summary.thumb_up_rate,
+                     "imputed_ratio": calibration.summary.imputed_ratio,
+                     "model_metrics": calibration.summary.model_metrics,
+                 },
+                 "cases": {
+                     case_id: {
+                         "calibrated_satisfaction": case.calibrated_satisfaction,
+                         "imputed": case.imputed,
+                         "imputation_source": case.imputation_source,
+                     }
+                     for case_id, case in calibration.cases.items()
+                 },
+             }
+             storage.update_run_metadata(run_id, metadata)
+             _console.print("[green]Calibration results saved to run metadata.[/green]")
+
+
+ def _format_metric(value: float | None) -> str:
+     if value is None:
+         return "-"
+     return f"{value:.3f}"
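A quick way to try the new command in-process is typer's test runner. A minimal sketch (not part of the diff); the run ID is a placeholder, and the DB path falls back to the configured Settings().evalvault_db_path unless the option defined by db_option is passed:

# Sketch only: "run-1234" is a placeholder run ID.
from typer.testing import CliRunner

from evalvault.adapters.inbound.cli import app

runner = CliRunner()
result = runner.invoke(app, ["calibrate", "run-1234", "--model", "both", "--write-back"])
print(result.exit_code)
print(result.output)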
evalvault/adapters/inbound/cli/commands/run.py
@@ -886,7 +886,7 @@ def register_run_commands(
              details=str(exc),
              fixes=[
                  "Check that Ollama is running: `ollama serve` (or launch the desktop app).",
-                 "Pull the required models: `ollama pull gemma3:1b`, `ollama pull qwen3-embedding:0.6b`.",
+                 "Pull the required models: `ollama pull gpt-oss-safeguard:20b`, `ollama pull qwen3-embedding:0.6b`.",
                  "If you changed the server URL, check `OLLAMA_BASE_URL` in .env.",
              ],
          )
@@ -1461,7 +1461,7 @@ def register_run_commands(
          if provider == "ollama":
              fixes = [
                  "Check that the Ollama server is running (default: http://localhost:11434).",
-                 "Pull the required models: `ollama pull gemma3:1b` and `ollama pull qwen3-embedding:0.6b`.",
+                 "Pull the required models: `ollama pull gpt-oss-safeguard:20b` and `ollama pull qwen3-embedding:0.6b`.",
                  "If you changed the URL, check `OLLAMA_BASE_URL` in .env.",
              ]
          elif provider == "openai":
evalvault/adapters/inbound/mcp/__init__.py (new file)
@@ -0,0 +1,51 @@
+ """MCP inbound adapter package."""
+
+ from .schemas import (
+     AnalyzeCompareRequest,
+     AnalyzeCompareResponse,
+     ComparisonArtifactsPayload,
+     EvaluationArtifactsPayload,
+     GetArtifactsRequest,
+     GetArtifactsResponse,
+     GetRunSummaryRequest,
+     GetRunSummaryResponse,
+     ListRunsRequest,
+     ListRunsResponse,
+     McpError,
+     MetricsDeltaPayload,
+     RunEvaluationRequest,
+     RunEvaluationResponse,
+     RunSummaryPayload,
+ )
+ from .tools import (
+     analyze_compare,
+     get_artifacts,
+     get_run_summary,
+     get_tool_specs,
+     list_runs,
+     run_evaluation,
+ )
+
+ __all__ = [
+     "AnalyzeCompareRequest",
+     "AnalyzeCompareResponse",
+     "ComparisonArtifactsPayload",
+     "EvaluationArtifactsPayload",
+     "GetArtifactsRequest",
+     "GetArtifactsResponse",
+     "GetRunSummaryRequest",
+     "GetRunSummaryResponse",
+     "ListRunsRequest",
+     "ListRunsResponse",
+     "McpError",
+     "MetricsDeltaPayload",
+     "RunEvaluationRequest",
+     "RunEvaluationResponse",
+     "RunSummaryPayload",
+     "analyze_compare",
+     "get_artifacts",
+     "get_run_summary",
+     "get_tool_specs",
+     "list_runs",
+     "run_evaluation",
+ ]
evalvault/adapters/inbound/mcp/schemas.py (new file)
@@ -0,0 +1,159 @@
+ from __future__ import annotations
+
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class ErrorStage(str, Enum):
+     preprocess = "preprocess"
+     evaluate = "evaluate"
+     analyze = "analyze"
+     compare = "compare"
+     storage = "storage"
+
+
+ class McpError(BaseModel):
+     code: str
+     message: str
+     details: dict[str, Any] | None = None
+     retryable: bool = False
+     stage: ErrorStage | None = None
+
+
+ class RunSummaryPayload(BaseModel):
+     run_id: str
+     dataset_name: str
+     model_name: str
+     pass_rate: float
+     total_test_cases: int
+     passed_test_cases: int
+     started_at: str
+     finished_at: str | None = None
+     metrics_evaluated: list[str] = Field(default_factory=list)
+     threshold_profile: str | None = None
+     run_mode: str | None = None
+     evaluation_task: str | None = None
+     project_name: str | None = None
+     avg_metric_scores: dict[str, float] | None = None
+     thresholds: dict[str, float] | None = None
+
+     model_config = ConfigDict(extra="allow")
+
+
+ class ListRunsRequest(BaseModel):
+     limit: int = Field(50, ge=1, le=500)
+     dataset_name: str | None = None
+     model_name: str | None = None
+     run_mode: str | None = None
+     project_names: list[str] | None = None
+     db_path: Path | None = None
+
+
+ class ListRunsResponse(BaseModel):
+     runs: list[RunSummaryPayload] = Field(default_factory=list)
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class GetRunSummaryRequest(BaseModel):
+     run_id: str
+     db_path: Path | None = None
+
+
+ class GetRunSummaryResponse(BaseModel):
+     summary: RunSummaryPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class ArtifactsKind(str, Enum):
+     analysis = "analysis"
+     comparison = "comparison"
+
+
+ class GetArtifactsRequest(BaseModel):
+     run_id: str
+     kind: ArtifactsKind = ArtifactsKind.analysis
+     comparison_run_id: str | None = None
+     base_dir: Path | None = None
+
+
+ class ArtifactsPayload(BaseModel):
+     kind: Literal["analysis", "comparison"]
+     report_path: str | None = None
+     output_path: str | None = None
+     artifacts_dir: str | None = None
+     artifacts_index_path: str | None = None
+
+
+ class GetArtifactsResponse(BaseModel):
+     run_id: str
+     artifacts: ArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class RunEvaluationRequest(BaseModel):
+     dataset_path: Path
+     metrics: list[str]
+     profile: str | None = None
+     model_name: str | None = None
+     evaluation_task: str | None = None
+     db_path: Path | None = None
+     thresholds: dict[str, float] | None = None
+     threshold_profile: str | None = None
+     parallel: bool = True
+     batch_size: int = 5
+     auto_analyze: bool = False
+     analysis_output: Path | None = None
+     analysis_report: Path | None = None
+     analysis_dir: Path | None = None
+
+
+ class EvaluationArtifactsPayload(BaseModel):
+     analysis_report_path: str | None = None
+     analysis_output_path: str | None = None
+     analysis_artifacts_dir: str | None = None
+     analysis_artifacts_index_path: str | None = None
+
+
+ class RunEvaluationResponse(BaseModel):
+     run_id: str
+     metrics: dict[str, float | None] = Field(default_factory=dict)
+     thresholds: dict[str, float] | None = None
+     artifacts: EvaluationArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class AnalyzeCompareRequest(BaseModel):
+     run_id_a: str
+     run_id_b: str
+     metrics: list[str] | None = None
+     test_type: Literal["t-test", "mann-whitney"] = "t-test"
+     profile: str | None = None
+     db_path: Path | None = None
+     output: Path | None = None
+     report: Path | None = None
+     output_dir: Path | None = None
+
+
+ class MetricsDeltaPayload(BaseModel):
+     avg: dict[str, float] = Field(default_factory=dict)
+     by_metric: dict[str, float] = Field(default_factory=dict)
+     notes: list[str] | None = None
+
+
+ class ComparisonArtifactsPayload(BaseModel):
+     json_path: str | None = None
+     report_path: str | None = None
+     artifacts_dir: str | None = None
+     artifacts_index_path: str | None = None
+
+
+ class AnalyzeCompareResponse(BaseModel):
+     baseline_run_id: str
+     candidate_run_id: str
+     comparison_report_path: str | None = None
+     metrics_delta: MetricsDeltaPayload = Field(default_factory=MetricsDeltaPayload)
+     artifacts: ComparisonArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
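These schemas define the request/response contract for the MCP tools added in evalvault/adapters/inbound/mcp/tools.py (new in this release but not shown above). A minimal sketch (not part of the diff) of a list_runs round trip; the exact call signature of list_runs is an assumption since tools.py is not included here, and the filter values are placeholders:

# Sketch only: list_runs' signature comes from tools.py, which is not shown
# in this diff; "insurance-qa" is a placeholder dataset name.
from evalvault.adapters.inbound.mcp import ListRunsRequest, list_runs

request = ListRunsRequest(limit=20, dataset_name="insurance-qa")
response = list_runs(request)  # assumed to return a ListRunsResponse

for run in response.runs:
    print(run.run_id, run.dataset_name, f"pass_rate={run.pass_rate:.2%}")
for error in response.errors:
    print(f"[{error.code}] {error.message} (retryable={error.retryable})")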