evalvault-1.60.0-py3-none-any.whl → evalvault-1.62.0-py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- evalvault/adapters/inbound/__init__.py +2 -1
- evalvault/adapters/inbound/api/adapter.py +29 -0
- evalvault/adapters/inbound/api/routers/runs.py +129 -6
- evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
- evalvault/adapters/inbound/cli/commands/calibrate.py +111 -0
- evalvault/adapters/inbound/cli/commands/run.py +2 -2
- evalvault/adapters/inbound/mcp/__init__.py +51 -0
- evalvault/adapters/inbound/mcp/schemas.py +159 -0
- evalvault/adapters/inbound/mcp/tools.py +710 -0
- evalvault/adapters/outbound/analysis/llm_report_module.py +605 -62
- evalvault/adapters/outbound/analysis/nlp_adapter.py +46 -2
- evalvault/adapters/outbound/analysis/nlp_analyzer_module.py +1 -1
- evalvault/adapters/outbound/storage/base_sql.py +91 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +22 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +14 -0
- evalvault/adapters/outbound/storage/schema.sql +15 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +25 -0
- evalvault/config/settings.py +1 -1
- evalvault/domain/entities/__init__.py +12 -0
- evalvault/domain/entities/feedback.py +58 -0
- evalvault/domain/services/satisfaction_calibration_service.py +328 -0
- evalvault/ports/inbound/web_port.py +1 -1
- evalvault/ports/outbound/storage_port.py +10 -0
- {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/METADATA +4 -2
- {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/RECORD +28 -22
- {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/WHEEL +0 -0
- {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.60.0.dist-info → evalvault-1.62.0.dist-info}/licenses/LICENSE.md +0 -0
--- a/evalvault/adapters/inbound/api/adapter.py
+++ b/evalvault/adapters/inbound/api/adapter.py
@@ -14,6 +14,11 @@ from urllib.request import urlopen
 
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
+from evalvault.domain.entities import (
+    CalibrationResult,
+    FeedbackSummary,
+    SatisfactionFeedback,
+)
 from evalvault.domain.entities.prompt import PromptSetBundle
 from evalvault.domain.metrics.registry import (
     get_metric_descriptions as registry_metric_descriptions,
@@ -29,6 +34,9 @@ from evalvault.domain.services.prompt_registry import (
     build_prompt_summary,
 )
 from evalvault.domain.services.prompt_status import extract_prompt_entries
+from evalvault.domain.services.satisfaction_calibration_service import (
+    SatisfactionCalibrationService,
+)
 from evalvault.domain.services.stage_event_builder import StageEventBuilder
 from evalvault.domain.services.stage_metric_service import StageMetricService
 from evalvault.domain.services.threshold_profiles import apply_threshold_profile
@@ -893,6 +901,27 @@ class WebUIAdapter:
             raise RuntimeError("Storage not configured")
         return self._storage.delete_run_cluster_map(run_id, map_id)
 
+    def save_feedback(self, feedback: SatisfactionFeedback) -> str:
+        if self._storage is None or not hasattr(self._storage, "save_feedback"):
+            raise RuntimeError("Storage not configured")
+        return self._storage.save_feedback(feedback)
+
+    def list_feedback(self, run_id: str) -> list[SatisfactionFeedback]:
+        if self._storage is None or not hasattr(self._storage, "list_feedback"):
+            raise RuntimeError("Storage not configured")
+        return self._storage.list_feedback(run_id)
+
+    def get_feedback_summary(self, run_id: str) -> FeedbackSummary:
+        if self._storage is None or not hasattr(self._storage, "get_feedback_summary"):
+            raise RuntimeError("Storage not configured")
+        return self._storage.get_feedback_summary(run_id)
+
+    def build_calibration(self, run_id: str, *, model: str = "both") -> CalibrationResult:
+        run = self.get_run_details(run_id)
+        feedbacks = self.list_feedback(run_id)
+        service = SatisfactionCalibrationService()
+        return service.build_calibration(run, feedbacks, model=model)
+
     def list_stage_events(self, run_id: str, *, stage_type: str | None = None) -> list[StageEvent]:
         """Stage 이벤트 목록 조회."""
         if self._storage is None or not hasattr(self._storage, "list_stage_events"):
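The four methods added above are the only new entry points on WebUIAdapter, so the API router and CLI below can stay storage-agnostic. A minimal usage sketch, assuming `adapter` is an already-configured WebUIAdapter with a storage backend attached; the run ID and model choice are illustrative, not taken from the diff:

# Hedged sketch of the new WebUIAdapter feedback/calibration surface.
feedbacks = adapter.list_feedback("run-123")                        # list[SatisfactionFeedback]
summary = adapter.get_feedback_summary("run-123")                   # FeedbackSummary
calibration = adapter.build_calibration("run-123", model="linear")  # CalibrationResult
print(summary.total_feedback, calibration.summary.imputed_ratio)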
--- a/evalvault/adapters/inbound/api/routers/runs.py
+++ b/evalvault/adapters/inbound/api/routers/runs.py
@@ -21,7 +21,11 @@ from evalvault.adapters.outbound.dataset.templates import (
 )
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
 from evalvault.config.settings import get_settings
-from evalvault.domain.entities import
+from evalvault.domain.entities import (
+    CalibrationResult,
+    EvaluationRun,
+    SatisfactionFeedback,
+)
 from evalvault.domain.services.domain_learning_hook import DomainLearningHook
 from evalvault.domain.services.ragas_prompt_overrides import (
     PromptOverrideError,
@@ -178,6 +182,31 @@ class ClusterMapDeleteResponse(BaseModel):
     deleted_count: int
 
 
+class FeedbackSaveRequest(BaseModel):
+    test_case_id: str
+    satisfaction_score: float | None = None
+    thumb_feedback: Literal["up", "down", "none"] | None = None
+    comment: str | None = None
+    rater_id: str | None = None
+
+
+class FeedbackResponse(BaseModel):
+    feedback_id: str
+    run_id: str
+    test_case_id: str
+    satisfaction_score: float | None = None
+    thumb_feedback: str | None = None
+    comment: str | None = None
+    rater_id: str | None = None
+    created_at: str | None = None
+
+
+class FeedbackSummaryResponse(BaseModel):
+    avg_satisfaction_score: float | None = None
+    thumb_up_rate: float | None = None
+    total_feedback: int
+
+
 class VisualSpaceRequest(BaseModel):
     granularity: Literal["run", "case", "cluster"] = "case"
     base_run_id: str | None = None
@@ -188,9 +217,22 @@ class VisualSpaceRequest(BaseModel):
     cluster_map: dict[str, str] | None = None
 
 
-def _serialize_run_details(
+def _serialize_run_details(
+    run: EvaluationRun,
+    *,
+    calibration: CalibrationResult | None = None,
+) -> dict[str, Any]:
+    summary = run.to_summary_dict()
+    if calibration is not None:
+        summary.update(
+            {
+                "avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
+                "thumb_up_rate": calibration.summary.thumb_up_rate,
+                "imputed_ratio": calibration.summary.imputed_ratio,
+            }
+        )
     payload = {
-        "summary":
+        "summary": summary,
         "results": [
             {
                 "test_case_id": result.test_case_id,
@@ -207,6 +249,21 @@ def _serialize_run_details(run: EvaluationRun) -> dict[str, Any]:
                     }
                     for metric in result.metrics
                 ],
+                "calibrated_satisfaction": (
+                    calibration.cases[result.test_case_id].calibrated_satisfaction
+                    if calibration and result.test_case_id in calibration.cases
+                    else None
+                ),
+                "imputed": (
+                    calibration.cases[result.test_case_id].imputed
+                    if calibration and result.test_case_id in calibration.cases
+                    else False
+                ),
+                "imputation_source": (
+                    calibration.cases[result.test_case_id].imputation_source
+                    if calibration and result.test_case_id in calibration.cases
+                    else None
+                ),
             }
             for result in run.results
         ],
@@ -719,9 +776,12 @@ def compare_runs(
         }
     )
 
+    base_calibration = adapter.build_calibration(base_id)
+    target_calibration = adapter.build_calibration(target_id)
+
     return {
-        "base": _serialize_run_details(base_run),
-        "target": _serialize_run_details(target_run),
+        "base": _serialize_run_details(base_run, calibration=base_calibration),
+        "target": _serialize_run_details(target_run, calibration=target_calibration),
         "metric_deltas": metric_deltas,
         "case_counts": _build_case_counts(base_run, target_run),
         "pass_rate_delta": target_run.pass_rate - base_run.pass_rate,
@@ -898,7 +958,70 @@ def get_run_details(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
     """Get detailed information for a specific run."""
     try:
         run: EvaluationRun = adapter.get_run_details(run_id)
-
+        calibration = adapter.build_calibration(run_id)
+        return _serialize_run_details(run, calibration=calibration)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/{run_id}/feedback", response_model=FeedbackResponse)
+def save_feedback(
+    run_id: str,
+    request: FeedbackSaveRequest,
+    adapter: AdapterDep,
+) -> dict[str, Any]:
+    try:
+        adapter.get_run_details(run_id)
+        thumb_feedback = request.thumb_feedback
+        if thumb_feedback == "none":
+            thumb_feedback = None
+        satisfaction_score = request.satisfaction_score
+        if satisfaction_score is not None:
+            satisfaction_score = max(1.0, min(5.0, satisfaction_score))
+        feedback = SatisfactionFeedback(
+            feedback_id="",
+            run_id=run_id,
+            test_case_id=request.test_case_id,
+            satisfaction_score=satisfaction_score,
+            thumb_feedback=thumb_feedback,
+            comment=request.comment,
+            rater_id=request.rater_id,
+            created_at=datetime.now(),
+        )
+        feedback_id = adapter.save_feedback(feedback)
+        saved = feedback.to_dict()
+        saved["feedback_id"] = feedback_id
+        return saved
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/{run_id}/feedback", response_model=list[FeedbackResponse])
+def list_feedback(run_id: str, adapter: AdapterDep) -> list[dict[str, Any]]:
+    try:
+        adapter.get_run_details(run_id)
+        feedbacks = adapter.list_feedback(run_id)
+        return [feedback.to_dict() for feedback in feedbacks]
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/{run_id}/feedback/summary", response_model=FeedbackSummaryResponse)
+def get_feedback_summary(run_id: str, adapter: AdapterDep) -> dict[str, Any]:
+    try:
+        adapter.get_run_details(run_id)
+        summary = adapter.get_feedback_summary(run_id)
+        return {
+            "avg_satisfaction_score": summary.avg_satisfaction_score,
+            "thumb_up_rate": summary.thumb_up_rate,
+            "total_feedback": summary.total_feedback,
+        }
     except KeyError:
         raise HTTPException(status_code=404, detail="Run not found")
     except Exception as e:
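Taken together, these hunks give the runs router three feedback endpoints (POST /{run_id}/feedback, GET /{run_id}/feedback, GET /{run_id}/feedback/summary) and fold calibration fields into the run-details and compare payloads. A hedged client-side sketch follows; the host and the prefix under which this router is mounted are not visible in this diff, so both are assumptions, while the JSON fields mirror FeedbackSaveRequest above:

import requests  # any HTTP client works; requests is used here for brevity

BASE = "http://localhost:8000/api/runs"  # host and router prefix are assumed, not shown in this diff
run_id = "run-123"                       # illustrative run ID

# Record feedback for one test case; satisfaction_score is clamped to [1.0, 5.0] server-side.
created = requests.post(
    f"{BASE}/{run_id}/feedback",
    json={
        "test_case_id": "tc-001",
        "satisfaction_score": 4.5,
        "thumb_feedback": "up",      # one of "up", "down", "none"
        "comment": "Accurate and well grounded.",
        "rater_id": "reviewer-1",
    },
).json()
print(created["feedback_id"])

# Aggregate view: average score, thumb-up rate, total count.
print(requests.get(f"{BASE}/{run_id}/feedback/summary").json())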
--- a/evalvault/adapters/inbound/cli/commands/__init__.py
+++ b/evalvault/adapters/inbound/cli/commands/__init__.py
@@ -13,6 +13,7 @@ from .agent import register_agent_commands
 from .analyze import register_analyze_commands
 from .api import register_api_command
 from .benchmark import create_benchmark_app
+from .calibrate import register_calibrate_commands
 from .config import register_config_commands
 from .debug import create_debug_app
 from .domain import create_domain_app
@@ -61,6 +62,7 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
     CommandModule(register_pipeline_commands),
     CommandModule(register_history_commands),
     CommandModule(register_analyze_commands),
+    CommandModule(register_calibrate_commands),
     CommandModule(register_generate_commands),
     CommandModule(register_gate_commands),
     CommandModule(register_agent_commands),
--- /dev/null
+++ b/evalvault/adapters/inbound/cli/commands/calibrate.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.domain.services.satisfaction_calibration_service import (
+    SatisfactionCalibrationService,
+)
+
+from ..utils.options import db_option
+
+_console = Console()
+
+
+def register_calibrate_commands(app: typer.Typer, console: Console) -> None:
+    global _console
+    _console = console
+
+    @app.command()
+    def calibrate(
+        run_id: str = typer.Argument(..., help="보정 대상 Run ID"),
+        model: str = typer.Option(
+            "both", "--model", help="모델 선택 (linear|xgb|both)", show_default=True
+        ),
+        write_back: bool = typer.Option(
+            False,
+            "--write-back",
+            help="보정 결과를 메타데이터에 저장",
+            show_default=True,
+        ),
+        db_path: Path | None = db_option(help_text="DB 경로"),
+    ) -> None:
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            _console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
+            raise typer.Exit(1)
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        try:
+            run = storage.get_run(run_id)
+        except KeyError:
+            _console.print("[red]오류: Run을 찾을 수 없습니다.[/red]")
+            raise typer.Exit(1)
+
+        normalized_model = model.lower()
+        if normalized_model not in {"linear", "xgb", "both"}:
+            _console.print("[red]오류: model은 linear|xgb|both 중 하나여야 합니다.[/red]")
+            raise typer.Exit(1)
+
+        feedbacks = storage.list_feedback(run_id)
+        service = SatisfactionCalibrationService()
+        calibration = service.build_calibration(run, feedbacks, model=normalized_model)
+
+        table = Table(title="보정 모델 성능 요약")
+        table.add_column("모델")
+        table.add_column("MAE", justify="right")
+        table.add_column("Pearson", justify="right")
+        table.add_column("Spearman", justify="right")
+
+        if calibration.summary.model_metrics:
+            for model_name, metrics in calibration.summary.model_metrics.items():
+                table.add_row(
+                    model_name,
+                    _format_metric(metrics.get("mae")),
+                    _format_metric(metrics.get("pearson")),
+                    _format_metric(metrics.get("spearman")),
+                )
+        else:
+            table.add_row("N/A", "-", "-", "-")
+
+        _console.print(table)
+        _console.print(
+            f"평균 만족도: {calibration.summary.avg_satisfaction_score} | "
+            f"Thumb Up 비율: {calibration.summary.thumb_up_rate} | "
+            f"보정 비율: {calibration.summary.imputed_ratio}"
+        )
+
+        if write_back:
+            metadata = run.tracker_metadata or {}
+            metadata["calibration"] = {
+                "updated_at": datetime.now().isoformat(),
+                "model": model,
+                "summary": {
+                    "avg_satisfaction_score": calibration.summary.avg_satisfaction_score,
+                    "thumb_up_rate": calibration.summary.thumb_up_rate,
+                    "imputed_ratio": calibration.summary.imputed_ratio,
+                    "model_metrics": calibration.summary.model_metrics,
+                },
+                "cases": {
+                    case_id: {
+                        "calibrated_satisfaction": case.calibrated_satisfaction,
+                        "imputed": case.imputed,
+                        "imputation_source": case.imputation_source,
+                    }
+                    for case_id, case in calibration.cases.items()
+                },
+            }
+            storage.update_run_metadata(run_id, metadata)
+            _console.print("[green]보정 결과를 메타데이터에 저장했습니다.[/green]")
+
+
+def _format_metric(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"
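The new module registers a `calibrate` command on the CLI app; with `--write-back` it persists the calibration summary and per-case values under the run's tracker_metadata["calibration"]. A hedged sketch of reading that block back, using the same SQLiteStorageAdapter API the command itself uses; the DB path and run ID are illustrative:

from pathlib import Path

from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter

# Assumes `evalvault calibrate <run-id> --write-back` has already been run against this DB.
storage = SQLiteStorageAdapter(db_path=Path("evalvault.db"))  # illustrative path
run = storage.get_run("run-123")                              # illustrative run ID
calibration_meta = (run.tracker_metadata or {}).get("calibration", {})
print(calibration_meta.get("model"))
print(calibration_meta.get("summary", {}).get("imputed_ratio"))
print(len(calibration_meta.get("cases", {})))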
--- a/evalvault/adapters/inbound/cli/commands/run.py
+++ b/evalvault/adapters/inbound/cli/commands/run.py
@@ -886,7 +886,7 @@ def register_run_commands(
            details=str(exc),
            fixes=[
                "Ollama가 실행 중인지 확인하세요: `ollama serve` (또는 데스크톱 앱 실행).",
-                "필요 모델을 받아두세요: `ollama pull
+                "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b`, `ollama pull qwen3-embedding:0.6b`.",
                "서버 URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
            ],
        )
@@ -1461,7 +1461,7 @@ def register_run_commands(
        if provider == "ollama":
            fixes = [
                "Ollama 서버가 실행 중인지 확인하세요 (기본: http://localhost:11434).",
-                "필요 모델을 받아두세요: `ollama pull
+                "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b` 및 `ollama pull qwen3-embedding:0.6b`.",
                "URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
            ]
        elif provider == "openai":
--- /dev/null
+++ b/evalvault/adapters/inbound/mcp/__init__.py
@@ -0,0 +1,51 @@
+"""MCP inbound adapter package."""
+
+from .schemas import (
+    AnalyzeCompareRequest,
+    AnalyzeCompareResponse,
+    ComparisonArtifactsPayload,
+    EvaluationArtifactsPayload,
+    GetArtifactsRequest,
+    GetArtifactsResponse,
+    GetRunSummaryRequest,
+    GetRunSummaryResponse,
+    ListRunsRequest,
+    ListRunsResponse,
+    McpError,
+    MetricsDeltaPayload,
+    RunEvaluationRequest,
+    RunEvaluationResponse,
+    RunSummaryPayload,
+)
+from .tools import (
+    analyze_compare,
+    get_artifacts,
+    get_run_summary,
+    get_tool_specs,
+    list_runs,
+    run_evaluation,
+)
+
+__all__ = [
+    "AnalyzeCompareRequest",
+    "AnalyzeCompareResponse",
+    "ComparisonArtifactsPayload",
+    "EvaluationArtifactsPayload",
+    "GetArtifactsRequest",
+    "GetArtifactsResponse",
+    "GetRunSummaryRequest",
+    "GetRunSummaryResponse",
+    "ListRunsRequest",
+    "ListRunsResponse",
+    "McpError",
+    "MetricsDeltaPayload",
+    "RunEvaluationRequest",
+    "RunEvaluationResponse",
+    "RunSummaryPayload",
+    "analyze_compare",
+    "get_artifacts",
+    "get_run_summary",
+    "get_tool_specs",
+    "list_runs",
+    "run_evaluation",
+]
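The package re-exports the request/response schemas alongside five tool functions (list_runs, get_run_summary, get_artifacts, run_evaluation, analyze_compare) plus get_tool_specs. The tools module itself (+710 lines) is not part of this section, so how each tool is invoked is an assumption; only the imports and the request model below come directly from the diff:

# Hedged sketch: the surface exported by evalvault.adapters.inbound.mcp.
from evalvault.adapters.inbound.mcp import ListRunsRequest, list_runs

request = ListRunsRequest(limit=20)  # field values are illustrative
# result = list_runs(request)       # call shape is assumed; tools.py is not shown in this diff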
--- /dev/null
+++ b/evalvault/adapters/inbound/mcp/schemas.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ErrorStage(str, Enum):
+    preprocess = "preprocess"
+    evaluate = "evaluate"
+    analyze = "analyze"
+    compare = "compare"
+    storage = "storage"
+
+
+class McpError(BaseModel):
+    code: str
+    message: str
+    details: dict[str, Any] | None = None
+    retryable: bool = False
+    stage: ErrorStage | None = None
+
+
+class RunSummaryPayload(BaseModel):
+    run_id: str
+    dataset_name: str
+    model_name: str
+    pass_rate: float
+    total_test_cases: int
+    passed_test_cases: int
+    started_at: str
+    finished_at: str | None = None
+    metrics_evaluated: list[str] = Field(default_factory=list)
+    threshold_profile: str | None = None
+    run_mode: str | None = None
+    evaluation_task: str | None = None
+    project_name: str | None = None
+    avg_metric_scores: dict[str, float] | None = None
+    thresholds: dict[str, float] | None = None
+
+    model_config = ConfigDict(extra="allow")
+
+
+class ListRunsRequest(BaseModel):
+    limit: int = Field(50, ge=1, le=500)
+    dataset_name: str | None = None
+    model_name: str | None = None
+    run_mode: str | None = None
+    project_names: list[str] | None = None
+    db_path: Path | None = None
+
+
+class ListRunsResponse(BaseModel):
+    runs: list[RunSummaryPayload] = Field(default_factory=list)
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class GetRunSummaryRequest(BaseModel):
+    run_id: str
+    db_path: Path | None = None
+
+
+class GetRunSummaryResponse(BaseModel):
+    summary: RunSummaryPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class ArtifactsKind(str, Enum):
+    analysis = "analysis"
+    comparison = "comparison"
+
+
+class GetArtifactsRequest(BaseModel):
+    run_id: str
+    kind: ArtifactsKind = ArtifactsKind.analysis
+    comparison_run_id: str | None = None
+    base_dir: Path | None = None
+
+
+class ArtifactsPayload(BaseModel):
+    kind: Literal["analysis", "comparison"]
+    report_path: str | None = None
+    output_path: str | None = None
+    artifacts_dir: str | None = None
+    artifacts_index_path: str | None = None
+
+
+class GetArtifactsResponse(BaseModel):
+    run_id: str
+    artifacts: ArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class RunEvaluationRequest(BaseModel):
+    dataset_path: Path
+    metrics: list[str]
+    profile: str | None = None
+    model_name: str | None = None
+    evaluation_task: str | None = None
+    db_path: Path | None = None
+    thresholds: dict[str, float] | None = None
+    threshold_profile: str | None = None
+    parallel: bool = True
+    batch_size: int = 5
+    auto_analyze: bool = False
+    analysis_output: Path | None = None
+    analysis_report: Path | None = None
+    analysis_dir: Path | None = None
+
+
+class EvaluationArtifactsPayload(BaseModel):
+    analysis_report_path: str | None = None
+    analysis_output_path: str | None = None
+    analysis_artifacts_dir: str | None = None
+    analysis_artifacts_index_path: str | None = None
+
+
+class RunEvaluationResponse(BaseModel):
+    run_id: str
+    metrics: dict[str, float | None] = Field(default_factory=dict)
+    thresholds: dict[str, float] | None = None
+    artifacts: EvaluationArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class AnalyzeCompareRequest(BaseModel):
+    run_id_a: str
+    run_id_b: str
+    metrics: list[str] | None = None
+    test_type: Literal["t-test", "mann-whitney"] = "t-test"
+    profile: str | None = None
+    db_path: Path | None = None
+    output: Path | None = None
+    report: Path | None = None
+    output_dir: Path | None = None
+
+
+class MetricsDeltaPayload(BaseModel):
+    avg: dict[str, float] = Field(default_factory=dict)
+    by_metric: dict[str, float] = Field(default_factory=dict)
+    notes: list[str] | None = None
+
+
+class ComparisonArtifactsPayload(BaseModel):
+    json_path: str | None = None
+    report_path: str | None = None
+    artifacts_dir: str | None = None
+    artifacts_index_path: str | None = None
+
+
+class AnalyzeCompareResponse(BaseModel):
+    baseline_run_id: str
+    candidate_run_id: str
+    comparison_report_path: str | None = None
+    metrics_delta: MetricsDeltaPayload = Field(default_factory=MetricsDeltaPayload)
+    artifacts: ComparisonArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
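Every response model carries an errors list of McpError rather than raising, so MCP clients can surface partial results alongside stage-tagged failures. A small sketch of that convention with illustrative values; the schemas use pydantic v2, as the ConfigDict usage above indicates:

from evalvault.adapters.inbound.mcp.schemas import ErrorStage, McpError, RunEvaluationResponse

# Illustrative payload only: run ID, metric names, and the error code are made up.
response = RunEvaluationResponse(
    run_id="run-123",
    metrics={"faithfulness": 0.91, "answer_relevancy": None},
    errors=[
        McpError(
            code="analysis_failed",
            message="auto-analyze step did not complete",
            retryable=True,
            stage=ErrorStage.analyze,
        )
    ],
)
print(response.model_dump_json(indent=2))  # pydantic v2 serialization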