evalvault 1.59.0__py3-none-any.whl → 1.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/__init__.py +2 -1
- evalvault/adapters/inbound/api/adapter.py +13 -25
- evalvault/adapters/inbound/api/routers/pipeline.py +16 -0
- evalvault/adapters/inbound/api/routers/runs.py +16 -0
- evalvault/adapters/inbound/cli/app.py +3 -13
- evalvault/adapters/inbound/cli/commands/config.py +4 -40
- evalvault/adapters/inbound/cli/commands/pipeline.py +4 -8
- evalvault/adapters/inbound/cli/commands/run.py +2 -2
- evalvault/adapters/inbound/mcp/__init__.py +51 -0
- evalvault/adapters/inbound/mcp/schemas.py +159 -0
- evalvault/adapters/inbound/mcp/tools.py +710 -0
- evalvault/adapters/outbound/analysis/llm_report_module.py +787 -35
- evalvault/config/settings.py +1 -1
- evalvault/domain/metrics/analysis_registry.py +217 -0
- evalvault/domain/metrics/registry.py +185 -0
- evalvault/domain/services/pipeline_template_registry.py +112 -0
- evalvault/ports/inbound/web_port.py +5 -1
- {evalvault-1.59.0.dist-info → evalvault-1.61.0.dist-info}/METADATA +2 -2
- {evalvault-1.59.0.dist-info → evalvault-1.61.0.dist-info}/RECORD +22 -17
- {evalvault-1.59.0.dist-info → evalvault-1.61.0.dist-info}/WHEEL +0 -0
- {evalvault-1.59.0.dist-info → evalvault-1.61.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.59.0.dist-info → evalvault-1.61.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/adapters/inbound/api/adapter.py

@@ -15,6 +15,13 @@ from urllib.request import urlopen
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
 from evalvault.domain.entities.prompt import PromptSetBundle
+from evalvault.domain.metrics.registry import (
+    get_metric_descriptions as registry_metric_descriptions,
+)
+from evalvault.domain.metrics.registry import (
+    list_metric_names,
+    list_metric_specs,
+)
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
@@ -42,21 +49,6 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-# 지원하는 메트릭 목록
-AVAILABLE_METRICS = [
-    "faithfulness",
-    "answer_relevancy",
-    "context_precision",
-    "context_recall",
-    "factual_correctness",
-    "semantic_similarity",
-    "summary_score",
-    "summary_faithfulness",
-    "insurance_term_accuracy",
-    "entity_preservation",
-    "contextual_relevancy",
-]
-
 
 @dataclass
 class GateResult:
@@ -978,19 +970,15 @@ class WebUIAdapter:
 
     def get_available_metrics(self) -> list[str]:
         """사용 가능한 메트릭 목록 반환."""
-        return AVAILABLE_METRICS
+        return list_metric_names()
+
+    def get_metric_specs(self) -> list[dict[str, object]]:
+        """메트릭 스펙 목록 반환."""
+        return [spec.to_dict() for spec in list_metric_specs()]
 
     def get_metric_descriptions(self) -> dict[str, str]:
         """메트릭별 설명 반환."""
-        return {
-            "faithfulness": "답변이 컨텍스트에 충실한지 평가",
-            "answer_relevancy": "답변이 질문과 관련있는지 평가",
-            "context_precision": "검색된 컨텍스트의 정밀도 평가",
-            "context_recall": "필요한 정보가 검색되었는지 평가",
-            "factual_correctness": "ground_truth 대비 사실적 정확성 평가",
-            "semantic_similarity": "답변과 ground_truth 간 의미적 유사도 평가",
-            "insurance_term_accuracy": "보험 용어 정확성 평가",
-        }
+        return registry_metric_descriptions()
 
     def create_dataset_from_upload(
         self,
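The registry module behind these changes, evalvault/domain/metrics/registry.py (+185 lines), is not expanded in this diff, but the call sites above and the MetricSpecResponse model added to runs.py below pin down its surface: list_metric_names(), list_metric_specs() returning spec objects with name, description, requires_ground_truth and a to_dict() method, and get_metric_descriptions(). A minimal sketch consistent with that usage (field defaults and the example entries are assumptions, not the package's actual code):

```python
# Hypothetical sketch of the metric registry surface implied by the call sites above.
from dataclasses import asdict, dataclass


@dataclass(frozen=True)
class MetricSpec:
    name: str
    description: str
    requires_ground_truth: bool = False
    requires_embeddings: bool = False
    source: str = "ragas"          # assumed default
    category: str = "rag"          # assumed default
    signal_group: str = "quality"  # assumed default

    def to_dict(self) -> dict[str, object]:
        return asdict(self)


_METRIC_SPECS: list[MetricSpec] = [
    MetricSpec("faithfulness", "답변이 컨텍스트에 충실한지 평가"),
    MetricSpec("context_precision", "검색된 컨텍스트의 정밀도 평가", requires_ground_truth=True),
    # ... one entry per supported metric
]


def list_metric_specs() -> list[MetricSpec]:
    return list(_METRIC_SPECS)


def list_metric_names() -> list[str]:
    return [spec.name for spec in _METRIC_SPECS]


def get_metric_descriptions() -> dict[str, str]:
    return {spec.name: spec.description for spec in _METRIC_SPECS}
```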

evalvault/adapters/inbound/api/routers/pipeline.py

@@ -11,6 +11,7 @@ from evalvault.adapters.outbound.llm import get_llm_adapter
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
+from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs
 from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
 
 router = APIRouter(tags=["pipeline"])
@@ -220,6 +221,15 @@ class PipelineResultResponse(PipelineResultSummary):
     final_output: dict[str, Any] | None = None
 
 
+class AnalysisMetricSpecResponse(BaseModel):
+    key: str
+    label: str
+    description: str
+    signal_group: str
+    module_id: str
+    output_path: list[str]
+
+
 def _serialize_payload(value: Any) -> Any:
     try:
         return jsonable_encoder(value)
@@ -366,6 +376,12 @@ async def list_intents():
         raise HTTPException(status_code=500, detail=str(exc)) from exc
 
 
+@router.get("/options/analysis-metric-specs", response_model=list[AnalysisMetricSpecResponse])
+async def list_analysis_metric_specs_endpoint():
+    """List analysis metric specs for pipeline outputs."""
+    return [spec.to_dict() for spec in list_analysis_metric_specs()]
+
+
 @router.post("/results", response_model=PipelineResultSummary)
 async def save_pipeline_result(payload: PipelineResultPayload):
     """Save a pipeline analysis result for history."""

evalvault/adapters/inbound/api/routers/runs.py

@@ -113,6 +113,16 @@ class ModelItemResponse(BaseModel):
     supports_tools: bool | None = None
 
 
+class MetricSpecResponse(BaseModel):
+    name: str
+    description: str
+    requires_ground_truth: bool
+    requires_embeddings: bool
+    source: str
+    category: str
+    signal_group: str
+
+
 class ClusterMapItemResponse(BaseModel):
     test_case_id: str
     cluster_id: str
@@ -395,6 +405,12 @@ def list_metrics(adapter: AdapterDep):
     return adapter.get_available_metrics()
 
 
+@router.get("/options/metric-specs", response_model=list[MetricSpecResponse])
+def list_metric_specs(adapter: AdapterDep):
+    """Get available metrics with metadata."""
+    return adapter.get_metric_specs()
+
+
 @router.get("/options/cluster-maps", response_model=list[ClusterMapFileResponse])
 def list_cluster_maps():
     """List available cluster map CSV files."""
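Both new read-only endpoints can be exercised with any HTTP client once the API is up. A hedged example follows; the base URL and the /api/runs and /api/pipeline mount prefixes are assumptions, since the router wiring is not part of this diff:

```python
# Hypothetical client calls; base URL and router prefixes are assumptions.
import httpx

with httpx.Client(base_url="http://localhost:8000") as client:
    # Metric specs served by runs.py, backed by evalvault.domain.metrics.registry
    for spec in client.get("/api/runs/options/metric-specs").json():
        print(spec["name"], "requires ground truth:", spec["requires_ground_truth"])

    # Analysis metric specs served by pipeline.py, backed by the analysis registry
    for spec in client.get("/api/pipeline/options/analysis-metric-specs").json():
        print(spec["key"], "->", ".".join(spec["output_path"]))
```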

evalvault/adapters/inbound/cli/app.py

@@ -14,6 +14,8 @@ import typer
 from rich import print as rprint
 from rich.console import Console
 
+from evalvault.domain.metrics.registry import list_metric_names
+
 from .commands import attach_sub_apps, register_all_commands
 
 
@@ -32,19 +34,7 @@ app = typer.Typer(
 )
 console = Console()
 
-AVAILABLE_METRICS = [
-    "faithfulness",
-    "answer_relevancy",
-    "context_precision",
-    "context_recall",
-    "factual_correctness",
-    "semantic_similarity",
-    "summary_score",
-    "summary_faithfulness",
-    "insurance_term_accuracy",
-    "entity_preservation",
-    "contextual_relevancy",
-]
+AVAILABLE_METRICS = list_metric_names()
 
 register_all_commands(app, console, available_metrics=AVAILABLE_METRICS)
 attach_sub_apps(app, console)

evalvault/adapters/inbound/cli/commands/config.py

@@ -7,6 +7,7 @@ from rich.console import Console
 from rich.table import Table
 
 from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.metrics.registry import list_metric_specs
 
 
 def register_config_commands(app: typer.Typer, console: Console) -> None:
@@ -22,46 +23,9 @@ def register_config_commands(app: typer.Typer, console: Console) -> None:
     table.add_column("Description")
     table.add_column("Requires Ground Truth", justify="center")
 
-    table.add_row(
-        "faithfulness",
-        "…",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "answer_relevancy",
-        "Measures how relevant the answer is to the question",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "context_precision",
-        "Measures ranking quality of retrieved contexts",
-        "[green]Yes[/green]",
-    )
-    table.add_row(
-        "context_recall",
-        "Measures if all relevant info is in retrieved contexts",
-        "[green]Yes[/green]",
-    )
-    table.add_row(
-        "summary_score",
-        "Measures summary coverage and conciseness against contexts",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "summary_faithfulness",
-        "Measures whether summary statements are grounded in contexts",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "entity_preservation",
-        "Measures preservation of key insurance entities in summaries",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "insurance_term_accuracy",
-        "Measures if insurance terms in answer are grounded in contexts",
-        "[red]No[/red]",
-    )
+    for spec in list_metric_specs():
+        needs_gt = "[green]Yes[/green]" if spec.requires_ground_truth else "[red]No[/red]"
+        table.add_row(spec.name, spec.description, needs_gt)
 
     console.print(table)
     console.print("\n[dim]Use --metrics flag with 'run' command to specify metrics.[/dim]")

evalvault/adapters/inbound/cli/commands/pipeline.py

@@ -13,6 +13,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
 from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
 from evalvault.config.settings import Settings
 
+from ..utils.analysis_io import serialize_pipeline_result
 from ..utils.options import db_option
 
 
@@ -104,15 +105,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
             console.print(f" [red]{node_id}:[/red] {node_result.error}")
 
     if output:
-        …
-        …
-            "intent": result.intent.value if result.intent else None,
-            "is_complete": result.is_complete,
-            "duration_ms": result.total_duration_ms,
-            "final_output": result.final_output,
-        }
+        payload = serialize_pipeline_result(result)
+        payload["query"] = query
         with open(output, "w", encoding="utf-8") as f:
-            json.dump(
+            json.dump(payload, f, ensure_ascii=False, indent=2)
         console.print(f"\n[green]Results saved to {output}[/green]")
 
     console.print()
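serialize_pipeline_result is imported from a ..utils.analysis_io helper that does not appear in this diff. Judging from the inline dict it replaces, a sketch could look like the following (anything beyond the four fields visible in the removed code is an assumption):

```python
# Hypothetical sketch of the helper; the real implementation is not shown in this diff.
from typing import Any


def serialize_pipeline_result(result: Any) -> dict[str, Any]:
    """Flatten a pipeline result into a JSON-serializable payload."""
    return {
        "intent": result.intent.value if result.intent else None,
        "is_complete": result.is_complete,
        "duration_ms": result.total_duration_ms,
        "final_output": result.final_output,
    }
```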

evalvault/adapters/inbound/cli/commands/run.py

@@ -886,7 +886,7 @@ def register_run_commands(
                 details=str(exc),
                 fixes=[
                     "Ollama가 실행 중인지 확인하세요: `ollama serve` (또는 데스크톱 앱 실행).",
-                    "필요 모델을 받아두세요: `ollama pull
+                    "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b`, `ollama pull qwen3-embedding:0.6b`.",
                     "서버 URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
                 ],
             )
@@ -1461,7 +1461,7 @@
     if provider == "ollama":
         fixes = [
             "Ollama 서버가 실행 중인지 확인하세요 (기본: http://localhost:11434).",
-            "필요 모델을 받아두세요: `ollama pull
+            "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b` 및 `ollama pull qwen3-embedding:0.6b`.",
             "URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
         ]
     elif provider == "openai":

evalvault/adapters/inbound/mcp/__init__.py (new file)

@@ -0,0 +1,51 @@
+"""MCP inbound adapter package."""
+
+from .schemas import (
+    AnalyzeCompareRequest,
+    AnalyzeCompareResponse,
+    ComparisonArtifactsPayload,
+    EvaluationArtifactsPayload,
+    GetArtifactsRequest,
+    GetArtifactsResponse,
+    GetRunSummaryRequest,
+    GetRunSummaryResponse,
+    ListRunsRequest,
+    ListRunsResponse,
+    McpError,
+    MetricsDeltaPayload,
+    RunEvaluationRequest,
+    RunEvaluationResponse,
+    RunSummaryPayload,
+)
+from .tools import (
+    analyze_compare,
+    get_artifacts,
+    get_run_summary,
+    get_tool_specs,
+    list_runs,
+    run_evaluation,
+)
+
+__all__ = [
+    "AnalyzeCompareRequest",
+    "AnalyzeCompareResponse",
+    "ComparisonArtifactsPayload",
+    "EvaluationArtifactsPayload",
+    "GetArtifactsRequest",
+    "GetArtifactsResponse",
+    "GetRunSummaryRequest",
+    "GetRunSummaryResponse",
+    "ListRunsRequest",
+    "ListRunsResponse",
+    "McpError",
+    "MetricsDeltaPayload",
+    "RunEvaluationRequest",
+    "RunEvaluationResponse",
+    "RunSummaryPayload",
+    "analyze_compare",
+    "get_artifacts",
+    "get_run_summary",
+    "get_tool_specs",
+    "list_runs",
+    "run_evaluation",
+]

evalvault/adapters/inbound/mcp/schemas.py (new file)

@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ErrorStage(str, Enum):
+    preprocess = "preprocess"
+    evaluate = "evaluate"
+    analyze = "analyze"
+    compare = "compare"
+    storage = "storage"
+
+
+class McpError(BaseModel):
+    code: str
+    message: str
+    details: dict[str, Any] | None = None
+    retryable: bool = False
+    stage: ErrorStage | None = None
+
+
+class RunSummaryPayload(BaseModel):
+    run_id: str
+    dataset_name: str
+    model_name: str
+    pass_rate: float
+    total_test_cases: int
+    passed_test_cases: int
+    started_at: str
+    finished_at: str | None = None
+    metrics_evaluated: list[str] = Field(default_factory=list)
+    threshold_profile: str | None = None
+    run_mode: str | None = None
+    evaluation_task: str | None = None
+    project_name: str | None = None
+    avg_metric_scores: dict[str, float] | None = None
+    thresholds: dict[str, float] | None = None
+
+    model_config = ConfigDict(extra="allow")
+
+
+class ListRunsRequest(BaseModel):
+    limit: int = Field(50, ge=1, le=500)
+    dataset_name: str | None = None
+    model_name: str | None = None
+    run_mode: str | None = None
+    project_names: list[str] | None = None
+    db_path: Path | None = None
+
+
+class ListRunsResponse(BaseModel):
+    runs: list[RunSummaryPayload] = Field(default_factory=list)
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class GetRunSummaryRequest(BaseModel):
+    run_id: str
+    db_path: Path | None = None
+
+
+class GetRunSummaryResponse(BaseModel):
+    summary: RunSummaryPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class ArtifactsKind(str, Enum):
+    analysis = "analysis"
+    comparison = "comparison"
+
+
+class GetArtifactsRequest(BaseModel):
+    run_id: str
+    kind: ArtifactsKind = ArtifactsKind.analysis
+    comparison_run_id: str | None = None
+    base_dir: Path | None = None
+
+
+class ArtifactsPayload(BaseModel):
+    kind: Literal["analysis", "comparison"]
+    report_path: str | None = None
+    output_path: str | None = None
+    artifacts_dir: str | None = None
+    artifacts_index_path: str | None = None
+
+
+class GetArtifactsResponse(BaseModel):
+    run_id: str
+    artifacts: ArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class RunEvaluationRequest(BaseModel):
+    dataset_path: Path
+    metrics: list[str]
+    profile: str | None = None
+    model_name: str | None = None
+    evaluation_task: str | None = None
+    db_path: Path | None = None
+    thresholds: dict[str, float] | None = None
+    threshold_profile: str | None = None
+    parallel: bool = True
+    batch_size: int = 5
+    auto_analyze: bool = False
+    analysis_output: Path | None = None
+    analysis_report: Path | None = None
+    analysis_dir: Path | None = None
+
+
+class EvaluationArtifactsPayload(BaseModel):
+    analysis_report_path: str | None = None
+    analysis_output_path: str | None = None
+    analysis_artifacts_dir: str | None = None
+    analysis_artifacts_index_path: str | None = None
+
+
+class RunEvaluationResponse(BaseModel):
+    run_id: str
+    metrics: dict[str, float | None] = Field(default_factory=dict)
+    thresholds: dict[str, float] | None = None
+    artifacts: EvaluationArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class AnalyzeCompareRequest(BaseModel):
+    run_id_a: str
+    run_id_b: str
+    metrics: list[str] | None = None
+    test_type: Literal["t-test", "mann-whitney"] = "t-test"
+    profile: str | None = None
+    db_path: Path | None = None
+    output: Path | None = None
+    report: Path | None = None
+    output_dir: Path | None = None
+
+
+class MetricsDeltaPayload(BaseModel):
+    avg: dict[str, float] = Field(default_factory=dict)
+    by_metric: dict[str, float] = Field(default_factory=dict)
+    notes: list[str] | None = None
+
+
+class ComparisonArtifactsPayload(BaseModel):
+    json_path: str | None = None
+    report_path: str | None = None
+    artifacts_dir: str | None = None
+    artifacts_index_path: str | None = None
+
+
+class AnalyzeCompareResponse(BaseModel):
+    baseline_run_id: str
+    candidate_run_id: str
+    comparison_report_path: str | None = None
+    metrics_delta: MetricsDeltaPayload = Field(default_factory=MetricsDeltaPayload)
+    artifacts: ComparisonArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)