evalvault 1.60.0__py3-none-any.whl → 1.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/__init__.py +2 -1
- evalvault/adapters/inbound/cli/commands/run.py +2 -2
- evalvault/adapters/inbound/mcp/__init__.py +51 -0
- evalvault/adapters/inbound/mcp/schemas.py +159 -0
- evalvault/adapters/inbound/mcp/tools.py +710 -0
- evalvault/adapters/outbound/analysis/llm_report_module.py +605 -62
- evalvault/config/settings.py +1 -1
- evalvault/ports/inbound/web_port.py +1 -1
- {evalvault-1.60.0.dist-info → evalvault-1.61.0.dist-info}/METADATA +2 -2
- {evalvault-1.60.0.dist-info → evalvault-1.61.0.dist-info}/RECORD +13 -10
- {evalvault-1.60.0.dist-info → evalvault-1.61.0.dist-info}/WHEEL +0 -0
- {evalvault-1.60.0.dist-info → evalvault-1.61.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.60.0.dist-info → evalvault-1.61.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/run.py

@@ -886,7 +886,7 @@ def register_run_commands(
                 details=str(exc),
                 fixes=[
                     "Ollama가 실행 중인지 확인하세요: `ollama serve` (또는 데스크톱 앱 실행).",
-                    "필요 모델을 받아두세요: `ollama pull
+                    "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b`, `ollama pull qwen3-embedding:0.6b`.",
                     "서버 URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
                 ],
             )
@@ -1461,7 +1461,7 @@ def register_run_commands(
        if provider == "ollama":
            fixes = [
                "Ollama 서버가 실행 중인지 확인하세요 (기본: http://localhost:11434).",
-                "필요 모델을 받아두세요: `ollama pull
+                "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b` 및 `ollama pull qwen3-embedding:0.6b`.",
                "URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
            ]
        elif provider == "openai":
evalvault/adapters/inbound/mcp/__init__.py (new file)

@@ -0,0 +1,51 @@
+"""MCP inbound adapter package."""
+
+from .schemas import (
+    AnalyzeCompareRequest,
+    AnalyzeCompareResponse,
+    ComparisonArtifactsPayload,
+    EvaluationArtifactsPayload,
+    GetArtifactsRequest,
+    GetArtifactsResponse,
+    GetRunSummaryRequest,
+    GetRunSummaryResponse,
+    ListRunsRequest,
+    ListRunsResponse,
+    McpError,
+    MetricsDeltaPayload,
+    RunEvaluationRequest,
+    RunEvaluationResponse,
+    RunSummaryPayload,
+)
+from .tools import (
+    analyze_compare,
+    get_artifacts,
+    get_run_summary,
+    get_tool_specs,
+    list_runs,
+    run_evaluation,
+)
+
+__all__ = [
+    "AnalyzeCompareRequest",
+    "AnalyzeCompareResponse",
+    "ComparisonArtifactsPayload",
+    "EvaluationArtifactsPayload",
+    "GetArtifactsRequest",
+    "GetArtifactsResponse",
+    "GetRunSummaryRequest",
+    "GetRunSummaryResponse",
+    "ListRunsRequest",
+    "ListRunsResponse",
+    "McpError",
+    "MetricsDeltaPayload",
+    "RunEvaluationRequest",
+    "RunEvaluationResponse",
+    "RunSummaryPayload",
+    "analyze_compare",
+    "get_artifacts",
+    "get_run_summary",
+    "get_tool_specs",
+    "list_runs",
+    "run_evaluation",
+]
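The new package re-exports the request/response schemas alongside the tool functions, so callers can import everything from `evalvault.adapters.inbound.mcp`. A minimal usage sketch follows (not part of the diff); it assumes the evalvault wheel and pydantic v2 are installed, and the dataset name and run mode values are purely illustrative:

```python
# Sketch: validate an MCP tool input via the package-level re-exports added in 1.61.0.
from pydantic import ValidationError

from evalvault.adapters.inbound.mcp import ListRunsRequest

# Request models give each MCP tool a typed, validated input schema.
req = ListRunsRequest(limit=10, dataset_name="insurance-qa", run_mode="full")
print(req.model_dump(exclude_none=True))
# -> {'limit': 10, 'dataset_name': 'insurance-qa', 'run_mode': 'full'}

# `limit` is declared as Field(50, ge=1, le=500), so out-of-range values are rejected.
try:
    ListRunsRequest(limit=0)
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # 'greater_than_equal'
```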
evalvault/adapters/inbound/mcp/schemas.py (new file)

@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ErrorStage(str, Enum):
+    preprocess = "preprocess"
+    evaluate = "evaluate"
+    analyze = "analyze"
+    compare = "compare"
+    storage = "storage"
+
+
+class McpError(BaseModel):
+    code: str
+    message: str
+    details: dict[str, Any] | None = None
+    retryable: bool = False
+    stage: ErrorStage | None = None
+
+
+class RunSummaryPayload(BaseModel):
+    run_id: str
+    dataset_name: str
+    model_name: str
+    pass_rate: float
+    total_test_cases: int
+    passed_test_cases: int
+    started_at: str
+    finished_at: str | None = None
+    metrics_evaluated: list[str] = Field(default_factory=list)
+    threshold_profile: str | None = None
+    run_mode: str | None = None
+    evaluation_task: str | None = None
+    project_name: str | None = None
+    avg_metric_scores: dict[str, float] | None = None
+    thresholds: dict[str, float] | None = None
+
+    model_config = ConfigDict(extra="allow")
+
+
+class ListRunsRequest(BaseModel):
+    limit: int = Field(50, ge=1, le=500)
+    dataset_name: str | None = None
+    model_name: str | None = None
+    run_mode: str | None = None
+    project_names: list[str] | None = None
+    db_path: Path | None = None
+
+
+class ListRunsResponse(BaseModel):
+    runs: list[RunSummaryPayload] = Field(default_factory=list)
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class GetRunSummaryRequest(BaseModel):
+    run_id: str
+    db_path: Path | None = None
+
+
+class GetRunSummaryResponse(BaseModel):
+    summary: RunSummaryPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class ArtifactsKind(str, Enum):
+    analysis = "analysis"
+    comparison = "comparison"
+
+
+class GetArtifactsRequest(BaseModel):
+    run_id: str
+    kind: ArtifactsKind = ArtifactsKind.analysis
+    comparison_run_id: str | None = None
+    base_dir: Path | None = None
+
+
+class ArtifactsPayload(BaseModel):
+    kind: Literal["analysis", "comparison"]
+    report_path: str | None = None
+    output_path: str | None = None
+    artifacts_dir: str | None = None
+    artifacts_index_path: str | None = None
+
+
+class GetArtifactsResponse(BaseModel):
+    run_id: str
+    artifacts: ArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class RunEvaluationRequest(BaseModel):
+    dataset_path: Path
+    metrics: list[str]
+    profile: str | None = None
+    model_name: str | None = None
+    evaluation_task: str | None = None
+    db_path: Path | None = None
+    thresholds: dict[str, float] | None = None
+    threshold_profile: str | None = None
+    parallel: bool = True
+    batch_size: int = 5
+    auto_analyze: bool = False
+    analysis_output: Path | None = None
+    analysis_report: Path | None = None
+    analysis_dir: Path | None = None
+
+
+class EvaluationArtifactsPayload(BaseModel):
+    analysis_report_path: str | None = None
+    analysis_output_path: str | None = None
+    analysis_artifacts_dir: str | None = None
+    analysis_artifacts_index_path: str | None = None
+
+
+class RunEvaluationResponse(BaseModel):
+    run_id: str
+    metrics: dict[str, float | None] = Field(default_factory=dict)
+    thresholds: dict[str, float] | None = None
+    artifacts: EvaluationArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)
+
+
+class AnalyzeCompareRequest(BaseModel):
+    run_id_a: str
+    run_id_b: str
+    metrics: list[str] | None = None
+    test_type: Literal["t-test", "mann-whitney"] = "t-test"
+    profile: str | None = None
+    db_path: Path | None = None
+    output: Path | None = None
+    report: Path | None = None
+    output_dir: Path | None = None
+
+
+class MetricsDeltaPayload(BaseModel):
+    avg: dict[str, float] = Field(default_factory=dict)
+    by_metric: dict[str, float] = Field(default_factory=dict)
+    notes: list[str] | None = None
+
+
+class ComparisonArtifactsPayload(BaseModel):
+    json_path: str | None = None
+    report_path: str | None = None
+    artifacts_dir: str | None = None
+    artifacts_index_path: str | None = None
+
+
+class AnalyzeCompareResponse(BaseModel):
+    baseline_run_id: str
+    candidate_run_id: str
+    comparison_report_path: str | None = None
+    metrics_delta: MetricsDeltaPayload = Field(default_factory=MetricsDeltaPayload)
+    artifacts: ComparisonArtifactsPayload | None = None
+    errors: list[McpError] = Field(default_factory=list)