evalvault-1.60.0-py3-none-any.whl → evalvault-1.61.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
  """Inbound adapters."""
 
  from evalvault.adapters.inbound.cli import app
+ from evalvault.adapters.inbound.mcp import tools as mcp_tools
 
- __all__ = ["app"]
+ __all__ = ["app", "mcp_tools"]
@@ -886,7 +886,7 @@ def register_run_commands(
  details=str(exc),
  fixes=[
  "Check that Ollama is running: `ollama serve` (or launch the desktop app).",
- "Pull the required models: `ollama pull gemma3:1b`, `ollama pull qwen3-embedding:0.6b`.",
+ "Pull the required models: `ollama pull gpt-oss-safeguard:20b`, `ollama pull qwen3-embedding:0.6b`.",
  "If you changed the server URL, check `OLLAMA_BASE_URL` in .env.",
  ],
  )
@@ -1461,7 +1461,7 @@ def register_run_commands(
  if provider == "ollama":
  fixes = [
  "Check that the Ollama server is running (default: http://localhost:11434).",
- "Pull the required models: `ollama pull gemma3:1b` and `ollama pull qwen3-embedding:0.6b`.",
+ "Pull the required models: `ollama pull gpt-oss-safeguard:20b` and `ollama pull qwen3-embedding:0.6b`.",
  "If you changed the URL, check `OLLAMA_BASE_URL` in .env.",
  ]
  elif provider == "openai":
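Both hunks swap the recommended generation model in the troubleshooting hints from `gemma3:1b` to `gpt-oss-safeguard:20b`. As a standalone illustration (not code from the package), the check those hints describe can be scripted against the standard Ollama REST endpoint `GET /api/tags`, honouring the same `OLLAMA_BASE_URL` variable:

    # Standalone sketch: verify the models named in the fix messages are already pulled.
    import json
    import os
    import urllib.request

    base_url = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
    required = {"gpt-oss-safeguard:20b", "qwen3-embedding:0.6b"}

    with urllib.request.urlopen(f"{base_url}/api/tags") as resp:  # lists locally available models
        installed = {model["name"] for model in json.load(resp).get("models", [])}

    for name in sorted(required - installed):
        print(f"missing model, run: ollama pull {name}")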
@@ -0,0 +1,51 @@
+ """MCP inbound adapter package."""
+
+ from .schemas import (
+     AnalyzeCompareRequest,
+     AnalyzeCompareResponse,
+     ComparisonArtifactsPayload,
+     EvaluationArtifactsPayload,
+     GetArtifactsRequest,
+     GetArtifactsResponse,
+     GetRunSummaryRequest,
+     GetRunSummaryResponse,
+     ListRunsRequest,
+     ListRunsResponse,
+     McpError,
+     MetricsDeltaPayload,
+     RunEvaluationRequest,
+     RunEvaluationResponse,
+     RunSummaryPayload,
+ )
+ from .tools import (
+     analyze_compare,
+     get_artifacts,
+     get_run_summary,
+     get_tool_specs,
+     list_runs,
+     run_evaluation,
+ )
+
+ __all__ = [
+     "AnalyzeCompareRequest",
+     "AnalyzeCompareResponse",
+     "ComparisonArtifactsPayload",
+     "EvaluationArtifactsPayload",
+     "GetArtifactsRequest",
+     "GetArtifactsResponse",
+     "GetRunSummaryRequest",
+     "GetRunSummaryResponse",
+     "ListRunsRequest",
+     "ListRunsResponse",
+     "McpError",
+     "MetricsDeltaPayload",
+     "RunEvaluationRequest",
+     "RunEvaluationResponse",
+     "RunSummaryPayload",
+     "analyze_compare",
+     "get_artifacts",
+     "get_run_summary",
+     "get_tool_specs",
+     "list_runs",
+     "run_evaluation",
+ ]
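The new package `__init__` re-exports six tool functions next to their request/response schemas. The tool signatures are not part of this diff, so the call convention below (request model in, response model out) is an assumption, shown only to indicate how the exports fit together:

    # Assumed call convention; only the imported names are confirmed by the diff.
    from evalvault.adapters.inbound.mcp import ListRunsRequest, list_runs

    response = list_runs(ListRunsRequest(limit=5, dataset_name="demo"))
    for run in response.runs:
        print(run.run_id, run.pass_rate)
    for error in response.errors:
        print("error:", error.code, error.message)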
@@ -0,0 +1,159 @@
+ from __future__ import annotations
+
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class ErrorStage(str, Enum):
+     preprocess = "preprocess"
+     evaluate = "evaluate"
+     analyze = "analyze"
+     compare = "compare"
+     storage = "storage"
+
+
+ class McpError(BaseModel):
+     code: str
+     message: str
+     details: dict[str, Any] | None = None
+     retryable: bool = False
+     stage: ErrorStage | None = None
+
+
+ class RunSummaryPayload(BaseModel):
+     run_id: str
+     dataset_name: str
+     model_name: str
+     pass_rate: float
+     total_test_cases: int
+     passed_test_cases: int
+     started_at: str
+     finished_at: str | None = None
+     metrics_evaluated: list[str] = Field(default_factory=list)
+     threshold_profile: str | None = None
+     run_mode: str | None = None
+     evaluation_task: str | None = None
+     project_name: str | None = None
+     avg_metric_scores: dict[str, float] | None = None
+     thresholds: dict[str, float] | None = None
+
+     model_config = ConfigDict(extra="allow")
+
+
+ class ListRunsRequest(BaseModel):
+     limit: int = Field(50, ge=1, le=500)
+     dataset_name: str | None = None
+     model_name: str | None = None
+     run_mode: str | None = None
+     project_names: list[str] | None = None
+     db_path: Path | None = None
+
+
+ class ListRunsResponse(BaseModel):
+     runs: list[RunSummaryPayload] = Field(default_factory=list)
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class GetRunSummaryRequest(BaseModel):
+     run_id: str
+     db_path: Path | None = None
+
+
+ class GetRunSummaryResponse(BaseModel):
+     summary: RunSummaryPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class ArtifactsKind(str, Enum):
+     analysis = "analysis"
+     comparison = "comparison"
+
+
+ class GetArtifactsRequest(BaseModel):
+     run_id: str
+     kind: ArtifactsKind = ArtifactsKind.analysis
+     comparison_run_id: str | None = None
+     base_dir: Path | None = None
+
+
+ class ArtifactsPayload(BaseModel):
+     kind: Literal["analysis", "comparison"]
+     report_path: str | None = None
+     output_path: str | None = None
+     artifacts_dir: str | None = None
+     artifacts_index_path: str | None = None
+
+
+ class GetArtifactsResponse(BaseModel):
+     run_id: str
+     artifacts: ArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class RunEvaluationRequest(BaseModel):
+     dataset_path: Path
+     metrics: list[str]
+     profile: str | None = None
+     model_name: str | None = None
+     evaluation_task: str | None = None
+     db_path: Path | None = None
+     thresholds: dict[str, float] | None = None
+     threshold_profile: str | None = None
+     parallel: bool = True
+     batch_size: int = 5
+     auto_analyze: bool = False
+     analysis_output: Path | None = None
+     analysis_report: Path | None = None
+     analysis_dir: Path | None = None
+
+
+ class EvaluationArtifactsPayload(BaseModel):
+     analysis_report_path: str | None = None
+     analysis_output_path: str | None = None
+     analysis_artifacts_dir: str | None = None
+     analysis_artifacts_index_path: str | None = None
+
+
+ class RunEvaluationResponse(BaseModel):
+     run_id: str
+     metrics: dict[str, float | None] = Field(default_factory=dict)
+     thresholds: dict[str, float] | None = None
+     artifacts: EvaluationArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class AnalyzeCompareRequest(BaseModel):
+     run_id_a: str
+     run_id_b: str
+     metrics: list[str] | None = None
+     test_type: Literal["t-test", "mann-whitney"] = "t-test"
+     profile: str | None = None
+     db_path: Path | None = None
+     output: Path | None = None
+     report: Path | None = None
+     output_dir: Path | None = None
+
+
+ class MetricsDeltaPayload(BaseModel):
+     avg: dict[str, float] = Field(default_factory=dict)
+     by_metric: dict[str, float] = Field(default_factory=dict)
+     notes: list[str] | None = None
+
+
+ class ComparisonArtifactsPayload(BaseModel):
+     json_path: str | None = None
+     report_path: str | None = None
+     artifacts_dir: str | None = None
+     artifacts_index_path: str | None = None
+
+
+ class AnalyzeCompareResponse(BaseModel):
+     baseline_run_id: str
+     candidate_run_id: str
+     comparison_report_path: str | None = None
+     metrics_delta: MetricsDeltaPayload = Field(default_factory=MetricsDeltaPayload)
+     artifacts: ComparisonArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
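The schemas are self-contained Pydantic v2 models (the `ConfigDict` import implies v2), so they can be exercised without the rest of the package. A minimal sketch, assuming this file is the `schemas` module referenced by the package `__init__` above; the field values are illustrative:

    from evalvault.adapters.inbound.mcp.schemas import (
        ErrorStage,
        ListRunsResponse,
        McpError,
        RunSummaryPayload,
    )

    summary = RunSummaryPayload(
        run_id="run-001",
        dataset_name="demo-dataset",
        model_name="gpt-oss-safeguard:20b",
        pass_rate=0.8,
        total_test_cases=10,
        passed_test_cases=8,
        started_at="2025-01-01T00:00:00Z",
        extra_note="kept verbatim",  # unknown keys survive because of ConfigDict(extra="allow")
    )
    failure = McpError(code="storage_error", message="run not found", stage=ErrorStage.storage)

    payload = ListRunsResponse(runs=[summary], errors=[failure]).model_dump(mode="json")
    print(payload["runs"][0]["run_id"], payload["errors"][0]["stage"])  # run-001 storage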