evalvault 1.59.0__py3-none-any.whl → 1.61.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
  """Inbound adapters."""

  from evalvault.adapters.inbound.cli import app
+ from evalvault.adapters.inbound.mcp import tools as mcp_tools

- __all__ = ["app"]
+ __all__ = ["app", "mcp_tools"]
@@ -15,6 +15,13 @@ from urllib.request import urlopen
  from evalvault.config.phoenix_support import PhoenixExperimentResolver
  from evalvault.config.settings import Settings
  from evalvault.domain.entities.prompt import PromptSetBundle
+ from evalvault.domain.metrics.registry import (
+     get_metric_descriptions as registry_metric_descriptions,
+ )
+ from evalvault.domain.metrics.registry import (
+     list_metric_names,
+     list_metric_specs,
+ )
  from evalvault.domain.services.cluster_map_builder import build_cluster_map
  from evalvault.domain.services.prompt_registry import (
      PromptInput,
@@ -42,21 +49,6 @@ if TYPE_CHECKING:

  logger = logging.getLogger(__name__)

- # List of supported metrics
- AVAILABLE_METRICS = [
-     "faithfulness",
-     "answer_relevancy",
-     "context_precision",
-     "context_recall",
-     "factual_correctness",
-     "semantic_similarity",
-     "summary_score",
-     "summary_faithfulness",
-     "insurance_term_accuracy",
-     "entity_preservation",
-     "contextual_relevancy",
- ]
-

  @dataclass
  class GateResult:
@@ -978,19 +970,15 @@ class WebUIAdapter:

      def get_available_metrics(self) -> list[str]:
          """Return the list of available metrics."""
-         return AVAILABLE_METRICS.copy()
+         return list_metric_names()
+
+     def get_metric_specs(self) -> list[dict[str, object]]:
+         """Return the list of metric specs."""
+         return [spec.to_dict() for spec in list_metric_specs()]

      def get_metric_descriptions(self) -> dict[str, str]:
          """Return a description for each metric."""
-         return {
-             "faithfulness": "Evaluates whether the answer is faithful to the contexts",
-             "answer_relevancy": "Evaluates whether the answer is relevant to the question",
-             "context_precision": "Evaluates the precision of the retrieved contexts",
-             "context_recall": "Evaluates whether the required information was retrieved",
-             "factual_correctness": "Evaluates factual correctness against the ground_truth",
-             "semantic_similarity": "Evaluates semantic similarity between the answer and the ground_truth",
-             "insurance_term_accuracy": "Evaluates the accuracy of insurance terminology",
-         }
+         return registry_metric_descriptions()

      def create_dataset_from_upload(
          self,
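For orientation, a minimal sketch of how the registry helpers referenced in this hunk can be consumed. `list_metric_names`, `list_metric_specs`, `get_metric_descriptions`, and `spec.to_dict()` all appear in this diff; the exact keys of the serialized dict are inferred from the `MetricSpecResponse` model further down and may differ.

    from evalvault.domain.metrics.registry import (
        get_metric_descriptions,
        list_metric_names,
        list_metric_specs,
    )

    # Plain metric names, as used for CLI choices and the /options/metrics endpoint.
    print(list_metric_names())

    # Structured specs; WebUIAdapter.get_metric_specs() serializes them with to_dict().
    for spec in list_metric_specs():
        row = spec.to_dict()  # keys assumed to mirror MetricSpecResponse (name, description, ...)
        print(row.get("name"), "requires ground truth:", row.get("requires_ground_truth"))

    # Per-metric descriptions, now sourced from the registry instead of a hard-coded dict.
    print(get_metric_descriptions())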
@@ -11,6 +11,7 @@ from evalvault.adapters.outbound.llm import get_llm_adapter
  from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
  from evalvault.config.settings import get_settings
  from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
+ from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs
  from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService

  router = APIRouter(tags=["pipeline"])
@@ -220,6 +221,15 @@ class PipelineResultResponse(PipelineResultSummary):
      final_output: dict[str, Any] | None = None


+ class AnalysisMetricSpecResponse(BaseModel):
+     key: str
+     label: str
+     description: str
+     signal_group: str
+     module_id: str
+     output_path: list[str]
+
+
  def _serialize_payload(value: Any) -> Any:
      try:
          return jsonable_encoder(value)
@@ -366,6 +376,12 @@ async def list_intents():
          raise HTTPException(status_code=500, detail=str(exc)) from exc


+ @router.get("/options/analysis-metric-specs", response_model=list[AnalysisMetricSpecResponse])
+ async def list_analysis_metric_specs_endpoint():
+     """List analysis metric specs for pipeline outputs."""
+     return [spec.to_dict() for spec in list_analysis_metric_specs()]
+
+
  @router.post("/results", response_model=PipelineResultSummary)
  async def save_pipeline_result(payload: PipelineResultPayload):
      """Save a pipeline analysis result for history."""
@@ -113,6 +113,16 @@ class ModelItemResponse(BaseModel):
      supports_tools: bool | None = None


+ class MetricSpecResponse(BaseModel):
+     name: str
+     description: str
+     requires_ground_truth: bool
+     requires_embeddings: bool
+     source: str
+     category: str
+     signal_group: str
+
+
  class ClusterMapItemResponse(BaseModel):
      test_case_id: str
      cluster_id: str
@@ -395,6 +405,12 @@ def list_metrics(adapter: AdapterDep):
      return adapter.get_available_metrics()


+ @router.get("/options/metric-specs", response_model=list[MetricSpecResponse])
+ def list_metric_specs(adapter: AdapterDep):
+     """Get available metrics with metadata."""
+     return adapter.get_metric_specs()
+
+
  @router.get("/options/cluster-maps", response_model=list[ClusterMapFileResponse])
  def list_cluster_maps():
      """List available cluster map CSV files."""
@@ -14,6 +14,8 @@ import typer
  from rich import print as rprint
  from rich.console import Console

+ from evalvault.domain.metrics.registry import list_metric_names
+
  from .commands import attach_sub_apps, register_all_commands


@@ -32,19 +34,7 @@ app = typer.Typer(
  )
  console = Console()

- AVAILABLE_METRICS: list[str] = [
-     "faithfulness",
-     "answer_relevancy",
-     "context_precision",
-     "context_recall",
-     "factual_correctness",
-     "semantic_similarity",
-     "summary_score",
-     "summary_faithfulness",
-     "insurance_term_accuracy",
-     "entity_preservation",
-     "contextual_relevancy",
- ]
+ AVAILABLE_METRICS = list_metric_names()

  register_all_commands(app, console, available_metrics=AVAILABLE_METRICS)
  attach_sub_apps(app, console)
@@ -7,6 +7,7 @@ from rich.console import Console
  from rich.table import Table

  from evalvault.config.settings import Settings, apply_profile
+ from evalvault.domain.metrics.registry import list_metric_specs


  def register_config_commands(app: typer.Typer, console: Console) -> None:
@@ -22,46 +23,9 @@ def register_config_commands(app: typer.Typer, console: Console) -> None:
          table.add_column("Description")
          table.add_column("Requires Ground Truth", justify="center")

-         table.add_row(
-             "faithfulness",
-             "Measures factual accuracy of the answer based on contexts",
-             "[red]No[/red]",
-         )
-         table.add_row(
-             "answer_relevancy",
-             "Measures how relevant the answer is to the question",
-             "[red]No[/red]",
-         )
-         table.add_row(
-             "context_precision",
-             "Measures ranking quality of retrieved contexts",
-             "[green]Yes[/green]",
-         )
-         table.add_row(
-             "context_recall",
-             "Measures if all relevant info is in retrieved contexts",
-             "[green]Yes[/green]",
-         )
-         table.add_row(
-             "summary_score",
-             "Measures summary coverage and conciseness against contexts",
-             "[red]No[/red]",
-         )
-         table.add_row(
-             "summary_faithfulness",
-             "Measures whether summary statements are grounded in contexts",
-             "[red]No[/red]",
-         )
-         table.add_row(
-             "entity_preservation",
-             "Measures preservation of key insurance entities in summaries",
-             "[red]No[/red]",
-         )
-         table.add_row(
-             "insurance_term_accuracy",
-             "Measures if insurance terms in answer are grounded in contexts",
-             "[red]No[/red]",
-         )
+         for spec in list_metric_specs():
+             needs_gt = "[green]Yes[/green]" if spec.requires_ground_truth else "[red]No[/red]"
+             table.add_row(spec.name, spec.description, needs_gt)

          console.print(table)
          console.print("\n[dim]Use --metrics flag with 'run' command to specify metrics.[/dim]")
@@ -13,6 +13,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
  from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
  from evalvault.config.settings import Settings

+ from ..utils.analysis_io import serialize_pipeline_result
  from ..utils.options import db_option


@@ -104,15 +105,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
                  console.print(f" [red]{node_id}:[/red] {node_result.error}")

          if output:
-             data = {
-                 "query": query,
-                 "intent": result.intent.value if result.intent else None,
-                 "is_complete": result.is_complete,
-                 "duration_ms": result.total_duration_ms,
-                 "final_output": result.final_output,
-             }
+             payload = serialize_pipeline_result(result)
+             payload["query"] = query
              with open(output, "w", encoding="utf-8") as f:
-                 json.dump(data, f, ensure_ascii=False, indent=2)
+                 json.dump(payload, f, ensure_ascii=False, indent=2)
              console.print(f"\n[green]Results saved to {output}[/green]")

          console.print()
@@ -886,7 +886,7 @@ def register_run_commands(
              details=str(exc),
              fixes=[
                  "Check that Ollama is running: `ollama serve` (or launch the desktop app).",
-                 "Pull the required models: `ollama pull gemma3:1b`, `ollama pull qwen3-embedding:0.6b`.",
+                 "Pull the required models: `ollama pull gpt-oss-safeguard:20b`, `ollama pull qwen3-embedding:0.6b`.",
                  "If you changed the server URL, check `OLLAMA_BASE_URL` in .env.",
              ],
          )
@@ -1461,7 +1461,7 @@ def register_run_commands(
          if provider == "ollama":
              fixes = [
                  "Check that the Ollama server is running (default: http://localhost:11434).",
-                 "Pull the required models: `ollama pull gemma3:1b` and `ollama pull qwen3-embedding:0.6b`.",
+                 "Pull the required models: `ollama pull gpt-oss-safeguard:20b` and `ollama pull qwen3-embedding:0.6b`.",
                  "If you changed the URL, check `OLLAMA_BASE_URL` in .env.",
              ]
          elif provider == "openai":
@@ -0,0 +1,51 @@
+ """MCP inbound adapter package."""
+
+ from .schemas import (
+     AnalyzeCompareRequest,
+     AnalyzeCompareResponse,
+     ComparisonArtifactsPayload,
+     EvaluationArtifactsPayload,
+     GetArtifactsRequest,
+     GetArtifactsResponse,
+     GetRunSummaryRequest,
+     GetRunSummaryResponse,
+     ListRunsRequest,
+     ListRunsResponse,
+     McpError,
+     MetricsDeltaPayload,
+     RunEvaluationRequest,
+     RunEvaluationResponse,
+     RunSummaryPayload,
+ )
+ from .tools import (
+     analyze_compare,
+     get_artifacts,
+     get_run_summary,
+     get_tool_specs,
+     list_runs,
+     run_evaluation,
+ )
+
+ __all__ = [
+     "AnalyzeCompareRequest",
+     "AnalyzeCompareResponse",
+     "ComparisonArtifactsPayload",
+     "EvaluationArtifactsPayload",
+     "GetArtifactsRequest",
+     "GetArtifactsResponse",
+     "GetRunSummaryRequest",
+     "GetRunSummaryResponse",
+     "ListRunsRequest",
+     "ListRunsResponse",
+     "McpError",
+     "MetricsDeltaPayload",
+     "RunEvaluationRequest",
+     "RunEvaluationResponse",
+     "RunSummaryPayload",
+     "analyze_compare",
+     "get_artifacts",
+     "get_run_summary",
+     "get_tool_specs",
+     "list_runs",
+     "run_evaluation",
+ ]
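The request and response models re-exported here are defined in the new schemas module that follows. The tool call signatures are not part of this diff, so the sketch below only constructs a request payload and shows the shared error convention; the field values are placeholders.

    from evalvault.adapters.inbound.mcp import ListRunsRequest, McpError

    # Request models validate MCP tool inputs; unset fields keep the defaults from the schemas.
    req = ListRunsRequest(limit=10, dataset_name="insurance_qa")  # dataset name is illustrative
    print(req.model_dump(exclude_none=True))

    # Each *Response schema carries an errors list, so failures can be reported per stage
    # instead of being raised.
    err = McpError(code="storage_error", message="run not found", retryable=False)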
@@ -0,0 +1,159 @@
+ from __future__ import annotations
+
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class ErrorStage(str, Enum):
+     preprocess = "preprocess"
+     evaluate = "evaluate"
+     analyze = "analyze"
+     compare = "compare"
+     storage = "storage"
+
+
+ class McpError(BaseModel):
+     code: str
+     message: str
+     details: dict[str, Any] | None = None
+     retryable: bool = False
+     stage: ErrorStage | None = None
+
+
+ class RunSummaryPayload(BaseModel):
+     run_id: str
+     dataset_name: str
+     model_name: str
+     pass_rate: float
+     total_test_cases: int
+     passed_test_cases: int
+     started_at: str
+     finished_at: str | None = None
+     metrics_evaluated: list[str] = Field(default_factory=list)
+     threshold_profile: str | None = None
+     run_mode: str | None = None
+     evaluation_task: str | None = None
+     project_name: str | None = None
+     avg_metric_scores: dict[str, float] | None = None
+     thresholds: dict[str, float] | None = None
+
+     model_config = ConfigDict(extra="allow")
+
+
+ class ListRunsRequest(BaseModel):
+     limit: int = Field(50, ge=1, le=500)
+     dataset_name: str | None = None
+     model_name: str | None = None
+     run_mode: str | None = None
+     project_names: list[str] | None = None
+     db_path: Path | None = None
+
+
+ class ListRunsResponse(BaseModel):
+     runs: list[RunSummaryPayload] = Field(default_factory=list)
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class GetRunSummaryRequest(BaseModel):
+     run_id: str
+     db_path: Path | None = None
+
+
+ class GetRunSummaryResponse(BaseModel):
+     summary: RunSummaryPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class ArtifactsKind(str, Enum):
+     analysis = "analysis"
+     comparison = "comparison"
+
+
+ class GetArtifactsRequest(BaseModel):
+     run_id: str
+     kind: ArtifactsKind = ArtifactsKind.analysis
+     comparison_run_id: str | None = None
+     base_dir: Path | None = None
+
+
+ class ArtifactsPayload(BaseModel):
+     kind: Literal["analysis", "comparison"]
+     report_path: str | None = None
+     output_path: str | None = None
+     artifacts_dir: str | None = None
+     artifacts_index_path: str | None = None
+
+
+ class GetArtifactsResponse(BaseModel):
+     run_id: str
+     artifacts: ArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class RunEvaluationRequest(BaseModel):
+     dataset_path: Path
+     metrics: list[str]
+     profile: str | None = None
+     model_name: str | None = None
+     evaluation_task: str | None = None
+     db_path: Path | None = None
+     thresholds: dict[str, float] | None = None
+     threshold_profile: str | None = None
+     parallel: bool = True
+     batch_size: int = 5
+     auto_analyze: bool = False
+     analysis_output: Path | None = None
+     analysis_report: Path | None = None
+     analysis_dir: Path | None = None
+
+
+ class EvaluationArtifactsPayload(BaseModel):
+     analysis_report_path: str | None = None
+     analysis_output_path: str | None = None
+     analysis_artifacts_dir: str | None = None
+     analysis_artifacts_index_path: str | None = None
+
+
+ class RunEvaluationResponse(BaseModel):
+     run_id: str
+     metrics: dict[str, float | None] = Field(default_factory=dict)
+     thresholds: dict[str, float] | None = None
+     artifacts: EvaluationArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
+
+
+ class AnalyzeCompareRequest(BaseModel):
+     run_id_a: str
+     run_id_b: str
+     metrics: list[str] | None = None
+     test_type: Literal["t-test", "mann-whitney"] = "t-test"
+     profile: str | None = None
+     db_path: Path | None = None
+     output: Path | None = None
+     report: Path | None = None
+     output_dir: Path | None = None
+
+
+ class MetricsDeltaPayload(BaseModel):
+     avg: dict[str, float] = Field(default_factory=dict)
+     by_metric: dict[str, float] = Field(default_factory=dict)
+     notes: list[str] | None = None
+
+
+ class ComparisonArtifactsPayload(BaseModel):
+     json_path: str | None = None
+     report_path: str | None = None
+     artifacts_dir: str | None = None
+     artifacts_index_path: str | None = None
+
+
+ class AnalyzeCompareResponse(BaseModel):
+     baseline_run_id: str
+     candidate_run_id: str
+     comparison_report_path: str | None = None
+     metrics_delta: MetricsDeltaPayload = Field(default_factory=MetricsDeltaPayload)
+     artifacts: ComparisonArtifactsPayload | None = None
+     errors: list[McpError] = Field(default_factory=list)
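To show how these schemas compose, a minimal round trip using only fields defined above; the paths, metric scores, and run id are placeholders.

    from pathlib import Path

    from evalvault.adapters.inbound.mcp.schemas import (
        RunEvaluationRequest,
        RunEvaluationResponse,
    )

    # Build a request; unset optional fields keep their defaults (parallel=True, batch_size=5, ...).
    request = RunEvaluationRequest(
        dataset_path=Path("datasets/sample.jsonl"),  # placeholder path
        metrics=["faithfulness", "answer_relevancy"],
        auto_analyze=True,
    )

    # A response round-trips through JSON; missing metric scores are allowed (float | None).
    raw = {
        "run_id": "run-123",  # placeholder id
        "metrics": {"faithfulness": 0.91, "answer_relevancy": None},
        "errors": [],
    }
    response = RunEvaluationResponse.model_validate(raw)
    print(response.model_dump_json(exclude_none=True))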