evalvault-1.64.0-py3-none-any.whl → evalvault-1.65.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/compare.py

@@ -0,0 +1,290 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import (
+    ComparisonPipelineAdapter,
+)
+from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
+from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.services.run_comparison_service import (
+    RunComparisonError,
+    RunComparisonRequest,
+    RunComparisonService,
+)
+
+from ..utils.analysis_io import (
+    build_comparison_scorecard,
+    resolve_artifact_dir,
+    resolve_output_paths,
+    serialize_pipeline_result,
+    write_json,
+    write_pipeline_artifacts,
+)
+from ..utils.console import print_cli_error
+from ..utils.options import db_option, profile_option
+from ..utils.validators import parse_csv_option, validate_choice
+
+
+def _coerce_test_type(value: str) -> str:
+    if value == "t-test":
+        return "t-test"
+    return "mann-whitney"
+
+
+def register_compare_commands(app: typer.Typer, console: Console) -> None:
+    @app.command(name="compare")
+    def compare(
+        run_id_a: str = typer.Argument(..., help="기준 Run ID"),
+        run_id_b: str = typer.Argument(..., help="비교 Run ID"),
+        metrics: str | None = typer.Option(
+            None,
+            "--metrics",
+            "-m",
+            help="비교할 메트릭 목록 (쉼표 구분)",
+        ),
+        test: str = typer.Option(
+            "t-test",
+            "--test",
+            "-t",
+            help="통계 검정 (t-test, mann-whitney)",
+        ),
+        output_format: str = typer.Option(
+            "table",
+            "--format",
+            "-f",
+            help="출력 형식 (table, json)",
+        ),
+        output: Path | None = typer.Option(None, "--output", "-o", help="JSON 출력 파일"),
+        report: Path | None = typer.Option(None, "--report", help="리포트 출력 파일"),
+        output_dir: Path | None = typer.Option(
+            None,
+            "--output-dir",
+            help="출력 기본 디렉터리",
+        ),
+        artifacts_dir: Path | None = typer.Option(
+            None,
+            "--artifacts-dir",
+            help="아티팩트 저장 디렉터리",
+        ),
+        parallel: bool = typer.Option(
+            False,
+            "--parallel/--no-parallel",
+            help="병렬 파이프라인 실행",
+        ),
+        concurrency: int | None = typer.Option(
+            None,
+            "--concurrency",
+            min=1,
+            help="병렬 실행 동시성 제한",
+        ),
+        db_path: Path | None = db_option(help_text="DB 경로"),
+        profile: str | None = profile_option(help_text="LLM 프로필"),
+    ) -> None:
+        validate_choice(test, ["t-test", "mann-whitney"], console, value_label="test")
+        validate_choice(output_format, ["table", "json"], console, value_label="format")
+
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            print_cli_error(console, "DB 경로가 설정되지 않았습니다.")
+            raise typer.Exit(1)
+
+        metric_list = parse_csv_option(metrics)
+        metric_list = metric_list or None
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        analysis_adapter = StatisticalAnalysisAdapter()
+
+        settings = Settings()
+        profile_name = profile or settings.evalvault_profile
+        if profile_name:
+            settings = apply_profile(settings, profile_name)
+
+        llm_adapter = None
+        try:
+            from evalvault.adapters.outbound.llm import get_llm_adapter
+
+            llm_adapter = get_llm_adapter(settings)
+        except Exception as exc:
+            console.print(f"[yellow]경고: LLM 어댑터 초기화 실패 ({exc})[/yellow]")
+
+        pipeline_service = build_analysis_pipeline_service(
+            storage=storage,
+            llm_adapter=llm_adapter,
+        )
+        pipeline_adapter = ComparisonPipelineAdapter(pipeline_service)
+
+        service = RunComparisonService(
+            storage=storage,
+            analysis_port=analysis_adapter,
+            pipeline_port=pipeline_adapter,
+        )
+
+        request = RunComparisonRequest(
+            run_id_a=run_id_a,
+            run_id_b=run_id_b,
+            metrics=metric_list,
+            test_type=_coerce_test_type(test),
+            parallel=parallel,
+            concurrency=concurrency,
+        )
+
+        try:
+            outcome = service.compare_runs(request)
+        except RunComparisonError as exc:
+            print_cli_error(console, str(exc))
+            raise typer.Exit(exc.exit_code) from exc
+
+        comparison_prefix = f"comparison_{run_id_a[:8]}_{run_id_b[:8]}"
+        resolved_base_dir = output_dir or Path("reports/comparison")
+        output_path, report_path = resolve_output_paths(
+            base_dir=resolved_base_dir,
+            output_path=output,
+            report_path=report,
+            prefix=comparison_prefix,
+        )
+        if artifacts_dir is not None:
+            resolved_artifacts_dir = artifacts_dir
+            resolved_artifacts_dir.mkdir(parents=True, exist_ok=True)
+        else:
+            resolved_artifacts_dir = resolve_artifact_dir(
+                base_dir=output_dir,
+                output_path=output_path,
+                report_path=report_path,
+                prefix=comparison_prefix,
+            )
+
+        artifact_index = write_pipeline_artifacts(
+            outcome.pipeline_result,
+            artifacts_dir=resolved_artifacts_dir,
+        )
+
+        payload = _build_envelope(outcome, artifact_index)
+        payload["run_ids"] = list(outcome.run_ids)
+        payload["data"] = serialize_pipeline_result(outcome.pipeline_result)
+        payload["data"]["run_ids"] = list(outcome.run_ids)
+        payload["data"]["artifacts"] = artifact_index
+        write_json(output_path, payload)
+        report_path.write_text(outcome.report_text, encoding="utf-8")
+
+        if output_format == "table":
+            _render_table(console, outcome)
+        else:
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+        if outcome.is_degraded:
+            console.print("[yellow]리포트가 일부 누락되었을 수 있습니다.[/yellow]")
+
+        console.print(f"[green]비교 결과 저장:[/green] {output_path}")
+        console.print(f"[green]비교 리포트 저장:[/green] {report_path}")
+        console.print(
+            "[green]비교 아티팩트 저장:[/green] "
+            f"{artifact_index['dir']} (index: {artifact_index['index']})"
+        )
+
+        if outcome.is_degraded:
+            raise typer.Exit(2)
+
+
+def _build_envelope(outcome, artifact_index: dict[str, str]) -> dict[str, object]:
+    return {
+        "command": "compare",
+        "version": 1,
+        "status": outcome.status,
+        "started_at": outcome.started_at.isoformat(),
+        "finished_at": outcome.finished_at.isoformat(),
+        "duration_ms": outcome.duration_ms,
+        "artifacts": {
+            "dir": artifact_index.get("dir"),
+            "index": artifact_index.get("index"),
+        },
+    }
+
+
+def _render_table(console: Console, outcome) -> None:
+    table = Table(title="통계 비교", show_header=True, header_style="bold cyan")
+    table.add_column("메트릭")
+    table.add_column("실행 A (평균)", justify="right")
+    table.add_column("실행 B (평균)", justify="right")
+    table.add_column("변화 (%)", justify="right")
+    table.add_column("p-값", justify="right")
+    table.add_column("효과 크기", justify="right")
+    table.add_column("유의")
+    table.add_column("승자")
+
+    for comparison in outcome.comparisons:
+        sig_style = "green" if comparison.is_significant else "dim"
+        winner = comparison.winner[:8] if comparison.winner else "-"
+        table.add_row(
+            comparison.metric,
+            f"{comparison.mean_a:.3f}",
+            f"{comparison.mean_b:.3f}",
+            f"{comparison.diff_percent:+.1f}%",
+            f"{comparison.p_value:.4f}",
+            f"{comparison.effect_size:.2f} ({comparison.effect_level.value})",
+            f"[{sig_style}]{'예' if comparison.is_significant else '아니오'}[/{sig_style}]",
+            winner,
+        )
+
+    console.print("\n[bold]실행 비교 결과[/bold]")
+    console.print(table)
+    console.print()
+
+    scorecard = build_comparison_scorecard(
+        outcome.pipeline_result.get_node_result("run_metric_comparison").output
+        if outcome.pipeline_result.get_node_result("run_metric_comparison")
+        else {}
+    )
+    if not scorecard:
+        return
+
+    summary_table = Table(title="비교 스코어카드", show_header=True, header_style="bold cyan")
+    summary_table.add_column("메트릭")
+    summary_table.add_column("A", justify="right")
+    summary_table.add_column("B", justify="right")
+    summary_table.add_column("차이", justify="right")
+    summary_table.add_column("p-값", justify="right")
+    summary_table.add_column("효과 크기", justify="right")
+    summary_table.add_column("유의 여부")
+
+    for row in scorecard:
+        effect_size = row.get("effect_size")
+        effect_level = row.get("effect_level")
+        effect_text = (
+            f"{effect_size:.2f} ({effect_level})"
+            if isinstance(effect_size, (float, int)) and effect_level
+            else "-"
+        )
+        summary_table.add_row(
+            str(row.get("metric") or "-"),
+            _format_float(row.get("mean_a")),
+            _format_float(row.get("mean_b")),
+            _format_float(row.get("diff"), signed=True),
+            _format_float(row.get("p_value")),
+            effect_text,
+            "예" if row.get("is_significant") else "아니오",
+        )
+
+    console.print(summary_table)
+    console.print()
+
+
+def _format_float(value: float | None, *, signed: bool = False) -> str:
+    if value is None:
+        return "-"
+    try:
+        if signed:
+            return f"{float(value):+.3f}"
+        return f"{float(value):.3f}"
+    except (TypeError, ValueError):
+        return "-"
+
+
+__all__ = ["register_compare_commands"]
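A plausible invocation of the new top-level `compare` command, based only on the arguments and options declared above; the run IDs and metric names are placeholders, not values from this diff:

    # Placeholder run IDs and metrics; compares two runs with a Mann-Whitney test and JSON output.
    # Results, report, and artifacts land under reports/comparison unless --output-dir overrides it.
    evalvault compare abc12345 def67890 --metrics faithfulness,answer_relevancy --test mann-whitney --format json

Per the code above, a missing DB path exits with code 1 and a degraded comparison exits with code 2, which CI wrappers may want to distinguish from success.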
evalvault/adapters/inbound/cli/commands/history.py

@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import json
 from pathlib import Path
+from typing import cast
 
 import typer
 from rich.console import Console
@@ -46,7 +47,7 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
             "--mode",
             help="Filter by run mode: 'simple' or 'full'.",
         ),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Show evaluation run history.
 
@@ -75,7 +76,6 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
 
         \b
        See also:
-            evalvault compare — Compare two runs side by side
            evalvault export — Export run details to JSON
            evalvault run — Create new evaluation runs
        """
@@ -88,7 +88,11 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
                "[red]Error:[/red] --mode must be one of: " + ", ".join(RUN_MODE_CHOICES)
            )
            raise typer.Exit(2)
-
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            console.print("[red]Error:[/red] Database path is not configured.")
+            raise typer.Exit(1)
+        storage = SQLiteStorageAdapter(db_path=cast(Path, resolved_db_path))
         runs = storage.list_runs(limit=limit, dataset_name=dataset, model_name=model)
         if normalized_mode:
             runs = [
@@ -157,86 +161,6 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
         console.print(table)
         console.print(f"\n[dim]Showing {len(runs)} of {limit} runs[/dim]\n")
 
-    @app.command()
-    def compare(
-        run_id1: str = typer.Argument(..., help="First run ID to compare."),
-        run_id2: str = typer.Argument(..., help="Second run ID to compare."),
-        db_path: Path = db_option(help_text="Path to database file."),
-    ) -> None:
-        """Compare two evaluation runs.
-
-        Show a side-by-side comparison of metrics, pass rates, and scores
-        between two evaluation runs.
-
-        \b
-        Examples:
-            # Compare two runs by ID
-            evalvault compare abc12345 def67890
-
-            # Compare runs from a custom database
-            evalvault compare abc12345 def67890 --db custom.db
-
-        \b
-        See also:
-            evalvault history — List runs to find IDs
-            evalvault export — Export run details to JSON
-            evalvault analyze — Deep analysis of a single run
-        """
-        console.print("\n[bold]Comparing Evaluation Runs[/bold]\n")
-
-        storage = SQLiteStorageAdapter(db_path=db_path)
-
-        try:
-            run1 = storage.get_run(run_id1)
-            run2 = storage.get_run(run_id2)
-        except KeyError as exc:
-            console.print(f"[red]Error:[/red] {exc}")
-            raise typer.Exit(1) from exc
-
-        table = Table(show_header=True, header_style="bold cyan")
-        table.add_column("Metric")
-        table.add_column(f"Run 1\n{run_id1[:12]}...", justify="right")
-        table.add_column(f"Run 2\n{run_id2[:12]}...", justify="right")
-        table.add_column("Difference", justify="right")
-
-        table.add_row("Dataset", run1.dataset_name, run2.dataset_name, "-")
-        table.add_row("Model", run1.model_name, run2.model_name, "-")
-        table.add_row(
-            "Test Cases",
-            str(run1.total_test_cases),
-            str(run2.total_test_cases),
-            str(run2.total_test_cases - run1.total_test_cases),
-        )
-
-        pass_rate_diff = run2.pass_rate - run1.pass_rate
-        diff_color = "green" if pass_rate_diff > 0 else "red" if pass_rate_diff < 0 else "dim"
-        table.add_row(
-            "Pass Rate",
-            f"{run1.pass_rate:.1%}",
-            f"{run2.pass_rate:.1%}",
-            f"[{diff_color}]{pass_rate_diff:+.1%}[/{diff_color}]",
-        )
-
-        for metric in run1.metrics_evaluated:
-            if metric in run2.metrics_evaluated:
-                score1 = run1.get_avg_score(metric)
-                score2 = run2.get_avg_score(metric)
-                diff = score2 - score1 if score1 is not None and score2 is not None else None
-                diff_str = (
-                    f"[{'green' if diff and diff > 0 else 'red' if diff and diff < 0 else 'dim'}]{diff:+.3f}[/{'green' if diff and diff > 0 else 'red' if diff and diff < 0 else 'dim'}]"
-                    if diff is not None
-                    else "-"
-                )
-                table.add_row(
-                    f"Avg {metric}",
-                    f"{score1:.3f}" if score1 is not None else "-",
-                    f"{score2:.3f}" if score2 is not None else "-",
-                    diff_str,
-                )
-
-        console.print(table)
-        console.print()
-
     @app.command(name="export")
     def export_cmd(
         run_id: str = typer.Argument(..., help="Run ID to export."),
@@ -246,7 +170,7 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
             "-o",
             help="Output file path (JSON format).",
         ),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Export evaluation run to JSON file.
 
@@ -275,7 +199,11 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
         """
         console.print(f"\n[bold]Exporting Run {run_id}[/bold]\n")
 
-
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            console.print("[red]Error:[/red] Database path is not configured.")
+            raise typer.Exit(1)
+        storage = SQLiteStorageAdapter(db_path=cast(Path, resolved_db_path))
 
         try:
             run = storage.get_run(run_id)
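With `db_path` now optional, `history` and `export` fall back to `Settings().evalvault_db_path` when `--db` is omitted; a minimal sketch, with the run ID and output file name as placeholders:

    # Placeholder run ID; both commands resolve the database path from settings when --db is not passed.
    evalvault history --mode full
    evalvault export abc12345 -o run_abc12345.json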
evalvault/adapters/inbound/cli/commands/ops.py

@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+
+from evalvault.adapters.outbound.filesystem.ops_snapshot_writer import OpsSnapshotWriter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.services.ops_snapshot_service import (
+    OpsSnapshotRequest,
+    OpsSnapshotService,
+)
+
+from ..utils.console import print_cli_error, progress_spinner
+from ..utils.options import db_option, profile_option
+
+
+def _resolve_storage_path(db_path: Path | None) -> Path:
+    if db_path is None:
+        return Path(Settings().evalvault_db_path)
+    return db_path
+
+
+def create_ops_app(console: Console) -> typer.Typer:
+    app = typer.Typer(name="ops", help="Ops utilities.")
+
+    @app.command("snapshot")
+    def snapshot(
+        run_id: str = typer.Option(..., "--run-id", help="Run ID to snapshot."),
+        profile: str | None = profile_option(help_text="Profile name to snapshot."),
+        db_path: Path | None = db_option(help_text="Path to SQLite database file."),
+        include_model_config: bool = typer.Option(
+            False,
+            "--include-model-config",
+            help="Include model profile configuration.",
+        ),
+        include_env: bool = typer.Option(
+            False,
+            "--include-env",
+            help="Include resolved settings snapshot.",
+        ),
+        redact: list[str] = typer.Option(
+            [],
+            "--redact",
+            help="Environment keys to redact (repeatable).",
+        ),
+        output_path: Path = typer.Option(
+            ..., "--output", "-o", help="Output JSON path for snapshot."
+        ),
+    ) -> None:
+        settings = Settings()
+        resolved_profile = profile or settings.evalvault_profile
+        if resolved_profile:
+            settings = apply_profile(settings, resolved_profile)
+
+        resolved_db_path = _resolve_storage_path(db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        writer = OpsSnapshotWriter()
+        service = OpsSnapshotService(
+            storage=storage,
+            writer=writer,
+            settings=settings,
+            output_path=output_path,
+        )
+        request = OpsSnapshotRequest(
+            run_id=run_id,
+            profile=resolved_profile,
+            db_path=resolved_db_path,
+            include_model_config=include_model_config,
+            include_env=include_env,
+            redact_keys=tuple(redact),
+        )
+
+        with progress_spinner(console, "Ops snapshot 생성 중..."):
+            started_at = datetime.now(UTC)
+            try:
+                envelope = service.collect(request)
+            except KeyError as exc:
+                print_cli_error(
+                    console,
+                    "Run을 찾지 못했습니다.",
+                    details=str(exc),
+                    fixes=["--run-id 값과 --db 경로를 확인하세요."],
+                )
+                raise typer.Exit(1) from exc
+            except Exception as exc:
+                print_cli_error(
+                    console,
+                    "Ops snapshot 생성 중 오류가 발생했습니다.",
+                    details=str(exc),
+                    fixes=["--output 경로와 파일 권한을 확인하세요."],
+                )
+                raise typer.Exit(1) from exc
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        console.print("[green]Ops snapshot 완료[/green]")
+        console.print(f"- output: {output_path}")
+        console.print(f"- duration_ms: {duration_ms}")
+        console.print(f"- status: {envelope.status}")
+        if envelope.data.get("model_config") is None and include_model_config:
+            console.print("[yellow]model_config을 찾지 못했습니다.[/yellow]")
+
+    return app
+
+
+__all__ = ["create_ops_app"]
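A plausible invocation of the new snapshot command, assuming the `ops` sub-app is mounted under the main `evalvault` CLI (wiring lives in commands/__init__.py, not shown here); the run ID, redacted key names, and output path are placeholders:

    # Placeholder run ID and key names; captures run metadata plus resolved settings, redacting sensitive keys.
    evalvault ops snapshot --run-id abc12345 --include-model-config --include-env --redact OPENAI_API_KEY --redact LANGFUSE_SECRET_KEY -o ops_snapshot.json

The `--redact` option is repeatable, and omitting `--db` falls back to the database path from Settings, mirroring the history/export behavior above.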