evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +282 -27
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/artifacts.py
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+import typer
+from rich.console import Console
+
+from evalvault.adapters.inbound.cli.utils.console import print_cli_error
+from evalvault.adapters.inbound.cli.utils.validators import validate_choice
+from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
+from evalvault.domain.services.artifact_lint_service import ArtifactLintService
+
+logger = logging.getLogger(__name__)
+
+
+def create_artifacts_app(console: Console) -> typer.Typer:
+    artifacts_app = typer.Typer(name="artifacts", help="Artifact utilities.")
+
+    @artifacts_app.command("lint")
+    def lint(
+        artifacts_dir: Path = typer.Argument(..., help="Artifacts directory."),
+        strict: bool = typer.Option(False, "--strict", help="Fail on missing files."),
+        output_format: str = typer.Option(
+            "json",
+            "--format",
+            "-f",
+            help="Output format (json).",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Output file path for lint result.",
+        ),
+        parallel: bool = typer.Option(
+            True,
+            "--parallel/--no-parallel",
+            help="Enable parallel validation (placeholder).",
+        ),
+        concurrency: int = typer.Option(
+            8,
+            "--concurrency",
+            min=1,
+            help="Parallel validation concurrency (placeholder).",
+        ),
+    ) -> None:
+        validate_choice(output_format, ["json"], console, value_label="format")
+
+        logger.info("Artifacts lint command started: %s", artifacts_dir)
+        fs_adapter = LocalArtifactFileSystemAdapter()
+        service = ArtifactLintService(fs_adapter)
+        summary = service.lint(artifacts_dir, strict=strict)
+
+        payload = _build_payload(summary, parallel=parallel, concurrency=concurrency)
+        if output:
+            output.parent.mkdir(parents=True, exist_ok=True)
+            output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+            console.print(f"[green]Lint report saved:[/green] {output}")
+        else:
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+        if summary.status == "error":
+            logger.error("Artifacts lint command failed: %s", artifacts_dir)
+            print_cli_error(console, "Artifact lint failed", details=str(artifacts_dir))
+            raise typer.Exit(1)
+
+        logger.info("Artifacts lint command finished: %s", artifacts_dir)
+
+    return artifacts_app
+
+
+def _build_payload(summary, *, parallel: bool, concurrency: int) -> dict[str, object]:
+    issues = [
+        {
+            "level": issue.level,
+            "code": issue.code,
+            "message": issue.message,
+            "path": issue.path,
+        }
+        for issue in summary.issues
+    ]
+    error_count = sum(1 for issue in summary.issues if issue.level == "error")
+    warning_count = sum(1 for issue in summary.issues if issue.level == "warning")
+    return {
+        "command": "artifacts.lint",
+        "version": 1,
+        "status": summary.status,
+        "started_at": summary.started_at.isoformat(),
+        "finished_at": summary.finished_at.isoformat(),
+        "duration_ms": summary.duration_ms,
+        "artifacts": {
+            "dir": str(summary.artifacts_dir),
+            "index": str(summary.index_path),
+        },
+        "data": {
+            "strict": summary.strict,
+            "parallel": parallel,
+            "concurrency": concurrency,
+            "issue_counts": {
+                "error": error_count,
+                "warning": warning_count,
+            },
+            "issues": issues,
+        },
+    }
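Note: the lint flow added above can also be exercised without the Typer layer. The sketch below is an illustration only, assembled from the calls visible in this hunk; the artifacts path is a placeholder, and the printed fields are the ones _build_payload serializes.

# Illustrative only: mirrors the calls made inside the `lint` command above.
from pathlib import Path

from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
from evalvault.domain.services.artifact_lint_service import ArtifactLintService

service = ArtifactLintService(LocalArtifactFileSystemAdapter())
summary = service.lint(Path("artifacts"), strict=True)  # "artifacts" is a placeholder path

print(summary.status)
for issue in summary.issues:
    print(issue.level, issue.code, issue.message, issue.path)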
evalvault/adapters/inbound/cli/commands/calibrate_judge.py
@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import resolve_artifact_dir, write_json
+from evalvault.adapters.inbound.cli.utils.console import print_cli_error, progress_spinner
+from evalvault.adapters.inbound.cli.utils.options import db_option
+from evalvault.adapters.inbound.cli.utils.validators import parse_csv_option, validate_choice
+from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+
+_console = Console()
+
+_ALLOWED_LABELS = ["feedback", "gold", "hybrid"]
+_ALLOWED_METHODS = ["platt", "isotonic", "temperature", "none"]
+
+
+def register_calibrate_judge_commands(app: typer.Typer, console: Console) -> None:
+    global _console
+    _console = console
+
+    @app.command(name="calibrate-judge")
+    def calibrate_judge(
+        run_id: str = typer.Argument(..., help="Run ID to calibrate"),
+        labels_source: str = typer.Option(
+            "feedback",
+            "--labels-source",
+            help="Label source (feedback|gold|hybrid)",
+        ),
+        method: str = typer.Option(
+            "isotonic",
+            "--method",
+            help="Calibration method (platt|isotonic|temperature|none)",
+        ),
+        metrics: str | None = typer.Option(
+            None,
+            "--metric",
+            "-m",
+            help="Metrics to calibrate (comma-separated; defaults to all run metrics)",
+        ),
+        holdout_ratio: float = typer.Option(
+            0.2,
+            "--holdout-ratio",
+            help="Holdout ratio for validation",
+        ),
+        seed: int = typer.Option(42, "--seed", help="Random seed for sample splitting"),
+        write_back: bool = typer.Option(
+            False,
+            "--write-back",
+            help="Store calibration results in run metadata",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Path for the JSON result file",
+        ),
+        artifacts_dir: Path | None = typer.Option(
+            None,
+            "--artifacts-dir",
+            help="Directory for artifacts",
+        ),
+        parallel: bool = typer.Option(
+            False,
+            "--parallel/--no-parallel",
+            help="Enable parallel execution",
+        ),
+        concurrency: int = typer.Option(8, "--concurrency", help="Concurrency level"),
+        db_path: Path | None = db_option(help_text="Database path"),
+    ) -> None:
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            print_cli_error(_console, "Database path is not configured.")
+            raise typer.Exit(1)
+
+        labels_source = labels_source.strip().lower()
+        method = method.strip().lower()
+        validate_choice(labels_source, _ALLOWED_LABELS, _console, value_label="labels-source")
+        validate_choice(method, _ALLOWED_METHODS, _console, value_label="method")
+
+        metric_list = parse_csv_option(metrics)
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            print_cli_error(_console, "--holdout-ratio must be between 0 and 1.")
+            raise typer.Exit(1)
+        if seed < 0:
+            print_cli_error(_console, "--seed must be 0 or greater.")
+            raise typer.Exit(1)
+        if concurrency <= 0:
+            print_cli_error(_console, "--concurrency must be 1 or greater.")
+            raise typer.Exit(1)
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        try:
+            run = storage.get_run(run_id)
+        except KeyError:
+            print_cli_error(_console, f"Run not found: {run_id}")
+            raise typer.Exit(1)
+
+        feedbacks = storage.list_feedback(run_id)
+        if labels_source in {"feedback", "hybrid"} and not feedbacks:
+            print_cli_error(_console, "No feedback labels found.")
+            raise typer.Exit(1)
+
+        resolved_metrics = metric_list or list(run.metrics_evaluated)
+        if not resolved_metrics:
+            print_cli_error(_console, "No metrics to calibrate.")
+            raise typer.Exit(1)
+
+        prefix = f"judge_calibration_{run_id}"
+        output_path = (output or Path("reports/calibration") / f"{prefix}.json").expanduser()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        resolved_artifacts_dir = resolve_artifact_dir(
+            base_dir=artifacts_dir,
+            output_path=output_path,
+            report_path=output_path,
+            prefix=prefix,
+        )
+
+        service = JudgeCalibrationService()
+        reporter = JudgeCalibrationReporter()
+        started_at = datetime.now(UTC)
+
+        with progress_spinner(_console, "Running judge calibration..."):
+            result = service.calibrate(
+                run,
+                feedbacks,
+                labels_source=labels_source,
+                method=method,
+                metrics=resolved_metrics,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+                parallel=parallel,
+                concurrency=concurrency,
+            )
+
+        artifacts_index = reporter.write_artifacts(
+            result=result,
+            artifacts_dir=resolved_artifacts_dir,
+        )
+        finished_at = datetime.now(UTC)
+        payload = _build_envelope(
+            result,
+            artifacts_index,
+            started_at=started_at,
+            finished_at=finished_at,
+        )
+        write_json(output_path, payload)
+
+        _display_summary(result)
+        _console.print(f"[green]JSON saved:[/green] {output_path}")
+        _console.print(
+            f"[green]Artifacts saved:[/green] {artifacts_index['dir']} (index: {artifacts_index['index']})"
+        )
+
+        if write_back:
+            metadata = run.tracker_metadata or {}
+            metadata["judge_calibration"] = reporter.render_json(result)
+            metadata["judge_calibration"]["artifacts"] = artifacts_index
+            metadata["judge_calibration"]["output"] = str(output_path)
+            storage.update_run_metadata(run_id, metadata)
+            _console.print("[green]Calibration results saved to run metadata.[/green]")
+
+        if result.summary.gate_passed is False:
+            raise typer.Exit(2)
+
+        return None
+
+
+def _build_envelope(
+    result,
+    artifacts_index: dict[str, str],
+    *,
+    started_at: datetime,
+    finished_at: datetime,
+) -> dict[str, object]:
+    duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+    status = "ok" if result.summary.gate_passed else "degraded"
+    return {
+        "command": "calibrate-judge",
+        "version": 1,
+        "status": status,
+        "started_at": started_at.astimezone(UTC).isoformat(),
+        "finished_at": finished_at.astimezone(UTC).isoformat(),
+        "duration_ms": duration_ms,
+        "artifacts": artifacts_index,
+        "data": {
+            "summary": _serialize_summary(result.summary),
+            "metrics": [_serialize_metric(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [_serialize_case(case) for case in cases]
+                for metric, cases in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        },
+    }
+
+
+def _serialize_summary(summary) -> dict[str, object]:
+    return {
+        "run_id": summary.run_id,
+        "labels_source": summary.labels_source,
+        "method": summary.method,
+        "metrics": list(summary.metrics),
+        "holdout_ratio": summary.holdout_ratio,
+        "seed": summary.seed,
+        "total_labels": summary.total_labels,
+        "total_samples": summary.total_samples,
+        "gate_passed": summary.gate_passed,
+        "gate_threshold": summary.gate_threshold,
+        "notes": list(summary.notes),
+    }
+
+
+def _serialize_metric(metric) -> dict[str, object]:
+    return {
+        "metric": metric.metric,
+        "method": metric.method,
+        "sample_count": metric.sample_count,
+        "label_count": metric.label_count,
+        "mae": metric.mae,
+        "pearson": metric.pearson,
+        "spearman": metric.spearman,
+        "temperature": metric.temperature,
+        "parameters": dict(metric.parameters),
+        "gate_passed": metric.gate_passed,
+        "warning": metric.warning,
+    }
+
+
+def _serialize_case(case) -> dict[str, object]:
+    return {
+        "test_case_id": case.test_case_id,
+        "raw_score": case.raw_score,
+        "calibrated_score": case.calibrated_score,
+        "label": case.label,
+        "label_source": case.label_source,
+    }
+
+
+def _display_summary(result) -> None:
+    summary_table = Table(title="Judge Calibration Summary", show_header=True, header_style="bold cyan")
+    summary_table.add_column("Metric")
+    summary_table.add_column("Samples", justify="right")
+    summary_table.add_column("Labels", justify="right")
+    summary_table.add_column("MAE", justify="right")
+    summary_table.add_column("Pearson", justify="right")
+    summary_table.add_column("Spearman", justify="right")
+    summary_table.add_column("Gate", justify="right")
+
+    for metric in result.metrics:
+        summary_table.add_row(
+            metric.metric,
+            str(metric.sample_count),
+            str(metric.label_count),
+            _format_metric(metric.mae),
+            _format_metric(metric.pearson),
+            _format_metric(metric.spearman),
+            "PASS" if metric.gate_passed else "FAIL",
+        )
+
+    _console.print(summary_table)
+    _console.print(
+        f"Label source: {result.summary.labels_source} | "
+        f"Method: {result.summary.method} | "
+        f"Gate: {'PASS' if result.summary.gate_passed else 'FAIL'}"
+    )
+
+    if result.warnings:
+        for warning in result.warnings:
+            _console.print(f"[yellow]Warning:[/yellow] {warning}")
+
+
+def _format_metric(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"
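Note: stripped of the CLI plumbing, the calibration flow added above reduces to the sketch below. It reuses only the calls visible in this hunk; the DB path and run ID are placeholders, not values from the package.

# Illustrative only: mirrors the calibrate-judge flow above without the Typer wiring.
from pathlib import Path

from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService

storage = SQLiteStorageAdapter(db_path=Path("evalvault.db"))  # placeholder path
run = storage.get_run("RUN_ID")  # placeholder run ID
feedbacks = storage.list_feedback("RUN_ID")

result = JudgeCalibrationService().calibrate(
    run,
    feedbacks,
    labels_source="feedback",
    method="isotonic",
    metrics=list(run.metrics_evaluated),
    holdout_ratio=0.2,
    seed=42,
    parallel=False,
    concurrency=8,
)
artifacts_index = JudgeCalibrationReporter().write_artifacts(
    result=result,
    artifacts_dir=Path("reports/calibration/artifacts"),  # placeholder directory
)
print(result.summary.gate_passed, artifacts_index["dir"])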
evalvault/adapters/inbound/cli/commands/compare.py
@@ -0,0 +1,290 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import (
+    ComparisonPipelineAdapter,
+)
+from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
+from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.services.run_comparison_service import (
+    RunComparisonError,
+    RunComparisonRequest,
+    RunComparisonService,
+)
+
+from ..utils.analysis_io import (
+    build_comparison_scorecard,
+    resolve_artifact_dir,
+    resolve_output_paths,
+    serialize_pipeline_result,
+    write_json,
+    write_pipeline_artifacts,
+)
+from ..utils.console import print_cli_error
+from ..utils.options import db_option, profile_option
+from ..utils.validators import parse_csv_option, validate_choice
+
+
+def _coerce_test_type(value: str) -> str:
+    if value == "t-test":
+        return "t-test"
+    return "mann-whitney"
+
+
+def register_compare_commands(app: typer.Typer, console: Console) -> None:
+    @app.command(name="compare")
+    def compare(
+        run_id_a: str = typer.Argument(..., help="Baseline run ID"),
+        run_id_b: str = typer.Argument(..., help="Run ID to compare"),
+        metrics: str | None = typer.Option(
+            None,
+            "--metrics",
+            "-m",
+            help="Metrics to compare (comma-separated)",
+        ),
+        test: str = typer.Option(
+            "t-test",
+            "--test",
+            "-t",
+            help="Statistical test (t-test, mann-whitney)",
+        ),
+        output_format: str = typer.Option(
+            "table",
+            "--format",
+            "-f",
+            help="Output format (table, json)",
+        ),
+        output: Path | None = typer.Option(None, "--output", "-o", help="JSON output file"),
+        report: Path | None = typer.Option(None, "--report", help="Report output file"),
+        output_dir: Path | None = typer.Option(
+            None,
+            "--output-dir",
+            help="Base output directory",
+        ),
+        artifacts_dir: Path | None = typer.Option(
+            None,
+            "--artifacts-dir",
+            help="Directory for artifacts",
+        ),
+        parallel: bool = typer.Option(
+            False,
+            "--parallel/--no-parallel",
+            help="Run the analysis pipeline in parallel",
+        ),
+        concurrency: int | None = typer.Option(
+            None,
+            "--concurrency",
+            min=1,
+            help="Concurrency limit for parallel execution",
+        ),
+        db_path: Path | None = db_option(help_text="Database path"),
+        profile: str | None = profile_option(help_text="LLM profile"),
+    ) -> None:
+        validate_choice(test, ["t-test", "mann-whitney"], console, value_label="test")
+        validate_choice(output_format, ["table", "json"], console, value_label="format")
+
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            print_cli_error(console, "Database path is not configured.")
+            raise typer.Exit(1)
+
+        metric_list = parse_csv_option(metrics)
+        metric_list = metric_list or None
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        analysis_adapter = StatisticalAnalysisAdapter()
+
+        settings = Settings()
+        profile_name = profile or settings.evalvault_profile
+        if profile_name:
+            settings = apply_profile(settings, profile_name)
+
+        llm_adapter = None
+        try:
+            from evalvault.adapters.outbound.llm import get_llm_adapter
+
+            llm_adapter = get_llm_adapter(settings)
+        except Exception as exc:
+            console.print(f"[yellow]Warning: failed to initialize LLM adapter ({exc})[/yellow]")
+
+        pipeline_service = build_analysis_pipeline_service(
+            storage=storage,
+            llm_adapter=llm_adapter,
+        )
+        pipeline_adapter = ComparisonPipelineAdapter(pipeline_service)
+
+        service = RunComparisonService(
+            storage=storage,
+            analysis_port=analysis_adapter,
+            pipeline_port=pipeline_adapter,
+        )
+
+        request = RunComparisonRequest(
+            run_id_a=run_id_a,
+            run_id_b=run_id_b,
+            metrics=metric_list,
+            test_type=_coerce_test_type(test),
+            parallel=parallel,
+            concurrency=concurrency,
+        )
+
+        try:
+            outcome = service.compare_runs(request)
+        except RunComparisonError as exc:
+            print_cli_error(console, str(exc))
+            raise typer.Exit(exc.exit_code) from exc
+
+        comparison_prefix = f"comparison_{run_id_a[:8]}_{run_id_b[:8]}"
+        resolved_base_dir = output_dir or Path("reports/comparison")
+        output_path, report_path = resolve_output_paths(
+            base_dir=resolved_base_dir,
+            output_path=output,
+            report_path=report,
+            prefix=comparison_prefix,
+        )
+        if artifacts_dir is not None:
+            resolved_artifacts_dir = artifacts_dir
+            resolved_artifacts_dir.mkdir(parents=True, exist_ok=True)
+        else:
+            resolved_artifacts_dir = resolve_artifact_dir(
+                base_dir=output_dir,
+                output_path=output_path,
+                report_path=report_path,
+                prefix=comparison_prefix,
+            )
+
+        artifact_index = write_pipeline_artifacts(
+            outcome.pipeline_result,
+            artifacts_dir=resolved_artifacts_dir,
+        )
+
+        payload = _build_envelope(outcome, artifact_index)
+        payload["run_ids"] = list(outcome.run_ids)
+        payload["data"] = serialize_pipeline_result(outcome.pipeline_result)
+        payload["data"]["run_ids"] = list(outcome.run_ids)
+        payload["data"]["artifacts"] = artifact_index
+        write_json(output_path, payload)
+        report_path.write_text(outcome.report_text, encoding="utf-8")
+
+        if output_format == "table":
+            _render_table(console, outcome)
+        else:
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+        if outcome.is_degraded:
+            console.print("[yellow]Some report sections may be missing.[/yellow]")
+
+        console.print(f"[green]Comparison results saved:[/green] {output_path}")
+        console.print(f"[green]Comparison report saved:[/green] {report_path}")
+        console.print(
+            "[green]Comparison artifacts saved:[/green] "
+            f"{artifact_index['dir']} (index: {artifact_index['index']})"
+        )
+
+        if outcome.is_degraded:
+            raise typer.Exit(2)
+
+
+def _build_envelope(outcome, artifact_index: dict[str, str]) -> dict[str, object]:
+    return {
+        "command": "compare",
+        "version": 1,
+        "status": outcome.status,
+        "started_at": outcome.started_at.isoformat(),
+        "finished_at": outcome.finished_at.isoformat(),
+        "duration_ms": outcome.duration_ms,
+        "artifacts": {
+            "dir": artifact_index.get("dir"),
+            "index": artifact_index.get("index"),
+        },
+    }
+
+
+def _render_table(console: Console, outcome) -> None:
+    table = Table(title="Statistical Comparison", show_header=True, header_style="bold cyan")
+    table.add_column("Metric")
+    table.add_column("Run A (mean)", justify="right")
+    table.add_column("Run B (mean)", justify="right")
+    table.add_column("Change (%)", justify="right")
+    table.add_column("p-value", justify="right")
+    table.add_column("Effect size", justify="right")
+    table.add_column("Significant")
+    table.add_column("Winner")
+
+    for comparison in outcome.comparisons:
+        sig_style = "green" if comparison.is_significant else "dim"
+        winner = comparison.winner[:8] if comparison.winner else "-"
+        table.add_row(
+            comparison.metric,
+            f"{comparison.mean_a:.3f}",
+            f"{comparison.mean_b:.3f}",
+            f"{comparison.diff_percent:+.1f}%",
+            f"{comparison.p_value:.4f}",
+            f"{comparison.effect_size:.2f} ({comparison.effect_level.value})",
+            f"[{sig_style}]{'Yes' if comparison.is_significant else 'No'}[/{sig_style}]",
+            winner,
+        )
+
+    console.print("\n[bold]Run Comparison Results[/bold]")
+    console.print(table)
+    console.print()
+
+    scorecard = build_comparison_scorecard(
+        outcome.pipeline_result.get_node_result("run_metric_comparison").output
+        if outcome.pipeline_result.get_node_result("run_metric_comparison")
+        else {}
+    )
+    if not scorecard:
+        return
+
+    summary_table = Table(title="Comparison Scorecard", show_header=True, header_style="bold cyan")
+    summary_table.add_column("Metric")
+    summary_table.add_column("A", justify="right")
+    summary_table.add_column("B", justify="right")
+    summary_table.add_column("Diff", justify="right")
+    summary_table.add_column("p-value", justify="right")
+    summary_table.add_column("Effect size", justify="right")
+    summary_table.add_column("Significant")
+
+    for row in scorecard:
+        effect_size = row.get("effect_size")
+        effect_level = row.get("effect_level")
+        effect_text = (
+            f"{effect_size:.2f} ({effect_level})"
+            if isinstance(effect_size, (float, int)) and effect_level
+            else "-"
+        )
+        summary_table.add_row(
+            str(row.get("metric") or "-"),
+            _format_float(row.get("mean_a")),
+            _format_float(row.get("mean_b")),
+            _format_float(row.get("diff"), signed=True),
+            _format_float(row.get("p_value")),
+            effect_text,
+            "Yes" if row.get("is_significant") else "No",
+        )
+
+    console.print(summary_table)
+    console.print()
+
+
+def _format_float(value: float | None, *, signed: bool = False) -> str:
+    if value is None:
+        return "-"
+    try:
+        if signed:
+            return f"{float(value):+.3f}"
+        return f"{float(value):.3f}"
+    except (TypeError, ValueError):
+        return "-"
+
+
+__all__ = ["register_compare_commands"]
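Note: the comparison wiring added above can likewise be sketched directly, following the same construction order as the compare command. The LLM adapter is left as None here, which is also the path the command falls back to when adapter initialization fails; the DB path and run IDs are placeholders.

# Illustrative only: same wiring as the compare command above, minus the CLI layer.
from pathlib import Path

from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import ComparisonPipelineAdapter
from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
from evalvault.domain.services.run_comparison_service import (
    RunComparisonError,
    RunComparisonRequest,
    RunComparisonService,
)

storage = SQLiteStorageAdapter(db_path=Path("evalvault.db"))  # placeholder path
pipeline_service = build_analysis_pipeline_service(storage=storage, llm_adapter=None)
service = RunComparisonService(
    storage=storage,
    analysis_port=StatisticalAnalysisAdapter(),
    pipeline_port=ComparisonPipelineAdapter(pipeline_service),
)

request = RunComparisonRequest(
    run_id_a="RUN_A",  # placeholder run IDs
    run_id_b="RUN_B",
    metrics=None,
    test_type="t-test",
    parallel=False,
    concurrency=None,
)
try:
    outcome = service.compare_runs(request)
except RunComparisonError as exc:
    print(exc)
else:
    for comparison in outcome.comparisons:
        print(comparison.metric, comparison.p_value, comparison.is_significant)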