evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/artifacts.py (new file)
@@ -0,0 +1,107 @@
+ from __future__ import annotations
+
+ import json
+ import logging
+ from pathlib import Path
+
+ import typer
+ from rich.console import Console
+
+ from evalvault.adapters.inbound.cli.utils.console import print_cli_error
+ from evalvault.adapters.inbound.cli.utils.validators import validate_choice
+ from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
+ from evalvault.domain.services.artifact_lint_service import ArtifactLintService
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_artifacts_app(console: Console) -> typer.Typer:
+     artifacts_app = typer.Typer(name="artifacts", help="Artifact utilities.")
+
+     @artifacts_app.command("lint")
+     def lint(
+         artifacts_dir: Path = typer.Argument(..., help="Artifacts directory."),
+         strict: bool = typer.Option(False, "--strict", help="Fail on missing files."),
+         output_format: str = typer.Option(
+             "json",
+             "--format",
+             "-f",
+             help="Output format (json).",
+         ),
+         output: Path | None = typer.Option(
+             None,
+             "--output",
+             "-o",
+             help="Output file path for lint result.",
+         ),
+         parallel: bool = typer.Option(
+             True,
+             "--parallel/--no-parallel",
+             help="Enable parallel validation (placeholder).",
+         ),
+         concurrency: int = typer.Option(
+             8,
+             "--concurrency",
+             min=1,
+             help="Parallel validation concurrency (placeholder).",
+         ),
+     ) -> None:
+         validate_choice(output_format, ["json"], console, value_label="format")
+
+         logger.info("Artifacts lint command started: %s", artifacts_dir)
+         fs_adapter = LocalArtifactFileSystemAdapter()
+         service = ArtifactLintService(fs_adapter)
+         summary = service.lint(artifacts_dir, strict=strict)
+
+         payload = _build_payload(summary, parallel=parallel, concurrency=concurrency)
+         if output:
+             output.parent.mkdir(parents=True, exist_ok=True)
+             output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+             console.print(f"[green]Lint report saved:[/green] {output}")
+         else:
+             console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+         if summary.status == "error":
+             logger.error("Artifacts lint command failed: %s", artifacts_dir)
+             print_cli_error(console, "Artifact lint failed", details=str(artifacts_dir))
+             raise typer.Exit(1)
+
+         logger.info("Artifacts lint command finished: %s", artifacts_dir)
+
+     return artifacts_app
+
+
+ def _build_payload(summary, *, parallel: bool, concurrency: int) -> dict[str, object]:
+     issues = [
+         {
+             "level": issue.level,
+             "code": issue.code,
+             "message": issue.message,
+             "path": issue.path,
+         }
+         for issue in summary.issues
+     ]
+     error_count = sum(1 for issue in summary.issues if issue.level == "error")
+     warning_count = sum(1 for issue in summary.issues if issue.level == "warning")
+     return {
+         "command": "artifacts.lint",
+         "version": 1,
+         "status": summary.status,
+         "started_at": summary.started_at.isoformat(),
+         "finished_at": summary.finished_at.isoformat(),
+         "duration_ms": summary.duration_ms,
+         "artifacts": {
+             "dir": str(summary.artifacts_dir),
+             "index": str(summary.index_path),
+         },
+         "data": {
+             "strict": summary.strict,
+             "parallel": parallel,
+             "concurrency": concurrency,
+             "issue_counts": {
+                 "error": error_count,
+                 "warning": warning_count,
+             },
+             "issues": issues,
+         },
+     }
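
The module above exposes its commands through create_artifacts_app() rather than registering them on a shared app, so the parent CLI presumably mounts it as a sub-app (most likely in evalvault/adapters/inbound/cli/commands/__init__.py, which also changed in this diff). A minimal wiring sketch, illustrative only and not part of the package source:

# Hypothetical wiring sketch -- not part of the package source.
import typer
from rich.console import Console

from evalvault.adapters.inbound.cli.commands.artifacts import create_artifacts_app

console = Console()
cli = typer.Typer()
cli.add_typer(create_artifacts_app(console), name="artifacts")

if __name__ == "__main__":
    # Expected invocation shape: <prog> artifacts lint ./artifacts --strict -o lint.json
    cli()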
evalvault/adapters/inbound/cli/commands/calibrate_judge.py (new file; option help and console strings translated from Korean)
@@ -0,0 +1,283 @@
+ from __future__ import annotations
+
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ import typer
+ from rich.console import Console
+ from rich.table import Table
+
+ from evalvault.adapters.inbound.cli.utils.analysis_io import resolve_artifact_dir, write_json
+ from evalvault.adapters.inbound.cli.utils.console import print_cli_error, progress_spinner
+ from evalvault.adapters.inbound.cli.utils.options import db_option
+ from evalvault.adapters.inbound.cli.utils.validators import parse_csv_option, validate_choice
+ from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+ from evalvault.config.settings import Settings
+ from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+
+ _console = Console()
+
+ _ALLOWED_LABELS = ["feedback", "gold", "hybrid"]
+ _ALLOWED_METHODS = ["platt", "isotonic", "temperature", "none"]
+
+
+ def register_calibrate_judge_commands(app: typer.Typer, console: Console) -> None:
+     global _console
+     _console = console
+
+     @app.command(name="calibrate-judge")
+     def calibrate_judge(
+         run_id: str = typer.Argument(..., help="Run ID to calibrate"),
+         labels_source: str = typer.Option(
+             "feedback",
+             "--labels-source",
+             help="Label source (feedback|gold|hybrid)",
+         ),
+         method: str = typer.Option(
+             "isotonic",
+             "--method",
+             help="Calibration method (platt|isotonic|temperature|none)",
+         ),
+         metrics: str | None = typer.Option(
+             None,
+             "--metric",
+             "-m",
+             help="Metrics to calibrate (comma-separated; defaults to all run metrics)",
+         ),
+         holdout_ratio: float = typer.Option(
+             0.2,
+             "--holdout-ratio",
+             help="Holdout ratio for validation",
+         ),
+         seed: int = typer.Option(42, "--seed", help="Random seed for sample splitting"),
+         write_back: bool = typer.Option(
+             False,
+             "--write-back",
+             help="Write calibration results back to run metadata",
+         ),
+         output: Path | None = typer.Option(
+             None,
+             "--output",
+             "-o",
+             help="JSON output file path",
+         ),
+         artifacts_dir: Path | None = typer.Option(
+             None,
+             "--artifacts-dir",
+             help="Artifacts output directory",
+         ),
+         parallel: bool = typer.Option(
+             False,
+             "--parallel/--no-parallel",
+             help="Enable parallel execution",
+         ),
+         concurrency: int = typer.Option(8, "--concurrency", help="Concurrency level"),
+         db_path: Path | None = db_option(help_text="Database path"),
+     ) -> None:
+         resolved_db_path = db_path or Settings().evalvault_db_path
+         if resolved_db_path is None:
+             print_cli_error(_console, "Database path is not configured.")
+             raise typer.Exit(1)
+
+         labels_source = labels_source.strip().lower()
+         method = method.strip().lower()
+         validate_choice(labels_source, _ALLOWED_LABELS, _console, value_label="labels-source")
+         validate_choice(method, _ALLOWED_METHODS, _console, value_label="method")
+
+         metric_list = parse_csv_option(metrics)
+         if holdout_ratio <= 0 or holdout_ratio >= 1:
+             print_cli_error(_console, "--holdout-ratio must be between 0 and 1.")
+             raise typer.Exit(1)
+         if seed < 0:
+             print_cli_error(_console, "--seed must be 0 or greater.")
+             raise typer.Exit(1)
+         if concurrency <= 0:
+             print_cli_error(_console, "--concurrency must be 1 or greater.")
+             raise typer.Exit(1)
+
+         storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+         try:
+             run = storage.get_run(run_id)
+         except KeyError:
+             print_cli_error(_console, f"Run not found: {run_id}")
+             raise typer.Exit(1)
+
+         feedbacks = storage.list_feedback(run_id)
+         if labels_source in {"feedback", "hybrid"} and not feedbacks:
+             print_cli_error(_console, "No feedback labels found.")
+             raise typer.Exit(1)
+
+         resolved_metrics = metric_list or list(run.metrics_evaluated)
+         if not resolved_metrics:
+             print_cli_error(_console, "No metrics to calibrate.")
+             raise typer.Exit(1)
+
+         prefix = f"judge_calibration_{run_id}"
+         output_path = (output or Path("reports/calibration") / f"{prefix}.json").expanduser()
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         resolved_artifacts_dir = resolve_artifact_dir(
+             base_dir=artifacts_dir,
+             output_path=output_path,
+             report_path=output_path,
+             prefix=prefix,
+         )
+
+         service = JudgeCalibrationService()
+         reporter = JudgeCalibrationReporter()
+         started_at = datetime.now(UTC)
+
+         with progress_spinner(_console, "Running judge calibration..."):
+             result = service.calibrate(
+                 run,
+                 feedbacks,
+                 labels_source=labels_source,
+                 method=method,
+                 metrics=resolved_metrics,
+                 holdout_ratio=holdout_ratio,
+                 seed=seed,
+                 parallel=parallel,
+                 concurrency=concurrency,
+             )
+
+         artifacts_index = reporter.write_artifacts(
+             result=result,
+             artifacts_dir=resolved_artifacts_dir,
+         )
+         finished_at = datetime.now(UTC)
+         payload = _build_envelope(
+             result,
+             artifacts_index,
+             started_at=started_at,
+             finished_at=finished_at,
+         )
+         write_json(output_path, payload)
+
+         _display_summary(result)
+         _console.print(f"[green]JSON saved:[/green] {output_path}")
+         _console.print(
+             f"[green]Artifacts saved:[/green] {artifacts_index['dir']} (index: {artifacts_index['index']})"
+         )
+
+         if write_back:
+             metadata = run.tracker_metadata or {}
+             metadata["judge_calibration"] = reporter.render_json(result)
+             metadata["judge_calibration"]["artifacts"] = artifacts_index
+             metadata["judge_calibration"]["output"] = str(output_path)
+             storage.update_run_metadata(run_id, metadata)
+             _console.print("[green]Calibration results written to run metadata.[/green]")
+
+         if result.summary.gate_passed is False:
+             raise typer.Exit(2)
+
+         return None
+
+
+ def _build_envelope(
+     result,
+     artifacts_index: dict[str, str],
+     *,
+     started_at: datetime,
+     finished_at: datetime,
+ ) -> dict[str, object]:
+     duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+     status = "ok" if result.summary.gate_passed else "degraded"
+     return {
+         "command": "calibrate-judge",
+         "version": 1,
+         "status": status,
+         "started_at": started_at.astimezone(UTC).isoformat(),
+         "finished_at": finished_at.astimezone(UTC).isoformat(),
+         "duration_ms": duration_ms,
+         "artifacts": artifacts_index,
+         "data": {
+             "summary": _serialize_summary(result.summary),
+             "metrics": [_serialize_metric(metric) for metric in result.metrics],
+             "case_results": {
+                 metric: [_serialize_case(case) for case in cases]
+                 for metric, cases in result.case_results.items()
+             },
+             "warnings": list(result.warnings),
+         },
+     }
+
+
+ def _serialize_summary(summary) -> dict[str, object]:
+     return {
+         "run_id": summary.run_id,
+         "labels_source": summary.labels_source,
+         "method": summary.method,
+         "metrics": list(summary.metrics),
+         "holdout_ratio": summary.holdout_ratio,
+         "seed": summary.seed,
+         "total_labels": summary.total_labels,
+         "total_samples": summary.total_samples,
+         "gate_passed": summary.gate_passed,
+         "gate_threshold": summary.gate_threshold,
+         "notes": list(summary.notes),
+     }
+
+
+ def _serialize_metric(metric) -> dict[str, object]:
+     return {
+         "metric": metric.metric,
+         "method": metric.method,
+         "sample_count": metric.sample_count,
+         "label_count": metric.label_count,
+         "mae": metric.mae,
+         "pearson": metric.pearson,
+         "spearman": metric.spearman,
+         "temperature": metric.temperature,
+         "parameters": dict(metric.parameters),
+         "gate_passed": metric.gate_passed,
+         "warning": metric.warning,
+     }
+
+
+ def _serialize_case(case) -> dict[str, object]:
+     return {
+         "test_case_id": case.test_case_id,
+         "raw_score": case.raw_score,
+         "calibrated_score": case.calibrated_score,
+         "label": case.label,
+         "label_source": case.label_source,
+     }
+
+
+ def _display_summary(result) -> None:
+     summary_table = Table(title="Judge Calibration Summary", show_header=True, header_style="bold cyan")
+     summary_table.add_column("Metric")
+     summary_table.add_column("Samples", justify="right")
+     summary_table.add_column("Labels", justify="right")
+     summary_table.add_column("MAE", justify="right")
+     summary_table.add_column("Pearson", justify="right")
+     summary_table.add_column("Spearman", justify="right")
+     summary_table.add_column("Gate", justify="right")
+
+     for metric in result.metrics:
+         summary_table.add_row(
+             metric.metric,
+             str(metric.sample_count),
+             str(metric.label_count),
+             _format_metric(metric.mae),
+             _format_metric(metric.pearson),
+             _format_metric(metric.spearman),
+             "PASS" if metric.gate_passed else "FAIL",
+         )
+
+     _console.print(summary_table)
+     _console.print(
+         f"Label source: {result.summary.labels_source} | "
+         f"Method: {result.summary.method} | "
+         f"Gate: {'PASS' if result.summary.gate_passed else 'FAIL'}"
+     )
+
+     if result.warnings:
+         for warning in result.warnings:
+             _console.print(f"[yellow]Warning:[/yellow] {warning}")
+
+
+ def _format_metric(value: float | None) -> str:
+     if value is None:
+         return "-"
+     return f"{value:.3f}"
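
For orientation, the JSON envelope written by write_json() above follows the shape below, reconstructed from _build_envelope() and the _serialize_* helpers; every value is a placeholder, not real output. As the code shows, the command exits 1 on input errors and 2 when the calibration gate does not pass.

# Schematic of the calibrate-judge envelope (placeholder values only).
envelope = {
    "command": "calibrate-judge",
    "version": 1,
    "status": "ok",  # "degraded" when the gate does not pass
    "started_at": "2025-01-01T00:00:00+00:00",
    "finished_at": "2025-01-01T00:00:05+00:00",
    "duration_ms": 5000,
    "artifacts": {"dir": "...", "index": "..."},  # whatever write_artifacts() returns
    "data": {
        "summary": {
            "run_id": "...", "labels_source": "feedback", "method": "isotonic",
            "metrics": [], "holdout_ratio": 0.2, "seed": 42,
            "total_labels": 0, "total_samples": 0,
            "gate_passed": True, "gate_threshold": None, "notes": [],
        },
        "metrics": [],       # one _serialize_metric() dict per calibrated metric
        "case_results": {},  # metric name -> list of _serialize_case() dicts
        "warnings": [],
    },
}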
evalvault/adapters/inbound/cli/commands/compare.py (new file; option help and console strings translated from Korean)
@@ -0,0 +1,290 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+
+ import typer
+ from rich.console import Console
+ from rich.table import Table
+
+ from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import (
+     ComparisonPipelineAdapter,
+ )
+ from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
+ from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+ from evalvault.config.settings import Settings, apply_profile
+ from evalvault.domain.services.run_comparison_service import (
+     RunComparisonError,
+     RunComparisonRequest,
+     RunComparisonService,
+ )
+
+ from ..utils.analysis_io import (
+     build_comparison_scorecard,
+     resolve_artifact_dir,
+     resolve_output_paths,
+     serialize_pipeline_result,
+     write_json,
+     write_pipeline_artifacts,
+ )
+ from ..utils.console import print_cli_error
+ from ..utils.options import db_option, profile_option
+ from ..utils.validators import parse_csv_option, validate_choice
+
+
+ def _coerce_test_type(value: str) -> str:
+     if value == "t-test":
+         return "t-test"
+     return "mann-whitney"
+
+
+ def register_compare_commands(app: typer.Typer, console: Console) -> None:
+     @app.command(name="compare")
+     def compare(
+         run_id_a: str = typer.Argument(..., help="Baseline run ID"),
+         run_id_b: str = typer.Argument(..., help="Comparison run ID"),
+         metrics: str | None = typer.Option(
+             None,
+             "--metrics",
+             "-m",
+             help="Metrics to compare (comma-separated)",
+         ),
+         test: str = typer.Option(
+             "t-test",
+             "--test",
+             "-t",
+             help="Statistical test (t-test, mann-whitney)",
+         ),
+         output_format: str = typer.Option(
+             "table",
+             "--format",
+             "-f",
+             help="Output format (table, json)",
+         ),
+         output: Path | None = typer.Option(None, "--output", "-o", help="JSON output file"),
+         report: Path | None = typer.Option(None, "--report", help="Report output file"),
+         output_dir: Path | None = typer.Option(
+             None,
+             "--output-dir",
+             help="Base output directory",
+         ),
+         artifacts_dir: Path | None = typer.Option(
+             None,
+             "--artifacts-dir",
+             help="Artifacts output directory",
+         ),
+         parallel: bool = typer.Option(
+             False,
+             "--parallel/--no-parallel",
+             help="Run the analysis pipeline in parallel",
+         ),
+         concurrency: int | None = typer.Option(
+             None,
+             "--concurrency",
+             min=1,
+             help="Concurrency limit for parallel execution",
+         ),
+         db_path: Path | None = db_option(help_text="Database path"),
+         profile: str | None = profile_option(help_text="LLM profile"),
+     ) -> None:
+         validate_choice(test, ["t-test", "mann-whitney"], console, value_label="test")
+         validate_choice(output_format, ["table", "json"], console, value_label="format")
+
+         resolved_db_path = db_path or Settings().evalvault_db_path
+         if resolved_db_path is None:
+             print_cli_error(console, "Database path is not configured.")
+             raise typer.Exit(1)
+
+         metric_list = parse_csv_option(metrics)
+         metric_list = metric_list or None
+
+         storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+         analysis_adapter = StatisticalAnalysisAdapter()
+
+         settings = Settings()
+         profile_name = profile or settings.evalvault_profile
+         if profile_name:
+             settings = apply_profile(settings, profile_name)
+
+         llm_adapter = None
+         try:
+             from evalvault.adapters.outbound.llm import get_llm_adapter
+
+             llm_adapter = get_llm_adapter(settings)
+         except Exception as exc:
+             console.print(f"[yellow]Warning: failed to initialize LLM adapter ({exc})[/yellow]")
+
+         pipeline_service = build_analysis_pipeline_service(
+             storage=storage,
+             llm_adapter=llm_adapter,
+         )
+         pipeline_adapter = ComparisonPipelineAdapter(pipeline_service)
+
+         service = RunComparisonService(
+             storage=storage,
+             analysis_port=analysis_adapter,
+             pipeline_port=pipeline_adapter,
+         )
+
+         request = RunComparisonRequest(
+             run_id_a=run_id_a,
+             run_id_b=run_id_b,
+             metrics=metric_list,
+             test_type=_coerce_test_type(test),
+             parallel=parallel,
+             concurrency=concurrency,
+         )
+
+         try:
+             outcome = service.compare_runs(request)
+         except RunComparisonError as exc:
+             print_cli_error(console, str(exc))
+             raise typer.Exit(exc.exit_code) from exc
+
+         comparison_prefix = f"comparison_{run_id_a[:8]}_{run_id_b[:8]}"
+         resolved_base_dir = output_dir or Path("reports/comparison")
+         output_path, report_path = resolve_output_paths(
+             base_dir=resolved_base_dir,
+             output_path=output,
+             report_path=report,
+             prefix=comparison_prefix,
+         )
+         if artifacts_dir is not None:
+             resolved_artifacts_dir = artifacts_dir
+             resolved_artifacts_dir.mkdir(parents=True, exist_ok=True)
+         else:
+             resolved_artifacts_dir = resolve_artifact_dir(
+                 base_dir=output_dir,
+                 output_path=output_path,
+                 report_path=report_path,
+                 prefix=comparison_prefix,
+             )
+
+         artifact_index = write_pipeline_artifacts(
+             outcome.pipeline_result,
+             artifacts_dir=resolved_artifacts_dir,
+         )
+
+         payload = _build_envelope(outcome, artifact_index)
+         payload["run_ids"] = list(outcome.run_ids)
+         payload["data"] = serialize_pipeline_result(outcome.pipeline_result)
+         payload["data"]["run_ids"] = list(outcome.run_ids)
+         payload["data"]["artifacts"] = artifact_index
+         write_json(output_path, payload)
+         report_path.write_text(outcome.report_text, encoding="utf-8")
+
+         if output_format == "table":
+             _render_table(console, outcome)
+         else:
+             console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+         if outcome.is_degraded:
+             console.print("[yellow]Parts of the report may be missing.[/yellow]")
+
+         console.print(f"[green]Comparison result saved:[/green] {output_path}")
+         console.print(f"[green]Comparison report saved:[/green] {report_path}")
+         console.print(
+             "[green]Comparison artifacts saved:[/green] "
+             f"{artifact_index['dir']} (index: {artifact_index['index']})"
+         )
+
+         if outcome.is_degraded:
+             raise typer.Exit(2)
+
+
+ def _build_envelope(outcome, artifact_index: dict[str, str]) -> dict[str, object]:
+     return {
+         "command": "compare",
+         "version": 1,
+         "status": outcome.status,
+         "started_at": outcome.started_at.isoformat(),
+         "finished_at": outcome.finished_at.isoformat(),
+         "duration_ms": outcome.duration_ms,
+         "artifacts": {
+             "dir": artifact_index.get("dir"),
+             "index": artifact_index.get("index"),
+         },
+     }
+
+
+ def _render_table(console: Console, outcome) -> None:
+     table = Table(title="Statistical Comparison", show_header=True, header_style="bold cyan")
+     table.add_column("Metric")
+     table.add_column("Run A (mean)", justify="right")
+     table.add_column("Run B (mean)", justify="right")
+     table.add_column("Change (%)", justify="right")
+     table.add_column("p-value", justify="right")
+     table.add_column("Effect size", justify="right")
+     table.add_column("Significant")
+     table.add_column("Winner")
+
+     for comparison in outcome.comparisons:
+         sig_style = "green" if comparison.is_significant else "dim"
+         winner = comparison.winner[:8] if comparison.winner else "-"
+         table.add_row(
+             comparison.metric,
+             f"{comparison.mean_a:.3f}",
+             f"{comparison.mean_b:.3f}",
+             f"{comparison.diff_percent:+.1f}%",
+             f"{comparison.p_value:.4f}",
+             f"{comparison.effect_size:.2f} ({comparison.effect_level.value})",
+             f"[{sig_style}]{'Yes' if comparison.is_significant else 'No'}[/{sig_style}]",
+             winner,
+         )
+
+     console.print("\n[bold]Run Comparison Results[/bold]")
+     console.print(table)
+     console.print()
+
+     scorecard = build_comparison_scorecard(
+         outcome.pipeline_result.get_node_result("run_metric_comparison").output
+         if outcome.pipeline_result.get_node_result("run_metric_comparison")
+         else {}
+     )
+     if not scorecard:
+         return
+
+     summary_table = Table(title="Comparison Scorecard", show_header=True, header_style="bold cyan")
+     summary_table.add_column("Metric")
+     summary_table.add_column("A", justify="right")
+     summary_table.add_column("B", justify="right")
+     summary_table.add_column("Diff", justify="right")
+     summary_table.add_column("p-value", justify="right")
+     summary_table.add_column("Effect size", justify="right")
+     summary_table.add_column("Significant")
+
+     for row in scorecard:
+         effect_size = row.get("effect_size")
+         effect_level = row.get("effect_level")
+         effect_text = (
+             f"{effect_size:.2f} ({effect_level})"
+             if isinstance(effect_size, (float, int)) and effect_level
+             else "-"
+         )
+         summary_table.add_row(
+             str(row.get("metric") or "-"),
+             _format_float(row.get("mean_a")),
+             _format_float(row.get("mean_b")),
+             _format_float(row.get("diff"), signed=True),
+             _format_float(row.get("p_value")),
+             effect_text,
+             "Yes" if row.get("is_significant") else "No",
+         )
+
+     console.print(summary_table)
+     console.print()
+
+
+ def _format_float(value: float | None, *, signed: bool = False) -> str:
+     if value is None:
+         return "-"
+     try:
+         if signed:
+             return f"{float(value):+.3f}"
+         return f"{float(value):.3f}"
+     except (TypeError, ValueError):
+         return "-"
+
+
+ __all__ = ["register_compare_commands"]
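
The compare command is a thin shell around RunComparisonService. A condensed sketch of the same flow driven directly from Python, assuming a local SQLite database path and skipping the LLM-backed pipeline (llm_adapter=None, the same fallback the command uses when adapter initialization fails); the path and run IDs are placeholders:

# Hypothetical programmatic use mirroring the CLI body above; path and run IDs are placeholders.
from pathlib import Path

from evalvault.adapters.outbound.analysis.comparison_pipeline_adapter import ComparisonPipelineAdapter
from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
from evalvault.domain.services.run_comparison_service import RunComparisonRequest, RunComparisonService

storage = SQLiteStorageAdapter(db_path=Path("evalvault.db"))
pipeline_adapter = ComparisonPipelineAdapter(
    build_analysis_pipeline_service(storage=storage, llm_adapter=None)
)
service = RunComparisonService(
    storage=storage,
    analysis_port=StatisticalAnalysisAdapter(),
    pipeline_port=pipeline_adapter,
)
outcome = service.compare_runs(
    RunComparisonRequest(
        run_id_a="run-a",
        run_id_b="run-b",
        metrics=None,  # None defers metric selection to the service, as the CLI does without --metrics
        test_type="t-test",
        parallel=False,
        concurrency=None,
    )
)
print(outcome.status, outcome.is_degraded)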