evalvault 1.64.0-py3-none-any.whl → 1.65.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  2. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  3. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  4. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  5. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  6. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  7. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  8. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  9. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  10. evalvault/adapters/outbound/artifact_fs.py +16 -0
  11. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  12. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  13. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  14. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  15. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  16. evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
  17. evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
  18. evalvault/domain/entities/__init__.py +10 -0
  19. evalvault/domain/entities/judge_calibration.py +50 -0
  20. evalvault/domain/entities/stage.py +11 -3
  21. evalvault/domain/services/artifact_lint_service.py +268 -0
  22. evalvault/domain/services/benchmark_runner.py +1 -6
  23. evalvault/domain/services/dataset_preprocessor.py +26 -0
  24. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  25. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  26. evalvault/domain/services/evaluator.py +2 -0
  27. evalvault/domain/services/judge_calibration_service.py +495 -0
  28. evalvault/domain/services/ops_snapshot_service.py +159 -0
  29. evalvault/domain/services/regression_gate_service.py +199 -0
  30. evalvault/domain/services/run_comparison_service.py +159 -0
  31. evalvault/domain/services/stage_event_builder.py +6 -1
  32. evalvault/domain/services/stage_metric_service.py +83 -18
  33. evalvault/ports/outbound/__init__.py +4 -0
  34. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  35. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  36. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  37. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  38. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  39. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
  40. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
  41. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  42. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  43. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/__init__.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Protocol
+from typing import Any
 
 import typer
 from rich.console import Console
@@ -12,8 +12,11 @@ from rich.console import Console
 from .agent import register_agent_commands
 from .analyze import register_analyze_commands
 from .api import register_api_command
+from .artifacts import create_artifacts_app
 from .benchmark import create_benchmark_app
 from .calibrate import register_calibrate_commands
+from .calibrate_judge import register_calibrate_judge_commands
+from .compare import register_compare_commands
 from .config import register_config_commands
 from .debug import create_debug_app
 from .domain import create_domain_app
@@ -25,19 +28,17 @@ from .init import register_init_command
 from .kg import create_kg_app
 from .langfuse import register_langfuse_commands
 from .method import create_method_app
+from .ops import create_ops_app
 from .phoenix import create_phoenix_app
 from .pipeline import register_pipeline_commands
+from .profile_difficulty import register_profile_difficulty_commands
 from .prompts import create_prompts_app
+from .regress import register_regress_commands
 from .run import register_run_commands
 from .stage import create_stage_app
 
 CommandFactory = Callable[[Console], typer.Typer]
-
-
-class CommandRegistrar(Protocol):
-    """Callable protocol for Typer command registrars."""
-
-    def __call__(self, app: typer.Typer, console: Console, **kwargs: Any) -> None: ...
+CommandRegistrar = Callable[..., Any]
 
 
 @dataclass(frozen=True)
@@ -61,10 +62,14 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
     CommandModule(register_run_commands, needs_metrics=True),
     CommandModule(register_pipeline_commands),
     CommandModule(register_history_commands),
+    CommandModule(register_compare_commands),
     CommandModule(register_analyze_commands),
     CommandModule(register_calibrate_commands),
+    CommandModule(register_calibrate_judge_commands),
     CommandModule(register_generate_commands),
     CommandModule(register_gate_commands),
+    CommandModule(register_profile_difficulty_commands, needs_metrics=True),
+    CommandModule(register_regress_commands),
     CommandModule(register_agent_commands),
     CommandModule(register_experiment_commands),
     CommandModule(register_config_commands),
@@ -78,9 +83,11 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
     SubAppModule("domain", create_domain_app),
     SubAppModule("benchmark", create_benchmark_app),
     SubAppModule("method", create_method_app),
+    SubAppModule("ops", create_ops_app),
     SubAppModule("phoenix", create_phoenix_app),
     SubAppModule("prompts", create_prompts_app),
     SubAppModule("stage", create_stage_app),
+    SubAppModule("artifacts", create_artifacts_app),
     SubAppModule("debug", create_debug_app),
 )
 
evalvault/adapters/inbound/cli/commands/artifacts.py
@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+import typer
+from rich.console import Console
+
+from evalvault.adapters.inbound.cli.utils.console import print_cli_error
+from evalvault.adapters.inbound.cli.utils.validators import validate_choice
+from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
+from evalvault.domain.services.artifact_lint_service import ArtifactLintService
+
+logger = logging.getLogger(__name__)
+
+
+def create_artifacts_app(console: Console) -> typer.Typer:
+    artifacts_app = typer.Typer(name="artifacts", help="Artifact utilities.")
+
+    @artifacts_app.command("lint")
+    def lint(
+        artifacts_dir: Path = typer.Argument(..., help="Artifacts directory."),
+        strict: bool = typer.Option(False, "--strict", help="Fail on missing files."),
+        output_format: str = typer.Option(
+            "json",
+            "--format",
+            "-f",
+            help="Output format (json).",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Output file path for lint result.",
+        ),
+        parallel: bool = typer.Option(
+            True,
+            "--parallel/--no-parallel",
+            help="Enable parallel validation (placeholder).",
+        ),
+        concurrency: int = typer.Option(
+            8,
+            "--concurrency",
+            min=1,
+            help="Parallel validation concurrency (placeholder).",
+        ),
+    ) -> None:
+        validate_choice(output_format, ["json"], console, value_label="format")
+
+        logger.info("Artifacts lint command started: %s", artifacts_dir)
+        fs_adapter = LocalArtifactFileSystemAdapter()
+        service = ArtifactLintService(fs_adapter)
+        summary = service.lint(artifacts_dir, strict=strict)
+
+        payload = _build_payload(summary, parallel=parallel, concurrency=concurrency)
+        if output:
+            output.parent.mkdir(parents=True, exist_ok=True)
+            output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+            console.print(f"[green]Lint report saved:[/green] {output}")
+        else:
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+        if summary.status == "error":
+            logger.error("Artifacts lint command failed: %s", artifacts_dir)
+            print_cli_error(console, "Artifact lint failed", details=str(artifacts_dir))
+            raise typer.Exit(1)
+
+        logger.info("Artifacts lint command finished: %s", artifacts_dir)
+
+    return artifacts_app
+
+
+def _build_payload(summary, *, parallel: bool, concurrency: int) -> dict[str, object]:
+    issues = [
+        {
+            "level": issue.level,
+            "code": issue.code,
+            "message": issue.message,
+            "path": issue.path,
+        }
+        for issue in summary.issues
+    ]
+    error_count = sum(1 for issue in summary.issues if issue.level == "error")
+    warning_count = sum(1 for issue in summary.issues if issue.level == "warning")
+    return {
+        "command": "artifacts.lint",
+        "version": 1,
+        "status": summary.status,
+        "started_at": summary.started_at.isoformat(),
+        "finished_at": summary.finished_at.isoformat(),
+        "duration_ms": summary.duration_ms,
+        "artifacts": {
+            "dir": str(summary.artifacts_dir),
+            "index": str(summary.index_path),
+        },
+        "data": {
+            "strict": summary.strict,
+            "parallel": parallel,
+            "concurrency": concurrency,
+            "issue_counts": {
+                "error": error_count,
+                "warning": warning_count,
+            },
+            "issues": issues,
+        },
+    }
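
The new artifacts.py module above wraps ArtifactLintService in a Typer subcommand and serializes the lint summary as a JSON payload. For reference, a minimal sketch (not shipped in the package) of calling the same service directly from Python; the "artifacts/" directory path is a placeholder.

from pathlib import Path

from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
from evalvault.domain.services.artifact_lint_service import ArtifactLintService

# Lint a local artifacts directory the same way the "artifacts lint" command does.
service = ArtifactLintService(LocalArtifactFileSystemAdapter())
summary = service.lint(Path("artifacts/"), strict=True)  # placeholder path

print(summary.status)  # an "error" status makes the CLI exit with code 1
for issue in summary.issues:
    print(issue.level, issue.code, issue.message, issue.path)
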
evalvault/adapters/inbound/cli/commands/calibrate_judge.py
@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import resolve_artifact_dir, write_json
+from evalvault.adapters.inbound.cli.utils.console import print_cli_error, progress_spinner
+from evalvault.adapters.inbound.cli.utils.options import db_option
+from evalvault.adapters.inbound.cli.utils.validators import parse_csv_option, validate_choice
+from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+
+_console = Console()
+
+_ALLOWED_LABELS = ["feedback", "gold", "hybrid"]
+_ALLOWED_METHODS = ["platt", "isotonic", "temperature", "none"]
+
+
+def register_calibrate_judge_commands(app: typer.Typer, console: Console) -> None:
+    global _console
+    _console = console
+
+    @app.command(name="calibrate-judge")
+    def calibrate_judge(
+        run_id: str = typer.Argument(..., help="Run ID to calibrate"),
+        labels_source: str = typer.Option(
+            "feedback",
+            "--labels-source",
+            help="Label source (feedback|gold|hybrid)",
+        ),
+        method: str = typer.Option(
+            "isotonic",
+            "--method",
+            help="Calibration method (platt|isotonic|temperature|none)",
+        ),
+        metrics: str | None = typer.Option(
+            None,
+            "--metric",
+            "-m",
+            help="Metrics to calibrate (comma-separated; defaults to all run metrics)",
+        ),
+        holdout_ratio: float = typer.Option(
+            0.2,
+            "--holdout-ratio",
+            help="Holdout ratio for validation",
+        ),
+        seed: int = typer.Option(42, "--seed", help="Random seed for sample splitting"),
+        write_back: bool = typer.Option(
+            False,
+            "--write-back",
+            help="Store calibration results in run metadata",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Path to the JSON result file",
+        ),
+        artifacts_dir: Path | None = typer.Option(
+            None,
+            "--artifacts-dir",
+            help="Artifact output directory",
+        ),
+        parallel: bool = typer.Option(
+            False,
+            "--parallel/--no-parallel",
+            help="Enable parallel execution",
+        ),
+        concurrency: int = typer.Option(8, "--concurrency", help="Concurrency level"),
+        db_path: Path | None = db_option(help_text="Database path"),
+    ) -> None:
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            print_cli_error(_console, "No database path is configured.")
+            raise typer.Exit(1)
+
+        labels_source = labels_source.strip().lower()
+        method = method.strip().lower()
+        validate_choice(labels_source, _ALLOWED_LABELS, _console, value_label="labels-source")
+        validate_choice(method, _ALLOWED_METHODS, _console, value_label="method")
+
+        metric_list = parse_csv_option(metrics)
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            print_cli_error(_console, "--holdout-ratio must be between 0 and 1.")
+            raise typer.Exit(1)
+        if seed < 0:
+            print_cli_error(_console, "--seed must be 0 or greater.")
+            raise typer.Exit(1)
+        if concurrency <= 0:
+            print_cli_error(_console, "--concurrency must be 1 or greater.")
+            raise typer.Exit(1)
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        try:
+            run = storage.get_run(run_id)
+        except KeyError:
+            print_cli_error(_console, f"Run not found: {run_id}")
+            raise typer.Exit(1)
+
+        feedbacks = storage.list_feedback(run_id)
+        if labels_source in {"feedback", "hybrid"} and not feedbacks:
+            print_cli_error(_console, "No feedback labels found.")
+            raise typer.Exit(1)
+
+        resolved_metrics = metric_list or list(run.metrics_evaluated)
+        if not resolved_metrics:
+            print_cli_error(_console, "No metrics to calibrate.")
+            raise typer.Exit(1)
+
+        prefix = f"judge_calibration_{run_id}"
+        output_path = (output or Path("reports/calibration") / f"{prefix}.json").expanduser()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        resolved_artifacts_dir = resolve_artifact_dir(
+            base_dir=artifacts_dir,
+            output_path=output_path,
+            report_path=output_path,
+            prefix=prefix,
+        )
+
+        service = JudgeCalibrationService()
+        reporter = JudgeCalibrationReporter()
+        started_at = datetime.now(UTC)
+
+        with progress_spinner(_console, "Running judge calibration..."):
+            result = service.calibrate(
+                run,
+                feedbacks,
+                labels_source=labels_source,
+                method=method,
+                metrics=resolved_metrics,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+                parallel=parallel,
+                concurrency=concurrency,
+            )
+
+        artifacts_index = reporter.write_artifacts(
+            result=result,
+            artifacts_dir=resolved_artifacts_dir,
+        )
+        finished_at = datetime.now(UTC)
+        payload = _build_envelope(
+            result,
+            artifacts_index,
+            started_at=started_at,
+            finished_at=finished_at,
+        )
+        write_json(output_path, payload)
+
+        _display_summary(result)
+        _console.print(f"[green]JSON saved:[/green] {output_path}")
+        _console.print(
+            f"[green]Artifacts saved:[/green] {artifacts_index['dir']} (index: {artifacts_index['index']})"
+        )
+
+        if write_back:
+            metadata = run.tracker_metadata or {}
+            metadata["judge_calibration"] = reporter.render_json(result)
+            metadata["judge_calibration"]["artifacts"] = artifacts_index
+            metadata["judge_calibration"]["output"] = str(output_path)
+            storage.update_run_metadata(run_id, metadata)
+            _console.print("[green]Calibration results saved to run metadata.[/green]")
+
+        if result.summary.gate_passed is False:
+            raise typer.Exit(2)
+
+        return None
+
+
+def _build_envelope(
+    result,
+    artifacts_index: dict[str, str],
+    *,
+    started_at: datetime,
+    finished_at: datetime,
+) -> dict[str, object]:
+    duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+    status = "ok" if result.summary.gate_passed else "degraded"
+    return {
+        "command": "calibrate-judge",
+        "version": 1,
+        "status": status,
+        "started_at": started_at.astimezone(UTC).isoformat(),
+        "finished_at": finished_at.astimezone(UTC).isoformat(),
+        "duration_ms": duration_ms,
+        "artifacts": artifacts_index,
+        "data": {
+            "summary": _serialize_summary(result.summary),
+            "metrics": [_serialize_metric(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [_serialize_case(case) for case in cases]
+                for metric, cases in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        },
+    }
+
+
+def _serialize_summary(summary) -> dict[str, object]:
+    return {
+        "run_id": summary.run_id,
+        "labels_source": summary.labels_source,
+        "method": summary.method,
+        "metrics": list(summary.metrics),
+        "holdout_ratio": summary.holdout_ratio,
+        "seed": summary.seed,
+        "total_labels": summary.total_labels,
+        "total_samples": summary.total_samples,
+        "gate_passed": summary.gate_passed,
+        "gate_threshold": summary.gate_threshold,
+        "notes": list(summary.notes),
+    }
+
+
+def _serialize_metric(metric) -> dict[str, object]:
+    return {
+        "metric": metric.metric,
+        "method": metric.method,
+        "sample_count": metric.sample_count,
+        "label_count": metric.label_count,
+        "mae": metric.mae,
+        "pearson": metric.pearson,
+        "spearman": metric.spearman,
+        "temperature": metric.temperature,
+        "parameters": dict(metric.parameters),
+        "gate_passed": metric.gate_passed,
+        "warning": metric.warning,
+    }
+
+
+def _serialize_case(case) -> dict[str, object]:
+    return {
+        "test_case_id": case.test_case_id,
+        "raw_score": case.raw_score,
+        "calibrated_score": case.calibrated_score,
+        "label": case.label,
+        "label_source": case.label_source,
+    }
+
+
+def _display_summary(result) -> None:
+    summary_table = Table(title="Judge Calibration Summary", show_header=True, header_style="bold cyan")
+    summary_table.add_column("Metric")
+    summary_table.add_column("Samples", justify="right")
+    summary_table.add_column("Labels", justify="right")
+    summary_table.add_column("MAE", justify="right")
+    summary_table.add_column("Pearson", justify="right")
+    summary_table.add_column("Spearman", justify="right")
+    summary_table.add_column("Gate", justify="right")
+
+    for metric in result.metrics:
+        summary_table.add_row(
+            metric.metric,
+            str(metric.sample_count),
+            str(metric.label_count),
+            _format_metric(metric.mae),
+            _format_metric(metric.pearson),
+            _format_metric(metric.spearman),
+            "PASS" if metric.gate_passed else "FAIL",
+        )
+
+    _console.print(summary_table)
+    _console.print(
+        f"Label source: {result.summary.labels_source} | "
+        f"Method: {result.summary.method} | "
+        f"Gate: {'PASS' if result.summary.gate_passed else 'FAIL'}"
+    )
+
+    if result.warnings:
+        for warning in result.warnings:
+            _console.print(f"[yellow]Warning:[/yellow] {warning}")
+
+
+def _format_metric(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"
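
The new calibrate-judge command above is a thin CLI wrapper around JudgeCalibrationService plus the SQLite storage adapter and a reporter. Below is a minimal sketch (not part of the package) of driving the service directly with the same arguments the command passes; the database path "evalvault.db" and run ID "run-123" are placeholders.

from pathlib import Path

from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService

# Load a stored run and its feedback labels (placeholder DB path and run ID).
storage = SQLiteStorageAdapter(db_path=Path("evalvault.db"))
run = storage.get_run("run-123")
feedbacks = storage.list_feedback("run-123")

# Calibrate judge scores against the feedback labels, as the CLI does.
result = JudgeCalibrationService().calibrate(
    run,
    feedbacks,
    labels_source="feedback",             # feedback | gold | hybrid
    method="isotonic",                    # platt | isotonic | temperature | none
    metrics=list(run.metrics_evaluated),  # or an explicit subset
    holdout_ratio=0.2,
    seed=42,
    parallel=False,
    concurrency=8,
)

for metric in result.metrics:
    print(metric.metric, metric.mae, metric.pearson, metric.gate_passed)
print("gate:", "PASS" if result.summary.gate_passed else "FAIL")

In the CLI, a failed gate (result.summary.gate_passed is False) exits with code 2, and --write-back stores reporter.render_json(result) under the run's tracker metadata.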