evalvault 1.64.0__py3-none-any.whl → 1.65.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
- evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
- evalvault/adapters/inbound/cli/commands/compare.py +290 -0
- evalvault/adapters/inbound/cli/commands/history.py +13 -85
- evalvault/adapters/inbound/cli/commands/ops.py +110 -0
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
- evalvault/adapters/inbound/cli/commands/regress.py +251 -0
- evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
- evalvault/adapters/outbound/artifact_fs.py +16 -0
- evalvault/adapters/outbound/filesystem/__init__.py +3 -0
- evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
- evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
- evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
- evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
- evalvault/domain/entities/__init__.py +10 -0
- evalvault/domain/entities/judge_calibration.py +50 -0
- evalvault/domain/entities/stage.py +11 -3
- evalvault/domain/services/artifact_lint_service.py +268 -0
- evalvault/domain/services/benchmark_runner.py +1 -6
- evalvault/domain/services/dataset_preprocessor.py +26 -0
- evalvault/domain/services/difficulty_profile_reporter.py +25 -0
- evalvault/domain/services/difficulty_profiling_service.py +304 -0
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/domain/services/judge_calibration_service.py +495 -0
- evalvault/domain/services/ops_snapshot_service.py +159 -0
- evalvault/domain/services/regression_gate_service.py +199 -0
- evalvault/domain/services/run_comparison_service.py +159 -0
- evalvault/domain/services/stage_event_builder.py +6 -1
- evalvault/domain/services/stage_metric_service.py +83 -18
- evalvault/ports/outbound/__init__.py +4 -0
- evalvault/ports/outbound/artifact_fs_port.py +12 -0
- evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
- evalvault/ports/outbound/difficulty_profile_port.py +15 -0
- evalvault/ports/outbound/judge_calibration_port.py +22 -0
- evalvault/ports/outbound/ops_snapshot_port.py +8 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/__init__.py

@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Protocol
+from typing import Any
 
 import typer
 from rich.console import Console
@@ -12,8 +12,11 @@ from rich.console import Console
 from .agent import register_agent_commands
 from .analyze import register_analyze_commands
 from .api import register_api_command
+from .artifacts import create_artifacts_app
 from .benchmark import create_benchmark_app
 from .calibrate import register_calibrate_commands
+from .calibrate_judge import register_calibrate_judge_commands
+from .compare import register_compare_commands
 from .config import register_config_commands
 from .debug import create_debug_app
 from .domain import create_domain_app
@@ -25,19 +28,17 @@ from .init import register_init_command
 from .kg import create_kg_app
 from .langfuse import register_langfuse_commands
 from .method import create_method_app
+from .ops import create_ops_app
 from .phoenix import create_phoenix_app
 from .pipeline import register_pipeline_commands
+from .profile_difficulty import register_profile_difficulty_commands
 from .prompts import create_prompts_app
+from .regress import register_regress_commands
 from .run import register_run_commands
 from .stage import create_stage_app
 
 CommandFactory = Callable[[Console], typer.Typer]
-
-
-class CommandRegistrar(Protocol):
-    """Callable protocol for Typer command registrars."""
-
-    def __call__(self, app: typer.Typer, console: Console, **kwargs: Any) -> None: ...
+CommandRegistrar = Callable[..., Any]
 
 
 @dataclass(frozen=True)
@@ -61,10 +62,14 @@ COMMAND_MODULES: tuple[CommandModule, ...] = (
     CommandModule(register_run_commands, needs_metrics=True),
     CommandModule(register_pipeline_commands),
     CommandModule(register_history_commands),
+    CommandModule(register_compare_commands),
     CommandModule(register_analyze_commands),
     CommandModule(register_calibrate_commands),
+    CommandModule(register_calibrate_judge_commands),
     CommandModule(register_generate_commands),
     CommandModule(register_gate_commands),
+    CommandModule(register_profile_difficulty_commands, needs_metrics=True),
+    CommandModule(register_regress_commands),
     CommandModule(register_agent_commands),
     CommandModule(register_experiment_commands),
     CommandModule(register_config_commands),
@@ -78,9 +83,11 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
     SubAppModule("domain", create_domain_app),
     SubAppModule("benchmark", create_benchmark_app),
     SubAppModule("method", create_method_app),
+    SubAppModule("ops", create_ops_app),
     SubAppModule("phoenix", create_phoenix_app),
     SubAppModule("prompts", create_prompts_app),
     SubAppModule("stage", create_stage_app),
+    SubAppModule("artifacts", create_artifacts_app),
     SubAppModule("debug", create_debug_app),
 )
 
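The registrar type was relaxed from a `Protocol` to a plain callable alias. As an illustration only (none of the names below exist in the package), a registrar that fits the new `CommandRegistrar = Callable[..., Any]` alias and the `(app, console)` convention used by `COMMAND_MODULES` might look like this:

```python
# Illustration only; register_example_commands is hypothetical, not part of evalvault.
from collections.abc import Callable
from typing import Any

import typer
from rich.console import Console

CommandRegistrar = Callable[..., Any]  # the relaxed alias introduced in this release


def register_example_commands(app: typer.Typer, console: Console) -> None:
    # Same (app, console) calling convention the removed Protocol spelled out explicitly.
    @app.command("example")
    def example(name: str = typer.Argument("world")) -> None:
        console.print(f"hello, {name}")


registrar: CommandRegistrar = register_example_commands  # accepted by the alias
app = typer.Typer()
registrar(app, Console())
```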
evalvault/adapters/inbound/cli/commands/artifacts.py (new file)

@@ -0,0 +1,107 @@
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+import typer
+from rich.console import Console
+
+from evalvault.adapters.inbound.cli.utils.console import print_cli_error
+from evalvault.adapters.inbound.cli.utils.validators import validate_choice
+from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter
+from evalvault.domain.services.artifact_lint_service import ArtifactLintService
+
+logger = logging.getLogger(__name__)
+
+
+def create_artifacts_app(console: Console) -> typer.Typer:
+    artifacts_app = typer.Typer(name="artifacts", help="Artifact utilities.")
+
+    @artifacts_app.command("lint")
+    def lint(
+        artifacts_dir: Path = typer.Argument(..., help="Artifacts directory."),
+        strict: bool = typer.Option(False, "--strict", help="Fail on missing files."),
+        output_format: str = typer.Option(
+            "json",
+            "--format",
+            "-f",
+            help="Output format (json).",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Output file path for lint result.",
+        ),
+        parallel: bool = typer.Option(
+            True,
+            "--parallel/--no-parallel",
+            help="Enable parallel validation (placeholder).",
+        ),
+        concurrency: int = typer.Option(
+            8,
+            "--concurrency",
+            min=1,
+            help="Parallel validation concurrency (placeholder).",
+        ),
+    ) -> None:
+        validate_choice(output_format, ["json"], console, value_label="format")
+
+        logger.info("Artifacts lint command started: %s", artifacts_dir)
+        fs_adapter = LocalArtifactFileSystemAdapter()
+        service = ArtifactLintService(fs_adapter)
+        summary = service.lint(artifacts_dir, strict=strict)
+
+        payload = _build_payload(summary, parallel=parallel, concurrency=concurrency)
+        if output:
+            output.parent.mkdir(parents=True, exist_ok=True)
+            output.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
+            console.print(f"[green]Lint report saved:[/green] {output}")
+        else:
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+
+        if summary.status == "error":
+            logger.error("Artifacts lint command failed: %s", artifacts_dir)
+            print_cli_error(console, "Artifact lint failed", details=str(artifacts_dir))
+            raise typer.Exit(1)
+
+        logger.info("Artifacts lint command finished: %s", artifacts_dir)
+
+    return artifacts_app
+
+
+def _build_payload(summary, *, parallel: bool, concurrency: int) -> dict[str, object]:
+    issues = [
+        {
+            "level": issue.level,
+            "code": issue.code,
+            "message": issue.message,
+            "path": issue.path,
+        }
+        for issue in summary.issues
+    ]
+    error_count = sum(1 for issue in summary.issues if issue.level == "error")
+    warning_count = sum(1 for issue in summary.issues if issue.level == "warning")
+    return {
+        "command": "artifacts.lint",
+        "version": 1,
+        "status": summary.status,
+        "started_at": summary.started_at.isoformat(),
+        "finished_at": summary.finished_at.isoformat(),
+        "duration_ms": summary.duration_ms,
+        "artifacts": {
+            "dir": str(summary.artifacts_dir),
+            "index": str(summary.index_path),
+        },
+        "data": {
+            "strict": summary.strict,
+            "parallel": parallel,
+            "concurrency": concurrency,
+            "issue_counts": {
+                "error": error_count,
+                "warning": warning_count,
+            },
+            "issues": issues,
+        },
+    }
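For orientation, the new `artifacts lint` subcommand could be exercised end to end as below; the parent app, sample directory, and output path are assumptions for the sketch, not part of the package:

```python
# Sketch using Typer's test runner; the parent app and paths are assumptions.
import typer
from rich.console import Console
from typer.testing import CliRunner

from evalvault.adapters.inbound.cli.commands.artifacts import create_artifacts_app

cli = typer.Typer()
cli.add_typer(create_artifacts_app(Console()), name="artifacts")

result = CliRunner().invoke(
    cli,
    ["artifacts", "lint", "./artifacts", "--strict", "--output", "reports/lint.json"],
)
print(result.exit_code)  # 1 only when the lint summary status is "error", else 0
```

Note that `--parallel` and `--concurrency` are echoed into the report payload but are documented as placeholders in the command's own help text.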
evalvault/adapters/inbound/cli/commands/calibrate_judge.py (new file)

@@ -0,0 +1,283 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import resolve_artifact_dir, write_json
+from evalvault.adapters.inbound.cli.utils.console import print_cli_error, progress_spinner
+from evalvault.adapters.inbound.cli.utils.options import db_option
+from evalvault.adapters.inbound.cli.utils.validators import parse_csv_option, validate_choice
+from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+
+_console = Console()
+
+_ALLOWED_LABELS = ["feedback", "gold", "hybrid"]
+_ALLOWED_METHODS = ["platt", "isotonic", "temperature", "none"]
+
+
+def register_calibrate_judge_commands(app: typer.Typer, console: Console) -> None:
+    global _console
+    _console = console
+
+    @app.command(name="calibrate-judge")
+    def calibrate_judge(
+        run_id: str = typer.Argument(..., help="Run ID to calibrate"),
+        labels_source: str = typer.Option(
+            "feedback",
+            "--labels-source",
+            help="Label source (feedback|gold|hybrid)",
+        ),
+        method: str = typer.Option(
+            "isotonic",
+            "--method",
+            help="Calibration method (platt|isotonic|temperature|none)",
+        ),
+        metrics: str | None = typer.Option(
+            None,
+            "--metric",
+            "-m",
+            help="Metrics to calibrate (comma-separated; all run metrics if omitted)",
+        ),
+        holdout_ratio: float = typer.Option(
+            0.2,
+            "--holdout-ratio",
+            help="Holdout ratio for validation",
+        ),
+        seed: int = typer.Option(42, "--seed", help="Random seed for sample splitting"),
+        write_back: bool = typer.Option(
+            False,
+            "--write-back",
+            help="Store calibration results in run metadata",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Output path for the JSON result",
+        ),
+        artifacts_dir: Path | None = typer.Option(
+            None,
+            "--artifacts-dir",
+            help="Directory for artifact output",
+        ),
+        parallel: bool = typer.Option(
+            False,
+            "--parallel/--no-parallel",
+            help="Enable parallel execution",
+        ),
+        concurrency: int = typer.Option(8, "--concurrency", help="Concurrency level"),
+        db_path: Path | None = db_option(help_text="Database path"),
+    ) -> None:
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            print_cli_error(_console, "Database path is not configured.")
+            raise typer.Exit(1)
+
+        labels_source = labels_source.strip().lower()
+        method = method.strip().lower()
+        validate_choice(labels_source, _ALLOWED_LABELS, _console, value_label="labels-source")
+        validate_choice(method, _ALLOWED_METHODS, _console, value_label="method")
+
+        metric_list = parse_csv_option(metrics)
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            print_cli_error(_console, "--holdout-ratio must be between 0 and 1.")
+            raise typer.Exit(1)
+        if seed < 0:
+            print_cli_error(_console, "--seed must be 0 or greater.")
+            raise typer.Exit(1)
+        if concurrency <= 0:
+            print_cli_error(_console, "--concurrency must be 1 or greater.")
+            raise typer.Exit(1)
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        try:
+            run = storage.get_run(run_id)
+        except KeyError:
+            print_cli_error(_console, f"Run not found: {run_id}")
+            raise typer.Exit(1)
+
+        feedbacks = storage.list_feedback(run_id)
+        if labels_source in {"feedback", "hybrid"} and not feedbacks:
+            print_cli_error(_console, "No feedback labels found.")
+            raise typer.Exit(1)
+
+        resolved_metrics = metric_list or list(run.metrics_evaluated)
+        if not resolved_metrics:
+            print_cli_error(_console, "No metrics to calibrate.")
+            raise typer.Exit(1)
+
+        prefix = f"judge_calibration_{run_id}"
+        output_path = (output or Path("reports/calibration") / f"{prefix}.json").expanduser()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        resolved_artifacts_dir = resolve_artifact_dir(
+            base_dir=artifacts_dir,
+            output_path=output_path,
+            report_path=output_path,
+            prefix=prefix,
+        )
+
+        service = JudgeCalibrationService()
+        reporter = JudgeCalibrationReporter()
+        started_at = datetime.now(UTC)
+
+        with progress_spinner(_console, "Running judge calibration..."):
+            result = service.calibrate(
+                run,
+                feedbacks,
+                labels_source=labels_source,
+                method=method,
+                metrics=resolved_metrics,
+                holdout_ratio=holdout_ratio,
+                seed=seed,
+                parallel=parallel,
+                concurrency=concurrency,
+            )
+
+        artifacts_index = reporter.write_artifacts(
+            result=result,
+            artifacts_dir=resolved_artifacts_dir,
+        )
+        finished_at = datetime.now(UTC)
+        payload = _build_envelope(
+            result,
+            artifacts_index,
+            started_at=started_at,
+            finished_at=finished_at,
+        )
+        write_json(output_path, payload)
+
+        _display_summary(result)
+        _console.print(f"[green]JSON saved:[/green] {output_path}")
+        _console.print(
+            f"[green]Artifacts saved:[/green] {artifacts_index['dir']} (index: {artifacts_index['index']})"
+        )
+
+        if write_back:
+            metadata = run.tracker_metadata or {}
+            metadata["judge_calibration"] = reporter.render_json(result)
+            metadata["judge_calibration"]["artifacts"] = artifacts_index
+            metadata["judge_calibration"]["output"] = str(output_path)
+            storage.update_run_metadata(run_id, metadata)
+            _console.print("[green]Calibration results saved to run metadata.[/green]")
+
+        if result.summary.gate_passed is False:
+            raise typer.Exit(2)
+
+        return None
+
+
+def _build_envelope(
+    result,
+    artifacts_index: dict[str, str],
+    *,
+    started_at: datetime,
+    finished_at: datetime,
+) -> dict[str, object]:
+    duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+    status = "ok" if result.summary.gate_passed else "degraded"
+    return {
+        "command": "calibrate-judge",
+        "version": 1,
+        "status": status,
+        "started_at": started_at.astimezone(UTC).isoformat(),
+        "finished_at": finished_at.astimezone(UTC).isoformat(),
+        "duration_ms": duration_ms,
+        "artifacts": artifacts_index,
+        "data": {
+            "summary": _serialize_summary(result.summary),
+            "metrics": [_serialize_metric(metric) for metric in result.metrics],
+            "case_results": {
+                metric: [_serialize_case(case) for case in cases]
+                for metric, cases in result.case_results.items()
+            },
+            "warnings": list(result.warnings),
+        },
+    }
+
+
+def _serialize_summary(summary) -> dict[str, object]:
+    return {
+        "run_id": summary.run_id,
+        "labels_source": summary.labels_source,
+        "method": summary.method,
+        "metrics": list(summary.metrics),
+        "holdout_ratio": summary.holdout_ratio,
+        "seed": summary.seed,
+        "total_labels": summary.total_labels,
+        "total_samples": summary.total_samples,
+        "gate_passed": summary.gate_passed,
+        "gate_threshold": summary.gate_threshold,
+        "notes": list(summary.notes),
+    }
+
+
+def _serialize_metric(metric) -> dict[str, object]:
+    return {
+        "metric": metric.metric,
+        "method": metric.method,
+        "sample_count": metric.sample_count,
+        "label_count": metric.label_count,
+        "mae": metric.mae,
+        "pearson": metric.pearson,
+        "spearman": metric.spearman,
+        "temperature": metric.temperature,
+        "parameters": dict(metric.parameters),
+        "gate_passed": metric.gate_passed,
+        "warning": metric.warning,
+    }
+
+
+def _serialize_case(case) -> dict[str, object]:
+    return {
+        "test_case_id": case.test_case_id,
+        "raw_score": case.raw_score,
+        "calibrated_score": case.calibrated_score,
+        "label": case.label,
+        "label_source": case.label_source,
+    }
+
+
+def _display_summary(result) -> None:
+    summary_table = Table(title="Judge Calibration Summary", show_header=True, header_style="bold cyan")
+    summary_table.add_column("Metric")
+    summary_table.add_column("Samples", justify="right")
+    summary_table.add_column("Labels", justify="right")
+    summary_table.add_column("MAE", justify="right")
+    summary_table.add_column("Pearson", justify="right")
+    summary_table.add_column("Spearman", justify="right")
+    summary_table.add_column("Gate", justify="right")
+
+    for metric in result.metrics:
+        summary_table.add_row(
+            metric.metric,
+            str(metric.sample_count),
+            str(metric.label_count),
+            _format_metric(metric.mae),
+            _format_metric(metric.pearson),
+            _format_metric(metric.spearman),
+            "PASS" if metric.gate_passed else "FAIL",
+        )
+
+    _console.print(summary_table)
+    _console.print(
+        f"Label source: {result.summary.labels_source} | "
+        f"Method: {result.summary.method} | "
+        f"Gate: {'PASS' if result.summary.gate_passed else 'FAIL'}"
+    )
+
+    if result.warnings:
+        for warning in result.warnings:
+            _console.print(f"[yellow]Warning:[/yellow] {warning}")
+
+
+def _format_metric(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"