evalvault-1.64.0-py3-none-any.whl → evalvault-1.65.0-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only and reflects the changes between the two published versions.
Files changed (43)
  1. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  2. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  3. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  4. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  5. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  6. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  7. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  8. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  9. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  10. evalvault/adapters/outbound/artifact_fs.py +16 -0
  11. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  12. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  13. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  14. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  15. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  16. evalvault/adapters/outbound/tracker/langfuse_adapter.py +12 -7
  17. evalvault/adapters/outbound/tracker/phoenix_adapter.py +39 -12
  18. evalvault/domain/entities/__init__.py +10 -0
  19. evalvault/domain/entities/judge_calibration.py +50 -0
  20. evalvault/domain/entities/stage.py +11 -3
  21. evalvault/domain/services/artifact_lint_service.py +268 -0
  22. evalvault/domain/services/benchmark_runner.py +1 -6
  23. evalvault/domain/services/dataset_preprocessor.py +26 -0
  24. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  25. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  26. evalvault/domain/services/evaluator.py +2 -0
  27. evalvault/domain/services/judge_calibration_service.py +495 -0
  28. evalvault/domain/services/ops_snapshot_service.py +159 -0
  29. evalvault/domain/services/regression_gate_service.py +199 -0
  30. evalvault/domain/services/run_comparison_service.py +159 -0
  31. evalvault/domain/services/stage_event_builder.py +6 -1
  32. evalvault/domain/services/stage_metric_service.py +83 -18
  33. evalvault/ports/outbound/__init__.py +4 -0
  34. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  35. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  36. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  37. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  38. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  39. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/METADATA +1 -1
  40. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/RECORD +43 -17
  41. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/WHEEL +0 -0
  42. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/entry_points.txt +0 -0
  43. {evalvault-1.64.0.dist-info → evalvault-1.65.0.dist-info}/licenses/LICENSE.md +0 -0

evalvault/adapters/inbound/cli/commands/profile_difficulty.py
@@ -0,0 +1,160 @@
+ from __future__ import annotations
+
+ import logging
+ import re
+ from collections.abc import Sequence
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ import typer
+ from rich.console import Console
+
+ from evalvault.adapters.outbound.filesystem.difficulty_profile_writer import DifficultyProfileWriter
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+ from evalvault.config.settings import Settings
+ from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter
+ from evalvault.domain.services.difficulty_profiling_service import (
+     DifficultyProfileRequest,
+     DifficultyProfilingService,
+ )
+
+ from ..utils.console import print_cli_error, progress_spinner
+ from ..utils.options import db_option
+ from ..utils.validators import parse_csv_option, validate_choices
+
+ logger = logging.getLogger(__name__)
+
+
+ def register_profile_difficulty_commands(
+     app: typer.Typer,
+     console: Console,
+     available_metrics: Sequence[str],
+ ) -> None:
+     @app.command("profile-difficulty")
+     def profile_difficulty(
+         dataset_name: str | None = typer.Option(
+             None, "--dataset-name", help="Dataset name to profile."
+         ),
+         run_id: str | None = typer.Option(None, "--run-id", help="Run ID to profile."),
+         limit_runs: int | None = typer.Option(
+             None, "--limit-runs", help="Limit number of runs to analyze."
+         ),
+         metrics: str | None = typer.Option(
+             None, "--metrics", "-m", help="Comma-separated metric allowlist."
+         ),
+         bucket_count: int = typer.Option(
+             3, "--bucket-count", help="Number of difficulty buckets.", min=2
+         ),
+         min_samples: int = typer.Option(
+             10, "--min-samples", help="Minimum samples required for profiling.", min=1
+         ),
+         output_path: Path | None = typer.Option(None, "--output", "-o", help="Output JSON path."),
+         artifacts_dir: Path | None = typer.Option(
+             None, "--artifacts-dir", help="Artifacts directory path."
+         ),
+         parallel: bool = typer.Option(
+             False, "--parallel/--no-parallel", help="Enable parallel execution."
+         ),
+         concurrency: int | None = typer.Option(
+             None, "--concurrency", help="Max concurrency when parallel is enabled.", min=1
+         ),
+         db_path: Path | None = db_option(help_text="SQLite DB path."),
+     ) -> None:
+         if not dataset_name and not run_id:
+             print_cli_error(
+                 console,
+                 "--dataset-name 또는 --run-id 중 하나는 필수입니다.",
+                 fixes=["예: --dataset-name insurance-qa", "또는 --run-id run_123"],
+             )
+             raise typer.Exit(1)
+         if dataset_name and run_id:
+             print_cli_error(
+                 console,
+                 "--dataset-name과 --run-id는 동시에 사용할 수 없습니다.",
+                 fixes=["둘 중 하나만 지정하세요."],
+             )
+             raise typer.Exit(1)
+
+         resolved_db_path = db_path or Settings().evalvault_db_path
+         if resolved_db_path is None:
+             print_cli_error(
+                 console,
+                 "DB 경로가 필요합니다.",
+                 fixes=["--db 옵션으로 SQLite DB 경로를 지정하세요."],
+             )
+             raise typer.Exit(1)
+
+         metric_list = parse_csv_option(metrics)
+         if metric_list:
+             validate_choices(metric_list, available_metrics, console, value_label="metric")
+         resolved_metrics = tuple(metric_list) if metric_list else None
+
+         identifier = _safe_identifier(run_id or dataset_name or "difficulty")
+         prefix = f"difficulty_{identifier}"
+         resolved_output = output_path or Path("reports") / "difficulty" / f"{prefix}.json"
+         resolved_artifacts_dir = artifacts_dir or resolved_output.parent / "artifacts" / prefix
+
+         storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+         writer = DifficultyProfileWriter()
+         reporter = DifficultyProfileReporter(writer)
+         service = DifficultyProfilingService(storage=storage, reporter=reporter)
+         request = DifficultyProfileRequest(
+             dataset_name=dataset_name,
+             run_id=run_id,
+             limit_runs=limit_runs,
+             metrics=resolved_metrics,
+             bucket_count=bucket_count,
+             min_samples=min_samples,
+             output_path=resolved_output,
+             artifacts_dir=resolved_artifacts_dir,
+             parallel=parallel,
+             concurrency=concurrency,
+         )
+
+         with progress_spinner(console, "난이도 프로파일링 실행 중..."):
+             started_at = datetime.now(UTC)
+             logger.info("profile-difficulty started", extra={"dataset_name": dataset_name})
+             try:
+                 envelope = service.profile(request)
+             except KeyError as exc:
+                 logger.exception("profile-difficulty run missing")
+                 print_cli_error(
+                     console,
+                     "Run을 찾지 못했습니다.",
+                     details=str(exc),
+                     fixes=["--run-id 값과 --db 경로를 확인하세요."],
+                 )
+                 raise typer.Exit(1) from exc
+             except ValueError as exc:
+                 logger.exception("profile-difficulty validation failed")
+                 print_cli_error(
+                     console,
+                     "난이도 프로파일링 조건을 만족하지 못했습니다.",
+                     details=str(exc),
+                     fixes=["--min-samples 값을 낮추거나 충분한 실행 이력을 준비하세요."],
+                 )
+                 raise typer.Exit(1) from exc
+             except Exception as exc:
+                 logger.exception("profile-difficulty failed")
+                 print_cli_error(
+                     console,
+                     "난이도 프로파일링 중 오류가 발생했습니다.",
+                     details=str(exc),
+                 )
+                 raise typer.Exit(1) from exc
+
+         finished_at = datetime.now(UTC)
+         duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+         console.print("[green]난이도 프로파일링 완료[/green]")
+         console.print(f"- output: {resolved_output}")
+         console.print(f"- artifacts: {envelope.get('artifacts', {}).get('dir')}")
+         console.print(f"- duration_ms: {duration_ms}")
+
+
+ def _safe_identifier(value: str) -> str:
+     sanitized = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("_")
+     return sanitized or "difficulty"
+
+
+ __all__ = ["register_profile_difficulty_commands"]

evalvault/adapters/inbound/cli/commands/regress.py
@@ -0,0 +1,251 @@
+ from __future__ import annotations
+
+ import json
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Literal
+
+ import typer
+ from rich.console import Console
+ from rich.table import Table
+
+ from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+ from evalvault.adapters.outbound.analysis.statistical_adapter import (
+     StatisticalAnalysisAdapter,
+ )
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+ from evalvault.domain.services.regression_gate_service import (
+     RegressionGateReport,
+     RegressionGateService,
+     TestType,
+ )
+
+ from ..utils.formatters import format_diff, format_score, format_status
+ from ..utils.options import db_option
+ from ..utils.validators import parse_csv_option, validate_choice
+
+
+ def _coerce_test_type(value: Literal["t-test", "mann-whitney"]) -> TestType:
+     if value == "t-test":
+         return "t-test"
+     return "mann-whitney"
+
+
+ OutputFormat = Literal["table", "json", "github-actions"]
+
+
+ def _format_timestamp(value: datetime) -> str:
+     return value.astimezone(UTC).isoformat().replace("+00:00", "Z")
+
+
+ def _build_envelope(
+     *,
+     report: RegressionGateReport | None,
+     status: str,
+     started_at: datetime,
+     finished_at: datetime,
+     duration_ms: int,
+     message: str | None = None,
+     error_type: str | None = None,
+ ) -> dict[str, object]:
+     payload: dict[str, object] = {
+         "command": "regress",
+         "version": 1,
+         "status": status,
+         "started_at": _format_timestamp(started_at),
+         "finished_at": _format_timestamp(finished_at),
+         "duration_ms": duration_ms,
+         "artifacts": None,
+         "data": report.to_dict() if report else None,
+     }
+     if message:
+         payload["message"] = message
+     if error_type:
+         payload["error_type"] = error_type
+     return payload
+
+
+ def register_regress_commands(app: typer.Typer, console: Console) -> None:
+     @app.command()
+     def regress(
+         run_id: str = typer.Argument(..., help="Candidate run ID to check."),
+         baseline: str = typer.Option(
+             ...,
+             "--baseline",
+             "-b",
+             help="Baseline run ID for regression detection.",
+         ),
+         fail_on_regression: float = typer.Option(
+             0.05,
+             "--fail-on-regression",
+             "-r",
+             help="Fail if metric drops by more than this amount (default: 0.05).",
+         ),
+         test: TestType = typer.Option(
+             "t-test",
+             "--test",
+             "-t",
+             help="Statistical test (t-test, mann-whitney).",
+         ),
+         metrics: str | None = typer.Option(
+             None,
+             "--metrics",
+             "-m",
+             help="Comma-separated list of metrics to check.",
+         ),
+         output_format: OutputFormat = typer.Option(
+             "table",
+             "--format",
+             "-f",
+             help="Output format: table, json, or github-actions.",
+         ),
+         output: Path | None = typer.Option(
+             None,
+             "--output",
+             "-o",
+             help="Write JSON summary to a file.",
+         ),
+         parallel: bool = typer.Option(
+             True,
+             "--parallel/--no-parallel",
+             help="Enable parallel execution for metric checks.",
+         ),
+         concurrency: int = typer.Option(
+             8,
+             "--concurrency",
+             help="Concurrency level when running in parallel.",
+         ),
+         db_path: Path | None = db_option(help_text="Database path"),
+     ) -> None:
+         started_at = datetime.now(UTC)
+         if db_path is None:
+             console.print("[red]Error:[/red] Database path is not configured.")
+             raise typer.Exit(1)
+
+         validate_choice(test, ["t-test", "mann-whitney"], console, value_label="test")
+         metric_list = parse_csv_option(metrics)
+
+         storage = SQLiteStorageAdapter(db_path=db_path)
+         analysis_adapter = StatisticalAnalysisAdapter()
+         service = RegressionGateService(storage=storage, analysis_adapter=analysis_adapter)
+
+         try:
+             report = service.run_gate(
+                 run_id,
+                 baseline,
+                 metrics=metric_list or None,
+                 test_type=_coerce_test_type(test),
+                 fail_on_regression=fail_on_regression,
+                 parallel=parallel,
+                 concurrency=concurrency,
+             )
+         except (KeyError, ValueError) as exc:
+             finished_at = datetime.now(UTC)
+             duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+             payload = _build_envelope(
+                 report=None,
+                 status="error",
+                 started_at=started_at,
+                 finished_at=finished_at,
+                 duration_ms=duration_ms,
+                 message=str(exc),
+                 error_type=type(exc).__name__,
+             )
+             if output:
+                 write_json(output, payload)
+             if output_format == "json":
+                 console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+             else:
+                 console.print(f"[red]Error:[/red] {exc}")
+             raise typer.Exit(1) from exc
+         except Exception as exc:
+             finished_at = datetime.now(UTC)
+             duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+             payload = _build_envelope(
+                 report=None,
+                 status="error",
+                 started_at=started_at,
+                 finished_at=finished_at,
+                 duration_ms=duration_ms,
+                 message=str(exc),
+                 error_type=type(exc).__name__,
+             )
+             if output:
+                 write_json(output, payload)
+             if output_format == "json":
+                 console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+             else:
+                 console.print(f"[red]Error:[/red] {exc}")
+             raise typer.Exit(3) from exc
+
+         finished_at = report.finished_at
+         duration_ms = report.duration_ms
+         payload = _build_envelope(
+             report=report,
+             status="ok",
+             started_at=report.started_at,
+             finished_at=finished_at,
+             duration_ms=duration_ms,
+         )
+         if output:
+             write_json(output, payload)
+
+         if output_format == "json":
+             console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+         elif output_format == "github-actions":
+             _render_github_actions(report, console)
+         else:
+             _render_table(report, console)
+
+         if report.regression_detected:
+             raise typer.Exit(2)
+
+
+ def _render_table(report: RegressionGateReport, console: Console) -> None:
+     console.print(f"\n[bold]Regression Gate Check: {report.candidate_run_id}[/bold]\n")
+     console.print(f"Baseline: {report.baseline_run_id}")
+     console.print(f"Test: {report.test_type}\n")
+     table = Table(show_header=True, header_style="bold cyan")
+     table.add_column("Metric")
+     table.add_column("Baseline", justify="right")
+     table.add_column("Candidate", justify="right")
+     table.add_column("Diff", justify="right")
+     table.add_column("p-value", justify="right")
+     table.add_column("Regression", justify="center")
+
+     for result in report.results:
+         table.add_row(
+             result.metric,
+             format_score(result.baseline_score),
+             format_score(result.candidate_score),
+             format_diff(result.diff),
+             f"{result.p_value:.4f}",
+             format_status(not result.regression, success_text="NO", failure_text="YES"),
+         )
+
+     console.print(table)
+     if report.regression_detected:
+         regressed = [r.metric for r in report.results if r.regression]
+         console.print("\n[bold red]Regression detected[/bold red]")
+         console.print(f"[red]Regressed metrics: {', '.join(regressed)}[/red]")
+     else:
+         console.print("\n[bold green]Regression gate PASSED[/bold green]")
+     console.print()
+
+
+ def _render_github_actions(report: RegressionGateReport, console: Console) -> None:
+     for result in report.results:
+         status = "✅" if not result.regression else "❌"
+         reg_status = " (REGRESSION)" if result.regression else ""
+         console.print(
+             f"{status} {result.metric}: {result.candidate_score:.3f} "
+             f"(baseline: {result.baseline_score:.3f}, diff: {result.diff:+.3f}){reg_status}"
+         )
+
+     console.print(f"::set-output name=passed::{str(not report.regression_detected).lower()}")
+     if report.regression_detected:
+         regressed = [r.metric for r in report.results if r.regression]
+         console.print(f"::error::Regression detected in: {', '.join(regressed)}")
+
+
+ __all__ = ["register_regress_commands"]

evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py
@@ -0,0 +1,49 @@
+ from __future__ import annotations
+
+ import asyncio
+
+ from evalvault.domain.entities.analysis_pipeline import AnalysisIntent, PipelineResult
+ from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
+ from evalvault.ports.outbound.comparison_pipeline_port import ComparisonPipelinePort
+
+
+ class ComparisonPipelineAdapter(ComparisonPipelinePort):
+     def __init__(self, service: AnalysisPipelineService) -> None:
+         self._service = service
+
+     def run_comparison(
+         self,
+         *,
+         run_ids: list[str],
+         compare_metrics: list[str] | None,
+         test_type: str,
+         parallel: bool,
+         concurrency: int | None,
+         report_type: str,
+         use_llm_report: bool,
+     ) -> PipelineResult:
+         params = {
+             "run_ids": run_ids,
+             "compare_metrics": compare_metrics,
+             "test_type": test_type,
+             "report_type": report_type,
+             "use_llm_report": use_llm_report,
+         }
+         if parallel:
+             if concurrency is not None:
+                 params["max_concurrency"] = concurrency
+             return asyncio.run(
+                 self._service.analyze_intent_async(
+                     AnalysisIntent.GENERATE_COMPARISON,
+                     run_id=run_ids[0] if run_ids else None,
+                     **params,
+                 )
+             )
+         return self._service.analyze_intent(
+             AnalysisIntent.GENERATE_COMPARISON,
+             run_id=run_ids[0] if run_ids else None,
+             **params,
+         )
+
+
+ __all__ = ["ComparisonPipelineAdapter"]

evalvault/adapters/outbound/artifact_fs.py
@@ -0,0 +1,16 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from evalvault.ports.outbound.artifact_fs_port import ArtifactFileSystemPort
+
+
+ class LocalArtifactFileSystemAdapter(ArtifactFileSystemPort):
+     def exists(self, path: Path) -> bool:
+         return path.exists()
+
+     def is_dir(self, path: Path) -> bool:
+         return path.is_dir()
+
+     def read_text(self, path: Path) -> str:
+         return path.read_text(encoding="utf-8")
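
LocalArtifactFileSystemAdapter is a thin pass-through to pathlib, which keeps ArtifactFileSystemPort trivial to fake in tests. A minimal usage sketch with a hypothetical artifact path:

from pathlib import Path

from evalvault.adapters.outbound.artifact_fs import LocalArtifactFileSystemAdapter

fs = LocalArtifactFileSystemAdapter()
index = Path("reports/difficulty/artifacts/index.json")  # hypothetical path
if fs.exists(index) and not fs.is_dir(index):
    print(fs.read_text(index))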

evalvault/adapters/outbound/filesystem/__init__.py
@@ -0,0 +1,3 @@
+ from evalvault.adapters.outbound.filesystem.ops_snapshot_writer import OpsSnapshotWriter
+
+ __all__ = ["OpsSnapshotWriter"]

evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py
@@ -0,0 +1,50 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+
+ from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+ from evalvault.ports.outbound.difficulty_profile_port import DifficultyProfileWriterPort
+
+
+ class DifficultyProfileWriter(DifficultyProfileWriterPort):
+     def write_profile(
+         self,
+         *,
+         output_path: Path,
+         artifacts_dir: Path,
+         envelope: dict[str, object],
+         artifacts: dict[str, object],
+     ) -> dict[str, object]:
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         artifacts_dir.mkdir(parents=True, exist_ok=True)
+
+         breakdown_path = artifacts_dir / "difficulty_breakdown.json"
+         cases_path = artifacts_dir / "difficulty_cases.json"
+         breakdown_payload = artifacts.get("breakdown")
+         cases_payload = artifacts.get("cases")
+         write_json(
+             breakdown_path,
+             breakdown_payload if isinstance(breakdown_payload, dict) else {},
+         )
+         write_json(
+             cases_path,
+             {"cases": cases_payload} if isinstance(cases_payload, list) else {"cases": []},
+         )
+
+         index_payload = {
+             "files": {
+                 "breakdown": str(breakdown_path),
+                 "cases": str(cases_path),
+             }
+         }
+         index_path = artifacts_dir / "index.json"
+         write_json(index_path, index_payload)
+
+         artifacts_index = {
+             "dir": str(artifacts_dir),
+             "index": str(index_path),
+         }
+         envelope["artifacts"] = artifacts_index
+         write_json(output_path, envelope)
+
+         return artifacts_index
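
write_profile produces three files under artifacts_dir (difficulty_breakdown.json, difficulty_cases.json, and an index.json pointing at both), stamps the returned {"dir", "index"} block into envelope["artifacts"], and finally writes the envelope to output_path. A direct call sketch with hypothetical paths and a minimal illustrative payload:

from pathlib import Path

from evalvault.adapters.outbound.filesystem.difficulty_profile_writer import (
    DifficultyProfileWriter,
)

writer = DifficultyProfileWriter()
artifacts_index = writer.write_profile(
    output_path=Path("reports/difficulty/difficulty_demo.json"),        # hypothetical paths
    artifacts_dir=Path("reports/difficulty/artifacts/difficulty_demo"),
    envelope={"command": "profile-difficulty", "data": None},           # minimal illustrative envelope
    artifacts={"breakdown": {}, "cases": []},
)
print(artifacts_index)  # {"dir": "...", "index": ".../index.json"}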

evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any
+
+ from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+ from evalvault.ports.outbound.ops_snapshot_port import OpsSnapshotWriterPort
+
+
+ class OpsSnapshotWriter(OpsSnapshotWriterPort):
+     def write_snapshot(self, path: Path, payload: dict[str, Any]) -> None:
+         path.parent.mkdir(parents=True, exist_ok=True)
+         write_json(path, payload)
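
OpsSnapshotWriter only ensures the parent directory exists and delegates to the shared write_json helper, so the snapshot shape is entirely up to the caller (in practice, OpsSnapshotService). A minimal sketch with a hypothetical payload:

from pathlib import Path

from evalvault.adapters.outbound.filesystem.ops_snapshot_writer import OpsSnapshotWriter

writer = OpsSnapshotWriter()
# Hypothetical snapshot payload; the real shape comes from OpsSnapshotService.
writer.write_snapshot(Path("reports/ops/snapshot.json"), {"command": "ops", "status": "ok"})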

evalvault/adapters/outbound/judge_calibration_adapter.py
@@ -0,0 +1,36 @@
+ from __future__ import annotations
+
+ from evalvault.domain.entities import EvaluationRun, SatisfactionFeedback
+ from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+ from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
+ from evalvault.ports.outbound.judge_calibration_port import JudgeCalibrationPort
+
+
+ class JudgeCalibrationAdapter(JudgeCalibrationPort):
+     def __init__(self) -> None:
+         self._service = JudgeCalibrationService()
+
+     def calibrate(
+         self,
+         run: EvaluationRun,
+         feedbacks: list[SatisfactionFeedback],
+         *,
+         labels_source: str,
+         method: str,
+         metrics: list[str],
+         holdout_ratio: float,
+         seed: int,
+         parallel: bool = False,
+         concurrency: int = 8,
+     ) -> JudgeCalibrationResult:
+         return self._service.calibrate(
+             run,
+             feedbacks,
+             labels_source=labels_source,
+             method=method,
+             metrics=metrics,
+             holdout_ratio=holdout_ratio,
+             seed=seed,
+             parallel=parallel,
+             concurrency=concurrency,
+         )

evalvault/adapters/outbound/judge_calibration_reporter.py
@@ -0,0 +1,57 @@
+ from __future__ import annotations
+
+ import json
+ from dataclasses import asdict
+ from pathlib import Path
+ from typing import Any
+
+ from evalvault.domain.entities.judge_calibration import JudgeCalibrationResult
+
+
+ class JudgeCalibrationReporter:
+     def render_json(self, result: JudgeCalibrationResult) -> dict[str, Any]:
+         return {
+             "summary": asdict(result.summary),
+             "metrics": [asdict(metric) for metric in result.metrics],
+             "case_results": {
+                 metric: [asdict(entry) for entry in entries]
+                 for metric, entries in result.case_results.items()
+             },
+             "warnings": list(result.warnings),
+         }
+
+     def write_artifacts(
+         self,
+         *,
+         result: JudgeCalibrationResult,
+         artifacts_dir: Path,
+     ) -> dict[str, str]:
+         artifacts_dir.mkdir(parents=True, exist_ok=True)
+         index_path = artifacts_dir / "index.json"
+         payload = {
+             "run_id": result.summary.run_id,
+             "metrics": [metric.metric for metric in result.metrics],
+             "cases": {},
+         }
+         for metric, cases in result.case_results.items():
+             case_path = artifacts_dir / f"{metric}.json"
+             case_payload = [
+                 {
+                     "test_case_id": case.test_case_id,
+                     "raw_score": case.raw_score,
+                     "calibrated_score": case.calibrated_score,
+                     "label": case.label,
+                     "label_source": case.label_source,
+                 }
+                 for case in cases
+             ]
+             case_path.write_text(
+                 json.dumps(case_payload, ensure_ascii=False, indent=2),
+                 encoding="utf-8",
+             )
+             payload["cases"][metric] = str(case_path)
+         index_path.write_text(
+             json.dumps(payload, ensure_ascii=False, indent=2),
+             encoding="utf-8",
+         )
+         return {"dir": str(artifacts_dir), "index": str(index_path)}

evalvault/adapters/outbound/tracker/langfuse_adapter.py
@@ -63,13 +63,15 @@ class LangfuseAdapter(TrackerPort):
              span.update_trace(name=name, metadata=metadata)
              self._traces[trace_id] = span
          else:
-             # Langfuse 2.x: use trace method
-             trace = self._client.trace(
+             trace_fn: Any = getattr(self._client, "trace", None)
+             if trace_fn is None:
+                 raise RuntimeError("Langfuse client does not expose trace API")
+             trace_obj = trace_fn(
                  name=name,
                  metadata=metadata,
              )
-             trace_id = trace.id
-             self._traces[trace_id] = trace
+             trace_id = trace_obj.id
+             self._traces[trace_id] = trace_obj

          return trace_id

      def add_span(
@@ -240,7 +242,7 @@ class LangfuseAdapter(TrackerPort):
          passed_count = sum(
              1
              for r in run.results
-             if r.get_metric(metric_name) and r.get_metric(metric_name).passed
+             if (metric := r.get_metric(metric_name)) and metric.passed is True
          )
          avg_score = run.get_avg_score(metric_name)
          threshold = run.thresholds.get(metric_name, 0.7)
@@ -421,12 +423,15 @@ class LangfuseAdapter(TrackerPort):
          }

          # Span metadata: additional info
-         span_metadata = {
+         span_metadata: dict[str, float | int] = {
              "tokens_used": result.tokens_used,
              "latency_ms": result.latency_ms,
          }
          if result.cost_usd:
-             span_metadata["cost_usd"] = result.cost_usd
+             span_metadata = {
+                 **span_metadata,
+                 "cost_usd": float(result.cost_usd),
+             }

          if hasattr(root_span, "start_span"):
              child_span = root_span.start_span(