evalvault-1.64.0-py3-none-any.whl → evalvault-1.66.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. evalvault/adapters/inbound/api/adapter.py +14 -0
  2. evalvault/adapters/inbound/api/main.py +14 -4
  3. evalvault/adapters/inbound/api/routers/chat.py +543 -0
  4. evalvault/adapters/inbound/cli/commands/__init__.py +14 -7
  5. evalvault/adapters/inbound/cli/commands/artifacts.py +107 -0
  6. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +283 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +290 -0
  8. evalvault/adapters/inbound/cli/commands/history.py +13 -85
  9. evalvault/adapters/inbound/cli/commands/ops.py +110 -0
  10. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +160 -0
  11. evalvault/adapters/inbound/cli/commands/regress.py +251 -0
  12. evalvault/adapters/inbound/cli/commands/run.py +14 -0
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
  14. evalvault/adapters/outbound/analysis/comparison_pipeline_adapter.py +49 -0
  15. evalvault/adapters/outbound/artifact_fs.py +16 -0
  16. evalvault/adapters/outbound/filesystem/__init__.py +3 -0
  17. evalvault/adapters/outbound/filesystem/difficulty_profile_writer.py +50 -0
  18. evalvault/adapters/outbound/filesystem/ops_snapshot_writer.py +13 -0
  19. evalvault/adapters/outbound/judge_calibration_adapter.py +36 -0
  20. evalvault/adapters/outbound/judge_calibration_reporter.py +57 -0
  21. evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
  22. evalvault/adapters/outbound/storage/base_sql.py +41 -1
  23. evalvault/adapters/outbound/tracker/langfuse_adapter.py +13 -7
  24. evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
  25. evalvault/adapters/outbound/tracker/phoenix_adapter.py +68 -14
  26. evalvault/config/settings.py +21 -0
  27. evalvault/domain/entities/__init__.py +10 -0
  28. evalvault/domain/entities/judge_calibration.py +50 -0
  29. evalvault/domain/entities/prompt.py +1 -1
  30. evalvault/domain/entities/stage.py +11 -3
  31. evalvault/domain/metrics/__init__.py +8 -0
  32. evalvault/domain/metrics/registry.py +39 -3
  33. evalvault/domain/metrics/summary_accuracy.py +189 -0
  34. evalvault/domain/metrics/summary_needs_followup.py +45 -0
  35. evalvault/domain/metrics/summary_non_definitive.py +41 -0
  36. evalvault/domain/metrics/summary_risk_coverage.py +45 -0
  37. evalvault/domain/services/artifact_lint_service.py +268 -0
  38. evalvault/domain/services/benchmark_runner.py +1 -6
  39. evalvault/domain/services/custom_metric_snapshot.py +233 -0
  40. evalvault/domain/services/dataset_preprocessor.py +26 -0
  41. evalvault/domain/services/difficulty_profile_reporter.py +25 -0
  42. evalvault/domain/services/difficulty_profiling_service.py +304 -0
  43. evalvault/domain/services/evaluator.py +282 -27
  44. evalvault/domain/services/judge_calibration_service.py +495 -0
  45. evalvault/domain/services/ops_snapshot_service.py +159 -0
  46. evalvault/domain/services/prompt_registry.py +39 -10
  47. evalvault/domain/services/regression_gate_service.py +199 -0
  48. evalvault/domain/services/run_comparison_service.py +159 -0
  49. evalvault/domain/services/stage_event_builder.py +6 -1
  50. evalvault/domain/services/stage_metric_service.py +83 -18
  51. evalvault/domain/services/threshold_profiles.py +4 -0
  52. evalvault/domain/services/visual_space_service.py +79 -4
  53. evalvault/ports/outbound/__init__.py +4 -0
  54. evalvault/ports/outbound/artifact_fs_port.py +12 -0
  55. evalvault/ports/outbound/comparison_pipeline_port.py +22 -0
  56. evalvault/ports/outbound/difficulty_profile_port.py +15 -0
  57. evalvault/ports/outbound/judge_calibration_port.py +22 -0
  58. evalvault/ports/outbound/ops_snapshot_port.py +8 -0
  59. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
  60. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +63 -31
  61. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
  62. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
  63. {evalvault-1.64.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/history.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import json
 from pathlib import Path
+from typing import cast
 
 import typer
 from rich.console import Console
@@ -46,7 +47,7 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
             "--mode",
             help="Filter by run mode: 'simple' or 'full'.",
         ),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Show evaluation run history.
 
@@ -75,7 +76,6 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
 
         \b
        See also:
-            evalvault compare — Compare two runs side by side
             evalvault export — Export run details to JSON
             evalvault run — Create new evaluation runs
        """
@@ -88,7 +88,11 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
                "[red]Error:[/red] --mode must be one of: " + ", ".join(RUN_MODE_CHOICES)
            )
            raise typer.Exit(2)
-        storage = SQLiteStorageAdapter(db_path=db_path)
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            console.print("[red]Error:[/red] Database path is not configured.")
+            raise typer.Exit(1)
+        storage = SQLiteStorageAdapter(db_path=cast(Path, resolved_db_path))
        runs = storage.list_runs(limit=limit, dataset_name=dataset, model_name=model)
        if normalized_mode:
            runs = [
@@ -157,86 +161,6 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
        console.print(table)
        console.print(f"\n[dim]Showing {len(runs)} of {limit} runs[/dim]\n")
 
-    @app.command()
-    def compare(
-        run_id1: str = typer.Argument(..., help="First run ID to compare."),
-        run_id2: str = typer.Argument(..., help="Second run ID to compare."),
-        db_path: Path = db_option(help_text="Path to database file."),
-    ) -> None:
-        """Compare two evaluation runs.
-
-        Show a side-by-side comparison of metrics, pass rates, and scores
-        between two evaluation runs.
-
-        \b
-        Examples:
-            # Compare two runs by ID
-            evalvault compare abc12345 def67890
-
-            # Compare runs from a custom database
-            evalvault compare abc12345 def67890 --db custom.db
-
-        \b
-        See also:
-            evalvault history — List runs to find IDs
-            evalvault export — Export run details to JSON
-            evalvault analyze — Deep analysis of a single run
-        """
-        console.print("\n[bold]Comparing Evaluation Runs[/bold]\n")
-
-        storage = SQLiteStorageAdapter(db_path=db_path)
-
-        try:
-            run1 = storage.get_run(run_id1)
-            run2 = storage.get_run(run_id2)
-        except KeyError as exc:
-            console.print(f"[red]Error:[/red] {exc}")
-            raise typer.Exit(1) from exc
-
-        table = Table(show_header=True, header_style="bold cyan")
-        table.add_column("Metric")
-        table.add_column(f"Run 1\n{run_id1[:12]}...", justify="right")
-        table.add_column(f"Run 2\n{run_id2[:12]}...", justify="right")
-        table.add_column("Difference", justify="right")
-
-        table.add_row("Dataset", run1.dataset_name, run2.dataset_name, "-")
-        table.add_row("Model", run1.model_name, run2.model_name, "-")
-        table.add_row(
-            "Test Cases",
-            str(run1.total_test_cases),
-            str(run2.total_test_cases),
-            str(run2.total_test_cases - run1.total_test_cases),
-        )
-
-        pass_rate_diff = run2.pass_rate - run1.pass_rate
-        diff_color = "green" if pass_rate_diff > 0 else "red" if pass_rate_diff < 0 else "dim"
-        table.add_row(
-            "Pass Rate",
-            f"{run1.pass_rate:.1%}",
-            f"{run2.pass_rate:.1%}",
-            f"[{diff_color}]{pass_rate_diff:+.1%}[/{diff_color}]",
-        )
-
-        for metric in run1.metrics_evaluated:
-            if metric in run2.metrics_evaluated:
-                score1 = run1.get_avg_score(metric)
-                score2 = run2.get_avg_score(metric)
-                diff = score2 - score1 if score1 is not None and score2 is not None else None
-                diff_str = (
-                    f"[{'green' if diff and diff > 0 else 'red' if diff and diff < 0 else 'dim'}]{diff:+.3f}[/{'green' if diff and diff > 0 else 'red' if diff and diff < 0 else 'dim'}]"
-                    if diff is not None
-                    else "-"
-                )
-                table.add_row(
-                    f"Avg {metric}",
-                    f"{score1:.3f}" if score1 is not None else "-",
-                    f"{score2:.3f}" if score2 is not None else "-",
-                    diff_str,
-                )
-
-        console.print(table)
-        console.print()
-
     @app.command(name="export")
     def export_cmd(
         run_id: str = typer.Argument(..., help="Run ID to export."),
@@ -246,7 +170,7 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
            "-o",
            help="Output file path (JSON format).",
        ),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
    ) -> None:
        """Export evaluation run to JSON file.
 
@@ -275,7 +199,11 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
        """
        console.print(f"\n[bold]Exporting Run {run_id}[/bold]\n")
 
-        storage = SQLiteStorageAdapter(db_path=db_path)
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            console.print("[red]Error:[/red] Database path is not configured.")
+            raise typer.Exit(1)
+        storage = SQLiteStorageAdapter(db_path=cast(Path, resolved_db_path))
 
        try:
            run = storage.get_run(run_id)
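Both `history` and `export` now fall back to the configured database path when `--db` is omitted; the `Settings` import this relies on is added elsewhere in the file and is not visible in the hunks above. A minimal sketch of the resolution pattern, with a hypothetical stand-in for `Settings`:

    from pathlib import Path
    from typing import cast

    class Settings:  # hypothetical stand-in for evalvault.config.settings.Settings
        evalvault_db_path: Path | None = Path("evalvault.db")

    def resolve_db_path(db_path: Path | None) -> Path:
        # Mirror the CLI behavior: an explicit --db wins, otherwise settings,
        # and fail if neither is configured.
        resolved = db_path or Settings().evalvault_db_path
        if resolved is None:
            raise SystemExit("Database path is not configured.")
        return cast(Path, resolved)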
evalvault/adapters/inbound/cli/commands/ops.py (new file)
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+
+from evalvault.adapters.outbound.filesystem.ops_snapshot_writer import OpsSnapshotWriter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.services.ops_snapshot_service import (
+    OpsSnapshotRequest,
+    OpsSnapshotService,
+)
+
+from ..utils.console import print_cli_error, progress_spinner
+from ..utils.options import db_option, profile_option
+
+
+def _resolve_storage_path(db_path: Path | None) -> Path:
+    if db_path is None:
+        return Path(Settings().evalvault_db_path)
+    return db_path
+
+
+def create_ops_app(console: Console) -> typer.Typer:
+    app = typer.Typer(name="ops", help="Ops utilities.")
+
+    @app.command("snapshot")
+    def snapshot(
+        run_id: str = typer.Option(..., "--run-id", help="Run ID to snapshot."),
+        profile: str | None = profile_option(help_text="Profile name to snapshot."),
+        db_path: Path | None = db_option(help_text="Path to SQLite database file."),
+        include_model_config: bool = typer.Option(
+            False,
+            "--include-model-config",
+            help="Include model profile configuration.",
+        ),
+        include_env: bool = typer.Option(
+            False,
+            "--include-env",
+            help="Include resolved settings snapshot.",
+        ),
+        redact: list[str] = typer.Option(
+            [],
+            "--redact",
+            help="Environment keys to redact (repeatable).",
+        ),
+        output_path: Path = typer.Option(
+            ..., "--output", "-o", help="Output JSON path for snapshot."
+        ),
+    ) -> None:
+        settings = Settings()
+        resolved_profile = profile or settings.evalvault_profile
+        if resolved_profile:
+            settings = apply_profile(settings, resolved_profile)
+
+        resolved_db_path = _resolve_storage_path(db_path)
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        writer = OpsSnapshotWriter()
+        service = OpsSnapshotService(
+            storage=storage,
+            writer=writer,
+            settings=settings,
+            output_path=output_path,
+        )
+        request = OpsSnapshotRequest(
+            run_id=run_id,
+            profile=resolved_profile,
+            db_path=resolved_db_path,
+            include_model_config=include_model_config,
+            include_env=include_env,
+            redact_keys=tuple(redact),
+        )
+
+        with progress_spinner(console, "Ops snapshot 생성 중..."):
+            started_at = datetime.now(UTC)
+            try:
+                envelope = service.collect(request)
+            except KeyError as exc:
+                print_cli_error(
+                    console,
+                    "Run을 찾지 못했습니다.",
+                    details=str(exc),
+                    fixes=["--run-id 값과 --db 경로를 확인하세요."],
+                )
+                raise typer.Exit(1) from exc
+            except Exception as exc:
+                print_cli_error(
+                    console,
+                    "Ops snapshot 생성 중 오류가 발생했습니다.",
+                    details=str(exc),
+                    fixes=["--output 경로와 파일 권한을 확인하세요."],
+                )
+                raise typer.Exit(1) from exc
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        console.print("[green]Ops snapshot 완료[/green]")
+        console.print(f"- output: {output_path}")
+        console.print(f"- duration_ms: {duration_ms}")
+        console.print(f"- status: {envelope.status}")
+        if envelope.data.get("model_config") is None and include_model_config:
+            console.print("[yellow]model_config을 찾지 못했습니다.[/yellow]")
+
+    return app
+
+
+__all__ = ["create_ops_app"]
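`create_ops_app` returns a self-contained Typer sub-application rather than registering commands directly. A minimal sketch of how it is presumably mounted on the root CLI (the actual wiring lives in `commands/__init__.py`, which this diff does not show in full; names other than `create_ops_app` are illustrative):

    import typer
    from rich.console import Console

    # Hypothetical wiring sketch: mount the ops sub-app on the root Typer app.
    root = typer.Typer()
    console = Console()
    root.add_typer(create_ops_app(console), name="ops")
    # The command would then be invoked as:
    #   evalvault ops snapshot --run-id <id> -o snapshot.json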
evalvault/adapters/inbound/cli/commands/profile_difficulty.py (new file)
@@ -0,0 +1,160 @@
+from __future__ import annotations
+
+import logging
+import re
+from collections.abc import Sequence
+from datetime import UTC, datetime
+from pathlib import Path
+
+import typer
+from rich.console import Console
+
+from evalvault.adapters.outbound.filesystem.difficulty_profile_writer import DifficultyProfileWriter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings
+from evalvault.domain.services.difficulty_profile_reporter import DifficultyProfileReporter
+from evalvault.domain.services.difficulty_profiling_service import (
+    DifficultyProfileRequest,
+    DifficultyProfilingService,
+)
+
+from ..utils.console import print_cli_error, progress_spinner
+from ..utils.options import db_option
+from ..utils.validators import parse_csv_option, validate_choices
+
+logger = logging.getLogger(__name__)
+
+
+def register_profile_difficulty_commands(
+    app: typer.Typer,
+    console: Console,
+    available_metrics: Sequence[str],
+) -> None:
+    @app.command("profile-difficulty")
+    def profile_difficulty(
+        dataset_name: str | None = typer.Option(
+            None, "--dataset-name", help="Dataset name to profile."
+        ),
+        run_id: str | None = typer.Option(None, "--run-id", help="Run ID to profile."),
+        limit_runs: int | None = typer.Option(
+            None, "--limit-runs", help="Limit number of runs to analyze."
+        ),
+        metrics: str | None = typer.Option(
+            None, "--metrics", "-m", help="Comma-separated metric allowlist."
+        ),
+        bucket_count: int = typer.Option(
+            3, "--bucket-count", help="Number of difficulty buckets.", min=2
+        ),
+        min_samples: int = typer.Option(
+            10, "--min-samples", help="Minimum samples required for profiling.", min=1
+        ),
+        output_path: Path | None = typer.Option(None, "--output", "-o", help="Output JSON path."),
+        artifacts_dir: Path | None = typer.Option(
+            None, "--artifacts-dir", help="Artifacts directory path."
+        ),
+        parallel: bool = typer.Option(
+            False, "--parallel/--no-parallel", help="Enable parallel execution."
+        ),
+        concurrency: int | None = typer.Option(
+            None, "--concurrency", help="Max concurrency when parallel is enabled.", min=1
+        ),
+        db_path: Path | None = db_option(help_text="SQLite DB path."),
+    ) -> None:
+        if not dataset_name and not run_id:
+            print_cli_error(
+                console,
+                "--dataset-name 또는 --run-id 중 하나는 필수입니다.",
+                fixes=["예: --dataset-name insurance-qa", "또는 --run-id run_123"],
+            )
+            raise typer.Exit(1)
+        if dataset_name and run_id:
+            print_cli_error(
+                console,
+                "--dataset-name과 --run-id는 동시에 사용할 수 없습니다.",
+                fixes=["둘 중 하나만 지정하세요."],
+            )
+            raise typer.Exit(1)
+
+        resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            print_cli_error(
+                console,
+                "DB 경로가 필요합니다.",
+                fixes=["--db 옵션으로 SQLite DB 경로를 지정하세요."],
+            )
+            raise typer.Exit(1)
+
+        metric_list = parse_csv_option(metrics)
+        if metric_list:
+            validate_choices(metric_list, available_metrics, console, value_label="metric")
+        resolved_metrics = tuple(metric_list) if metric_list else None
+
+        identifier = _safe_identifier(run_id or dataset_name or "difficulty")
+        prefix = f"difficulty_{identifier}"
+        resolved_output = output_path or Path("reports") / "difficulty" / f"{prefix}.json"
+        resolved_artifacts_dir = artifacts_dir or resolved_output.parent / "artifacts" / prefix
+
+        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        writer = DifficultyProfileWriter()
+        reporter = DifficultyProfileReporter(writer)
+        service = DifficultyProfilingService(storage=storage, reporter=reporter)
+        request = DifficultyProfileRequest(
+            dataset_name=dataset_name,
+            run_id=run_id,
+            limit_runs=limit_runs,
+            metrics=resolved_metrics,
+            bucket_count=bucket_count,
+            min_samples=min_samples,
+            output_path=resolved_output,
+            artifacts_dir=resolved_artifacts_dir,
+            parallel=parallel,
+            concurrency=concurrency,
+        )
+
+        with progress_spinner(console, "난이도 프로파일링 실행 중..."):
+            started_at = datetime.now(UTC)
+            logger.info("profile-difficulty started", extra={"dataset_name": dataset_name})
+            try:
+                envelope = service.profile(request)
+            except KeyError as exc:
+                logger.exception("profile-difficulty run missing")
+                print_cli_error(
+                    console,
+                    "Run을 찾지 못했습니다.",
+                    details=str(exc),
+                    fixes=["--run-id 값과 --db 경로를 확인하세요."],
+                )
+                raise typer.Exit(1) from exc
+            except ValueError as exc:
+                logger.exception("profile-difficulty validation failed")
+                print_cli_error(
+                    console,
+                    "난이도 프로파일링 조건을 만족하지 못했습니다.",
+                    details=str(exc),
+                    fixes=["--min-samples 값을 낮추거나 충분한 실행 이력을 준비하세요."],
+                )
+                raise typer.Exit(1) from exc
+            except Exception as exc:
+                logger.exception("profile-difficulty failed")
+                print_cli_error(
+                    console,
+                    "난이도 프로파일링 중 오류가 발생했습니다.",
+                    details=str(exc),
+                )
+                raise typer.Exit(1) from exc
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+        console.print("[green]난이도 프로파일링 완료[/green]")
+        console.print(f"- output: {resolved_output}")
+        console.print(f"- artifacts: {envelope.get('artifacts', {}).get('dir')}")
+        console.print(f"- duration_ms: {duration_ms}")
+
+
+def _safe_identifier(value: str) -> str:
+    sanitized = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("_")
+    return sanitized or "difficulty"
+
+
+__all__ = ["register_profile_difficulty_commands"]
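When `--output` is not given, the report path is derived from a sanitized identifier. A small illustration of that default-path logic, reusing `_safe_identifier` exactly as defined above (the sample run name is made up):

    import re
    from pathlib import Path

    def _safe_identifier(value: str) -> str:
        sanitized = re.sub(r"[^A-Za-z0-9_.-]+", "_", value).strip("_")
        return sanitized or "difficulty"

    prefix = f"difficulty_{_safe_identifier('insurance qa/2024')}"
    output = Path("reports") / "difficulty" / f"{prefix}.json"
    artifacts = output.parent / "artifacts" / prefix
    # output    -> reports/difficulty/difficulty_insurance_qa_2024.json
    # artifacts -> reports/difficulty/artifacts/difficulty_insurance_qa_2024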
evalvault/adapters/inbound/cli/commands/regress.py (new file)
@@ -0,0 +1,251 @@
+from __future__ import annotations
+
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Literal
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
+from evalvault.adapters.outbound.analysis.statistical_adapter import (
+    StatisticalAnalysisAdapter,
+)
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.domain.services.regression_gate_service import (
+    RegressionGateReport,
+    RegressionGateService,
+    TestType,
+)
+
+from ..utils.formatters import format_diff, format_score, format_status
+from ..utils.options import db_option
+from ..utils.validators import parse_csv_option, validate_choice
+
+
+def _coerce_test_type(value: Literal["t-test", "mann-whitney"]) -> TestType:
+    if value == "t-test":
+        return "t-test"
+    return "mann-whitney"
+
+
+OutputFormat = Literal["table", "json", "github-actions"]
+
+
+def _format_timestamp(value: datetime) -> str:
+    return value.astimezone(UTC).isoformat().replace("+00:00", "Z")
+
+
+def _build_envelope(
+    *,
+    report: RegressionGateReport | None,
+    status: str,
+    started_at: datetime,
+    finished_at: datetime,
+    duration_ms: int,
+    message: str | None = None,
+    error_type: str | None = None,
+) -> dict[str, object]:
+    payload: dict[str, object] = {
+        "command": "regress",
+        "version": 1,
+        "status": status,
+        "started_at": _format_timestamp(started_at),
+        "finished_at": _format_timestamp(finished_at),
+        "duration_ms": duration_ms,
+        "artifacts": None,
+        "data": report.to_dict() if report else None,
+    }
+    if message:
+        payload["message"] = message
+    if error_type:
+        payload["error_type"] = error_type
+    return payload
+
+
+def register_regress_commands(app: typer.Typer, console: Console) -> None:
+    @app.command()
+    def regress(
+        run_id: str = typer.Argument(..., help="Candidate run ID to check."),
+        baseline: str = typer.Option(
+            ...,
+            "--baseline",
+            "-b",
+            help="Baseline run ID for regression detection.",
+        ),
+        fail_on_regression: float = typer.Option(
+            0.05,
+            "--fail-on-regression",
+            "-r",
+            help="Fail if metric drops by more than this amount (default: 0.05).",
+        ),
+        test: TestType = typer.Option(
+            "t-test",
+            "--test",
+            "-t",
+            help="Statistical test (t-test, mann-whitney).",
+        ),
+        metrics: str | None = typer.Option(
+            None,
+            "--metrics",
+            "-m",
+            help="Comma-separated list of metrics to check.",
+        ),
+        output_format: OutputFormat = typer.Option(
+            "table",
+            "--format",
+            "-f",
+            help="Output format: table, json, or github-actions.",
+        ),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="Write JSON summary to a file.",
+        ),
+        parallel: bool = typer.Option(
+            True,
+            "--parallel/--no-parallel",
+            help="Enable parallel execution for metric checks.",
+        ),
+        concurrency: int = typer.Option(
+            8,
+            "--concurrency",
+            help="Concurrency level when running in parallel.",
+        ),
+        db_path: Path | None = db_option(help_text="Database path"),
+    ) -> None:
+        started_at = datetime.now(UTC)
+        if db_path is None:
+            console.print("[red]Error:[/red] Database path is not configured.")
+            raise typer.Exit(1)
+
+        validate_choice(test, ["t-test", "mann-whitney"], console, value_label="test")
+        metric_list = parse_csv_option(metrics)
+
+        storage = SQLiteStorageAdapter(db_path=db_path)
+        analysis_adapter = StatisticalAnalysisAdapter()
+        service = RegressionGateService(storage=storage, analysis_adapter=analysis_adapter)
+
+        try:
+            report = service.run_gate(
+                run_id,
+                baseline,
+                metrics=metric_list or None,
+                test_type=_coerce_test_type(test),
+                fail_on_regression=fail_on_regression,
+                parallel=parallel,
+                concurrency=concurrency,
+            )
+        except (KeyError, ValueError) as exc:
+            finished_at = datetime.now(UTC)
+            duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+            payload = _build_envelope(
+                report=None,
+                status="error",
+                started_at=started_at,
+                finished_at=finished_at,
+                duration_ms=duration_ms,
+                message=str(exc),
+                error_type=type(exc).__name__,
+            )
+            if output:
+                write_json(output, payload)
+            if output_format == "json":
+                console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+            else:
+                console.print(f"[red]Error:[/red] {exc}")
+            raise typer.Exit(1) from exc
+        except Exception as exc:
+            finished_at = datetime.now(UTC)
+            duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+            payload = _build_envelope(
+                report=None,
+                status="error",
+                started_at=started_at,
+                finished_at=finished_at,
+                duration_ms=duration_ms,
+                message=str(exc),
+                error_type=type(exc).__name__,
+            )
+            if output:
+                write_json(output, payload)
+            if output_format == "json":
+                console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+            else:
+                console.print(f"[red]Error:[/red] {exc}")
+            raise typer.Exit(3) from exc
+
+        finished_at = report.finished_at
+        duration_ms = report.duration_ms
+        payload = _build_envelope(
+            report=report,
+            status="ok",
+            started_at=report.started_at,
+            finished_at=finished_at,
+            duration_ms=duration_ms,
+        )
+        if output:
+            write_json(output, payload)
+
+        if output_format == "json":
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+        elif output_format == "github-actions":
+            _render_github_actions(report, console)
+        else:
+            _render_table(report, console)
+
+        if report.regression_detected:
+            raise typer.Exit(2)
+
+
+def _render_table(report: RegressionGateReport, console: Console) -> None:
+    console.print(f"\n[bold]Regression Gate Check: {report.candidate_run_id}[/bold]\n")
+    console.print(f"Baseline: {report.baseline_run_id}")
+    console.print(f"Test: {report.test_type}\n")
+    table = Table(show_header=True, header_style="bold cyan")
+    table.add_column("Metric")
+    table.add_column("Baseline", justify="right")
+    table.add_column("Candidate", justify="right")
+    table.add_column("Diff", justify="right")
+    table.add_column("p-value", justify="right")
+    table.add_column("Regression", justify="center")
+
+    for result in report.results:
+        table.add_row(
+            result.metric,
+            format_score(result.baseline_score),
+            format_score(result.candidate_score),
+            format_diff(result.diff),
+            f"{result.p_value:.4f}",
+            format_status(not result.regression, success_text="NO", failure_text="YES"),
+        )
+
+    console.print(table)
+    if report.regression_detected:
+        regressed = [r.metric for r in report.results if r.regression]
+        console.print("\n[bold red]Regression detected[/bold red]")
+        console.print(f"[red]Regressed metrics: {', '.join(regressed)}[/red]")
+    else:
+        console.print("\n[bold green]Regression gate PASSED[/bold green]")
+    console.print()
+
+
+def _render_github_actions(report: RegressionGateReport, console: Console) -> None:
+    for result in report.results:
+        status = "✅" if not result.regression else "❌"
+        reg_status = " (REGRESSION)" if result.regression else ""
+        console.print(
+            f"{status} {result.metric}: {result.candidate_score:.3f} "
+            f"(baseline: {result.baseline_score:.3f}, diff: {result.diff:+.3f}){reg_status}"
+        )
+
+    console.print(f"::set-output name=passed::{str(not report.regression_detected).lower()}")
+    if report.regression_detected:
+        regressed = [r.metric for r in report.results if r.regression]
+        console.print(f"::error::Regression detected in: {', '.join(regressed)}")
+
+
+__all__ = ["register_regress_commands"]
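With `--output`, the command writes the same envelope it prints in JSON mode, so CI can inspect the result after the process exits (exit code 0 on pass, 2 when a regression is detected, 1 or 3 on errors). A hedged sketch of reading that file; the nested `data` payload comes from `RegressionGateReport.to_dict()`, whose exact fields this diff does not show, and the file name here is just an example:

    import json
    from pathlib import Path

    # Path is whatever was passed via --output / -o.
    payload = json.loads(Path("regress-summary.json").read_text())
    print(payload["status"])       # "ok" or "error"
    print(payload["duration_ms"])  # integer milliseconds
    if payload["status"] == "error":
        print(payload.get("error_type"), payload.get("message"))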
evalvault/adapters/inbound/cli/commands/run.py
@@ -1742,6 +1742,14 @@ def register_run_commands(
        ragas_snapshots = tracker_meta.get("ragas_prompt_snapshots")
        ragas_snapshot_inputs = build_prompt_inputs_from_snapshots(
            ragas_snapshots if isinstance(ragas_snapshots, dict) else None,
+            kind="ragas",
+            source="ragas",
+        )
+        custom_snapshots = tracker_meta.get("custom_prompt_snapshots")
+        custom_snapshot_inputs = build_prompt_inputs_from_snapshots(
+            custom_snapshots if isinstance(custom_snapshots, dict) else None,
+            kind="custom",
+            source="custom_rules",
        )
        override_status: dict[str, str] = {}
        raw_override = tracker_meta.get("ragas_prompt_overrides")
@@ -1764,6 +1772,12 @@ def register_run_commands(
                if entry.role in existing_roles and override_status.get(entry.role) == "applied":
                    continue
                prompt_inputs.append(entry)
+        if custom_snapshot_inputs:
+            existing_roles = {entry.role for entry in prompt_inputs if entry.kind == "custom"}
+            for entry in custom_snapshot_inputs:
+                if entry.role in existing_roles:
+                    continue
+                prompt_inputs.append(entry)
        if prompt_inputs and not db_path:
            print_cli_warning(
                console,