ankora 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ankora/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """ankora: local-first, CI-native regression testing for LLM applications."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
# Start from the placeholder and overwrite it when distribution metadata for
# the installed package is available.
__version__ = "0.0.0"
try:
    __version__ = version("ankora")
except PackageNotFoundError:  # pragma: no cover - only when running from a non-installed tree
    pass
ankora/cli.py ADDED
@@ -0,0 +1,291 @@
1
+ """The ankora command-line interface.
2
+
3
+ Wires the typer app and its subcommands (``init``, ``ingest``, ``run``, ``diff``,
4
+ ``gate``, ``baseline set``). ``gate`` is the CI entrypoint: it exits non-zero
5
+ when quality regresses against the baseline.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from pathlib import Path
12
+
13
+ import typer
14
+ from rich.console import Console
15
+ from rich.table import Table
16
+
17
+ from ankora import __version__
18
+ from ankora.config import (
19
+ DEFAULT_CONFIG_YAML,
20
+ Config,
21
+ ConfigError,
22
+ TargetConfig,
23
+ load_config,
24
+ )
25
+ from ankora.diff import DiffReport, diff_runs
26
+ from ankora.ingest import ingest_traces
27
+ from ankora.models import RunResult
28
+ from ankora.replay import replay
29
+ from ankora.storage import StorageError, get_baseline, load_run, run_path, set_baseline
30
+ from ankora.suites import SuiteError
31
+
32
# Root typer application; the subcommands defined below register on it.
app = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
    help="Local-first, CI-native regression testing for LLM applications.",
    name="ankora",
)

# Single rich console that every command in this module prints through.
console = Console()
40
+
41
+
42
def _version_callback(value: bool) -> None:
    """Handle the eager ``--version`` option.

    When ``value`` is true, print the package version and raise
    :class:`typer.Exit` so no further CLI processing happens. When it is
    false, do nothing.
    """
    if not value:
        return
    console.print(__version__)
    raise typer.Exit()
46
+
47
+
48
@app.callback()
def _main(
    _version: bool = typer.Option(
        False,
        "--version",
        help="Show the ankora version and exit.",
        is_eager=True,
        callback=_version_callback,
    ),
) -> None:
    """ankora: turn the traces you already capture into CI regression tests."""
    # No body needed: the eager --version option does its work in its callback.
59
+
60
+
61
@app.command()
def version() -> None:
    """Print the ankora version."""
    # Subcommand form of the --version flag: both print ``__version__``.
    installed = __version__
    console.print(installed)
65
+
66
+
67
@app.command()
def init(
    force: bool = typer.Option(False, "--force", help="Overwrite an existing ankora.yaml."),
) -> None:
    """Scaffold ankora.yaml and an evals/ directory in the current repo."""
    # Both paths are relative, so they resolve against the working directory.
    config_path = Path("ankora.yaml")
    evals_dir = Path("evals")

    # Refuse to clobber an existing config unless the user opted in with --force.
    already_present = config_path.exists()
    if already_present and not force:
        console.print(f"[red]{config_path} already exists.[/] Pass [bold]--force[/] to overwrite.")
        raise typer.Exit(code=1)

    # Write the default config, then make sure the cases directory exists.
    config_path.write_text(DEFAULT_CONFIG_YAML, encoding="utf-8")
    evals_dir.mkdir(exist_ok=True)

    for message in (
        f"[green]Wrote[/] {config_path}",
        f"[green]Created[/] {evals_dir}/ (add Case files here)",
        "Next: [bold]ankora ingest <trace-file>[/] to build your first cases.",
    ):
        console.print(message)
84
+
85
+
86
@app.command()
def ingest(
    trace_file: str = typer.Argument(
        ..., help="Path to an OpenTelemetry GenAI or Langfuse trace export (JSON)."
    ),
    out: str = typer.Option("evals/", "--out", help="Directory to write Case files into."),
    fmt: str = typer.Option(
        "auto", "--format", help="Trace format: auto (default), otel, or langfuse."
    ),
) -> None:
    """Build or update regression Cases from an OTel GenAI or Langfuse trace file."""
    # Validate the cheap things first so the user gets a one-line error
    # (exit code 1) before any parsing work starts.
    allowed_formats = ("auto", "otel", "langfuse")
    if fmt not in allowed_formats:
        console.print(f"[red]--format must be one of: auto, otel, langfuse[/] (got {fmt!r})")
        raise typer.Exit(code=1)

    trace_path = Path(trace_file)
    if not trace_path.exists():
        console.print(f"[red]Trace file not found:[/] {trace_path}")
        raise typer.Exit(code=1)

    # ingest_traces returns the parsed result plus the format it actually used.
    try:
        result, detected = ingest_traces(trace_path, fmt)
    except json.JSONDecodeError as exc:
        console.print(f"[red]{trace_path} is not valid JSON:[/] {exc}")
        raise typer.Exit(code=1) from exc

    # One YAML file per case, named after the case id; existing files are overwritten.
    out_dir = Path(out)
    out_dir.mkdir(parents=True, exist_ok=True)
    for case in result.cases:
        case_file = out_dir / f"{case.id}.yaml"
        case_file.write_text(case.to_yaml(), encoding="utf-8")

    written = len(result.cases)
    summary = (
        f"[green]Wrote {written} case(s)[/] to {out_dir} "
        f"({detected} format; {result.skipped} skipped, {result.total} record(s) seen)."
    )
    console.print(summary)
121
+
122
+
123
@app.command()
def run(
    suite: list[str] = typer.Option(None, "--suite", help="Glob(s) of Case files to run."),
    target: str = typer.Option(None, "--target", help="Override target as provider:model."),
    concurrency: int = typer.Option(8, "--concurrency", help="Max cases to replay in parallel."),
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
) -> None:
    """Replay and score the suite, persist a RunResult, print a summary."""
    try:
        loaded = load_config(config_path)
        # --suite replaces the configured suite globs for this invocation only.
        effective = loaded.model_copy(update={"suites": list(suite)}) if suite else loaded
        result = replay(effective, target=target, concurrency=concurrency)
    except (ConfigError, SuiteError) as exc:
        # Config and suite problems carry user-facing messages; print and exit 1.
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc

    _print_run_table(result)
    saved_to = run_path(result.run_id)
    console.print(f"\nSaved run [bold]{result.run_id}[/] to {saved_to}")
142
+
143
+
144
def _print_run_table(result: RunResult) -> None:
    """Render one table row per case result, followed by a pass/fail summary line.

    The summary line is green when no case failed and red otherwise.
    """
    table = Table(title=f"ankora run · {result.target.provider}:{result.target.model}")
    table.add_column("Case", overflow="fold")
    table.add_column("Result")
    table.add_column("Scores", overflow="fold")

    for case_result in result.case_results:
        if case_result.passed:
            verdict = "[green]PASS[/]"
        else:
            verdict = "[red]FAIL[/]"

        # One "<scorer> <score><mark>" entry per scorer, comma separated.
        score_cells = []
        for scored in case_result.scorer_results:
            mark = "✓" if scored.passed else "✗"
            score_cells.append(f"{scored.scorer} {scored.score:.2f}{mark}")

        # An em dash stands in when the case has no scorer results.
        table.add_row(case_result.case_id, verdict, ", ".join(score_cells) or "—")
    console.print(table)

    summary = result.summary
    if summary.failed == 0:
        color = "green"
    else:
        color = "red"
    console.print(f"[{color}]{summary.passed}/{summary.total} passed, {summary.failed} failed[/]")
161
+
162
+
163
@app.command()
def diff(
    baseline: str = typer.Argument(..., help="Baseline run id or path to a run JSON."),
    current: str = typer.Argument(..., help="Current run id or path to a run JSON."),
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
    fail_on: str = typer.Option(None, "--fail-on", help="Override gate.fail_on for this diff."),
) -> None:
    """Show per-case changes between two runs. Read-only; exits non-zero only on errors."""
    # Validate the override up front: model_copy(update=...) performs no
    # validation, so a typo would otherwise be accepted and echoed in the
    # report title instead of being rejected.
    if fail_on and fail_on not in ("regression", "absolute"):
        console.print(
            f"[red]--fail-on must be one of: regression, absolute[/] (got {fail_on!r})"
        )
        raise typer.Exit(code=1)

    # A missing or invalid config falls back to a minimal default for this read-only command.
    config = _load_config_or_default(config_path)
    if fail_on:
        gate_config = config.gate.model_copy(update={"fail_on": fail_on})
        config = config.model_copy(update={"gate": gate_config})

    # Each reference may be a path to a run JSON or a stored run id.
    try:
        baseline_run = _resolve_run(baseline)
        current_run = _resolve_run(current)
    except StorageError as exc:
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc

    _print_diff_report(diff_runs(baseline_run, current_run, config))
184
+
185
+
186
@app.command()
def gate(
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
    target: str = typer.Option(None, "--target", help="Override target as provider:model."),
    concurrency: int = typer.Option(8, "--concurrency", help="Max cases to replay in parallel."),
) -> None:
    """Replay the suite, diff against baseline, and exit non-zero on regression.

    This is the CI entrypoint.
    """
    # Exit codes: 1 for a config/suite error or a failed gate, 0 otherwise.
    try:
        config = load_config(config_path)
        current = replay(config, target=target, concurrency=concurrency)
    except (ConfigError, SuiteError) as exc:
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc

    _print_run_table(current)

    try:
        baseline = get_baseline(config)
    except StorageError:
        # NOTE(review): every StorageError is treated as "no baseline yet";
        # confirm get_baseline cannot raise it for an unreadable baseline.
        failed = current.summary.failed
        if config.gate.fail_on == "absolute" and failed:
            # "absolute" mode judges the current run on its own, so a missing
            # baseline must not let failing cases through the gate.
            console.print(
                f"\n[red]{failed} case(s) failing with fail_on=absolute — failing the gate.[/]"
            )
            raise typer.Exit(code=1) from None
        console.print(
            "\n[yellow]No baseline yet[/] — nothing to regress against. "
            f"Promote this run with [bold]ankora baseline set {current.run_id}[/]."
        )
        raise typer.Exit(code=0) from None

    report = diff_runs(baseline, current, config)
    _print_diff_report(report)
    if report.has_regressions:
        console.print(f"\n[red]{report.regressions} regression(s) detected — failing the gate.[/]")
        raise typer.Exit(code=1)
    console.print("\n[green]No regressions — gate passed.[/]")
220
+
221
+
222
# Nested ``baseline`` command group, mounted on the root app.
baseline_app = typer.Typer(
    no_args_is_help=True,
    help="Manage the regression baseline.",
    name="baseline",
)
app.add_typer(baseline_app)
228
+
229
+
230
@baseline_app.command("set")
def baseline_set(
    run_id: str = typer.Argument(..., help="The run id to promote to baseline."),
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
) -> None:
    """Promote a stored run to the baseline."""
    try:
        # The config is loaded first; set_baseline returns the path reported below.
        path = set_baseline(load_config(config_path), run_id)
    except (ConfigError, StorageError) as exc:
        # Both error types carry a printable message; report it and exit 1.
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc
    else:
        console.print(f"[green]Baseline set[/] to run [bold]{run_id}[/] at {path}")
243
+
244
+
245
def _load_config_or_default(config_path: str) -> Config:
    """Return the config at ``config_path``, or a placeholder if it cannot be loaded.

    Any :class:`ConfigError` from ``load_config`` is swallowed. The fallback
    is a ``Config`` whose target provider and model are both ``"unknown"``,
    with every other field at its default.
    """
    try:
        loaded = load_config(config_path)
    except ConfigError:
        placeholder_target = TargetConfig(provider="unknown", model="unknown")
        return Config(target=placeholder_target)
    return loaded
251
+
252
+
253
def _resolve_run(reference: str) -> RunResult:
    """Resolve a run reference that is either a path to a run JSON or a run id.

    If ``reference`` names an existing regular file, it is read as UTF-8 and
    parsed with ``RunResult.from_json``. Anything else is handed to
    ``load_run`` as a run id.
    """
    path = Path(reference)
    # is_file() rather than exists(): a directory that happens to share its
    # name with a run id must not be opened as a JSON file.
    if path.is_file():
        return RunResult.from_json(path.read_text(encoding="utf-8"))
    return load_run(reference)
259
+
260
+
261
def _print_diff_report(report: DiffReport) -> None:
    """Print the per-case diff table followed by a one-line tally of statuses."""
    table = Table(title=f"ankora diff · fail_on={report.fail_on}")
    table.add_column("Case", overflow="fold")
    for heading in ("Status", "Baseline", "Current", "Δ", "Regression"):
        table.add_column(heading)

    for case in report.cases:
        # Delta is signed with three decimals; an em dash marks a missing value.
        if case.delta is not None:
            delta_cell = f"{case.delta:+.3f}"
        else:
            delta_cell = "—"
        regression_cell = "[red]yes[/]" if case.is_regression else "no"
        table.add_row(
            case.case_id,
            case.status.value,
            _fmt_score(case.baseline_score),
            _fmt_score(case.current_score),
            delta_cell,
            regression_cell,
        )
    console.print(table)

    tally = [
        f"regressed={report.regressed}",
        f"new_failures={report.new_failures}",
        f"fixed={report.fixed}",
        f"unchanged={report.unchanged}",
        f"new_passes={report.new_passes}",
        f"removed={report.removed}",
    ]
    console.print(" ".join(tally))
284
+
285
+
286
+ def _fmt_score(score: float | None) -> str:
287
+ return f"{score:.3f}" if score is not None else "—"
288
+
289
+
290
+ if __name__ == "__main__":
291
+ app()
ankora/config.py ADDED
@@ -0,0 +1,228 @@
1
+ """Load and validate ankora.yaml into a pydantic Config.
2
+
3
+ See CLAUDE.md "Config file" section for the target schema. Provider API keys are
4
+ never stored on the model — they are read from the configured env var at call
5
+ time via :meth:`Config.resolve_api_key`.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Annotated, Any, Literal
13
+
14
+ import yaml
15
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError
16
+
17
+
18
class ConfigError(Exception):
    """A configuration problem described in user-facing terms.

    The message is meant to be shown to the user as-is, in place of a raw
    pydantic error dump.
    """
24
+
25
+
26
class ProviderConfig(BaseModel):
    """Connection settings for one provider; the key itself is never inlined."""

    # Name of the environment variable that holds the provider's API key.
    api_key_env: str
30
+
31
+
32
class TargetConfig(BaseModel):
    """Default provider/model pair that replays are run against."""

    # Provider name, e.g. "openai" in the default config.
    provider: str
    # Model identifier understood by that provider.
    model: str
37
+
38
+
39
class ModelRef(BaseModel):
    """Provider/model pair referenced by a scorer (judge or embedding model)."""

    # Provider name.
    provider: str
    # Model identifier understood by that provider.
    model: str
44
+
45
+
46
class LLMJudgeScorerConfig(BaseModel):
    """Settings for the rubric-based judge scorer (``type: llm_judge``)."""

    # Discriminator tag that selects this scorer in the config.
    type: Literal["llm_judge"]
    # Provider/model used as the judge.
    judge: ModelRef
    # Rubric text for the judge.
    rubric: str
    # Score threshold; 0.7 unless overridden.
    threshold: float = 0.7
53
+
54
+
55
class EmbeddingSimilarityScorerConfig(BaseModel):
    """Settings for the cosine-similarity scorer (``type: embedding_similarity``)."""

    # Discriminator tag that selects this scorer in the config.
    type: Literal["embedding_similarity"]
    # Provider/model of the embedding model.
    model: ModelRef
    # Score threshold; 0.85 unless overridden.
    threshold: float = 0.85
61
+
62
+
63
class ExactScorerConfig(BaseModel):
    """Settings for the exact-match scorer (``type: exact``).

    With ``normalize`` enabled, both sides have surrounding whitespace
    stripped and are lowercased before they are compared.
    """

    # Discriminator tag that selects this scorer in the config.
    type: Literal["exact"]
    # Score threshold; 1.0 unless overridden.
    threshold: float = 1.0
    # Normalization is on by default.
    normalize: bool = True
73
+
74
+
75
class RegexScorerConfig(BaseModel):
    """Settings for the regex-match scorer (``type: regex``)."""

    # Discriminator tag that selects this scorer in the config.
    type: Literal["regex"]
    # Regular expression pattern used by the scorer.
    pattern: str
    # Score threshold; 1.0 unless overridden.
    threshold: float = 1.0
81
+
82
+
83
class JSONSchemaScorerConfig(BaseModel):
    """Settings for the JSON-schema validation scorer (``type: json_schema``).

    In YAML the schema lives under the ``schema`` key. On the model it is
    named ``json_schema`` so that it does not shadow pydantic internals.
    """

    # populate_by_name lets the field be set by its alias or its attribute name.
    model_config = ConfigDict(populate_by_name=True)

    # Discriminator tag that selects this scorer in the config.
    type: Literal["json_schema"]
    # The schema itself, aliased to the ``schema`` key.
    json_schema: dict[str, Any] = Field(alias="schema")
    # Score threshold; 1.0 unless overridden.
    threshold: float = 1.0
95
+
96
+
97
# Union of every scorer config, discriminated on the ``type`` field so that an
# unknown scorer type produces a clear error naming the valid tags. The alias
# keeps the name ``ScorerConfig`` for callers that import it (e.g.
# scorers.registry.build_scorer).
ScorerConfig = Annotated[
    (
        LLMJudgeScorerConfig
        | EmbeddingSimilarityScorerConfig
        | ExactScorerConfig
        | RegexScorerConfig
        | JSONSchemaScorerConfig
    ),
    Field(discriminator="type"),
]
108
+
109
+
110
class GateConfig(BaseModel):
    """Settings that control how the gate decides pass/fail."""

    # Gate mode: "regression" (the default) or "absolute".
    fail_on: Literal["regression", "absolute"] = "regression"
    # Location of the baseline file.
    baseline: str = ".ankora/baseline.json"
115
+
116
+
117
class Config(BaseModel):
    """Validated, in-memory form of ankora.yaml."""

    # Config schema version.
    version: int = 1
    # Glob patterns that locate Case files.
    suites: list[str] = Field(default_factory=lambda: ["evals/**/*.yaml"])
    # Default replay target; the only field without a default.
    target: TargetConfig
    # Provider name -> connection settings.
    providers: dict[str, ProviderConfig] = Field(default_factory=dict)
    # Configured scorers, discriminated on their ``type`` tag.
    scorers: list[ScorerConfig] = Field(default_factory=list)
    # Gate behaviour.
    gate: GateConfig = Field(default_factory=GateConfig)

    def resolve_api_key(self, provider: str) -> str:
        """Return the API key for ``provider``, read from its environment variable.

        The environment is consulted on every call and the key is never kept
        on the model.

        Raises:
            ConfigError: if ``provider`` has no entry under ``providers``, or
                its environment variable is unset or empty.
        """
        if (provider_config := self.providers.get(provider)) is None:
            configured = ", ".join(sorted(self.providers)) or "(none)"
            raise ConfigError(
                f"Provider {provider!r} is not configured under `providers`. "
                f"Configured providers: {configured}."
            )

        env_name = provider_config.api_key_env
        key = os.environ.get(env_name)
        if key:
            return key
        # An empty value is reported the same way as an unset variable.
        raise ConfigError(
            f"Environment variable {env_name!r} "
            f"(for provider {provider!r}) is not set. "
            "Export it before running, e.g. "
            f"`export {env_name}=...`."
        )
150
+
151
+
152
+ # The default config written by `ankora init`; mirrors the CLAUDE.md example.
153
+ DEFAULT_CONFIG_YAML = """\
154
+ version: 1
155
+ suites: ["evals/**/*.yaml"]
156
+ target:
157
+ provider: openai
158
+ model: gpt-4o-mini
159
+ providers:
160
+ openai: {api_key_env: OPENAI_API_KEY}
161
+ anthropic: {api_key_env: ANTHROPIC_API_KEY}
162
+ scorers:
163
+ - type: llm_judge
164
+ judge: {provider: openai, model: gpt-4o}
165
+ rubric: "Score 1 if the answer is factually consistent with the reference, else 0."
166
+ threshold: 0.7
167
+ - type: embedding_similarity
168
+ model: {provider: openai, model: text-embedding-3-small}
169
+ threshold: 0.85
170
+ gate:
171
+ fail_on: regression
172
+ baseline: .ankora/baseline.json
173
+ """
174
+
175
+
176
def _format_location(loc: tuple[Any, ...]) -> str:
    """Turn a pydantic error location into a dotted/indexed path string.

    Integers render as ``[n]``. Any other item renders as ``.name``, except
    in first position where the leading dot is omitted. An empty location
    renders as ``(root)``.
    """

    def render(position: int, item: Any) -> str:
        if isinstance(item, int):
            return f"[{item}]"
        return f".{item}" if position else str(item)

    return "".join(render(position, item) for position, item in enumerate(loc)) or "(root)"
187
+
188
+
189
def _humanize_validation_error(error: ValidationError, source: str) -> ConfigError:
    """Build a user-facing :class:`ConfigError` from a pydantic ``ValidationError``.

    The message starts with a header naming ``source``, followed by one
    bullet per validation error giving its location and message. The
    exception is returned, not raised.
    """
    header = f"Invalid ankora config ({source}):"
    bullets = [
        f" - {_format_location(err['loc'])}: {err['msg']}"
        for err in error.errors()
    ]
    return ConfigError("\n".join([header, *bullets]))
196
+
197
+
198
def load_config(path: str | Path = "ankora.yaml") -> Config:
    """Read an ankora.yaml file and validate it into a :class:`Config`.

    Args:
        path: Location of the config file.

    Returns:
        The validated :class:`Config`.

    Raises:
        ConfigError: with a human-readable message for a missing or
            unreadable file, malformed YAML, an empty or non-mapping
            document, or a schema violation (unknown scorer ``type``,
            missing ``api_key_env``, malformed ``gate.fail_on``, etc.).
    """
    config_path = Path(path)
    try:
        raw = config_path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise ConfigError(
            f"Config file not found: {config_path}. Run `ankora init` to create one."
        ) from exc
    except (OSError, UnicodeDecodeError) as exc:
        # A directory path, a permission problem or non-UTF-8 bytes would
        # otherwise escape as a raw traceback instead of a ConfigError.
        raise ConfigError(f"Could not read config file {config_path}: {exc}") from exc

    try:
        data = yaml.safe_load(raw)
    except yaml.YAMLError as exc:
        raise ConfigError(f"Config file {config_path} is not valid YAML: {exc}") from exc

    # safe_load returns None for an empty document.
    if data is None:
        raise ConfigError(f"Config file {config_path} is empty.")
    # The top level must be a mapping; report the offending type by name.
    if not isinstance(data, dict):
        raise ConfigError(
            f"Config file {config_path} must be a YAML mapping, got {type(data).__name__}."
        )

    try:
        return Config.model_validate(data)
    except ValidationError as exc:
        raise _humanize_validation_error(exc, str(config_path)) from exc
ankora/diff.py ADDED
@@ -0,0 +1,158 @@
1
+ """Compare a current RunResult against a baseline to classify per-case changes.
2
+
3
+ The gate uses this to decide whether quality regressed. Interpretation of what
4
+ counts as a *regression* depends on ``config.gate.fail_on``:
5
+
6
+ * ``"regression"`` — a case regresses only if it passed in the baseline and now
7
+ fails (crossing pass -> fail, i.e. its aggregate score dropped past the pass
8
+ threshold). Cases absent from the baseline are reported as new, never as
9
+ regressions.
10
+ * ``"absolute"`` — any case failing its scorer thresholds in the current run is
11
+ a regression, regardless of the baseline.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from collections import Counter
17
+ from enum import StrEnum
18
+
19
+ from pydantic import BaseModel, Field
20
+
21
+ from ankora.config import Config
22
+ from ankora.models import CaseResult, RunResult
23
+
24
+
25
+ class CaseStatus(StrEnum):
26
+ """How a single case changed between baseline and current."""
27
+
28
+ NEW_FAILURE = "new_failure" # only in current, failing
29
+ NEW_PASS = "new_pass" # only in current, passing
30
+ FIXED = "fixed" # failed baseline, passes now
31
+ REGRESSED = "regressed" # passed baseline, fails now
32
+ UNCHANGED = "unchanged" # same pass/fail state in both
33
+ REMOVED = "removed" # only in baseline, missing from current
34
+
35
+
36
class CaseDiff(BaseModel):
    """Comparison result for a single case."""

    # Identifier of the case being compared.
    case_id: str
    # How the case changed between the two runs.
    status: CaseStatus
    # Pass/fail on each side; None when the case is absent from that run.
    baseline_passed: bool | None = None
    current_passed: bool | None = None
    # Aggregate score on each side; None when the case is absent from that run.
    baseline_score: float | None = None
    current_score: float | None = None
    # Current minus baseline score; None unless both sides exist.
    delta: float | None = None
    # Whether this case counts against the gate under the active mode.
    is_regression: bool = False
    # Short human-readable explanation.
    detail: str = ""
48
+
49
+
50
class DiffReport(BaseModel):
    """Full comparison: per-case diffs, roll-up counts and the gate signal."""

    # Gate mode the comparison was made under.
    fail_on: str
    # Per-case results.
    cases: list[CaseDiff] = Field(default_factory=list)
    # Gate signal, and the number of cases flagged as regressions.
    has_regressions: bool = False
    regressions: int = 0
    # Number of cases in each CaseStatus bucket.
    regressed: int = 0
    new_failures: int = 0
    new_passes: int = 0
    fixed: int = 0
    unchanged: int = 0
    removed: int = 0
63
+
64
+
65
def diff_runs(baseline: RunResult, current: RunResult, config: Config) -> DiffReport:
    """Build a :class:`DiffReport` comparing ``current`` with ``baseline``.

    The regression rule is taken from ``config.gate.fail_on``. Cases are
    listed in current-run order, followed by baseline cases that no longer
    appear in the current run.
    """
    mode = config.gate.fail_on
    baseline_by_id = {result.case_id: result for result in baseline.case_results}
    current_ids = {result.case_id for result in current.case_results}

    # Current-run order first, so the report reads in suite order.
    cases: list[CaseDiff] = [
        _diff_case(baseline_by_id.get(result.case_id), result, mode)
        for result in current.case_results
    ]
    # Then the baseline cases that vanished from the current run.
    cases.extend(
        _removed_case(result)
        for result in baseline.case_results
        if result.case_id not in current_ids
    )

    status_counts = Counter(case.status for case in cases)
    regression_total = sum(1 for case in cases if case.is_regression)
    return DiffReport(
        fail_on=mode,
        cases=cases,
        has_regressions=regression_total > 0,
        regressions=regression_total,
        regressed=status_counts[CaseStatus.REGRESSED],
        new_failures=status_counts[CaseStatus.NEW_FAILURE],
        new_passes=status_counts[CaseStatus.NEW_PASS],
        fixed=status_counts[CaseStatus.FIXED],
        unchanged=status_counts[CaseStatus.UNCHANGED],
        removed=status_counts[CaseStatus.REMOVED],
    )
93
+
94
+
95
def _diff_case(baseline: CaseResult | None, current: CaseResult, mode: str) -> CaseDiff:
    """Classify one current-run case against its baseline counterpart, if any.

    ``baseline`` is ``None`` when the case does not exist in the baseline
    run. ``mode`` is the gate's ``fail_on`` value: under ``"absolute"`` any
    failing case is a regression; otherwise only a pass -> fail crossing is.
    """
    absolute = mode == "absolute"
    current_passed = current.passed
    current_score = _aggregate_score(current)

    if baseline is None:
        # A brand-new case has no baseline to regress from, so it only
        # counts against the gate in "absolute" mode, and only when failing.
        if current_passed:
            status, detail = CaseStatus.NEW_PASS, "new case, passing"
        else:
            status, detail = CaseStatus.NEW_FAILURE, "new case, failing"
        return CaseDiff(
            case_id=current.case_id,
            status=status,
            current_passed=current_passed,
            current_score=current_score,
            is_regression=absolute and not current_passed,
            detail=detail,
        )

    baseline_passed = baseline.passed
    baseline_score = _aggregate_score(baseline)
    delta = current_score - baseline_score

    if bool(baseline_passed) == bool(current_passed):
        status = CaseStatus.UNCHANGED
    elif baseline_passed:
        status = CaseStatus.REGRESSED
    else:
        status = CaseStatus.FIXED

    if absolute:
        is_regression = not current_passed
    else:
        # "regression": only a genuine pass -> fail crossing counts.
        is_regression = status == CaseStatus.REGRESSED

    return CaseDiff(
        case_id=current.case_id,
        status=status,
        baseline_passed=baseline_passed,
        current_passed=current_passed,
        baseline_score=baseline_score,
        current_score=current_score,
        delta=delta,
        is_regression=is_regression,
        detail=f"score {baseline_score:.3f} -> {current_score:.3f} (Δ{delta:+.3f})",
    )
141
+
142
+
143
def _removed_case(baseline: CaseResult) -> CaseDiff:
    """Describe a baseline case that is absent from the current run.

    A removed case is reported but never flagged as a regression.
    """
    previous_score = _aggregate_score(baseline)
    return CaseDiff(
        case_id=baseline.case_id,
        status=CaseStatus.REMOVED,
        baseline_passed=baseline.passed,
        baseline_score=previous_score,
        is_regression=False,
        detail="case missing from current run",
    )
152
+
153
+
154
def _aggregate_score(case: CaseResult) -> float:
    """Return the mean of the case's scorer scores.

    A case with no scorer results scores 1.0 if it passed and 0.0 otherwise.
    """
    scores = [result.score for result in case.scorer_results]
    if scores:
        return sum(scores) / len(scores)
    return 1.0 if case.passed else 0.0