ankora 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ankora/__init__.py +8 -0
- ankora/cli.py +291 -0
- ankora/config.py +228 -0
- ankora/diff.py +158 -0
- ankora/ingest/__init__.py +83 -0
- ankora/ingest/langfuse.py +273 -0
- ankora/ingest/otel.py +420 -0
- ankora/models.py +147 -0
- ankora/providers/__init__.py +1 -0
- ankora/providers/anthropic.py +100 -0
- ankora/providers/base.py +50 -0
- ankora/providers/echo.py +42 -0
- ankora/providers/openai.py +98 -0
- ankora/providers/registry.py +49 -0
- ankora/replay.py +75 -0
- ankora/scorers/__init__.py +1 -0
- ankora/scorers/base.py +22 -0
- ankora/scorers/embedding.py +45 -0
- ankora/scorers/exact.py +34 -0
- ankora/scorers/json_schema.py +175 -0
- ankora/scorers/llm_judge.py +123 -0
- ankora/scorers/regex.py +34 -0
- ankora/scorers/registry.py +54 -0
- ankora/storage.py +74 -0
- ankora/suites.py +43 -0
- ankora-0.1.0.dist-info/METADATA +234 -0
- ankora-0.1.0.dist-info/RECORD +31 -0
- ankora-0.1.0.dist-info/WHEEL +4 -0
- ankora-0.1.0.dist-info/entry_points.txt +2 -0
- ankora-0.1.0.dist-info/licenses/LICENSE +201 -0
- ankora-0.1.0.dist-info/licenses/NOTICE +5 -0
ankora/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""ankora: local-first, CI-native regression testing for LLM applications."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
def _installed_version() -> str:
    """Return the installed ``ankora`` distribution version, or ``"0.0.0"`` if absent."""
    try:
        return version("ankora")
    except PackageNotFoundError:  # pragma: no cover - only when running from a non-installed tree
        return "0.0.0"


# Resolved once, at import time, from the installed distribution metadata.
__version__ = _installed_version()
|
ankora/cli.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""The ankora command-line interface.
|
|
2
|
+
|
|
3
|
+
Wires the typer app and its subcommands (``init``, ``ingest``, ``run``, ``diff``,
|
|
4
|
+
``gate``, ``baseline set``). ``gate`` is the CI entrypoint: it exits non-zero
|
|
5
|
+
when quality regresses against the baseline.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import typer
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.table import Table
|
|
16
|
+
|
|
17
|
+
from ankora import __version__
|
|
18
|
+
from ankora.config import (
|
|
19
|
+
DEFAULT_CONFIG_YAML,
|
|
20
|
+
Config,
|
|
21
|
+
ConfigError,
|
|
22
|
+
TargetConfig,
|
|
23
|
+
load_config,
|
|
24
|
+
)
|
|
25
|
+
from ankora.diff import DiffReport, diff_runs
|
|
26
|
+
from ankora.ingest import ingest_traces
|
|
27
|
+
from ankora.models import RunResult
|
|
28
|
+
from ankora.replay import replay
|
|
29
|
+
from ankora.storage import StorageError, get_baseline, load_run, run_path, set_baseline
|
|
30
|
+
from ankora.suites import SuiteError
|
|
31
|
+
|
|
32
|
+
# Root typer application. Invoking `ankora` with no arguments shows help, and
# typer's shell-completion options are switched off.
app = typer.Typer(
    name="ankora",
    help="Local-first, CI-native regression testing for LLM applications.",
    add_completion=False,
    no_args_is_help=True,
)

# Single rich console shared by every command in this module.
console = Console()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _version_callback(value: bool) -> None:
    """Option callback for ``--version``: print the version and stop when set.

    Does nothing when ``value`` is falsy, so normal command dispatch continues.
    """
    if not value:
        return
    console.print(__version__)
    raise typer.Exit()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@app.callback()
def _main(
    # Registered as an eager option so its callback runs before any subcommand.
    _version: bool = typer.Option(
        False,
        "--version",
        help="Show the ankora version and exit.",
        is_eager=True,
        callback=_version_callback,
    ),
) -> None:
    """ankora: turn the traces you already capture into CI regression tests."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@app.command()
def version() -> None:
    """Print the ankora version."""
    # Prints the same value as the top-level ``--version`` flag.
    console.print(__version__)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@app.command()
def init(
    force: bool = typer.Option(False, "--force", help="Overwrite an existing ankora.yaml."),
) -> None:
    """Scaffold ankora.yaml and an evals/ directory in the current repo."""
    config_file = Path("ankora.yaml")
    cases_dir = Path("evals")

    # Refuse to clobber an existing config unless the user opted in.
    if not force and config_file.exists():
        console.print(f"[red]{config_file} already exists.[/] Pass [bold]--force[/] to overwrite.")
        raise typer.Exit(code=1)

    config_file.write_text(DEFAULT_CONFIG_YAML, encoding="utf-8")
    cases_dir.mkdir(exist_ok=True)

    for message in (
        f"[green]Wrote[/] {config_file}",
        f"[green]Created[/] {cases_dir}/ (add Case files here)",
        "Next: [bold]ankora ingest <trace-file>[/] to build your first cases.",
    ):
        console.print(message)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@app.command()
def ingest(
    trace_file: str = typer.Argument(
        ..., help="Path to an OpenTelemetry GenAI or Langfuse trace export (JSON)."
    ),
    out: str = typer.Option("evals/", "--out", help="Directory to write Case files into."),
    fmt: str = typer.Option(
        "auto", "--format", help="Trace format: auto (default), otel, or langfuse."
    ),
) -> None:
    """Build or update regression Cases from an OTel GenAI or Langfuse trace file."""
    # Validate the arguments before reading or writing anything.
    if fmt not in {"auto", "otel", "langfuse"}:
        console.print(f"[red]--format must be one of: auto, otel, langfuse[/] (got {fmt!r})")
        raise typer.Exit(code=1)

    source = Path(trace_file)
    if not source.exists():
        console.print(f"[red]Trace file not found:[/] {source}")
        raise typer.Exit(code=1)

    # ingest_traces returns the parsed result plus the format it actually used.
    try:
        parsed, detected = ingest_traces(source, fmt)
    except json.JSONDecodeError as exc:
        console.print(f"[red]{source} is not valid JSON:[/] {exc}")
        raise typer.Exit(code=1) from exc

    # One YAML file per case, named after the case id.
    destination = Path(out)
    destination.mkdir(parents=True, exist_ok=True)
    for case in parsed.cases:
        case_file = destination / f"{case.id}.yaml"
        case_file.write_text(case.to_yaml(), encoding="utf-8")

    written = len(parsed.cases)
    console.print(
        f"[green]Wrote {written} case(s)[/] to {destination} "
        f"({detected} format; {parsed.skipped} skipped, {parsed.total} record(s) seen)."
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@app.command()
def run(
    suite: list[str] = typer.Option(None, "--suite", help="Glob(s) of Case files to run."),
    target: str = typer.Option(None, "--target", help="Override target as provider:model."),
    concurrency: int = typer.Option(8, "--concurrency", help="Max cases to replay in parallel."),
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
) -> None:
    """Replay and score the suite, persist a RunResult, print a summary."""
    try:
        settings = load_config(config_path)
        # --suite replaces the configured globs for this invocation only.
        if suite:
            settings = settings.model_copy(update={"suites": list(suite)})
        outcome = replay(settings, target=target, concurrency=concurrency)
    except (ConfigError, SuiteError) as exc:
        # Both error types carry a printable message; show it and fail.
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc

    _print_run_table(outcome)
    saved_at = run_path(outcome.run_id)
    console.print(f"\nSaved run [bold]{outcome.run_id}[/] to {saved_at}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _print_run_table(result: RunResult) -> None:
    """Print one table row per case (verdict plus per-scorer scores) and a totals line."""
    target = result.target
    table = Table(title=f"ankora run · {target.provider}:{target.model}")
    table.add_column("Case", overflow="fold")
    table.add_column("Result")
    table.add_column("Scores", overflow="fold")

    for entry in result.case_results:
        badges = []
        for scored in entry.scorer_results:
            mark = "✓" if scored.passed else "✗"
            badges.append(f"{scored.scorer} {scored.score:.2f}{mark}")
        verdict = "[green]PASS[/]" if entry.passed else "[red]FAIL[/]"
        # An em dash stands in when the case has no scorer results.
        table.add_row(entry.case_id, verdict, ", ".join(badges) or "—")
    console.print(table)

    totals = result.summary
    tone = "red" if totals.failed != 0 else "green"
    console.print(f"[{tone}]{totals.passed}/{totals.total} passed, {totals.failed} failed[/]")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@app.command()
def diff(
    baseline: str = typer.Argument(..., help="Baseline run id or path to a run JSON."),
    current: str = typer.Argument(..., help="Current run id or path to a run JSON."),
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
    fail_on: str = typer.Option(None, "--fail-on", help="Override gate.fail_on for this diff."),
) -> None:
    """Show per-case changes between two runs. Read-only; regressions never fail it.

    Exits 1 only for bad input: an invalid --fail-on value or a run that
    cannot be resolved.
    """
    # model_copy(update=...) does not validate, so check the override here;
    # otherwise a typo would silently behave like "regression" mode. These
    # values must mirror the choices accepted for gate.fail_on in the config.
    valid_modes = ("regression", "absolute")
    if fail_on and fail_on not in valid_modes:
        console.print(
            f"[red]--fail-on must be one of: {', '.join(valid_modes)}[/] (got {fail_on!r})"
        )
        raise typer.Exit(code=1)

    # A missing or invalid config is tolerated here: diff only needs gate.fail_on.
    config = _load_config_or_default(config_path)
    if fail_on:
        gate_config = config.gate.model_copy(update={"fail_on": fail_on})
        config = config.model_copy(update={"gate": gate_config})

    try:
        baseline_run = _resolve_run(baseline)
        current_run = _resolve_run(current)
    except StorageError as exc:
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc

    _print_diff_report(diff_runs(baseline_run, current_run, config))
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@app.command()
def gate(
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
    target: str = typer.Option(None, "--target", help="Override target as provider:model."),
    concurrency: int = typer.Option(8, "--concurrency", help="Max cases to replay in parallel."),
) -> None:
    """Replay the suite, diff against baseline, and exit non-zero on regression.

    This is the CI entrypoint.
    """
    # Step 1: replay the suite; config or suite problems fail the gate.
    try:
        settings = load_config(config_path)
        latest = replay(settings, target=target, concurrency=concurrency)
    except (ConfigError, SuiteError) as exc:
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc

    _print_run_table(latest)

    # Step 2: fetch the baseline. Without one there is nothing to compare, so
    # the gate passes and tells the user how to promote this run.
    # NOTE(review): every StorageError is treated as "no baseline yet"; confirm
    # that get_baseline cannot raise it for an unreadable or corrupt baseline.
    try:
        reference = get_baseline(settings)
    except StorageError:
        console.print(
            "\n[yellow]No baseline yet[/] — nothing to regress against. "
            f"Promote this run with [bold]ankora baseline set {latest.run_id}[/]."
        )
        raise typer.Exit(code=0) from None

    # Step 3: compare and turn the result into an exit code.
    report = diff_runs(reference, latest, settings)
    _print_diff_report(report)
    if not report.has_regressions:
        console.print("\n[green]No regressions — gate passed.[/]")
        return
    console.print(f"\n[red]{report.regressions} regression(s) detected — failing the gate.[/]")
    raise typer.Exit(code=1)
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# Sub-application holding the `ankora baseline ...` commands; like the root
# app it shows help when invoked without a subcommand.
baseline_app = typer.Typer(
    name="baseline",
    no_args_is_help=True,
    help="Manage the regression baseline.",
)
app.add_typer(baseline_app)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
@baseline_app.command("set")
def baseline_set(
    run_id: str = typer.Argument(..., help="The run id to promote to baseline."),
    config_path: str = typer.Option("ankora.yaml", "--config", help="Path to ankora.yaml."),
) -> None:
    """Promote a stored run to the baseline."""
    try:
        settings = load_config(config_path)
        baseline_file = set_baseline(settings, run_id)
    except (ConfigError, StorageError) as exc:
        # Either the config could not be loaded or the run could not be promoted.
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(code=1) from exc
    console.print(f"[green]Baseline set[/] to run [bold]{run_id}[/] at {baseline_file}")
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _load_config_or_default(config_path: str) -> Config:
    """Return the config at ``config_path``, or a placeholder when it cannot be loaded.

    Used by the read-only ``diff`` command. Any :class:`ConfigError` (missing
    or invalid file) yields a default :class:`Config` whose target is
    ``unknown``/``unknown``.
    """
    try:
        loaded = load_config(config_path)
    except ConfigError:
        placeholder_target = TargetConfig(provider="unknown", model="unknown")
        loaded = Config(target=placeholder_target)
    return loaded
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _resolve_run(reference: str) -> RunResult:
    """Resolve a run reference that is either a path to a run JSON or a run id.

    ``reference`` is read as a JSON file only when it names an existing regular
    file. Anything else, including a directory that happens to share a run
    id's name, is looked up as a run id via ``load_run``.
    """
    path = Path(reference)
    # is_file() rather than exists(): a directory passes exists() and would
    # then fail inside read_text() with an OSError the callers do not handle.
    if path.is_file():
        return RunResult.from_json(path.read_text(encoding="utf-8"))
    return load_run(reference)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _print_diff_report(report: DiffReport) -> None:
    """Print the per-case diff table followed by a one-line count of each status."""
    table = Table(title=f"ankora diff · fail_on={report.fail_on}")
    table.add_column("Case", overflow="fold")
    for heading in ("Status", "Baseline", "Current", "Δ", "Regression"):
        table.add_column(heading)

    for entry in report.cases:
        # Cases present on only one side have no delta; show a dash instead.
        change = "—" if entry.delta is None else f"{entry.delta:+.3f}"
        flagged = "[red]yes[/]" if entry.is_regression else "no"
        table.add_row(
            entry.case_id,
            entry.status.value,
            _fmt_score(entry.baseline_score),
            _fmt_score(entry.current_score),
            change,
            flagged,
        )
    console.print(table)

    console.print(
        f"regressed={report.regressed} new_failures={report.new_failures} "
        f"fixed={report.fixed} unchanged={report.unchanged} "
        f"new_passes={report.new_passes} removed={report.removed}"
    )
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _fmt_score(score: float | None) -> str:
    """Format a score with three decimals, or an em dash when there is none."""
    if score is None:
        return "—"
    return f"{score:.3f}"
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# Allow running this module directly as a script.
if __name__ == "__main__":
    app()
|
ankora/config.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""Load and validate ankora.yaml into a pydantic Config.
|
|
2
|
+
|
|
3
|
+
See CLAUDE.md "Config file" section for the target schema. Provider API keys are
|
|
4
|
+
never stored on the model — they are read from the configured env var at call
|
|
5
|
+
time via :meth:`Config.resolve_api_key`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Annotated, Any, Literal
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ConfigError(Exception):
    """A config problem that should be shown to the user as-is.

    The message is written for direct display to a person, in place of a raw
    pydantic error dump.
    """
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ProviderConfig(BaseModel):
    """Connection settings for one provider; the key itself stays in the environment."""

    # Name of the environment variable that holds this provider's API key.
    api_key_env: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TargetConfig(BaseModel):
    """Default provider/model pair that replays are run against."""

    # Provider name, e.g. "openai" in the default config.
    provider: str
    # Model identifier passed to that provider.
    model: str
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ModelRef(BaseModel):
    """Provider/model pair referenced by a scorer (judge or embedding model)."""

    # Provider name.
    provider: str
    # Model identifier for that provider.
    model: str
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class LLMJudgeScorerConfig(BaseModel):
    """Settings for the rubric-based judge scorer (``type: llm_judge``)."""

    # Discriminator tag selecting this scorer.
    type: Literal["llm_judge"]
    # Model that acts as the judge.
    judge: ModelRef
    # Rubric text given to the judge.
    rubric: str
    # Score threshold; presumably the minimum score that passes (verify in the scorer).
    threshold: float = 0.7
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class EmbeddingSimilarityScorerConfig(BaseModel):
    """Settings for the embedding cosine-similarity scorer (``type: embedding_similarity``)."""

    # Discriminator tag selecting this scorer.
    type: Literal["embedding_similarity"]
    # Embedding model used to compare texts.
    model: ModelRef
    # Score threshold; presumably the minimum similarity that passes (verify in the scorer).
    threshold: float = 0.85
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ExactScorerConfig(BaseModel):
    """Settings for the exact-match scorer (``type: exact``).

    With ``normalize`` enabled, both sides are stripped of surrounding
    whitespace and lowercased before they are compared.
    """

    # Discriminator tag selecting this scorer.
    type: Literal["exact"]
    # Score threshold for this scorer.
    threshold: float = 1.0
    # Whether to normalize both sides before comparing (on by default).
    normalize: bool = True
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class RegexScorerConfig(BaseModel):
    """Settings for the regex-match scorer (``type: regex``)."""

    # Discriminator tag selecting this scorer.
    type: Literal["regex"]
    # Regular expression to match against.
    pattern: str
    # Score threshold for this scorer.
    threshold: float = 1.0
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class JSONSchemaScorerConfig(BaseModel):
    """Settings for the JSON-schema validation scorer (``type: json_schema``).

    YAML supplies the schema under the key ``schema``. On the model the field
    is named ``json_schema`` so it does not shadow pydantic internals.
    """

    # Accept the field by its Python name as well as by its alias.
    model_config = ConfigDict(populate_by_name=True)

    # Discriminator tag selecting this scorer.
    type: Literal["json_schema"]
    # The JSON schema, read from the ``schema`` key.
    json_schema: dict[str, Any] = Field(alias="schema")
    # Score threshold for this scorer.
    threshold: float = 1.0
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Tagged union of every scorer config, keyed on the ``type`` field so that an
# unrecognised scorer type produces an error naming the valid tags. The alias
# keeps the name ``ScorerConfig`` because other modules import it
# (e.g. scorers.registry.build_scorer).
ScorerConfig = Annotated[
    LLMJudgeScorerConfig
    | EmbeddingSimilarityScorerConfig
    | ExactScorerConfig
    | RegexScorerConfig
    | JSONSchemaScorerConfig,
    Field(discriminator="type"),
]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class GateConfig(BaseModel):
    """Settings that control how the gate decides pass or fail."""

    # "regression" compares against the baseline; "absolute" looks only at the current run.
    fail_on: Literal["regression", "absolute"] = "regression"
    # Location of the baseline file.
    baseline: str = ".ankora/baseline.json"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class Config(BaseModel):
    """Top-level schema of ankora.yaml after validation."""

    # Config schema version.
    version: int = 1
    # Glob patterns locating the Case files that make up the suite.
    suites: list[str] = Field(default_factory=lambda: ["evals/**/*.yaml"])
    # Default provider/model to replay against (required).
    target: TargetConfig
    # Per-provider connection settings, keyed by provider name.
    providers: dict[str, ProviderConfig] = Field(default_factory=dict)
    # Scorers to apply; each entry is selected by its ``type`` tag.
    scorers: list[ScorerConfig] = Field(default_factory=list)
    # Gate behaviour; defaults apply when the section is omitted.
    gate: GateConfig = Field(default_factory=GateConfig)

    def resolve_api_key(self, provider: str) -> str:
        """Return the API key for ``provider`` from its configured env var.

        The environment is consulted on every call and the key is not kept on
        the model.

        Raises:
            ConfigError: if ``provider`` has no entry under ``providers``, or
                its environment variable is unset or empty.
        """
        entry = self.providers.get(provider)
        if entry is None:
            known = ", ".join(sorted(self.providers)) or "(none)"
            raise ConfigError(
                f"Provider {provider!r} is not configured under `providers`. "
                f"Configured providers: {known}."
            )

        env_name = entry.api_key_env
        secret = os.environ.get(env_name)
        if secret:
            return secret
        # Unset and empty values are reported the same way.
        raise ConfigError(
            f"Environment variable {env_name!r} "
            f"(for provider {provider!r}) is not set. "
            "Export it before running, e.g. "
            f"`export {env_name}=...`."
        )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# The default config written by `ankora init`; mirrors the CLAUDE.md example.
|
|
153
|
+
DEFAULT_CONFIG_YAML = """\
|
|
154
|
+
version: 1
|
|
155
|
+
suites: ["evals/**/*.yaml"]
|
|
156
|
+
target:
|
|
157
|
+
provider: openai
|
|
158
|
+
model: gpt-4o-mini
|
|
159
|
+
providers:
|
|
160
|
+
openai: {api_key_env: OPENAI_API_KEY}
|
|
161
|
+
anthropic: {api_key_env: ANTHROPIC_API_KEY}
|
|
162
|
+
scorers:
|
|
163
|
+
- type: llm_judge
|
|
164
|
+
judge: {provider: openai, model: gpt-4o}
|
|
165
|
+
rubric: "Score 1 if the answer is factually consistent with the reference, else 0."
|
|
166
|
+
threshold: 0.7
|
|
167
|
+
- type: embedding_similarity
|
|
168
|
+
model: {provider: openai, model: text-embedding-3-small}
|
|
169
|
+
threshold: 0.85
|
|
170
|
+
gate:
|
|
171
|
+
fail_on: regression
|
|
172
|
+
baseline: .ankora/baseline.json
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _format_location(loc: tuple[Any, ...]) -> str:
    """Render a pydantic error location as a readable path such as ``scorers[0].type``.

    Integer items become ``[n]`` index suffixes. Every other item is
    stringified and joined with a dot, except the first item, which gets no
    leading dot. An empty location renders as ``(root)``.
    """

    def render(position: int, segment: Any) -> str:
        if isinstance(segment, int):
            return f"[{segment}]"
        # Only the very first segment is written without a separator.
        return f".{segment}" if position else str(segment)

    rendered = "".join(render(position, segment) for position, segment in enumerate(loc))
    return rendered or "(root)"
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _humanize_validation_error(error: ValidationError, source: str) -> ConfigError:
    """Build a user-facing :class:`ConfigError` from a pydantic ``ValidationError``.

    The message is a header naming ``source`` followed by one bullet per
    validation error, each showing the error's location and message.
    """
    header = f"Invalid ankora config ({source}):"
    bullets = [
        f" - {_format_location(issue['loc'])}: {issue['msg']}" for issue in error.errors()
    ]
    return ConfigError("\n".join([header, *bullets]))
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def load_config(path: str | Path = "ankora.yaml") -> Config:
    """Read an ankora.yaml file and validate it into a :class:`Config`.

    Args:
        path: Location of the config file.

    Returns:
        The validated :class:`Config`.

    Raises:
        ConfigError: with a human-readable message for a missing or unreadable
            file (including a directory or non-UTF-8 content), malformed YAML,
            an empty or non-mapping document, or a schema violation (unknown
            scorer ``type``, missing ``api_key_env``, malformed
            ``gate.fail_on``, etc.).
    """
    config_path = Path(path)
    try:
        raw = config_path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise ConfigError(
            f"Config file not found: {config_path}. Run `ankora init` to create one."
        ) from exc
    except (OSError, UnicodeDecodeError) as exc:
        # E.g. the path is a directory, is not readable, or is not UTF-8 text.
        # Without this these would surface as raw tracebacks, not ConfigError.
        raise ConfigError(f"Could not read config file {config_path}: {exc}") from exc

    try:
        data = yaml.safe_load(raw)
    except yaml.YAMLError as exc:
        raise ConfigError(f"Config file {config_path} is not valid YAML: {exc}") from exc

    # safe_load returns None for an empty document.
    if data is None:
        raise ConfigError(f"Config file {config_path} is empty.")
    if not isinstance(data, dict):
        raise ConfigError(
            f"Config file {config_path} must be a YAML mapping, got {type(data).__name__}."
        )

    try:
        return Config.model_validate(data)
    except ValidationError as exc:
        raise _humanize_validation_error(exc, str(config_path)) from exc
|
ankora/diff.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Compare a current RunResult against a baseline to classify per-case changes.
|
|
2
|
+
|
|
3
|
+
The gate uses this to decide whether quality regressed. Interpretation of what
|
|
4
|
+
counts as a *regression* depends on ``config.gate.fail_on``:
|
|
5
|
+
|
|
6
|
+
* ``"regression"`` — a case regresses only if it passed in the baseline and now
|
|
7
|
+
fails (crossing pass -> fail, i.e. its aggregate score dropped past the pass
|
|
8
|
+
threshold). Cases absent from the baseline are reported as new, never as
|
|
9
|
+
regressions.
|
|
10
|
+
* ``"absolute"`` — any case failing its scorer thresholds in the current run is
|
|
11
|
+
a regression, regardless of the baseline.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections import Counter
|
|
17
|
+
from enum import StrEnum
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
from ankora.config import Config
|
|
22
|
+
from ankora.models import CaseResult, RunResult
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CaseStatus(StrEnum):
|
|
26
|
+
"""How a single case changed between baseline and current."""
|
|
27
|
+
|
|
28
|
+
NEW_FAILURE = "new_failure" # only in current, failing
|
|
29
|
+
NEW_PASS = "new_pass" # only in current, passing
|
|
30
|
+
FIXED = "fixed" # failed baseline, passes now
|
|
31
|
+
REGRESSED = "regressed" # passed baseline, fails now
|
|
32
|
+
UNCHANGED = "unchanged" # same pass/fail state in both
|
|
33
|
+
REMOVED = "removed" # only in baseline, missing from current
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class CaseDiff(BaseModel):
    """Comparison result for a single case."""

    # Identifier of the case being compared.
    case_id: str
    # How the case changed between the two runs.
    status: CaseStatus
    # Pass/fail on each side; left as None for a side where the case is absent.
    baseline_passed: bool | None = None
    current_passed: bool | None = None
    # Aggregate score on each side; left as None for a side where the case is absent.
    baseline_score: float | None = None
    current_score: float | None = None
    # current_score - baseline_score; only set when the case exists in both runs.
    delta: float | None = None
    # Whether this case counts as a regression under the active fail_on mode.
    is_regression: bool = False
    # Short human-readable description of the change.
    detail: str = ""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DiffReport(BaseModel):
    """Per-case diffs together with roll-up counts and the gate signal."""

    # The fail_on mode the comparison was made under.
    fail_on: str
    # One entry per case, in report order.
    cases: list[CaseDiff] = Field(default_factory=list)
    # True when at least one case is flagged as a regression.
    has_regressions: bool = False
    # Number of cases flagged as a regression (depends on fail_on).
    regressions: int = 0
    # Number of cases in each CaseStatus bucket.
    regressed: int = 0
    new_failures: int = 0
    new_passes: int = 0
    fixed: int = 0
    unchanged: int = 0
    removed: int = 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def diff_runs(baseline: RunResult, current: RunResult, config: Config) -> DiffReport:
    """Classify every case of ``current`` against ``baseline`` under ``config.gate.fail_on``.

    The report lists current cases first, in run order, followed by baseline
    cases that no longer appear in the current run.
    """
    mode = config.gate.fail_on
    baseline_lookup = {item.case_id: item for item in baseline.case_results}
    current_ids = {item.case_id for item in current.case_results}

    # Current run order first, so the report reads in suite order.
    entries: list[CaseDiff] = [
        _diff_case(baseline_lookup.get(item.case_id), item, mode)
        for item in current.case_results
    ]
    # Then the baseline cases that vanished from the current run.
    entries.extend(
        _removed_case(item) for item in baseline.case_results if item.case_id not in current_ids
    )

    tally = Counter(entry.status for entry in entries)
    regression_total = sum(1 for entry in entries if entry.is_regression)
    return DiffReport(
        fail_on=mode,
        cases=entries,
        has_regressions=regression_total > 0,
        regressions=regression_total,
        regressed=tally[CaseStatus.REGRESSED],
        new_failures=tally[CaseStatus.NEW_FAILURE],
        new_passes=tally[CaseStatus.NEW_PASS],
        fixed=tally[CaseStatus.FIXED],
        unchanged=tally[CaseStatus.UNCHANGED],
        removed=tally[CaseStatus.REMOVED],
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _diff_case(baseline: CaseResult | None, current: CaseResult, mode: str) -> CaseDiff:
    """Compare one current case with its baseline counterpart, if there is one.

    ``baseline`` is None when the case is new in the current run. ``mode`` is
    the gate's ``fail_on`` value: under ``"absolute"`` any failing current case
    is a regression; otherwise only a pass -> fail crossing is.
    """
    now_passing = current.passed
    now_score = _aggregate_score(current)
    absolute = mode == "absolute"

    if baseline is None:
        if now_passing:
            status, detail = CaseStatus.NEW_PASS, "new case, passing"
        else:
            status, detail = CaseStatus.NEW_FAILURE, "new case, failing"
        return CaseDiff(
            case_id=current.case_id,
            status=status,
            current_passed=now_passing,
            current_score=now_score,
            # A brand-new case can only regress under "absolute", and only if it fails.
            is_regression=absolute and not now_passing,
            detail=detail,
        )

    was_passing = baseline.passed
    was_score = _aggregate_score(baseline)
    change = now_score - was_score

    if bool(was_passing) == bool(now_passing):
        status = CaseStatus.UNCHANGED
    elif was_passing:
        status = CaseStatus.REGRESSED
    else:
        status = CaseStatus.FIXED

    # "regression" mode: only a genuine pass -> fail crossing counts.
    flagged = (not now_passing) if absolute else status == CaseStatus.REGRESSED

    return CaseDiff(
        case_id=current.case_id,
        status=status,
        baseline_passed=was_passing,
        current_passed=now_passing,
        baseline_score=was_score,
        current_score=now_score,
        delta=change,
        is_regression=flagged,
        detail=f"score {was_score:.3f} -> {now_score:.3f} (Δ{change:+.3f})",
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _removed_case(baseline: CaseResult) -> CaseDiff:
    """Build the diff entry for a baseline case that is absent from the current run.

    Such a case is reported as ``REMOVED`` and is never a regression.
    """
    previous_score = _aggregate_score(baseline)
    return CaseDiff(
        case_id=baseline.case_id,
        status=CaseStatus.REMOVED,
        baseline_passed=baseline.passed,
        baseline_score=previous_score,
        is_regression=False,
        detail="case missing from current run",
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _aggregate_score(case: CaseResult) -> float:
    """Return the mean scorer score for ``case``.

    When the case has no scorer results, its pass/fail flag is used instead:
    1.0 for a pass, 0.0 for a fail.
    """
    results = case.scorer_results
    if results:
        return sum(entry.score for entry in results) / len(results)
    return 1.0 if case.passed else 0.0
|