evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,1041 @@
|
|
|
1
|
+
"""CLI command implementations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
from rich.console import Console
|
|
14
|
+
from rich.table import Table
|
|
15
|
+
|
|
16
|
+
from evalgate_sdk._version import __version__
|
|
17
|
+
|
|
18
|
+
# Shared Rich console that the commands in this module print their output through.
console = Console()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _load_saved_config() -> dict[str, Any]:
    """Load saved settings (api_key/base_url) from ``.evalgate/config.json``.

    The file is looked up relative to the current working directory. The
    lookup is best-effort: a missing, unreadable, or malformed file, or one
    whose top-level JSON value is not an object, yields an empty dict rather
    than raising.

    Returns:
        The parsed JSON object, or ``{}`` when nothing usable was found.
    """
    config_path = Path.cwd() / ".evalgate" / "config.json"
    try:
        # Decode explicitly as UTF-8 so parsing does not depend on the locale.
        data = json.loads(config_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # invalid JSON (json.JSONDecodeError) and undecodable bytes
        # (UnicodeDecodeError).
        return {}
    return data if isinstance(data, dict) else {}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _resolve_credentials(
    api_key: str | None,
    base_url: str | None,
) -> tuple[str | None, str | None]:
    """Fill in missing credentials from the saved config file.

    Values passed in (from a CLI flag, or from an env var via typer's
    ``envvar=``) win; only a falsy value is replaced by the matching entry
    of the saved config.

    Returns:
        The ``(api_key, base_url)`` pair after the fallback; either element
        may still be ``None``.
    """
    # Nothing to look up when both values were supplied by the caller.
    if api_key and base_url:
        return api_key, base_url

    stored = _load_saved_config()
    resolved_key = api_key if api_key else stored.get("api_key")
    resolved_url = base_url if base_url else stored.get("base_url")
    return resolved_key, resolved_url
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _run_async(coro: Any) -> Any:
    """Run a coroutine to completion from synchronous CLI code.

    Args:
        coro: The coroutine object to execute.

    Returns:
        Whatever the coroutine returns. Exceptions raised by the coroutine
        propagate to the caller.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: the normal CLI case.
        return asyncio.run(coro)

    # A loop is already running in this thread. Calling run_until_complete()
    # on it would raise "This event loop is already running", so execute the
    # coroutine on a private loop in a worker thread and wait for the result.
    # The calling thread (and therefore its loop) blocks until it finishes.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _spec_name_from_result(result: dict[str, Any]) -> str:
    """Return the display name of a result entry.

    The keys ``spec``, ``name``, ``test_name`` and ``testName`` are tried in
    that order; the first truthy value is returned as a string, and
    ``"unknown"`` is returned when none of them is set.
    """
    for key in ("spec", "name", "test_name", "testName"):
        candidate = result.get(key)
        if candidate:
            return str(candidate)
    return "unknown"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _build_run_artifact(run_id: str, started_at: str, results: list[dict[str, Any]]) -> dict[str, Any]:
    """Assemble the run artifact dict for a finished run.

    Top-level and summary fields are emitted under both snake_case and
    camelCase keys. The finish timestamp is taken (in UTC) when this
    function is called.

    Args:
        run_id: Identifier stored under ``run_id``/``runId``.
        started_at: Start timestamp string, stored as given.
        results: Per-spec result entries; stored by reference under ``results``.

    Returns:
        A dict with schema version, run identity, timestamps, runtime
        metadata, a ``summary`` of counts/rates and the ``results`` list.
    """
    finished_at = datetime.now(timezone.utc).isoformat()

    def _count_status(label: str) -> int:
        # Number of entries whose "status" equals the given label.
        return len([entry for entry in results if entry.get("status") == label])

    total = len(results)
    # Only a literal True counts as passed; other truthy values do not.
    passed = len([entry for entry in results if entry.get("passed") is True])
    failed = _count_status("failed")
    errors = _count_status("error")
    timeouts = _count_status("timeout")

    # Missing or falsy durations/scores contribute 0.0.
    total_duration_ms = sum(float(entry.get("duration_ms") or 0.0) for entry in results)
    if total > 0:
        average_score = sum(float(entry.get("score") or 0.0) for entry in results) / total
        pass_rate_ratio = passed / total
    else:
        average_score = 0.0
        pass_rate_ratio = 0.0
    pass_rate_percent = pass_rate_ratio * 100.0

    summary = {
        "total": total,
        "passed": passed,
        "failed": failed,
        "errors": errors,
        "timeouts": timeouts,
        "success": not (failed or errors or timeouts),
        # pass_rate is a percentage, passRate is a 0..1 ratio.
        "pass_rate": pass_rate_percent,
        "passRate": pass_rate_ratio,
        "average_score": average_score,
        "averageScore": average_score,
        "total_duration_ms": total_duration_ms,
        "totalDurationMs": total_duration_ms,
    }
    return {
        "schema_version": "1",
        "schemaVersion": "1",
        "run_id": run_id,
        "runId": run_id,
        "started_at": started_at,
        "startedAt": started_at,
        "finished_at": finished_at,
        "finishedAt": finished_at,
        "runtime": {"sdk": "python", "mode": "spec"},
        "metadata": {"mode": "spec", "sdk": "python"},
        "summary": summary,
        "results": results,
    }
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _write_run_artifact(
    artifact: dict[str, Any],
    *,
    output: str | None,
    write_results: bool,
    cwd: Path,
) -> list[Path]:
    """Serialize a run artifact to disk and report where it was written.

    Args:
        artifact: The run artifact; it is dumped as indented JSON.
        output: Optional explicit output file path.
        write_results: When true, also write ``last-run.json``,
            ``runs/latest.json`` and ``runs/<run_id>.json`` under
            ``<cwd>/.evalgate``.
        cwd: Project directory that holds the ``.evalgate`` folder.

    Returns:
        The paths written, in write order. Each physical file appears once
        even if it was requested through differently spelled paths.
    """
    written: list[Path] = []
    seen: set[str] = set()
    serialized = json.dumps(artifact, indent=2)

    def _write(path: Path) -> None:
        # De-duplicate on the resolved location: a relative --output such as
        # ".evalgate/last-run.json" and the absolute <cwd>/.evalgate path
        # name the same file but do not compare equal as Path objects.
        key = os.path.realpath(path)
        if key in seen:
            return
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(serialized, encoding="utf-8")
        seen.add(key)
        written.append(path)

    if output:
        _write(Path(output))
    if write_results:
        evalgate_dir = cwd / ".evalgate"
        runs_dir = evalgate_dir / "runs"
        # Fall back to a fixed name when the artifact carries no run id.
        run_id = str(artifact.get("run_id") or artifact.get("runId") or "run-latest")
        _write(evalgate_dir / "last-run.json")
        _write(runs_dir / "latest.json")
        _write(runs_dir / f"{run_id}.json")
    return written
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ── init ─────────────────────────────────────────────────────────────
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def init(
    directory: str = typer.Argument(".", help="Project directory"),
) -> None:
    """Initialize an EvalGate project — creates config, baseline, and CI workflow."""
    # NOTE(review): despite the summary above, no CI workflow file is written
    # here; this function creates the baseline, the config, an example spec
    # and a .gitignore entry. Existing files are never overwritten.
    cwd = Path(directory).resolve()
    evalgate_dir = cwd / ".evalgate"
    # parents=True lets `evalgate init <new-dir>` bootstrap a directory that
    # does not exist yet instead of failing with FileNotFoundError.
    evalgate_dir.mkdir(parents=True, exist_ok=True)

    baseline_path = evalgate_dir / "baseline.json"
    if not baseline_path.exists():
        baseline_path.write_text(
            json.dumps(
                {
                    "version": 1,
                    "scores": {},
                    "latencies": {},
                    "tolerance": {"score_drop": 0.05, "latency_increase_pct": 20.0, "min_confidence": 0.8},
                },
                indent=2,
            ),
            encoding="utf-8",
        )
        console.print(f"[green]✓[/green] Created {baseline_path.relative_to(cwd)}")

    config_path = evalgate_dir / "config.json"
    if not config_path.exists():
        config_path.write_text(
            json.dumps(
                {
                    "version": 1,
                    "project_name": cwd.name,
                    "eval_dir": "evals",
                    "baseline": str(baseline_path.relative_to(cwd)),
                },
                indent=2,
            ),
            encoding="utf-8",
        )
        console.print(f"[green]✓[/green] Created {config_path.relative_to(cwd)}")

    evals_dir = cwd / "evals"
    evals_dir.mkdir(exist_ok=True)

    example = evals_dir / "example_eval.py"
    if not example.exists():
        # The template contains a non-ASCII em dash. Python reads source files
        # as UTF-8 by default, so the file must be written as UTF-8 rather
        # than in the platform's locale encoding.
        example.write_text(
            '"""Example evaluation spec — replace with your LLM call."""\n\n'
            "from evalgate_sdk.runtime import define_eval, create_result, EvalContext\n\n\n"
            "def my_first_eval(ctx: EvalContext):\n"
            "    # Replace this with your actual LLM call\n"
            "    output = ctx.input or 'hello world'\n"
            "    return create_result(passed=len(output) > 0, score=1.0)\n\n\n"
            'define_eval("example-eval", my_first_eval)\n',
            encoding="utf-8",
        )
        console.print(f"[green]✓[/green] Created {example.relative_to(cwd)}")

    gitignore = cwd / ".gitignore"
    evalgate_pattern = ".evalgate/"
    if gitignore.exists():
        content = gitignore.read_text()
        # Substring test: any existing mention of the pattern counts as present.
        if evalgate_pattern not in content:
            with gitignore.open("a") as f:
                # Start on a fresh line when the file lacks a trailing newline.
                if not content.endswith("\n"):
                    f.write("\n")
                f.write(f"{evalgate_pattern}\n")
            console.print(f"[green]✓[/green] Added {evalgate_pattern} to .gitignore")
    else:
        gitignore.write_text(f"{evalgate_pattern}\n")
        console.print(f"[green]✓[/green] Created .gitignore with {evalgate_pattern}")

    console.print("\n[bold green]Project initialized![/bold green]")
    console.print("  Next: [cyan]evalgate run[/cyan] to execute evaluations")
    console.print("  Docs: https://evalgate.com/docs")
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# ── run ──────────────────────────────────────────────────────────────
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _spec_result_base(spec: Any, ctx: Any) -> dict[str, Any]:
    """Return the identity and input fields shared by every per-spec result entry."""
    return {
        "spec": spec.name,
        "name": spec.name,
        "specId": spec.id,
        "test_id": spec.id,
        "testId": spec.id,
        "test_name": spec.name,
        "testName": spec.name,
        "file_path": spec.file_path or "",
        "filePath": spec.file_path or "",
        "position": spec.position or {"line": 0, "column": 0},
        "input": str(ctx.input or ""),
        "expected": "",
    }


def run(
    eval_dir: str = typer.Option("evals", "--dir", "-d", help="Eval spec directory"),
    spec_ids: str | None = typer.Option(None, "--spec-ids", help="Comma-separated spec IDs to run"),
    output: str | None = typer.Option(None, "--output", "-o", help="Output file for results"),
    write_results: bool = typer.Option(False, "--write-results", help="Write artifacts to .evalgate/"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
) -> None:
    """Run evaluation specs."""
    # Flow: execute every non-underscore *.py file under the eval directory
    # (presumably the files register specs with the runtime as a side effect
    # — confirm against define_eval), run the listed specs one by one, print
    # a summary, optionally write the run artifact, and exit with status 1
    # when any spec did not pass.
    import importlib.util
    import traceback

    from evalgate_sdk.runtime import create_eval_runtime, create_local_executor
    from evalgate_sdk.runtime.types import EvalContext

    cwd = Path.cwd()
    eval_path = cwd / eval_dir

    if not eval_path.exists():
        console.print(f"[red]Error:[/red] Eval directory '{eval_dir}' not found")
        console.print("Run [cyan]evalgate init[/cyan] first")
        raise typer.Exit(1)

    handle = create_eval_runtime(str(cwd))
    executor = create_local_executor()
    started_at = datetime.now(timezone.utc).isoformat()
    run_id = f"run-{datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')}"

    try:
        # Sorted so that load order does not depend on the filesystem
        # (matches what `discover` does).
        spec_files = sorted(eval_path.glob("**/*.py"))
        if not spec_files:
            console.print(f"[yellow]No eval specs found in {eval_dir}/[/yellow]")
            raise typer.Exit(0)

        for spec_file in spec_files:
            if spec_file.name.startswith("_"):
                continue
            try:
                module_spec = importlib.util.spec_from_file_location(spec_file.stem, spec_file)
                if module_spec and module_spec.loader:
                    module = importlib.util.module_from_spec(module_spec)
                    module_spec.loader.exec_module(module)
            except Exception as exc:
                # User code can fail in arbitrary ways; report it and keep
                # loading the remaining files.
                console.print(f"[red]Error loading {spec_file.name}:[/red] {exc}")
                if verbose:
                    console.print(f"[dim]{traceback.format_exc()}[/dim]")

        specs = handle.runtime.list()
        if spec_ids:
            # Tolerate whitespace and empty tokens, e.g. "a, b," -> {"a", "b"}.
            filter_ids = {token.strip() for token in spec_ids.split(",") if token.strip()}
            specs = [s for s in specs if s.id in filter_ids or s.name in filter_ids]

        if not specs:
            console.print("[yellow]No matching specs found[/yellow]")
            raise typer.Exit(0)

        console.print(f"\n[bold]Running {len(specs)} eval(s)...[/bold]\n")

        results: list[dict[str, Any]] = []
        for spec in specs:
            ctx = EvalContext(input="", metadata={})
            base = _spec_result_base(spec, ctx)
            try:
                result = _run_async(executor.execute(spec, ctx))
                status = "[green]✓ PASS[/green]" if result.passed else "[red]✗ FAIL[/red]"
                console.print(f"  {status}  {spec.name} ({result.duration_ms:.0f}ms, score={result.score:.2f})")
                entry: dict[str, Any] = {
                    **base,
                    "actual": result.output or "",
                    "output": result.output or "",
                    "passed": result.passed,
                    "score": result.score,
                    "duration_ms": result.duration_ms,
                    "durationMs": result.duration_ms,
                    "metadata": dict(result.metadata),
                    "assertions": list(result.assertions),
                    "status": result.status,
                    "result": {
                        "status": result.status,
                        "score": result.score,
                        "duration": result.duration_ms,
                        "error": result.error,
                    },
                }
                if result.error:
                    entry["error"] = result.error
                results.append(entry)
            except Exception as exc:
                # Any failure while executing or recording a spec becomes an
                # "error" entry so the remaining specs still run.
                console.print(f"  [red]✗ ERROR[/red] {spec.name}: {exc}")
                results.append(
                    {
                        **base,
                        "actual": "",
                        "output": "",
                        "passed": False,
                        "score": 0.0,
                        "duration_ms": 0.0,
                        "durationMs": 0.0,
                        "error": str(exc),
                        "status": "error",
                        "metadata": {},
                        "assertions": [],
                        "result": {
                            "status": "error",
                            "score": 0.0,
                            "duration": 0.0,
                            "error": str(exc),
                        },
                    }
                )

        passed = sum(1 for r in results if r["passed"])
        total = len(results)
        console.print(f"\n[bold]{passed}/{total} passed[/bold]")

        artifact = _build_run_artifact(run_id, started_at, results)
        written_paths = _write_run_artifact(artifact, output=output, write_results=write_results, cwd=cwd)
        for path in written_paths:
            console.print(f"Results written to {path}")

        if passed < total:
            raise typer.Exit(1)
    finally:
        # Runs on every path out of the try block, including typer.Exit.
        handle.dispose()
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
# ── gate ─────────────────────────────────────────────────────────────
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def gate(
    baseline_path: str = typer.Option(".evalgate/baseline.json", "--baseline", "-b", help="Baseline file"),
    report_path: str | None = typer.Option(None, "--report", help="Run report file"),
    min_score: float = typer.Option(0.8, "--min-score", help="Minimum passing score"),
    max_drop: float = typer.Option(0.05, "--max-drop", help="Max allowed score drop"),
) -> None:
    """Run regression gate against a baseline."""
    # Always terminates by raising typer.Exit: with the gate's exit code on
    # completion, or with GATE_EXIT.INFRA_ERROR when an input file is missing
    # or unreadable.
    from evalgate_sdk.regression import GATE_EXIT, Baseline, BaselineTolerance, evaluate_regression

    bp = Path(baseline_path)
    if not bp.exists():
        console.print(f"[red]Baseline not found:[/red] {baseline_path}")
        console.print("Run [cyan]evalgate baseline init[/cyan] first")
        raise typer.Exit(GATE_EXIT.INFRA_ERROR)

    try:
        raw = json.loads(bp.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        console.print(f"[red]Could not read baseline:[/red] {baseline_path} ({exc})")
        raise typer.Exit(GATE_EXIT.INFRA_ERROR) from exc
    if not isinstance(raw, dict):
        console.print(f"[red]Baseline is not a JSON object:[/red] {baseline_path}")
        raise typer.Exit(GATE_EXIT.INFRA_ERROR)

    tol_raw = raw.get("tolerance")
    baseline = Baseline(
        scores=raw.get("scores", {}),
        # Fall back to the default tolerance when the file has none.
        tolerance=BaselineTolerance(**tol_raw) if isinstance(tol_raw, dict) else BaselineTolerance(),
    )

    current_scores: dict[str, float] = {}
    if report_path:
        rp = Path(report_path)
        if not rp.exists():
            # An explicitly named report that is missing is an input error;
            # gating against an empty score set would hide it.
            console.print(f"[red]Report not found:[/red] {report_path}")
            raise typer.Exit(GATE_EXIT.INFRA_ERROR)
        try:
            run_report = json.loads(rp.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            console.print(f"[red]Could not read report:[/red] {report_path} ({exc})")
            raise typer.Exit(GATE_EXIT.INFRA_ERROR) from exc
        for r in run_report.get("results", []):
            # A missing or null score counts as 0.0.
            current_scores[_spec_name_from_result(r)] = float(r.get("score") or 0.0)

    # Non-positive thresholds are passed as None (presumably disabling that
    # check — confirm against evaluate_regression).
    regression = evaluate_regression(
        baseline,
        current_scores,
        min_score=min_score if min_score > 0 else None,
        max_drop=max_drop if max_drop > 0 else None,
    )

    table = Table(title="Regression Gate")
    table.add_column("Test", style="cyan")
    table.add_column("Baseline", justify="right")
    table.add_column("Current", justify="right")
    table.add_column("Delta", justify="right")
    table.add_column("Status")

    for d in regression.deltas:
        status = "[green]PASS[/green]" if d.category == "pass" else f"[red]{d.severity.upper()}[/red]"
        table.add_row(d.test_id, f"{d.baseline_value:.3f}", f"{d.current_value:.3f}", f"{d.delta:+.3f}", status)

    console.print(table)
    console.print(f"\nGate: {'[green]PASS[/green]' if regression.gate_exit == 0 else '[red]FAIL[/red]'}")

    raise typer.Exit(regression.gate_exit)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
# ── check ────────────────────────────────────────────────────────────
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def check(
    api_key: str | None = typer.Option(None, "--api-key", envvar="EVALGATE_API_KEY"),
    base_url: str | None = typer.Option(None, "--base-url", envvar="EVALGATE_BASE_URL"),
    evaluation_id: int | None = typer.Option(None, "--evaluation-id", help="Evaluation to check"),
    min_score: float = typer.Option(0.0, "--min-score", help="Minimum passing score (0-100)"),
    max_drop: float | None = typer.Option(None, "--max-drop", help="Max allowed regression delta"),
    baseline: str = typer.Option("published", "--baseline", help="Baseline mode: published|previous|production|auto"),
    fmt: str = typer.Option("human", "--format", "-f", help="Output format: human|json"),
) -> None:
    """CI/CD gate — check evaluation quality score via the API."""
    from evalgate_sdk.client import AIEvalClient
    from evalgate_sdk.errors import EvalGateError

    # Process exit codes reported by this command.
    EXIT_PASS = 0
    EXIT_SCORE_FAIL = 1
    EXIT_REGRESSION = 2
    EXIT_API_ERROR = 4
    EXIT_BAD_ARGS = 5

    # Flags/env vars win; missing values fall back to the saved config file.
    api_key, base_url = _resolve_credentials(api_key, base_url)

    # Compare against None so that an explicit id of 0 is not reported as missing.
    if evaluation_id is None:
        console.print("[red]--evaluation-id is required[/red]")
        raise typer.Exit(EXIT_BAD_ARGS)

    if not api_key:
        console.print("[red]--api-key or EVALGATE_API_KEY is required (or run evalgate configure)[/red]")
        raise typer.Exit(EXIT_BAD_ARGS)

    async def _check() -> int:
        # Fetch the quality record, render it, and return the exit code.
        client = AIEvalClient(api_key=api_key, base_url=base_url)
        try:
            quality = await client.get_quality(evaluation_id, baseline=baseline)
        except EvalGateError as exc:
            console.print(f"[red]API error:[/red] {exc}")
            return EXIT_API_ERROR
        finally:
            await client.close()

        score = quality.score
        if score is None:
            console.print("[yellow]No quality score available[/yellow]")
            return EXIT_API_ERROR

        if fmt == "json":
            # Emit the JSON verbatim: no Rich markup parsing, no syntax
            # highlighting and no hard wrapping at the console width, any of
            # which could alter the serialized text.
            # NOTE(review): the verdict lines below are still printed after
            # the JSON in this mode — confirm whether consumers expect that.
            console.print(
                json.dumps(quality.model_dump(by_alias=True, exclude_none=True), indent=2),
                markup=False,
                highlight=False,
                soft_wrap=True,
            )
        else:
            table = Table(title="Quality Gate")
            table.add_column("Metric", style="cyan")
            table.add_column("Value", justify="right")

            table.add_row("Score", f"{score:.1f}")
            table.add_row("Min Score", f"{min_score:.1f}")
            # Optional rows are shown only when the API returned a value.
            if quality.evidence_level:
                table.add_row("Evidence", quality.evidence_level)
            if quality.total is not None:
                table.add_row("Total Tests", str(quality.total))
            if quality.regression_delta is not None:
                table.add_row("Regression Δ", f"{quality.regression_delta:+.1f}")
            if quality.avg_latency_ms is not None:
                table.add_row("Avg Latency", f"{quality.avg_latency_ms:.0f}ms")
            if quality.cost_usd is not None:
                table.add_row("Cost", f"${quality.cost_usd:.4f}")
            if quality.flags:
                table.add_row("Flags", ", ".join(quality.flags))
            console.print(table)

        # The regression check takes precedence over the minimum-score check.
        if max_drop is not None and quality.regression_delta is not None and quality.regression_delta <= -max_drop:
            console.print(f"[red]✗ FAIL — regression {quality.regression_delta:+.1f} exceeds max drop {max_drop}[/red]")
            return EXIT_REGRESSION

        if score < min_score:
            console.print(f"[red]✗ FAIL — score {score:.1f} < min {min_score:.1f}[/red]")
            return EXIT_SCORE_FAIL

        console.print("[green]✓ PASS[/green]")
        return EXIT_PASS

    exit_code = _run_async(_check())
    raise typer.Exit(exit_code)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
# ── ci ───────────────────────────────────────────────────────────────
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def ci(
    eval_dir: str = typer.Option("evals", "--dir", "-d"),
    baseline_path: str = typer.Option(".evalgate/baseline.json", "--baseline", "-b"),
    output: str = typer.Option(".evalgate/last-run.json", "--output", "-o"),
    min_score: float = typer.Option(0.8, "--min-score", help="Minimum passing score"),
    max_drop: float = typer.Option(0.05, "--max-drop", help="Max allowed score drop"),
) -> None:
    """CI loop — run evals then gate against baseline (one command for CI)."""
    console.print("[bold]EvalGate CI Pipeline[/bold]\n")

    console.print("[bold]Step 1/2:[/bold] Running evaluations...")
    try:
        run(eval_dir=eval_dir, spec_ids=None, output=output, write_results=True, verbose=False)
    except (typer.Exit, SystemExit) as exc:
        # `run` signals its outcome with typer.Exit, which is click's Exit
        # exception: a RuntimeError subclass that stores the status in
        # `exit_code`. Catching only SystemExit would never match it.
        code = exc.exit_code if isinstance(exc, typer.Exit) else exc.code
        if code != 0:
            console.print("[red]Evaluations failed — skipping gate[/red]")
            raise typer.Exit(code or 1) from exc
        # A zero exit from `run` falls through to the gate.

    console.print("\n[bold]Step 2/2:[/bold] Running regression gate...")
    # `gate` ends by raising typer.Exit with the gate's exit code.
    gate(baseline_path=baseline_path, report_path=output, min_score=min_score, max_drop=max_drop)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
# ── doctor ───────────────────────────────────────────────────────────
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def doctor() -> None:
    """Pre-flight check — verify environment and configuration."""
    # Each check is (label, ok, detail). ok is True (pass), False (failure,
    # counted towards exit status 1) or None (optional item, shown but never
    # counted as a failure).
    import importlib.util
    import platform
    import sys

    console.print("[bold]EvalGate Doctor[/bold]\n")
    checks: list[tuple[str, bool | None, str]] = []

    # Python version
    py_ver = platform.python_version()
    py_ok = sys.version_info[:2] >= (3, 9)
    checks.append(("Python >= 3.9", py_ok, py_ver))

    # SDK installed
    checks.append(("evalgate-sdk installed", True, __version__))

    # API key: the environment variable takes precedence over the saved config.
    saved_cfg = _load_saved_config()
    env_key = os.environ.get("EVALGATE_API_KEY")
    has_key = bool(env_key or saved_cfg.get("api_key"))
    key_source = "env" if env_key else ("config" if saved_cfg.get("api_key") else "missing")
    checks.append(("API key configured", has_key, key_source))

    # Config file (paths below are relative to the current working directory)
    config_exists = Path(".evalgate/config.json").exists()
    checks.append((".evalgate/config.json", config_exists, "found" if config_exists else "missing"))

    # Baseline
    baseline_exists = Path(".evalgate/baseline.json").exists()
    checks.append((".evalgate/baseline.json", baseline_exists, "found" if baseline_exists else "missing"))

    # Eval directory
    evals_exist = Path("evals").exists()
    checks.append(("evals/ directory", evals_exist, "found" if evals_exist else "missing"))

    # Optional deps: look the package up without importing it, so a broken or
    # slow third-party package cannot crash or stall the doctor.
    for pkg in ("openai", "anthropic"):
        try:
            installed = importlib.util.find_spec(pkg) is not None
        except (ImportError, ValueError):
            installed = False
        if installed:
            checks.append((f"{pkg} installed", True, "yes"))
        else:
            checks.append((f"{pkg} installed", None, "optional"))

    table = Table(title="Environment Check")
    table.add_column("Check", style="cyan")
    table.add_column("Status")
    table.add_column("Details")

    for name, ok, detail in checks:
        if ok is True:
            status = "[green]✓[/green]"
        elif ok is False:
            status = "[red]✗[/red]"
        else:
            status = "[yellow]○[/yellow]"
        table.add_row(name, status, str(detail))

    console.print(table)

    failures = sum(1 for _, ok, _ in checks if ok is False)
    if failures:
        console.print(f"\n[red]{failures} issue(s) found[/red]")
        raise typer.Exit(1)
    console.print("\n[green]All checks passed![/green]")
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
# ── discover ─────────────────────────────────────────────────────────
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def discover(
    eval_dir: str = typer.Option("evals", "--dir", "-d"),
    manifest: bool = typer.Option(False, "--manifest", help="Output JSON manifest"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show load errors"),
) -> None:
    """Discover eval specs in the project."""
    import importlib.util

    from evalgate_sdk.runtime import create_eval_runtime

    cwd = Path.cwd()
    eval_path = cwd / eval_dir

    # Check the directory first so no runtime is created just to be disposed.
    if not eval_path.exists():
        console.print(f"[yellow]No eval directory at {eval_dir}/[/yellow]")
        raise typer.Exit(0)

    handle = create_eval_runtime(str(cwd))
    try:
        # Execute every non-underscore module; the specs are then read back
        # from the runtime (presumably registered as an import side effect —
        # confirm against define_eval).
        for spec_file in sorted(eval_path.glob("**/*.py")):
            if spec_file.name.startswith("_"):
                continue
            try:
                module_spec = importlib.util.spec_from_file_location(spec_file.stem, spec_file)
                if module_spec and module_spec.loader:
                    module = importlib.util.module_from_spec(module_spec)
                    module_spec.loader.exec_module(module)
            except Exception as exc:
                # Discovery is best-effort: load failures are shown only with --verbose.
                if verbose:
                    console.print(f"[yellow]Warning: failed to load {spec_file.name}:[/yellow] {exc}")

        specs = handle.runtime.list()
    finally:
        handle.dispose()

    if manifest:
        out = {"specs": [{"id": s.id, "name": s.name, "suite": s.suite, "tags": s.options.tags} for s in specs]}
        # Emit the JSON verbatim: no Rich markup parsing, highlighting or
        # hard wrapping, any of which could alter the serialized text.
        console.print(json.dumps(out, indent=2), markup=False, highlight=False, soft_wrap=True)
    else:
        table = Table(title=f"Discovered Specs ({len(specs)})")
        table.add_column("ID", style="dim")
        table.add_column("Name", style="cyan")
        table.add_column("Suite")
        table.add_column("Tags")
        for s in specs:
            # Only the first 12 characters of the id are shown.
            table.add_row(s.id[:12], s.name, s.suite or "-", ", ".join(s.options.tags) or "-")
        console.print(table)
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
# ── diff ─────────────────────────────────────────────────────────────
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def diff(
    report_a: str = typer.Argument(..., help="First run report"),
    report_b: str = typer.Argument(..., help="Second run report"),
) -> None:
    """Compare two run reports.

    Prints one table row per spec found in either report with both scores,
    the change from A to B and an improved/regressed/unchanged label. A spec
    (or score) missing on one side counts as 0 on that side. Exits with
    code 1 when either report file does not exist.
    """
    # Validate both paths (A first) before anything is read.
    for candidate in (report_a, report_b):
        if not Path(candidate).exists():
            console.print(f"[red]Report not found:[/red] {candidate}")
            raise typer.Exit(1)

    first_report = json.loads(Path(report_a).read_text())
    second_report = json.loads(Path(report_b).read_text())

    # Index each report's results by spec name so rows can be joined by name.
    by_name_a = {}
    for entry in first_report.get("results", []):
        by_name_a[_spec_name_from_result(entry)] = entry
    by_name_b = {}
    for entry in second_report.get("results", []):
        by_name_b[_spec_name_from_result(entry)] = entry

    ordered_names = sorted(by_name_a.keys() | by_name_b.keys())

    comparison = Table(title="Run Comparison")
    column_layout = (
        ("Spec", {"style": "cyan"}),
        ("Score A", {"justify": "right"}),
        ("Score B", {"justify": "right"}),
        ("Delta", {"justify": "right"}),
        ("Status", {}),
    )
    for heading, column_options in column_layout:
        comparison.add_column(heading, **column_options)

    for name in ordered_names:
        score_a = by_name_a.get(name, {}).get("score", 0)
        score_b = by_name_b.get(name, {}).get("score", 0)
        change = score_b - score_a
        if change > 0:
            verdict = "[green]improved[/green]"
        elif change < 0:
            verdict = "[red]regressed[/red]"
        else:
            verdict = "unchanged"
        comparison.add_row(name, f"{score_a:.3f}", f"{score_b:.3f}", f"{change:+.3f}", verdict)

    console.print(comparison)
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
# ── explain ──────────────────────────────────────────────────────────
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
def explain(
    report_path: str = typer.Argument(".evalgate/last-run.json", help="Run report to explain"),
) -> None:
    """Explain failures in a run report.

    Lists every result whose ``passed`` field is missing or falsy, with its
    score (0 when absent) and its error message when one was recorded.
    Exits with code 1 when the report file does not exist.
    """
    report_file = Path(report_path)
    if not report_file.exists():
        console.print(f"[red]Report not found:[/red] {report_path}")
        raise typer.Exit(1)

    all_results = json.loads(report_file.read_text()).get("results", [])

    # A result counts as failed unless it carries a truthy "passed" flag.
    failed = []
    for outcome in all_results:
        if outcome.get("passed"):
            continue
        failed.append(outcome)

    if failed:
        console.print(f"[bold]{len(failed)} failure(s) found:[/bold]\n")
        for outcome in failed:
            console.print(f" [red]✗[/red] [bold]{_spec_name_from_result(outcome)}[/bold]")
            console.print(f" Score: {outcome.get('score', 0):.3f}")
            if "error" in outcome:
                console.print(f" Error: {outcome['error']}")
            # Blank line separates consecutive failures.
            console.print()
    else:
        console.print("[green]No failures to explain — all specs passed![/green]")
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
# ── baseline ─────────────────────────────────────────────────────────
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def baseline(
    action: str = typer.Argument("init", help="Action: init or update"),
    path: str = typer.Option(".evalgate/baseline.json", "--path", "-p"),
    report_path: str | None = typer.Option(None, "--from-report", help="Update baseline from a run report"),
) -> None:
    """Manage baselines — init or update from a run report.

    ``init`` writes an empty baseline with default tolerances unless the file
    already exists. ``update`` copies each result's score (and duration, when
    present) from the report given by ``--from-report`` into the baseline,
    creating the baseline file if needed. Exits with code 1 on an unknown
    action, a missing ``--from-report`` or a report file that does not exist.
    """
    bp = Path(path)

    if action == "init":
        bp.parent.mkdir(parents=True, exist_ok=True)
        if bp.exists():
            console.print(f"[yellow]Baseline already exists at {path}[/yellow]")
            return
        bp.write_text(
            json.dumps(
                {
                    "version": 1,
                    "scores": {},
                    "latencies": {},
                    "tolerance": {"score_drop": 0.05, "latency_increase_pct": 20.0, "min_confidence": 0.8},
                },
                indent=2,
            )
        )
        console.print(f"[green]✓[/green] Created baseline at {path}")

    elif action == "update":
        if not report_path:
            console.print("[red]--from-report required for update[/red]")
            raise typer.Exit(1)
        rp = Path(report_path)
        if not rp.exists():
            console.print(f"[red]Report not found:[/red] {report_path}")
            raise typer.Exit(1)

        report = json.loads(rp.read_text())
        existing = json.loads(bp.read_text()) if bp.exists() else {"version": 1, "scores": {}, "latencies": {}}

        # An existing baseline file may lack "scores"; create it on demand,
        # the same way "latencies" is handled below.
        scores = existing.setdefault("scores", {})

        recorded = 0
        skipped = 0
        for r in report.get("results", []):
            # A result without a score cannot serve as a baseline value;
            # skip it instead of aborting the whole update with a KeyError.
            if "score" not in r:
                skipped += 1
                continue
            spec_name = _spec_name_from_result(r)
            scores[spec_name] = r["score"]
            if "duration_ms" in r:
                existing.setdefault("latencies", {})[spec_name] = r["duration_ms"]
            recorded += 1

        # "update" may be the first command to touch this path, so make sure
        # the directory exists (as "init" does) before writing.
        bp.parent.mkdir(parents=True, exist_ok=True)
        bp.write_text(json.dumps(existing, indent=2))
        console.print(f"[green]✓[/green] Updated baseline with {recorded} results")
        if skipped:
            console.print(f"[yellow]Skipped {skipped} result(s) without a score[/yellow]")

    else:
        console.print(f"[red]Unknown action:[/red] {action}")
        raise typer.Exit(1)
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
# ── print-config ──────────────────────────────────────────────────
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def print_config(
    path: str = typer.Option(".evalgate/config.json", "--path", "-p"),
) -> None:
    """Print the current project configuration.

    Renders every top-level key of the config file as a table row. Values of
    credential-like keys (for example ``api_key``) are masked so that secrets
    do not end up in terminal scrollback or CI logs. Exits with code 1 when
    the config file does not exist.
    """
    cp = Path(path)
    if not cp.exists():
        console.print(f"[yellow]No config found at {path}[/yellow]")
        console.print("Run [cyan]evalai init[/cyan] first")
        raise typer.Exit(1)

    config = json.loads(cp.read_text())

    table = Table(title="Project Configuration")
    table.add_column("Key", style="cyan")
    table.add_column("Value")

    # Substrings that mark a key as holding a credential.
    sensitive_markers = ("api_key", "apikey", "token", "secret", "password")

    for key, value in config.items():
        is_secret = isinstance(value, str) and value and any(m in key.lower() for m in sensitive_markers)
        if is_secret:
            # Keep only a short suffix so the user can still tell keys apart.
            rendered = "*" * 8 + (value[-4:] if len(value) > 8 else "")
        elif isinstance(value, (dict, list)):
            rendered = json.dumps(value)
        else:
            rendered = str(value)
        table.add_row(key, rendered)

    console.print(table)
    console.print(f"\n[dim]Config path: {cp.resolve()}[/dim]")
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
# ── share ─────────────────────────────────────────────────────────
|
|
794
|
+
|
|
795
|
+
|
|
796
|
+
def share(
    evaluation_id: int = typer.Option(..., "--evaluation-id", help="Evaluation ID"),
    run_id: int = typer.Option(..., "--run-id", help="Evaluation run ID"),
    expires: str = typer.Option("7d", "--expires", help="Expiry e.g. 7d, 24h"),
    api_key: str | None = typer.Option(None, "--api-key", envvar="EVALGATE_API_KEY"),
    base_url: str | None = typer.Option(None, "--base-url", envvar="EVALGATE_BASE_URL"),
) -> None:
    """Create a shareable link for an evaluation run.

    Fetches the run export, publishes it and prints the resulting URL. The
    expiry is sent to the API in whole days: hours are divided by 24
    (rounded down, minimum 1) and minute/second values count as 1 day.
    Always exits through ``typer.Exit``: 0 on success, 1 on a missing API
    key, an invalid ``--expires`` value or an API error.
    """
    import re

    from evalgate_sdk.client import AIEvalClient
    from evalgate_sdk.errors import EvalGateError

    api_key, base_url = _resolve_credentials(api_key, base_url)
    if not api_key:
        console.print("[red]--api-key or EVALGATE_API_KEY required (or run evalgate configure)[/red]")
        raise typer.Exit(1)

    # Convert "<number><unit>" into the whole-day count the API expects.
    parsed = re.match(r"^(\d+)(d|h|m|s)$", expires, re.IGNORECASE)
    if not parsed:
        console.print("[red]Invalid --expires format. Use e.g. 7d, 24h[/red]")
        raise typer.Exit(1)
    amount = int(parsed.group(1))
    unit = parsed.group(2).lower()
    if unit == "d":
        expires_days = amount
    elif unit == "h":
        expires_days = max(1, amount // 24)
    else:
        # Minutes and seconds are below the API's granularity.
        expires_days = 1

    async def _publish() -> int:
        """Run the export + publish round trip and return the exit code."""
        sdk_client = AIEvalClient(api_key=api_key, base_url=base_url)
        try:
            console.print("Fetching run export...")
            exported = await sdk_client.get_run_export(evaluation_id, run_id)

            console.print("Publishing share link...")
            response = await sdk_client.publish_share(
                evaluation_id,
                exported,
                run_id,
                expires_in_days=expires_days,
            )
            link = response.get("shareUrl", "")
            console.print(f"[green]✓[/green] Share link (expires in {expires}): {link}")
        except EvalGateError as exc:
            console.print(f"[red]API error:[/red] {exc}")
            return 1
        else:
            return 0
        finally:
            # Release the client's resources on every path.
            await sdk_client.close()

    raise typer.Exit(_run_async(_publish()))
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
# ── configure ─────────────────────────────────────────────────────
|
|
860
|
+
|
|
861
|
+
|
|
862
|
+
def configure(
    api_key: str | None = typer.Option(None, "--api-key", help="API key (prompted if not given)"),
    base_url: str = typer.Option(
        "https://evalgate.com",
        "--base-url",
        help="Platform base URL",
    ),
) -> None:
    """Set up API key and validate connection.

    Prompts for the key when it is not passed, validates it by fetching the
    current organization, then stores ``api_key`` and ``base_url`` in
    ``.evalgate/config.json`` under the current directory (other keys already
    in that file are kept). The file is restricted to owner read/write where
    the platform supports it. Exits with code 1 when no key is provided or
    validation fails.
    """
    from evalgate_sdk.client import AIEvalClient
    from evalgate_sdk.errors import EvalGateError

    if not api_key:
        api_key = typer.prompt("Enter your API key", hide_input=True)

    if not api_key:
        console.print("[red]API key is required[/red]")
        raise typer.Exit(1)

    console.print("Validating API key...")

    async def _validate() -> bool:
        """Return True when the key can fetch the current organization."""
        client = AIEvalClient(api_key=api_key, base_url=base_url)
        try:
            org = await client.organizations.get_current()
            console.print(f"[green]✓[/green] Connected — org [bold]{org.name}[/bold] (id={org.id})")
            return True
        except EvalGateError as exc:
            console.print(f"[red]✗ Validation failed:[/red] {exc}")
            return False
        finally:
            await client.close()

    if not _run_async(_validate()):
        raise typer.Exit(1)

    config_dir = Path.cwd() / ".evalgate"
    config_dir.mkdir(exist_ok=True)
    config_path = config_dir / "config.json"

    config: dict[str, Any] = {}
    if config_path.exists():
        config = json.loads(config_path.read_text())

    config["api_key"] = api_key
    config["base_url"] = base_url

    # The file is about to hold a credential: create it owner-only and tighten
    # a pre-existing file before the secret is written. Best effort, because
    # some platforms and filesystems do not honour POSIX modes.
    try:
        config_path.touch(mode=0o600, exist_ok=True)
        config_path.chmod(0o600)
    except OSError as exc:
        console.print(f"[yellow]Warning: could not restrict permissions on config file:[/yellow] {exc}")

    config_path.write_text(json.dumps(config, indent=2))
    console.print(f"[green]✓[/green] Saved to {config_path.relative_to(Path.cwd())}")
    console.print("\n[dim]Tip: add .evalgate/ to .gitignore to avoid committing credentials[/dim]")
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
# ── upgrade ───────────────────────────────────────────────────────
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
def upgrade() -> None:
    """Check for SDK updates and print upgrade instructions.

    Looks up the latest released version on PyPI and compares it with the
    installed ``__version__``. Network or parsing problems are reported as a
    warning; the command never fails because PyPI is unreachable.
    """
    import httpx

    # Must be the distribution name that the install hint below uses;
    # querying a different project would report an unrelated version.
    package = "evalgate-sdk"

    console.print("[bold]Checking for updates...[/bold]\n")

    try:
        resp = httpx.get(f"https://pypi.org/pypi/{package}/json", timeout=10)
        if resp.status_code == 200:
            latest = resp.json()["info"]["version"]
            if latest == __version__:
                console.print(f"[green]✓ You're on the latest version ({__version__})[/green]")
            else:
                console.print(f" Current: [yellow]{__version__}[/yellow]")
                console.print(f" Latest: [green]{latest}[/green]\n")
                console.print("Upgrade with:")
                console.print(f' [cyan]pip install "{package}=={latest}"[/cyan]')
        else:
            console.print(f"[yellow]Could not check PyPI (HTTP {resp.status_code})[/yellow]")
    except Exception as exc:
        # Deliberately broad: this is a best-effort convenience check, so any
        # transport or payload problem is downgraded to a warning.
        console.print(f"[yellow]Could not reach PyPI:[/yellow] {exc}")
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
# ── impact-analysis ───────────────────────────────────────────────
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
def impact_analysis(
    eval_dir: str = typer.Option("evals", "--dir", "-d"),
    baseline_path: str = typer.Option(".evalgate/baseline.json", "--baseline", "-b"),
) -> None:
    """Analyze which eval specs would be affected by code changes.

    Loads every spec module under the eval directory, then prints one row per
    discovered spec with its baseline score (if any) and a risk label:
    ``unknown`` without a baseline, ``high`` below 0.8, otherwise ``low``.
    Exits with code 0 when the eval directory does not exist.
    """
    import importlib.util

    from evalgate_sdk.runtime import create_eval_runtime

    cwd = Path.cwd()
    eval_path = cwd / eval_dir

    if not eval_path.exists():
        console.print(f"[yellow]No eval directory at {eval_dir}/[/yellow]")
        raise typer.Exit(0)

    handle = create_eval_runtime(str(cwd))
    try:
        # Executing each spec module is what makes its specs show up in
        # handle.runtime.list() below. Files starting with "_" are skipped.
        load_failures = 0
        for spec_file in sorted(eval_path.glob("**/*.py")):
            if spec_file.name.startswith("_"):
                continue
            try:
                spec_module = importlib.util.spec_from_file_location(spec_file.stem, spec_file)
                if spec_module and spec_module.loader:
                    mod = importlib.util.module_from_spec(spec_module)
                    spec_module.loader.exec_module(mod)
            except Exception:
                # One broken spec file must not abort the analysis, but it is
                # counted so the footer can say the table may be incomplete.
                load_failures += 1

        specs = handle.runtime.list()

        bp = Path(baseline_path)
        baseline_scores: dict[str, float] = {}
        if bp.exists():
            raw = json.loads(bp.read_text())
            baseline_scores = raw.get("scores", {})

        table = Table(title="Impact Analysis")
        table.add_column("Spec", style="cyan")
        table.add_column("Suite")
        table.add_column("Baseline Score", justify="right")
        table.add_column("Has Baseline")
        table.add_column("Risk")

        covered = 0
        for s in specs:
            score = baseline_scores.get(s.name)
            has_baseline = score is not None
            if not has_baseline:
                risk = "[yellow]unknown[/yellow]"
            elif score < 0.8:
                risk = "[red]high[/red]"
            else:
                risk = "[green]low[/green]"
            if has_baseline:
                covered += 1

            table.add_row(
                s.name,
                s.suite or "-",
                f"{score:.3f}" if has_baseline else "-",
                "[green]yes[/green]" if has_baseline else "[red]no[/red]",
                risk,
            )

        console.print(table)
        # Count discovered specs that have a baseline, not every entry in the
        # baseline file (which may list specs that no longer exist).
        console.print(f"\n[dim]{len(specs)} spec(s) discovered, {covered} with baselines[/dim]")
        if load_failures:
            console.print(f"[yellow]{load_failures} spec file(s) failed to load and were skipped[/yellow]")
    finally:
        # Dispose the runtime even when loading, parsing or rendering raises.
        handle.dispose()
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
# ── migrate ───────────────────────────────────────────────────────
|
|
1010
|
+
|
|
1011
|
+
|
|
1012
|
+
def migrate(
    action: str = typer.Argument("config", help="What to migrate: config"),
    path: str = typer.Option(".evalgate/config.json", "--path", "-p"),
) -> None:
    """Migrate config or baseline to the latest format.

    Only the ``config`` target is implemented: a config whose ``version`` is
    missing or below 1 is stamped as version 1 and given default
    ``project_name``, ``eval_dir`` and ``baseline`` entries where absent.
    Exits with code 1 for any other target or when the file does not exist.
    """
    if action != "config":
        console.print(f"[red]Unknown migration target:[/red] {action}")
        console.print("Supported: [cyan]config[/cyan]")
        raise typer.Exit(1)

    config_file = Path(path)
    if not config_file.exists():
        console.print(f"[yellow]No config at {path}[/yellow]")
        console.print("Run [cyan]evalai init[/cyan] first")
        raise typer.Exit(1)

    settings = json.loads(config_file.read_text())
    found_version = settings.get("version", 0)

    already_current = found_version >= 1
    if already_current:
        console.print(f"[green]✓ Config already at latest version ({found_version})[/green]")
        return

    settings["version"] = 1
    # Fill in v1 fields without overwriting anything the user already set.
    v1_defaults = (
        ("project_name", Path.cwd().name),
        ("eval_dir", "evals"),
        ("baseline", ".evalgate/baseline.json"),
    )
    for field_name, fallback in v1_defaults:
        if field_name not in settings:
            settings[field_name] = fallback

    serialized = json.dumps(settings, indent=2)
    config_file.write_text(serialized)
    console.print(f"[green]✓ Migrated config from v{found_version} to v1[/green]")
|