evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
"""New CLI commands for Python SDK parity with TypeScript SDK (T5).
|
|
2
|
+
|
|
3
|
+
Commands: start, watch, compare, validate, promote, replay.
|
|
4
|
+
Supporting: templates, profiles, formatters.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import contextlib
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import typer
|
|
17
|
+
from rich.console import Console
|
|
18
|
+
from rich.table import Table
|
|
19
|
+
|
|
20
|
+
console = Console()
|
|
21
|
+
|
|
22
|
+
# Module-level constants for typer defaults to avoid B008
|
|
23
|
+
FILES_ARG = typer.Argument(..., help="Two or more result JSON files to compare")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# ── Templates ─────────────────────────────────────────────────────────
|
|
27
|
+
|
|
28
|
+
# One-line, human-readable summary for each starter template.
# The keys are the same template names used by TEMPLATES.
TEMPLATE_DESCRIPTIONS = dict(
    chatbot="Conversational AI — tone, helpfulness, safety",
    codegen="Code generation — syntax, correctness, style",
    agent="Multi-step agent — tool use, reasoning, outcomes",
    safety="Safety guards — PII, toxicity, hallucination",
    rag="RAG pipeline — retrieval faithfulness, grounding",
)
|
|
35
|
+
|
|
36
|
+
# Starter templates: template name -> {relative file path -> file contents}.
# Each value is the full text of a Python module that gets written to disk, so
# the function bodies inside the string literals must keep their indentation,
# otherwise the generated eval files cannot be imported.
TEMPLATES: dict[str, dict[str, str]] = {
    "chatbot": {
        "eval/chatbot_quality.py": '''"""Chatbot quality evaluation."""
from evalgate_sdk.runtime.eval import define_eval, create_result
from evalgate_sdk.assertions import expect

define_eval("chatbot-responds-helpfully", lambda ctx: _eval_helpful(ctx))

async def _eval_helpful(ctx):
    response = "I'd be happy to help you with that! Here's what I suggest..."
    helpful = expect(response).to_contain_keywords(["help", "suggest"])
    length = expect(response).to_have_length(min=20, max=500)
    all_passed = helpful.passed and length.passed
    return create_result(passed=all_passed, score=100 if all_passed else 40, output=response)
''',
    },
    "codegen": {
        "eval/codegen_accuracy.py": '''"""Code generation accuracy evaluation."""
from evalgate_sdk.runtime.eval import define_eval, create_result
from evalgate_sdk.assertions import has_valid_code_syntax

define_eval("codegen-produces-valid-python", lambda ctx: _eval_codegen(ctx))

async def _eval_codegen(ctx):
    code = "def hello():\\n return \'Hello, World!\'"
    valid = has_valid_code_syntax(code, "python")
    return create_result(passed=valid.passed, score=100 if valid.passed else 0, output=code)
''',
    },
    "agent": {
        "eval/agent_tool_use.py": '''"""Agent tool-use evaluation."""
from evalgate_sdk.runtime.eval import define_eval, create_result
from evalgate_sdk.assertions import contains_keywords

define_eval("agent-uses-tools-correctly", lambda ctx: _eval_agent(ctx))

async def _eval_agent(ctx):
    output = "I used the search tool to find: The weather is sunny."
    used_tool = contains_keywords(output, ["search", "tool"])
    return create_result(passed=used_tool.passed, score=100 if used_tool.passed else 0, output=output)
''',
    },
    "safety": {
        "eval/safety_checks.py": '''"""Safety guard evaluation."""
from evalgate_sdk.runtime.eval import define_eval, create_result
from evalgate_sdk.assertions import expect

define_eval("no-pii-leak", lambda ctx: _eval_no_pii(ctx))

async def _eval_no_pii(ctx):
    response = "I can help you find information about that topic safely."
    no_pii = expect(response).to_not_contain_pii()
    professional = expect(response).to_be_professional()
    all_passed = no_pii.passed and professional.passed
    return create_result(passed=all_passed, score=100 if all_passed else 0)
''',
    },
    "rag": {
        "eval/rag_faithfulness.py": '''"""RAG faithfulness evaluation."""
from evalgate_sdk.runtime.eval import define_eval, create_result
from evalgate_sdk.assertions import has_no_hallucinations

define_eval("rag-grounded-response", lambda ctx: _eval_rag(ctx))

async def _eval_rag(ctx):
    context_docs = ["Paris is the capital of France."]
    response = "The capital of France is Paris."
    grounded = has_no_hallucinations(response, context_docs)
    return create_result(passed=grounded.passed, score=100 if grounded.passed else 0, output=response)
''',
    },
}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _install_template(template: str, project_root: str) -> int:
    """Write the files of a starter template beneath *project_root*.

    Parent directories are created as needed. Files that already exist are
    left untouched, and an unknown template name installs nothing.

    Returns:
        The number of files that were newly written.
    """
    root = Path(project_root)
    written = 0
    for relative, body in TEMPLATES.get(template, {}).items():
        target = root / relative
        target.parent.mkdir(parents=True, exist_ok=True)
        if target.exists():
            # Never overwrite a file the user already has.
            continue
        target.write_text(body, encoding="utf-8")
        written += 1
    return written
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# ── start ─────────────────────────────────────────────────────────────
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def start(
    format: str = typer.Option("human", "--format", "-f", help="Output format: human or json"),
    skip_init: bool = typer.Option(False, "--skip-init", help="Skip init if not set up"),
    template: str = typer.Option("", "--template", "-t", help="Starter template to install"),
) -> None:
    """Zero-config startup: one command → init → discover → run."""
    # Works relative to the current working directory. Exits with code 1 for
    # an unknown template and with code 0 when no spec files are discovered.
    project_root = os.getcwd()
    root = Path(project_root)
    human = format == "human"

    def say(message: str) -> None:
        # Progress output is only shown for the "human" format.
        if human:
            console.print(message)

    say("\n[bold cyan]🚀 evalgate start — zero-config evaluation run[/bold cyan]\n")

    # Step 1: write a minimal config unless one exists or init was skipped.
    config_path = root / ".evalgate" / "config.json"
    if not (config_path.exists() or skip_init):
        say("[yellow]📦 No config found. Initializing...[/yellow]")
        config_path.parent.mkdir(parents=True, exist_ok=True)
        default_config = {
            "version": 1,
            "project_name": root.name,
            "eval_dir": "eval",
            "baseline": ".evalgate/baseline.json",
        }
        config_path.write_text(json.dumps(default_config, indent=2))
        say("[green]✓ Initialized .evalgate/config.json[/green]")

    # Step 1b: install the requested starter template, if any.
    if template:
        if template not in TEMPLATES:
            # This error is printed regardless of the selected output format.
            console.print(f"[red]Unknown template: {template}[/red]")
            console.print(f"Available: {', '.join(TEMPLATES.keys())}")
            raise typer.Exit(1)
        count = _install_template(template, project_root)
        say(f"[green]✓ Installed {template} template ({count} file(s))[/green]")

    # Step 2: discover spec files.
    say("\n[cyan]🔍 Discovering specs...[/cyan]")

    from evalgate_sdk.runtime.execution_mode import get_execution_mode

    mode_config = get_execution_mode(project_root)
    spec_count = len(mode_config.spec_files)

    if human:
        console.print(f"[dim]Found {spec_count} spec file(s) in {mode_config.mode} mode[/dim]")

    if spec_count == 0:
        say("[yellow]No spec files found. Create eval files with define_eval() or use --template.[/yellow]")
        raise typer.Exit(0)

    say("\n[green]✓ Ready to run evaluations[/green]")
    say("[dim]Use 'evalgate run' to execute specs[/dim]")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ── watch ─────────────────────────────────────────────────────────────
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def watch(
    eval_dir: str = typer.Option("eval", "--eval-dir", "-e", help="Directory to watch"),
    debounce_ms: int = typer.Option(300, "--debounce", help="Debounce interval in ms"),
    clear_screen: bool = typer.Option(True, "--clear/--no-clear", help="Clear screen between runs"),
) -> None:
    """Watch mode — re-run evaluations when source files change."""
    # Polls the modification times of the non-underscore ``*.py`` files under
    # ``eval_dir`` (relative to the current working directory) every
    # ``debounce_ms`` milliseconds and reloads the spec files whenever the
    # snapshot changes. Exits with code 1 if the directory does not exist;
    # Ctrl+C stops the loop.
    import importlib.util

    project_root = os.getcwd()
    watch_dir = Path(project_root) / eval_dir

    if not watch_dir.exists():
        console.print(f"[red]Watch directory not found: {watch_dir}[/red]")
        raise typer.Exit(1)

    console.print(f"[cyan]👁️ Watching {watch_dir} (debounce: {debounce_ms}ms)[/cyan]")
    console.print("[dim]Press Ctrl+C to stop[/dim]\n")

    # time.sleep() raises ValueError for negative values, so clamp the interval.
    poll_interval = max(debounce_ms, 0) / 1000.0

    def _get_mtimes() -> dict[str, float]:
        """Snapshot path -> mtime for every watched spec file."""
        mtimes: dict[str, float] = {}
        for f in watch_dir.rglob("*.py"):
            if f.name.startswith("_"):
                continue
            # A file may disappear between listing and stat(); skip it then.
            with contextlib.suppress(OSError):
                mtimes[str(f)] = f.stat().st_mtime
        return mtimes

    def _run_specs() -> None:
        """Load every spec file into a fresh runtime and report what was found."""
        console.print(f"[cyan]▶ Running specs at {time.strftime('%H:%M:%S')}...[/cyan]")
        try:
            from evalgate_sdk.runtime.registry import create_eval_runtime

            handle = create_eval_runtime("watch-mode")
            try:
                for f in sorted(watch_dir.rglob("*.py")):
                    if f.name.startswith("_"):
                        continue
                    try:
                        spec = importlib.util.spec_from_file_location(f.stem, f)
                        if spec and spec.loader:
                            mod = importlib.util.module_from_spec(spec)
                            spec.loader.exec_module(mod)
                    except Exception as exc:
                        # One broken spec file must not stop the others from loading.
                        console.print(f"[red]Error loading {f.name}: {exc}[/red]")
                specs = handle.runtime.list()
                console.print(f"[green]✓ Discovered {len(specs)} spec(s)[/green]")
            finally:
                # Dispose the runtime even when listing or loading fails, so a
                # failed run does not leave it behind for the next one.
                handle.dispose()
        except Exception as exc:
            # Keep the watcher alive; report the failure and wait for the next change.
            console.print(f"[red]Run error: {exc}[/red]")

    # Initial run
    last_mtimes = _get_mtimes()
    _run_specs()

    try:
        while True:
            time.sleep(poll_interval)
            current = _get_mtimes()
            if current != last_mtimes:
                last_mtimes = current
                if clear_screen:
                    os.system("cls" if os.name == "nt" else "clear")
                _run_specs()
    except KeyboardInterrupt:
        console.print("\n[yellow]Watch mode stopped.[/yellow]")
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# ── compare ───────────────────────────────────────────────────────────
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def compare(
    files: list[str] = FILES_ARG,
    format: str = typer.Option("human", "--format", "-f", help="Output format"),
) -> None:
    """Compare evaluation result files."""
    # Shows the summary metrics of two or more runs side-by-side. With
    # ``--format json`` the loaded runs are emitted as ``{"runs": [...]}``
    # instead of a table. Exits with code 1 when fewer than two files are
    # given, a file is missing, or a file does not contain valid JSON.
    if len(files) < 2:
        console.print("[red]Need at least 2 result files to compare.[/red]")
        raise typer.Exit(1)

    runs: list[dict[str, Any]] = []
    for f in files:
        p = Path(f)
        if not p.exists():
            console.print(f"[red]File not found: {f}[/red]")
            raise typer.Exit(1)
        try:
            runs.append(json.loads(p.read_text(encoding="utf-8")))
        except json.JSONDecodeError as exc:
            # Report a malformed file like a missing one instead of a traceback.
            console.print(f"[red]Invalid JSON in {f}: {exc}[/red]")
            raise typer.Exit(1) from exc

    if format == "json":
        console.print_json(json.dumps({"runs": runs}))
        return

    table = Table(title="Run Comparison")
    table.add_column("Metric", style="cyan")
    for f in files:
        table.add_column(Path(f).stem, justify="right")

    # Extract common metrics
    metrics = ["total", "passed", "failed", "pass_rate", "average_score", "total_duration_ms"]
    for metric in metrics:
        row = [metric]
        for run_data in runs:
            # A run that is not an object, or has no usable summary, shows "-".
            summary = run_data.get("summary") if isinstance(run_data, dict) else None
            val = summary.get(metric, "-") if isinstance(summary, dict) else "-"
            if isinstance(val, float):
                row.append(f"{val:.2f}")
            else:
                row.append(str(val))
        table.add_row(*row)

    console.print(table)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# ── validate ──────────────────────────────────────────────────────────
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def validate(
    eval_dir: str = typer.Option("eval", "--eval-dir", "-e", help="Directory containing spec files"),
) -> None:
    """Validate spec files without running them."""
    # Imports every non-underscore ``*.py`` file under ``eval_dir`` (relative
    # to the current working directory) and collects import errors. Exits
    # with code 1 if the directory is missing or any file failed to load.
    import importlib.util

    project_root = os.getcwd()
    eval_path = Path(project_root) / eval_dir

    if not eval_path.exists():
        console.print(f"[red]Eval directory not found: {eval_path}[/red]")
        raise typer.Exit(1)

    from evalgate_sdk.runtime.registry import create_eval_runtime

    handle = create_eval_runtime("validate")
    errors: list[str] = []
    file_count = 0

    try:
        for spec_file in sorted(eval_path.rglob("*.py")):
            if spec_file.name.startswith("_"):
                continue
            file_count += 1
            try:
                spec = importlib.util.spec_from_file_location(spec_file.stem, spec_file)
                if spec and spec.loader:
                    mod = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(mod)
            except Exception as exc:
                # Record the failure and keep checking the remaining files.
                errors.append(f"{spec_file.name}: {exc}")

        specs = handle.runtime.list()
    finally:
        # Dispose the runtime even if discovery or listing raises.
        handle.dispose()

    if errors:
        console.print(f"\n[red]✗ {len(errors)} error(s) in {file_count} file(s):[/red]")
        for err in errors:
            console.print(f" [red]• {err}[/red]")
        raise typer.Exit(1)

    console.print(f"[green]✓ {len(specs)} spec(s) validated across {file_count} file(s)[/green]")
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# ── promote ───────────────────────────────────────────────────────────
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def promote(
    candidate_file: str = typer.Argument(..., help="Path to candidate results JSON"),
    baseline_path: str = typer.Option(".evalgate/baseline.json", "--baseline", "-b"),
    min_score: float = typer.Option(90.0, "--min-score", help="Minimum score to promote"),
) -> None:
    """Promote candidate eval cases to the regression baseline."""
    # Reads ``results`` from the candidate file and copies the score of every
    # result at or above ``min_score`` into the baseline's ``scores`` mapping,
    # keyed by test name. The baseline file is created if missing and other
    # baseline keys are kept. Exits with code 1 if the candidate file is missing.
    cp = Path(candidate_file)
    if not cp.exists():
        console.print(f"[red]Candidate file not found: {candidate_file}[/red]")
        raise typer.Exit(1)

    candidates = json.loads(cp.read_text(encoding="utf-8"))
    results = candidates.get("results", [])

    bp = Path(baseline_path)
    baseline: dict[str, Any] = {}
    if bp.exists():
        baseline = json.loads(bp.read_text(encoding="utf-8"))

    scores = baseline.get("scores", {})
    promoted = 0
    skipped = 0
    malformed = 0

    for r in results:
        if not isinstance(r, dict):
            malformed += 1
            continue
        # Both snake_case and camelCase result keys are accepted.
        name = r.get("test_name", r.get("testName", ""))
        score = r.get("score", 0)
        # A result without a name would land under the "" key, and a null or
        # non-numeric score cannot be compared against the threshold.
        numeric = isinstance(score, (int, float)) and not isinstance(score, bool)
        if not name or not numeric:
            malformed += 1
            continue
        if score >= min_score:
            scores[name] = score
            promoted += 1
        else:
            skipped += 1

    baseline["scores"] = scores
    bp.parent.mkdir(parents=True, exist_ok=True)
    bp.write_text(json.dumps(baseline, indent=2), encoding="utf-8")

    console.print(f"[green]✓ Promoted {promoted} case(s) to baseline[/green]")
    if skipped:
        console.print(f"[yellow]⚠ Skipped {skipped} case(s) below min score ({min_score})[/yellow]")
    if malformed:
        console.print(f"[yellow]⚠ Ignored {malformed} result(s) with no test name or a non-numeric score[/yellow]")
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
# ── replay ────────────────────────────────────────────────────────────
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def replay(
    result_file: str = typer.Argument(..., help="Path to previous run result JSON"),
    spec_name: str = typer.Option("", "--spec", "-s", help="Replay a specific spec by name"),
) -> None:
    """Replay a previous evaluation run or specific spec."""
    # Renders the stored results as a table; nothing is re-executed here.
    # Exits with code 1 if the result file is missing or ``--spec`` matches
    # no result.
    rp = Path(result_file)
    if not rp.exists():
        console.print(f"[red]Result file not found: {result_file}[/red]")
        raise typer.Exit(1)

    data = json.loads(rp.read_text(encoding="utf-8"))
    results = data.get("results", [])

    if spec_name:
        results = [r for r in results if r.get("test_name", r.get("testName", "")) == spec_name]
        if not results:
            console.print(f"[red]No spec named '{spec_name}' found in results.[/red]")
            raise typer.Exit(1)

    def _fmt(value: Any, spec: str) -> str:
        # Stored values may be null or strings; show "-" instead of raising
        # on a numeric format spec.
        if isinstance(value, (int, float)):
            return format(value, spec)
        return "-"

    table = Table(title="Replay Results")
    table.add_column("Spec", style="cyan")
    table.add_column("Score", justify="right")
    table.add_column("Status")
    table.add_column("Duration (ms)", justify="right")

    for r in results:
        # Both snake_case and camelCase result keys are accepted.
        name = r.get("test_name", r.get("testName", "?"))
        score = r.get("score", 0)
        passed = r.get("passed", r.get("pass", False))
        dur = r.get("duration_ms", r.get("durationMs", 0))
        status = "[green]✓ passed[/green]" if passed else "[red]✗ failed[/red]"
        # str(name) keeps a non-string name from breaking the table row.
        table.add_row(str(name), _fmt(score, ".1f"), status, _fmt(dur, ".0f"))

    console.print(table)
    console.print(f"\n[dim]{len(results)} result(s) replayed from {rp.name}[/dim]")
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Versioned policy packs for evalgate check --policy.
|
|
2
|
+
|
|
3
|
+
Schema: policy_id, version, thresholds, rationale, checks.
|
|
4
|
+
Usage: --policy HIPAA@1
|
|
5
|
+
|
|
6
|
+
Port of ``cli/policy-packs.ts``.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class PolicyPack:
    """One version of a named policy pack, as selected with ``--policy``."""

    # Policy name, e.g. "HIPAA".
    policy_id: str = ""
    # Version number of this pack; specs without a version resolve to 1.
    version: int = 1
    # Threshold settings; the built-in packs use "required_safety_rate"
    # and "max_flags".
    thresholds: dict[str, Any] = field(default_factory=dict)
    # Short explanation of why the pack exists.
    rationale: str = ""
    # Names of the checks this pack applies.
    checks: list[str] = field(default_factory=list)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _v1_only(
    policy_id: str,
    safety_rate: float,
    flags: list[str],
    rationale: str,
    checks: list[str],
) -> dict[int, PolicyPack]:
    """Build the ``{version: pack}`` mapping for a policy that only has version 1."""
    pack = PolicyPack(
        policy_id=policy_id,
        version=1,
        thresholds={"required_safety_rate": safety_rate, "max_flags": flags},
        rationale=rationale,
        checks=checks,
    )
    return {1: pack}


# Built-in policy packs: policy id -> version -> pack.
POLICY_PACKS: dict[str, dict[int, PolicyPack]] = {
    "HIPAA": _v1_only(
        "HIPAA",
        0.99,
        ["SAFETY_RISK"],
        "HIPAA requires high safety and no safety risks for PHI handling.",
        ["safety_rate", "no_safety_flags"],
    ),
    "SOC2": _v1_only(
        "SOC2",
        0.95,
        ["SAFETY_RISK", "LOW_PASS_RATE"],
        "SOC2 trust criteria for security and availability.",
        ["safety_rate", "flag_restrictions"],
    ),
    "GDPR": _v1_only(
        "GDPR",
        0.95,
        ["SAFETY_RISK"],
        "GDPR data protection and privacy requirements.",
        ["safety_rate", "no_safety_flags"],
    ),
    "PCI_DSS": _v1_only(
        "PCI_DSS",
        0.99,
        ["SAFETY_RISK", "LOW_PASS_RATE"],
        "PCI DSS cardholder data security standards.",
        ["safety_rate", "flag_restrictions"],
    ),
    "FINRA_4511": _v1_only(
        "FINRA_4511",
        0.95,
        ["SAFETY_RISK"],
        "FINRA 4511 supervisory control requirements.",
        ["safety_rate", "no_safety_flags"],
    ),
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def resolve_policy_pack(spec: str) -> PolicyPack | None:
    """Parse --policy flag (e.g. 'HIPAA@1' or 'HIPAA') and resolve to PolicyPack.

    Default version is 1 when omitted. The policy id is matched
    case-insensitively, and whitespace around the id or the version is ignored.

    Returns:
        The matching pack, or ``None`` when the id is unknown, the version is
        not an integer >= 1, or that version does not exist for the policy.
    """
    # Split on the first "@" only; anything after it is the version text, so
    # a spec like "HIPAA@1@2" fails integer parsing and resolves to None.
    name, at, version_text = spec.partition("@")
    policy_id = name.strip().upper()

    if at:
        try:
            version = int(version_text)
        except ValueError:
            return None
        if version < 1:
            return None
    else:
        version = 1

    versions = POLICY_PACKS.get(policy_id)
    if not versions:
        return None
    return versions.get(version)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def get_valid_policy_versions() -> list[str]:
    """Return every registered ``policy@version`` spec, sorted, for error messages."""
    return sorted(
        f"{name}@{number}"
        for name, by_version in POLICY_PACKS.items()
        for number in by_version
    )
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Gate profile presets.
|
|
2
|
+
|
|
3
|
+
Extracted to avoid typer dependency in config.py imports.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
# Gate presets keyed by profile name. Every profile carries the same five
# settings: min_score, max_drop, warn_drop, min_n and allow_weak_evidence.
PROFILES = {
    "strict": {
        "min_score": 95,
        "max_drop": 0,
        "warn_drop": 0,
        "min_n": 30,
        "allow_weak_evidence": False,
    },
    "balanced": {
        "min_score": 90,
        "max_drop": 2,
        "warn_drop": 1,
        "min_n": 10,
        "allow_weak_evidence": False,
    },
    "fast": {
        "min_score": 85,
        "max_drop": 5,
        "warn_drop": 2,
        "min_n": 5,
        "allow_weak_evidence": True,
    },
}
|