aicert-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aicert/cli.py ADDED
@@ -0,0 +1,1423 @@
1
+ """CLI interface for aicert."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ import typer
12
+ from rich.console import Console
13
+ from rich.table import Table
14
+ from rich.text import Text
15
+ from rich.style import Style
16
+
17
+ app = typer.Typer(
18
+ name="aicert",
19
+ help="CI for LLM JSON outputs - Validate, test, and measure LLM outputs",
20
+ add_completion=False,
21
+ )
22
+ console = Console()
23
+
24
+
25
+ def _load_config_and_schema(config_path: str):
26
+ """Load configuration and schema, resolving paths relative to config file.
27
+
28
+ Returns:
29
+ Tuple of (config_obj, schema, prompt_hash, schema_hash)
30
+ """
31
+ from aicert.config import Config, ConfigLoadError, load_config
32
+ from aicert.validation import load_json_schema, SchemaLoadError
33
+ from aicert.hashing import sha256_file
34
+
35
+ try:
36
+ config_obj = load_config(config_path)
37
+ config_obj._config_path = config_path
38
+ except ConfigLoadError as e:
39
+ console.print(e)
40
+ sys.exit(3)
41
+
42
+ # Resolve paths relative to config file
43
+ config_dir = Path(config_path).parent
44
+ schema_file = config_dir / config_obj.schema_file if not Path(config_obj.schema_file).is_absolute() else Path(config_obj.schema_file)
45
+ prompt_file = config_dir / config_obj.prompt_file if not Path(config_obj.prompt_file).is_absolute() else Path(config_obj.prompt_file)
46
+
47
+ try:
48
+ schema = load_json_schema(str(schema_file))
49
+ except SchemaLoadError as e:
50
+ console.print(e)
51
+ sys.exit(3)
52
+
53
+ # Compute hashes
54
+ prompt_hash = sha256_file(prompt_file)
55
+ schema_hash = sha256_file(schema_file)
56
+
57
+ return config_obj, schema, prompt_hash, schema_hash
58
+
59
+
60
+ def _should_store(no_store: bool, save_on_fail: bool, failed: bool) -> bool:
61
+ """Determine if artifacts should be stored."""
62
+ if no_store:
63
+ return False
64
+ if save_on_fail and failed:
65
+ return True
66
+ if not save_on_fail:
67
+ return True # Save on success too
68
+ return False
69
+
70
+
71
+ def _print_stability_summary(summary: dict, title: str = "Stability Results"):
72
+ """Print per-provider stability summary using rich tables."""
73
+ console.print(f"\n[bold]{title}[/bold]")
74
+
75
+ table = Table(show_header=True, header_style="bold magenta")
76
+ table.add_column("Provider", style="cyan")
77
+ table.add_column("Runs", justify="right")
78
+ table.add_column("Stability", justify="right")
79
+ table.add_column("Compliance", justify="right")
80
+ table.add_column("JSON Parse Fail", justify="right")
81
+ table.add_column("Schema Fail", justify="right")
82
+ table.add_column("Provider Err", justify="right")
83
+ table.add_column("Timeouts", justify="right")
84
+ table.add_column("Latency P95", justify="right")
85
+ table.add_column("Cost ($)", justify="right")
86
+
87
+ for provider_id, metrics in summary["per_provider"].items():
88
+ stability = f"{metrics['stability_score']:.1f}%"
89
+ compliance = f"{metrics['schema_compliance']:.1f}%"
90
+ latency_p95 = f"{metrics['latency_stats']['p95']:.0f}ms"
91
+ cost = f"${metrics['total_cost_usd']:.4f}"
92
+
93
+ # Error counts
94
+ json_parse_fail = metrics.get("json_parse_failures", 0)
95
+ schema_fail = metrics.get("schema_failures", 0)
96
+ provider_err = metrics.get("provider_errors", 0)
97
+ timeouts = metrics.get("timeouts", 0)
98
+
99
+ # Color code stability
100
+ if metrics['stability_score'] >= 85:
101
+ stability_text = Text(stability, style="green")
102
+ elif metrics['stability_score'] >= 70:
103
+ stability_text = Text(stability, style="yellow")
104
+ else:
105
+ stability_text = Text(stability, style="red")
106
+
107
+ # Color code error counts
108
+ json_fail_text = Text(str(json_parse_fail), style="red" if json_parse_fail > 0 else "dim")
109
+ schema_fail_text = Text(str(schema_fail), style="red" if schema_fail > 0 else "dim")
110
+ provider_err_text = Text(str(provider_err), style="red" if provider_err > 0 else "dim")
111
+ timeout_text = Text(str(timeouts), style="red" if timeouts > 0 else "dim")
112
+
113
+ table.add_row(
114
+ provider_id,
115
+ str(metrics["total_runs"]),
116
+ stability_text,
117
+ compliance,
118
+ json_fail_text,
119
+ schema_fail_text,
120
+ provider_err_text,
121
+ timeout_text,
122
+ latency_p95,
123
+ cost
124
+ )
125
+
126
+ console.print(table)
127
+
128
+ # Overall summary
129
+ overall = summary["overall"]
130
+ console.print(f"\n[bold]Overall:[/bold] {overall['total_runs']} runs, "
131
+ f"Stability: {overall['stability_score']:.1f}%, "
132
+ f"Compliance: {overall['schema_compliance']:.1f}%, "
133
+ f"JSON Parse Fail: {overall.get('json_parse_failures', 0)}, "
134
+ f"Schema Fail: {overall.get('schema_failures', 0)}, "
135
+ f"Provider Err: {overall.get('provider_errors', 0)}, "
136
+ f"Timeouts: {overall.get('timeouts', 0)}, "
137
+ f"Cost: ${overall['total_cost_usd']:.4f}")
138
+
139
+
140
+ def _print_ranked_table(summary: dict) -> None:
141
+ """Print ranked comparison table."""
142
+ console.print("\n[bold]Provider Comparison (Ranked by Stability > Compliance > Cost)[/bold]")
143
+
144
+ # Sort providers by stability, then compliance, then cost
145
+ providers = []
146
+ for provider_id, metrics in summary["per_provider"].items():
147
+ providers.append({
148
+ "id": provider_id,
149
+ "stability": metrics["stability_score"],
150
+ "compliance": metrics["schema_compliance"],
151
+ "cost": metrics["total_cost_usd"],
152
+ "runs": metrics["total_runs"],
153
+ "latency_p95": metrics["latency_stats"]["p95"],
154
+ })
155
+
156
+ # Sort: higher stability is better, higher compliance is better, lower cost is better
157
+ providers.sort(key=lambda x: (x["stability"], x["compliance"], -x["cost"]), reverse=True)
158
+
159
+ table = Table(show_header=True, header_style="bold magenta")
160
+ table.add_column("Rank", justify="center", style="bold")
161
+ table.add_column("Provider", style="cyan")
162
+ table.add_column("Stability", justify="right")
163
+ table.add_column("Compliance", justify="right")
164
+ table.add_column("Cost ($)", justify="right")
165
+ table.add_column("Latency P95", justify="right")
166
+
167
+ for rank, p in enumerate(providers, 1):
168
+ stability_text = Text(f"{p['stability']:.1f}%", style="green" if p['stability'] >= 85 else "yellow" if p['stability'] >= 70 else "red")
169
+ compliance_text = Text(f"{p['compliance']:.1f}%", style="green" if p['compliance'] >= 95 else "yellow")
170
+
171
+ rank_str = Text(str(rank), style="bold cyan" if rank == 1 else "dim")
172
+ table.add_row(rank_str, p["id"], stability_text, compliance_text, f"${p['cost']:.4f}", f"{p['latency_p95']:.0f}ms")
173
+
174
+ console.print(table)
175
+
176
+
177
+ def _evaluate_thresholds(summary: dict, thresholds: dict) -> tuple[bool, list[str]]:
178
+ """Evaluate thresholds and return (passed, failures)."""
179
+ failures = []
180
+
181
+ overall = summary["overall"]
182
+
183
+ # Check min_stability
184
+ if thresholds.get("min_stability") is not None:
185
+ if overall["stability_score"] < thresholds["min_stability"]:
186
+ failures.append(f"Stability {overall['stability_score']:.1f}% < {thresholds['min_stability']}% threshold")
187
+
188
+ # Check min_compliance
189
+ if thresholds.get("min_compliance") is not None:
190
+ if overall["schema_compliance"] < thresholds["min_compliance"]:
191
+ failures.append(f"Compliance {overall['schema_compliance']:.1f}% < {thresholds['min_compliance']}% threshold")
192
+
193
+ # Check max_cost_usd
194
+ if thresholds.get("max_cost_usd") is not None:
195
+ if overall["total_cost_usd"] > thresholds["max_cost_usd"]:
196
+ failures.append(f"Cost ${overall['total_cost_usd']:.4f} > ${thresholds['max_cost_usd']:.2f} threshold")
197
+
198
+ # Check p95_latency_ms
199
+ if thresholds.get("p95_latency_ms") is not None:
200
+ p95 = overall["latency_stats"]["p95"]
201
+ if p95 > thresholds["p95_latency_ms"]:
202
+ failures.append(f"P95 latency {p95:.0f}ms > {thresholds['p95_latency_ms']}ms threshold")
203
+
204
+ return len(failures) == 0, failures
205
+
206
+
207
+ def _get_failed_thresholds(summary: dict, thresholds: dict) -> list[str]:
208
+ """Get list of threshold names that failed."""
209
+ failed = []
210
+ overall = summary["overall"]
211
+
212
+ if thresholds.get("min_stability") is not None and overall["stability_score"] < thresholds["min_stability"]:
213
+ failed.append("min_stability")
214
+ if thresholds.get("min_compliance") is not None and overall["schema_compliance"] < thresholds["min_compliance"]:
215
+ failed.append("min_compliance")
216
+ if thresholds.get("max_cost_usd") is not None and overall["total_cost_usd"] > thresholds["max_cost_usd"]:
217
+ failed.append("max_cost_usd")
218
+ if thresholds.get("p95_latency_ms") is not None:
219
+ p95 = overall["latency_stats"]["p95"]
220
+ if p95 > thresholds["p95_latency_ms"]:
221
+ failed.append("p95_latency_ms")
222
+
223
+ return failed
224
+
225
+
226
+ def _get_example_failures(results: list[dict], max_examples: int = 3) -> list[dict]:
227
+ """Extract example failures from results for actionable debugging output.
228
+
229
+ Args:
230
+ results: List of result dictionaries from execute_case.
231
+ max_examples: Maximum number of examples to return.
232
+
233
+ Returns:
234
+ List of up to max_examples failure details with case_id and error info.
235
+ """
236
+ examples = []
237
+
238
+ for result in results:
239
+ if not result.get("ok_schema", True):
240
+ case_id = result.get("case_id", "unknown")
241
+
242
+ # Determine failure type and details
243
+ if result.get("error"):
244
+ error = result["error"]
245
+ # Schema validation failure
246
+ if "Extra keys" in error:
247
+ extra_keys = result.get("extra_keys", [])
248
+ examples.append({
249
+ "case_id": case_id,
250
+ "type": "schema",
251
+ "detail": f"extra keys: {', '.join(extra_keys)}" if extra_keys else error,
252
+ })
253
+ elif "missing required field" in error.lower() or "was of type" in error.lower():
254
+ examples.append({
255
+ "case_id": case_id,
256
+ "type": "schema",
257
+ "detail": error,
258
+ })
259
+ else:
260
+ examples.append({
261
+ "case_id": case_id,
262
+ "type": "schema",
263
+ "detail": error,
264
+ })
265
+ elif not result.get("ok_json", True):
266
+ # JSON parse failure
267
+ content = result.get("content", "")
268
+ truncated = content[:200] + "..." if len(content) > 200 else content
269
+ examples.append({
270
+ "case_id": case_id,
271
+ "type": "json_parse",
272
+ "detail": truncated,
273
+ })
274
+ else:
275
+ # Provider error or timeout
276
+ error = result.get("error", "unknown error")
277
+ examples.append({
278
+ "case_id": case_id,
279
+ "type": "provider_error",
280
+ "detail": error,
281
+ })
282
+
283
+ if len(examples) >= max_examples:
284
+ break
285
+
286
+ return examples
287
+
288
+
289
+ def _escape_xml(text: str) -> str:
290
+ """Escape special XML characters."""
291
+ return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;").replace("'", "&apos;")
292
+
293
+
294
+ def _build_junit_output(
295
+ project: str,
296
+ providers: list[str],
297
+ thresholds: dict,
298
+ passed: bool,
299
+ summary: dict,
300
+ ) -> str:
301
+ """Build JUnit XML output for CI mode.
302
+
303
+ One <testsuite> per provider with tests for:
304
+ - stability threshold
305
+ - compliance threshold
306
+ - cost regression (if applicable)
307
+ - latency regression (if applicable)
308
+ """
309
+ import datetime
310
+
311
+ timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
312
+
313
+ xml_parts = []
314
+
315
+ for provider_id in providers:
316
+ metrics = summary["per_provider"].get(provider_id, {})
317
+
318
+ # Count tests and failures for this provider
319
+ tests = 0
320
+ failures = 0
321
+ test_cases = []
322
+
323
+ # Stability threshold test
324
+ tests += 1
325
+ stability_score = metrics.get("stability_score", 0)
326
+ min_stability = thresholds.get("min_stability")
327
+ if min_stability is not None:
328
+ if stability_score < min_stability:
329
+ failures += 1
330
+ message = f"Stability {stability_score:.1f}% < {min_stability}% threshold"
331
+ test_cases.append(
332
+ f'<testcase name="stability_threshold" classname="{_escape_xml(provider_id)}"><failure message="{_escape_xml(message)}">{_escape_xml(message)}</failure></testcase>'
333
+ )
334
+ else:
335
+ test_cases.append(
336
+ f'<testcase name="stability_threshold" classname="{_escape_xml(provider_id)}"/>'
337
+ )
338
+ else:
339
+ test_cases.append(
340
+ f'<testcase name="stability_threshold" classname="{_escape_xml(provider_id)}"/>'
341
+ )
342
+
343
+ # Compliance threshold test
344
+ tests += 1
345
+ compliance_score = metrics.get("schema_compliance", 0)
346
+ min_compliance = thresholds.get("min_compliance")
347
+ if min_compliance is not None:
348
+ if compliance_score < min_compliance:
349
+ failures += 1
350
+ message = f"Compliance {compliance_score:.1f}% < {min_compliance}% threshold"
351
+ test_cases.append(
352
+ f'<testcase name="compliance_threshold" classname="{_escape_xml(provider_id)}"><failure message="{_escape_xml(message)}">{_escape_xml(message)}</failure></testcase>'
353
+ )
354
+ else:
355
+ test_cases.append(
356
+ f'<testcase name="compliance_threshold" classname="{_escape_xml(provider_id)}"/>'
357
+ )
358
+ else:
359
+ test_cases.append(
360
+ f'<testcase name="compliance_threshold" classname="{_escape_xml(provider_id)}"/>'
361
+ )
362
+
363
+ # Cost regression test (if max_cost_usd threshold is set)
364
+ max_cost = thresholds.get("max_cost_usd")
365
+ if max_cost is not None:
366
+ tests += 1
367
+ cost_usd = metrics.get("total_cost_usd", 0)
368
+ if cost_usd > max_cost:
369
+ failures += 1
370
+ message = f"Cost ${cost_usd:.4f} > ${max_cost:.2f} threshold"
371
+ test_cases.append(
372
+ f'<testcase name="cost_regression" classname="{_escape_xml(provider_id)}"><failure message="{_escape_xml(message)}">{_escape_xml(message)}</failure></testcase>'
373
+ )
374
+ else:
375
+ test_cases.append(
376
+ f'<testcase name="cost_regression" classname="{_escape_xml(provider_id)}"/>'
377
+ )
378
+
379
+ # Latency regression test (if p95_latency_ms threshold is set)
380
+ p95_latency = thresholds.get("p95_latency_ms")
381
+ if p95_latency is not None:
382
+ tests += 1
383
+ latency_p95 = metrics.get("latency_stats", {}).get("p95", 0)
384
+ if latency_p95 > p95_latency:
385
+ failures += 1
386
+ message = f"P95 latency {latency_p95:.0f}ms > {p95_latency}ms threshold"
387
+ test_cases.append(
388
+ f'<testcase name="latency_regression" classname="{_escape_xml(provider_id)}"><failure message="{_escape_xml(message)}">{_escape_xml(message)}</failure></testcase>'
389
+ )
390
+ else:
391
+ test_cases.append(
392
+ f'<testcase name="latency_regression" classname="{_escape_xml(provider_id)}"/>'
393
+ )
394
+
395
+ # Build testsuite element for this provider
396
+ xml_parts.append(
397
+ f'<testsuite name="{_escape_xml(provider_id)}" tests="{tests}" failures="{failures}" timestamp="{timestamp}">\n' +
398
+ "\n".join(test_cases) +
399
+ f'\n</testsuite>'
400
+ )
401
+
402
+ # Return complete JUnit XML
403
+ return '<?xml version="1.0" encoding="UTF-8"?>\n' + "\n".join(xml_parts)
404
+
405
+
406
+ def _print_example_failures(examples: list[dict]) -> None:
407
+ """Print example failures in a concise format.
408
+
409
+ Args:
410
+ examples: List of failure examples from _get_example_failures.
411
+ """
412
+ if not examples:
413
+ return
414
+
415
+ console.print("\n[bold]Example failures:[/bold]")
416
+
417
+ for ex in examples:
418
+ case_id = ex["case_id"]
419
+ failure_type = ex["type"]
420
+ detail = ex["detail"]
421
+
422
+ if failure_type == "schema":
423
+ console.print(f" • {case_id}: schema validation - {detail}")
424
+ elif failure_type == "json_parse":
425
+ console.print(f" • {case_id}: invalid JSON - \"{detail}\"")
426
+ elif failure_type == "provider_error":
427
+ console.print(f" • {case_id}: provider error - {detail}")
428
+
429
+
430
+ def _build_json_output(
431
+ project: str,
432
+ run_id: Optional[str],
433
+ providers: list[str],
434
+ thresholds: dict,
435
+ passed: bool,
436
+ summary: dict,
437
+ ) -> dict:
438
+ """Build JSON output for CI mode."""
439
+ per_provider = {}
440
+ for provider_id, metrics in summary["per_provider"].items():
441
+ per_provider[provider_id] = {
442
+ "stability": round(metrics["stability_score"], 2),
443
+ "compliance": round(metrics["schema_compliance"], 2),
444
+ "latency_stats": {
445
+ "mean_ms": round(metrics["latency_stats"]["mean"], 2),
446
+ "p95_ms": round(metrics["latency_stats"]["p95"], 2),
447
+ "std_ms": round(metrics["latency_stats"]["std"], 2),
448
+ },
449
+ "error_counts": {
450
+ "json_parse_failures": metrics.get("json_parse_failures", 0),
451
+ "schema_failures": metrics.get("schema_failures", 0),
452
+ "provider_errors": metrics.get("provider_errors", 0),
453
+ "timeouts": metrics.get("timeouts", 0),
454
+ },
455
+ "total_runs": metrics["total_runs"],
456
+ "total_cost_usd": round(metrics["total_cost_usd"], 6),
457
+ }
458
+
459
+ result = {
460
+ "project": project,
461
+ "providers": providers,
462
+ "thresholds": thresholds,
463
+ "pass": passed,
464
+ "per_provider": per_provider,
465
+ }
466
+
467
+ if run_id is not None:
468
+ result["run_id"] = run_id
469
+
470
+ return result
471
+
472
+
473
+ def _print_ci_summary(summary: dict, thresholds: dict, passed: bool, failures: list[str], run_dir: Optional[Path] = None, results: Optional[list[dict]] = None) -> None:
474
+ """Print CI-friendly summary output.
475
+
476
+ On pass: prints concise per-provider line with stability, compliance, p95 latency, cost, and PASS.
477
+ On fail: prints failed thresholds, per-provider summary with FAIL, example failures, and artifacts path.
478
+
479
+ Args:
480
+ summary: Summary dictionary with per_provider metrics.
481
+ thresholds: Thresholds dictionary.
482
+ passed: Whether all thresholds passed.
483
+ failures: List of threshold failure descriptions.
484
+ run_dir: Optional run directory path.
485
+ results: Optional list of result dictionaries for printing example failures.
486
+ """
487
+ if passed:
488
+ console.print("\n[bold]CI Results[/bold]")
489
+ for provider_id, metrics in summary["per_provider"].items():
490
+ stability = f"{metrics['stability_score']:.1f}%"
491
+ compliance = f"{metrics['schema_compliance']:.1f}%"
492
+ latency_p95 = f"{metrics['latency_stats']['p95']:.0f}ms"
493
+ cost = f"${metrics['total_cost_usd']:.4f}" if metrics['total_cost_usd'] > 0 else "$0.0000"
494
+ console.print(f"{provider_id}: stability={stability} compliance={compliance} p95_latency={latency_p95} cost={cost} PASS")
495
+ else:
496
+ failed_thresholds = _get_failed_thresholds(summary, thresholds)
497
+ console.print("\n[bold red]CI Results[/bold red]")
498
+ console.print(f"Failed thresholds: {', '.join(failed_thresholds)}")
499
+ console.print("\n[bold]Per-provider summary:[/bold]")
500
+ for provider_id, metrics in summary["per_provider"].items():
501
+ stability = f"{metrics['stability_score']:.1f}%"
502
+ compliance = f"{metrics['schema_compliance']:.1f}%"
503
+ latency_p95 = f"{metrics['latency_stats']['p95']:.0f}ms"
504
+ cost = f"${metrics['total_cost_usd']:.4f}" if metrics['total_cost_usd'] > 0 else "$0.0000"
505
+ console.print(f"{provider_id}: stability={stability} compliance={compliance} p95_latency={latency_p95} cost={cost} FAIL")
506
+
507
+ # Print example failures if results are provided
508
+ if results:
509
+ examples = _get_example_failures(results)
510
+ if examples:
511
+ _print_example_failures(examples)
512
+
513
+ if run_dir:
514
+ console.print(f"\nArtifacts saved to: {run_dir}")
515
+
516
+
517
+ @app.command()
518
+ def doctor(
519
+ config: str = typer.Argument(..., help="Path to configuration file"),
520
+ check_connectivity: bool = typer.Option(
521
+ False, "--check-connectivity", help="Check connectivity for openai_compatible providers"
522
+ ),
523
+ ) -> None:
524
+ """Validate installation, configuration, and provider readiness.
525
+
526
+ Performs comprehensive checks without making paid API calls.
527
+ Exit code 0 if all checks pass, 1 otherwise.
528
+ """
529
+ from aicert.config import ConfigLoadError
530
+ from aicert.doctor import run_doctor
531
+
532
+ console.print("[bold]aicert doctor[/bold] - Validating installation and configuration")
533
+
534
+ try:
535
+ exit_code, failed_count = run_doctor(config, check_connectivity_flag=check_connectivity)
536
+ except ConfigLoadError as e:
537
+ # Print error to stderr and exit with code 3
538
+ from rich.console import Console
539
+ stderr_console = Console(file=sys.stderr)
540
+ stderr_console.print(e)
541
+ sys.exit(3)
542
+
543
+ sys.exit(exit_code)
544
+
545
+
546
+ @app.command()
547
+ def init(
548
+ force: bool = typer.Option(
549
+ False, "--force", "-f", help="Overwrite existing files"
550
+ ),
551
+ ) -> None:
552
+ """Initialize a new aicert project scaffold in the current directory.
553
+
554
+ Creates aicert.yaml, prompt.txt, cases.jsonl, schema.json, and aicert_baselines/ directory.
555
+ Does not overwrite existing files unless --force is passed.
556
+ """
557
+ import os
558
+
559
+ console.print("[bold]aicert init[/bold] - Initializing new project scaffold")
560
+
561
+ # Get current directory name for project name
562
+ cwd = Path.cwd()
563
+ project_name = cwd.name or "my-project"
564
+
565
+ # Files to create
566
+ files_to_create = {
567
+ "aicert.yaml": f'''# aicert configuration file
568
+ # Project: {project_name}
569
+ #
570
+ # To use with real providers (OpenAI, Anthropic), change the provider below
571
+ # and set your API key: export OPENAI_API_KEY="your-key"
572
+
573
+ project: {project_name}
574
+
575
+ # Fake adapter for testing - produces deterministic JSON output
576
+ # For real testing, change to: provider: openai, model: gpt-4
577
+ providers:
578
+ - id: fake-test
579
+ provider: fake
580
+ model: fake-model
581
+ temperature: 0.1
582
+
583
+ prompt_file: prompt.txt
584
+ cases_file: cases.jsonl
585
+ schema_file: schema.json
586
+
587
+ runs: 10
588
+ concurrency: 5
589
+ timeout_s: 30
590
+
591
+ validation:
592
+ extract_json: true
593
+ allow_extra_keys: false
594
+
595
+ thresholds:
596
+ min_stability: 85
597
+ min_compliance: 95
598
+
599
+ ci:
600
+ runs: 10
601
+ save_on_fail: true
602
+ ''',
603
+ "prompt.txt": '''# Prompt for LLM
604
+ # Output JSON only, matching the schema
605
+
606
+ User's request: $request
607
+ ''',
608
+ "cases.jsonl": '''{"name": "case_1", "request": "Classify the sentiment of this text: I love aicert!", "variables": {"request": "Classify the sentiment of this text: I love aicert!"}}
609
+ {"name": "case_2", "request": "Classify the sentiment of this text: aicert is okay.", "variables": {"request": "Classify the sentiment of this text: aicert is okay."}}
610
+ {"name": "case_3", "request": "Classify the sentiment of this text: I hate bugs.", "variables": {"request": "Classify the sentiment of this text: I hate bugs."}}
611
+ ''',
612
+ "schema.json": '''{
613
+ "$schema": "http://json-schema.org/draft-07/schema#",
614
+ "type": "object",
615
+ "properties": {
616
+ "label": {
617
+ "type": "string",
618
+ "description": "The classification label"
619
+ },
620
+ "confidence": {
621
+ "type": "number",
622
+ "minimum": 0,
623
+ "maximum": 1,
624
+ "description": "Confidence score between 0 and 1"
625
+ }
626
+ },
627
+ "required": ["label", "confidence"],
628
+ "additionalProperties": false
629
+ }
630
+ ''',
631
+ }
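+ # For reference, an output like {"label": "positive", "confidence": 0.95} satisfies the
+ # scaffolded schema.json above; any extra key fails validation because
+ # "additionalProperties" is false. (Illustrative values only.)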
632
+
633
+ # Track what was created/skipped
634
+ created = []
635
+ skipped = []
636
+
637
+ for filename, content in files_to_create.items():
638
+ file_path = cwd / filename
639
+
640
+ if file_path.exists():
641
+ if force:
642
+ file_path.write_text(content)
643
+ created.append(filename)
644
+ console.print(f" [yellow]Overwrote:[/yellow] {filename}")
645
+ else:
646
+ skipped.append(filename)
647
+ console.print(f" [dim]Skipped (exists):[/dim] {filename}")
648
+ else:
649
+ file_path.write_text(content)
650
+ created.append(filename)
651
+ console.print(f" [green]Created:[/green] {filename}")
652
+
653
+ # Create aicert_baselines/ directory
654
+ baselines_dir = cwd / "aicert_baselines"
655
+ if baselines_dir.exists():
656
+ if force:
657
+ console.print(f" [yellow]Kept (exists):[/yellow] aicert_baselines/")
658
+ else:
659
+ console.print(f" [dim]Skipped (exists):[/dim] aicert_baselines/")
660
+ else:
661
+ baselines_dir.mkdir(exist_ok=True)
662
+ created.append("aicert_baselines/")
663
+ console.print(f" [green]Created:[/green] aicert_baselines/")
664
+
665
+ # Summary
666
+ console.print(f"\n[bold]Summary:[/bold] {len(created)} created, {len(skipped)} skipped")
667
+
668
+ if created:
669
+ console.print(f"Project '{project_name}' is ready. Run 'aicert stability' to test!")
670
+
671
+
672
+ @app.command()
673
+ def run(
674
+ config: str = typer.Argument(default="aicert.yaml", help="Path to configuration file"),
675
+ output: Optional[str] = typer.Option(None, "-o", "--out", help="Output directory for artifacts"),
676
+ no_store: bool = typer.Option(False, "--no-store", help="Don't save artifacts"),
677
+ provider: Optional[str] = typer.Option(None, "-p", "--provider", help="Override provider (e.g., 'fake' for testing)"),
678
+ extract_json: Optional[bool] = typer.Option(None, "--extract-json/--no-extract-json", help="Override extract_json setting"),
679
+ concurrency: Optional[int] = typer.Option(None, "--concurrency", help="Override number of concurrent requests"),
680
+ ) -> None:
681
+ """Run test cases once per case per provider (runs=1 override).
682
+
683
+ Exits with code 1 if any ok_schema is false.
684
+ """
685
+ from aicert.config import Config, ProviderConfig, ConfigLoadError, load_config
686
+ from aicert.metrics import compute_summary
687
+ from aicert.runner import run_suite
688
+
689
+ console.print("[bold]aicert run[/bold] - Running test cases (1 run per case)")
690
+
691
+ try:
692
+ config_obj, schema, prompt_hash, schema_hash = _load_config_and_schema(config)
693
+ except SystemExit:
694
+ raise
695
+
696
+ # Override runs=1 for run command
697
+ original_runs = config_obj.runs
698
+ config_obj.runs = 1
699
+
700
+ # Override extract_json if specified
701
+ if extract_json is not None:
702
+ config_obj.validation.extract_json = extract_json
703
+
704
+ # Override concurrency if specified
705
+ if concurrency is not None:
706
+ config_obj.concurrency = concurrency
707
+
708
+ # Override provider if specified
709
+ if provider:
710
+ typer.echo(f"Using provider override: {provider}")
711
+ config_obj.providers = [
712
+ ProviderConfig(
713
+ id=provider,
714
+ provider="fake",
715
+ model=f"fake-{provider}",
716
+ temperature=0.1,
717
+ )
718
+ ]
719
+
720
+ try:
721
+ # Determine output directory
722
+ output_dir = output
723
+ save_on_fail = True # Always save on fail for run command
724
+
725
+ # Run the suite
726
+ typer.echo(f"Running with {len(config_obj.providers)} provider(s), 1 run per case")
727
+ results = asyncio.run(run_suite(config_obj, output_dir=output_dir))
728
+
729
+ # Compute summary
730
+ summary = compute_summary(results, schema, prompt_hash=prompt_hash, schema_hash=schema_hash)
731
+
732
+ # Check for any schema failures
733
+ schema_failures = [r for r in results if not r.get("ok_schema", False)]
734
+
735
+ # Print brief summary
736
+ console.print(f"\n[bold]Results:[/bold] {summary['overall']['total_runs']} runs")
737
+ for provider_id, metrics in summary["per_provider"].items():
738
+ failures = sum(1 for r in results if r.get("provider_id") == provider_id and not r.get("ok_schema", False))
739
+ status = "✓" if failures == 0 else "✗"
740
+ console.print(f" {status} {provider_id}: {metrics['schema_compliance']:.0f}% compliance ({failures} failures)")
741
+
742
+ # Print artifact path if any failures
743
+ if schema_failures:
744
+ console.print(f"\n[bold red]Schema failures: {len(schema_failures)}[/bold red]")
745
+ for r in schema_failures[:5]: # Show first 5
746
+ console.print(f" - {r.get('provider_id')} / {r.get('case_id')}: {r.get('error', 'schema validation failed')}")
747
+ if len(schema_failures) > 5:
748
+ console.print(f" ... and {len(schema_failures) - 5} more")
749
+
750
+ # Find run directory
751
+ from aicert.artifacts import create_run_dir
752
+ run_dir = create_run_dir(output_dir)
753
+ console.print(f"\n[bold]Artifacts saved to:[/bold] {run_dir}")
754
+ sys.exit(1)
755
+ else:
756
+ console.print("\n[bold green]All schema validations passed[/bold green]")
757
+ sys.exit(0)
758
+
759
+ except Exception as e:
760
+ console.print(f"[bold red]Error:[/bold red] {str(e)}")
761
+ sys.exit(1)
762
+ finally:
763
+ # Restore original runs
764
+ config_obj.runs = original_runs
765
+
766
+
767
+ @app.command()
768
+ def stability(
769
+ config: str = typer.Argument(default="aicert.yaml", help="Path to configuration file"),
770
+ output: Optional[str] = typer.Option(None, "-o", "--out", help="Output directory for artifacts"),
771
+ no_store: bool = typer.Option(False, "--no-store", help="Don't save artifacts"),
772
+ provider: Optional[str] = typer.Option(None, "-p", "--provider", help="Override provider (e.g., 'fake' for testing)"),
773
+ extract_json: Optional[bool] = typer.Option(None, "--extract-json/--no-extract-json", help="Override extract_json setting"),
774
+ concurrency: Optional[int] = typer.Option(None, "--concurrency", help="Override number of concurrent requests"),
775
+ dry_run: bool = typer.Option(False, "--dry-run", help="Validate config and print execution plan without running"),
776
+ format: str = typer.Option("text", "--format", help="Output format: text or json", case_sensitive=False),
777
+ ) -> None:
778
+ """Run stability tests with config.runs per case.
779
+
780
+ Prints per-provider stability summary.
781
+ """
782
+ from aicert.config import ProviderConfig, ConfigLoadError, load_config
783
+ from aicert.metrics import compute_summary
784
+ from aicert.runner import run_suite
785
+ from aicert.doctor import run_doctor, load_cases, print_dry_run_plan
786
+
787
+ # Only print header for text mode
788
+ if format == "text":
789
+ console.print("[bold]aicert stability[/bold] - Running stability tests")
790
+
791
+ # Dry-run mode
792
+ if dry_run:
793
+ # Suppress doctor output when format is json
794
+ import io
795
+ import contextlib
796
+
797
+ if format == "json":
798
+ # Capture and discard doctor output
799
+ with contextlib.redirect_stdout(io.StringIO()) as f:
800
+ exit_code, _ = run_doctor(config, check_connectivity_flag=False)
801
+ else:
802
+ exit_code, _ = run_doctor(config, check_connectivity_flag=False)
803
+
804
+ if exit_code != 0:
805
+ if format == "json":
806
+ typer.echo(json.dumps({"error": "Doctor checks failed", "success": False}))
807
+ sys.exit(1)
808
+
809
+ # Load config and print plan
810
+ try:
811
+ config_obj = load_config(config)
812
+ except ConfigLoadError as e:
813
+ if format == "json":
814
+ typer.echo(json.dumps({"error": str(e), "success": False}))
815
+ else:
816
+ console.print(e)
817
+ sys.exit(3)
818
+
819
+ config_dir = Path(config).parent
820
+ cases_file = config_dir / config_obj.cases_file
821
+ cases, case_errors = load_cases(str(cases_file))
822
+ if case_errors:
823
+ error_msg = "Failed to load cases: " + ", ".join(case_errors)
824
+ if format == "json":
825
+ typer.echo(json.dumps({"error": error_msg, "success": False}))
826
+ else:
827
+ console.print("[bold red]Error:[/bold red] Failed to load cases")
828
+ for err in case_errors:
829
+ console.print(f" - {err}")
830
+ sys.exit(1)
831
+
832
+ # Calculate total requests
833
+ providers_count = len(config_obj.providers)
834
+ cases_count = len(cases)
835
+ runs = config_obj.runs
836
+ total_requests = providers_count * cases_count * runs
837
+
838
+ if format == "json":
839
+ # Output JSON execution plan
840
+ providers_list = [{"id": p.id, "provider": p.provider, "model": p.model} for p in config_obj.providers]
841
+ plan = {
842
+ "project": config_obj.project,
843
+ "providers": providers_list,
844
+ "cases_count": cases_count,
845
+ "runs_per_case": runs,
846
+ "total_requests": total_requests,
847
+ "concurrency": config_obj.concurrency,
848
+ "timeout_s": config_obj.timeout_s,
849
+ "validation": {
850
+ "extract_json": config_obj.validation.extract_json,
851
+ "allow_extra_keys": config_obj.validation.allow_extra_keys,
852
+ },
853
+ }
854
+ typer.echo(json.dumps(plan, separators=(',', ':')))
855
+ else:
856
+ print_dry_run_plan(config_obj, cases)
857
+ console.print("[bold green]Dry run complete - no requests made[/bold green]")
858
+ sys.exit(0)
859
+
860
+ try:
861
+ config_obj, schema, prompt_hash, schema_hash = _load_config_and_schema(config)
862
+ except SystemExit:
863
+ raise
864
+
865
+ # Override extract_json if specified
866
+ if extract_json is not None:
867
+ config_obj.validation.extract_json = extract_json
868
+
869
+ # Override concurrency if specified
870
+ if concurrency is not None:
871
+ config_obj.concurrency = concurrency
872
+
873
+ # Override provider if specified
874
+ if provider:
875
+ typer.echo(f"Using provider override: {provider}")
876
+ config_obj.providers = [
877
+ ProviderConfig(
878
+ id=provider,
879
+ provider="fake",
880
+ model=f"fake-{provider}",
881
+ temperature=0.1,
882
+ )
883
+ ]
884
+
885
+ try:
886
+ if format == "text":
887
+ typer.echo(f"Running {config_obj.runs} runs per case with {len(config_obj.providers)} provider(s)")
888
+ results = asyncio.run(run_suite(config_obj, output_dir=output))
889
+
890
+ summary = compute_summary(results, schema, prompt_hash=prompt_hash, schema_hash=schema_hash)
891
+
892
+ if format == "json":
893
+ # Output JSON summary
894
+ providers_list = [p.id for p in config_obj.providers]
895
+ json_output = _build_json_output(
896
+ project=config_obj.project,
897
+ run_id=None,
898
+ providers=providers_list,
899
+ thresholds={},
900
+ passed=True, # Stability doesn't have pass/fail threshold
901
+ summary=summary,
902
+ )
903
+ typer.echo(json.dumps(json_output, separators=(',', ':')))
904
+ else:
905
+ _print_stability_summary(summary, "Stability Results")
906
+
907
+ except Exception as e:
908
+ if format == "json":
909
+ # In JSON mode, output error as JSON to stderr
910
+ from rich.console import Console
911
+ stderr_console = Console(file=sys.stderr)
912
+ error_json = json.dumps({"error": str(e), "pass": False})
913
+ stderr_console.print(error_json)
914
+ else:
915
+ console.print(f"[bold red]Error:[/bold red] {str(e)}")
916
+ sys.exit(1)
917
+
918
+
919
+ @app.command()
920
+ def compare(
921
+ config: str = typer.Argument(default="aicert.yaml", help="Path to configuration file"),
922
+ output: Optional[str] = typer.Option(None, "-o", "--out", help="Output directory for artifacts"),
923
+ no_store: bool = typer.Option(False, "--no-store", help="Don't save artifacts"),
924
+ provider: Optional[str] = typer.Option(None, "-p", "--provider", help="Override provider (e.g., 'fake' for testing)"),
925
+ extract_json: Optional[bool] = typer.Option(None, "--extract-json/--no-extract-json", help="Override extract_json setting"),
926
+ concurrency: Optional[int] = typer.Option(None, "--concurrency", help="Override number of concurrent requests"),
927
+ ) -> None:
928
+ """Compare providers with ranked table by stability, compliance, and cost.
929
+
930
+ Like stability but prints ranked comparison table.
931
+ """
932
+ from aicert.config import ProviderConfig, ConfigLoadError, load_config
933
+ from aicert.metrics import compute_summary
934
+ from aicert.runner import run_suite
935
+
936
+ console.print("[bold]aicert compare[/bold] - Comparing providers")
937
+
938
+ try:
939
+ config_obj, schema, prompt_hash, schema_hash = _load_config_and_schema(config)
940
+ except SystemExit:
941
+ raise
942
+
943
+ # Override extract_json if specified
944
+ if extract_json is not None:
945
+ config_obj.validation.extract_json = extract_json
946
+
947
+ # Override concurrency if specified
948
+ if concurrency is not None:
949
+ config_obj.concurrency = concurrency
950
+
951
+ # Override provider if specified
952
+ if provider:
953
+ typer.echo(f"Using provider override: {provider}")
954
+ config_obj.providers = [
955
+ ProviderConfig(
956
+ id=provider,
957
+ provider="fake",
958
+ model=f"fake-{provider}",
959
+ temperature=0.1,
960
+ )
961
+ ]
962
+
963
+ try:
964
+ typer.echo(f"Running {config_obj.runs} runs per case with {len(config_obj.providers)} provider(s)")
965
+ results = asyncio.run(run_suite(config_obj, output_dir=output))
966
+
967
+ summary = compute_summary(results, schema, prompt_hash=prompt_hash, schema_hash=schema_hash)
968
+ _print_ranked_table(summary)
969
+
970
+ except Exception as e:
971
+ console.print(f"[bold red]Error:[/bold red] {str(e)}")
972
+ sys.exit(1)
973
+
974
+
975
+ @app.command()
976
+ def ci(
977
+ config: str = typer.Argument(default="aicert.yaml", help="Path to configuration file"),
978
+ output: Optional[str] = typer.Option(None, "-o", "--out", help="Output directory for artifacts"),
979
+ always_store: bool = typer.Option(False, "--always-store", help="Always save artifacts, not just on fail"),
980
+ provider: Optional[str] = typer.Option(None, "-p", "--provider", help="Override provider (e.g., 'fake' for testing)"),
981
+ extract_json: Optional[bool] = typer.Option(None, "--extract-json/--no-extract-json", help="Override extract_json setting"),
982
+ concurrency: Optional[int] = typer.Option(None, "--concurrency", help="Override number of concurrent requests"),
983
+ format: str = typer.Option("text", "--format", help="Output format: text, json, or junit", case_sensitive=False),
984
+ ) -> None:
985
+ """CI mode: evaluate against thresholds and exit with appropriate code.
986
+
987
+ Exit codes:
988
+ 0 - All thresholds passed
989
+ 2 - Threshold check failed
990
+ 3 - Config/schema error
991
+ 4 - Provider/auth error
992
+
993
+ Uses config.ci.runs for number of runs.
994
+ Default: save artifacts only on fail (unless --always-store is passed).
995
+ """
996
+ from aicert.config import ProviderConfig
997
+ from aicert.metrics import compute_summary
998
+ from aicert.runner import run_suite
999
+ from aicert.artifacts import create_run_dir
1000
+
1001
+ # Only print header for text mode
1002
+ if format == "text":
1003
+ console.print("[bold]aicert ci[/bold] - CI mode threshold evaluation")
1004
+
1005
+ try:
1006
+ config_obj, schema, prompt_hash, schema_hash = _load_config_and_schema(config)
1007
+ except SystemExit:
1008
+ raise
1009
+
1010
+ # Use ci.runs instead of config.runs
1011
+ original_runs = config_obj.runs
1012
+ config_obj.runs = config_obj.ci.runs
1013
+
1014
+ # Override extract_json if specified
1015
+ if extract_json is not None:
1016
+ config_obj.validation.extract_json = extract_json
1017
+
1018
+ # Override concurrency if specified
1019
+ if concurrency is not None:
1020
+ config_obj.concurrency = concurrency
1021
+
1022
+ # Build thresholds dict
1023
+ thresholds = {}
1024
+ if config_obj.thresholds.min_stability is not None:
1025
+ thresholds["min_stability"] = config_obj.thresholds.min_stability
1026
+ if config_obj.thresholds.min_compliance is not None:
1027
+ thresholds["min_compliance"] = config_obj.thresholds.min_compliance
1028
+ if config_obj.thresholds.max_cost_usd is not None:
1029
+ thresholds["max_cost_usd"] = config_obj.thresholds.max_cost_usd
1030
+ if config_obj.thresholds.p95_latency_ms is not None:
1031
+ thresholds["p95_latency_ms"] = config_obj.thresholds.p95_latency_ms
1032
+
1033
+ try:
1034
+ if format == "text":
1035
+ typer.echo(f"Running {config_obj.runs} runs per case with {len(config_obj.providers)} provider(s)")
1036
+ results = asyncio.run(run_suite(config_obj, output_dir=output))
1037
+
1038
+ summary = compute_summary(results, schema, prompt_hash=prompt_hash, schema_hash=schema_hash)
1039
+
1040
+ # Evaluate thresholds
1041
+ passed, failures = _evaluate_thresholds(summary, thresholds)
1042
+
1043
+ # Determine if we should store artifacts
1044
+ store_artifacts = always_store or _should_store(False, config_obj.ci.save_on_fail, not passed)
1045
+ run_dir = None
1046
+ run_id = None
1047
+ if store_artifacts:
1048
+ run_dir = create_run_dir(output)
1049
+ run_id = run_dir.name
1050
+
1051
+ if format == "json":
1052
+ # Output JSON to stdout
1053
+ providers_list = [p.id for p in config_obj.providers]
1054
+ json_output = _build_json_output(
1055
+ project=config_obj.project,
1056
+ run_id=run_id,
1057
+ providers=providers_list,
1058
+ thresholds=thresholds,
1059
+ passed=passed,
1060
+ summary=summary,
1061
+ )
1062
+ typer.echo(json.dumps(json_output, separators=(',', ':')))
+ sys.exit(0 if passed else 2)
1063
+ elif format == "junit":
1064
+ # Output JUnit XML to stdout
1065
+ providers_list = [p.id for p in config_obj.providers]
1066
+ junit_output = _build_junit_output(
1067
+ project=config_obj.project,
1068
+ providers=providers_list,
1069
+ thresholds=thresholds,
1070
+ passed=passed,
1071
+ summary=summary,
1072
+ )
1073
+ typer.echo(junit_output)
1074
+ sys.exit(0 if passed else 2)
1075
+ else:
1076
+ # Text output (existing behavior)
1077
+ _print_ci_summary(summary, thresholds, passed, failures, run_dir, results=results)
1078
+
1079
+ if passed:
1080
+ console.print("\n[bold green]All thresholds passed[/bold green]")
1081
+ sys.exit(0)
1082
+ else:
1083
+ console.print("\n[bold red]Threshold failures:[/bold red]")
1084
+ for failure in failures:
1085
+ console.print(f" - {failure}")
1086
+ sys.exit(2)
1087
+
1088
+ except Exception as e:
1089
+ if format == "json":
1090
+ # In JSON mode, output error as JSON to stderr
1091
+ from rich.console import Console
1092
+ stderr_console = Console(file=sys.stderr)
1093
+ error_json = json.dumps({"error": str(e), "pass": False})
1094
+ stderr_console.print(error_json)
1095
+ elif format == "junit":
1096
+ # In JUnit mode, output error as a testsuite with an error testcase
1097
+ import datetime
1098
+ timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
1099
+ error_msg = _escape_xml(str(e))
1100
+ junit_error = f'''<?xml version="1.0" encoding="UTF-8"?>
1101
+ <testsuite name="aicert" tests="1" failures="0" errors="1" timestamp="{timestamp}">
1102
+ <testcase name="ci_execution" classname="aicert"><error message="{error_msg}">{error_msg}</error></testcase>
1103
+ </testsuite>'''
1104
+ typer.echo(junit_error)
1105
+ else:
1106
+ console.print(f"[bold red]Error:[/bold red] {str(e)}")
1107
+ sys.exit(1)
1108
+ finally:
1109
+ # Restore original runs
1110
+ config_obj.runs = original_runs
1111
+
1112
+
1113
+ @app.command()
1114
+ def validate(
1115
+ config: str = typer.Argument(default="aicert.yaml", help="Path to configuration file"),
1116
+ ) -> None:
1117
+ """Validate configuration file and referenced files.
1118
+
1119
+ Exits with code 3 if config is invalid.
1120
+ """
1121
+ from aicert.config import load_config, ConfigLoadError
1122
+
1123
+ console.print("[bold]aicert validate[/bold] - Validating configuration")
1124
+
1125
+ try:
1126
+ config_obj = load_config(config)
1127
+ console.print(f"[bold green]Configuration is valid[/bold green]")
1128
+ console.print(f" Project: {config_obj.project}")
1129
+ console.print(f" Providers: {len(config_obj.providers)}")
1130
+ console.print(f" Runs: {config_obj.runs}")
1131
+ console.print(f" Concurrency: {config_obj.concurrency}")
1132
+ sys.exit(0)
1133
+ except ConfigLoadError as e:
1134
+ Console(file=sys.stderr).print(e)
1135
+ sys.exit(3)
1136
+
1137
+
1138
+ @app.command()
1139
+ def report(
1140
+ run_arg: str = typer.Argument(..., help="Run directory path or run ID (resolved to .aicert/runs/<run_id>/)"),
1141
+ format: str = typer.Option("text", "--format", help="Output format: text or json", case_sensitive=False),
1142
+ ) -> None:
1143
+ """Generate a report from a previous run.
1144
+
1145
+ Reads summary.json from the run directory and prints a readable report.
1146
+ If report.txt exists, prints it; otherwise generates a report from summary.json.
1147
+
1148
+ Accepts either a path to a run directory or a run ID. If a run ID is given,
1149
+ it will be resolved to .aicert/runs/<run_id>/ in the current working directory.
1150
+
1151
+ Exit codes:
1152
+ 0 - Success
1153
+ 3 - Missing/invalid run_dir or cannot parse summary/report
1154
+ """
1155
+ # Resolve run_arg to actual run directory
1156
+ run_path = _resolve_run_dir(run_arg)
1157
+
1158
+ # Validate run_dir exists
1159
+ if not run_path.exists():
1160
+ console.print(f"[bold red]Error:[/bold red] Run directory not found: {run_path}")
1161
+ sys.exit(3)
1162
+
1163
+ if not run_path.is_dir():
1164
+ console.print(f"[bold red]Error:[/bold red] Path is not a directory: {run_path}")
1165
+ sys.exit(3)
1166
+
1167
+ # Get run_id from directory name
1168
+ run_id = run_path.name
1169
+
1170
+ # Check for report.txt first (only for text format)
1171
+ report_path = run_path / "report.txt"
1172
+ if format == "text" and report_path.exists():
1173
+ console.print("[bold]aicert report[/bold] - Report from stored file")
1174
+ typer.echo(report_path.read_text(encoding="utf-8"))
1175
+ sys.exit(0)
1176
+
1177
+ # Generate report from summary.json
1178
+ try:
1179
+ summary = _load_summary(run_path)
1180
+ except FileNotFoundError:
1181
+ console.print(f"[bold red]Error:[/bold red] summary.json not found in {run_path}")
1182
+ sys.exit(3)
1183
+ except json.JSONDecodeError as e:
1184
+ console.print(f"[bold red]Error:[/bold red] Invalid JSON in summary.json: {e}")
1185
+ sys.exit(3)
1186
+
1187
+ if format == "json":
1188
+ # Output JSON: print ONLY the summary.json content augmented with run_id and run_dir
1189
+ summary_output = dict(summary)
1190
+ summary_output["run_id"] = run_id
1191
+ summary_output["run_dir"] = str(run_path)
1192
+ typer.echo(json.dumps(summary_output, separators=(',', ':')))
1193
+ sys.exit(0)
1194
+ else:
1195
+ # Text format: generate and print the text report
1196
+ report_text = _generate_text_report(summary, run_id)
1197
+ console.print("[bold]aicert report[/bold] - Generated from summary.json")
1198
+ typer.echo(report_text)
1199
+ sys.exit(0)
1200
+
1201
+
1202
+ def _generate_text_report(summary: dict, run_id: str) -> str:
1203
+ """Generate a text report from summary data."""
1204
+ lines = []
1205
+
1206
+ project = summary.get("project", "unknown")
1207
+ lines.append(f"Project: {project}")
1208
+ lines.append(f"Run ID: {run_id}")
1209
+
1210
+ providers = summary.get("per_provider", {})
1211
+ if providers:
1212
+ lines.append(f"Providers ({len(providers)}):")
1213
+
1214
+ for provider_id, metrics in sorted(providers.items()):
1215
+ lines.append(f"\n {provider_id}:")
1216
+
1217
+ stability = metrics.get("stability_score", "N/A")
1218
+ compliance = metrics.get("schema_compliance", "N/A")
1219
+ latency_stats = metrics.get("latency_stats", {})
1220
+ p95_latency = latency_stats.get("p95", "N/A")
1221
+ mean_latency = latency_stats.get("mean", "N/A")
1222
+ total_cost = metrics.get("total_cost_usd", 0)
1223
+ total_runs = metrics.get("total_runs", 0)
1224
+
1225
+ # Calculate mean cost
1226
+ mean_cost = total_cost / total_runs if total_runs > 0 else 0
1227
+
1228
+ lines.append(f" Stability: {stability:.1f}%" if isinstance(stability, float) else f" Stability: {stability}")
1229
+ lines.append(f" Compliance: {compliance:.1f}%" if isinstance(compliance, float) else f" Compliance: {compliance}")
1230
+
1231
+ if isinstance(p95_latency, float):
1232
+ lines.append(f" P95 Latency: {p95_latency:.0f}ms")
1233
+ else:
1234
+ lines.append(f" P95 Latency: {p95_latency}")
1235
+
1236
+ if isinstance(mean_latency, float):
1237
+ lines.append(f" Mean Latency: {mean_latency:.0f}ms")
1238
+ else:
1239
+ lines.append(f" Mean Latency: {mean_latency}")
1240
+
1241
+ if isinstance(mean_cost, float):
1242
+ lines.append(f" Mean Cost: ${mean_cost:.4f}")
1243
+
1244
+ # Error counts
1245
+ json_parse_fail = metrics.get("json_parse_failures", 0)
1246
+ schema_fail = metrics.get("schema_failures", 0)
1247
+ provider_err = metrics.get("provider_errors", 0)
1248
+ timeouts = metrics.get("timeouts", 0)
1249
+
1250
+ lines.append(f" JSON Parse Failures: {json_parse_fail}")
1251
+ lines.append(f" Schema Failures: {schema_fail}")
1252
+ lines.append(f" Provider Errors: {provider_err}")
1253
+ lines.append(f" Timeouts: {timeouts}")
1254
+ else:
1255
+ lines.append(" No providers in run summary")
1256
+
1257
+ return "\n".join(lines)
1258
+
1259
+
1260
+ def _resolve_run_dir(arg: str) -> Path:
1261
+ """Resolve run argument to run directory path.
1262
+
1263
+ If arg looks like an existing path, return it as-is.
1264
+ Otherwise, treat as run_id and resolve to .aicert/runs/<run_id>/ in cwd.
1265
+
1266
+ Args:
1267
+ arg: Either a path (existing or absolute/relative) or a run_id.
1268
+
1269
+ Returns:
1270
+ Path to the run directory.
1271
+ """
1272
+ # Check if arg looks like a path
1273
+ # Paths: starts with /, ./, ../, or contains / or \, or exists
1274
+ arg_path = Path(arg)
1275
+
1276
+ # If the path exists, use it directly
1277
+ if arg_path.exists():
1278
+ return arg_path
1279
+
1280
+ # If it's an absolute path, use it directly
1281
+ if arg_path.is_absolute():
1282
+ return arg_path
1283
+
1284
+ # If it starts with ./ or ../, treat as relative path
1285
+ if arg.startswith('./') or arg.startswith('../') or '/' in arg or '\\' in arg:
1286
+ return arg_path
1287
+
1288
+ # Otherwise, treat as run_id and resolve to .aicert/runs/<run_id>/
1289
+ runs_dir = Path.cwd() / ".aicert" / "runs"
1290
+ return runs_dir / arg
1291
+
1292
+
1293
+
1294
+ """Compare two run directories and show delta between them.
1295
+
1296
+ Exit code is always 0 (informational only).
1297
+ """
1298
+ console.print("[bold]aicert diff[/bold] - Comparing run results")
1299
+
1300
+ # Load both summaries
1301
+ try:
1302
+ summary_a = _load_summary(run_a)
1303
+ except FileNotFoundError as e:
1304
+ console.print(f"[bold red]Error:[/bold red] {e}")
1305
+ sys.exit(3)
1306
+
1307
+ try:
1308
+ summary_b = _load_summary(run_b)
1309
+ except FileNotFoundError as e:
1310
+ console.print(f"[bold red]Error:[/bold red] {e}")
1311
+ sys.exit(3)
1312
+
1313
+ console.print(f"\nRun A: {run_a}")
1314
+ console.print(f"Run B: {run_b}\n")
1315
+
1316
+ # Get providers from both summaries
1317
+ providers_a = summary_a.get("per_provider", {})
1318
+ providers_b = summary_b.get("per_provider", {})
1319
+
1320
+ all_providers = set(providers_a.keys()) | set(providers_b.keys())
1321
+
1322
+ for provider_id in sorted(all_providers):
1323
+ metrics_a = providers_a.get(provider_id, {})
1324
+ metrics_b = providers_b.get(provider_id, {})
1325
+
1326
+ if not metrics_a:
1327
+ console.print(f"[bold cyan]{provider_id}[/bold cyan] - [yellow]Only in Run B[/yellow]")
1328
+ continue
1329
+ if not metrics_b:
1330
+ console.print(f"[bold cyan]{provider_id}[/bold cyan] - [yellow]Only in Run A[/yellow]")
1331
+ continue
1332
+
1333
+ console.print(f"[bold cyan]{provider_id}[/bold cyan]")
1334
+
1335
+ # Calculate deltas
1336
+ stability_a = metrics_a.get("stability_score", 0)
1337
+ stability_b = metrics_b.get("stability_score", 0)
1338
+ stability_delta = stability_b - stability_a
1339
+
1340
+ compliance_a = metrics_a.get("schema_compliance", 0)
1341
+ compliance_b = metrics_b.get("schema_compliance", 0)
1342
+ compliance_delta = compliance_b - compliance_a
1343
+
1344
+ p95_a = metrics_a.get("latency_stats", {}).get("p95", 0)
1345
+ p95_b = metrics_b.get("latency_stats", {}).get("p95", 0)
1346
+ p95_delta = p95_b - p95_a
1347
+
1348
+ cost_a = metrics_a.get("total_cost_usd", 0)
1349
+ cost_b = metrics_b.get("total_cost_usd", 0)
1350
+ cost_delta = cost_b - cost_a
1351
+
1352
+ json_fail_a = metrics_a.get("json_parse_failures", 0)
1353
+ json_fail_b = metrics_b.get("json_parse_failures", 0)
1354
+ json_fail_delta = json_fail_b - json_fail_a
1355
+
1356
+ schema_fail_a = metrics_a.get("schema_failures", 0)
1357
+ schema_fail_b = metrics_b.get("schema_failures", 0)
1358
+ schema_fail_delta = schema_fail_b - schema_fail_a
1359
+
1360
+ # Print deltas (only show if regressions_only is False or if there's a regression)
1361
+ def print_delta(label, value_a, value_b, delta, higher_is_better=True):
1362
+ delta_sign = "+" if delta > 0 else ""
1363
+ if delta == 0:
1364
+ return # No change
1365
+
1366
+ if regressions_only:
1367
+ # Only show if it's a regression
1368
+ if higher_is_better:
1369
+ if delta >= 0:
1370
+ return # Improvement or no change
1371
+ else:
1372
+ if delta <= 0:
1373
+ return # Improvement or no change
1374
+
1375
+ # Determine if it's good or bad
1376
+ if higher_is_better:
1377
+ if delta > 0:
1378
+ style = "green"
1379
+ else:
1380
+ style = "red"
1381
+ else:
1382
+ if delta < 0:
1383
+ style = "green"
1384
+ else:
1385
+ style = "red"
1386
+
1387
+ console.print(f" {label}: {value_a:.1f} → {value_b:.1f} ({delta_sign}{delta:.1f})", style=style)
1388
+
1389
+ print_delta("Stability", stability_a, stability_b, stability_delta)
1390
+ print_delta("Compliance", compliance_a, compliance_b, compliance_delta)
1391
+ print_delta("Latency P95", p95_a, p95_b, p95_delta, higher_is_better=False)
1392
+ print_delta("Cost", cost_a, cost_b, cost_delta, higher_is_better=False)
1393
+
1394
+ # Error counts (lower is better)
1395
+ if not regressions_only or json_fail_delta > 0:
1396
+ if json_fail_delta != 0:
1397
+ style = "red" if json_fail_delta > 0 else "green"
1398
+ console.print(f" JSON parse failures: {json_fail_a} → {json_fail_b} ({'+' if json_fail_delta > 0 else ''}{json_fail_delta})", style=style)
1399
+
1400
+ if not regressions_only or schema_fail_delta > 0:
1401
+ if schema_fail_delta != 0:
1402
+ style = "red" if schema_fail_delta > 0 else "green"
1403
+ console.print(f" Schema failures: {schema_fail_a} → {schema_fail_b} ({'+' if schema_fail_delta > 0 else ''}{schema_fail_delta})", style=style)
1404
+
1405
+
1406
+ @app.command()
1407
+ def version() -> None:
1408
+ """Show version information."""
1409
+ from aicert import __version__
1410
+ console.print(f"aicert version: {__version__}")
1411
+
1412
+
1413
+ def _load_summary(path: str) -> dict:
1414
+ """Load summary.json from a run directory."""
1415
+ from aicert.artifacts import create_run_dir
1416
+
1417
+ summary_path = Path(path) / "summary.json"
1418
+
1419
+ if not summary_path.exists():
1420
+ raise FileNotFoundError(f"summary.json not found in {path}")
1421
+
1422
+ with open(summary_path, "r", encoding="utf-8") as f:
1423
+ return json.load(f)