@evalgate/sdk 2.2.3 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +31 -0
  2. package/README.md +39 -2
  3. package/dist/assertions.d.ts +186 -6
  4. package/dist/assertions.js +515 -61
  5. package/dist/batch.js +4 -4
  6. package/dist/cache.d.ts +4 -0
  7. package/dist/cache.js +4 -0
  8. package/dist/cli/baseline.d.ts +14 -0
  9. package/dist/cli/baseline.js +43 -3
  10. package/dist/cli/check.d.ts +5 -2
  11. package/dist/cli/check.js +20 -12
  12. package/dist/cli/compare.d.ts +80 -0
  13. package/dist/cli/compare.js +266 -0
  14. package/dist/cli/index.js +244 -101
  15. package/dist/cli/regression-gate.js +23 -0
  16. package/dist/cli/run.js +22 -0
  17. package/dist/cli/start.d.ts +26 -0
  18. package/dist/cli/start.js +130 -0
  19. package/dist/cli/templates.d.ts +24 -0
  20. package/dist/cli/templates.js +314 -0
  21. package/dist/cli/traces.d.ts +109 -0
  22. package/dist/cli/traces.js +152 -0
  23. package/dist/cli/validate.d.ts +37 -0
  24. package/dist/cli/validate.js +252 -0
  25. package/dist/cli/watch.d.ts +19 -0
  26. package/dist/cli/watch.js +175 -0
  27. package/dist/client.js +6 -13
  28. package/dist/constants.d.ts +2 -0
  29. package/dist/constants.js +5 -0
  30. package/dist/index.d.ts +8 -6
  31. package/dist/index.js +26 -6
  32. package/dist/integrations/openai.js +83 -60
  33. package/dist/logger.d.ts +3 -1
  34. package/dist/logger.js +2 -1
  35. package/dist/otel.d.ts +130 -0
  36. package/dist/otel.js +309 -0
  37. package/dist/runtime/eval.d.ts +14 -4
  38. package/dist/runtime/eval.js +127 -2
  39. package/dist/runtime/registry.d.ts +4 -2
  40. package/dist/runtime/registry.js +11 -3
  41. package/dist/runtime/run-report.d.ts +1 -1
  42. package/dist/runtime/run-report.js +7 -4
  43. package/dist/runtime/types.d.ts +38 -0
  44. package/dist/testing.d.ts +8 -0
  45. package/dist/testing.js +45 -10
  46. package/dist/version.d.ts +2 -2
  47. package/dist/version.js +2 -2
  48. package/dist/workflows.d.ts +2 -0
  49. package/dist/workflows.js +184 -102
  50. package/package.json +124 -117
package/dist/cli/index.js CHANGED
@@ -11,6 +11,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
11
11
  const baseline_1 = require("./baseline");
12
12
  const check_1 = require("./check");
13
13
  const ci_1 = require("./ci");
14
+ const compare_1 = require("./compare");
14
15
  const diff_1 = require("./diff");
15
16
  const discover_1 = require("./discover");
16
17
  const doctor_1 = require("./doctor");
@@ -22,13 +23,161 @@ const print_config_1 = require("./print-config");
22
23
  const regression_gate_1 = require("./regression-gate");
23
24
  const run_1 = require("./run");
24
25
  const share_1 = require("./share");
26
+ const start_1 = require("./start");
27
+ const templates_1 = require("./templates");
25
28
  const upgrade_1 = require("./upgrade");
29
+ const validate_1 = require("./validate");
30
+ const watch_1 = require("./watch");
26
31
  const argv = process.argv.slice(2);
27
32
  const subcommand = argv[0];
33
+ const subArgs = argv.slice(1);
34
+ const wantsHelp = subArgs.includes("--help") || subArgs.includes("-h");
35
+ // ── Per-subcommand help text ──
36
+ const SUBCOMMAND_HELP = {
37
+ init: `evalgate init — Create evalgate.config.json + baseline + CI workflow\n\nUsage:\n evalgate init [options]\n\nOptions:\n --template <name> Start with a real working template (chatbot, codegen, agent, safety, rag)\n --list-templates Show all available templates\n\nCreates project scaffolding for EvalGate in the current directory.`,
38
+ start: `evalgate start — Zero-config startup (one command → passing run)\n\nUsage:\n evalgate start [options]\n\nOptions:\n --format <fmt> Output format: human (default), json\n --watch Enable watch mode after first run\n --skip-init Skip initialization if not set up\n\nExamples:\n evalgate start\n evalgate start --watch\n evalgate start --format json`,
39
+ compare: `evalgate compare — Side-by-side result file comparison\n\nCompares two or more saved run result JSON files. Does NOT re-run anything.\nYou run each model/config separately (evalgate run --write-results), then compare the artifacts.\n\nUsage:\n evalgate compare --base <file> --head <file> [options]\n evalgate compare --runs <file1> <file2> [file3...] [options]\n\nOptions:\n --base <file> Baseline run result JSON file\n --head <file> Head run result JSON file\n --runs <files> N-way compare (3+ run result JSON files)\n --labels <names> Optional cosmetic labels for the output table (e.g., model names)\n --format <fmt> Output format: human (default), json\n --sort-by <key> Sort by: name (default), score, duration\n\nExamples:\n evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json\n evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"\n evalgate compare --runs run-a.json run-b.json run-c.json`,
40
+ watch: `evalgate watch — Watch mode (re-execute on file save)\n\nUsage:\n evalgate run --watch [options]\n evalgate watch [options]\n\nOptions:\n --debounce <ms> Debounce interval (default: 300ms)\n --no-clear Don't clear screen between runs\n --format <fmt> Output format: human (default), json\n --write-results Write results to .evalgate/last-run.json\n\nExamples:\n evalgate run --watch\n evalgate watch --write-results`,
41
+ gate: `evalgate gate — Run the regression gate\n\nUsage:\n evalgate gate [options]\n\nOptions:\n --format <fmt> Output format: human (default), json, github\n --dry-run Run checks but always exit 0 (preview mode)\n\nExamples:\n evalgate gate\n evalgate gate --format json\n evalgate gate --dry-run`,
42
+ check: `evalgate check — CI/CD evaluation gate (API-based)\n\nUsage:\n evalgate check [options]\n\nOptions:\n --evaluationId <id> Evaluation to gate on\n --apiKey <key> API key (or EVALGATE_API_KEY env)\n --format <fmt> Output format: human (default), json, github\n --explain Show score breakdown\n --minScore <n> Fail if score < n\n --maxDrop <n> Fail if score dropped > n\n --policy <name> Enforce policy (HIPAA, SOC2, etc.)\n\nExamples:\n evalgate check --minScore 92 --evaluationId 42`,
43
+ explain: `evalgate explain — Explain last gate/check failure\n\nUsage:\n evalgate explain [options]\n\nOptions:\n --report <path> Path to report JSON (default: evals/regression-report.json)\n --format <fmt> Output format: human (default), json`,
44
+ discover: `evalgate discover — Discover behavioral specs\n\nUsage:\n evalgate discover [options]\n\nOptions:\n --manifest Generate evaluation manifest for incremental analysis`,
45
+ run: `evalgate run — Run evaluation specifications\n\nUsage:\n evalgate run [options]\n\nOptions:\n --spec-ids <ids> Comma-separated list of spec IDs\n --impacted-only Run only impacted specs (requires --base)\n --base <branch> Base branch for impact analysis\n --format <fmt> Output format: human (default), json\n --write-results Write results to .evalgate/last-run.json`,
46
+ diff: `evalgate diff — Compare two run reports\n\nUsage:\n evalgate diff [options]\n\nOptions:\n --base <ref> Base branch or report path\n --head <path> Head report path\n --format <fmt> Output format: human (default), json`,
47
+ validate: `evalgate validate — Validate spec files without running them\n\nUsage:\n evalgate validate [options]\n\nOptions:\n --format <fmt> Output format: human (default), json`,
48
+ doctor: `evalgate doctor — Comprehensive CI/CD readiness checklist\n\nUsage:\n evalgate doctor [options]\n\nOptions:\n --report Output JSON diagnostic bundle\n --format <fmt> Output format: human (default), json\n --strict Treat warnings as failures\n --apiKey <key> API key\n --evaluationId <id> Evaluation to verify`,
49
+ baseline: `evalgate baseline — Manage regression gate baselines\n\nUsage:\n evalgate baseline init Create starter evals/baseline.json\n evalgate baseline update Run tests and update baseline`,
50
+ upgrade: `evalgate upgrade — Upgrade from Tier 1 to Tier 2\n\nUsage:\n evalgate upgrade --full`,
51
+ ci: `evalgate ci — One-command CI loop (manifest → impact → run → diff)\n\nUsage:\n evalgate ci [options]\n\nOptions:\n --base <ref> Base reference for diff\n --impacted-only Run only impacted specs\n --format <fmt> Output format: human (default), json, github\n --write-results Write run results`,
52
+ share: `evalgate share — Create share link for a run\n\nUsage:\n evalgate share [options]\n\nOptions:\n --scope <s> Share scope\n --evaluationId <id> Evaluation ID\n --runId <id> Run ID\n --expires <dur> Expiry duration (e.g. 7d)\n --apiKey <key> API key`,
53
+ "impact-analysis": `evalgate impact-analysis — Analyze impact of changes\n\nUsage:\n evalgate impact-analysis [options]\n\nOptions:\n --base <branch> Base branch (default: main)\n --changed-files <files> Comma-separated list of changed files\n --format <fmt> Output format: human (default), json`,
54
+ "print-config": `evalgate print-config — Show resolved config\n\nUsage:\n evalgate print-config [options]\n\nOptions:\n --format <fmt> Output format: human (default), json`,
55
+ };
56
// Intercept --help for any known subcommand.
// The lookup must be an own-property check: the `in` operator also matches
// keys inherited from Object.prototype, so `evalgate toString --help` would
// otherwise log a function and exit 0 instead of reaching the general usage.
if (subcommand &&
    wantsHelp &&
    Object.prototype.hasOwnProperty.call(SUBCOMMAND_HELP, subcommand)) {
    console.log(SUBCOMMAND_HELP[subcommand]);
    process.exit(0);
}
28
61
/**
 * Read the single value that follows `flag` in `args`.
 *
 * @param {string[]} args - Arguments after the subcommand name.
 * @param {string} flag - Option name including the leading dashes.
 * @returns {string | undefined} The value, or `undefined` when the flag is
 *   absent, is the last argument, or is directly followed by another
 *   `--option` (i.e. the value was omitted).
 */
function cliOptionValue(args, flag) {
    const index = args.indexOf(flag);
    if (index === -1)
        return undefined;
    const value = args[index + 1];
    if (value === undefined || value.startsWith("--"))
        return undefined;
    return value;
}
/**
 * Collect every argument after `flag` up to (not including) the next
 * `--option`.
 *
 * @param {string[]} args - Arguments after the subcommand name.
 * @param {string} flag - Option name including the leading dashes.
 * @returns {string[]} The collected values; empty when the flag is absent.
 */
function cliOptionValues(args, flag) {
    const values = [];
    const index = args.indexOf(flag);
    if (index === -1)
        return values;
    for (let i = index + 1; i < args.length; i++) {
        if (args[i].startsWith("--"))
            break;
        values.push(args[i]);
    }
    return values;
}
if (subcommand === "init") {
    const cwd = process.cwd();
    const args = argv.slice(1);
    // Handle --list-templates
    if (args.includes("--list-templates")) {
        (0, templates_1.printTemplateList)();
        process.exit(0);
    }
    // Handle --template <name>
    const templateName = cliOptionValue(args, "--template");
    if (args.includes("--template") && !templateName) {
        // Previously a bare `--template` was silently ignored and a plain init ran.
        console.error(" ✖ --template requires a template name");
        (0, templates_1.printTemplateList)();
        process.exit(1);
    }
    if (templateName) {
        if (!templates_1.AVAILABLE_TEMPLATES.includes(templateName)) {
            console.error(` ✖ Unknown template: ${templateName}`);
            (0, templates_1.printTemplateList)();
            process.exit(1);
        }
    }
    const ok = (0, init_1.runInit)(cwd);
    if (!ok)
        process.exit(1);
    // Install template after init if requested
    if (templateName) {
        console.log(`\n 📋 Installing template: ${templateName}\n`);
        const { filesCreated, filesSkipped } = (0, templates_1.installTemplate)(templateName, cwd);
        for (const f of filesCreated)
            console.log(` ✔ Created ${f}`);
        for (const f of filesSkipped)
            console.log(` – Skipped ${f} (already exists)`);
        console.log("");
    }
    process.exit(0);
}
else if (subcommand === "start") {
    // Parse arguments for start command
    const args = argv.slice(1);
    // Fall back to the documented default when --format has no value.
    const format = cliOptionValue(args, "--format") ?? "human";
    const watch = args.includes("--watch");
    const skipInit = args.includes("--skip-init");
    (0, start_1.runStart)({ format, watch, skipInit })
        .then((code) => process.exit(code))
        .catch((err) => {
        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
    });
}
else if (subcommand === "watch") {
    // Parse arguments for watch command
    const args = argv.slice(1);
    const format = cliOptionValue(args, "--format") ?? "human";
    const writeResults = args.includes("--write-results");
    const clearScreen = !args.includes("--no-clear");
    // Leave debounceMs undefined when the flag is absent so runWatch applies
    // its own default; reject a missing or non-numeric value instead of
    // forwarding NaN.
    let debounceMs;
    if (args.includes("--debounce")) {
        debounceMs = Number.parseInt(cliOptionValue(args, "--debounce") ?? "", 10);
        if (!Number.isFinite(debounceMs) || debounceMs < 0) {
            console.error("Error: --debounce requires a non-negative integer (milliseconds).");
            process.exit(1);
        }
    }
    (0, watch_1.runWatch)({ format, writeResults, debounceMs, clearScreen })
        .then(() => process.exit(0))
        .catch((err) => {
        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
    });
}
else if (subcommand === "compare") {
    // Parse arguments for compare command
    const args = argv.slice(1);
    // Collect run files: --runs <f1> <f2> ... OR --base <f1> --head <f2>
    const runs = [];
    if (args.includes("--runs")) {
        runs.push(...cliOptionValues(args, "--runs"));
    }
    else {
        // --base / --head shorthand for 2-file compare. A flag whose value is
        // missing contributes nothing, so `--base --head x` is reported as a
        // usage error rather than treating "--head" as a file name.
        const baseFile = cliOptionValue(args, "--base");
        const headFile = cliOptionValue(args, "--head");
        if (baseFile)
            runs.push(baseFile);
        if (headFile)
            runs.push(headFile);
    }
    // Collect labels (all args after --labels until next flag)
    const labels = cliOptionValues(args, "--labels");
    const format = cliOptionValue(args, "--format") ?? "human";
    const sortBy = cliOptionValue(args, "--sort-by") ?? "name";
    if (runs.length < 2) {
        console.error("Error: At least 2 run files are required.");
        console.error("Usage: evalgate compare --base results-a.json --head results-b.json");
        console.error(" evalgate compare --runs <file1> <file2> [<file3> ...]");
        console.error(" --labels are optional metadata, not required identifiers.");
        process.exit(1);
    }
    (0, compare_1.runCompareCLI)({
        runs,
        labels: labels.length > 0 ? labels : undefined,
        format,
        sortBy,
    })
        .then(() => process.exit(0))
        .catch((err) => {
        console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(1);
    });
}
33
182
  else if (subcommand === "baseline") {
34
183
  const code = (0, baseline_1.runBaseline)(argv.slice(1));
@@ -179,23 +328,38 @@ else if (subcommand === "run") {
179
328
  const baseIndex = args.indexOf("--base");
180
329
  const formatIndex = args.indexOf("--format");
181
330
  const writeResultsIndex = args.indexOf("--write-results");
331
+ const watchFlag = args.includes("--watch");
182
332
  const specIds = specIdsIndex !== -1 ? args[specIdsIndex + 1]?.split(",") : undefined;
183
333
  const impactedOnly = impactedOnlyIndex !== -1;
184
334
  const baseBranch = baseIndex !== -1 ? args[baseIndex + 1] : undefined;
185
335
  const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
186
336
  const writeResults = writeResultsIndex !== -1;
187
- (0, run_1.runEvaluationsCLI)({
188
- specIds,
189
- impactedOnly: impactedOnly ? !!baseBranch : false,
190
- baseBranch,
191
- format,
192
- writeResults,
193
- })
194
- .then(() => process.exit(0))
195
- .catch((err) => {
196
- console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
197
- process.exit(2);
198
- });
337
+ if (watchFlag) {
338
+ // Delegate to watch mode
339
+ const debounceIndex = args.indexOf("--debounce");
340
+ const debounceMs = debounceIndex !== -1 ? parseInt(args[debounceIndex + 1], 10) : undefined;
341
+ const clearScreen = !args.includes("--no-clear");
342
+ (0, watch_1.runWatch)({ specIds, format, writeResults, debounceMs, clearScreen })
343
+ .then(() => process.exit(0))
344
+ .catch((err) => {
345
+ console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
346
+ process.exit(1);
347
+ });
348
+ }
349
+ else {
350
+ (0, run_1.runEvaluationsCLI)({
351
+ specIds,
352
+ impactedOnly: impactedOnly ? !!baseBranch : false,
353
+ baseBranch,
354
+ format,
355
+ writeResults,
356
+ })
357
+ .then(() => process.exit(0))
358
+ .catch((err) => {
359
+ console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
360
+ process.exit(2);
361
+ });
362
+ }
199
363
  }
200
364
  else if (subcommand === "diff") {
201
365
  // Parse arguments for diff command
@@ -213,6 +377,14 @@ else if (subcommand === "diff") {
213
377
  process.exit(2);
214
378
  });
215
379
  }
380
+ else if (subcommand === "validate") {
381
+ (0, validate_1.runValidate)(argv.slice(1))
382
+ .then((result) => process.exit(result.passed ? 0 : 1))
383
+ .catch((err) => {
384
+ console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
385
+ process.exit(1);
386
+ });
387
+ }
216
388
  else if (subcommand === "ci") {
217
389
  // Parse arguments for ci command
218
390
  const args = argv.slice(1);
@@ -237,96 +409,67 @@ else {
237
409
  console.log(`EvalGate CLI
238
410
 
239
411
  Usage:
240
- evalgate init Create evalgate.config.json + baseline + CI workflow
241
- evalgate discover Discover behavioral specs in project and show statistics
242
- evalgate discover --manifest Generate evaluation manifest for incremental analysis
243
- evalgate impact-analysis Analyze impact of changes and suggest targeted tests
244
- --base <branch> Base branch to compare against (default: main)
245
- --changed-files <files> Comma-separated list of changed files (for CI)
246
- --format <fmt> Output format: human (default), json
247
- evalgate ci One-command CI loop (manifest impact → run → diff)
248
- --base <ref> Base reference for diff (baseline|last|<runId>|<path>|<gitref>)
249
- --impacted-only Run only specs impacted by changes
250
- --format <fmt> Output format: human (default), json, github
251
- --write-results Write run results to .evalgate/last-run.json
252
- evalgate run Run evaluation specifications
253
- --spec-ids <ids> Comma-separated list of spec IDs to run
254
- --impacted-only Run only specs impacted by changes (requires --base)
255
- --base <branch> Base branch for impact analysis (with --impacted-only)
256
- --format <fmt> Output format: human (default), json
257
- --write-results Write results to .evalgate/last-run.json
258
- evalgate diff Compare two run reports and show behavioral changes
259
- --base <branch> Base branch or report path (default: main)
260
- --head <path> Head report path (default: .evalgate/last-run.json)
261
- --format <fmt> Output format: human (default), json
262
- evalgate gate [options] Run regression gate (local test-based, no API needed)
263
- evalgate check [options] CI/CD evaluation gate (API-based)
264
- evalgate explain [options] Explain last gate/check failure with root causes + fixes
265
- evalgate doctor [options] Comprehensive CI/CD readiness checklist
266
- evalgate baseline init Create starter evals/baseline.json
267
- evalgate baseline update Run tests and update baseline with real scores
268
- evalgate upgrade --full Upgrade from Tier 1 to Tier 2 (full gate)
269
- evalgate print-config Show resolved config with source-of-truth annotations
270
- evalgate share [options] Create share link for a run
271
-
272
- Options for gate:
273
- --format <fmt> Output format: human (default), json, github
274
-
275
- Options for check:
276
- --evaluationId <id> Evaluation to gate on (or from config)
277
- --apiKey <key> API key (or EVALAI_API_KEY env)
278
- --format <fmt> Output format: human (default), json, github
279
- --explain Show score breakdown and thresholds
280
- --onFail import When gate fails, import run with CI context
281
- --minScore <n> Fail if score < n (0-100)
282
- --maxDrop <n> Fail if score dropped > n from baseline
283
- --warnDrop <n> Warn (exit 8) if score dropped > n but < maxDrop
284
- --minN <n> Fail if total test cases < n
285
- --allowWeakEvidence Allow weak evidence level
286
- --policy <name> Enforce policy (HIPAA, SOC2, GDPR, etc.)
287
- --baseline <mode> "published", "previous", or "production"
288
- --share <mode> Share link: always | fail | never (fail = only when gate fails)
289
- --baseUrl <url> API base URL
290
-
291
- Options for explain:
292
- --report <path> Path to report JSON (default: evals/regression-report.json)
293
- --format <fmt> Output format: human (default), json
294
-
295
- Options for print-config:
296
- --format <fmt> Output format: human (default), json
297
-
298
- Options for doctor:
299
- --report Output JSON diagnostic bundle
300
- --format <fmt> Output format: human (default), json
301
- --strict Treat warnings as failures (exit 2)
302
- --apiKey <key> API key (or EVALAI_API_KEY env)
303
- --baseUrl <url> API base URL
304
- --evaluationId <id> Evaluation to verify
412
+ evalgate start Zero-config startup (init + discover + run in one command)
413
+ --watch Enable watch mode after first run
414
+ --format <fmt> Output format: human (default), json
415
+ evalgate init Create evalgate.config.json + baseline + CI workflow
416
+ --template <name> Start with a template (chatbot, codegen, agent, safety, rag)
417
+ --list-templates Show all available templates
418
+ evalgate discover Discover behavioral specs in project and show statistics
419
+ --manifest Generate evaluation manifest for incremental analysis
420
+ evalgate run Run evaluation specifications
421
+ --spec-ids <ids> Comma-separated list of spec IDs to run
422
+ --impacted-only Run only specs impacted by changes (requires --base)
423
+ --base <branch> Base branch for impact analysis (with --impacted-only)
424
+ --format <fmt> Output format: human (default), json
425
+ --write-results Write results to .evalgate/last-run.json
426
+ --watch Re-execute on file save (watch mode)
427
+ --debounce <ms> Watch debounce interval (default: 300ms)
428
+ --no-clear Don't clear screen between watch runs
429
+ evalgate watch Watch mode (alias for evalgate run --watch)
430
+ evalgate compare Side-by-side run comparison
431
+ --base <file> Baseline run result JSON file
432
+ --head <file> Head run result JSON file
433
+ --runs <f1> <f2> [...] N-way compare (3+ run files)
434
+ --labels <l1> <l2> [...] Optional human-readable labels (e.g., model names)
435
+ --sort-by <key> Sort by: name (default), score, duration
436
+ --format <fmt> Output format: human (default), json
437
+ evalgate diff Compare two run reports and show behavioral changes
438
+ --base <branch> Base branch or report path (default: main)
439
+ --head <path> Head report path (default: .evalgate/last-run.json)
440
+ --format <fmt> Output format: human (default), json
441
+ evalgate impact-analysis Analyze impact of changes and suggest targeted tests
442
+ --base <branch> Base branch to compare against (default: main)
443
+ --changed-files <files> Comma-separated list of changed files (for CI)
444
+ --format <fmt> Output format: human (default), json
445
+ evalgate ci One-command CI loop (manifest impact → run → diff)
446
+ --base <ref> Base reference for diff
447
+ --impacted-only Run only specs impacted by changes
448
+ --format <fmt> Output format: human (default), json, github
449
+ --write-results Write run results to .evalgate/last-run.json
450
+ evalgate gate [options] Run regression gate (local test-based, no API needed)
451
+ evalgate check [options] CI/CD evaluation gate (API-based)
452
+ evalgate explain [options] Explain last gate/check failure with root causes + fixes
453
+ evalgate doctor [options] Comprehensive CI/CD readiness checklist
454
+ evalgate validate Validate spec files without running them
455
+ evalgate baseline init Create starter evals/baseline.json
456
+ evalgate baseline update Run tests and update baseline with real scores
457
+ evalgate upgrade --full Upgrade from Tier 1 to Tier 2 (full gate)
458
+ evalgate print-config Show resolved config with source-of-truth annotations
459
+ evalgate share [options] Create share link for a run
305
460
 
306
461
  Examples:
307
- evalgate init
308
- evalgate discover
309
- evalgate discover --manifest
310
- evalgate impact-analysis --base main
311
- evalgate impact-analysis --base main --format json
312
- evalgate impact-analysis --changed-files src/utils.ts,datasets/test.json
313
- evalgate run
314
- evalgate run --spec-ids spec1,spec2
315
- evalgate run --impacted-only --base main
316
- evalgate run --format json --write-results
317
- evalgate diff
318
- evalgate diff --base main
319
- evalgate diff --base main --format json
320
- evalgate diff --a .evalgate/runs/base.json --b .evalgate/last-run.json
321
- evalgate gate
322
- evalgate gate --format json
323
- evalgate explain
324
- evalgate doctor
325
- evalgate print-config
326
- evalgate doctor --report
327
- evalgate check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
328
- evalgate check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
329
- evalgate share --scope run --evaluationId 42 --runId 123 --expires 7d --apiKey $EVALAI_API_KEY
462
+ evalgate start Zero to eval in one command
463
+ evalgate init --template chatbot Scaffold with chatbot evals
464
+ evalgate run --watch Re-run on file save
465
+ evalgate compare --base gpt4o.json --head claude.json Side-by-side run diff
466
+ evalgate run --spec-ids spec1,spec2 Run specific specs
467
+ evalgate run --impacted-only --base main Run only impacted specs
468
+ evalgate diff --base main Behavioral diff
469
+ evalgate ci --base main --impacted-only Full CI loop
470
+ evalgate gate --format json Regression gate
471
+ evalgate check --minScore 92 --evaluationId 42 API-based gate
472
+ evalgate doctor Preflight check
330
473
  `);
331
474
  process.exit(subcommand === "--help" || subcommand === "-h" ? 0 : 1);
332
475
  }
@@ -48,6 +48,7 @@ exports.runGate = runGate;
48
48
  const node_child_process_1 = require("node:child_process");
49
49
  const fs = __importStar(require("node:fs"));
50
50
  const path = __importStar(require("node:path"));
51
+ const baseline_1 = require("./baseline");
51
52
  const REPORT_REL = "evals/regression-report.json";
52
53
  const BASELINE_REL = "evals/baseline.json";
53
54
  /** Detect the package manager used in the project */
@@ -147,6 +148,28 @@ function runBuiltinGate(cwd) {
147
148
  runner,
148
149
  };
149
150
  }
151
+ // Verify baseline integrity
152
+ const checksumResult = (0, baseline_1.verifyBaselineChecksum)(baselineData);
153
+ if (!checksumResult.valid) {
154
+ return {
155
+ schemaVersion: 1,
156
+ timestamp: now,
157
+ exitCode: 2,
158
+ category: "infra_error",
159
+ passed: false,
160
+ failures: [
161
+ checksumResult.reason ?? "Baseline checksum verification failed",
162
+ ],
163
+ deltas: [],
164
+ baseline: null,
165
+ durationMs: Date.now() - t0,
166
+ command,
167
+ runner,
168
+ };
169
+ }
170
+ if (checksumResult.reason === "no_checksum") {
171
+ console.warn("⚠ Baseline has no checksum. Run 'evalgate baseline update' to stamp one.");
172
+ }
150
173
  const baselineMeta = baselineData.updatedAt
151
174
  ? {
152
175
  updatedAt: baselineData.updatedAt,
package/dist/cli/run.js CHANGED
@@ -54,6 +54,7 @@ const fs = __importStar(require("node:fs/promises"));
54
54
  const path = __importStar(require("node:path"));
55
55
  const registry_1 = require("../runtime/registry");
56
56
  const impact_analysis_1 = require("./impact-analysis");
57
+ const traces_1 = require("./traces");
57
58
  /**
58
59
  * Generate deterministic run ID
59
60
  */
@@ -377,6 +378,15 @@ function printHumanResults(result) {
377
378
  console.log(` ❌ Failed: ${result.summary.failed}`);
378
379
  console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
379
380
  console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
381
+ // Latency percentiles
382
+ const durations = result.results
383
+ .filter((r) => r.result.status !== "skipped")
384
+ .map((r) => r.result.duration);
385
+ if (durations.length > 0) {
386
+ const latency = (0, traces_1.calculatePercentiles)(durations);
387
+ console.log("");
388
+ console.log((0, traces_1.formatLatencyTable)(latency));
389
+ }
380
390
  const hasScores = result.results.some((r) => r.result.score !== undefined);
381
391
  console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
382
392
  for (const spec of result.results) {
@@ -404,6 +414,18 @@ function printJsonResults(result) {
404
414
  async function runEvaluationsCLI(options) {
405
415
  try {
406
416
  const result = await runEvaluations(options);
417
+ // Auto-write structured traces
418
+ if (result.results.length > 0) {
419
+ try {
420
+ const tracePath = await (0, traces_1.writeTraces)(result);
421
+ if (options.format !== "json") {
422
+ console.log(`\n🔍 Trace written to ${tracePath}`);
423
+ }
424
+ }
425
+ catch {
426
+ // Trace writing is best-effort, don't fail the run
427
+ }
428
+ }
407
429
  if (options.format === "json") {
408
430
  printJsonResults(result);
409
431
  }
@@ -0,0 +1,26 @@
1
+ /**
2
+ * evalgate start — Zero-config startup
3
+ *
4
+ * One command to go from nothing to a passing eval run:
5
+ * npx evalgate start
6
+ *
7
+ * What it does:
8
+ * 1. If no evalgate.config.json, runs init
9
+ * 2. If no manifest, runs discover --manifest
10
+ * 3. Runs evalgate run --write-results
11
+ * 4. Prints results
12
+ *
13
+ * The goal: zero decisions, one command, immediate value.
14
+ */
15
+ export interface StartOptions {
16
+ /** Output format */
17
+ format?: "human" | "json";
18
+ /** Skip the automatic init step, even when evalgate.config.json is missing */
19
+ skipInit?: boolean;
20
+ /** Enable watch mode after first run */
21
+ watch?: boolean;
22
+ }
23
+ /**
24
+ * Zero-config startup: one command → passing run
25
+ */
26
+ export declare function runStart(options?: StartOptions, projectRoot?: string): Promise<number>;
@@ -0,0 +1,130 @@
1
+ "use strict";
2
+ /**
3
+ * evalgate start — Zero-config startup
4
+ *
5
+ * One command to go from nothing to a passing eval run:
6
+ * npx evalgate start
7
+ *
8
+ * What it does:
9
+ * 1. If no evalgate.config.json, runs init
10
+ * 2. If no manifest, runs discover --manifest
11
+ * 3. Runs evalgate run --write-results
12
+ * 4. Prints results
13
+ *
14
+ * The goal: zero decisions, one command, immediate value.
15
+ */
16
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
17
+ if (k2 === undefined) k2 = k;
18
+ var desc = Object.getOwnPropertyDescriptor(m, k);
19
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
20
+ desc = { enumerable: true, get: function() { return m[k]; } };
21
+ }
22
+ Object.defineProperty(o, k2, desc);
23
+ }) : (function(o, m, k, k2) {
24
+ if (k2 === undefined) k2 = k;
25
+ o[k2] = m[k];
26
+ }));
27
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
28
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
29
+ }) : function(o, v) {
30
+ o["default"] = v;
31
+ });
32
+ var __importStar = (this && this.__importStar) || (function () {
33
+ var ownKeys = function(o) {
34
+ ownKeys = Object.getOwnPropertyNames || function (o) {
35
+ var ar = [];
36
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
37
+ return ar;
38
+ };
39
+ return ownKeys(o);
40
+ };
41
+ return function (mod) {
42
+ if (mod && mod.__esModule) return mod;
43
+ var result = {};
44
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
45
+ __setModuleDefault(result, mod);
46
+ return result;
47
+ };
48
+ })();
49
+ Object.defineProperty(exports, "__esModule", { value: true });
50
+ exports.runStart = runStart;
51
+ const fs = __importStar(require("node:fs"));
52
+ const path = __importStar(require("node:path"));
53
+ const discover_1 = require("./discover");
54
+ const init_1 = require("./init");
55
+ const run_1 = require("./run");
56
/**
 * Zero-config startup: one command → passing run.
 *
 * Steps, in order:
 *  1. If `evalgate.config.json` is missing under `projectRoot` and
 *     `options.skipInit` is not set, run init.
 *  2. If `.evalgate/manifest.json` is missing, run spec discovery with
 *     manifest generation. A discovery failure is tolerated (best-effort).
 *  3. Run the evaluations with `writeResults: true` and print the results.
 *  4. If `options.watch` is set, hand over to watch mode.
 *
 * @param {object} [options]
 * @param {"human" | "json"} [options.format="human"] Output format.
 * @param {boolean} [options.skipInit] Do not run init even if the config file is missing.
 * @param {boolean} [options.watch] Enter watch mode after the first run.
 * @param {string} [projectRoot=process.cwd()] Directory checked for config and manifest.
 * @returns {Promise<number>} Process exit code: 0 when no spec failed, 1 when
 *   at least one spec failed, init failed, or the run threw.
 */
async function runStart(options = {}, projectRoot = process.cwd()) {
    const format = options.format ?? "human";
    const isHuman = format === "human";
    if (isHuman) {
        console.log("\n🚀 evalgate start — zero-config evaluation run\n");
    }
    // Step 1: Ensure project is initialized
    const configPath = path.join(projectRoot, "evalgate.config.json");
    if (!fs.existsSync(configPath) && !options.skipInit) {
        if (isHuman) {
            console.log("📦 No evalgate.config.json found. Initializing...\n");
        }
        const initOk = (0, init_1.runInit)(projectRoot);
        if (!initOk) {
            // Keep stderr machine-readable in JSON mode, matching the
            // run-failure path at the bottom of this function.
            if (format === "json") {
                console.error(JSON.stringify({
                    error: "Initialization failed. Run `evalgate init` manually.",
                }));
            }
            else {
                console.error("❌ Initialization failed. Run `evalgate init` manually.");
            }
            return 1;
        }
        if (isHuman)
            console.log("");
    }
    // Step 2: Ensure manifest exists (discover specs)
    const manifestPath = path.join(projectRoot, ".evalgate", "manifest.json");
    if (!fs.existsSync(manifestPath)) {
        if (isHuman) {
            console.log("🔍 No manifest found. Discovering specs...\n");
        }
        try {
            // NOTE(review): projectRoot is not forwarded here, so discovery
            // presumably works from the current directory — confirm against
            // discoverSpecs' signature.
            await (0, discover_1.discoverSpecs)({ manifest: true });
        }
        catch (err) {
            // Deliberately best-effort: discovery may fail if no spec files
            // exist yet, and the run below is still attempted.
            if (isHuman) {
                console.log(` ℹ️ Discovery: ${err instanceof Error ? err.message : String(err)}`);
                console.log(" Falling back to gate mode...\n");
            }
        }
    }
    // Step 3: Run evaluations
    if (isHuman) {
        console.log("▶️ Running evaluations...\n");
    }
    try {
        const result = await (0, run_1.runEvaluations)({ writeResults: true, format }, projectRoot);
        if (format === "json") {
            (0, run_1.printJsonResults)(result);
        }
        else {
            (0, run_1.printHumanResults)(result);
        }
        // Step 4: If watch mode requested, transition to watch
        if (options.watch) {
            // Loaded lazily so the watch module is only required when needed.
            const { runWatch } = await Promise.resolve().then(() => __importStar(require("./watch")));
            await runWatch({ writeResults: true, format }, projectRoot);
            // Watch mode is expected to run until interrupted; if it does
            // resolve, report success.
            return 0;
        }
        return result.summary.failed > 0 ? 1 : 0;
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        if (isHuman) {
            console.error(`\n❌ ${message}`);
            console.log("\n💡 Tips:");
            console.log(" • Create spec files with defineEval() in eval/ directory");
            console.log(" • Run `evalgate discover` to verify spec detection");
            console.log(" • Run `evalgate doctor` for full diagnostics");
        }
        else {
            console.error(JSON.stringify({
                error: message,
            }));
        }
        return 1;
    }
}