@evalgate/sdk 2.2.3 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/README.md +39 -2
- package/dist/assertions.d.ts +186 -6
- package/dist/assertions.js +515 -61
- package/dist/batch.js +4 -4
- package/dist/cache.d.ts +4 -0
- package/dist/cache.js +4 -0
- package/dist/cli/baseline.d.ts +14 -0
- package/dist/cli/baseline.js +43 -3
- package/dist/cli/check.d.ts +5 -2
- package/dist/cli/check.js +20 -12
- package/dist/cli/compare.d.ts +80 -0
- package/dist/cli/compare.js +266 -0
- package/dist/cli/index.js +244 -101
- package/dist/cli/regression-gate.js +23 -0
- package/dist/cli/run.js +22 -0
- package/dist/cli/start.d.ts +26 -0
- package/dist/cli/start.js +130 -0
- package/dist/cli/templates.d.ts +24 -0
- package/dist/cli/templates.js +314 -0
- package/dist/cli/traces.d.ts +109 -0
- package/dist/cli/traces.js +152 -0
- package/dist/cli/validate.d.ts +37 -0
- package/dist/cli/validate.js +252 -0
- package/dist/cli/watch.d.ts +19 -0
- package/dist/cli/watch.js +175 -0
- package/dist/client.js +6 -13
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +5 -0
- package/dist/index.d.ts +8 -6
- package/dist/index.js +26 -6
- package/dist/integrations/openai.js +83 -60
- package/dist/logger.d.ts +3 -1
- package/dist/logger.js +2 -1
- package/dist/otel.d.ts +130 -0
- package/dist/otel.js +309 -0
- package/dist/runtime/eval.d.ts +14 -4
- package/dist/runtime/eval.js +127 -2
- package/dist/runtime/registry.d.ts +4 -2
- package/dist/runtime/registry.js +11 -3
- package/dist/runtime/run-report.d.ts +1 -1
- package/dist/runtime/run-report.js +7 -4
- package/dist/runtime/types.d.ts +38 -0
- package/dist/testing.d.ts +8 -0
- package/dist/testing.js +45 -10
- package/dist/version.d.ts +2 -2
- package/dist/version.js +2 -2
- package/dist/workflows.d.ts +2 -0
- package/dist/workflows.js +184 -102
- package/package.json +124 -117
package/dist/cli/index.js
CHANGED
|
@@ -11,6 +11,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
11
11
|
const baseline_1 = require("./baseline");
|
|
12
12
|
const check_1 = require("./check");
|
|
13
13
|
const ci_1 = require("./ci");
|
|
14
|
+
const compare_1 = require("./compare");
|
|
14
15
|
const diff_1 = require("./diff");
|
|
15
16
|
const discover_1 = require("./discover");
|
|
16
17
|
const doctor_1 = require("./doctor");
|
|
@@ -22,13 +23,161 @@ const print_config_1 = require("./print-config");
|
|
|
22
23
|
const regression_gate_1 = require("./regression-gate");
|
|
23
24
|
const run_1 = require("./run");
|
|
24
25
|
const share_1 = require("./share");
|
|
26
|
+
const start_1 = require("./start");
|
|
27
|
+
const templates_1 = require("./templates");
|
|
25
28
|
const upgrade_1 = require("./upgrade");
|
|
29
|
+
const validate_1 = require("./validate");
|
|
30
|
+
const watch_1 = require("./watch");
|
|
26
31
|
const argv = process.argv.slice(2);
|
|
27
32
|
const subcommand = argv[0];
|
|
33
|
+
const subArgs = argv.slice(1);
|
|
34
|
+
const wantsHelp = subArgs.includes("--help") || subArgs.includes("-h");
|
|
35
|
+
// ── Per-subcommand help text ──
|
|
36
|
+
const SUBCOMMAND_HELP = {
|
|
37
|
+
init: `evalgate init — Create evalgate.config.json + baseline + CI workflow\n\nUsage:\n evalgate init [options]\n\nOptions:\n --template <name> Start with a real working template (chatbot, codegen, agent, safety, rag)\n --list-templates Show all available templates\n\nCreates project scaffolding for EvalGate in the current directory.`,
|
|
38
|
+
start: `evalgate start — Zero-config startup (one command → passing run)\n\nUsage:\n evalgate start [options]\n\nOptions:\n --format <fmt> Output format: human (default), json\n --watch Enable watch mode after first run\n --skip-init Skip initialization if not set up\n\nExamples:\n evalgate start\n evalgate start --watch\n evalgate start --format json`,
|
|
39
|
+
compare: `evalgate compare — Side-by-side result file comparison\n\nCompares two or more saved run result JSON files. Does NOT re-run anything.\nYou run each model/config separately (evalgate run --write-results), then compare the artifacts.\n\nUsage:\n evalgate compare --base <file> --head <file> [options]\n evalgate compare --runs <file1> <file2> [file3...] [options]\n\nOptions:\n --base <file> Baseline run result JSON file\n --head <file> Head run result JSON file\n --runs <files> N-way compare (3+ run result JSON files)\n --labels <names> Optional cosmetic labels for the output table (e.g., model names)\n --format <fmt> Output format: human (default), json\n --sort-by <key> Sort by: name (default), score, duration\n\nExamples:\n evalgate compare --base .evalgate/runs/run-a.json --head .evalgate/runs/run-b.json\n evalgate compare --base gpt4o.json --head claude.json --labels "GPT-4o" "Claude 3.5"\n evalgate compare --runs run-a.json run-b.json run-c.json`,
|
|
40
|
+
watch: `evalgate watch — Watch mode (re-execute on file save)\n\nUsage:\n evalgate run --watch [options]\n evalgate watch [options]\n\nOptions:\n --debounce <ms> Debounce interval (default: 300ms)\n --no-clear Don't clear screen between runs\n --format <fmt> Output format: human (default), json\n --write-results Write results to .evalgate/last-run.json\n\nExamples:\n evalgate run --watch\n evalgate watch --write-results`,
|
|
41
|
+
gate: `evalgate gate — Run the regression gate\n\nUsage:\n evalgate gate [options]\n\nOptions:\n --format <fmt> Output format: human (default), json, github\n --dry-run Run checks but always exit 0 (preview mode)\n\nExamples:\n evalgate gate\n evalgate gate --format json\n evalgate gate --dry-run`,
|
|
42
|
+
check: `evalgate check — CI/CD evaluation gate (API-based)\n\nUsage:\n evalgate check [options]\n\nOptions:\n --evaluationId <id> Evaluation to gate on\n --apiKey <key> API key (or EVALGATE_API_KEY env)\n --format <fmt> Output format: human (default), json, github\n --explain Show score breakdown\n --minScore <n> Fail if score < n\n --maxDrop <n> Fail if score dropped > n\n --policy <name> Enforce policy (HIPAA, SOC2, etc.)\n\nExamples:\n evalgate check --minScore 92 --evaluationId 42`,
|
|
43
|
+
explain: `evalgate explain — Explain last gate/check failure\n\nUsage:\n evalgate explain [options]\n\nOptions:\n --report <path> Path to report JSON (default: evals/regression-report.json)\n --format <fmt> Output format: human (default), json`,
|
|
44
|
+
discover: `evalgate discover — Discover behavioral specs\n\nUsage:\n evalgate discover [options]\n\nOptions:\n --manifest Generate evaluation manifest for incremental analysis`,
|
|
45
|
+
run: `evalgate run — Run evaluation specifications\n\nUsage:\n evalgate run [options]\n\nOptions:\n --spec-ids <ids> Comma-separated list of spec IDs\n --impacted-only Run only impacted specs (requires --base)\n --base <branch> Base branch for impact analysis\n --format <fmt> Output format: human (default), json\n --write-results Write results to .evalgate/last-run.json`,
|
|
46
|
+
diff: `evalgate diff — Compare two run reports\n\nUsage:\n evalgate diff [options]\n\nOptions:\n --base <ref> Base branch or report path\n --head <path> Head report path\n --format <fmt> Output format: human (default), json`,
|
|
47
|
+
validate: `evalgate validate — Validate spec files without running them\n\nUsage:\n evalgate validate [options]\n\nOptions:\n --format <fmt> Output format: human (default), json`,
|
|
48
|
+
doctor: `evalgate doctor — Comprehensive CI/CD readiness checklist\n\nUsage:\n evalgate doctor [options]\n\nOptions:\n --report Output JSON diagnostic bundle\n --format <fmt> Output format: human (default), json\n --strict Treat warnings as failures\n --apiKey <key> API key\n --evaluationId <id> Evaluation to verify`,
|
|
49
|
+
baseline: `evalgate baseline — Manage regression gate baselines\n\nUsage:\n evalgate baseline init Create starter evals/baseline.json\n evalgate baseline update Run tests and update baseline`,
|
|
50
|
+
upgrade: `evalgate upgrade — Upgrade from Tier 1 to Tier 2\n\nUsage:\n evalgate upgrade --full`,
|
|
51
|
+
ci: `evalgate ci — One-command CI loop (manifest → impact → run → diff)\n\nUsage:\n evalgate ci [options]\n\nOptions:\n --base <ref> Base reference for diff\n --impacted-only Run only impacted specs\n --format <fmt> Output format: human (default), json, github\n --write-results Write run results`,
|
|
52
|
+
share: `evalgate share — Create share link for a run\n\nUsage:\n evalgate share [options]\n\nOptions:\n --scope <s> Share scope\n --evaluationId <id> Evaluation ID\n --runId <id> Run ID\n --expires <dur> Expiry duration (e.g. 7d)\n --apiKey <key> API key`,
|
|
53
|
+
"impact-analysis": `evalgate impact-analysis — Analyze impact of changes\n\nUsage:\n evalgate impact-analysis [options]\n\nOptions:\n --base <branch> Base branch (default: main)\n --changed-files <files> Comma-separated list of changed files\n --format <fmt> Output format: human (default), json`,
|
|
54
|
+
"print-config": `evalgate print-config — Show resolved config\n\nUsage:\n evalgate print-config [options]\n\nOptions:\n --format <fmt> Output format: human (default), json`,
|
|
55
|
+
};
|
|
56
|
+
// Intercept --help for any known subcommand.
// Use an own-property lookup rather than `in`: `in` also matches names inherited
// from Object.prototype ("constructor", "toString", ...), which would print a
// function instead of help text and exit 0 for a subcommand that does not exist.
if (subcommand && wantsHelp && Object.prototype.hasOwnProperty.call(SUBCOMMAND_HELP, subcommand)) {
    console.log(SUBCOMMAND_HELP[subcommand]);
    process.exit(0);
}
|
|
28
61
|
if (subcommand === "init") {
|
|
29
62
|
const cwd = process.cwd();
|
|
63
|
+
const args = argv.slice(1);
|
|
64
|
+
// Handle --list-templates
|
|
65
|
+
if (args.includes("--list-templates")) {
|
|
66
|
+
(0, templates_1.printTemplateList)();
|
|
67
|
+
process.exit(0);
|
|
68
|
+
}
|
|
69
|
+
// Handle --template <name>
|
|
70
|
+
const templateIndex = args.indexOf("--template");
|
|
71
|
+
const templateName = templateIndex !== -1 ? args[templateIndex + 1] : undefined;
|
|
72
|
+
if (templateName) {
|
|
73
|
+
if (!templates_1.AVAILABLE_TEMPLATES.includes(templateName)) {
|
|
74
|
+
console.error(` ✖ Unknown template: ${templateName}`);
|
|
75
|
+
(0, templates_1.printTemplateList)();
|
|
76
|
+
process.exit(1);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
30
79
|
const ok = (0, init_1.runInit)(cwd);
|
|
31
|
-
|
|
80
|
+
if (!ok)
|
|
81
|
+
process.exit(1);
|
|
82
|
+
// Install template after init if requested
|
|
83
|
+
if (templateName) {
|
|
84
|
+
console.log(`\n 📋 Installing template: ${templateName}\n`);
|
|
85
|
+
const { filesCreated, filesSkipped } = (0, templates_1.installTemplate)(templateName, cwd);
|
|
86
|
+
for (const f of filesCreated)
|
|
87
|
+
console.log(` ✔ Created ${f}`);
|
|
88
|
+
for (const f of filesSkipped)
|
|
89
|
+
console.log(` – Skipped ${f} (already exists)`);
|
|
90
|
+
console.log("");
|
|
91
|
+
}
|
|
92
|
+
process.exit(0);
|
|
93
|
+
}
|
|
94
|
+
else if (subcommand === "start") {
|
|
95
|
+
// Parse arguments for start command
|
|
96
|
+
const args = argv.slice(1);
|
|
97
|
+
const formatIndex = args.indexOf("--format");
|
|
98
|
+
const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
|
|
99
|
+
const watch = args.includes("--watch");
|
|
100
|
+
const skipInit = args.includes("--skip-init");
|
|
101
|
+
(0, start_1.runStart)({ format, watch, skipInit })
|
|
102
|
+
.then((code) => process.exit(code))
|
|
103
|
+
.catch((err) => {
|
|
104
|
+
console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
105
|
+
process.exit(1);
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
else if (subcommand === "watch") {
|
|
109
|
+
// Parse arguments for watch command
|
|
110
|
+
const args = argv.slice(1);
|
|
111
|
+
const formatIndex = args.indexOf("--format");
|
|
112
|
+
const debounceIndex = args.indexOf("--debounce");
|
|
113
|
+
const writeResultsIndex = args.indexOf("--write-results");
|
|
114
|
+
const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
|
|
115
|
+
const debounceMs = debounceIndex !== -1 ? parseInt(args[debounceIndex + 1], 10) : undefined;
|
|
116
|
+
const writeResults = writeResultsIndex !== -1;
|
|
117
|
+
const clearScreen = !args.includes("--no-clear");
|
|
118
|
+
(0, watch_1.runWatch)({ format, writeResults, debounceMs, clearScreen })
|
|
119
|
+
.then(() => process.exit(0))
|
|
120
|
+
.catch((err) => {
|
|
121
|
+
console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
122
|
+
process.exit(1);
|
|
123
|
+
});
|
|
124
|
+
}
|
|
125
|
+
else if (subcommand === "compare") {
|
|
126
|
+
// Parse arguments for compare command
|
|
127
|
+
const args = argv.slice(1);
|
|
128
|
+
const runsIndex = args.indexOf("--runs");
|
|
129
|
+
const baseIndex = args.indexOf("--base");
|
|
130
|
+
const headIndex = args.indexOf("--head");
|
|
131
|
+
const labelsIndex = args.indexOf("--labels");
|
|
132
|
+
const formatIndex = args.indexOf("--format");
|
|
133
|
+
const sortByIndex = args.indexOf("--sort-by");
|
|
134
|
+
// Collect run files: --runs <f1> <f2> ... OR --base <f1> --head <f2>
|
|
135
|
+
const runs = [];
|
|
136
|
+
if (runsIndex !== -1) {
|
|
137
|
+
for (let i = runsIndex + 1; i < args.length; i++) {
|
|
138
|
+
if (args[i].startsWith("--"))
|
|
139
|
+
break;
|
|
140
|
+
runs.push(args[i]);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
else {
|
|
144
|
+
// --base / --head shorthand for 2-file compare
|
|
145
|
+
if (baseIndex !== -1 && args[baseIndex + 1])
|
|
146
|
+
runs.push(args[baseIndex + 1]);
|
|
147
|
+
if (headIndex !== -1 && args[headIndex + 1])
|
|
148
|
+
runs.push(args[headIndex + 1]);
|
|
149
|
+
}
|
|
150
|
+
// Collect labels (all args after --labels until next flag)
|
|
151
|
+
const labels = [];
|
|
152
|
+
if (labelsIndex !== -1) {
|
|
153
|
+
for (let i = labelsIndex + 1; i < args.length; i++) {
|
|
154
|
+
if (args[i].startsWith("--"))
|
|
155
|
+
break;
|
|
156
|
+
labels.push(args[i]);
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
|
|
160
|
+
const sortBy = sortByIndex !== -1
|
|
161
|
+
? args[sortByIndex + 1]
|
|
162
|
+
: "name";
|
|
163
|
+
if (runs.length < 2) {
|
|
164
|
+
console.error("Error: At least 2 run files are required.");
|
|
165
|
+
console.error("Usage: evalgate compare --base results-a.json --head results-b.json");
|
|
166
|
+
console.error(" evalgate compare --runs <file1> <file2> [<file3> ...]");
|
|
167
|
+
console.error(" --labels are optional metadata, not required identifiers.");
|
|
168
|
+
process.exit(1);
|
|
169
|
+
}
|
|
170
|
+
(0, compare_1.runCompareCLI)({
|
|
171
|
+
runs,
|
|
172
|
+
labels: labels.length > 0 ? labels : undefined,
|
|
173
|
+
format,
|
|
174
|
+
sortBy,
|
|
175
|
+
})
|
|
176
|
+
.then(() => process.exit(0))
|
|
177
|
+
.catch((err) => {
|
|
178
|
+
console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
179
|
+
process.exit(1);
|
|
180
|
+
});
|
|
32
181
|
}
|
|
33
182
|
else if (subcommand === "baseline") {
|
|
34
183
|
const code = (0, baseline_1.runBaseline)(argv.slice(1));
|
|
@@ -179,23 +328,38 @@ else if (subcommand === "run") {
|
|
|
179
328
|
const baseIndex = args.indexOf("--base");
|
|
180
329
|
const formatIndex = args.indexOf("--format");
|
|
181
330
|
const writeResultsIndex = args.indexOf("--write-results");
|
|
331
|
+
const watchFlag = args.includes("--watch");
|
|
182
332
|
const specIds = specIdsIndex !== -1 ? args[specIdsIndex + 1]?.split(",") : undefined;
|
|
183
333
|
const impactedOnly = impactedOnlyIndex !== -1;
|
|
184
334
|
const baseBranch = baseIndex !== -1 ? args[baseIndex + 1] : undefined;
|
|
185
335
|
const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
|
|
186
336
|
const writeResults = writeResultsIndex !== -1;
|
|
187
|
-
(
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
writeResults,
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
}
|
|
337
|
+
if (watchFlag) {
|
|
338
|
+
// Delegate to watch mode
|
|
339
|
+
const debounceIndex = args.indexOf("--debounce");
|
|
340
|
+
const debounceMs = debounceIndex !== -1 ? parseInt(args[debounceIndex + 1], 10) : undefined;
|
|
341
|
+
const clearScreen = !args.includes("--no-clear");
|
|
342
|
+
(0, watch_1.runWatch)({ specIds, format, writeResults, debounceMs, clearScreen })
|
|
343
|
+
.then(() => process.exit(0))
|
|
344
|
+
.catch((err) => {
|
|
345
|
+
console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
346
|
+
process.exit(1);
|
|
347
|
+
});
|
|
348
|
+
}
|
|
349
|
+
else {
|
|
350
|
+
(0, run_1.runEvaluationsCLI)({
|
|
351
|
+
specIds,
|
|
352
|
+
impactedOnly: impactedOnly ? !!baseBranch : false,
|
|
353
|
+
baseBranch,
|
|
354
|
+
format,
|
|
355
|
+
writeResults,
|
|
356
|
+
})
|
|
357
|
+
.then(() => process.exit(0))
|
|
358
|
+
.catch((err) => {
|
|
359
|
+
console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
360
|
+
process.exit(2);
|
|
361
|
+
});
|
|
362
|
+
}
|
|
199
363
|
}
|
|
200
364
|
else if (subcommand === "diff") {
|
|
201
365
|
// Parse arguments for diff command
|
|
@@ -213,6 +377,14 @@ else if (subcommand === "diff") {
|
|
|
213
377
|
process.exit(2);
|
|
214
378
|
});
|
|
215
379
|
}
|
|
380
|
+
else if (subcommand === "validate") {
|
|
381
|
+
(0, validate_1.runValidate)(argv.slice(1))
|
|
382
|
+
.then((result) => process.exit(result.passed ? 0 : 1))
|
|
383
|
+
.catch((err) => {
|
|
384
|
+
console.error(`EvalGate ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
385
|
+
process.exit(1);
|
|
386
|
+
});
|
|
387
|
+
}
|
|
216
388
|
else if (subcommand === "ci") {
|
|
217
389
|
// Parse arguments for ci command
|
|
218
390
|
const args = argv.slice(1);
|
|
@@ -237,96 +409,67 @@ else {
|
|
|
237
409
|
console.log(`EvalGate CLI
|
|
238
410
|
|
|
239
411
|
Usage:
|
|
240
|
-
evalgate init
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
evalgate
|
|
244
|
-
--
|
|
245
|
-
--
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
--
|
|
250
|
-
--
|
|
251
|
-
--
|
|
252
|
-
|
|
253
|
-
--
|
|
254
|
-
--
|
|
255
|
-
--
|
|
256
|
-
--
|
|
257
|
-
|
|
258
|
-
evalgate
|
|
259
|
-
--base <
|
|
260
|
-
--head <
|
|
261
|
-
--
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
evalgate
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
evalgate
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
--
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
--share <mode> Share link: always | fail | never (fail = only when gate fails)
|
|
289
|
-
--baseUrl <url> API base URL
|
|
290
|
-
|
|
291
|
-
Options for explain:
|
|
292
|
-
--report <path> Path to report JSON (default: evals/regression-report.json)
|
|
293
|
-
--format <fmt> Output format: human (default), json
|
|
294
|
-
|
|
295
|
-
Options for print-config:
|
|
296
|
-
--format <fmt> Output format: human (default), json
|
|
297
|
-
|
|
298
|
-
Options for doctor:
|
|
299
|
-
--report Output JSON diagnostic bundle
|
|
300
|
-
--format <fmt> Output format: human (default), json
|
|
301
|
-
--strict Treat warnings as failures (exit 2)
|
|
302
|
-
--apiKey <key> API key (or EVALAI_API_KEY env)
|
|
303
|
-
--baseUrl <url> API base URL
|
|
304
|
-
--evaluationId <id> Evaluation to verify
|
|
412
|
+
evalgate start Zero-config startup (init + discover + run in one command)
|
|
413
|
+
--watch Enable watch mode after first run
|
|
414
|
+
--format <fmt> Output format: human (default), json
|
|
415
|
+
evalgate init Create evalgate.config.json + baseline + CI workflow
|
|
416
|
+
--template <name> Start with a template (chatbot, codegen, agent, safety, rag)
|
|
417
|
+
--list-templates Show all available templates
|
|
418
|
+
evalgate discover Discover behavioral specs in project and show statistics
|
|
419
|
+
--manifest Generate evaluation manifest for incremental analysis
|
|
420
|
+
evalgate run Run evaluation specifications
|
|
421
|
+
--spec-ids <ids> Comma-separated list of spec IDs to run
|
|
422
|
+
--impacted-only Run only specs impacted by changes (requires --base)
|
|
423
|
+
--base <branch> Base branch for impact analysis (with --impacted-only)
|
|
424
|
+
--format <fmt> Output format: human (default), json
|
|
425
|
+
--write-results Write results to .evalgate/last-run.json
|
|
426
|
+
--watch Re-execute on file save (watch mode)
|
|
427
|
+
--debounce <ms> Watch debounce interval (default: 300ms)
|
|
428
|
+
--no-clear Don't clear screen between watch runs
|
|
429
|
+
evalgate watch Watch mode (alias for evalgate run --watch)
|
|
430
|
+
evalgate compare Side-by-side run comparison
|
|
431
|
+
--base <file> Baseline run result JSON file
|
|
432
|
+
--head <file> Head run result JSON file
|
|
433
|
+
--runs <f1> <f2> [...] N-way compare (3+ run files)
|
|
434
|
+
--labels <l1> <l2> [...] Optional human-readable labels (e.g., model names)
|
|
435
|
+
--sort-by <key> Sort by: name (default), score, duration
|
|
436
|
+
--format <fmt> Output format: human (default), json
|
|
437
|
+
evalgate diff Compare two run reports and show behavioral changes
|
|
438
|
+
--base <branch> Base branch or report path (default: main)
|
|
439
|
+
--head <path> Head report path (default: .evalgate/last-run.json)
|
|
440
|
+
--format <fmt> Output format: human (default), json
|
|
441
|
+
evalgate impact-analysis Analyze impact of changes and suggest targeted tests
|
|
442
|
+
--base <branch> Base branch to compare against (default: main)
|
|
443
|
+
--changed-files <files> Comma-separated list of changed files (for CI)
|
|
444
|
+
--format <fmt> Output format: human (default), json
|
|
445
|
+
evalgate ci One-command CI loop (manifest → impact → run → diff)
|
|
446
|
+
--base <ref> Base reference for diff
|
|
447
|
+
--impacted-only Run only specs impacted by changes
|
|
448
|
+
--format <fmt> Output format: human (default), json, github
|
|
449
|
+
--write-results Write run results to .evalgate/last-run.json
|
|
450
|
+
evalgate gate [options] Run regression gate (local test-based, no API needed)
|
|
451
|
+
evalgate check [options] CI/CD evaluation gate (API-based)
|
|
452
|
+
evalgate explain [options] Explain last gate/check failure with root causes + fixes
|
|
453
|
+
evalgate doctor [options] Comprehensive CI/CD readiness checklist
|
|
454
|
+
evalgate validate Validate spec files without running them
|
|
455
|
+
evalgate baseline init Create starter evals/baseline.json
|
|
456
|
+
evalgate baseline update Run tests and update baseline with real scores
|
|
457
|
+
evalgate upgrade --full Upgrade from Tier 1 to Tier 2 (full gate)
|
|
458
|
+
evalgate print-config Show resolved config with source-of-truth annotations
|
|
459
|
+
evalgate share [options] Create share link for a run
|
|
305
460
|
|
|
306
461
|
Examples:
|
|
307
|
-
evalgate
|
|
308
|
-
evalgate
|
|
309
|
-
evalgate
|
|
310
|
-
evalgate
|
|
311
|
-
evalgate
|
|
312
|
-
evalgate
|
|
313
|
-
evalgate
|
|
314
|
-
evalgate
|
|
315
|
-
evalgate
|
|
316
|
-
evalgate
|
|
317
|
-
evalgate
|
|
318
|
-
evalgate diff --base main
|
|
319
|
-
evalgate diff --base main --format json
|
|
320
|
-
evalgate diff --a .evalgate/runs/base.json --b .evalgate/last-run.json
|
|
321
|
-
evalgate gate
|
|
322
|
-
evalgate gate --format json
|
|
323
|
-
evalgate explain
|
|
324
|
-
evalgate doctor
|
|
325
|
-
evalgate print-config
|
|
326
|
-
evalgate doctor --report
|
|
327
|
-
evalgate check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
328
|
-
evalgate check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
329
|
-
evalgate share --scope run --evaluationId 42 --runId 123 --expires 7d --apiKey $EVALAI_API_KEY
|
|
462
|
+
evalgate start Zero to eval in one command
|
|
463
|
+
evalgate init --template chatbot Scaffold with chatbot evals
|
|
464
|
+
evalgate run --watch Re-run on file save
|
|
465
|
+
evalgate compare --base gpt4o.json --head claude.json Side-by-side run diff
|
|
466
|
+
evalgate run --spec-ids spec1,spec2 Run specific specs
|
|
467
|
+
evalgate run --impacted-only --base main Run only impacted specs
|
|
468
|
+
evalgate diff --base main Behavioral diff
|
|
469
|
+
evalgate ci --base main --impacted-only Full CI loop
|
|
470
|
+
evalgate gate --format json Regression gate
|
|
471
|
+
evalgate check --minScore 92 --evaluationId 42 API-based gate
|
|
472
|
+
evalgate doctor Preflight check
|
|
330
473
|
`);
|
|
331
474
|
process.exit(subcommand === "--help" || subcommand === "-h" ? 0 : 1);
|
|
332
475
|
}
|
|
@@ -48,6 +48,7 @@ exports.runGate = runGate;
|
|
|
48
48
|
const node_child_process_1 = require("node:child_process");
|
|
49
49
|
const fs = __importStar(require("node:fs"));
|
|
50
50
|
const path = __importStar(require("node:path"));
|
|
51
|
+
const baseline_1 = require("./baseline");
|
|
51
52
|
const REPORT_REL = "evals/regression-report.json";
|
|
52
53
|
const BASELINE_REL = "evals/baseline.json";
|
|
53
54
|
/** Detect the package manager used in the project */
|
|
@@ -147,6 +148,28 @@ function runBuiltinGate(cwd) {
|
|
|
147
148
|
runner,
|
|
148
149
|
};
|
|
149
150
|
}
|
|
151
|
+
// Verify baseline integrity
|
|
152
|
+
const checksumResult = (0, baseline_1.verifyBaselineChecksum)(baselineData);
|
|
153
|
+
if (!checksumResult.valid) {
|
|
154
|
+
return {
|
|
155
|
+
schemaVersion: 1,
|
|
156
|
+
timestamp: now,
|
|
157
|
+
exitCode: 2,
|
|
158
|
+
category: "infra_error",
|
|
159
|
+
passed: false,
|
|
160
|
+
failures: [
|
|
161
|
+
checksumResult.reason ?? "Baseline checksum verification failed",
|
|
162
|
+
],
|
|
163
|
+
deltas: [],
|
|
164
|
+
baseline: null,
|
|
165
|
+
durationMs: Date.now() - t0,
|
|
166
|
+
command,
|
|
167
|
+
runner,
|
|
168
|
+
};
|
|
169
|
+
}
|
|
170
|
+
if (checksumResult.reason === "no_checksum") {
|
|
171
|
+
console.warn("⚠ Baseline has no checksum. Run 'evalgate baseline update' to stamp one.");
|
|
172
|
+
}
|
|
150
173
|
const baselineMeta = baselineData.updatedAt
|
|
151
174
|
? {
|
|
152
175
|
updatedAt: baselineData.updatedAt,
|
package/dist/cli/run.js
CHANGED
|
@@ -54,6 +54,7 @@ const fs = __importStar(require("node:fs/promises"));
|
|
|
54
54
|
const path = __importStar(require("node:path"));
|
|
55
55
|
const registry_1 = require("../runtime/registry");
|
|
56
56
|
const impact_analysis_1 = require("./impact-analysis");
|
|
57
|
+
const traces_1 = require("./traces");
|
|
57
58
|
/**
|
|
58
59
|
* Generate deterministic run ID
|
|
59
60
|
*/
|
|
@@ -377,6 +378,15 @@ function printHumanResults(result) {
|
|
|
377
378
|
console.log(` ❌ Failed: ${result.summary.failed}`);
|
|
378
379
|
console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
|
|
379
380
|
console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
|
|
381
|
+
// Latency percentiles
|
|
382
|
+
const durations = result.results
|
|
383
|
+
.filter((r) => r.result.status !== "skipped")
|
|
384
|
+
.map((r) => r.result.duration);
|
|
385
|
+
if (durations.length > 0) {
|
|
386
|
+
const latency = (0, traces_1.calculatePercentiles)(durations);
|
|
387
|
+
console.log("");
|
|
388
|
+
console.log((0, traces_1.formatLatencyTable)(latency));
|
|
389
|
+
}
|
|
380
390
|
const hasScores = result.results.some((r) => r.result.score !== undefined);
|
|
381
391
|
console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
|
|
382
392
|
for (const spec of result.results) {
|
|
@@ -404,6 +414,18 @@ function printJsonResults(result) {
|
|
|
404
414
|
async function runEvaluationsCLI(options) {
|
|
405
415
|
try {
|
|
406
416
|
const result = await runEvaluations(options);
|
|
417
|
+
// Auto-write structured traces
|
|
418
|
+
if (result.results.length > 0) {
|
|
419
|
+
try {
|
|
420
|
+
const tracePath = await (0, traces_1.writeTraces)(result);
|
|
421
|
+
if (options.format !== "json") {
|
|
422
|
+
console.log(`\n🔍 Trace written to ${tracePath}`);
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
catch {
|
|
426
|
+
// Trace writing is best-effort, don't fail the run
|
|
427
|
+
}
|
|
428
|
+
}
|
|
407
429
|
if (options.format === "json") {
|
|
408
430
|
printJsonResults(result);
|
|
409
431
|
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalgate start — Zero-config startup
|
|
3
|
+
*
|
|
4
|
+
* One command to go from nothing to a passing eval run:
|
|
5
|
+
* npx evalgate start
|
|
6
|
+
*
|
|
7
|
+
* What it does:
|
|
8
|
+
* 1. If no evalgate.config.json, runs init
|
|
9
|
+
* 2. If no manifest, runs discover --manifest
|
|
10
|
+
* 3. Runs evalgate run --write-results
|
|
11
|
+
* 4. Prints results
|
|
12
|
+
*
|
|
13
|
+
* The goal: zero decisions, one command, immediate value.
|
|
14
|
+
*/
|
|
15
|
+
export interface StartOptions {
|
|
16
|
+
/** Output format */
|
|
17
|
+
format?: "human" | "json";
|
|
18
|
+
/** When true, never run init, even if evalgate.config.json is missing */
|
|
19
|
+
skipInit?: boolean;
|
|
20
|
+
/** Enable watch mode after first run */
|
|
21
|
+
watch?: boolean;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Zero-config startup: one command → passing run
|
|
25
|
+
*/
|
|
26
|
+
export declare function runStart(options?: StartOptions, projectRoot?: string): Promise<number>;
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate start — Zero-config startup
|
|
4
|
+
*
|
|
5
|
+
* One command to go from nothing to a passing eval run:
|
|
6
|
+
* npx evalgate start
|
|
7
|
+
*
|
|
8
|
+
* What it does:
|
|
9
|
+
* 1. If no evalgate.config.json, runs init
|
|
10
|
+
* 2. If no manifest, runs discover --manifest
|
|
11
|
+
* 3. Runs evalgate run --write-results
|
|
12
|
+
* 4. Prints results
|
|
13
|
+
*
|
|
14
|
+
* The goal: zero decisions, one command, immediate value.
|
|
15
|
+
*/
|
|
16
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
17
|
+
if (k2 === undefined) k2 = k;
|
|
18
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
19
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
20
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
21
|
+
}
|
|
22
|
+
Object.defineProperty(o, k2, desc);
|
|
23
|
+
}) : (function(o, m, k, k2) {
|
|
24
|
+
if (k2 === undefined) k2 = k;
|
|
25
|
+
o[k2] = m[k];
|
|
26
|
+
}));
|
|
27
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
28
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
29
|
+
}) : function(o, v) {
|
|
30
|
+
o["default"] = v;
|
|
31
|
+
});
|
|
32
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
33
|
+
var ownKeys = function(o) {
|
|
34
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
35
|
+
var ar = [];
|
|
36
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
37
|
+
return ar;
|
|
38
|
+
};
|
|
39
|
+
return ownKeys(o);
|
|
40
|
+
};
|
|
41
|
+
return function (mod) {
|
|
42
|
+
if (mod && mod.__esModule) return mod;
|
|
43
|
+
var result = {};
|
|
44
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
45
|
+
__setModuleDefault(result, mod);
|
|
46
|
+
return result;
|
|
47
|
+
};
|
|
48
|
+
})();
|
|
49
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
50
|
+
exports.runStart = runStart;
|
|
51
|
+
const fs = __importStar(require("node:fs"));
|
|
52
|
+
const path = __importStar(require("node:path"));
|
|
53
|
+
const discover_1 = require("./discover");
|
|
54
|
+
const init_1 = require("./init");
|
|
55
|
+
const run_1 = require("./run");
|
|
56
|
+
/**
 * Zero-config startup: initialise the project if needed, build the spec
 * manifest if needed, run the evaluations once and report the outcome.
 *
 * @param {object} [options]
 * @param {string} [options.format] Output format; falls back to "human" when unset.
 * @param {boolean} [options.skipInit] When truthy, init is never run, even if
 *   evalgate.config.json is missing.
 * @param {boolean} [options.watch] When truthy, hands over to watch mode after
 *   the first run has been printed.
 * @param {string} [projectRoot] Directory checked for the config file and the
 *   manifest, and passed to the run/watch calls. Defaults to process.cwd().
 * @returns {Promise<number>} 0 when no spec failed (or after watch mode ends),
 *   1 when init fails, at least one spec failed, or the run threw.
 */
async function runStart(options = {}, projectRoot = process.cwd()) {
    const format = options.format ?? "human";
    const isHuman = format === "human";
    // Progress messages go to stdout only in human mode.
    const narrate = (message) => {
        if (isHuman) {
            console.log(message);
        }
    };
    narrate("\n🚀 evalgate start — zero-config evaluation run\n");

    // Step 1: make sure the project has a config file (unless the caller opted out).
    const hasConfig = fs.existsSync(path.join(projectRoot, "evalgate.config.json"));
    if (!hasConfig && !options.skipInit) {
        narrate("📦 No evalgate.config.json found. Initializing...\n");
        if (!(0, init_1.runInit)(projectRoot)) {
            console.error("❌ Initialization failed. Run `evalgate init` manually.");
            return 1;
        }
        narrate("");
    }

    // Step 2: make sure a manifest exists; a discovery failure is tolerated.
    const hasManifest = fs.existsSync(path.join(projectRoot, ".evalgate", "manifest.json"));
    if (!hasManifest) {
        narrate("🔍 No manifest found. Discovering specs...\n");
        try {
            // NOTE(review): projectRoot is not forwarded here — confirm that
            // discoverSpecs resolves the same directory on its own.
            await (0, discover_1.discoverSpecs)({ manifest: true });
        }
        catch (discoveryError) {
            // Best-effort: the failure is reported in human mode only and the run proceeds.
            if (isHuman) {
                console.log(` ℹ️ Discovery: ${discoveryError instanceof Error ? discoveryError.message : String(discoveryError)}`);
                console.log(" Falling back to gate mode...\n");
            }
        }
    }

    // Step 3: run the evaluations and print them in the requested format.
    narrate("▶️ Running evaluations...\n");
    try {
        const outcome = await (0, run_1.runEvaluations)({ writeResults: true, format }, projectRoot);
        const render = format === "json" ? run_1.printJsonResults : run_1.printHumanResults;
        render(outcome);

        // Step 4: optionally continue in watch mode; the watch module is loaded lazily.
        if (options.watch) {
            const { runWatch } = await Promise.resolve().then(() => __importStar(require("./watch")));
            await runWatch({ writeResults: true, format }, projectRoot);
            return 0; // Only reached if runWatch ever resolves
        }
        return outcome.summary.failed > 0 ? 1 : 0;
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        if (isHuman) {
            console.error(`\n❌ ${reason}`);
            const tips = [
                "\n💡 Tips:",
                " • Create spec files with defineEval() in eval/ directory",
                " • Run `evalgate discover` to verify spec detection",
                " • Run `evalgate doctor` for full diagnostics",
            ];
            for (const tip of tips) {
                console.log(tip);
            }
        }
        else {
            // Any non-human format gets a JSON error object on stderr.
            console.error(JSON.stringify({ error: reason }));
        }
        return 1;
    }
}
|