@pauly4010/evalai-sdk 1.5.7 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,51 @@ All notable changes to the @pauly4010/evalai-sdk package will be documented in t
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.6.0] - 2026-02-24
9
+
10
+ ### ✨ Added
11
+
12
+ #### CLI — Regression Gate & Baseline Management
13
+
14
+ - **`evalai baseline init`** — Create a starter `evals/baseline.json` with sample values and provenance metadata
15
+ - **`evalai baseline update`** — Run confidence tests, golden eval, and latency benchmark, then update baseline with real scores
16
+ - **`evalai gate`** — Run the local regression gate with proper exit code taxonomy (0=pass, 1=regression, 2=infra_error, 3=confidence_failed, 4=confidence_missing)
17
+ - **`evalai gate --format json`** — Output `evals/regression-report.json` as machine-readable JSON to stdout
18
+ - **`evalai gate --format github`** — Output GitHub Step Summary markdown with delta table
19
+
20
+ #### SDK Exports — Regression Gate Constants & Types
21
+
22
+ - **`GATE_EXIT`** — Exit code constants (`PASS`, `REGRESSION`, `INFRA_ERROR`, `CONFIDENCE_FAILED`, `CONFIDENCE_MISSING`)
23
+ - **`GATE_CATEGORY`** — Report category constants (`pass`, `regression`, `infra_error`)
24
+ - **`REPORT_SCHEMA_VERSION`** — Current schema version for `regression-report.json`
25
+ - **`ARTIFACTS`** — Well-known artifact paths (`BASELINE`, `REGRESSION_REPORT`, `CONFIDENCE_SUMMARY`, `LATENCY_BENCHMARK`)
26
+ - **Types**: `RegressionReport`, `RegressionDelta`, `Baseline`, `BaselineTolerance`, `GateExitCode`, `GateCategory`
27
+ - **Subpath export**: `@pauly4010/evalai-sdk/regression` for tree-shakeable imports
28
+
29
+ ### 🔧 Changed
30
+
31
+ - CLI help text updated to include `baseline` and `gate` commands
32
+ - SDK becomes the public contract for regression gate — scripts are implementation detail
33
+
34
+ ---
35
+
36
+ ## [1.5.8] - 2026-02-22
37
+
38
+ ### 🐛 Fixed
39
+
40
+ - **secureRoute TypeScript overload compatibility** — Fixed implementation signature to use `ctx: any` for proper overload compatibility
41
+ - **Test infrastructure fixes** — Replaced invalid `expect.unknown()` with `expect.any()` across test files
42
+ - **NextRequest constructor** — Fixed test mocks using incorrect `(NextRequest as any)()` syntax
43
+ - **304 response handling** — Fixed exports API returning invalid 304 response with body
44
+ - **Error catalog tests** — Updated test expectations to match actual EvalAIError behavior
45
+ - **Redis cache timeout** — Added explicit timeout to prevent test hangs
46
+
47
+ ### 🔧 Changed
48
+
49
+ - **Biome formatting** — Applied consistent line endings across 199 files
50
+
51
+ ---
52
+
8
53
  ## [1.5.7] - 2026-02-20
9
54
 
10
55
  ### 📚 Documentation
@@ -32,7 +77,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
32
77
  - **`--warnDrop <n>`** — Introduce a WARN band when score drops > `warnDrop` but < `maxDrop`
33
78
  - **Gate verdicts:** PASS, WARN, FAIL
34
79
  - **Profiles:** `strict` (warnDrop: 0), `balanced` (warnDrop: 1), `fast` (warnDrop: 2)
35
- - **`--fail-on-flake`** — Fail the gate if any case is flagged as flaky (partial pass rate across determinism runs)
80
+ - **`--fail-on-flake`** — Fail the gate if any case is flagged as flaky (partial pass rate across determinism runs)
36
81
 
37
82
  #### Determinism & flake intelligence
38
83
 
package/README.md CHANGED
@@ -99,7 +99,7 @@ Key flags
99
99
 
100
100
  --maxDrop → hard regression fail
101
101
 
102
- --fail-on-flake → fail if any test is unstable
102
+ --fail-on-flake → fail if any test is unstable
103
103
 
104
104
  This lets teams tune signal vs noise in CI.
105
105
 
@@ -190,7 +190,7 @@ Option Description
190
190
  --allowWeakEvidence Permit weak evidence
191
191
  --policy <name> HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511
192
192
  --baseline <mode> published, previous, production
193
- --fail-on-flake Fail if any case is flaky
193
+ --fail-on-flake Fail if any case is flaky
194
194
  --baseUrl <url> Override API base URL
195
195
 
196
196
  Exit codes
@@ -257,7 +257,16 @@ await openai.chat.completions.create({
257
257
 
258
258
 
259
259
  🧭 Changelog
260
- v1.5.7 (Latest)
260
+ v1.5.8 (Latest)
261
+ Fixed secureRoute TypeScript overload compatibility
262
+
263
+ Fixed test infrastructure (expect.any, NextRequest constructor)
264
+
265
+ Fixed 304 response handling in exports API
266
+
267
+ Improved error catalog test coverage
268
+
269
+ v1.5.7
261
270
  Documentation updates for CJS compatibility
262
271
 
263
272
  Version alignment across README and changelog
@@ -17,26 +17,26 @@
17
17
  export interface AssertionResult {
18
18
  name: string;
19
19
  passed: boolean;
20
- expected: any;
21
- actual: any;
20
+ expected: unknown;
21
+ actual: unknown;
22
22
  message?: string;
23
23
  }
24
24
  export declare class AssertionError extends Error {
25
- expected: any;
26
- actual: any;
27
- constructor(message: string, expected: any, actual: any);
25
+ expected: unknown;
26
+ actual: unknown;
27
+ constructor(message: string, expected: unknown, actual: unknown);
28
28
  }
29
29
  /**
30
30
  * Fluent assertion builder
31
31
  */
32
32
  export declare class Expectation {
33
33
  private value;
34
- constructor(value: any);
34
+ constructor(value: unknown);
35
35
  /**
36
36
  * Assert value equals expected
37
37
  * @example expect(output).toEqual("Hello")
38
38
  */
39
- toEqual(expected: any, message?: string): AssertionResult;
39
+ toEqual(expected: unknown, message?: string): AssertionResult;
40
40
  /**
41
41
  * Assert value contains substring
42
42
  * @example expect(output).toContain("help")
@@ -71,7 +71,7 @@ export declare class Expectation {
71
71
  * Assert JSON matches schema
72
72
  * @example expect(output).toMatchJSON({ status: 'success' })
73
73
  */
74
- toMatchJSON(schema: Record<string, any>, message?: string): AssertionResult;
74
+ toMatchJSON(schema: Record<string, unknown>, message?: string): AssertionResult;
75
75
  /**
76
76
  * Assert value has expected sentiment
77
77
  * @example expect(output).toHaveSentiment('positive')
@@ -148,7 +148,7 @@ export declare class Expectation {
148
148
  * expect(output).toHaveLength({ min: 10, max: 100 });
149
149
  * ```
150
150
  */
151
- export declare function expect(value: any): Expectation;
151
+ export declare function expect(value: unknown): Expectation;
152
152
  /**
153
153
  * Run multiple assertions and collect results
154
154
  *
@@ -178,12 +178,12 @@ export declare function withinRange(value: number, min: number, max: number): bo
178
178
  export declare function isValidEmail(email: string): boolean;
179
179
  export declare function isValidURL(url: string): boolean;
180
180
  export declare function hasNoHallucinations(text: string, groundTruth: string[]): boolean;
181
- export declare function matchesSchema(value: any, schema: Record<string, any>): boolean;
181
+ export declare function matchesSchema(value: unknown, schema: Record<string, unknown>): boolean;
182
182
  export declare function hasReadabilityScore(text: string, minScore: number): boolean;
183
183
  export declare function containsLanguage(text: string, language: string): boolean;
184
184
  export declare function hasFactualAccuracy(text: string, facts: string[]): boolean;
185
185
  export declare function respondedWithinTime(startTime: number, maxMs: number): boolean;
186
186
  export declare function hasNoToxicity(text: string): boolean;
187
187
  export declare function followsInstructions(text: string, instructions: string[]): boolean;
188
- export declare function containsAllRequiredFields(obj: any, requiredFields: string[]): boolean;
188
+ export declare function containsAllRequiredFields(obj: unknown, requiredFields: string[]): boolean;
189
189
  export declare function hasValidCodeSyntax(code: string, language: string): boolean;
@@ -612,7 +612,7 @@ function followsInstructions(text, instructions) {
612
612
  });
613
613
  }
614
614
  function containsAllRequiredFields(obj, requiredFields) {
615
- return requiredFields.every((field) => field in obj);
615
+ return requiredFields.every((field) => obj && typeof obj === "object" && field in obj);
616
616
  }
617
617
  function hasValidCodeSyntax(code, language) {
618
618
  // This is a simplified implementation
package/dist/batch.d.ts CHANGED
@@ -6,13 +6,13 @@ export interface BatchRequest {
6
6
  id: string;
7
7
  method: string;
8
8
  endpoint: string;
9
- body?: any;
9
+ body?: unknown;
10
10
  headers?: Record<string, string>;
11
11
  }
12
12
  export interface BatchResponse {
13
13
  id: string;
14
14
  status: number;
15
- data?: any;
15
+ data?: unknown;
16
16
  error?: string;
17
17
  }
18
18
  /**
@@ -32,7 +32,7 @@ export declare class RequestBatcher {
32
32
  /**
33
33
  * Add request to batch queue
34
34
  */
35
- enqueue(method: string, endpoint: string, body?: any, headers?: Record<string, string>): Promise<any>;
35
+ enqueue(method: string, endpoint: string, body?: unknown, headers?: Record<string, string>): Promise<unknown>;
36
36
  /**
37
37
  * Schedule batch processing after delay
38
38
  */
package/dist/batch.js CHANGED
@@ -85,7 +85,7 @@ class RequestBatcher {
85
85
  }
86
86
  }
87
87
  }
88
- // Handle any requests that didn't get a response
88
+ // Handle any requests that didn't get a response
89
89
  for (const item of batch) {
90
90
  if (!responses.find((r) => r.id === item.id)) {
91
91
  item.reject(new Error("No response received for request"));
package/dist/cache.d.ts CHANGED
@@ -17,15 +17,15 @@ export declare class RequestCache {
17
17
  /**
18
18
  * Get cached response if valid
19
19
  */
20
- get<T>(method: string, url: string, params?: any): T | null;
20
+ get<T>(method: string, url: string, params?: unknown): T | null;
21
21
  /**
22
22
  * Store response in cache
23
23
  */
24
- set<T>(method: string, url: string, data: T, ttl: number, params?: any): void;
24
+ set<T>(method: string, url: string, data: T, ttl: number, params?: unknown): void;
25
25
  /**
26
26
  * Invalidate specific cache entry
27
27
  */
28
- invalidate(method: string, url: string, params?: any): void;
28
+ invalidate(method: string, url: string, params?: unknown): void;
29
29
  /**
30
30
  * Invalidate all cache entries matching a pattern
31
31
  */
package/dist/cache.js CHANGED
@@ -69,7 +69,7 @@ class RequestCache {
69
69
  * Invalidate all cache entries matching a pattern
70
70
  */
71
71
  invalidatePattern(pattern) {
72
- for (const key of this.cache.keys()) {
72
+ for (const key of Array.from(this.cache.keys())) {
73
73
  if (key.includes(pattern)) {
74
74
  this.cache.delete(key);
75
75
  }
@@ -0,0 +1,10 @@
1
+ /**
2
+ * evalai baseline — Baseline management commands
3
+ *
4
+ * Subcommands:
5
+ * evalai baseline init — Create a starter evals/baseline.json
6
+ * evalai baseline update — Run tests + update baseline with real scores
7
+ */
8
+ export declare function runBaselineInit(cwd: string): number;
9
+ export declare function runBaselineUpdate(cwd: string): number;
10
+ export declare function runBaseline(argv: string[]): number;
@@ -0,0 +1,172 @@
1
+ "use strict";
2
+ /**
3
+ * evalai baseline — Baseline management commands
4
+ *
5
+ * Subcommands:
6
+ * evalai baseline init — Create a starter evals/baseline.json
7
+ * evalai baseline update — Run tests + update baseline with real scores
8
+ */
9
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ var desc = Object.getOwnPropertyDescriptor(m, k);
12
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
13
+ desc = { enumerable: true, get: function() { return m[k]; } };
14
+ }
15
+ Object.defineProperty(o, k2, desc);
16
+ }) : (function(o, m, k, k2) {
17
+ if (k2 === undefined) k2 = k;
18
+ o[k2] = m[k];
19
+ }));
20
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
21
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
22
+ }) : function(o, v) {
23
+ o["default"] = v;
24
+ });
25
+ var __importStar = (this && this.__importStar) || (function () {
26
+ var ownKeys = function(o) {
27
+ ownKeys = Object.getOwnPropertyNames || function (o) {
28
+ var ar = [];
29
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
30
+ return ar;
31
+ };
32
+ return ownKeys(o);
33
+ };
34
+ return function (mod) {
35
+ if (mod && mod.__esModule) return mod;
36
+ var result = {};
37
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
38
+ __setModuleDefault(result, mod);
39
+ return result;
40
+ };
41
+ })();
42
+ Object.defineProperty(exports, "__esModule", { value: true });
43
+ exports.runBaselineInit = runBaselineInit;
44
+ exports.runBaselineUpdate = runBaselineUpdate;
45
+ exports.runBaseline = runBaseline;
46
+ const node_child_process_1 = require("node:child_process");
47
+ const fs = __importStar(require("node:fs"));
48
+ const path = __importStar(require("node:path"));
49
+ const BASELINE_REL = "evals/baseline.json";
50
+ /** Detect the package manager used in the project */
51
+ function detectPackageManager(cwd) {
52
+ if (fs.existsSync(path.join(cwd, "pnpm-lock.yaml")))
53
+ return "pnpm";
54
+ if (fs.existsSync(path.join(cwd, "yarn.lock")))
55
+ return "yarn";
56
+ return "npm";
57
+ }
58
+ /** Run an npm script via the detected package manager */
59
+ function runScript(cwd, scriptName) {
60
+ const pm = detectPackageManager(cwd);
61
+ const isWin = process.platform === "win32";
62
+ const result = (0, node_child_process_1.spawnSync)(pm, ["run", scriptName], {
63
+ cwd,
64
+ stdio: "inherit",
65
+ shell: isWin,
66
+ });
67
+ return result.status ?? 1;
68
+ }
69
+ function runBaselineInit(cwd) {
70
+ const baselinePath = path.join(cwd, BASELINE_REL);
71
+ if (fs.existsSync(baselinePath)) {
72
+ console.log(`⚠ ${BASELINE_REL} already exists. Delete it first or use 'evalai baseline update'.`);
73
+ return 1;
74
+ }
75
+ // Ensure evals/ directory exists
76
+ const evalsDir = path.join(cwd, "evals");
77
+ if (!fs.existsSync(evalsDir)) {
78
+ fs.mkdirSync(evalsDir, { recursive: true });
79
+ }
80
+ const user = process.env.USER || process.env.USERNAME || "unknown";
81
+ const now = new Date().toISOString();
82
+ const baseline = {
83
+ schemaVersion: 1,
84
+ description: "Regression gate baseline — created by evalai baseline init",
85
+ generatedAt: now,
86
+ generatedBy: user,
87
+ commitSha: "0000000",
88
+ updatedAt: now,
89
+ updatedBy: user,
90
+ tolerance: {
91
+ scoreDrop: 5,
92
+ passRateDrop: 5,
93
+ maxLatencyIncreaseMs: 200,
94
+ maxCostIncreaseUsd: 0.05,
95
+ },
96
+ goldenEval: {
97
+ score: 100,
98
+ passRate: 100,
99
+ totalCases: 3,
100
+ passedCases: 3,
101
+ },
102
+ qualityScore: {
103
+ overall: 90,
104
+ grade: "A",
105
+ accuracy: 85,
106
+ safety: 100,
107
+ latency: 90,
108
+ cost: 90,
109
+ consistency: 90,
110
+ },
111
+ confidenceTests: {
112
+ unitPassed: true,
113
+ unitTotal: 0,
114
+ dbPassed: true,
115
+ dbTotal: 0,
116
+ },
117
+ productMetrics: {},
118
+ };
119
+ fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
120
+ console.log(`✅ Created ${BASELINE_REL} with sample values\n`);
121
+ console.log("Next steps:");
122
+ console.log(` 1. Commit ${BASELINE_REL} to your repo`);
123
+ console.log(" 2. Run 'evalai baseline update' to populate with real scores");
124
+ console.log(" 3. Run 'evalai gate' to verify the regression gate\n");
125
+ return 0;
126
+ }
127
+ // ── baseline update ──
128
+ function runBaselineUpdate(cwd) {
129
+ // Check if eval:baseline-update script exists in package.json
130
+ const pkgPath = path.join(cwd, "package.json");
131
+ if (!fs.existsSync(pkgPath)) {
132
+ console.error("❌ No package.json found. Run this from your project root.");
133
+ return 1;
134
+ }
135
+ let pkg;
136
+ try {
137
+ pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
138
+ }
139
+ catch {
140
+ console.error("❌ Failed to parse package.json");
141
+ return 1;
142
+ }
143
+ if (!pkg.scripts?.["eval:baseline-update"]) {
144
+ console.error("❌ Missing 'eval:baseline-update' script in package.json.");
145
+ console.error(" Add it: \"eval:baseline-update\": \"npx tsx scripts/regression-gate.ts --update-baseline\"");
146
+ return 1;
147
+ }
148
+ console.log("📊 Running baseline update...\n");
149
+ return runScript(cwd, "eval:baseline-update");
150
+ }
151
+ // ── baseline router ──
152
+ function runBaseline(argv) {
153
+ const sub = argv[0];
154
+ const cwd = process.cwd();
155
+ if (sub === "init") {
156
+ return runBaselineInit(cwd);
157
+ }
158
+ if (sub === "update") {
159
+ return runBaselineUpdate(cwd);
160
+ }
161
+ console.log(`evalai baseline — Manage regression gate baselines
162
+
163
+ Usage:
164
+ evalai baseline init Create starter ${BASELINE_REL}
165
+ evalai baseline update Run tests and update baseline with real scores
166
+
167
+ Examples:
168
+ evalai baseline init
169
+ evalai baseline update
170
+ `);
171
+ return sub === "--help" || sub === "-h" ? 0 : 1;
172
+ }
@@ -81,7 +81,7 @@ function appendStepSummary(report) {
81
81
  const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 80);
82
82
  const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 80);
83
83
  const reason = out ? `got "${out}"` : "no output";
84
- lines.push(`- **${(0, snippet_1.truncateSnippet)(label, 60)}** — expected: ${exp || "(any)"}, ${reason}`);
84
+ lines.push(`- **${(0, snippet_1.truncateSnippet)(label, 60)}** — expected: ${exp || "(unknown)"}, ${reason}`);
85
85
  }
86
86
  if (failedCases.length > 10) {
87
87
  lines.push(`- _+ ${failedCases.length - 10} more_`);
@@ -30,7 +30,7 @@ function formatHuman(report) {
30
30
  const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 50);
31
31
  const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 50);
32
32
  const reason = out ? `got "${out}"` : "no output";
33
- lines.push(` - "${(0, snippet_1.truncateSnippet)(label, 50)}" → expected: ${exp || "(any)"}, ${reason}`);
33
+ lines.push(` - "${(0, snippet_1.truncateSnippet)(label, 50)}" → expected: ${exp || "(unknown)"}, ${reason}`);
34
34
  }
35
35
  if (failedCases.length > toShow.length) {
36
36
  lines.push(` + ${failedCases.length - toShow.length} more`);
@@ -49,7 +49,7 @@ function buildPrComment(report) {
49
49
  lines.push(`_${escapeMarkdown(report.reasonMessage)}_`);
50
50
  }
51
51
  lines.push("");
52
- // Policy (if any)
52
+ // Policy (if any)
53
53
  if (report.policy) {
54
54
  lines.push(`**Policy:** ${report.policy}`);
55
55
  lines.push("");
package/dist/cli/index.js CHANGED
@@ -8,9 +8,11 @@
8
8
  * evalai check — CI/CD evaluation gate (see evalai check --help)
9
9
  */
10
10
  Object.defineProperty(exports, "__esModule", { value: true });
11
+ const baseline_1 = require("./baseline");
11
12
  const check_1 = require("./check");
12
13
  const doctor_1 = require("./doctor");
13
14
  const init_1 = require("./init");
15
+ const regression_gate_1 = require("./regression-gate");
14
16
  const share_1 = require("./share");
15
17
  const argv = process.argv.slice(2);
16
18
  const subcommand = argv[0];
@@ -19,6 +21,14 @@ if (subcommand === "init") {
19
21
  const ok = (0, init_1.runInit)(cwd);
20
22
  process.exit(ok ? 0 : 1);
21
23
  }
24
+ else if (subcommand === "baseline") {
25
+ const code = (0, baseline_1.runBaseline)(argv.slice(1));
26
+ process.exit(code);
27
+ }
28
+ else if (subcommand === "gate") {
29
+ const code = (0, regression_gate_1.runGate)(argv.slice(1));
30
+ process.exit(code);
31
+ }
22
32
  else if (subcommand === "doctor") {
23
33
  (0, doctor_1.runDoctor)(argv.slice(1))
24
34
  .then((code) => process.exit(code))
@@ -57,10 +67,16 @@ else {
57
67
  console.log(`EvalAI CLI
58
68
 
59
69
  Usage:
60
- evalai init Create evalai.config.json
61
- evalai doctor [options] Verify CI/CD setup (same endpoint as check)
62
- evalai check [options] CI/CD evaluation gate
63
- evalai share [options] Create share link for a run
70
+ evalai init Create evalai.config.json
71
+ evalai baseline init Create starter evals/baseline.json
72
+ evalai baseline update Run tests and update baseline with real scores
73
+ evalai gate [options] Run regression gate (local test-based)
74
+ evalai doctor [options] Verify CI/CD setup (same endpoint as check)
75
+ evalai check [options] CI/CD evaluation gate (API-based)
76
+ evalai share [options] Create share link for a run
77
+
78
+ Options for gate:
79
+ --format <fmt> Output format: human (default), json, github
64
80
 
65
81
  Options for check:
66
82
  --evaluationId <id> Evaluation to gate on (or from config)
@@ -0,0 +1,11 @@
1
+ /**
2
+ * evalai gate — Run the regression gate
3
+ *
4
+ * Delegates to the project's eval:regression-gate npm script.
5
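+ * Supports --format json (prints evals/regression-report.json to stdout) and --format github (prints a Markdown delta table, also appended to $GITHUB_STEP_SUMMARY when set).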
6
+ */
7
+ export interface GateArgs {
8
+ format: "human" | "json" | "github";
9
+ }
10
+ export declare function parseGateArgs(argv: string[]): GateArgs;
11
+ export declare function runGate(argv: string[]): number;
@@ -0,0 +1,150 @@
1
+ "use strict";
2
+ /**
3
+ * evalai gate — Run the regression gate
4
+ *
5
+ * Delegates to the project's eval:regression-gate npm script.
6
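+ * Supports --format json (prints evals/regression-report.json to stdout) and --format github (prints a Markdown delta table, also appended to $GITHUB_STEP_SUMMARY when set).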
7
+ */
8
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
9
+ if (k2 === undefined) k2 = k;
10
+ var desc = Object.getOwnPropertyDescriptor(m, k);
11
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
12
+ desc = { enumerable: true, get: function() { return m[k]; } };
13
+ }
14
+ Object.defineProperty(o, k2, desc);
15
+ }) : (function(o, m, k, k2) {
16
+ if (k2 === undefined) k2 = k;
17
+ o[k2] = m[k];
18
+ }));
19
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
20
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
21
+ }) : function(o, v) {
22
+ o["default"] = v;
23
+ });
24
+ var __importStar = (this && this.__importStar) || (function () {
25
+ var ownKeys = function(o) {
26
+ ownKeys = Object.getOwnPropertyNames || function (o) {
27
+ var ar = [];
28
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
29
+ return ar;
30
+ };
31
+ return ownKeys(o);
32
+ };
33
+ return function (mod) {
34
+ if (mod && mod.__esModule) return mod;
35
+ var result = {};
36
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
37
+ __setModuleDefault(result, mod);
38
+ return result;
39
+ };
40
+ })();
41
+ Object.defineProperty(exports, "__esModule", { value: true });
42
+ exports.parseGateArgs = parseGateArgs;
43
+ exports.runGate = runGate;
44
+ const node_child_process_1 = require("node:child_process");
45
+ const fs = __importStar(require("node:fs"));
46
+ const path = __importStar(require("node:path"));
47
+ const REPORT_REL = "evals/regression-report.json";
48
+ /** Detect the package manager used in the project */
49
+ function detectPackageManager(cwd) {
50
+ if (fs.existsSync(path.join(cwd, "pnpm-lock.yaml")))
51
+ return "pnpm";
52
+ if (fs.existsSync(path.join(cwd, "yarn.lock")))
53
+ return "yarn";
54
+ return "npm";
55
+ }
56
+ function parseGateArgs(argv) {
57
+ const args = { format: "human" };
58
+ for (let i = 0; i < argv.length; i++) {
59
+ if (argv[i] === "--format" && argv[i + 1]) {
60
+ const fmt = argv[i + 1];
61
+ if (fmt === "json" || fmt === "github" || fmt === "human") {
62
+ args.format = fmt;
63
+ }
64
+ i++;
65
+ }
66
+ }
67
+ return args;
68
+ }
69
+ function runGate(argv) {
70
+ const cwd = process.cwd();
71
+ const args = parseGateArgs(argv);
72
+ // Check if eval:regression-gate script exists
73
+ const pkgPath = path.join(cwd, "package.json");
74
+ if (!fs.existsSync(pkgPath)) {
75
+ console.error("❌ No package.json found. Run this from your project root.");
76
+ return 1;
77
+ }
78
+ let pkg;
79
+ try {
80
+ pkg = JSON.parse(fs.readFileSync(pkgPath, "utf-8"));
81
+ }
82
+ catch {
83
+ console.error("❌ Failed to parse package.json");
84
+ return 1;
85
+ }
86
+ if (!pkg.scripts?.["eval:regression-gate"]) {
87
+ console.error("❌ Missing 'eval:regression-gate' script in package.json.");
88
+ console.error(' Add it: "eval:regression-gate": "npx tsx scripts/regression-gate.ts"');
89
+ return 1;
90
+ }
91
+ const pm = detectPackageManager(cwd);
92
+ const isWin = process.platform === "win32";
93
+ // For json format, suppress human output and print report JSON
94
+ const stdio = args.format === "json" ? "pipe" : "inherit";
95
+ const result = (0, node_child_process_1.spawnSync)(pm, ["run", "eval:regression-gate"], {
96
+ cwd,
97
+ stdio: stdio,
98
+ shell: isWin,
99
+ });
100
+ const exitCode = result.status ?? 1;
101
+ if (args.format === "json") {
102
+ // Output the regression report as JSON
103
+ const reportPath = path.join(cwd, REPORT_REL);
104
+ if (fs.existsSync(reportPath)) {
105
+ const report = fs.readFileSync(reportPath, "utf-8");
106
+ process.stdout.write(report);
107
+ }
108
+ else {
109
+ console.error(JSON.stringify({ error: "regression-report.json not found", exitCode }));
110
+ }
111
+ }
112
+ else if (args.format === "github") {
113
+ // Output GitHub Step Summary markdown
114
+ const reportPath = path.join(cwd, REPORT_REL);
115
+ if (fs.existsSync(reportPath)) {
116
+ try {
117
+ const report = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
118
+ const icon = report.passed ? "✅" : "❌";
119
+ const lines = [
120
+ `## ${icon} Regression Gate: ${report.category}`,
121
+ "",
122
+ "| Metric | Baseline | Current | Delta | Status |",
123
+ "|--------|----------|---------|-------|--------|",
124
+ ];
125
+ for (const d of report.deltas ?? []) {
126
+ const statusIcon = d.status === "pass" ? "✅" : "❌";
127
+ lines.push(`| ${d.metric} | ${d.baseline} | ${d.current} | ${d.delta} | ${statusIcon} |`);
128
+ }
129
+ if (report.failures?.length > 0) {
130
+ lines.push("", "### Failures", "");
131
+ for (const f of report.failures) {
132
+ lines.push(`- ${f}`);
133
+ }
134
+ }
135
+ lines.push("", `Schema version: ${report.schemaVersion ?? "unknown"}`);
136
+ const md = lines.join("\n");
137
+ // Write to $GITHUB_STEP_SUMMARY if available
138
+ const summaryPath = process.env.GITHUB_STEP_SUMMARY;
139
+ if (summaryPath) {
140
+ fs.appendFileSync(summaryPath, `${md}\n`);
141
+ }
142
+ console.log(md);
143
+ }
144
+ catch {
145
+ // Fall through — human output already printed
146
+ }
147
+ }
148
+ }
149
+ return exitCode;
150
+ }