@evalgate/sdk 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,43 @@ All notable changes to the @evalgate/sdk package will be documented in this file
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.2.0] - 2026-03-03
9
+
10
+ ### Breaking
11
+
12
+ - **`snapshot(output, name)` → `snapshot(name, output)`** — parameter order swapped to match natural call convention (`name` first, value second, same as `test('name', fn)`). Update any existing `snapshot(output, 'label')` calls to `snapshot('label', output)`.
13
+
14
+ ### Added
15
+
16
+ - **`expect().not` modifier** — `expect('drop table').not.toContain('drop table')` now works; negates `passed` on any chained assertion via Proxy
17
+ - **`hasPII(text)`** — semantic inverse of `notContainsPII`; returns `true` when PII is detected (email, phone, SSN, IP). Exported from main package. Eliminates double-negative confusion.
18
+ - **`defineSuite` object form** — now accepts both `defineSuite(name, [...fns])` and `defineSuite({ name, specs: [...fns] })`. README updated with examples.
19
+
20
+ ### Fixed
21
+
22
+ - **`specId` collision** — all specs in `eval/` directory shared the same 8-char ID (`ZXZhbC9j`). Root cause: short base64 prefix was identical for any path starting with `eval/c`. Fixed: SHA-256 hex (16 chars) in `discover.ts`.
23
+ - **`explain` UNKNOWN verdict** — `evalgate explain` showed `Verdict: UNKNOWN` when reading `.evalgate/last-run.json`. Added `RunResult` format detection (`results[]` + `summary`). Added `.evalgate/last-run.json` and `.evalgate/runs/latest.json` to auto-search paths. Passing runs now show clean `✅ PASS` with no spurious "Run doctor" suggestions.
24
+ - **`print-config` baseUrl default** — was `http://localhost:3000`; now `https://api.evalgate.com` to match `evalgate doctor`.
25
+ - **`baseline update` self-contained** — no longer requires a custom `eval:baseline-update` npm script. Falls back to built-in mode (runs the detected package manager's `test` script, stamps baseline) if no script is present.
26
+ - **`notContainsPII` phone regex** — broadened to cover `555-123-4567`, `555.123.4567`, and `555 123 4567` formats. JSDoc clarified: `false` = PII found (unsafe), `true` = no PII (safe).
27
+ - **`impact-analysis` git error** — replaced raw `git diff --help` wall-of-text with clean targeted messages: `Not a git repository`, `Base branch 'X' not found. Fetch it first`, or generic exit-code message.
28
+ - **README quickstart** — both `defineEval` examples now include an `executor` function. Running the quickstart no longer throws `Executor must be a function`.
29
+ - **`snapshot` module docstring** — updated `@example` to reflect new `(name, output)` parameter order.
30
+
31
+ ---
32
+
33
+ ## [2.1.3] - 2026-03-02
34
+
35
+ ### Fixed
36
+
37
+ - **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
38
+ - **High:** First-run gate false regression on fresh init when no test script exists
39
+ - **High:** Doctor defaults baseUrl to localhost:3000 instead of production API
40
+ - **Critical:** Simulated executeSpec replaced with real spec execution
41
+ - **High:** Run scores now include scoring model context for clarity
42
+ - **Low:** Explain no longer shows "unnamed" for builtin gate failures
43
+ - **Docs:** Added missing `discover --manifest` step to local quickstart
44
+
8
45
  ## [2.1.2] - 2026-03-02
9
46
 
10
47
  ### Fixed
package/README.md CHANGED
@@ -40,13 +40,20 @@ Create `eval/your-spec.spec.ts`:
40
40
  ```typescript
41
41
  import { defineEval } from "@evalgate/sdk";
42
42
 
43
+ defineEval("Basic Math Operations", async () => {
44
+ const result = 1 + 1;
45
+ return { pass: result === 2, score: result === 2 ? 100 : 0 };
46
+ });
47
+
48
+ // Object form (with metadata):
43
49
  defineEval({
44
- name: "Basic Math Operations",
45
- description: "Test fundamental arithmetic",
46
- prompt: "Test: 1+1=2, string concatenation, array includes",
47
- expected: "All tests should pass",
50
+ name: "String concatenation",
51
+ description: "Test string operations",
48
52
  tags: ["basic", "math"],
49
- category: "unit-test"
53
+ executor: async () => {
54
+ const result = "hello" + " world";
55
+ return { pass: result === "hello world", score: 100 };
56
+ },
50
57
  });
51
58
  ```
52
59
 
@@ -254,6 +261,46 @@ All commands automatically write artifacts so `explain` works with zero flags.
254
261
  npm install @evalgate/sdk openai
255
262
  ```
256
263
 
264
+ Create `eval/your-spec.spec.ts`:
265
+
266
+ ```typescript
267
+ import { defineEval, defineSuite } from "@evalgate/sdk";
268
+
269
+ defineEval("Basic Math Operations", async () => {
270
+ const result = 1 + 1;
271
+ return { pass: result === 2, score: result === 2 ? 100 : 0 };
272
+ });
273
+
274
+ // Object form (with metadata):
275
+ defineEval({
276
+ name: "String concatenation",
277
+ description: "Test string operations",
278
+ tags: ["basic", "math"],
279
+ executor: async () => {
280
+ const result = "hello" + " world";
281
+ return { pass: result === "hello world", score: 100 };
282
+ },
283
+ });
284
+
285
+ // Suite form — group related specs:
286
+ defineSuite("Math suite", [
287
+ () => defineEval("addition", async () => ({ pass: 1 + 1 === 2, score: 100 })),
288
+ () => defineEval("subtraction", async () => ({ pass: 5 - 3 === 2, score: 100 })),
289
+ ]);
290
+ ```
291
+
292
+ ```bash
293
+ # Discover specs and generate manifest
294
+ npx @evalgate/sdk discover
295
+ npx @evalgate/sdk discover --manifest
296
+
297
+ # Run evaluations
298
+ npx @evalgate/sdk run --write-results
299
+
300
+ # Run local regression gate
301
+ npx @evalgate/sdk gate
302
+ ```
303
+
257
304
  ```typescript
258
305
  import { openAIChatEval } from "@evalgate/sdk";
259
306
 
@@ -32,6 +32,11 @@ export declare class AssertionError extends Error {
32
32
  export declare class Expectation {
33
33
  private value;
34
34
  constructor(value: unknown);
35
+ /**
36
+ * Negate the next assertion — inverts `passed` on any chained method.
37
+ * @example expect('drop table').not.toContain('drop table')
38
+ */
39
+ get not(): Expectation;
35
40
  /**
36
41
  * Assert value equals expected
37
42
  * @example expect(output).toEqual("Hello")
@@ -171,7 +176,23 @@ export declare function hasLength(text: string, range: {
171
176
  max?: number;
172
177
  }): boolean;
173
178
  export declare function containsJSON(text: string): boolean;
179
+ /**
180
+ * Returns `true` when the text is PII-free (safe to use), `false` when PII is detected.
181
+ *
182
+ * @example
183
+ * if (!notContainsPII(response)) throw new Error("PII leak detected");
184
+ * // Or use the clearer alias:
185
+ * if (hasPII(response)) throw new Error("PII leak detected");
186
+ */
174
187
  export declare function notContainsPII(text: string): boolean;
188
+ /**
189
+ * Returns `true` when PII is detected in the text (unsafe), `false` when safe.
190
+ * This is the semantic inverse of `notContainsPII` and may be easier to reason about.
191
+ *
192
+ * @example
193
+ * if (hasPII(response)) throw new Error("PII leak");
194
+ */
195
+ export declare function hasPII(text: string): boolean;
175
196
  export declare function hasSentiment(text: string, expected: "positive" | "negative" | "neutral"): boolean;
176
197
  export declare function similarTo(text1: string, text2: string, threshold?: number): boolean;
177
198
  export declare function withinRange(value: number, min: number, max: number): boolean;
@@ -24,6 +24,7 @@ exports.matchesPattern = matchesPattern;
24
24
  exports.hasLength = hasLength;
25
25
  exports.containsJSON = containsJSON;
26
26
  exports.notContainsPII = notContainsPII;
27
+ exports.hasPII = hasPII;
27
28
  exports.hasSentiment = hasSentiment;
28
29
  exports.similarTo = similarTo;
29
30
  exports.withinRange = withinRange;
@@ -56,6 +57,28 @@ class Expectation {
56
57
  constructor(value) {
57
58
  this.value = value;
58
59
  }
60
+ /**
61
+ * Negate the next assertion — inverts `passed` on any chained method.
62
+ * @example expect('drop table').not.toContain('drop table')
63
+ */
64
+ get not() {
65
+ const value = this.value;
66
+ return new Proxy(new Expectation(value), {
67
+ get(target, prop) {
68
+ const orig = target[prop];
69
+ if (typeof orig === "function" && prop !== "constructor") {
70
+ return (...args) => {
71
+ const result = orig.call(target, ...args);
72
+ if (result && typeof result === "object" && "passed" in result) {
73
+ return { ...result, passed: !result.passed };
74
+ }
75
+ return result;
76
+ };
77
+ }
78
+ return orig;
79
+ },
80
+ });
81
+ }
59
82
  /**
60
83
  * Assert value equals expected
61
84
  * @example expect(output).toEqual("Hello")
@@ -539,17 +562,35 @@ function containsJSON(text) {
539
562
  return false;
540
563
  }
541
564
  }
565
+ /**
566
+ * Returns `true` when the text is PII-free (safe to use), `false` when PII is detected.
567
+ *
568
+ * @example
569
+ * if (!notContainsPII(response)) throw new Error("PII leak detected");
570
+ * // Or use the clearer alias:
571
+ * if (hasPII(response)) throw new Error("PII leak detected");
572
+ */
542
573
  function notContainsPII(text) {
543
574
  // Simple PII detection patterns
544
575
  const piiPatterns = [
545
576
  /\b\d{3}-\d{2}-\d{4}\b/, // SSN
546
577
  /\b\d{3}\.\d{3}\.\d{4}\b/, // SSN with dots
547
- /\b\d{10}\b/, // Phone number
548
- /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/, // Email
578
+ /\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b/, // Phone (various formats)
579
+ /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/, // Email
549
580
  /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/, // IP address
550
581
  ];
551
582
  return !piiPatterns.some((pattern) => pattern.test(text));
552
583
  }
584
+ /**
585
+ * Returns `true` when PII is detected in the text (unsafe), `false` when safe.
586
+ * This is the semantic inverse of `notContainsPII` and may be easier to reason about.
587
+ *
588
+ * @example
589
+ * if (hasPII(response)) throw new Error("PII leak");
590
+ */
591
+ function hasPII(text) {
592
+ return !notContainsPII(text);
593
+ }
553
594
  function hasSentiment(text, expected) {
554
595
  // This is a simplified implementation
555
596
  const positiveWords = ["good", "great", "excellent", "awesome"];
@@ -126,7 +126,6 @@ function runBaselineInit(cwd) {
126
126
  }
127
127
  // ── baseline update ──
128
128
  function runBaselineUpdate(cwd) {
129
- // Check if eval:baseline-update script exists in package.json
130
129
  const pkgPath = path.join(cwd, "package.json");
131
130
  if (!fs.existsSync(pkgPath)) {
132
131
  console.error("❌ No package.json found. Run this from your project root.");
@@ -140,13 +139,39 @@ function runBaselineUpdate(cwd) {
140
139
  console.error("❌ Failed to parse package.json");
141
140
  return 1;
142
141
  }
143
- if (!pkg.scripts?.["eval:baseline-update"]) {
144
- console.error("❌ Missing 'eval:baseline-update' script in package.json.");
145
- console.error(' Add it: "eval:baseline-update": "npx tsx scripts/regression-gate.ts --update-baseline"');
142
+ // Use custom script if available
143
+ if (pkg.scripts?.["eval:baseline-update"]) {
144
+ console.log("📊 Running baseline update (custom script)...\n");
145
+ return runScript(cwd, "eval:baseline-update");
146
+ }
147
+ // Self-contained built-in mode: run the test suite then stamp the baseline
148
+ console.log("📊 Running baseline update (built-in mode)...\n");
149
+ const pm = detectPackageManager(cwd);
150
+ const isWin = process.platform === "win32";
151
+ const testResult = (0, node_child_process_1.spawnSync)(pm, ["test"], {
152
+ cwd,
153
+ stdio: "inherit",
154
+ shell: isWin,
155
+ });
156
+ const baselinePath = path.join(cwd, BASELINE_REL);
157
+ if (!fs.existsSync(baselinePath)) {
158
+ console.error("❌ No baseline found. Run 'evalgate baseline init' first.");
159
+ return 1;
160
+ }
161
+ try {
162
+ const baseline = JSON.parse(fs.readFileSync(baselinePath, "utf-8"));
163
+ baseline.updatedAt = new Date().toISOString();
164
+ baseline.updatedBy = process.env.USER || process.env.USERNAME || "unknown";
165
+ baseline.confidenceTests = baseline.confidenceTests ?? {};
166
+ baseline.confidenceTests.unitPassed = testResult.status === 0;
167
+ fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
168
+ console.log("\n✅ Baseline updated successfully");
169
+ }
170
+ catch {
171
+ console.error("❌ Failed to update baseline file");
146
172
  return 1;
147
173
  }
148
- console.log("📊 Running baseline update...\n");
149
- return runScript(cwd, "eval:baseline-update");
174
+ return testResult.status ?? 1;
150
175
  }
151
176
  // ── baseline router ──
152
177
  function runBaseline(argv) {
@@ -59,6 +59,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
59
59
  exports.discoverSpecs = discoverSpecs;
60
60
  exports.printDiscoveryResults = printDiscoveryResults;
61
61
  exports.runDiscover = runDiscover;
62
+ const crypto = __importStar(require("node:crypto"));
62
63
  const fs = __importStar(require("node:fs/promises"));
63
64
  const path = __importStar(require("node:path"));
64
65
  const execution_mode_1 = require("../runtime/execution-mode");
@@ -145,8 +146,8 @@ async function analyzeSpecifications(specFiles) {
145
146
  for (const filePath of specFiles) {
146
147
  try {
147
148
  const content = await fs.readFile(filePath, "utf-8");
148
- const analysis = analyzeSpecFile(filePath, content);
149
- specs.push(analysis);
149
+ const fileSpecs = analyzeSpecFile(filePath, content);
150
+ specs.push(...fileSpecs);
150
151
  }
151
152
  catch (error) {
152
153
  console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
@@ -155,20 +156,40 @@ async function analyzeSpecifications(specFiles) {
155
156
  return specs;
156
157
  }
157
158
  /**
158
- * Analyze a single specification file
159
+ * Extract all spec names from file content (handles both call forms)
160
+ */
161
+ function extractSpecNames(content) {
162
+ const names = [];
163
+ // Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
164
+ const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
165
+ let m = stringArgPattern.exec(content);
166
+ while (m !== null) {
167
+ names.push(m[1]);
168
+ m = stringArgPattern.exec(content);
169
+ }
170
+ if (names.length > 0)
171
+ return names;
172
+ // Form 2: defineEval({ name: "..." }) — object-first form
173
+ const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
174
+ m = objNamePattern.exec(content);
175
+ while (m !== null) {
176
+ names.push(m[1]);
177
+ m = objNamePattern.exec(content);
178
+ }
179
+ return names;
180
+ }
181
+ /**
182
+ * Analyze a single specification file — returns one SpecAnalysis per defineEval call
159
183
  */
160
184
  function analyzeSpecFile(filePath, content) {
161
- // Extract defineEval calls
162
- const defineEvalMatches = content.match(/defineEval\s*\([^)]+\)/g) || [];
163
- const specNames = defineEvalMatches.map((match) => {
164
- const nameMatch = match.match(/["'`](.+?)["'`](?:\s*,|\s*\))/);
165
- return nameMatch ? nameMatch[1] : "unnamed";
166
- });
167
- // Extract tags
185
+ const specNames = extractSpecNames(content);
186
+ // Fallback: file matched as a spec file but we couldn't parse names
187
+ if (specNames.length === 0) {
188
+ specNames.push(path.basename(filePath, path.extname(filePath)));
189
+ }
190
+ // Shared analysis for the file
168
191
  const tags = extractTags(content);
169
- // Analyze complexity
170
192
  const complexity = analyzeComplexity(content);
171
- // Check for models and tools
172
193
  const usesModels = content.includes("model:") ||
173
194
  content.includes("model=") ||
174
195
  content.includes("openai") ||
@@ -176,22 +197,20 @@ function analyzeSpecFile(filePath, content) {
176
197
  const usesTools = content.includes("tool:") ||
177
198
  content.includes("function.") ||
178
199
  content.includes("call(");
179
- // Check for assertions
180
200
  const hasAssertions = content.includes("assert") ||
181
201
  content.includes("expect") ||
182
202
  content.includes("should");
183
- // Generate ID from file path
184
- const id = generateSpecId(filePath);
185
- return {
186
- id,
187
- name: specNames[0] || path.basename(filePath, ".ts"),
188
- file: path.relative(process.cwd(), filePath),
203
+ const relFile = path.relative(process.cwd(), filePath);
204
+ return specNames.map((name, idx) => ({
205
+ id: generateSpecId(filePath, name, idx),
206
+ name,
207
+ file: relFile,
189
208
  tags,
190
209
  hasAssertions,
191
210
  usesModels,
192
211
  usesTools,
193
212
  complexity,
194
- };
213
+ }));
195
214
  }
196
215
  /**
197
216
  * Extract tags from specification content
@@ -263,15 +282,14 @@ function analyzeComplexity(content) {
263
282
  return "complex";
264
283
  }
265
284
  /**
266
- * Generate specification ID from file path
285
+ * Generate specification ID from file path + name + index (unique per defineEval call)
267
286
  */
268
- function generateSpecId(filePath) {
269
- const relativePath = path.relative(process.cwd(), filePath);
270
- const hash = Buffer.from(relativePath)
271
- .toString("base64")
272
- .replace(/[+/=]/g, "")
273
- .slice(0, 8);
274
- return hash;
287
+ function generateSpecId(filePath, name, index) {
288
+ const relativePath = path
289
+ .relative(process.cwd(), filePath)
290
+ .replace(/\\/g, "/");
291
+ const key = `${relativePath}|${name}|${index}`;
292
+ return crypto.createHash("sha256").update(key).digest("hex").slice(0, 16);
275
293
  }
276
294
  /**
277
295
  * Calculate discovery statistics
@@ -96,7 +96,7 @@ function parseFlags(argv) {
96
96
  const baseUrl = raw.baseUrl ||
97
97
  process.env.EVALGATE_BASE_URL ||
98
98
  process.env.EVALAI_BASE_URL ||
99
- "http://localhost:3000";
99
+ "https://api.evalgate.com";
100
100
  const apiKey = raw.apiKey ||
101
101
  process.env.EVALGATE_API_KEY ||
102
102
  process.env.EVALAI_API_KEY ||
@@ -84,6 +84,8 @@ const REPORT_SEARCH_PATHS = [
84
84
  "evals/regression-report.json",
85
85
  ".evalgate/last-report.json",
86
86
  ".evalgate/last_report.json",
87
+ ".evalgate/last-run.json",
88
+ ".evalgate/runs/latest.json",
87
89
  ];
88
90
  function findReport(cwd, explicitPath) {
89
91
  if (explicitPath) {
@@ -354,13 +356,78 @@ function suggestFixes(causes) {
354
356
  }
355
357
  // ── Build explain output ──
356
358
  function buildExplainOutput(report, reportPath) {
357
- // Support both CheckReport (from evalgate check) and BuiltinReport (from evalgate gate)
359
+ // Support RunResult (from evalgate run) has schemaVersion + results[] + summary
360
+ const isRunResult = "results" in report &&
361
+ Array.isArray(report.results) &&
362
+ "summary" in report &&
363
+ report.summary !== null &&
364
+ typeof report.summary === "object";
365
+ if (isRunResult) {
366
+ return buildFromRunResult(report, reportPath);
367
+ }
368
+ // Support BuiltinReport (from evalgate gate)
358
369
  const isBuiltinReport = "category" in report && "deltas" in report;
359
370
  if (isBuiltinReport) {
360
371
  return buildFromBuiltinReport(report, reportPath);
361
372
  }
362
373
  return buildFromCheckReport(report, reportPath);
363
374
  }
375
+ function buildFromRunResult(report, reportPath) {
376
+ const summary = report.summary;
377
+ const results = report.results ?? [];
378
+ const passed = summary.failed === 0;
379
+ // Top failures
380
+ const failures = results.filter((r) => r.result.status === "failed");
381
+ const topFailures = failures.slice(0, 3).map((r, i) => ({
382
+ rank: i + 1,
383
+ name: r.name,
384
+ filePath: r.filePath,
385
+ reason: r.result.error,
386
+ }));
387
+ // Changes: pass rate
388
+ const changes = [
389
+ {
390
+ metric: "Pass rate",
391
+ baseline: "—",
392
+ current: `${Math.round(summary.passRate * 100)}%`,
393
+ direction: passed ? "same" : "worse",
394
+ },
395
+ ];
396
+ // For passing runs, emit nothing so no misleading "Run doctor" suggestions appear
397
+ if (passed) {
398
+ return {
399
+ verdict: "pass",
400
+ reasonMessage: `All ${summary.passed} spec${summary.passed === 1 ? "" : "s"} passed`,
401
+ topFailures: [],
402
+ totalFailures: 0,
403
+ changes,
404
+ rootCauses: [],
405
+ suggestedFixes: [],
406
+ reportPath,
407
+ };
408
+ }
409
+ // Classify root cause by inspecting error messages
410
+ const errorText = failures
411
+ .map((r) => (r.result.error ?? "").toLowerCase())
412
+ .join(" ");
413
+ const rootCauses = [];
414
+ if (errorText.includes("pii") || errorText.includes("safety"))
415
+ rootCauses.push("safety_regression");
416
+ if (errorText.includes("tool") || errorText.includes("function_call"))
417
+ rootCauses.push("tool_use_drift");
418
+ if (rootCauses.length === 0)
419
+ rootCauses.push("prompt_drift");
420
+ return {
421
+ verdict: "fail",
422
+ reasonMessage: `${summary.failed} of ${results.length} spec${results.length === 1 ? "" : "s"} failed`,
423
+ topFailures,
424
+ totalFailures: failures.length,
425
+ changes,
426
+ rootCauses,
427
+ suggestedFixes: suggestFixes(rootCauses),
428
+ reportPath,
429
+ };
430
+ }
364
431
  function buildFromCheckReport(report, reportPath) {
365
432
  const failedCases = report.failedCases ?? [];
366
433
  // Top failures (up to 3)
@@ -430,6 +497,7 @@ function buildFromBuiltinReport(report, reportPath) {
430
497
  }));
431
498
  const topFailures = failures.slice(0, 3).map((f, i) => ({
432
499
  rank: i + 1,
500
+ name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
433
501
  reason: f,
434
502
  }));
435
503
  // Simple root cause for builtin reports
@@ -109,7 +109,18 @@ async function getChangedFiles(baseBranch) {
109
109
  });
110
110
  git.on("close", (code) => {
111
111
  if (code !== 0) {
112
- reject(new Error(`Git diff failed: ${error}`));
112
+ const lowerError = error.toLowerCase();
113
+ if (lowerError.includes("not a git repository") ||
114
+ lowerError.includes("fatal: not a git")) {
115
+ reject(new Error("Not a git repository. Run 'git init' or run evalgate from inside a git repo."));
116
+ }
117
+ else if (lowerError.includes("unknown revision") ||
118
+ lowerError.includes("bad revision")) {
119
+ reject(new Error(`Base branch '${baseBranch}' not found. Fetch it first: git fetch origin ${baseBranch}`));
120
+ }
121
+ else {
122
+ reject(new Error(`Git diff failed (exit ${code}). Ensure git is installed and '${baseBranch}' exists.`));
123
+ }
113
124
  return;
114
125
  }
115
126
  const files = output
@@ -138,7 +138,7 @@ function buildResolvedConfig(cwd, flags) {
138
138
  value: flags.baseUrl ||
139
139
  envBaseUrl ||
140
140
  fileConfig?.baseUrl ||
141
- "http://localhost:3000",
141
+ "https://api.evalgate.com",
142
142
  source: baseUrlSource,
143
143
  });
144
144
  // apiKey (always redacted)
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
94
94
  }
95
95
  return "unknown";
96
96
  }
97
+ function hasTestScript(cwd) {
98
+ try {
99
+ const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
100
+ const script = pkg.scripts?.test ?? "";
101
+ return !!script && script !== 'echo "Error: no test specified" && exit 1';
102
+ }
103
+ catch {
104
+ return false;
105
+ }
106
+ }
97
107
  function runBuiltinGate(cwd) {
98
108
  const t0 = Date.now();
99
109
  const baselinePath = path.join(cwd, BASELINE_REL);
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
101
111
  const pm = detectPackageManager(cwd);
102
112
  const command = `${pm} test`;
103
113
  const runner = detectRunner(cwd);
114
+ const projectHasTestScript = hasTestScript(cwd);
104
115
  // Load baseline
105
116
  if (!fs.existsSync(baselinePath)) {
106
117
  return {
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
165
176
  const baselineTotal = baselineData.confidenceTests?.total ?? 0;
166
177
  const failures = [];
167
178
  const deltas = [];
168
- // Delta: tests passing
169
- deltas.push({
170
- metric: "tests_passing",
171
- baseline: baselinePassed,
172
- current: testsPassed,
173
- delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
174
- status: testsPassed ? "pass" : "fail",
175
- });
176
- if (!testsPassed && baselinePassed) {
177
- failures.push("Tests were passing in baseline but are now failing");
179
+ // Delta: tests passing — only meaningful when a test script exists
180
+ if (projectHasTestScript) {
181
+ deltas.push({
182
+ metric: "tests_passing",
183
+ baseline: baselinePassed,
184
+ current: testsPassed,
185
+ delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
186
+ status: testsPassed ? "pass" : "fail",
187
+ });
188
+ if (!testsPassed && baselinePassed) {
189
+ failures.push("Tests were passing in baseline but are now failing");
190
+ }
178
191
  }
179
192
  // Delta: test count (only if we captured counts)
180
193
  if (testCount > 0 || baselineTotal > 0) {
package/dist/cli/run.js CHANGED
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
52
52
  const node_child_process_1 = require("node:child_process");
53
53
  const fs = __importStar(require("node:fs/promises"));
54
54
  const path = __importStar(require("node:path"));
55
+ const registry_1 = require("../runtime/registry");
55
56
  const impact_analysis_1 = require("./impact-analysis");
56
57
  /**
57
58
  * Generate deterministic run ID
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
138
139
  }
139
140
  }
140
141
  /**
141
- * Execute specifications
142
+ * Execute specifications — grouped by file to avoid redundant loads
142
143
  */
143
144
  async function executeSpecs(specs) {
144
- const results = [];
145
+ // Group specs by their absolute file path
146
+ const specsByFile = new Map();
145
147
  for (const spec of specs) {
146
- const result = await executeSpec(spec);
147
- results.push(result);
148
+ const abs = path.isAbsolute(spec.filePath)
149
+ ? spec.filePath
150
+ : path.join(process.cwd(), spec.filePath);
151
+ const group = specsByFile.get(abs) ?? [];
152
+ group.push(spec);
153
+ specsByFile.set(abs, group);
148
154
  }
149
- return results;
150
- }
151
- /**
152
- * Execute individual specification
153
- */
154
- async function executeSpec(spec) {
155
- const startTime = Date.now();
156
- try {
157
- // For now, simulate execution
158
- // In a real implementation, this would:
159
- // 1. Load the spec file
160
- // 2. Execute the defineEval function
161
- // 3. Capture the result
162
- // Simulate some work
163
- await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
164
- // Simulate success/failure (90% success rate for demo)
165
- const success = Math.random() > 0.1;
166
- const duration = Date.now() - startTime;
167
- if (success) {
168
- return {
169
- specId: spec.id,
170
- name: spec.name,
171
- filePath: spec.filePath,
172
- result: {
173
- status: "passed",
174
- score: Math.random() * 0.3 + 0.7, // 0.7-1.0
175
- duration,
176
- },
177
- };
155
+ const results = [];
156
+ for (const [absPath, fileSpecs] of specsByFile) {
157
+ // Fresh runtime per file to avoid cross-file contamination
158
+ (0, registry_1.disposeActiveRuntime)();
159
+ try {
160
+ // Bust require cache so the file re-executes its defineEval calls
161
+ delete require.cache[require.resolve(absPath)];
178
162
  }
179
- else {
180
- return {
181
- specId: spec.id,
182
- name: spec.name,
183
- filePath: spec.filePath,
184
- result: {
185
- status: "failed",
186
- error: "Simulated execution failure",
187
- duration,
188
- },
189
- };
163
+ catch {
164
+ // Not in cache yet — fine
165
+ }
166
+ try {
167
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
168
+ require(absPath);
169
+ }
170
+ catch (loadError) {
171
+ const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
172
+ const msg = isTs &&
173
+ loadError instanceof Error &&
174
+ (loadError.message.includes("Unknown file extension") ||
175
+ loadError.message.includes("SyntaxError"))
176
+ ? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
177
+ : loadError instanceof Error
178
+ ? loadError.message
179
+ : String(loadError);
180
+ for (const spec of fileSpecs) {
181
+ results.push(makeErrorResult(spec, msg, 0));
182
+ }
183
+ continue;
184
+ }
185
+ const runtime = (0, registry_1.getActiveRuntime)();
186
+ const registered = runtime.list();
187
+ for (const spec of fileSpecs) {
188
+ const registeredSpec = registered.find((r) => r.name === spec.name);
189
+ if (!registeredSpec) {
190
+ results.push({
191
+ specId: spec.id,
192
+ name: spec.name,
193
+ filePath: spec.filePath,
194
+ result: {
195
+ status: "skipped",
196
+ error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
197
+ duration: 0,
198
+ },
199
+ });
200
+ continue;
201
+ }
202
+ const startTime = Date.now();
203
+ try {
204
+ const evalResult = await registeredSpec.executor({ input: "" });
205
+ results.push({
206
+ specId: spec.id,
207
+ name: spec.name,
208
+ filePath: spec.filePath,
209
+ result: {
210
+ status: evalResult.pass ? "passed" : "failed",
211
+ score: typeof evalResult.score === "number"
212
+ ? evalResult.score / 100
213
+ : undefined,
214
+ error: evalResult.error,
215
+ duration: Date.now() - startTime,
216
+ },
217
+ });
218
+ }
219
+ catch (execError) {
220
+ results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
221
+ }
190
222
  }
191
223
  }
192
- catch (error) {
193
- return {
194
- specId: spec.id,
195
- name: spec.name,
196
- filePath: spec.filePath,
197
- result: {
198
- status: "failed",
199
- error: error instanceof Error ? error.message : String(error),
200
- duration: Date.now() - startTime,
201
- },
202
- };
203
- }
224
+ return results;
225
+ }
226
+ function makeErrorResult(spec, error, duration) {
227
+ return {
228
+ specId: spec.id,
229
+ name: spec.name,
230
+ filePath: spec.filePath,
231
+ result: { status: "failed", error, duration },
232
+ };
204
233
  }
205
234
  /**
206
235
  * Calculate summary statistics
@@ -348,7 +377,8 @@ function printHumanResults(result) {
348
377
  console.log(` ❌ Failed: ${result.summary.failed}`);
349
378
  console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
350
379
  console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
351
- console.log("\n📋 Individual Results:");
380
+ const hasScores = result.results.some((r) => r.result.score !== undefined);
381
+ console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
352
382
  for (const spec of result.results) {
353
383
  const status = spec.result.status === "passed"
354
384
  ? "✅"
package/dist/index.d.ts CHANGED
@@ -10,7 +10,7 @@ export { AIEvalClient } from "./client";
10
10
  import { AuthenticationError, EvalGateError, NetworkError, RateLimitError, SDKError } from "./errors";
11
11
  export { EvalGateError, RateLimitError, AuthenticationError, SDKError as ValidationError, // Using SDKError as ValidationError for backward compatibility
12
12
  NetworkError, };
13
- export { containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, expect, followsInstructions, hasFactualAccuracy, hasLength, hasNoHallucinations, hasNoToxicity, hasReadabilityScore, hasSentiment, hasValidCodeSyntax, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinTime, similarTo, withinRange, } from "./assertions";
13
+ export { containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, expect, followsInstructions, hasFactualAccuracy, hasLength, hasNoHallucinations, hasNoToxicity, hasPII, hasReadabilityScore, hasSentiment, hasValidCodeSyntax, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinTime, similarTo, withinRange, } from "./assertions";
14
14
  import { createContext, EvalContext, getCurrentContext, withContext } from "./context";
15
15
  export { createContext, getCurrentContext as getContext, withContext, EvalContext as ContextManager, };
16
16
  export { cloneContext, mergeContexts, validateContext, } from "./runtime/context";
package/dist/index.js CHANGED
@@ -8,8 +8,8 @@
8
8
  * @packageDocumentation
9
9
  */
10
10
  Object.defineProperty(exports, "__esModule", { value: true });
11
- exports.createTestSuite = exports.SpecRegistrationError = exports.SpecExecutionError = exports.RuntimeError = exports.EvalRuntimeError = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.defaultLocalExecutor = exports.createLocalExecutor = exports.evalai = exports.defineSuite = exports.defineEval = exports.createResult = exports.createEvalContext = exports.validateContext = exports.mergeContexts = exports.cloneContext = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalGateError = exports.AIEvalClient = void 0;
12
- exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = void 0;
11
+ exports.SpecRegistrationError = exports.SpecExecutionError = exports.RuntimeError = exports.EvalRuntimeError = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.defaultLocalExecutor = exports.createLocalExecutor = exports.evalai = exports.defineSuite = exports.defineEval = exports.createResult = exports.createEvalContext = exports.validateContext = exports.mergeContexts = exports.cloneContext = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasPII = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalGateError = exports.AIEvalClient = void 0;
12
+ exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = void 0;
13
13
  // Main SDK exports
14
14
  var client_1 = require("./client");
15
15
  Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
@@ -32,6 +32,7 @@ Object.defineProperty(exports, "hasFactualAccuracy", { enumerable: true, get: fu
32
32
  Object.defineProperty(exports, "hasLength", { enumerable: true, get: function () { return assertions_1.hasLength; } });
33
33
  Object.defineProperty(exports, "hasNoHallucinations", { enumerable: true, get: function () { return assertions_1.hasNoHallucinations; } });
34
34
  Object.defineProperty(exports, "hasNoToxicity", { enumerable: true, get: function () { return assertions_1.hasNoToxicity; } });
35
+ Object.defineProperty(exports, "hasPII", { enumerable: true, get: function () { return assertions_1.hasPII; } });
35
36
  Object.defineProperty(exports, "hasReadabilityScore", { enumerable: true, get: function () { return assertions_1.hasReadabilityScore; } });
36
37
  Object.defineProperty(exports, "hasSentiment", { enumerable: true, get: function () { return assertions_1.hasSentiment; } });
37
38
  Object.defineProperty(exports, "hasValidCodeSyntax", { enumerable: true, get: function () { return assertions_1.hasValidCodeSyntax; } });
@@ -18,10 +18,19 @@ export declare const evalai: {
18
18
  test: DefineEvalFunction;
19
19
  };
20
20
  /**
21
- * Suite definition for grouping related specifications
22
- * This will be expanded in Layer 3 for dependency graph support
21
+ * Suite definition for grouping related specifications.
22
+ * Accepts both a positional form and an object form:
23
+ *
24
+ * @example Positional form:
25
+ * defineSuite('My Suite', [() => defineEval('spec 1', executor), ...])
26
+ *
27
+ * @example Object form:
28
+ * defineSuite({ name: 'My Suite', specs: [() => defineEval('spec 1', executor), ...] })
23
29
  */
24
- export declare function defineSuite(_name: string, specs: (() => void)[]): void;
30
+ export declare function defineSuite(nameOrConfig: string | {
31
+ name: string;
32
+ specs: (() => void)[];
33
+ }, specsArg?: (() => void)[]): void;
25
34
  /**
26
35
  * Helper function to create specification contexts
27
36
  * Useful for testing and manual execution
@@ -204,13 +204,22 @@ exports.evalai = {
204
204
  test: exports.defineEval,
205
205
  };
206
206
  /**
207
- * Suite definition for grouping related specifications
208
- * This will be expanded in Layer 3 for dependency graph support
207
+ * Suite definition for grouping related specifications.
208
+ * Accepts both a positional form and an object form:
209
+ *
210
+ * @example Positional form:
211
+ * defineSuite('My Suite', [() => defineEval('spec 1', executor), ...])
212
+ *
213
+ * @example Object form:
214
+ * defineSuite({ name: 'My Suite', specs: [() => defineEval('spec 1', executor), ...] })
209
215
  */
210
- function defineSuite(_name, specs) {
211
- // For now, just execute the specs to register them
212
- // In Layer 3, this will build the dependency graph
213
- for (const specFn of specs) {
216
+ function defineSuite(nameOrConfig, specsArg) {
217
+ const specFns = typeof nameOrConfig === "string"
218
+ ? (specsArg ?? [])
219
+ : (nameOrConfig.specs ?? []);
220
+ // Execute each spec function to register its defineEval calls
221
+ // In Layer 3, this will also build the dependency graph
222
+ for (const specFn of specFns) {
214
223
  specFn();
215
224
  }
216
225
  }
@@ -9,7 +9,7 @@
9
9
  * import { snapshot, loadSnapshot } from '@ai-eval-platform/sdk';
10
10
  *
11
11
  * const output = await generateText('Write a haiku about coding');
12
- * await snapshot(output, 'haiku-test');
12
+ * await snapshot('haiku-test', output);
13
13
  *
14
14
  * // Later, compare with snapshot
15
15
  * const saved = await loadSnapshot('haiku-test');
@@ -135,10 +135,10 @@ export declare class SnapshotManager {
135
135
  * @example
136
136
  * ```typescript
137
137
  * const output = await generateText('Write a haiku');
138
- * await snapshot(output, 'haiku-test');
138
+ * await snapshot('haiku-test', output);
139
139
  * ```
140
140
  */
141
- export declare function snapshot(output: string, name: string, options?: {
141
+ export declare function snapshot(name: string, output: string, options?: {
142
142
  tags?: string[];
143
143
  metadata?: Record<string, unknown>;
144
144
  overwrite?: boolean;
package/dist/snapshot.js CHANGED
@@ -10,7 +10,7 @@
10
10
  * import { snapshot, loadSnapshot } from '@ai-eval-platform/sdk';
11
11
  *
12
12
  * const output = await generateText('Write a haiku about coding');
13
- * await snapshot(output, 'haiku-test');
13
+ * await snapshot('haiku-test', output);
14
14
  *
15
15
  * // Later, compare with snapshot
16
16
  * const saved = await loadSnapshot('haiku-test');
@@ -271,10 +271,10 @@ function getSnapshotManager(dir) {
271
271
  * @example
272
272
  * ```typescript
273
273
  * const output = await generateText('Write a haiku');
274
- * await snapshot(output, 'haiku-test');
274
+ * await snapshot('haiku-test', output);
275
275
  * ```
276
276
  */
277
- async function snapshot(output, name, options) {
277
+ async function snapshot(name, output, options) {
278
278
  const manager = getSnapshotManager(options?.dir);
279
279
  return manager.save(name, output, options);
280
280
  }
package/dist/version.d.ts CHANGED
@@ -3,5 +3,5 @@
3
3
  * X-EvalGate-SDK-Version: SDK package version
4
4
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
5
5
  */
6
- export declare const SDK_VERSION = "2.1.0";
7
- export declare const SPEC_VERSION = "2.1.0";
6
+ export declare const SDK_VERSION = "2.2.0";
7
+ export declare const SPEC_VERSION = "2.2.0";
package/dist/version.js CHANGED
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
6
6
  * X-EvalGate-SDK-Version: SDK package version
7
7
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
8
8
  */
9
- exports.SDK_VERSION = "2.1.0";
10
- exports.SPEC_VERSION = "2.1.0";
9
+ exports.SDK_VERSION = "2.2.0";
10
+ exports.SPEC_VERSION = "2.2.0";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@evalgate/sdk",
3
- "version": "2.1.2",
3
+ "version": "2.2.0",
4
4
  "publishConfig": {
5
5
  "access": "public",
6
6
  "registry": "https://registry.npmjs.org/"