@evalgate/sdk 2.1.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,43 @@ All notable changes to the @evalgate/sdk package will be documented in this file
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.2.0] - 2026-03-03
9
+
10
+ ### Breaking
11
+
12
+ - **`snapshot(output, name)` → `snapshot(name, output)`** — parameter order swapped to match natural call convention (`name` first, value second, same as `test('name', fn)`). Update any existing `snapshot(output, 'label')` calls to `snapshot('label', output)`.
13
+
14
+ ### Added
15
+
16
+ - **`expect().not` modifier** — `expect('drop table').not.toContain('drop table')` now works; negates `passed` on any chained assertion via Proxy
17
+ - **`hasPII(text)`** — semantic inverse of `notContainsPII`; returns `true` when PII is detected (email, phone, SSN, IP). Exported from main package. Eliminates double-negative confusion.
18
+ - **`defineSuite` object form** — now accepts both `defineSuite(name, [...fns])` and `defineSuite({ name, specs: [...fns] })`. README updated with examples.
19
+
20
+ ### Fixed
21
+
22
+ - **`specId` collision** — all specs in `eval/` directory shared the same 8-char ID (`ZXZhbC9j`). Root cause: short base64 prefix was identical for any path starting with `eval/c`. Fixed: SHA-256 hex (16 chars) in `discover.ts`.
23
+ - **`explain` UNKNOWN verdict** — `evalgate explain` showed `Verdict: UNKNOWN` when reading `.evalgate/last-run.json`. Added `RunResult` format detection (`results[]` + `summary`). Added `.evalgate/last-run.json` and `.evalgate/runs/latest.json` to auto-search paths. Passing runs now show clean `✅ PASS` with no spurious "Run doctor" suggestions.
24
+ - **`print-config` baseUrl default** — was `http://localhost:3000`; now `https://api.evalgate.com` to match `evalgate doctor`.
25
+ - **`baseline update` self-contained** — no longer requires a custom `eval:baseline-update` npm script. Falls back to built-in mode (runs the detected package manager's `test` script, stamps baseline) if no script is present.
26
+ - **`notContainsPII` phone regex** — broadened to cover `555-123-4567`, `555.123.4567`, and `555 123 4567` formats. JSDoc clarified: `false` = PII found (unsafe), `true` = no PII (safe).
27
+ - **`impact-analysis` git error** — replaced raw `git diff --help` wall-of-text with clean targeted messages: `Not a git repository`, `Base branch 'X' not found. Fetch it first`, or generic exit-code message.
28
+ - **README quickstart** — both `defineEval` examples now include an `executor` function. Running the quickstart no longer throws `Executor must be a function`.
29
+ - **`snapshot` module docstring** — updated `@example` to reflect new `(name, output)` parameter order.
30
+
31
+ ---
32
+
33
+ ## [2.1.3] - 2026-03-02
34
+
35
+ ### Fixed
36
+
37
+ - **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
38
+ - **High:** First-run gate false regression on fresh init when no test script exists
39
+ - **High:** Doctor defaults baseUrl to localhost:3000 instead of production API
40
+ - **Critical:** Simulated executeSpec replaced with real spec execution
41
+ - **High:** Run scores now include scoring model context for clarity
42
+ - **Low:** Explain no longer shows "unnamed" for builtin gate failures
43
+ - **Docs:** Added missing `discover --manifest` step to local quickstart
44
+
8
45
  ## [2.1.2] - 2026-03-02
9
46
 
10
47
  ### Fixed
package/README.md CHANGED
@@ -40,13 +40,20 @@ Create `eval/your-spec.spec.ts`:
40
40
  ```typescript
41
41
  import { defineEval } from "@evalgate/sdk";
42
42
 
43
+ defineEval("Basic Math Operations", async () => {
44
+ const result = 1 + 1;
45
+ return { pass: result === 2, score: result === 2 ? 100 : 0 };
46
+ });
47
+
48
+ // Object form (with metadata):
43
49
  defineEval({
44
- name: "Basic Math Operations",
45
- description: "Test fundamental arithmetic",
46
- prompt: "Test: 1+1=2, string concatenation, array includes",
47
- expected: "All tests should pass",
50
+ name: "String concatenation",
51
+ description: "Test string operations",
48
52
  tags: ["basic", "math"],
49
- category: "unit-test"
53
+ executor: async () => {
54
+ const result = "hello" + " world";
55
+ return { pass: result === "hello world", score: 100 };
56
+ },
50
57
  });
51
58
  ```
52
59
 
@@ -254,6 +261,46 @@ All commands automatically write artifacts so `explain` works with zero flags.
254
261
  npm install @evalgate/sdk openai
255
262
  ```
256
263
 
264
+ Create `eval/your-spec.spec.ts`:
265
+
266
+ ```typescript
267
+ import { defineEval, defineSuite } from "@evalgate/sdk";
268
+
269
+ defineEval("Basic Math Operations", async () => {
270
+ const result = 1 + 1;
271
+ return { pass: result === 2, score: result === 2 ? 100 : 0 };
272
+ });
273
+
274
+ // Object form (with metadata):
275
+ defineEval({
276
+ name: "String concatenation",
277
+ description: "Test string operations",
278
+ tags: ["basic", "math"],
279
+ executor: async () => {
280
+ const result = "hello" + " world";
281
+ return { pass: result === "hello world", score: 100 };
282
+ },
283
+ });
284
+
285
+ // Suite form — group related specs:
286
+ defineSuite("Math suite", [
287
+ () => defineEval("addition", async () => ({ pass: 1 + 1 === 2, score: 100 })),
288
+ () => defineEval("subtraction", async () => ({ pass: 5 - 3 === 2, score: 100 })),
289
+ ]);
290
+ ```
291
+
292
+ ```bash
293
+ # Discover specs and generate manifest
294
+ npx @evalgate/sdk discover
295
+ npx @evalgate/sdk discover --manifest
296
+
297
+ # Run evaluations
298
+ npx @evalgate/sdk run --write-results
299
+
300
+ # Run local regression gate
301
+ npx @evalgate/sdk gate
302
+ ```
303
+
257
304
  ```typescript
258
305
  import { openAIChatEval } from "@evalgate/sdk";
259
306
 
@@ -32,6 +32,11 @@ export declare class AssertionError extends Error {
32
32
  export declare class Expectation {
33
33
  private value;
34
34
  constructor(value: unknown);
35
+ /**
36
+ * Negate the next assertion — inverts `passed` on any chained method.
37
+ * @example expect('drop table').not.toContain('drop table')
38
+ */
39
+ get not(): Expectation;
35
40
  /**
36
41
  * Assert value equals expected
37
42
  * @example expect(output).toEqual("Hello")
@@ -171,7 +176,23 @@ export declare function hasLength(text: string, range: {
171
176
  max?: number;
172
177
  }): boolean;
173
178
  export declare function containsJSON(text: string): boolean;
179
+ /**
180
+ * Returns `true` when the text is PII-free (safe to use), `false` when PII is detected.
181
+ *
182
+ * @example
183
+ * if (!notContainsPII(response)) throw new Error("PII leak detected");
184
+ * // Or use the clearer alias:
185
+ * if (hasPII(response)) throw new Error("PII leak detected");
186
+ */
174
187
  export declare function notContainsPII(text: string): boolean;
188
+ /**
189
+ * Returns `true` when PII is detected in the text (unsafe), `false` when safe.
190
+ * This is the semantic inverse of `notContainsPII` and may be easier to reason about.
191
+ *
192
+ * @example
193
+ * if (hasPII(response)) throw new Error("PII leak");
194
+ */
195
+ export declare function hasPII(text: string): boolean;
175
196
  export declare function hasSentiment(text: string, expected: "positive" | "negative" | "neutral"): boolean;
176
197
  export declare function similarTo(text1: string, text2: string, threshold?: number): boolean;
177
198
  export declare function withinRange(value: number, min: number, max: number): boolean;
@@ -24,6 +24,7 @@ exports.matchesPattern = matchesPattern;
24
24
  exports.hasLength = hasLength;
25
25
  exports.containsJSON = containsJSON;
26
26
  exports.notContainsPII = notContainsPII;
27
+ exports.hasPII = hasPII;
27
28
  exports.hasSentiment = hasSentiment;
28
29
  exports.similarTo = similarTo;
29
30
  exports.withinRange = withinRange;
@@ -56,6 +57,28 @@ class Expectation {
56
57
  constructor(value) {
57
58
  this.value = value;
58
59
  }
60
+ /**
61
+ * Negate the next assertion — inverts `passed` on any chained method.
62
+ * @example expect('drop table').not.toContain('drop table')
63
+ */
64
+ get not() {
65
+ const value = this.value;
66
+ return new Proxy(new Expectation(value), {
67
+ get(target, prop) {
68
+ const orig = target[prop];
69
+ if (typeof orig === "function" && prop !== "constructor") {
70
+ return (...args) => {
71
+ const result = orig.call(target, ...args);
72
+ if (result && typeof result === "object" && "passed" in result) {
73
+ return { ...result, passed: !result.passed };
74
+ }
75
+ return result;
76
+ };
77
+ }
78
+ return orig;
79
+ },
80
+ });
81
+ }
59
82
  /**
60
83
  * Assert value equals expected
61
84
  * @example expect(output).toEqual("Hello")
@@ -539,17 +562,35 @@ function containsJSON(text) {
539
562
  return false;
540
563
  }
541
564
  }
565
+ /**
566
+ * Returns `true` when the text is PII-free (safe to use), `false` when PII is detected.
567
+ *
568
+ * @example
569
+ * if (!notContainsPII(response)) throw new Error("PII leak detected");
570
+ * // Or use the clearer alias:
571
+ * if (hasPII(response)) throw new Error("PII leak detected");
572
+ */
542
573
  function notContainsPII(text) {
543
574
  // Simple PII detection patterns
544
575
  const piiPatterns = [
545
576
  /\b\d{3}-\d{2}-\d{4}\b/, // SSN
546
577
  /\b\d{3}\.\d{3}\.\d{4}\b/, // SSN with dots
547
- /\b\d{10}\b/, // Phone number
548
- /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/, // Email
578
+ /\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b/, // Phone (various formats)
579
+ /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/, // Email
549
580
  /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/, // IP address
550
581
  ];
551
582
  return !piiPatterns.some((pattern) => pattern.test(text));
552
583
  }
584
+ /**
585
+ * Returns `true` when PII is detected in the text (unsafe), `false` when safe.
586
+ * This is the semantic inverse of `notContainsPII` and may be easier to reason about.
587
+ *
588
+ * @example
589
+ * if (hasPII(response)) throw new Error("PII leak");
590
+ */
591
+ function hasPII(text) {
592
+ return !notContainsPII(text);
593
+ }
553
594
  function hasSentiment(text, expected) {
554
595
  // This is a simplified implementation
555
596
  const positiveWords = ["good", "great", "excellent", "awesome"];
@@ -126,7 +126,6 @@ function runBaselineInit(cwd) {
126
126
  }
127
127
  // ── baseline update ──
128
128
  function runBaselineUpdate(cwd) {
129
- // Check if eval:baseline-update script exists in package.json
130
129
  const pkgPath = path.join(cwd, "package.json");
131
130
  if (!fs.existsSync(pkgPath)) {
132
131
  console.error("❌ No package.json found. Run this from your project root.");
@@ -140,13 +139,39 @@ function runBaselineUpdate(cwd) {
140
139
  console.error("❌ Failed to parse package.json");
141
140
  return 1;
142
141
  }
143
- if (!pkg.scripts?.["eval:baseline-update"]) {
144
- console.error("❌ Missing 'eval:baseline-update' script in package.json.");
145
- console.error(' Add it: "eval:baseline-update": "npx tsx scripts/regression-gate.ts --update-baseline"');
142
+ // Use custom script if available
143
+ if (pkg.scripts?.["eval:baseline-update"]) {
144
+ console.log("📊 Running baseline update (custom script)...\n");
145
+ return runScript(cwd, "eval:baseline-update");
146
+ }
147
+ // Self-contained built-in mode: run the test suite then stamp the baseline
148
+ console.log("📊 Running baseline update (built-in mode)...\n");
149
+ const pm = detectPackageManager(cwd);
150
+ const isWin = process.platform === "win32";
151
+ const testResult = (0, node_child_process_1.spawnSync)(pm, ["test"], {
152
+ cwd,
153
+ stdio: "inherit",
154
+ shell: isWin,
155
+ });
156
+ const baselinePath = path.join(cwd, BASELINE_REL);
157
+ if (!fs.existsSync(baselinePath)) {
158
+ console.error("❌ No baseline found. Run 'evalgate baseline init' first.");
159
+ return 1;
160
+ }
161
+ try {
162
+ const baseline = JSON.parse(fs.readFileSync(baselinePath, "utf-8"));
163
+ baseline.updatedAt = new Date().toISOString();
164
+ baseline.updatedBy = process.env.USER || process.env.USERNAME || "unknown";
165
+ baseline.confidenceTests = baseline.confidenceTests ?? {};
166
+ baseline.confidenceTests.unitPassed = testResult.status === 0;
167
+ fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}\n`);
168
+ console.log("\n✅ Baseline updated successfully");
169
+ }
170
+ catch {
171
+ console.error("❌ Failed to update baseline file");
146
172
  return 1;
147
173
  }
148
- console.log("📊 Running baseline update...\n");
149
- return runScript(cwd, "eval:baseline-update");
174
+ return testResult.status ?? 1;
150
175
  }
151
176
  // ── baseline router ──
152
177
  function runBaseline(argv) {
@@ -59,6 +59,7 @@ Object.defineProperty(exports, "__esModule", { value: true });
59
59
  exports.discoverSpecs = discoverSpecs;
60
60
  exports.printDiscoveryResults = printDiscoveryResults;
61
61
  exports.runDiscover = runDiscover;
62
+ const crypto = __importStar(require("node:crypto"));
62
63
  const fs = __importStar(require("node:fs/promises"));
63
64
  const path = __importStar(require("node:path"));
64
65
  const execution_mode_1 = require("../runtime/execution-mode");
@@ -145,8 +146,8 @@ async function analyzeSpecifications(specFiles) {
145
146
  for (const filePath of specFiles) {
146
147
  try {
147
148
  const content = await fs.readFile(filePath, "utf-8");
148
- const analysis = analyzeSpecFile(filePath, content);
149
- specs.push(analysis);
149
+ const fileSpecs = analyzeSpecFile(filePath, content);
150
+ specs.push(...fileSpecs);
150
151
  }
151
152
  catch (error) {
152
153
  console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
@@ -155,20 +156,40 @@ async function analyzeSpecifications(specFiles) {
155
156
  return specs;
156
157
  }
157
158
  /**
158
- * Analyze a single specification file
159
+ * Extract all spec names from file content (handles both call forms)
160
+ */
161
+ function extractSpecNames(content) {
162
+ const names = [];
163
+ // Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
164
+ const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
165
+ let m = stringArgPattern.exec(content);
166
+ while (m !== null) {
167
+ names.push(m[1]);
168
+ m = stringArgPattern.exec(content);
169
+ }
170
+ if (names.length > 0)
171
+ return names;
172
+ // Form 2: defineEval({ name: "..." }) — object-first form
173
+ const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
174
+ m = objNamePattern.exec(content);
175
+ while (m !== null) {
176
+ names.push(m[1]);
177
+ m = objNamePattern.exec(content);
178
+ }
179
+ return names;
180
+ }
181
+ /**
182
+ * Analyze a single specification file — returns one SpecAnalysis per defineEval call
159
183
  */
160
184
  function analyzeSpecFile(filePath, content) {
161
- // Extract defineEval calls
162
- const defineEvalMatches = content.match(/defineEval\s*\([^)]+\)/g) || [];
163
- const specNames = defineEvalMatches.map((match) => {
164
- const nameMatch = match.match(/["'`](.+?)["'`](?:\s*,|\s*\))/);
165
- return nameMatch ? nameMatch[1] : "unnamed";
166
- });
167
- // Extract tags
185
+ const specNames = extractSpecNames(content);
186
+ // Fallback: file matched as a spec file but we couldn't parse names
187
+ if (specNames.length === 0) {
188
+ specNames.push(path.basename(filePath, path.extname(filePath)));
189
+ }
190
+ // Shared analysis for the file
168
191
  const tags = extractTags(content);
169
- // Analyze complexity
170
192
  const complexity = analyzeComplexity(content);
171
- // Check for models and tools
172
193
  const usesModels = content.includes("model:") ||
173
194
  content.includes("model=") ||
174
195
  content.includes("openai") ||
@@ -176,22 +197,20 @@ function analyzeSpecFile(filePath, content) {
176
197
  const usesTools = content.includes("tool:") ||
177
198
  content.includes("function.") ||
178
199
  content.includes("call(");
179
- // Check for assertions
180
200
  const hasAssertions = content.includes("assert") ||
181
201
  content.includes("expect") ||
182
202
  content.includes("should");
183
- // Generate ID from file path
184
- const id = generateSpecId(filePath);
185
- return {
186
- id,
187
- name: specNames[0] || path.basename(filePath, ".ts"),
188
- file: path.relative(process.cwd(), filePath),
203
+ const relFile = path.relative(process.cwd(), filePath);
204
+ return specNames.map((name, idx) => ({
205
+ id: generateSpecId(filePath, name, idx),
206
+ name,
207
+ file: relFile,
189
208
  tags,
190
209
  hasAssertions,
191
210
  usesModels,
192
211
  usesTools,
193
212
  complexity,
194
- };
213
+ }));
195
214
  }
196
215
  /**
197
216
  * Extract tags from specification content
@@ -263,15 +282,14 @@ function analyzeComplexity(content) {
263
282
  return "complex";
264
283
  }
265
284
  /**
266
- * Generate specification ID from file path
285
+ * Generate specification ID from file path + name + index (unique per defineEval call)
267
286
  */
268
- function generateSpecId(filePath) {
269
- const relativePath = path.relative(process.cwd(), filePath);
270
- const hash = Buffer.from(relativePath)
271
- .toString("base64")
272
- .replace(/[+/=]/g, "")
273
- .slice(0, 8);
274
- return hash;
287
+ function generateSpecId(filePath, name, index) {
288
+ const relativePath = path
289
+ .relative(process.cwd(), filePath)
290
+ .replace(/\\/g, "/");
291
+ const key = `${relativePath}|${name}|${index}`;
292
+ return crypto.createHash("sha256").update(key).digest("hex").slice(0, 16);
275
293
  }
276
294
  /**
277
295
  * Calculate discovery statistics
@@ -96,7 +96,7 @@ function parseFlags(argv) {
96
96
  const baseUrl = raw.baseUrl ||
97
97
  process.env.EVALGATE_BASE_URL ||
98
98
  process.env.EVALAI_BASE_URL ||
99
- "http://localhost:3000";
99
+ "https://api.evalgate.com";
100
100
  const apiKey = raw.apiKey ||
101
101
  process.env.EVALGATE_API_KEY ||
102
102
  process.env.EVALAI_API_KEY ||
@@ -84,6 +84,8 @@ const REPORT_SEARCH_PATHS = [
84
84
  "evals/regression-report.json",
85
85
  ".evalgate/last-report.json",
86
86
  ".evalgate/last_report.json",
87
+ ".evalgate/last-run.json",
88
+ ".evalgate/runs/latest.json",
87
89
  ];
88
90
  function findReport(cwd, explicitPath) {
89
91
  if (explicitPath) {
@@ -354,13 +356,78 @@ function suggestFixes(causes) {
354
356
  }
355
357
  // ── Build explain output ──
356
358
  function buildExplainOutput(report, reportPath) {
357
- // Support both CheckReport (from evalgate check) and BuiltinReport (from evalgate gate)
359
+ // Support RunResult (from evalgate run) has schemaVersion + results[] + summary
360
+ const isRunResult = "results" in report &&
361
+ Array.isArray(report.results) &&
362
+ "summary" in report &&
363
+ report.summary !== null &&
364
+ typeof report.summary === "object";
365
+ if (isRunResult) {
366
+ return buildFromRunResult(report, reportPath);
367
+ }
368
+ // Support BuiltinReport (from evalgate gate)
358
369
  const isBuiltinReport = "category" in report && "deltas" in report;
359
370
  if (isBuiltinReport) {
360
371
  return buildFromBuiltinReport(report, reportPath);
361
372
  }
362
373
  return buildFromCheckReport(report, reportPath);
363
374
  }
375
+ function buildFromRunResult(report, reportPath) {
376
+ const summary = report.summary;
377
+ const results = report.results ?? [];
378
+ const passed = summary.failed === 0;
379
+ // Top failures
380
+ const failures = results.filter((r) => r.result.status === "failed");
381
+ const topFailures = failures.slice(0, 3).map((r, i) => ({
382
+ rank: i + 1,
383
+ name: r.name,
384
+ filePath: r.filePath,
385
+ reason: r.result.error,
386
+ }));
387
+ // Changes: pass rate
388
+ const changes = [
389
+ {
390
+ metric: "Pass rate",
391
+ baseline: "—",
392
+ current: `${Math.round(summary.passRate * 100)}%`,
393
+ direction: passed ? "same" : "worse",
394
+ },
395
+ ];
396
+ // For passing runs, emit nothing so no misleading "Run doctor" suggestions appear
397
+ if (passed) {
398
+ return {
399
+ verdict: "pass",
400
+ reasonMessage: `All ${summary.passed} spec${summary.passed === 1 ? "" : "s"} passed`,
401
+ topFailures: [],
402
+ totalFailures: 0,
403
+ changes,
404
+ rootCauses: [],
405
+ suggestedFixes: [],
406
+ reportPath,
407
+ };
408
+ }
409
+ // Classify root cause by inspecting error messages
410
+ const errorText = failures
411
+ .map((r) => (r.result.error ?? "").toLowerCase())
412
+ .join(" ");
413
+ const rootCauses = [];
414
+ if (errorText.includes("pii") || errorText.includes("safety"))
415
+ rootCauses.push("safety_regression");
416
+ if (errorText.includes("tool") || errorText.includes("function_call"))
417
+ rootCauses.push("tool_use_drift");
418
+ if (rootCauses.length === 0)
419
+ rootCauses.push("prompt_drift");
420
+ return {
421
+ verdict: "fail",
422
+ reasonMessage: `${summary.failed} of ${results.length} spec${results.length === 1 ? "" : "s"} failed`,
423
+ topFailures,
424
+ totalFailures: failures.length,
425
+ changes,
426
+ rootCauses,
427
+ suggestedFixes: suggestFixes(rootCauses),
428
+ reportPath,
429
+ };
430
+ }
364
431
  function buildFromCheckReport(report, reportPath) {
365
432
  const failedCases = report.failedCases ?? [];
366
433
  // Top failures (up to 3)
@@ -430,6 +497,7 @@ function buildFromBuiltinReport(report, reportPath) {
430
497
  }));
431
498
  const topFailures = failures.slice(0, 3).map((f, i) => ({
432
499
  rank: i + 1,
500
+ name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
433
501
  reason: f,
434
502
  }));
435
503
  // Simple root cause for builtin reports
@@ -109,7 +109,18 @@ async function getChangedFiles(baseBranch) {
109
109
  });
110
110
  git.on("close", (code) => {
111
111
  if (code !== 0) {
112
- reject(new Error(`Git diff failed: ${error}`));
112
+ const lowerError = error.toLowerCase();
113
+ if (lowerError.includes("not a git repository") ||
114
+ lowerError.includes("fatal: not a git")) {
115
+ reject(new Error("Not a git repository. Run 'git init' or run evalgate from inside a git repo."));
116
+ }
117
+ else if (lowerError.includes("unknown revision") ||
118
+ lowerError.includes("bad revision")) {
119
+ reject(new Error(`Base branch '${baseBranch}' not found. Fetch it first: git fetch origin ${baseBranch}`));
120
+ }
121
+ else {
122
+ reject(new Error(`Git diff failed (exit ${code}). Ensure git is installed and '${baseBranch}' exists.`));
123
+ }
113
124
  return;
114
125
  }
115
126
  const files = output
@@ -138,7 +138,7 @@ function buildResolvedConfig(cwd, flags) {
138
138
  value: flags.baseUrl ||
139
139
  envBaseUrl ||
140
140
  fileConfig?.baseUrl ||
141
- "http://localhost:3000",
141
+ "https://api.evalgate.com",
142
142
  source: baseUrlSource,
143
143
  });
144
144
  // apiKey (always redacted)
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
94
94
  }
95
95
  return "unknown";
96
96
  }
97
+ function hasTestScript(cwd) {
98
+ try {
99
+ const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
100
+ const script = pkg.scripts?.test ?? "";
101
+ return !!script && script !== 'echo "Error: no test specified" && exit 1';
102
+ }
103
+ catch {
104
+ return false;
105
+ }
106
+ }
97
107
  function runBuiltinGate(cwd) {
98
108
  const t0 = Date.now();
99
109
  const baselinePath = path.join(cwd, BASELINE_REL);
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
101
111
  const pm = detectPackageManager(cwd);
102
112
  const command = `${pm} test`;
103
113
  const runner = detectRunner(cwd);
114
+ const projectHasTestScript = hasTestScript(cwd);
104
115
  // Load baseline
105
116
  if (!fs.existsSync(baselinePath)) {
106
117
  return {
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
165
176
  const baselineTotal = baselineData.confidenceTests?.total ?? 0;
166
177
  const failures = [];
167
178
  const deltas = [];
168
- // Delta: tests passing
169
- deltas.push({
170
- metric: "tests_passing",
171
- baseline: baselinePassed,
172
- current: testsPassed,
173
- delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
174
- status: testsPassed ? "pass" : "fail",
175
- });
176
- if (!testsPassed && baselinePassed) {
177
- failures.push("Tests were passing in baseline but are now failing");
179
+ // Delta: tests passing — only meaningful when a test script exists
180
+ if (projectHasTestScript) {
181
+ deltas.push({
182
+ metric: "tests_passing",
183
+ baseline: baselinePassed,
184
+ current: testsPassed,
185
+ delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
186
+ status: testsPassed ? "pass" : "fail",
187
+ });
188
+ if (!testsPassed && baselinePassed) {
189
+ failures.push("Tests were passing in baseline but are now failing");
190
+ }
178
191
  }
179
192
  // Delta: test count (only if we captured counts)
180
193
  if (testCount > 0 || baselineTotal > 0) {
package/dist/cli/run.js CHANGED
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
52
52
  const node_child_process_1 = require("node:child_process");
53
53
  const fs = __importStar(require("node:fs/promises"));
54
54
  const path = __importStar(require("node:path"));
55
+ const registry_1 = require("../runtime/registry");
55
56
  const impact_analysis_1 = require("./impact-analysis");
56
57
  /**
57
58
  * Generate deterministic run ID
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
138
139
  }
139
140
  }
140
141
  /**
141
- * Execute specifications
142
+ * Execute specifications — grouped by file to avoid redundant loads
142
143
  */
143
144
  async function executeSpecs(specs) {
144
- const results = [];
145
+ // Group specs by their absolute file path
146
+ const specsByFile = new Map();
145
147
  for (const spec of specs) {
146
- const result = await executeSpec(spec);
147
- results.push(result);
148
+ const abs = path.isAbsolute(spec.filePath)
149
+ ? spec.filePath
150
+ : path.join(process.cwd(), spec.filePath);
151
+ const group = specsByFile.get(abs) ?? [];
152
+ group.push(spec);
153
+ specsByFile.set(abs, group);
148
154
  }
149
- return results;
150
- }
151
- /**
152
- * Execute individual specification
153
- */
154
- async function executeSpec(spec) {
155
- const startTime = Date.now();
156
- try {
157
- // For now, simulate execution
158
- // In a real implementation, this would:
159
- // 1. Load the spec file
160
- // 2. Execute the defineEval function
161
- // 3. Capture the result
162
- // Simulate some work
163
- await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
164
- // Simulate success/failure (90% success rate for demo)
165
- const success = Math.random() > 0.1;
166
- const duration = Date.now() - startTime;
167
- if (success) {
168
- return {
169
- specId: spec.id,
170
- name: spec.name,
171
- filePath: spec.filePath,
172
- result: {
173
- status: "passed",
174
- score: Math.random() * 0.3 + 0.7, // 0.7-1.0
175
- duration,
176
- },
177
- };
155
+ const results = [];
156
+ for (const [absPath, fileSpecs] of specsByFile) {
157
+ // Fresh runtime per file to avoid cross-file contamination
158
+ (0, registry_1.disposeActiveRuntime)();
159
+ try {
160
+ // Bust require cache so the file re-executes its defineEval calls
161
+ delete require.cache[require.resolve(absPath)];
178
162
  }
179
- else {
180
- return {
181
- specId: spec.id,
182
- name: spec.name,
183
- filePath: spec.filePath,
184
- result: {
185
- status: "failed",
186
- error: "Simulated execution failure",
187
- duration,
188
- },
189
- };
163
+ catch {
164
+ // Not in cache yet — fine
165
+ }
166
+ try {
167
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
168
+ require(absPath);
169
+ }
170
+ catch (loadError) {
171
+ const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
172
+ const msg = isTs &&
173
+ loadError instanceof Error &&
174
+ (loadError.message.includes("Unknown file extension") ||
175
+ loadError.message.includes("SyntaxError"))
176
+ ? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
177
+ : loadError instanceof Error
178
+ ? loadError.message
179
+ : String(loadError);
180
+ for (const spec of fileSpecs) {
181
+ results.push(makeErrorResult(spec, msg, 0));
182
+ }
183
+ continue;
184
+ }
185
+ const runtime = (0, registry_1.getActiveRuntime)();
186
+ const registered = runtime.list();
187
+ for (const spec of fileSpecs) {
188
+ const registeredSpec = registered.find((r) => r.name === spec.name);
189
+ if (!registeredSpec) {
190
+ results.push({
191
+ specId: spec.id,
192
+ name: spec.name,
193
+ filePath: spec.filePath,
194
+ result: {
195
+ status: "skipped",
196
+ error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
197
+ duration: 0,
198
+ },
199
+ });
200
+ continue;
201
+ }
202
+ const startTime = Date.now();
203
+ try {
204
+ const evalResult = await registeredSpec.executor({ input: "" });
205
+ results.push({
206
+ specId: spec.id,
207
+ name: spec.name,
208
+ filePath: spec.filePath,
209
+ result: {
210
+ status: evalResult.pass ? "passed" : "failed",
211
+ score: typeof evalResult.score === "number"
212
+ ? evalResult.score / 100
213
+ : undefined,
214
+ error: evalResult.error,
215
+ duration: Date.now() - startTime,
216
+ },
217
+ });
218
+ }
219
+ catch (execError) {
220
+ results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
221
+ }
190
222
  }
191
223
  }
192
- catch (error) {
193
- return {
194
- specId: spec.id,
195
- name: spec.name,
196
- filePath: spec.filePath,
197
- result: {
198
- status: "failed",
199
- error: error instanceof Error ? error.message : String(error),
200
- duration: Date.now() - startTime,
201
- },
202
- };
203
- }
224
+ return results;
225
+ }
226
+ function makeErrorResult(spec, error, duration) {
227
+ return {
228
+ specId: spec.id,
229
+ name: spec.name,
230
+ filePath: spec.filePath,
231
+ result: { status: "failed", error, duration },
232
+ };
204
233
  }
205
234
  /**
206
235
  * Calculate summary statistics
@@ -348,7 +377,8 @@ function printHumanResults(result) {
348
377
  console.log(` ❌ Failed: ${result.summary.failed}`);
349
378
  console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
350
379
  console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
351
- console.log("\n📋 Individual Results:");
380
+ const hasScores = result.results.some((r) => r.result.score !== undefined);
381
+ console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
352
382
  for (const spec of result.results) {
353
383
  const status = spec.result.status === "passed"
354
384
  ? "✅"
package/dist/index.d.ts CHANGED
@@ -10,7 +10,7 @@ export { AIEvalClient } from "./client";
10
10
  import { AuthenticationError, EvalGateError, NetworkError, RateLimitError, SDKError } from "./errors";
11
11
  export { EvalGateError, RateLimitError, AuthenticationError, SDKError as ValidationError, // Using SDKError as ValidationError for backward compatibility
12
12
  NetworkError, };
13
- export { containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, expect, followsInstructions, hasFactualAccuracy, hasLength, hasNoHallucinations, hasNoToxicity, hasReadabilityScore, hasSentiment, hasValidCodeSyntax, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinTime, similarTo, withinRange, } from "./assertions";
13
+ export { containsAllRequiredFields, containsJSON, containsKeywords, containsLanguage, expect, followsInstructions, hasFactualAccuracy, hasLength, hasNoHallucinations, hasNoToxicity, hasPII, hasReadabilityScore, hasSentiment, hasValidCodeSyntax, isValidEmail, isValidURL, matchesPattern, matchesSchema, notContainsPII, respondedWithinTime, similarTo, withinRange, } from "./assertions";
14
14
  import { createContext, EvalContext, getCurrentContext, withContext } from "./context";
15
15
  export { createContext, getCurrentContext as getContext, withContext, EvalContext as ContextManager, };
16
16
  export { cloneContext, mergeContexts, validateContext, } from "./runtime/context";
package/dist/index.js CHANGED
@@ -8,8 +8,8 @@
8
8
  * @packageDocumentation
9
9
  */
10
10
  Object.defineProperty(exports, "__esModule", { value: true });
11
- exports.createTestSuite = exports.SpecRegistrationError = exports.SpecExecutionError = exports.RuntimeError = exports.EvalRuntimeError = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.defaultLocalExecutor = exports.createLocalExecutor = exports.evalai = exports.defineSuite = exports.defineEval = exports.createResult = exports.createEvalContext = exports.validateContext = exports.mergeContexts = exports.cloneContext = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalGateError = exports.AIEvalClient = void 0;
12
- exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = void 0;
11
+ exports.SpecRegistrationError = exports.SpecExecutionError = exports.RuntimeError = exports.EvalRuntimeError = exports.setActiveRuntime = exports.getActiveRuntime = exports.disposeActiveRuntime = exports.createEvalRuntime = exports.defaultLocalExecutor = exports.createLocalExecutor = exports.evalai = exports.defineSuite = exports.defineEval = exports.createResult = exports.createEvalContext = exports.validateContext = exports.mergeContexts = exports.cloneContext = exports.ContextManager = exports.withContext = exports.getContext = exports.createContext = exports.withinRange = exports.similarTo = exports.respondedWithinTime = exports.notContainsPII = exports.matchesSchema = exports.matchesPattern = exports.isValidURL = exports.isValidEmail = exports.hasValidCodeSyntax = exports.hasSentiment = exports.hasReadabilityScore = exports.hasPII = exports.hasNoToxicity = exports.hasNoHallucinations = exports.hasLength = exports.hasFactualAccuracy = exports.followsInstructions = exports.expect = exports.containsLanguage = exports.containsKeywords = exports.containsJSON = exports.containsAllRequiredFields = exports.NetworkError = exports.ValidationError = exports.AuthenticationError = exports.RateLimitError = exports.EvalGateError = exports.AIEvalClient = void 0;
12
+ exports.WorkflowTracer = exports.traceWorkflowStep = exports.traceLangChainAgent = exports.traceCrewAI = exports.traceAutoGen = exports.createWorkflowTracer = exports.EvaluationTemplates = exports.streamEvaluation = exports.RateLimiter = exports.batchRead = exports.batchProcess = exports.REPORT_SCHEMA_VERSION = exports.GATE_EXIT = exports.GATE_CATEGORY = exports.ARTIFACTS = exports.PaginatedIterator = exports.encodeCursor = exports.decodeCursor = exports.createPaginatedIterator = exports.autoPaginate = exports.extendExpectWithToPassGate = exports.Logger = exports.openAIChatEval = exports.traceOpenAI = exports.traceAnthropic = exports.runCheck = exports.parseArgs = exports.EXIT = exports.RequestCache = exports.CacheTTL = exports.RequestBatcher = exports.importData = exports.exportData = exports.compareSnapshots = exports.saveSnapshot = exports.compareWithSnapshot = exports.snapshot = exports.TestSuite = exports.createTestSuite = void 0;
13
13
  // Main SDK exports
14
14
  var client_1 = require("./client");
15
15
  Object.defineProperty(exports, "AIEvalClient", { enumerable: true, get: function () { return client_1.AIEvalClient; } });
@@ -32,6 +32,7 @@ Object.defineProperty(exports, "hasFactualAccuracy", { enumerable: true, get: fu
32
32
  Object.defineProperty(exports, "hasLength", { enumerable: true, get: function () { return assertions_1.hasLength; } });
33
33
  Object.defineProperty(exports, "hasNoHallucinations", { enumerable: true, get: function () { return assertions_1.hasNoHallucinations; } });
34
34
  Object.defineProperty(exports, "hasNoToxicity", { enumerable: true, get: function () { return assertions_1.hasNoToxicity; } });
35
+ Object.defineProperty(exports, "hasPII", { enumerable: true, get: function () { return assertions_1.hasPII; } });
35
36
  Object.defineProperty(exports, "hasReadabilityScore", { enumerable: true, get: function () { return assertions_1.hasReadabilityScore; } });
36
37
  Object.defineProperty(exports, "hasSentiment", { enumerable: true, get: function () { return assertions_1.hasSentiment; } });
37
38
  Object.defineProperty(exports, "hasValidCodeSyntax", { enumerable: true, get: function () { return assertions_1.hasValidCodeSyntax; } });
@@ -18,10 +18,19 @@ export declare const evalai: {
18
18
  test: DefineEvalFunction;
19
19
  };
20
20
  /**
21
- * Suite definition for grouping related specifications
22
- * This will be expanded in Layer 3 for dependency graph support
21
+ * Suite definition for grouping related specifications.
22
+ * Accepts both a positional form and an object form:
23
+ *
24
+ * @example Positional form:
25
+ * defineSuite('My Suite', [() => defineEval('spec 1', executor), ...])
26
+ *
27
+ * @example Object form:
28
+ * defineSuite({ name: 'My Suite', specs: [() => defineEval('spec 1', executor), ...] })
23
29
  */
24
- export declare function defineSuite(_name: string, specs: (() => void)[]): void;
30
+ export declare function defineSuite(nameOrConfig: string | {
31
+ name: string;
32
+ specs: (() => void)[];
33
+ }, specsArg?: (() => void)[]): void;
25
34
  /**
26
35
  * Helper function to create specification contexts
27
36
  * Useful for testing and manual execution
@@ -204,13 +204,22 @@ exports.evalai = {
204
204
  test: exports.defineEval,
205
205
  };
206
206
  /**
207
- * Suite definition for grouping related specifications
208
- * This will be expanded in Layer 3 for dependency graph support
207
+ * Suite definition for grouping related specifications.
208
+ * Accepts both a positional form and an object form:
209
+ *
210
+ * @example Positional form:
211
+ * defineSuite('My Suite', [() => defineEval('spec 1', executor), ...])
212
+ *
213
+ * @example Object form:
214
+ * defineSuite({ name: 'My Suite', specs: [() => defineEval('spec 1', executor), ...] })
209
215
  */
210
- function defineSuite(_name, specs) {
211
- // For now, just execute the specs to register them
212
- // In Layer 3, this will build the dependency graph
213
- for (const specFn of specs) {
216
+ function defineSuite(nameOrConfig, specsArg) {
217
+ const specFns = typeof nameOrConfig === "string"
218
+ ? (specsArg ?? [])
219
+ : (nameOrConfig.specs ?? []);
220
+ // Execute each spec function to register its defineEval calls
221
+ // In Layer 3, this will also build the dependency graph
222
+ for (const specFn of specFns) {
214
223
  specFn();
215
224
  }
216
225
  }
@@ -9,7 +9,7 @@
9
9
  * import { snapshot, loadSnapshot } from '@ai-eval-platform/sdk';
10
10
  *
11
11
  * const output = await generateText('Write a haiku about coding');
12
- * await snapshot(output, 'haiku-test');
12
+ * await snapshot('haiku-test', output);
13
13
  *
14
14
  * // Later, compare with snapshot
15
15
  * const saved = await loadSnapshot('haiku-test');
@@ -135,10 +135,10 @@ export declare class SnapshotManager {
135
135
  * @example
136
136
  * ```typescript
137
137
  * const output = await generateText('Write a haiku');
138
- * await snapshot(output, 'haiku-test');
138
+ * await snapshot('haiku-test', output);
139
139
  * ```
140
140
  */
141
- export declare function snapshot(output: string, name: string, options?: {
141
+ export declare function snapshot(name: string, output: string, options?: {
142
142
  tags?: string[];
143
143
  metadata?: Record<string, unknown>;
144
144
  overwrite?: boolean;
package/dist/snapshot.js CHANGED
@@ -10,7 +10,7 @@
10
10
  * import { snapshot, loadSnapshot } from '@ai-eval-platform/sdk';
11
11
  *
12
12
  * const output = await generateText('Write a haiku about coding');
13
- * await snapshot(output, 'haiku-test');
13
+ * await snapshot('haiku-test', output);
14
14
  *
15
15
  * // Later, compare with snapshot
16
16
  * const saved = await loadSnapshot('haiku-test');
@@ -271,10 +271,10 @@ function getSnapshotManager(dir) {
271
271
  * @example
272
272
  * ```typescript
273
273
  * const output = await generateText('Write a haiku');
274
- * await snapshot(output, 'haiku-test');
274
+ * await snapshot('haiku-test', output);
275
275
  * ```
276
276
  */
277
- async function snapshot(output, name, options) {
277
+ async function snapshot(name, output, options) {
278
278
  const manager = getSnapshotManager(options?.dir);
279
279
  return manager.save(name, output, options);
280
280
  }
package/dist/version.d.ts CHANGED
@@ -3,5 +3,5 @@
3
3
  * X-EvalGate-SDK-Version: SDK package version
4
4
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
5
5
  */
6
- export declare const SDK_VERSION = "2.1.0";
7
- export declare const SPEC_VERSION = "2.1.0";
6
+ export declare const SDK_VERSION = "2.2.0";
7
+ export declare const SPEC_VERSION = "2.2.0";
package/dist/version.js CHANGED
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
6
6
  * X-EvalGate-SDK-Version: SDK package version
7
7
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
8
8
  */
9
- exports.SDK_VERSION = "2.1.0";
10
- exports.SPEC_VERSION = "2.1.0";
9
+ exports.SDK_VERSION = "2.2.0";
10
+ exports.SPEC_VERSION = "2.2.0";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@evalgate/sdk",
3
- "version": "2.1.2",
3
+ "version": "2.2.0",
4
4
  "publishConfig": {
5
5
  "access": "public",
6
6
  "registry": "https://registry.npmjs.org/"