npm - @evalgate/sdk - Versions diffs - 2.1.0 → 2.1.3 - Mend

@evalgate/sdk 2.1.0 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/CHANGELOG.md +25 -0
package/README.md +47 -20
package/dist/cli/discover.js +42 -23
package/dist/cli/doctor.js +1 -1
package/dist/cli/explain.js +1 -0
package/dist/cli/regression-gate.js +23 -10
package/dist/cli/run.js +87 -57
package/dist/version.d.ts +2 -2
package/dist/version.js +2 -2
package/package.json +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -5,6 +5,31 @@ All notable changes to the @evalgate/sdk package will be documented in this file
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [2.1.3] - 2026-03-02
+### Fixed
+- **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
+- **High:** First-run gate false regression on fresh init when no test script exists
+- **High:** Doctor defaults baseUrl to localhost:3000 instead of production API
+- **Critical:** Simulated executeSpec replaced with real spec execution
+- **High:** Run scores now include scoring model context for clarity
+- **Low:** Explain no longer shows "unnamed" for builtin gate failures
+- **Docs:** Added missing `discover --manifest` step to local quickstart
+## [2.1.2] - 2026-03-02
+### Fixed
+- **Type safety** — aligned with platform 2.1.2; zero TypeScript errors across all integration points
+- **CI gate** — all SDK tests, lint, and build checks passing
+## [2.1.1] - 2026-03-02
+### Fixed
+- Version alignment with platform 2.1.1
 ## [2.0.0] - 2026-03-01
 ### Breaking — EvalGate Rebrand

package/README.md CHANGED Viewed

@@ -15,13 +15,13 @@ Zero to production CI in 60 seconds. No infra. No lock-in. Remove anytime.
 ## Quick Start (60 seconds)
-Add this to your `.github/workflows/evalai.yml`:
+Add this to your `.github/workflows/evalgate.yml`:
 ```yaml
 name: EvalGate CI
 on: [push, pull_request]
 jobs:
-  evalai:
+  evalgate:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -31,8 +31,8 @@ jobs:
       - uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: evalai-results
-          path: .evalai/
+          name: evalgate-results
+          path: .evalgate/
 ```
 Create `eval/your-spec.spec.ts`:
@@ -51,7 +51,7 @@ defineEval({
 ```
 ```bash
-git add .github/workflows/evalai.yml eval/
+git add .github/workflows/evalgate.yml eval/
 git commit -m "feat: add EvalGate CI pipeline"
 git push
 ```
@@ -67,7 +67,7 @@ That's it! Your CI now:
 ## 🚀 New in v2.0.0: One-Command CI
-### `evalai ci` - Complete CI Pipeline
+### `evalgate ci` - Complete CI Pipeline
 ```bash
 npx @evalgate/sdk ci --format github --write-results --base main
@@ -108,8 +108,8 @@ Every failure prints a clear next step:
 ```
 🔧 Next step for debugging:
-   Download base artifact and run: evalai diff --base .evalai/base-run.json --head .evalai/last-run.json
-   Artifacts: .evalai/runs/
+   Download base artifact and run: evalgate diff --base .evalgate/base-run.json --head .evalgate/last-run.json
+   Artifacts: .evalgate/runs/
 ```
 ---
@@ -181,23 +181,23 @@ Every failure prints a clear next step:
 | Command | Description |
 |---------|-------------|
-| `npx evalgate migrate config --in evalai.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
+| `npx evalgate migrate config --in evalgate.config.json --out eval/migrated.spec.ts` | Convert legacy config to DSL |
 **Guided failure flow:**
 ```
-evalai ci  →  fails  →  "Next: evalai explain --report .evalai/last-run.json"
+evalgate ci  →  fails  →  "Next: evalgate explain --report .evalgate/last-run.json"
                               ↓
-                   evalai explain  →  root causes + fixes
+                   evalgate explain  →  root causes + fixes
 ```
 **GitHub Actions step summary** — CI result at a glance with regressions and artifacts:
-![GitHub Actions step summary showing CI pass/fail with delta table](../../docs/images/evalai-gate-step-summary.svg)
+![GitHub Actions step summary showing CI pass/fail with delta table](../../docs/images/evalgate-gate-step-summary.svg)
-**`evalai explain` terminal output** — root causes + fix commands:
+**`evalgate explain` terminal output** — root causes + fix commands:
-![Terminal output of evalai explain showing top failures and suggested fixes](../../docs/images/evalai-explain-terminal.svg)
+![Terminal output of evalgate explain showing top failures and suggested fixes](../../docs/images/evalgate-explain-terminal.svg)
 All commands automatically write artifacts so `explain` works with zero flags.
@@ -254,6 +254,33 @@ All commands automatically write artifacts so `explain` works with zero flags.
 npm install @evalgate/sdk openai
 ```
+Create `eval/your-spec.spec.ts`:
+```typescript
+import { defineEval } from "@evalgate/sdk";
+defineEval({
+  name: "Basic Math Operations",
+  description: "Test fundamental arithmetic",
+  prompt: "Test: 1+1=2, string concatenation, array includes",
+  expected: "All tests should pass",
+  tags: ["basic", "math"],
+  category: "unit-test"
+});
+```
+```bash
+# Discover specs and generate manifest
+npx @evalgate/sdk discover
+npx @evalgate/sdk discover --manifest
+# Run evaluations
+npx @evalgate/sdk run --write-results
+# Run local regression gate
+npx @evalgate/sdk gate
+```
 ```typescript
 import { openAIChatEval } from "@evalgate/sdk";
@@ -324,7 +351,7 @@ import type {
 ```typescript
 import { AIEvalClient } from "@evalgate/sdk";
-const client = AIEvalClient.init(); // from EVALAI_API_KEY env
+const client = AIEvalClient.init(); // from EVALGATE_API_KEY env
 // or
 const client = new AIEvalClient({ apiKey: "...", organizationId: 123 });
 ```
@@ -367,7 +394,7 @@ npm install openai
 ## No Lock-in
 ```bash
-rm evalai.config.json
+rm evalgate.config.json
 ```
 Your local `openAIChatEval` runs continue to work. No account cancellation. No data export required.
@@ -376,17 +403,17 @@ Your local `openAIChatEval` runs continue to work. No account cancellation. No d
 See [CHANGELOG.md](CHANGELOG.md) for the full release history.
-**v1.8.0** — `evalai doctor` rewrite (9-check checklist), `evalai explain` command, guided failure flow, CI template with doctor preflight
+**v1.8.0** — `evalgate doctor` rewrite (9-check checklist), `evalgate explain` command, guided failure flow, CI template with doctor preflight
-**v1.7.0** — `evalai init` scaffolder, `evalai upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
+**v1.7.0** — `evalgate init` scaffolder, `evalgate upgrade --full`, `detectRunner()`, machine-readable gate output, init test matrix
-**v1.6.0** — `evalai gate`, `evalai baseline`, regression gate constants & types
+**v1.6.0** — `evalgate gate`, `evalgate baseline`, regression gate constants & types
 **v1.5.8** — secureRoute fix, test infra fixes, 304 handling fix
 **v1.5.5** — PASS/WARN/FAIL semantics, flake intelligence, golden regression suite
-**v1.5.0** — GitHub annotations, `--onFail import`, `evalai doctor`
+**v1.5.0** — GitHub annotations, `--onFail import`, `evalgate doctor`
 ## License

package/dist/cli/discover.js CHANGED Viewed

@@ -145,8 +145,8 @@ async function analyzeSpecifications(specFiles) {
     for (const filePath of specFiles) {
         try {
             const content = await fs.readFile(filePath, "utf-8");
-            const analysis = analyzeSpecFile(filePath, content);
-            specs.push(analysis);
+            const fileSpecs = analyzeSpecFile(filePath, content);
+            specs.push(...fileSpecs);
         }
         catch (error) {
             console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
@@ -155,20 +155,40 @@ async function analyzeSpecifications(specFiles) {
     return specs;
 }
 /**
- * Analyze a single specification file
+ * Extract all spec names from file content (handles both call forms)
+ */
+function extractSpecNames(content) {
+    const names = [];
+    // Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
+    const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
+    let m = stringArgPattern.exec(content);
+    while (m !== null) {
+        names.push(m[1]);
+        m = stringArgPattern.exec(content);
+    }
+    if (names.length > 0)
+        return names;
+    // Form 2: defineEval({ name: "..." }) — object-first form
+    const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
+    m = objNamePattern.exec(content);
+    while (m !== null) {
+        names.push(m[1]);
+        m = objNamePattern.exec(content);
+    }
+    return names;
+}
+/**
+ * Analyze a single specification file — returns one SpecAnalysis per defineEval call
  */
 function analyzeSpecFile(filePath, content) {
-    // Extract defineEval calls
-    const defineEvalMatches = content.match(/defineEval\s*\([^)]+\)/g) || [];
-    const specNames = defineEvalMatches.map((match) => {
-        const nameMatch = match.match(/["'`](.+?)["'`](?:\s*,|\s*\))/);
-        return nameMatch ? nameMatch[1] : "unnamed";
-    });
-    // Extract tags
+    const specNames = extractSpecNames(content);
+    // Fallback: file matched as a spec file but we couldn't parse names
+    if (specNames.length === 0) {
+        specNames.push(path.basename(filePath, path.extname(filePath)));
+    }
+    // Shared analysis for the file
     const tags = extractTags(content);
-    // Analyze complexity
     const complexity = analyzeComplexity(content);
-    // Check for models and tools
     const usesModels = content.includes("model:") ||
         content.includes("model=") ||
         content.includes("openai") ||
@@ -176,22 +196,20 @@ function analyzeSpecFile(filePath, content) {
     const usesTools = content.includes("tool:") ||
         content.includes("function.") ||
         content.includes("call(");
-    // Check for assertions
     const hasAssertions = content.includes("assert") ||
         content.includes("expect") ||
         content.includes("should");
-    // Generate ID from file path
-    const id = generateSpecId(filePath);
-    return {
-        id,
-        name: specNames[0] || path.basename(filePath, ".ts"),
-        file: path.relative(process.cwd(), filePath),
+    const relFile = path.relative(process.cwd(), filePath);
+    return specNames.map((name, idx) => ({
+        id: generateSpecId(filePath, name, idx),
+        name,
+        file: relFile,
         tags,
         hasAssertions,
         usesModels,
         usesTools,
         complexity,
-    };
+    }));
 }
 /**
  * Extract tags from specification content
@@ -263,11 +281,12 @@ function analyzeComplexity(content) {
     return "complex";
 }
 /**
- * Generate specification ID from file path
+ * Generate specification ID from file path + name + index (unique per defineEval call)
  */
-function generateSpecId(filePath) {
+function generateSpecId(filePath, name, index) {
     const relativePath = path.relative(process.cwd(), filePath);
-    const hash = Buffer.from(relativePath)
+    const key = `${relativePath}:${name}:${index}`;
+    const hash = Buffer.from(key)
         .toString("base64")
         .replace(/[+/=]/g, "")
         .slice(0, 8);

package/dist/cli/doctor.js CHANGED Viewed

@@ -96,7 +96,7 @@ function parseFlags(argv) {
     const baseUrl = raw.baseUrl ||
         process.env.EVALGATE_BASE_URL ||
         process.env.EVALAI_BASE_URL ||
-        "http://localhost:3000";
+        "https://api.evalgate.com";
     const apiKey = raw.apiKey ||
         process.env.EVALGATE_API_KEY ||
         process.env.EVALAI_API_KEY ||

package/dist/cli/explain.js CHANGED Viewed

@@ -430,6 +430,7 @@ function buildFromBuiltinReport(report, reportPath) {
     }));
     const topFailures = failures.slice(0, 3).map((f, i) => ({
         rank: i + 1,
+        name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
         reason: f,
     }));
     // Simple root cause for builtin reports

package/dist/cli/regression-gate.js CHANGED Viewed

@@ -94,6 +94,16 @@ function detectRunner(cwd) {
     }
     return "unknown";
 }
+function hasTestScript(cwd) {
+    try {
+        const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
+        const script = pkg.scripts?.test ?? "";
+        return !!script && script !== 'echo "Error: no test specified" && exit 1';
+    }
+    catch {
+        return false;
+    }
+}
 function runBuiltinGate(cwd) {
     const t0 = Date.now();
     const baselinePath = path.join(cwd, BASELINE_REL);
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
     const pm = detectPackageManager(cwd);
     const command = `${pm} test`;
     const runner = detectRunner(cwd);
+    const projectHasTestScript = hasTestScript(cwd);
     // Load baseline
     if (!fs.existsSync(baselinePath)) {
         return {
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
     const baselineTotal = baselineData.confidenceTests?.total ?? 0;
     const failures = [];
     const deltas = [];
-    // Delta: tests passing
-    deltas.push({
-        metric: "tests_passing",
-        baseline: baselinePassed,
-        current: testsPassed,
-        delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
-        status: testsPassed ? "pass" : "fail",
-    });
-    if (!testsPassed && baselinePassed) {
-        failures.push("Tests were passing in baseline but are now failing");
+    // Delta: tests passing — only meaningful when a test script exists
+    if (projectHasTestScript) {
+        deltas.push({
+            metric: "tests_passing",
+            baseline: baselinePassed,
+            current: testsPassed,
+            delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
+            status: testsPassed ? "pass" : "fail",
+        });
+        if (!testsPassed && baselinePassed) {
+            failures.push("Tests were passing in baseline but are now failing");
+        }
     }
     // Delta: test count (only if we captured counts)
     if (testCount > 0 || baselineTotal > 0) {

package/dist/cli/run.js CHANGED Viewed

@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
 const node_child_process_1 = require("node:child_process");
 const fs = __importStar(require("node:fs/promises"));
 const path = __importStar(require("node:path"));
+const registry_1 = require("../runtime/registry");
 const impact_analysis_1 = require("./impact-analysis");
 /**
  * Generate deterministic run ID
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
     }
 }
 /**
- * Execute specifications
+ * Execute specifications — grouped by file to avoid redundant loads
  */
 async function executeSpecs(specs) {
-    const results = [];
+    // Group specs by their absolute file path
+    const specsByFile = new Map();
     for (const spec of specs) {
-        const result = await executeSpec(spec);
-        results.push(result);
+        const abs = path.isAbsolute(spec.filePath)
+            ? spec.filePath
+            : path.join(process.cwd(), spec.filePath);
+        const group = specsByFile.get(abs) ?? [];
+        group.push(spec);
+        specsByFile.set(abs, group);
     }
-    return results;
-}
-/**
- * Execute individual specification
- */
-async function executeSpec(spec) {
-    const startTime = Date.now();
-    try {
-        // For now, simulate execution
-        // In a real implementation, this would:
-        // 1. Load the spec file
-        // 2. Execute the defineEval function
-        // 3. Capture the result
-        // Simulate some work
-        await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
-        // Simulate success/failure (90% success rate for demo)
-        const success = Math.random() > 0.1;
-        const duration = Date.now() - startTime;
-        if (success) {
-            return {
-                specId: spec.id,
-                name: spec.name,
-                filePath: spec.filePath,
-                result: {
-                    status: "passed",
-                    score: Math.random() * 0.3 + 0.7, // 0.7-1.0
-                    duration,
-                },
-            };
+    const results = [];
+    for (const [absPath, fileSpecs] of specsByFile) {
+        // Fresh runtime per file to avoid cross-file contamination
+        (0, registry_1.disposeActiveRuntime)();
+        try {
+            // Bust require cache so the file re-executes its defineEval calls
+            delete require.cache[require.resolve(absPath)];
         }
-        else {
-            return {
-                specId: spec.id,
-                name: spec.name,
-                filePath: spec.filePath,
-                result: {
-                    status: "failed",
-                    error: "Simulated execution failure",
-                    duration,
-                },
-            };
+        catch {
+            // Not in cache yet — fine
+        }
+        try {
+            // eslint-disable-next-line @typescript-eslint/no-require-imports
+            require(absPath);
+        }
+        catch (loadError) {
+            const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
+            const msg = isTs &&
+                loadError instanceof Error &&
+                (loadError.message.includes("Unknown file extension") ||
+                    loadError.message.includes("SyntaxError"))
+                ? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
+                : loadError instanceof Error
+                    ? loadError.message
+                    : String(loadError);
+            for (const spec of fileSpecs) {
+                results.push(makeErrorResult(spec, msg, 0));
+            }
+            continue;
+        }
+        const runtime = (0, registry_1.getActiveRuntime)();
+        const registered = runtime.list();
+        for (const spec of fileSpecs) {
+            const registeredSpec = registered.find((r) => r.name === spec.name);
+            if (!registeredSpec) {
+                results.push({
+                    specId: spec.id,
+                    name: spec.name,
+                    filePath: spec.filePath,
+                    result: {
+                        status: "skipped",
+                        error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
+                        duration: 0,
+                    },
+                });
+                continue;
+            }
+            const startTime = Date.now();
+            try {
+                const evalResult = await registeredSpec.executor({ input: "" });
+                results.push({
+                    specId: spec.id,
+                    name: spec.name,
+                    filePath: spec.filePath,
+                    result: {
+                        status: evalResult.pass ? "passed" : "failed",
+                        score: typeof evalResult.score === "number"
+                            ? evalResult.score / 100
+                            : undefined,
+                        error: evalResult.error,
+                        duration: Date.now() - startTime,
+                    },
+                });
+            }
+            catch (execError) {
+                results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
+            }
         }
     }
-    catch (error) {
-        return {
-            specId: spec.id,
-            name: spec.name,
-            filePath: spec.filePath,
-            result: {
-                status: "failed",
-                error: error instanceof Error ? error.message : String(error),
-                duration: Date.now() - startTime,
-            },
-        };
-    }
+    return results;
+}
+function makeErrorResult(spec, error, duration) {
+    return {
+        specId: spec.id,
+        name: spec.name,
+        filePath: spec.filePath,
+        result: { status: "failed", error, duration },
+    };
 }
 /**
  * Calculate summary statistics
@@ -348,7 +377,8 @@ function printHumanResults(result) {
     console.log(`   ❌ Failed: ${result.summary.failed}`);
     console.log(`   ⏭️  Skipped: ${result.summary.skipped}`);
     console.log(`   📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
-    console.log("\n📋 Individual Results:");
+    const hasScores = result.results.some((r) => r.result.score !== undefined);
+    console.log(`\n📋 Individual Results:${hasScores ? "  (score = value returned by spec executor, 0–100)" : ""}`);
     for (const spec of result.results) {
         const status = spec.result.status === "passed"
             ? "✅"

package/dist/version.d.ts CHANGED Viewed

@@ -3,5 +3,5 @@
  * X-EvalGate-SDK-Version: SDK package version
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
  */
-export declare const SDK_VERSION = "2.1.0";
-export declare const SPEC_VERSION = "2.1.0";
+export declare const SDK_VERSION = "2.1.3";
+export declare const SPEC_VERSION = "2.1.3";

package/dist/version.js CHANGED Viewed

@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
  * X-EvalGate-SDK-Version: SDK package version
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
  */
-exports.SDK_VERSION = "2.1.0";
-exports.SPEC_VERSION = "2.1.0";
+exports.SDK_VERSION = "2.1.3";
+exports.SPEC_VERSION = "2.1.3";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@evalgate/sdk",
-	"version": "2.1.0",
+	"version": "2.1.3",
 	"publishConfig": {
 		"access": "public",
 		"registry": "https://registry.npmjs.org/"