@evalgate/sdk 2.1.2 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,18 @@ All notable changes to the @evalgate/sdk package will be documented in this file
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.1.3] - 2026-03-02
9
+
10
+ ### Fixed
11
+
12
+ - **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
13
+ - **High:** First-run gate false regression on fresh init when no test script exists
14
+ - **High:** Doctor defaulted baseUrl to localhost:3000 instead of the production API (now defaults to https://api.evalgate.com)
15
+ - **Critical:** Simulated executeSpec replaced with real spec execution
16
+ - **High:** Run output now explains how spec scores are derived, for clarity
17
+ - **Low:** Explain no longer shows "unnamed" for builtin gate failures
18
+ - **Docs:** Added missing `discover --manifest` step to local quickstart
19
+
8
20
  ## [2.1.2] - 2026-03-02
9
21
 
10
22
  ### Fixed
package/README.md CHANGED
@@ -254,6 +254,33 @@ All commands automatically write artifacts so `explain` works with zero flags.
254
254
  npm install @evalgate/sdk openai
255
255
  ```
256
256
 
257
+ Create `eval/your-spec.spec.ts`:
258
+
259
+ ```typescript
260
+ import { defineEval } from "@evalgate/sdk";
261
+
262
+ defineEval({
263
+ name: "Basic Math Operations",
264
+ description: "Test fundamental arithmetic",
265
+ prompt: "Test: 1+1=2, string concatenation, array includes",
266
+ expected: "All tests should pass",
267
+ tags: ["basic", "math"],
268
+ category: "unit-test"
269
+ });
270
+ ```
271
+
272
+ ```bash
273
+ # Discover specs, then generate the manifest (both steps are required)
274
+ npx @evalgate/sdk discover
275
+ npx @evalgate/sdk discover --manifest
276
+
277
+ # Run evaluations
278
+ npx @evalgate/sdk run --write-results
279
+
280
+ # Run local regression gate
281
+ npx @evalgate/sdk gate
282
+ ```
283
+
257
284
  ```typescript
258
285
  import { openAIChatEval } from "@evalgate/sdk";
259
286
 
@@ -145,8 +145,8 @@ async function analyzeSpecifications(specFiles) {
145
145
  for (const filePath of specFiles) {
146
146
  try {
147
147
  const content = await fs.readFile(filePath, "utf-8");
148
- const analysis = analyzeSpecFile(filePath, content);
149
- specs.push(analysis);
148
+ const fileSpecs = analyzeSpecFile(filePath, content);
149
+ specs.push(...fileSpecs);
150
150
  }
151
151
  catch (error) {
152
152
  console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
@@ -155,20 +155,40 @@ async function analyzeSpecifications(specFiles) {
155
155
  return specs;
156
156
  }
157
157
  /**
158
- * Analyze a single specification file
158
+ * Extract all spec names from file content (handles both call forms)
159
+ */
160
+ function extractSpecNames(content) {
161
+ const names = [];
162
+ // Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
163
+ const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
164
+ let m = stringArgPattern.exec(content);
165
+ while (m !== null) {
166
+ names.push(m[1]);
167
+ m = stringArgPattern.exec(content);
168
+ }
169
+ if (names.length > 0)
170
+ return names;
171
+ // Form 2: defineEval({ name: "..." }) — object-first form
172
+ const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
173
+ m = objNamePattern.exec(content);
174
+ while (m !== null) {
175
+ names.push(m[1]);
176
+ m = objNamePattern.exec(content);
177
+ }
178
+ return names;
179
+ }
180
+ /**
181
+ * Analyze a single specification file — returns one SpecAnalysis per defineEval call
159
182
  */
160
183
  function analyzeSpecFile(filePath, content) {
161
- // Extract defineEval calls
162
- const defineEvalMatches = content.match(/defineEval\s*\([^)]+\)/g) || [];
163
- const specNames = defineEvalMatches.map((match) => {
164
- const nameMatch = match.match(/["'`](.+?)["'`](?:\s*,|\s*\))/);
165
- return nameMatch ? nameMatch[1] : "unnamed";
166
- });
167
- // Extract tags
184
+ const specNames = extractSpecNames(content);
185
+ // Fallback: file matched as a spec file but we couldn't parse names
186
+ if (specNames.length === 0) {
187
+ specNames.push(path.basename(filePath, path.extname(filePath)));
188
+ }
189
+ // Shared analysis for the file
168
190
  const tags = extractTags(content);
169
- // Analyze complexity
170
191
  const complexity = analyzeComplexity(content);
171
- // Check for models and tools
172
192
  const usesModels = content.includes("model:") ||
173
193
  content.includes("model=") ||
174
194
  content.includes("openai") ||
@@ -176,22 +196,20 @@ function analyzeSpecFile(filePath, content) {
176
196
  const usesTools = content.includes("tool:") ||
177
197
  content.includes("function.") ||
178
198
  content.includes("call(");
179
- // Check for assertions
180
199
  const hasAssertions = content.includes("assert") ||
181
200
  content.includes("expect") ||
182
201
  content.includes("should");
183
- // Generate ID from file path
184
- const id = generateSpecId(filePath);
185
- return {
186
- id,
187
- name: specNames[0] || path.basename(filePath, ".ts"),
188
- file: path.relative(process.cwd(), filePath),
202
+ const relFile = path.relative(process.cwd(), filePath);
203
+ return specNames.map((name, idx) => ({
204
+ id: generateSpecId(filePath, name, idx),
205
+ name,
206
+ file: relFile,
189
207
  tags,
190
208
  hasAssertions,
191
209
  usesModels,
192
210
  usesTools,
193
211
  complexity,
194
- };
212
+ }));
195
213
  }
196
214
  /**
197
215
  * Extract tags from specification content
@@ -263,11 +281,12 @@ function analyzeComplexity(content) {
263
281
  return "complex";
264
282
  }
265
283
  /**
266
- * Generate specification ID from file path
284
+ * Generate specification ID from file path + name + index (unique per defineEval call)
267
285
  */
268
- function generateSpecId(filePath) {
286
+ function generateSpecId(filePath, name, index) {
269
287
  const relativePath = path.relative(process.cwd(), filePath);
270
- const hash = Buffer.from(relativePath)
288
+ const key = `${relativePath}:${name}:${index}`;
289
+ const hash = Buffer.from(key)
271
290
  .toString("base64")
272
291
  .replace(/[+/=]/g, "")
273
292
  .slice(0, 8);
@@ -96,7 +96,7 @@ function parseFlags(argv) {
96
96
  const baseUrl = raw.baseUrl ||
97
97
  process.env.EVALGATE_BASE_URL ||
98
98
  process.env.EVALAI_BASE_URL ||
99
- "http://localhost:3000";
99
+ "https://api.evalgate.com";
100
100
  const apiKey = raw.apiKey ||
101
101
  process.env.EVALGATE_API_KEY ||
102
102
  process.env.EVALAI_API_KEY ||
@@ -430,6 +430,7 @@ function buildFromBuiltinReport(report, reportPath) {
430
430
  }));
431
431
  const topFailures = failures.slice(0, 3).map((f, i) => ({
432
432
  rank: i + 1,
433
+ name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
433
434
  reason: f,
434
435
  }));
435
436
  // Simple root cause for builtin reports
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
94
94
  }
95
95
  return "unknown";
96
96
  }
97
+ function hasTestScript(cwd) {
98
+ try {
99
+ const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
100
+ const script = pkg.scripts?.test ?? "";
101
+ return !!script && script !== 'echo "Error: no test specified" && exit 1';
102
+ }
103
+ catch {
104
+ return false;
105
+ }
106
+ }
97
107
  function runBuiltinGate(cwd) {
98
108
  const t0 = Date.now();
99
109
  const baselinePath = path.join(cwd, BASELINE_REL);
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
101
111
  const pm = detectPackageManager(cwd);
102
112
  const command = `${pm} test`;
103
113
  const runner = detectRunner(cwd);
114
+ const projectHasTestScript = hasTestScript(cwd);
104
115
  // Load baseline
105
116
  if (!fs.existsSync(baselinePath)) {
106
117
  return {
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
165
176
  const baselineTotal = baselineData.confidenceTests?.total ?? 0;
166
177
  const failures = [];
167
178
  const deltas = [];
168
- // Delta: tests passing
169
- deltas.push({
170
- metric: "tests_passing",
171
- baseline: baselinePassed,
172
- current: testsPassed,
173
- delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
174
- status: testsPassed ? "pass" : "fail",
175
- });
176
- if (!testsPassed && baselinePassed) {
177
- failures.push("Tests were passing in baseline but are now failing");
179
+ // Delta: tests passing — only meaningful when a test script exists
180
+ if (projectHasTestScript) {
181
+ deltas.push({
182
+ metric: "tests_passing",
183
+ baseline: baselinePassed,
184
+ current: testsPassed,
185
+ delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
186
+ status: testsPassed ? "pass" : "fail",
187
+ });
188
+ if (!testsPassed && baselinePassed) {
189
+ failures.push("Tests were passing in baseline but are now failing");
190
+ }
178
191
  }
179
192
  // Delta: test count (only if we captured counts)
180
193
  if (testCount > 0 || baselineTotal > 0) {
package/dist/cli/run.js CHANGED
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
52
52
  const node_child_process_1 = require("node:child_process");
53
53
  const fs = __importStar(require("node:fs/promises"));
54
54
  const path = __importStar(require("node:path"));
55
+ const registry_1 = require("../runtime/registry");
55
56
  const impact_analysis_1 = require("./impact-analysis");
56
57
  /**
57
58
  * Generate deterministic run ID
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
138
139
  }
139
140
  }
140
141
  /**
141
- * Execute specifications
142
+ * Execute specifications — grouped by file to avoid redundant loads
142
143
  */
143
144
  async function executeSpecs(specs) {
144
- const results = [];
145
+ // Group specs by their absolute file path
146
+ const specsByFile = new Map();
145
147
  for (const spec of specs) {
146
- const result = await executeSpec(spec);
147
- results.push(result);
148
+ const abs = path.isAbsolute(spec.filePath)
149
+ ? spec.filePath
150
+ : path.join(process.cwd(), spec.filePath);
151
+ const group = specsByFile.get(abs) ?? [];
152
+ group.push(spec);
153
+ specsByFile.set(abs, group);
148
154
  }
149
- return results;
150
- }
151
- /**
152
- * Execute individual specification
153
- */
154
- async function executeSpec(spec) {
155
- const startTime = Date.now();
156
- try {
157
- // For now, simulate execution
158
- // In a real implementation, this would:
159
- // 1. Load the spec file
160
- // 2. Execute the defineEval function
161
- // 3. Capture the result
162
- // Simulate some work
163
- await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
164
- // Simulate success/failure (90% success rate for demo)
165
- const success = Math.random() > 0.1;
166
- const duration = Date.now() - startTime;
167
- if (success) {
168
- return {
169
- specId: spec.id,
170
- name: spec.name,
171
- filePath: spec.filePath,
172
- result: {
173
- status: "passed",
174
- score: Math.random() * 0.3 + 0.7, // 0.7-1.0
175
- duration,
176
- },
177
- };
155
+ const results = [];
156
+ for (const [absPath, fileSpecs] of specsByFile) {
157
+ // Fresh runtime per file to avoid cross-file contamination
158
+ (0, registry_1.disposeActiveRuntime)();
159
+ try {
160
+ // Bust require cache so the file re-executes its defineEval calls
161
+ delete require.cache[require.resolve(absPath)];
178
162
  }
179
- else {
180
- return {
181
- specId: spec.id,
182
- name: spec.name,
183
- filePath: spec.filePath,
184
- result: {
185
- status: "failed",
186
- error: "Simulated execution failure",
187
- duration,
188
- },
189
- };
163
+ catch {
164
+ // Not in cache yet — fine
165
+ }
166
+ try {
167
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
168
+ require(absPath);
169
+ }
170
+ catch (loadError) {
171
+ const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
172
+ const msg = isTs &&
173
+ loadError instanceof Error &&
174
+ (loadError.message.includes("Unknown file extension") ||
175
+ loadError.message.includes("SyntaxError"))
176
+ ? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
177
+ : loadError instanceof Error
178
+ ? loadError.message
179
+ : String(loadError);
180
+ for (const spec of fileSpecs) {
181
+ results.push(makeErrorResult(spec, msg, 0));
182
+ }
183
+ continue;
184
+ }
185
+ const runtime = (0, registry_1.getActiveRuntime)();
186
+ const registered = runtime.list();
187
+ for (const spec of fileSpecs) {
188
+ const registeredSpec = registered.find((r) => r.name === spec.name);
189
+ if (!registeredSpec) {
190
+ results.push({
191
+ specId: spec.id,
192
+ name: spec.name,
193
+ filePath: spec.filePath,
194
+ result: {
195
+ status: "skipped",
196
+ error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
197
+ duration: 0,
198
+ },
199
+ });
200
+ continue;
201
+ }
202
+ const startTime = Date.now();
203
+ try {
204
+ const evalResult = await registeredSpec.executor({ input: "" });
205
+ results.push({
206
+ specId: spec.id,
207
+ name: spec.name,
208
+ filePath: spec.filePath,
209
+ result: {
210
+ status: evalResult.pass ? "passed" : "failed",
211
+ score: typeof evalResult.score === "number"
212
+ ? evalResult.score / 100
213
+ : undefined,
214
+ error: evalResult.error,
215
+ duration: Date.now() - startTime,
216
+ },
217
+ });
218
+ }
219
+ catch (execError) {
220
+ results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
221
+ }
190
222
  }
191
223
  }
192
- catch (error) {
193
- return {
194
- specId: spec.id,
195
- name: spec.name,
196
- filePath: spec.filePath,
197
- result: {
198
- status: "failed",
199
- error: error instanceof Error ? error.message : String(error),
200
- duration: Date.now() - startTime,
201
- },
202
- };
203
- }
224
+ return results;
225
+ }
226
+ function makeErrorResult(spec, error, duration) {
227
+ return {
228
+ specId: spec.id,
229
+ name: spec.name,
230
+ filePath: spec.filePath,
231
+ result: { status: "failed", error, duration },
232
+ };
204
233
  }
205
234
  /**
206
235
  * Calculate summary statistics
@@ -348,7 +377,8 @@ function printHumanResults(result) {
348
377
  console.log(` ❌ Failed: ${result.summary.failed}`);
349
378
  console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
350
379
  console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
351
- console.log("\n📋 Individual Results:");
380
+ const hasScores = result.results.some((r) => r.result.score !== undefined);
381
+ console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
352
382
  for (const spec of result.results) {
353
383
  const status = spec.result.status === "passed"
354
384
  ? "✅"
package/dist/version.d.ts CHANGED
@@ -3,5 +3,5 @@
3
3
  * X-EvalGate-SDK-Version: SDK package version
4
4
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
5
5
  */
6
- export declare const SDK_VERSION = "2.1.0";
7
- export declare const SPEC_VERSION = "2.1.0";
6
+ export declare const SDK_VERSION = "2.1.3";
7
+ export declare const SPEC_VERSION = "2.1.3";
package/dist/version.js CHANGED
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
6
6
  * X-EvalGate-SDK-Version: SDK package version
7
7
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
8
8
  */
9
- exports.SDK_VERSION = "2.1.0";
10
- exports.SPEC_VERSION = "2.1.0";
9
+ exports.SDK_VERSION = "2.1.3";
10
+ exports.SPEC_VERSION = "2.1.3";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@evalgate/sdk",
3
- "version": "2.1.2",
3
+ "version": "2.1.3",
4
4
  "publishConfig": {
5
5
  "access": "public",
6
6
  "registry": "https://registry.npmjs.org/"