@evalgate/sdk 2.1.2 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -5,6 +5,18 @@ All notable changes to the @evalgate/sdk package will be documented in this file
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [2.1.3] - 2026-03-02
9
+
10
+ ### Fixed
11
+
12
+ - **Critical:** Multi-`defineEval` calls per file — only first was discovered (silent data loss)
13
+ - **High:** First-run gate false regression on fresh init when no test script exists
14
+ - **High:** Doctor defaulted baseUrl to localhost:3000 instead of the production API (now defaults to https://api.evalgate.com)
15
+ - **Critical:** Simulated executeSpec replaced with real spec execution
16
+ - **High:** Run output now explains how spec scores are derived, for clarity
17
+ - **Low:** Explain no longer shows "unnamed" for builtin gate failures
18
+ - **Docs:** Added missing `discover --manifest` step to local quickstart
19
+
8
20
  ## [2.1.2] - 2026-03-02
9
21
 
10
22
  ### Fixed
package/README.md CHANGED
@@ -254,6 +254,33 @@ All commands automatically write artifacts so `explain` works with zero flags.
254
254
  npm install @evalgate/sdk openai
255
255
  ```
256
256
 
257
+ Create `eval/your-spec.spec.ts`:
258
+
259
+ ```typescript
260
+ import { defineEval } from "@evalgate/sdk";
261
+
262
+ defineEval({
263
+ name: "Basic Math Operations",
264
+ description: "Test fundamental arithmetic",
265
+ prompt: "Test: 1+1=2, string concatenation, array includes",
266
+ expected: "All tests should pass",
267
+ tags: ["basic", "math"],
268
+ category: "unit-test"
269
+ });
270
+ ```
271
+
272
+ ```bash
273
+ # Discover specs, then generate the manifest (both steps are required)
274
+ npx @evalgate/sdk discover
275
+ npx @evalgate/sdk discover --manifest
276
+
277
+ # Run evaluations
278
+ npx @evalgate/sdk run --write-results
279
+
280
+ # Run local regression gate
281
+ npx @evalgate/sdk gate
282
+ ```
283
+
257
284
  ```typescript
258
285
  import { openAIChatEval } from "@evalgate/sdk";
259
286
 
@@ -145,8 +145,8 @@ async function analyzeSpecifications(specFiles) {
145
145
  for (const filePath of specFiles) {
146
146
  try {
147
147
  const content = await fs.readFile(filePath, "utf-8");
148
- const analysis = analyzeSpecFile(filePath, content);
149
- specs.push(analysis);
148
+ const fileSpecs = analyzeSpecFile(filePath, content);
149
+ specs.push(...fileSpecs);
150
150
  }
151
151
  catch (error) {
152
152
  console.warn(`Warning: Could not analyze ${filePath}: ${error instanceof Error ? error.message : String(error)}`);
@@ -155,20 +155,40 @@ async function analyzeSpecifications(specFiles) {
155
155
  return specs;
156
156
  }
157
157
  /**
158
- * Analyze a single specification file
158
+ * Extract all spec names from file content (handles both call forms)
159
+ */
160
+ function extractSpecNames(content) {
161
+ const names = [];
162
+ // Form 1: defineEval("name", ...) or defineEval('name', ...) or defineEval(`name`, ...)
163
+ const stringArgPattern = /defineEval\s*\(\s*["'`]([^"'`]+)["'`]/g;
164
+ let m = stringArgPattern.exec(content);
165
+ while (m !== null) {
166
+ names.push(m[1]);
167
+ m = stringArgPattern.exec(content);
168
+ }
169
+ if (names.length > 0)
170
+ return names;
171
+ // Form 2: defineEval({ name: "..." }) — object-first form
172
+ const objNamePattern = /defineEval\s*\(\s*\{[\s\S]*?name\s*:\s*["'`]([^"'`]+)["'`]/g;
173
+ m = objNamePattern.exec(content);
174
+ while (m !== null) {
175
+ names.push(m[1]);
176
+ m = objNamePattern.exec(content);
177
+ }
178
+ return names;
179
+ }
180
+ /**
181
+ * Analyze a single specification file — returns one SpecAnalysis per defineEval call
159
182
  */
160
183
  function analyzeSpecFile(filePath, content) {
161
- // Extract defineEval calls
162
- const defineEvalMatches = content.match(/defineEval\s*\([^)]+\)/g) || [];
163
- const specNames = defineEvalMatches.map((match) => {
164
- const nameMatch = match.match(/["'`](.+?)["'`](?:\s*,|\s*\))/);
165
- return nameMatch ? nameMatch[1] : "unnamed";
166
- });
167
- // Extract tags
184
+ const specNames = extractSpecNames(content);
185
+ // Fallback: file matched as a spec file but we couldn't parse names
186
+ if (specNames.length === 0) {
187
+ specNames.push(path.basename(filePath, path.extname(filePath)));
188
+ }
189
+ // Shared analysis for the file
168
190
  const tags = extractTags(content);
169
- // Analyze complexity
170
191
  const complexity = analyzeComplexity(content);
171
- // Check for models and tools
172
192
  const usesModels = content.includes("model:") ||
173
193
  content.includes("model=") ||
174
194
  content.includes("openai") ||
@@ -176,22 +196,20 @@ function analyzeSpecFile(filePath, content) {
176
196
  const usesTools = content.includes("tool:") ||
177
197
  content.includes("function.") ||
178
198
  content.includes("call(");
179
- // Check for assertions
180
199
  const hasAssertions = content.includes("assert") ||
181
200
  content.includes("expect") ||
182
201
  content.includes("should");
183
- // Generate ID from file path
184
- const id = generateSpecId(filePath);
185
- return {
186
- id,
187
- name: specNames[0] || path.basename(filePath, ".ts"),
188
- file: path.relative(process.cwd(), filePath),
202
+ const relFile = path.relative(process.cwd(), filePath);
203
+ return specNames.map((name, idx) => ({
204
+ id: generateSpecId(filePath, name, idx),
205
+ name,
206
+ file: relFile,
189
207
  tags,
190
208
  hasAssertions,
191
209
  usesModels,
192
210
  usesTools,
193
211
  complexity,
194
- };
212
+ }));
195
213
  }
196
214
  /**
197
215
  * Extract tags from specification content
@@ -263,11 +281,12 @@ function analyzeComplexity(content) {
263
281
  return "complex";
264
282
  }
265
283
  /**
266
- * Generate specification ID from file path
284
+ * Generate specification ID from file path + name + index (unique per defineEval call)
267
285
  */
268
- function generateSpecId(filePath) {
286
+ function generateSpecId(filePath, name, index) {
269
287
  const relativePath = path.relative(process.cwd(), filePath);
270
- const hash = Buffer.from(relativePath)
288
+ const key = `${relativePath}:${name}:${index}`;
289
+ const hash = Buffer.from(key)
271
290
  .toString("base64")
272
291
  .replace(/[+/=]/g, "")
273
292
  .slice(0, 8);
@@ -96,7 +96,7 @@ function parseFlags(argv) {
96
96
  const baseUrl = raw.baseUrl ||
97
97
  process.env.EVALGATE_BASE_URL ||
98
98
  process.env.EVALAI_BASE_URL ||
99
- "http://localhost:3000";
99
+ "https://api.evalgate.com";
100
100
  const apiKey = raw.apiKey ||
101
101
  process.env.EVALGATE_API_KEY ||
102
102
  process.env.EVALAI_API_KEY ||
@@ -430,6 +430,7 @@ function buildFromBuiltinReport(report, reportPath) {
430
430
  }));
431
431
  const topFailures = failures.slice(0, 3).map((f, i) => ({
432
432
  rank: i + 1,
433
+ name: f.length > 60 ? `${f.slice(0, 57)}...` : f,
433
434
  reason: f,
434
435
  }));
435
436
  // Simple root cause for builtin reports
@@ -94,6 +94,16 @@ function detectRunner(cwd) {
94
94
  }
95
95
  return "unknown";
96
96
  }
97
+ function hasTestScript(cwd) {
98
+ try {
99
+ const pkg = JSON.parse(fs.readFileSync(path.join(cwd, "package.json"), "utf-8"));
100
+ const script = pkg.scripts?.test ?? "";
101
+ return !!script && script !== 'echo "Error: no test specified" && exit 1';
102
+ }
103
+ catch {
104
+ return false;
105
+ }
106
+ }
97
107
  function runBuiltinGate(cwd) {
98
108
  const t0 = Date.now();
99
109
  const baselinePath = path.join(cwd, BASELINE_REL);
@@ -101,6 +111,7 @@ function runBuiltinGate(cwd) {
101
111
  const pm = detectPackageManager(cwd);
102
112
  const command = `${pm} test`;
103
113
  const runner = detectRunner(cwd);
114
+ const projectHasTestScript = hasTestScript(cwd);
104
115
  // Load baseline
105
116
  if (!fs.existsSync(baselinePath)) {
106
117
  return {
@@ -165,16 +176,18 @@ function runBuiltinGate(cwd) {
165
176
  const baselineTotal = baselineData.confidenceTests?.total ?? 0;
166
177
  const failures = [];
167
178
  const deltas = [];
168
- // Delta: tests passing
169
- deltas.push({
170
- metric: "tests_passing",
171
- baseline: baselinePassed,
172
- current: testsPassed,
173
- delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
174
- status: testsPassed ? "pass" : "fail",
175
- });
176
- if (!testsPassed && baselinePassed) {
177
- failures.push("Tests were passing in baseline but are now failing");
179
+ // Delta: tests passing — only meaningful when a test script exists
180
+ if (projectHasTestScript) {
181
+ deltas.push({
182
+ metric: "tests_passing",
183
+ baseline: baselinePassed,
184
+ current: testsPassed,
185
+ delta: testsPassed === baselinePassed ? "0" : testsPassed ? "+1" : "-1",
186
+ status: testsPassed ? "pass" : "fail",
187
+ });
188
+ if (!testsPassed && baselinePassed) {
189
+ failures.push("Tests were passing in baseline but are now failing");
190
+ }
178
191
  }
179
192
  // Delta: test count (only if we captured counts)
180
193
  if (testCount > 0 || baselineTotal > 0) {
package/dist/cli/run.js CHANGED
@@ -52,6 +52,7 @@ exports.runEvaluationsCLI = runEvaluationsCLI;
52
52
  const node_child_process_1 = require("node:child_process");
53
53
  const fs = __importStar(require("node:fs/promises"));
54
54
  const path = __importStar(require("node:path"));
55
+ const registry_1 = require("../runtime/registry");
55
56
  const impact_analysis_1 = require("./impact-analysis");
56
57
  /**
57
58
  * Generate deterministic run ID
@@ -138,69 +139,97 @@ async function loadManifest(projectRoot = process.cwd()) {
138
139
  }
139
140
  }
140
141
  /**
141
- * Execute specifications
142
+ * Execute specifications — grouped by file to avoid redundant loads
142
143
  */
143
144
  async function executeSpecs(specs) {
144
- const results = [];
145
+ // Group specs by their absolute file path
146
+ const specsByFile = new Map();
145
147
  for (const spec of specs) {
146
- const result = await executeSpec(spec);
147
- results.push(result);
148
+ const abs = path.isAbsolute(spec.filePath)
149
+ ? spec.filePath
150
+ : path.join(process.cwd(), spec.filePath);
151
+ const group = specsByFile.get(abs) ?? [];
152
+ group.push(spec);
153
+ specsByFile.set(abs, group);
148
154
  }
149
- return results;
150
- }
151
- /**
152
- * Execute individual specification
153
- */
154
- async function executeSpec(spec) {
155
- const startTime = Date.now();
156
- try {
157
- // For now, simulate execution
158
- // In a real implementation, this would:
159
- // 1. Load the spec file
160
- // 2. Execute the defineEval function
161
- // 3. Capture the result
162
- // Simulate some work
163
- await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
164
- // Simulate success/failure (90% success rate for demo)
165
- const success = Math.random() > 0.1;
166
- const duration = Date.now() - startTime;
167
- if (success) {
168
- return {
169
- specId: spec.id,
170
- name: spec.name,
171
- filePath: spec.filePath,
172
- result: {
173
- status: "passed",
174
- score: Math.random() * 0.3 + 0.7, // 0.7-1.0
175
- duration,
176
- },
177
- };
155
+ const results = [];
156
+ for (const [absPath, fileSpecs] of specsByFile) {
157
+ // Fresh runtime per file to avoid cross-file contamination
158
+ (0, registry_1.disposeActiveRuntime)();
159
+ try {
160
+ // Bust require cache so the file re-executes its defineEval calls
161
+ delete require.cache[require.resolve(absPath)];
178
162
  }
179
- else {
180
- return {
181
- specId: spec.id,
182
- name: spec.name,
183
- filePath: spec.filePath,
184
- result: {
185
- status: "failed",
186
- error: "Simulated execution failure",
187
- duration,
188
- },
189
- };
163
+ catch {
164
+ // Not in cache yet — fine
165
+ }
166
+ try {
167
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
168
+ require(absPath);
169
+ }
170
+ catch (loadError) {
171
+ const isTs = absPath.endsWith(".ts") || absPath.endsWith(".tsx");
172
+ const msg = isTs &&
173
+ loadError instanceof Error &&
174
+ (loadError.message.includes("Unknown file extension") ||
175
+ loadError.message.includes("SyntaxError"))
176
+ ? `TypeScript spec files require ts-node. Install: npm i -D ts-node, then run: node -r ts-node/register -e "require('@evalgate/sdk/register')" evalgate run`
177
+ : loadError instanceof Error
178
+ ? loadError.message
179
+ : String(loadError);
180
+ for (const spec of fileSpecs) {
181
+ results.push(makeErrorResult(spec, msg, 0));
182
+ }
183
+ continue;
184
+ }
185
+ const runtime = (0, registry_1.getActiveRuntime)();
186
+ const registered = runtime.list();
187
+ for (const spec of fileSpecs) {
188
+ const registeredSpec = registered.find((r) => r.name === spec.name);
189
+ if (!registeredSpec) {
190
+ results.push({
191
+ specId: spec.id,
192
+ name: spec.name,
193
+ filePath: spec.filePath,
194
+ result: {
195
+ status: "skipped",
196
+ error: `defineEval name "${spec.name}" not found in ${spec.filePath}`,
197
+ duration: 0,
198
+ },
199
+ });
200
+ continue;
201
+ }
202
+ const startTime = Date.now();
203
+ try {
204
+ const evalResult = await registeredSpec.executor({ input: "" });
205
+ results.push({
206
+ specId: spec.id,
207
+ name: spec.name,
208
+ filePath: spec.filePath,
209
+ result: {
210
+ status: evalResult.pass ? "passed" : "failed",
211
+ score: typeof evalResult.score === "number"
212
+ ? evalResult.score / 100
213
+ : undefined,
214
+ error: evalResult.error,
215
+ duration: Date.now() - startTime,
216
+ },
217
+ });
218
+ }
219
+ catch (execError) {
220
+ results.push(makeErrorResult(spec, execError instanceof Error ? execError.message : String(execError), Date.now() - startTime));
221
+ }
190
222
  }
191
223
  }
192
- catch (error) {
193
- return {
194
- specId: spec.id,
195
- name: spec.name,
196
- filePath: spec.filePath,
197
- result: {
198
- status: "failed",
199
- error: error instanceof Error ? error.message : String(error),
200
- duration: Date.now() - startTime,
201
- },
202
- };
203
- }
224
+ return results;
225
+ }
226
+ function makeErrorResult(spec, error, duration) {
227
+ return {
228
+ specId: spec.id,
229
+ name: spec.name,
230
+ filePath: spec.filePath,
231
+ result: { status: "failed", error, duration },
232
+ };
204
233
  }
205
234
  /**
206
235
  * Calculate summary statistics
@@ -348,7 +377,8 @@ function printHumanResults(result) {
348
377
  console.log(` ❌ Failed: ${result.summary.failed}`);
349
378
  console.log(` ⏭️ Skipped: ${result.summary.skipped}`);
350
379
  console.log(` 📊 Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
351
- console.log("\n📋 Individual Results:");
380
+ const hasScores = result.results.some((r) => r.result.score !== undefined);
381
+ console.log(`\n📋 Individual Results:${hasScores ? " (score = value returned by spec executor, 0–100)" : ""}`);
352
382
  for (const spec of result.results) {
353
383
  const status = spec.result.status === "passed"
354
384
  ? "✅"
package/dist/version.d.ts CHANGED
@@ -3,5 +3,5 @@
3
3
  * X-EvalGate-SDK-Version: SDK package version
4
4
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
5
5
  */
6
- export declare const SDK_VERSION = "2.1.0";
7
- export declare const SPEC_VERSION = "2.1.0";
6
+ export declare const SDK_VERSION = "2.1.3";
7
+ export declare const SPEC_VERSION = "2.1.3";
package/dist/version.js CHANGED
@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
6
6
  * X-EvalGate-SDK-Version: SDK package version
7
7
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
8
8
  */
9
- exports.SDK_VERSION = "2.1.0";
10
- exports.SPEC_VERSION = "2.1.0";
9
+ exports.SDK_VERSION = "2.1.3";
10
+ exports.SPEC_VERSION = "2.1.3";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@evalgate/sdk",
3
- "version": "2.1.2",
3
+ "version": "2.1.3",
4
4
  "publishConfig": {
5
5
  "access": "public",
6
6
  "registry": "https://registry.npmjs.org/"