npm - @evalgate/sdk - Versions diffs - 2.2.2 → 2.2.4 - Mend

@evalgate/sdk 2.2.2 → 2.2.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (61)

package/CHANGELOG.md +32 -0
package/README.md +40 -1
package/dist/assertions.d.ts +194 -10
package/dist/assertions.js +525 -73
package/dist/batch.js +4 -4
package/dist/cache.d.ts +5 -1
package/dist/cache.js +5 -1
package/dist/cli/baseline.d.ts +14 -0
package/dist/cli/baseline.js +43 -3
package/dist/cli/check.d.ts +5 -2
package/dist/cli/check.js +20 -12
package/dist/cli/compare.d.ts +80 -0
package/dist/cli/compare.js +266 -0
package/dist/cli/index.js +244 -101
package/dist/cli/regression-gate.js +23 -0
package/dist/cli/run.js +22 -0
package/dist/cli/start.d.ts +26 -0
package/dist/cli/start.js +130 -0
package/dist/cli/templates.d.ts +24 -0
package/dist/cli/templates.js +314 -0
package/dist/cli/traces.d.ts +109 -0
package/dist/cli/traces.js +152 -0
package/dist/cli/upgrade.js +5 -0
package/dist/cli/validate.d.ts +37 -0
package/dist/cli/validate.js +252 -0
package/dist/cli/watch.d.ts +19 -0
package/dist/cli/watch.js +175 -0
package/dist/client.js +6 -13
package/dist/constants.d.ts +2 -0
package/dist/constants.js +5 -0
package/dist/errors.js +7 -0
package/dist/export.js +2 -2
package/dist/index.d.ts +10 -9
package/dist/index.js +24 -7
package/dist/integrations/anthropic.js +6 -6
package/dist/integrations/openai.js +84 -61
package/dist/logger.d.ts +3 -1
package/dist/logger.js +2 -1
package/dist/otel.d.ts +130 -0
package/dist/otel.js +309 -0
package/dist/pagination.d.ts +13 -2
package/dist/pagination.js +28 -2
package/dist/runtime/adapters/testsuite-to-dsl.js +1 -6
package/dist/runtime/eval.d.ts +14 -4
package/dist/runtime/eval.js +127 -2
package/dist/runtime/executor.d.ts +3 -2
package/dist/runtime/executor.js +3 -2
package/dist/runtime/registry.d.ts +8 -3
package/dist/runtime/registry.js +15 -4
package/dist/runtime/run-report.d.ts +1 -1
package/dist/runtime/run-report.js +7 -4
package/dist/runtime/types.d.ts +38 -0
package/dist/snapshot.d.ts +12 -0
package/dist/snapshot.js +24 -1
package/dist/testing.d.ts +8 -0
package/dist/testing.js +45 -10
package/dist/version.d.ts +2 -2
package/dist/version.js +2 -2
package/dist/workflows.d.ts +2 -0
package/dist/workflows.js +184 -102
package/package.json +8 -1

package/dist/runtime/registry.js CHANGED Viewed

@@ -39,6 +39,7 @@ var __importStar = (this && this.__importStar) || (function () {
     };
 })();
 Object.defineProperty(exports, "__esModule", { value: true });
+exports._registerDefineEval = _registerDefineEval;
 exports.createEvalRuntime = createEvalRuntime;
 exports.withRuntime = withRuntime;
 exports.getActiveRuntime = getActiveRuntime;
@@ -47,6 +48,12 @@ exports.disposeActiveRuntime = disposeActiveRuntime;
 const crypto = __importStar(require("node:crypto"));
 const path = __importStar(require("node:path"));
 const types_1 = require("./types");
+// Registration pattern to break circular dependency (eval.ts imports from registry.ts)
+let _registeredDefineEval = null;
+/** @internal Called by eval.ts to register defineEval without circular import */
+function _registerDefineEval(fn) {
+    _registeredDefineEval = fn;
+}
 /**
  * Runtime registry implementation
  * Scoped lifecycle with proper memory management
@@ -315,7 +322,10 @@ class EvalRuntimeImpl {
  * Create a new scoped runtime with lifecycle management
  * Returns a handle for proper resource management
  */
-function createEvalRuntime(projectRoot = process.cwd()) {
+function createEvalRuntime(projectRootOrConfig = process.cwd()) {
+    const projectRoot = typeof projectRootOrConfig === "string"
+        ? projectRootOrConfig
+        : (projectRootOrConfig.projectRoot ?? process.cwd());
     const runtime = new EvalRuntimeImpl(projectRoot);
     // Create bound defineEval function
     const boundDefineEval = ((nameOrConfig, executor, options) => {
@@ -323,9 +333,10 @@ function createEvalRuntime(projectRoot = process.cwd()) {
         const previousRuntime = activeRuntime;
         activeRuntime = runtime;
         try {
-            // Import and call defineEval
-            const { defineEval } = require("./eval");
-            return defineEval(nameOrConfig, executor, options);
+            if (!_registeredDefineEval) {
+                throw new types_1.RuntimeError("defineEval not registered. Ensure eval.ts is imported before calling createEvalRuntime.");
+            }
+            return _registeredDefineEval(nameOrConfig, executor, options);
         }
         finally {
             // Restore previous runtime

package/dist/runtime/run-report.d.ts CHANGED Viewed

@@ -159,7 +159,7 @@ export declare class RunReportBuilder {
     addResult(testId: string, testName: string, filePath: string, position: {
         line: number;
         column: number;
-    }, input: string, result: EnhancedEvalResult): void;
+    }, input: string, result: EnhancedEvalResult, tags?: string[]): void;
     /**
      * Update summary statistics
      */

package/dist/runtime/run-report.js CHANGED Viewed

@@ -77,7 +77,7 @@ class RunReportBuilder {
     /**
      * Add a test result to the report
      */
-    addResult(testId, testName, filePath, position, input, result) {
+    addResult(testId, testName, filePath, position, input, result, tags) {
         const runResult = {
             testId,
             testName,
@@ -88,7 +88,7 @@ class RunReportBuilder {
             score: result.score,
             durationMs: result.durationMs || 0,
             metadata: result.metadata,
-            tags: [], // TODO: Extract from spec
+            tags: tags ?? [],
             assertions: result.assertions?.map((assertion, index) => ({
                 name: assertion.name || `assertion-${index}`,
                 passed: assertion.passed,
@@ -182,8 +182,11 @@ class RunReportBuilder {
         // Set completion timestamp
         this.report.finishedAt = new Date().toISOString();
         const finalReport = this.report;
-        // Add toJSON method
-        finalReport.toJSON = () => JSON.stringify(finalReport, null, 2);
+        // Add toJSON method (spread to avoid circular reference via toJSON itself)
+        finalReport.toJSON = () => {
+            const { toJSON: _, ...data } = finalReport;
+            return JSON.stringify(data, null, 2);
+        };
         return finalReport;
     }
     /**

package/dist/runtime/types.d.ts CHANGED Viewed

@@ -36,6 +36,8 @@ export interface EvalSpec {
         budget?: string;
         model?: string | "auto";
     };
+    /** Filtering mode: skip = registered but never executed, only = exclusive execution */
+    mode?: "normal" | "skip" | "only";
 }
 /**
  * Specification execution context
@@ -81,6 +83,10 @@ export interface EvalResult {
     durationMs?: number;
     /** Execution error if failed */
     error?: string;
+    /** Generated output text */
+    output?: string;
+    /** Token count consumed */
+    tokens?: number;
 }
 /**
  * Scoped runtime context - prevents cross-run contamination
@@ -183,6 +189,38 @@ export interface DefineEvalFunction {
      * @param config - Complete specification configuration
      */
     (config: SpecConfig): void;
+    /**
+     * Register a specification but skip it during execution.
+     * Follows the vitest/jest `.skip` convention.
+     */
+    skip: DefineEvalFunction;
+    /**
+     * Register a specification for exclusive execution.
+     * If any spec is marked `.only`, only those specs run.
+     * Follows the vitest/jest `.only` convention.
+     */
+    only: DefineEvalFunction;
+    /**
+     * Load a JSONL or CSV dataset and register one spec per row.
+     * Each row is passed as `context.input` (the parsed row object) to the executor.
+     *
+     * @param name - Base name for specs (each gets " [row N]" suffix)
+     * @param datasetPath - Path to a .jsonl or .csv file
+     * @param executor - Receives the parsed row as input
+     * @param options - Optional spec configuration applied to all rows
+     *
+     * @example
+     * ```ts
+     * defineEval.fromDataset("rag-accuracy", "./evals/golden.jsonl", async (ctx) => {
+     *   const row = ctx.input; // { question: string, expected: string }
+     *   const answer = await myRag(row.question);
+     *   return createResult({ pass: answer.includes(row.expected), score: 100 });
+     * });
+     * ```
+     */
+    fromDataset: <TRow extends Record<string, unknown> = Record<string, unknown>>(name: string, datasetPath: string, executor: (context: EvalContext & {
+        input: TRow;
+    }) => Promise<EvalResult>, options?: SpecOptions) => void;
 }
 /**
  * Specification definition options

package/dist/snapshot.d.ts CHANGED Viewed

@@ -166,6 +166,18 @@ export declare function loadSnapshot(name: string, dir?: string): Promise<Snapsh
  * ```
  */
 export declare function compareWithSnapshot(name: string, currentOutput: unknown, dir?: string): Promise<SnapshotComparison>;
+/**
+ * Compare two saved snapshots by name (convenience function)
+ *
+ * @example
+ * ```typescript
+ * const comparison = await compareSnapshots('baseline', 'current');
+ * if (!comparison.matches) {
+ *   console.log('Snapshots differ!', comparison.differences);
+ * }
+ * ```
+ */
+export declare function compareSnapshots(nameA: string, nameB: string, dir?: string): Promise<SnapshotComparison>;
 /**
  * Delete a snapshot (convenience function)
  */

package/dist/snapshot.js CHANGED Viewed

@@ -55,6 +55,7 @@ exports.SnapshotManager = void 0;
 exports.snapshot = snapshot;
 exports.loadSnapshot = loadSnapshot;
 exports.compareWithSnapshot = compareWithSnapshot;
+exports.compareSnapshots = compareSnapshots;
 exports.deleteSnapshot = deleteSnapshot;
 exports.listSnapshots = listSnapshots;
 // Environment check
@@ -130,7 +131,13 @@ class SnapshotManager {
         if (!options?.overwrite && fs.existsSync(filePath)) {
             throw new Error(`Snapshot '${name}' already exists. Use overwrite: true to update.`);
         }
-        const serialized = typeof output === "string" ? output : JSON.stringify(output);
+        const serialized = output === undefined
+            ? "undefined"
+            : output === null
+                ? "null"
+                : typeof output === "string"
+                    ? output
+                    : JSON.stringify(output);
         const snapshotData = {
             output: serialized,
             metadata: {
@@ -310,6 +317,22 @@ async function compareWithSnapshot(name, currentOutput, dir) {
     const manager = getSnapshotManager(dir);
     return manager.compare(name, currentOutput);
 }
+/**
+ * Compare two saved snapshots by name (convenience function)
+ *
+ * @example
+ * ```typescript
+ * const comparison = await compareSnapshots('baseline', 'current');
+ * if (!comparison.matches) {
+ *   console.log('Snapshots differ!', comparison.differences);
+ * }
+ * ```
+ */
+async function compareSnapshots(nameA, nameB, dir) {
+    const manager = getSnapshotManager(dir);
+    const snapshotB = await manager.load(nameB);
+    return manager.compare(nameA, snapshotB.output);
+}
 /**
  * Delete a snapshot (convenience function)
  */

package/dist/testing.d.ts CHANGED Viewed

@@ -51,8 +51,16 @@ export interface TestSuiteConfig {
     stopOnFailure?: boolean;
     /** Timeout per test case in ms (default: 30000) */
     timeout?: number;
+    /** Alias for stopOnFailure — fail the entire suite on the first failing case. Useful in pre-commit hooks. */
+    strict?: boolean;
     /** Retry failing cases N times (default: 0). Only failing cases are retried. */
     retries?: number;
+    /** Base delay between retries in ms (default: 500). Exponential backoff: delay * 2^attempt. */
+    retryDelayMs?: number;
+    /** Add random jitter up to this fraction of the delay (default: 0.5 = ±50%). Set 0 to disable. */
+    retryJitter?: number;
+    /** Seed for deterministic case ordering. When set, cases are shuffled using this seed for reproducible runs. */
+    seed?: number;
 }
 export interface TestSuiteCaseResult {
     /** Test case ID */

package/dist/testing.js CHANGED Viewed

@@ -50,6 +50,26 @@ class TestSuite {
     async run() {
         const startTime = Date.now();
         const results = [];
+        // Deterministic shuffle when seed is provided
+        const orderedCases = this.config.cases.map((c, i) => ({
+            case: c,
+            originalIndex: i,
+        }));
+        if (this.config.seed !== undefined) {
+            // mulberry32 seeded PRNG
+            let s = this.config.seed | 0;
+            const rand = () => {
+                s = (s + 0x6d2b79f5) | 0;
+                let t = Math.imul(s ^ (s >>> 15), 1 | s);
+                t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
+                return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
+            };
+            // Fisher-Yates shuffle
+            for (let i = orderedCases.length - 1; i > 0; i--) {
+                const j = Math.floor(rand() * (i + 1));
+                [orderedCases[i], orderedCases[j]] = [orderedCases[j], orderedCases[i]];
+            }
+        }
         const runTestCase = async (testCase, index) => {
             const caseStartTime = Date.now();
             const id = testCase.id || `case-${index}`;
@@ -114,37 +134,52 @@ class TestSuite {
                 };
             }
         };
-        // Run tests
+        // Run tests (using orderedCases which may be seeded-shuffled)
         if (this.config.parallel) {
-            results.push(...(await Promise.all(this.config.cases.map((tc, i) => runTestCase(tc, i)))));
+            results.push(...(await Promise.all(orderedCases.map((oc) => runTestCase(oc.case, oc.originalIndex)))));
         }
         else {
-            for (let i = 0; i < this.config.cases.length; i++) {
-                const result = await runTestCase(this.config.cases[i], i);
+            for (const oc of orderedCases) {
+                const result = await runTestCase(oc.case, oc.originalIndex);
                 results.push(result);
-                if (this.config.stopOnFailure && !result.passed) {
+                if ((this.config.stopOnFailure || this.config.strict) &&
+                    !result.passed) {
                     break;
                 }
             }
         }
         const retriedCases = [];
         const retries = this.config.retries ?? 0;
+        const baseDelay = this.config.retryDelayMs ?? 500;
+        const jitterFraction = this.config.retryJitter ?? 0.5;
         if (retries > 0 && results.length > 0) {
             const failingIndices = results
                 .map((r, i) => (r.passed ? -1 : i))
                 .filter((i) => i >= 0);
             for (let attempt = 0; attempt < retries && failingIndices.length > 0; attempt++) {
+                // Exponential backoff with jitter before each retry round
+                const delay = baseDelay * 2 ** attempt;
+                const jitter = jitterFraction > 0
+                    ? delay * jitterFraction * (Math.random() * 2 - 1)
+                    : 0;
+                const waitMs = Math.max(0, Math.round(delay + jitter));
+                if (waitMs > 0) {
+                    await new Promise((resolve) => setTimeout(resolve, waitMs));
+                }
                 const toRetry = [...failingIndices];
                 failingIndices.length = 0;
-                for (const i of toRetry) {
-                    const tc = this.config.cases[i];
-                    const retryResult = await runTestCase(tc, i);
+                for (const idx of toRetry) {
+                    const tc = results[idx]; // retry based on result index
+                    const originalCase = orderedCases.find((oc) => (oc.case.id || `case-${oc.originalIndex}`) === tc.id);
+                    if (!originalCase)
+                        continue;
+                    const retryResult = await runTestCase(originalCase.case, originalCase.originalIndex);
                     if (retryResult.passed) {
-                        results[i] = retryResult;
+                        results[idx] = retryResult;
                         retriedCases.push(retryResult.id);
                     }
                     else {
-                        failingIndices.push(i);
+                        failingIndices.push(idx);
                     }
                 }
             }

package/dist/version.d.ts CHANGED Viewed

@@ -3,5 +3,5 @@
  * X-EvalGate-SDK-Version: SDK package version
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
  */
-export declare const SDK_VERSION = "2.2.2";
-export declare const SPEC_VERSION = "2.2.2";
+export declare const SDK_VERSION = "2.2.4";
+export declare const SPEC_VERSION = "2.2.3";

package/dist/version.js CHANGED Viewed

@@ -6,5 +6,5 @@ exports.SPEC_VERSION = exports.SDK_VERSION = void 0;
  * X-EvalGate-SDK-Version: SDK package version
  * X-EvalGate-Spec-Version: OpenAPI spec version (docs/openapi.json info.version)
  */
-exports.SDK_VERSION = "2.2.2";
-exports.SPEC_VERSION = "2.2.2";
+exports.SDK_VERSION = "2.2.4";
+exports.SPEC_VERSION = "2.2.3";

package/dist/workflows.d.ts CHANGED Viewed

@@ -170,6 +170,8 @@ export interface WorkflowTracerOptions {
     captureFullPayloads?: boolean;
     /** Debug mode */
     debug?: boolean;
+    /** Offline mode — skip all API calls, keep in-memory state only */
+    offline?: boolean;
 }
 /**
  * Agent span context