npm - @pauly4010/evalai-sdk - Versions diffs - 1.8.0 → 1.9.0 - Mend

@pauly4010/evalai-sdk 1.8.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/CHANGELOG.md +54 -0
package/dist/cli/ci.d.ts +45 -0
package/dist/cli/ci.js +192 -0
package/dist/cli/diff.d.ts +173 -0
package/dist/cli/diff.js +680 -0
package/dist/cli/discover.d.ts +84 -0
package/dist/cli/discover.js +408 -0
package/dist/cli/doctor.js +19 -10
package/dist/cli/env.d.ts +21 -0
package/dist/cli/env.js +42 -0
package/dist/cli/explain.js +143 -37
package/dist/cli/impact-analysis.d.ts +63 -0
package/dist/cli/impact-analysis.js +251 -0
package/dist/cli/index.js +173 -0
package/dist/cli/manifest.d.ts +105 -0
package/dist/cli/manifest.js +275 -0
package/dist/cli/migrate.d.ts +41 -0
package/dist/cli/migrate.js +349 -0
package/dist/cli/print-config.js +18 -14
package/dist/cli/run.d.ts +101 -0
package/dist/cli/run.js +389 -0
package/dist/cli/workspace.d.ts +28 -0
package/dist/cli/workspace.js +58 -0
package/dist/index.d.ts +6 -0
package/dist/index.js +30 -5
package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
package/dist/runtime/adapters/config-to-dsl.js +391 -0
package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
package/dist/runtime/adapters/testsuite-to-dsl.js +271 -0
package/dist/runtime/context.d.ts +26 -0
package/dist/runtime/context.js +74 -0
package/dist/runtime/eval.d.ts +46 -0
package/dist/runtime/eval.js +237 -0
package/dist/runtime/execution-mode.d.ts +80 -0
package/dist/runtime/execution-mode.js +353 -0
package/dist/runtime/executor.d.ts +16 -0
package/dist/runtime/executor.js +152 -0
package/dist/runtime/registry.d.ts +78 -0
package/dist/runtime/registry.js +416 -0
package/dist/runtime/run-report.d.ts +202 -0
package/dist/runtime/run-report.js +220 -0
package/dist/runtime/types.d.ts +356 -0
package/dist/runtime/types.js +76 -0
package/dist/testing.d.ts +65 -0
package/dist/testing.js +42 -0
package/dist/version.d.ts +1 -1
package/dist/version.js +1 -1
package/package.json +4 -3

package/dist/cli/explain.js CHANGED Viewed

@@ -124,9 +124,7 @@ function classifyRootCauses(report) {
     }
     // Analyze failed cases for drift patterns
     if (failedCases.length > 0) {
-        const outputs = failedCases
-            .map((fc) => (fc.output ?? "").toLowerCase())
-            .filter(Boolean);
+        const outputs = failedCases.map((fc) => (fc.output ?? "").toLowerCase()).filter(Boolean);
         const expectedOutputs = failedCases
             .map((fc) => (fc.expectedOutput ?? "").toLowerCase())
             .filter(Boolean);
@@ -138,9 +136,7 @@ function classifyRootCauses(report) {
             causes.push("formatting_drift");
         }
         // Tool use drift: output mentions tool calls or function calls
-        const hasToolIssue = outputs.some((o) => o.includes("tool_call") ||
-            o.includes("function_call") ||
-            o.includes("tool_use"));
+        const hasToolIssue = outputs.some((o) => o.includes("tool_call") || o.includes("function_call") || o.includes("tool_use"));
         if (hasToolIssue) {
             causes.push("tool_use_drift");
         }
@@ -173,52 +169,164 @@ function classifyRootCauses(report) {
 // ── Suggested fixes ──
 const ROOT_CAUSE_FIXES = {
     prompt_drift: [
-        { action: "Review prompt changes", detail: "Compare current prompt with the version used in baseline run. Diff system/user messages.", priority: "high" },
-        { action: "Pin model version", detail: "Use a specific model snapshot (e.g. gpt-4-0613) instead of a rolling alias.", priority: "medium" },
-        { action: "Update baseline", detail: "If changes are intentional, run: npx evalai baseline update", priority: "low" },
+        {
+            action: "Review prompt changes",
+            detail: "Compare current prompt with the version used in baseline run. Diff system/user messages.",
+            priority: "high",
+        },
+        {
+            action: "Pin model version",
+            detail: "Use a specific model snapshot (e.g. gpt-4-0613) instead of a rolling alias.",
+            priority: "medium",
+        },
+        {
+            action: "Update baseline",
+            detail: "If changes are intentional, run: npx evalai baseline update",
+            priority: "low",
+        },
     ],
     retrieval_drift: [
-        { action: "Check retrieval pipeline", detail: "Verify embeddings, index, and chunk strategy haven't changed.", priority: "high" },
-        { action: "Update test case context", detail: "If knowledge base changed, update expected outputs in test cases.", priority: "medium" },
-        { action: "Add retrieval-specific tests", detail: "Add test cases that verify document retrieval before generation.", priority: "low" },
+        {
+            action: "Check retrieval pipeline",
+            detail: "Verify embeddings, index, and chunk strategy haven't changed.",
+            priority: "high",
+        },
+        {
+            action: "Update test case context",
+            detail: "If knowledge base changed, update expected outputs in test cases.",
+            priority: "medium",
+        },
+        {
+            action: "Add retrieval-specific tests",
+            detail: "Add test cases that verify document retrieval before generation.",
+            priority: "low",
+        },
     ],
     formatting_drift: [
-        { action: "Update output format instructions", detail: "Check if system prompt format instructions match expected output structure.", priority: "high" },
-        { action: "Add format validators", detail: "Use schema assertions to validate output structure (JSON schema, regex).", priority: "medium" },
-        { action: "Refresh baseline", detail: "If new format is intentional, run: npx evalai baseline update", priority: "low" },
+        {
+            action: "Update output format instructions",
+            detail: "Check if system prompt format instructions match expected output structure.",
+            priority: "high",
+        },
+        {
+            action: "Add format validators",
+            detail: "Use schema assertions to validate output structure (JSON schema, regex).",
+            priority: "medium",
+        },
+        {
+            action: "Refresh baseline",
+            detail: "If new format is intentional, run: npx evalai baseline update",
+            priority: "low",
+        },
     ],
     tool_use_drift: [
-        { action: "Verify tool definitions", detail: "Check that tool/function schemas match what the model expects.", priority: "high" },
-        { action: "Review tool call patterns", detail: "Compare tool call sequences in failing vs passing cases.", priority: "medium" },
-        { action: "Add tool-use assertions", detail: "Assert specific tool calls are made (or not made) per test case.", priority: "low" },
+        {
+            action: "Verify tool definitions",
+            detail: "Check that tool/function schemas match what the model expects.",
+            priority: "high",
+        },
+        {
+            action: "Review tool call patterns",
+            detail: "Compare tool call sequences in failing vs passing cases.",
+            priority: "medium",
+        },
+        {
+            action: "Add tool-use assertions",
+            detail: "Assert specific tool calls are made (or not made) per test case.",
+            priority: "low",
+        },
     ],
     safety_regression: [
-        { action: "Review safety assertions", detail: "Check which safety test cases are failing and why.", priority: "high" },
-        { action: "Strengthen guardrails", detail: "Add or update content filters, system prompt safety instructions.", priority: "high" },
-        { action: "Update rubric", detail: "If safety criteria changed, update the LLM judge rubric.", priority: "medium" },
+        {
+            action: "Review safety assertions",
+            detail: "Check which safety test cases are failing and why.",
+            priority: "high",
+        },
+        {
+            action: "Strengthen guardrails",
+            detail: "Add or update content filters, system prompt safety instructions.",
+            priority: "high",
+        },
+        {
+            action: "Update rubric",
+            detail: "If safety criteria changed, update the LLM judge rubric.",
+            priority: "medium",
+        },
     ],
     cost_regression: [
-        { action: "Check token usage", detail: "Compare input/output token counts between baseline and current run.", priority: "high" },
-        { action: "Optimize prompts", detail: "Reduce prompt length or use a smaller model for non-critical paths.", priority: "medium" },
-        { action: "Update cost budget", detail: "If higher cost is expected, adjust --max-cost-usd threshold.", priority: "low" },
+        {
+            action: "Check token usage",
+            detail: "Compare input/output token counts between baseline and current run.",
+            priority: "high",
+        },
+        {
+            action: "Optimize prompts",
+            detail: "Reduce prompt length or use a smaller model for non-critical paths.",
+            priority: "medium",
+        },
+        {
+            action: "Update cost budget",
+            detail: "If higher cost is expected, adjust --max-cost-usd threshold.",
+            priority: "low",
+        },
     ],
     latency_regression: [
-        { action: "Check response times", detail: "Compare per-test-case latency between baseline and current run.", priority: "high" },
-        { action: "Reduce prompt complexity", detail: "Simplify prompts or use streaming to reduce perceived latency.", priority: "medium" },
-        { action: "Update latency budget", detail: "If higher latency is expected, adjust --max-latency-ms threshold.", priority: "low" },
+        {
+            action: "Check response times",
+            detail: "Compare per-test-case latency between baseline and current run.",
+            priority: "high",
+        },
+        {
+            action: "Reduce prompt complexity",
+            detail: "Simplify prompts or use streaming to reduce perceived latency.",
+            priority: "medium",
+        },
+        {
+            action: "Update latency budget",
+            detail: "If higher latency is expected, adjust --max-latency-ms threshold.",
+            priority: "low",
+        },
     ],
     coverage_drop: [
-        { action: "Add test cases", detail: "Current test count is below minimum. Add more test cases to the evaluation.", priority: "high" },
-        { action: "Check test case filtering", detail: "Verify no test cases were accidentally deleted or filtered out.", priority: "medium" },
+        {
+            action: "Add test cases",
+            detail: "Current test count is below minimum. Add more test cases to the evaluation.",
+            priority: "high",
+        },
+        {
+            action: "Check test case filtering",
+            detail: "Verify no test cases were accidentally deleted or filtered out.",
+            priority: "medium",
+        },
     ],
     baseline_stale: [
-        { action: "Create baseline", detail: "Run: npx evalai baseline init  (or publish a run from the dashboard)", priority: "high" },
-        { action: "Use --baseline previous", detail: "Compare against the previous run instead of a published baseline.", priority: "medium" },
+        {
+            action: "Create baseline",
+            detail: "Run: npx evalai baseline init  (or publish a run from the dashboard)",
+            priority: "high",
+        },
+        {
+            action: "Use --baseline previous",
+            detail: "Compare against the previous run instead of a published baseline.",
+            priority: "medium",
+        },
     ],
     unknown: [
-        { action: "Run evalai doctor", detail: "Run: npx evalai doctor  to check your full CI/CD setup.", priority: "high" },
-        { action: "Check logs", detail: "Review CI logs for errors or unexpected behavior.", priority: "medium" },
-        { action: "Update baseline", detail: "If changes are intentional, run: npx evalai baseline update", priority: "low" },
+        {
+            action: "Run evalai doctor",
+            detail: "Run: npx evalai doctor  to check your full CI/CD setup.",
+            priority: "high",
+        },
+        {
+            action: "Check logs",
+            detail: "Review CI logs for errors or unexpected behavior.",
+            priority: "medium",
+        },
+        {
+            action: "Update baseline",
+            detail: "If changes are intentional, run: npx evalai baseline update",
+            priority: "low",
+        },
     ],
 };
 function suggestFixes(causes) {
@@ -395,9 +503,7 @@ async function runExplain(argv) {
     const cwd = process.cwd();
     const reportPath = findReport(cwd, flags.reportPath);
     if (!reportPath) {
-        const searched = flags.reportPath
-            ? flags.reportPath
-            : REPORT_SEARCH_PATHS.join(", ");
+        const searched = flags.reportPath ? flags.reportPath : REPORT_SEARCH_PATHS.join(", ");
         console.error(`\n  \u274C No report found. Searched: ${searched}`);
         console.error("  Run a gate first:");
         console.error("    npx evalai gate --format json");

package/dist/cli/impact-analysis.d.ts ADDED Viewed

@@ -0,0 +1,63 @@
+/**
+ * TICKET 3 — Impact Analysis CLI Command (v0)
+ *
+ * Goal: Modal-like perceived speed via incremental intelligence
+ *
+ * Algorithm v0 (practical, shippable):
+ * - Inputs: manifest.json + git diff --name-only base...HEAD
+ * - Rules: Direct file mapping, dependency tracking, safe fallback
+ * - Output: Human-readable counts + JSON for automation
+ */
+import type { EvaluationManifest } from "./manifest";
+/**
+ * Impact analysis result
+ *
+ * Shape of the object resolved by `runImpactAnalysis`.
+ */
+export interface ImpactAnalysisResult {
+    /**
+     * Impacted specification IDs.
+     * As produced by `analyzeImpact`: de-duplicated and sorted with the default string sort.
+     */
+    impactedSpecIds: string[];
+    /**
+     * Reason for each impacted spec, keyed by spec ID.
+     * Only one reason is kept per spec; a later matching rule or changed file overwrites an earlier one.
+     */
+    reasonBySpecId: Record<string, string>;
+    /** Changed files that triggered the analysis (the explicit `changedFiles` option when given, otherwise the `git diff --name-only` output) */
+    changedFiles: string[];
+    /**
+     * Analysis metadata:
+     * - `baseBranch`: copied from the options passed to `runImpactAnalysis`
+     * - `totalSpecs`: number of specs in the manifest
+     * - `impactedCount`: length of `impactedSpecIds`
+     * - `analysisTime`: elapsed time in milliseconds, measured with `Date.now()`
+     */
+    metadata: {
+        baseBranch: string;
+        totalSpecs: number;
+        impactedCount: number;
+        analysisTime: number;
+    };
+}
+/**
+ * Impact analysis options
+ */
+export interface ImpactAnalysisOptions {
+    /** Base branch to compare against; the git comparison range is `<baseBranch>...HEAD` */
+    baseBranch: string;
+    /** Optional explicit list of changed files (for CI); when provided, git is not invoked */
+    changedFiles?: string[];
+    /** Output format; only consulted by `runImpactAnalysisCLI`, where any value other than "json" prints the human-readable report */
+    format?: "human" | "json";
+}
+/**
+ * Run impact analysis
+ *
+ * Reads `<projectRoot>/.evalai/manifest.json`, determines the changed files
+ * (from `options.changedFiles`, otherwise from git) and maps them to impacted specs.
+ *
+ * @param options - Base branch and optional explicit changed-file list.
+ * @param projectRoot - Directory containing `.evalai/`; defaults to `process.cwd()`.
+ * @returns The impacted spec IDs, their reasons, the changed files and timing metadata.
+ * @throws Error when no usable manifest can be loaded, or when `git diff` exits with a non-zero code.
+ */
+export declare function runImpactAnalysis(options: ImpactAnalysisOptions, projectRoot?: string): Promise<ImpactAnalysisResult>;
+/**
+ * Analyze impact of changed files
+ *
+ * A changed file impacts a spec when it is the spec's own file or one of the spec's declared
+ * dependencies (prompts, datasets, tools, code). The first changed file that matches neither
+ * causes every spec in the manifest to be marked as impacted (safe fallback) and stops the scan.
+ *
+ * @param changedFiles - Changed file paths; backslashes are replaced with forward slashes before matching.
+ * @param manifest - Manifest whose specs are matched against the changed files.
+ * @returns Sorted, de-duplicated impacted spec IDs and one reason string per impacted spec.
+ */
+export declare function analyzeImpact(changedFiles: string[], manifest: EvaluationManifest): {
+    impactedSpecIds: string[];
+    reasonBySpecId: Record<string, string>;
+};
+/**
+ * Print human-readable results
+ *
+ * Writes a summary, the changed files and the impacted specs with their reasons via `console.log`.
+ * When at least one spec is impacted, a suggested `evalai run --spec-ids ...` command is printed as well.
+ *
+ * @param result - Result object as returned by `runImpactAnalysis`.
+ */
+export declare function printHumanResults(result: ImpactAnalysisResult): void;
+/**
+ * Print JSON results
+ *
+ * Writes the whole result object via `console.log` as JSON indented with two spaces.
+ *
+ * @param result - Result object as returned by `runImpactAnalysis`.
+ */
+export declare function printJsonResults(result: ImpactAnalysisResult): void;
+/**
+ * CLI entry point
+ *
+ * Runs the analysis, prints it in the requested format and then terminates the process
+ * with `process.exit`:
+ * - exit code 0: no specs impacted
+ * - exit code 1: at least one spec impacted (signals that tests should run)
+ * - exit code 2: the analysis failed; the error message is written via `console.error`
+ *
+ * @param options - Analysis options; `format` selects JSON or human-readable output.
+ */
+export declare function runImpactAnalysisCLI(options: ImpactAnalysisOptions): Promise<void>;

package/dist/cli/impact-analysis.js ADDED Viewed

@@ -0,0 +1,251 @@
+"use strict";
+/**
+ * TICKET 3 — Impact Analysis CLI Command (v0)
+ *
+ * Goal: Modal-like perceived speed via incremental intelligence
+ *
+ * Algorithm v0 (practical, shippable):
+ * - Inputs: manifest.json + git diff --name-only base...HEAD
+ * - Rules: Direct file mapping, dependency tracking, safe fallback
+ * - Output: Human-readable counts + JSON for automation
+ */
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // Helper: expose m[k] on o under the name k2 (k2 defaults to k); an existing this.__createBinding is reused
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } }; // forward reads to m[k] through a getter
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) { // Fallback when Object.create is unavailable: plain value copy
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { // Helper: set o.default to v
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () { // Helper: wrap a required module in a namespace-style object
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) { // replaces itself on first call with the chosen implementation
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod; // modules flagged __esModule are returned unchanged
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); // bind every own key except "default"
+        __setModuleDefault(result, mod); // the original module object becomes result.default
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.runImpactAnalysis = runImpactAnalysis;
+exports.analyzeImpact = analyzeImpact;
+exports.printHumanResults = printHumanResults;
+exports.printJsonResults = printJsonResults;
+exports.runImpactAnalysisCLI = runImpactAnalysisCLI;
+const fs = __importStar(require("node:fs/promises"));
+const path = __importStar(require("node:path"));
+const node_child_process_1 = require("node:child_process");
+/**
+ * Run impact analysis
+ */
+async function runImpactAnalysis(options, projectRoot = process.cwd()) {
+    const startTime = Date.now();
+    // Read manifest
+    const manifest = await readManifest(projectRoot);
+    if (!manifest) {
+        throw new Error("No evaluation manifest found. Run 'evalai discover --manifest' first.");
+    }
+    // Get changed files
+    const changedFiles = options.changedFiles || (await getChangedFiles(options.baseBranch));
+    // Analyze impact
+    const { impactedSpecIds, reasonBySpecId } = analyzeImpact(changedFiles, manifest);
+    const result = {
+        impactedSpecIds,
+        reasonBySpecId,
+        changedFiles,
+        metadata: {
+            baseBranch: options.baseBranch,
+            totalSpecs: manifest.specs.length,
+            impactedCount: impactedSpecIds.length,
+            analysisTime: Date.now() - startTime,
+        },
+    };
+    return result;
+}
+/**
+ * Read evaluation manifest
+ */
+async function readManifest(projectRoot = process.cwd()) {
+    const manifestPath = path.join(projectRoot, ".evalai", "manifest.json");
+    try {
+        const content = await fs.readFile(manifestPath, "utf-8");
+        return JSON.parse(content);
+    }
+    catch (error) {
+        return null;
+    }
+}
+/**
+ * Get changed files from git
+ */
+async function getChangedFiles(baseBranch) {
+    return new Promise((resolve, reject) => {
+        const git = (0, node_child_process_1.spawn)("git", ["diff", "--name-only", `${baseBranch}...HEAD`], {
+            stdio: ["pipe", "pipe", "pipe"],
+        });
+        let output = "";
+        let error = "";
+        git.stdout?.on("data", (data) => {
+            output += data.toString();
+        });
+        git.stderr?.on("data", (data) => {
+            error += data.toString();
+        });
+        git.on("close", (code) => {
+            if (code !== 0) {
+                reject(new Error(`Git diff failed: ${error}`));
+                return;
+            }
+            const files = output
+                .split("\n")
+                .map((f) => f.trim())
+                .filter((f) => f.length > 0)
+                .map((f) => f.replace(/\\/g, "/")); // Normalize to POSIX
+            resolve(files);
+        });
+    });
+}
+/**
+ * Analyze impact of changed files
+ */
+function analyzeImpact(changedFiles, manifest) {
+    const impactedSpecIds = new Set();
+    const reasonBySpecId = {};
+    // Normalize changed files to POSIX format
+    const normalizedChangedFiles = changedFiles.map((f) => f.replace(/\\/g, "/"));
+    // Create lookup maps
+    const specsByFile = new Map();
+    const specsByDependency = new Map();
+    // Index specs by file
+    for (const spec of manifest.specs) {
+        // By file path
+        if (!specsByFile.has(spec.filePath)) {
+            specsByFile.set(spec.filePath, []);
+        }
+        specsByFile.get(spec.filePath).push(spec);
+        // By dependencies
+        const deps = [
+            ...spec.dependsOn.prompts,
+            ...spec.dependsOn.datasets,
+            ...spec.dependsOn.tools,
+            ...spec.dependsOn.code,
+        ];
+        for (const dep of deps) {
+            if (!specsByDependency.has(dep)) {
+                specsByDependency.set(dep, []);
+            }
+            specsByDependency.get(dep).push(spec);
+        }
+    }
+    // Analyze each changed file
+    for (const changedFile of normalizedChangedFiles) {
+        // Rule 1: Direct spec file change
+        const specsInFile = specsByFile.get(changedFile);
+        if (specsInFile) {
+            for (const spec of specsInFile) {
+                impactedSpecIds.add(spec.id);
+                reasonBySpecId[spec.id] = `Spec file changed: ${changedFile}`;
+            }
+        }
+        // Rule 2: Dependency change
+        const specsUsingDep = specsByDependency.get(changedFile);
+        if (specsUsingDep) {
+            for (const spec of specsUsingDep) {
+                impactedSpecIds.add(spec.id);
+                reasonBySpecId[spec.id] = `Dependency changed: ${changedFile}`;
+            }
+        }
+        // Rule 3: Safe fallback for unknown files
+        if (!specsInFile && !specsUsingDep) {
+            // If we can't map the file, be conservative and run everything
+            console.warn(`⚠️  Unknown changed file: ${changedFile}`);
+            console.warn(`🛡️  Running full suite for safety`);
+            // Add all specs
+            for (const spec of manifest.specs) {
+                impactedSpecIds.add(spec.id);
+                reasonBySpecId[spec.id] = `Unknown file changed: ${changedFile} (safe fallback)`;
+            }
+            break; // No need to continue analyzing
+        }
+    }
+    return {
+        impactedSpecIds: Array.from(impactedSpecIds).sort(),
+        reasonBySpecId,
+    };
+}
+/**
+ * Print human-readable results
+ */
+function printHumanResults(result) {
+    console.log("\n🔍 Impact Analysis Results");
+    console.log(`📊 Base branch: ${result.metadata.baseBranch}`);
+    console.log(`📁 Changed files: ${result.changedFiles.length}`);
+    console.log(`🎯 Impacted specs: ${result.metadata.impactedCount}/${result.metadata.totalSpecs}`);
+    console.log(`⏱️  Analysis time: ${result.metadata.analysisTime}ms`);
+    if (result.changedFiles.length > 0) {
+        console.log("\n📝 Changed files:");
+        for (const file of result.changedFiles) {
+            console.log(`   • ${file}`);
+        }
+    }
+    if (result.impactedSpecIds.length > 0) {
+        console.log("\n🎯 Impacted specifications:");
+        for (const specId of result.impactedSpecIds) {
+            const reason = result.reasonBySpecId[specId];
+            console.log(`   • ${specId} (${reason})`);
+        }
+        console.log("\n💡 Suggested command:");
+        console.log(`   evalai run --spec-ids ${result.impactedSpecIds.join(",")}`);
+    }
+    else {
+        console.log("\n✅ No specifications impacted");
+        console.log("💡 No tests needed to run");
+    }
+}
+/**
+ * Print JSON results
+ */
+function printJsonResults(result) {
+    console.log(JSON.stringify(result, null, 2));
+}
+/**
+ * CLI entry point
+ */
+async function runImpactAnalysisCLI(options) {
+    try {
+        const result = await runImpactAnalysis(options);
+        if (options.format === "json") {
+            printJsonResults(result);
+        }
+        else {
+            printHumanResults(result);
+        }
+        // Exit with appropriate code
+        if (result.metadata.impactedCount === 0) {
+            process.exit(0);
+        }
+        else {
+            process.exit(1); // Signal that tests should run
+        }
+    }
+    catch (error) {
+        console.error("❌ Impact analysis failed:", error instanceof Error ? error.message : String(error));
+        process.exit(2);
+    }
+}