npm - @wix/eval-assertions - Versions diffs - 0.16.0 → 0.18.0 - Mend

@wix/eval-assertions 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/README.md +3 -2
package/build/index.js +181 -103
package/build/index.js.map +4 -4
package/build/index.mjs +172 -102
package/build/index.mjs.map +4 -4
package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
package/build/types/evaluators/cost-evaluator.d.ts +10 -0
package/build/types/evaluators/index.d.ts +3 -2
package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
package/build/types/index.d.ts +3 -2
package/build/types/tools/index.d.ts +1 -0
package/build/types/tools/read-file-tool.d.ts +10 -0
package/build/types/types/assertions.d.ts +12 -0
package/build/types/types/index.d.ts +1 -1
package/package.json +4 -3

package/README.md (changed)

@@ -149,11 +149,12 @@ Optional context for assertions:
 ```typescript
 interface AssertionContext {
   workDir?: string;                           // For build_passed
-  llmConfig?: {                               // For llm_judge
+  llmConfig?: {                               // For llm_judge
     baseUrl: string;
     headers: Record<string, string>;
   };
-  generateTextForLlmJudge?: (options) => Promise<{ text: string }>;  // For testing
+  defaultJudgeModel?: string;                 // Default model for llm_judge
+  model?: LanguageModel;                      // Override model
 }
 ```

package/build/index.js (changed)

@@ -1,7 +1,9 @@
 "use strict";
+var __create = Object.create;
 var __defProp = Object.defineProperty;
 var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
 var __getOwnPropNames = Object.getOwnPropertyNames;
+var __getProtoOf = Object.getPrototypeOf;
 var __hasOwnProp = Object.prototype.hasOwnProperty;
 var __export = (target, all) => {
   for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
   }
   return to;
 };
+var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
+  // If the importer is in node compatibility mode or this is not an ESM
+  // file that has been converted to a CommonJS file using a Babel-
+  // compatible transform (i.e. "__esModule" has not been set), then set
+  // "default" to the CommonJS "module.exports" for node compatibility.
+  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
+  mod
+));
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 // src/index.ts
@@ -26,6 +36,9 @@ __export(index_exports, {
   AssertionSchema: () => AssertionSchema,
   BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
   BuildPassedEvaluator: () => BuildPassedEvaluator,
+  CostAssertionSchema: () => CostAssertionSchema,
+  CostEvaluator: () => CostEvaluator,
+  JudgeResultSchema: () => JudgeResultSchema,
   LLMBreakdownStatsSchema: () => LLMBreakdownStatsSchema,
   LLMStepType: () => LLMStepType,
   LLMTraceSchema: () => LLMTraceSchema,
@@ -38,6 +51,7 @@ __export(index_exports, {
   TimeAssertionSchema: () => TimeAssertionSchema,
   TimeEvaluator: () => TimeEvaluator,
   TokenUsageSchema: () => TokenUsageSchema,
+  createReadFileTool: () => createReadFileTool,
   evaluateAssertions: () => evaluateAssertions,
   formatTraceForJudge: () => formatTraceForJudge,
   getEvaluator: () => getEvaluator,
@@ -62,6 +76,11 @@ var BuildPassedAssertionSchema = import_zod.z.object({
   /** Expected exit code (default: 0) */
   expectedExitCode: import_zod.z.number().int().optional()
 });
+var CostAssertionSchema = import_zod.z.object({
+  type: import_zod.z.literal("cost"),
+  /** Maximum allowed cost in USD */
+  maxCostUsd: import_zod.z.number().positive()
+});
 var LlmJudgeAssertionSchema = import_zod.z.object({
   type: import_zod.z.literal("llm_judge"),
   /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */
@@ -84,6 +103,7 @@ var AssertionSchema = import_zod.z.union([
   SkillWasCalledAssertionSchema,
   BuildPassedAssertionSchema,
   TimeAssertionSchema,
+  CostAssertionSchema,
   LlmJudgeAssertionSchema
 ]);
@@ -163,7 +183,7 @@ var AssertionResultSchema = import_zod3.z.object({
 });
 // src/evaluators/index.ts
-var import_crypto5 = require("crypto");
+var import_crypto6 = require("crypto");
 // src/evaluators/skill-was-called-evaluator.ts
 var import_crypto = require("crypto");
@@ -355,10 +375,86 @@ var TimeEvaluator = class extends AssertionEvaluator {
   }
 };
-// src/evaluators/llm-judge-evaluator.ts
+// src/evaluators/cost-evaluator.ts
 var import_crypto4 = require("crypto");
-var import_anthropic = require("@ai-sdk/anthropic");
+var CostEvaluator = class extends AssertionEvaluator {
+  type = "cost";
+  evaluate(assertion, input) {
+    const assertionId = (0, import_crypto4.randomUUID)();
+    const id = (0, import_crypto4.randomUUID)();
+    const assertionName = "Cost";
+    const assertionType = "cost";
+    const maxCostUsd = assertion.maxCostUsd;
+    if (!input.llmTrace) {
+      return {
+        id,
+        assertionId,
+        assertionType,
+        assertionName,
+        status: "skipped" /* SKIPPED */,
+        message: "No LLM trace available to check cost"
+      };
+    }
+    const actualCostUsd = input.llmTrace.summary.totalCostUsd;
+    const formattedActual = actualCostUsd.toFixed(6);
+    const formattedMax = maxCostUsd.toFixed(6);
+    const passed = Number(formattedActual) <= Number(formattedMax);
+    return {
+      id,
+      assertionId,
+      assertionType,
+      assertionName,
+      status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
+      message: passed ? `Cost $${formattedActual} is within limit of $${formattedMax}` : `Cost $${formattedActual} exceeds limit of $${formattedMax}`,
+      expected: `<= $${formattedMax}`,
+      actual: `$${formattedActual}`,
+      details: { actualCostUsd, maxCostUsd }
+    };
+  }
+};
+// src/tools/read-file-tool.ts
 var import_ai = require("ai");
+var import_zod4 = require("zod");
+var import_promises = require("fs/promises");
+var import_path = __toESM(require("path"));
+function createReadFileTool(workDir) {
+  const resolvedWorkDir = import_path.default.resolve(workDir);
+  return (0, import_ai.tool)({
+    description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
+    inputSchema: import_zod4.z.object({
+      path: import_zod4.z.string().describe("Relative file path in the workspace")
+    }),
+    execute: async ({
+      path: filePath
+    }) => {
+      const resolved = import_path.default.resolve(resolvedWorkDir, filePath);
+      if (!resolved.startsWith(resolvedWorkDir + import_path.default.sep)) {
+        return { error: `Access denied: path escapes workspace directory` };
+      }
+      try {
+        const content = await (0, import_promises.readFile)(resolved, "utf-8");
+        return { path: filePath, content };
+      } catch {
+        return { error: `File not found: ${filePath}` };
+      }
+    }
+  });
+}
+// src/evaluators/llm-judge-evaluator.ts
+var import_crypto5 = require("crypto");
+var import_anthropic = require("@ai-sdk/anthropic");
+var import_ai2 = require("ai");
+var import_zod5 = require("zod");
+var JudgeResultSchema = import_zod5.z.object({
+  text: import_zod5.z.string().describe("A brief textual verdict of the test result"),
+  score: import_zod5.z.number().min(0).max(100).describe(
+    "A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
+  ),
+  scoreReasoning: import_zod5.z.string().describe("A concise explanation justifying the assigned score")
+});
+var MAX_JUDGE_STEPS = 20;
 function formatTraceForJudge(llmTrace) {
   if (!llmTrace?.steps?.length) {
     return "No trace available.";
@@ -429,40 +525,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
 - {{newFiles}}: list of new files that were created (or "No new files were created")
 - {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
-CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
-var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
+You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
-{
-  "text": string,
-  "score": number (0-100),
-  "scoreReasoning": string
-}
-- text: A brief textual verdict of the test result.
-- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
-- scoreReasoning: A concise explanation justifying the assigned score.
-Your response must:
-- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
-- Be valid and parseable by \`JSON.parse\`.
-- Use only double quotes for all keys and strings, as required by JSON.
-Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
+CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
 var LlmJudgeEvaluator = class extends AssertionEvaluator {
   type = "llm_judge";
   async evaluate(assertion, input, context) {
-    const assertionId = (0, import_crypto4.randomUUID)();
-    const llmConfig = context?.llmConfig;
+    const assertionId = (0, import_crypto5.randomUUID)();
     const workDir = context?.workDir ?? "";
-    const generateTextStub = context?.generateTextForLlmJudge;
     const output = input.outputText ?? "";
     const fileDiffs = input.fileDiffs ?? [];
     const changedPaths = fileDiffs.map((d) => d.path);
     const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
     const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
-    const changedFiles = changedPaths.length > 0 ? changedPaths.map((path) => `- ${path}`).join("\n") : "No files were changed";
-    const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
-    const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
+    const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
+    const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
+    const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
     const trace = formatTraceForJudge(input.llmTrace);
     const ctx = {
       output,
@@ -474,101 +552,77 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
     };
     const replace = (s) => replacePlaceholders(s, ctx);
     const finalPrompt = replace(assertion.prompt);
-    const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
     const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
     const maxOutputTokens = assertion.maxTokens ?? 1024;
     const temperature = assertion.temperature ?? 0;
-    const modelUsed = assertion.model ?? context?.defaultJudgeModel;
-    if (!modelUsed && !generateTextStub) {
+    const modelId = assertion.model ?? context?.defaultJudgeModel;
+    const model = this.resolveModel(context, modelId);
+    if (!model) {
+      const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
       return {
-        id: (0, import_crypto4.randomUUID)(),
+        id: (0, import_crypto5.randomUUID)(),
         assertionId,
         assertionType: "llm_judge",
         assertionName: "LLM judge",
         status: "failed" /* FAILED */,
-        message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
+        message: reason,
         expected: String(minScore)
       };
     }
-    if (!generateTextStub && !llmConfig) {
-      return {
-        id: (0, import_crypto4.randomUUID)(),
-        assertionId,
-        assertionType: "llm_judge",
-        assertionName: "LLM judge",
-        status: "failed" /* FAILED */,
-        message: "No llmConfig for llm_judge assertion (AI gateway required)",
-        expected: String(minScore)
-      };
-    }
-    const maxParseAttempts = 3;
-    let lastParseError;
-    let lastRawText;
+    const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
     try {
-      for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {
-        const result = generateTextStub ? await generateTextStub({
-          prompt: finalPrompt,
-          system: systemPrompt,
-          maxOutputTokens,
-          temperature
-        }) : await this.callGenerateText(
-          llmConfig,
-          modelUsed,
-          finalPrompt,
-          systemPrompt,
-          maxOutputTokens,
-          temperature
-        );
-        lastRawText = result.text;
-        try {
-          const cleaned = stripMarkdownCodeBlock(result.text);
-          const parsed = JSON.parse(cleaned);
-          const judgeResult = validateJudgeResult(parsed);
-          const passed = judgeResult.score >= minScore;
-          return {
-            id: (0, import_crypto4.randomUUID)(),
-            assertionId,
-            assertionType: "llm_judge",
-            assertionName: "LLM judge",
-            status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
-            message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
-            expected: String(minScore),
-            actual: String(judgeResult.score),
-            details: {
-              score: judgeResult.score,
-              scoreReasoning: judgeResult.scoreReasoning,
-              text: judgeResult.text
-            }
-          };
-        } catch (parseErr) {
-          lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
-        }
-      }
+      const judgeResult = await this.callGenerateText(
+        model,
+        finalPrompt,
+        systemPrompt,
+        maxOutputTokens,
+        temperature,
+        workDir || void 0
+      );
+      const passed = judgeResult.score >= minScore;
       return {
-        id: (0, import_crypto4.randomUUID)(),
+        id: (0, import_crypto5.randomUUID)(),
         assertionId,
         assertionType: "llm_judge",
         assertionName: "LLM judge",
-        status: "failed" /* FAILED */,
-        message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? "unknown"}`,
+        status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
+        message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
         expected: String(minScore),
-        actual: void 0,
-        details: { rawText: lastRawText?.slice(0, 500) }
+        actual: String(judgeResult.score),
+        details: {
+          score: judgeResult.score,
+          scoreReasoning: judgeResult.scoreReasoning,
+          text: judgeResult.text
+        }
       };
     } catch (err) {
+      if (import_ai2.NoObjectGeneratedError.isInstance(err)) {
+        return {
+          id: (0, import_crypto5.randomUUID)(),
+          assertionId,
+          assertionType: "llm_judge",
+          assertionName: "LLM judge",
+          status: "failed" /* FAILED */,
+          message: "LLM judge failed to produce valid structured output",
+          expected: String(minScore),
+          details: {
+            rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
+          }
+        };
+      }
       const message = err instanceof Error ? err.message : String(err);
       const details = {
         error: message,
-        model: modelUsed
+        model: modelId
       };
-      if (import_ai.APICallError.isInstance(err)) {
+      if (import_ai2.APICallError.isInstance(err)) {
         details.statusCode = err.statusCode;
         details.url = err.url;
         details.isRetryable = err.isRetryable;
         details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
       }
       return {
-        id: (0, import_crypto4.randomUUID)(),
+        id: (0, import_crypto5.randomUUID)(),
         assertionId,
         assertionType: "llm_judge",
         assertionName: "LLM judge",
@@ -579,20 +633,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
       };
     }
   }
-  async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature) {
+  /**
+   * Resolve the LanguageModel to use: context.model (injected mock/override)
+   * takes precedence, otherwise create from llmConfig + modelId.
+   */
+  resolveModel(context, modelId) {
+    if (context?.model) {
+      return context.model;
+    }
+    if (!modelId || !context?.llmConfig) {
+      return null;
+    }
     const anthropic = (0, import_anthropic.createAnthropic)({
-      baseURL: llmConfig.baseUrl,
+      baseURL: context.llmConfig.baseUrl,
       apiKey: "dummy",
-      headers: llmConfig.headers
+      headers: context.llmConfig.headers
     });
-    const result = await (0, import_ai.generateText)({
-      model: anthropic(modelId),
+    return anthropic(modelId);
+  }
+  async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
+    const baseOptions = {
+      model,
       prompt,
       system,
       maxOutputTokens,
-      temperature
-    });
-    return { text: result.text };
+      temperature,
+      output: import_ai2.Output.object({ schema: JudgeResultSchema }),
+      stopWhen: (0, import_ai2.stepCountIs)(MAX_JUDGE_STEPS)
+    };
+    const { output } = workDir ? await (0, import_ai2.generateText)({
+      ...baseOptions,
+      tools: { read_file: createReadFileTool(workDir) }
+    }) : await (0, import_ai2.generateText)(baseOptions);
+    return output;
   }
 };
@@ -602,6 +675,7 @@ var evaluators = {
   skill_was_called: new SkillWasCalledEvaluator(),
   build_passed: new BuildPassedEvaluator(),
   time_limit: new TimeEvaluator(),
+  cost: new CostEvaluator(),
   llm_judge: llmJudgeEvaluator,
   // Custom assertions use the same LLM-based evaluation as llm_judge
   custom: llmJudgeEvaluator
@@ -621,8 +695,8 @@ async function evaluateAssertions(input, assertions, context) {
       const evaluator = evaluators[assertion.type];
       if (!evaluator) {
         return {
-          id: (0, import_crypto5.randomUUID)(),
-          assertionId: (0, import_crypto5.randomUUID)(),
+          id: (0, import_crypto6.randomUUID)(),
+          assertionId: (0, import_crypto6.randomUUID)(),
           assertionType: assertion.type,
           assertionName: "Unknown assertion",
           status: "error" /* ERROR */,
@@ -645,6 +719,9 @@ async function evaluateAssertions(input, assertions, context) {
   AssertionSchema,
   BuildPassedAssertionSchema,
   BuildPassedEvaluator,
+  CostAssertionSchema,
+  CostEvaluator,
+  JudgeResultSchema,
   LLMBreakdownStatsSchema,
   LLMStepType,
   LLMTraceSchema,
@@ -657,6 +734,7 @@ async function evaluateAssertions(input, assertions, context) {
   TimeAssertionSchema,
   TimeEvaluator,
   TokenUsageSchema,
+  createReadFileTool,
   evaluateAssertions,
   formatTraceForJudge,
   getEvaluator,