npm - @wix/eval-assertions - Versions diffs - 0.17.0 → 0.19.0 - Mend

@wix/eval-assertions 0.17.0 → 0.19.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (15)

package/README.md +3 -2
package/build/index.js +218 -112
package/build/index.js.map +4 -4
package/build/index.mjs +209 -111
package/build/index.mjs.map +4 -4
package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
package/build/types/evaluators/index.d.ts +3 -2
package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
package/build/types/evaluators/tool-called-with-param-evaluator.d.ts +12 -0
package/build/types/index.d.ts +3 -2
package/build/types/tools/index.d.ts +1 -0
package/build/types/tools/read-file-tool.d.ts +10 -0
package/build/types/types/assertions.d.ts +14 -0
package/build/types/types/index.d.ts +1 -1
package/package.json +4 -3

package/build/index.mjs CHANGED

@@ -5,6 +5,13 @@ var SkillWasCalledAssertionSchema = z.object({
   /** Names of the skills that must have been called (matched against trace Skill tool args) */
   skillNames: z.array(z.string()).min(1)
 });
+var ToolCalledWithParamAssertionSchema = z.object({
+  type: z.literal("tool_called_with_param"),
+  /** Name of the tool that must have been called */
+  toolName: z.string().min(1),
+  /** JSON string of key-value pairs for expected parameters (substring match) */
+  expectedParams: z.string().min(1)
+});
 var BuildPassedAssertionSchema = z.object({
   type: z.literal("build_passed"),
   /** Command to run (default: "yarn build") */
@@ -37,6 +44,7 @@ var TimeAssertionSchema = z.object({
 });
 var AssertionSchema = z.union([
   SkillWasCalledAssertionSchema,
+  ToolCalledWithParamAssertionSchema,
   BuildPassedAssertionSchema,
   TimeAssertionSchema,
   CostAssertionSchema,
@@ -119,7 +127,7 @@ var AssertionResultSchema = z3.object({
 });
 // src/evaluators/index.ts
-import { randomUUID as randomUUID6 } from "crypto";
+import { randomUUID as randomUUID7 } from "crypto";
 // src/evaluators/skill-was-called-evaluator.ts
 import { randomUUID } from "crypto";
@@ -198,15 +206,79 @@ var SkillWasCalledEvaluator = class extends AssertionEvaluator {
   }
 };
-// src/evaluators/build-passed-evaluator.ts
+// src/evaluators/tool-called-with-param-evaluator.ts
 import { randomUUID as randomUUID2 } from "crypto";
+var ASSERTION_TYPE = "tool_called_with_param";
+var ASSERTION_NAME = "Tool called with param";
+var containsAll = ({
+  actual,
+  expected
+}) => Object.entries(expected).every(([key, val]) => {
+  const actualVal = actual[key];
+  if (actualVal === null || actualVal === void 0) return false;
+  const actualStr = typeof actualVal === "string" ? actualVal : JSON.stringify(actualVal);
+  return actualStr.includes(String(val));
+});
+var ToolCalledWithParamEvaluator = class extends AssertionEvaluator {
+  type = ASSERTION_TYPE;
+  evaluate(assertion, input, _context) {
+    const assertionId = randomUUID2();
+    const { toolName, expectedParams: expectedParamsStr } = assertion;
+    const buildResult = (status, message, expected2, actual) => ({
+      id: randomUUID2(),
+      assertionId,
+      assertionType: ASSERTION_TYPE,
+      assertionName: ASSERTION_NAME,
+      status,
+      message,
+      expected: expected2,
+      ...actual !== void 0 ? { actual } : {}
+    });
+    let expected;
+    try {
+      expected = JSON.parse(expectedParamsStr);
+    } catch {
+      return buildResult(
+        "failed" /* FAILED */,
+        `Tool "${toolName}" assertion has invalid expected params JSON`,
+        `${toolName}(invalid expected params)`,
+        "Invalid expected params JSON"
+      );
+    }
+    const expectedLabel = `${toolName}(${Object.entries(expected).map(([k, v]) => `${k}="${v}"`).join(", ")})`;
+    const steps = input.llmTrace?.steps ?? [];
+    const toolCalls = steps.filter((s) => s.toolName === toolName && s.toolArguments !== void 0).map((s) => {
+      try {
+        return JSON.parse(s.toolArguments);
+      } catch {
+        return null;
+      }
+    }).filter((call) => call !== null);
+    if (toolCalls.some((actual) => containsAll({ actual, expected }))) {
+      return buildResult(
+        "passed" /* PASSED */,
+        `Tool "${toolName}" was called with params matching ${expectedParamsStr}`,
+        expectedLabel
+      );
+    }
+    return buildResult(
+      "failed" /* FAILED */,
+      `Tool "${toolName}" was never called with params matching ${expectedParamsStr}`,
+      expectedLabel,
+      toolCalls.length > 0 ? `Found ${toolName} calls but params didn't match` : `No matching tool calls found`
+    );
+  }
+};
+// src/evaluators/build-passed-evaluator.ts
+import { randomUUID as randomUUID3 } from "crypto";
 import { execSync } from "child_process";
 var DEFAULT_COMMAND = "yarn build";
 var DEFAULT_EXIT_CODE = 0;
 var BuildPassedEvaluator = class extends AssertionEvaluator {
   type = "build_passed";
   evaluate(assertion, _input, context) {
-    const assertionId = randomUUID2();
+    const assertionId = randomUUID3();
     const workDir = context?.workDir;
     const command = assertion.command ?? DEFAULT_COMMAND;
     const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;
@@ -254,7 +326,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
   }
   createResult(assertionId, fields) {
     return {
-      id: randomUUID2(),
+      id: randomUUID3(),
       assertionId,
       assertionType: "build_passed",
       assertionName: "Build passed",
@@ -279,7 +351,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
 };
 // src/evaluators/time-evaluator.ts
-import { randomUUID as randomUUID3 } from "crypto";
+import { randomUUID as randomUUID4 } from "crypto";
 var TimeEvaluator = class extends AssertionEvaluator {
   type = "time_limit";
   evaluate(assertion, input) {
@@ -301,8 +373,8 @@ var TimeEvaluator = class extends AssertionEvaluator {
   }
   createResult(fields) {
     return {
-      id: randomUUID3(),
-      assertionId: randomUUID3(),
+      id: randomUUID4(),
+      assertionId: randomUUID4(),
       assertionType: "time_limit",
       assertionName: "Time limit",
       status: "failed" /* FAILED */,
@@ -312,12 +384,12 @@ var TimeEvaluator = class extends AssertionEvaluator {
 };
 // src/evaluators/cost-evaluator.ts
-import { randomUUID as randomUUID4 } from "crypto";
+import { randomUUID as randomUUID5 } from "crypto";
 var CostEvaluator = class extends AssertionEvaluator {
   type = "cost";
   evaluate(assertion, input) {
-    const assertionId = randomUUID4();
-    const id = randomUUID4();
+    const assertionId = randomUUID5();
+    const id = randomUUID5();
     const assertionName = "Cost";
     const assertionType = "cost";
     const maxCostUsd = assertion.maxCostUsd;
@@ -349,10 +421,54 @@ var CostEvaluator = class extends AssertionEvaluator {
   }
 };
+// src/tools/read-file-tool.ts
+import { tool } from "ai";
+import { z as z4 } from "zod";
+import { readFile } from "fs/promises";
+import path from "path";
+function createReadFileTool(workDir) {
+  const resolvedWorkDir = path.resolve(workDir);
+  return tool({
+    description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
+    inputSchema: z4.object({
+      path: z4.string().describe("Relative file path in the workspace")
+    }),
+    execute: async ({
+      path: filePath
+    }) => {
+      const resolved = path.resolve(resolvedWorkDir, filePath);
+      if (!resolved.startsWith(resolvedWorkDir + path.sep)) {
+        return { error: `Access denied: path escapes workspace directory` };
+      }
+      try {
+        const content = await readFile(resolved, "utf-8");
+        return { path: filePath, content };
+      } catch {
+        return { error: `File not found: ${filePath}` };
+      }
+    }
+  });
+}
 // src/evaluators/llm-judge-evaluator.ts
-import { randomUUID as randomUUID5 } from "crypto";
+import { randomUUID as randomUUID6 } from "crypto";
 import { createAnthropic } from "@ai-sdk/anthropic";
-import { generateText, APICallError } from "ai";
+import {
+  generateText,
+  Output,
+  APICallError,
+  NoObjectGeneratedError,
+  stepCountIs
+} from "ai";
+import { z as z5 } from "zod";
+var JudgeResultSchema = z5.object({
+  text: z5.string().describe("A brief textual verdict of the test result"),
+  score: z5.number().min(0).max(100).describe(
+    "A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
+  ),
+  scoreReasoning: z5.string().describe("A concise explanation justifying the assigned score")
+});
+var MAX_JUDGE_STEPS = 20;
 function formatTraceForJudge(llmTrace) {
   if (!llmTrace?.steps?.length) {
     return "No trace available.";
@@ -423,40 +539,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
 - {{newFiles}}: list of new files that were created (or "No new files were created")
 - {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
-CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
-var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
-{
-  "text": string,
-  "score": number (0-100),
-  "scoreReasoning": string
-}
-- text: A brief textual verdict of the test result.
-- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
-- scoreReasoning: A concise explanation justifying the assigned score.
+You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
-Your response must:
-- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
-- Be valid and parseable by \`JSON.parse\`.
-- Use only double quotes for all keys and strings, as required by JSON.
-Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
+CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
 var LlmJudgeEvaluator = class extends AssertionEvaluator {
   type = "llm_judge";
   async evaluate(assertion, input, context) {
-    const assertionId = randomUUID5();
-    const llmConfig = context?.llmConfig;
+    const assertionId = randomUUID6();
     const workDir = context?.workDir ?? "";
-    const generateTextStub = context?.generateTextForLlmJudge;
     const output = input.outputText ?? "";
     const fileDiffs = input.fileDiffs ?? [];
     const changedPaths = fileDiffs.map((d) => d.path);
     const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
     const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
-    const changedFiles = changedPaths.length > 0 ? changedPaths.map((path) => `- ${path}`).join("\n") : "No files were changed";
-    const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
-    const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
+    const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
+    const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
+    const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
     const trace = formatTraceForJudge(input.llmTrace);
     const ctx = {
       output,
@@ -468,92 +566,68 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
     };
     const replace = (s) => replacePlaceholders(s, ctx);
     const finalPrompt = replace(assertion.prompt);
-    const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
     const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
     const maxOutputTokens = assertion.maxTokens ?? 1024;
     const temperature = assertion.temperature ?? 0;
-    const modelUsed = assertion.model ?? context?.defaultJudgeModel;
-    if (!modelUsed && !generateTextStub) {
-      return {
-        id: randomUUID5(),
-        assertionId,
-        assertionType: "llm_judge",
-        assertionName: "LLM judge",
-        status: "failed" /* FAILED */,
-        message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
-        expected: String(minScore)
-      };
-    }
-    if (!generateTextStub && !llmConfig) {
+    const modelId = assertion.model ?? context?.defaultJudgeModel;
+    const model = this.resolveModel(context, modelId);
+    if (!model) {
+      const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
       return {
-        id: randomUUID5(),
+        id: randomUUID6(),
         assertionId,
         assertionType: "llm_judge",
         assertionName: "LLM judge",
         status: "failed" /* FAILED */,
-        message: "No llmConfig for llm_judge assertion (AI gateway required)",
+        message: reason,
         expected: String(minScore)
       };
     }
-    const maxParseAttempts = 3;
-    let lastParseError;
-    let lastRawText;
+    const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
     try {
-      for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {
-        const result = generateTextStub ? await generateTextStub({
-          prompt: finalPrompt,
-          system: systemPrompt,
-          maxOutputTokens,
-          temperature
-        }) : await this.callGenerateText(
-          llmConfig,
-          modelUsed,
-          finalPrompt,
-          systemPrompt,
-          maxOutputTokens,
-          temperature
-        );
-        lastRawText = result.text;
-        try {
-          const cleaned = stripMarkdownCodeBlock(result.text);
-          const parsed = JSON.parse(cleaned);
-          const judgeResult = validateJudgeResult(parsed);
-          const passed = judgeResult.score >= minScore;
-          return {
-            id: randomUUID5(),
-            assertionId,
-            assertionType: "llm_judge",
-            assertionName: "LLM judge",
-            status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
-            message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
-            expected: String(minScore),
-            actual: String(judgeResult.score),
-            details: {
-              score: judgeResult.score,
-              scoreReasoning: judgeResult.scoreReasoning,
-              text: judgeResult.text
-            }
-          };
-        } catch (parseErr) {
-          lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
-        }
-      }
+      const judgeResult = await this.callGenerateText(
+        model,
+        finalPrompt,
+        systemPrompt,
+        maxOutputTokens,
+        temperature,
+        workDir || void 0
+      );
+      const passed = judgeResult.score >= minScore;
       return {
-        id: randomUUID5(),
+        id: randomUUID6(),
         assertionId,
         assertionType: "llm_judge",
         assertionName: "LLM judge",
-        status: "failed" /* FAILED */,
-        message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? "unknown"}`,
+        status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
+        message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
         expected: String(minScore),
-        actual: void 0,
-        details: { rawText: lastRawText?.slice(0, 500) }
+        actual: String(judgeResult.score),
+        details: {
+          score: judgeResult.score,
+          scoreReasoning: judgeResult.scoreReasoning,
+          text: judgeResult.text
+        }
       };
     } catch (err) {
+      if (NoObjectGeneratedError.isInstance(err)) {
+        return {
+          id: randomUUID6(),
+          assertionId,
+          assertionType: "llm_judge",
+          assertionName: "LLM judge",
+          status: "failed" /* FAILED */,
+          message: "LLM judge failed to produce valid structured output",
+          expected: String(minScore),
+          details: {
+            rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
+          }
+        };
+      }
       const message = err instanceof Error ? err.message : String(err);
       const details = {
         error: message,
-        model: modelUsed
+        model: modelId
       };
       if (APICallError.isInstance(err)) {
         details.statusCode = err.statusCode;
@@ -562,7 +636,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
         details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
       }
       return {
-        id: randomUUID5(),
+        id: randomUUID6(),
         assertionId,
         assertionType: "llm_judge",
         assertionName: "LLM judge",
@@ -573,20 +647,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
       };
     }
   }
-  async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature) {
+  /**
+   * Resolve the LanguageModel to use: context.model (injected mock/override)
+   * takes precedence, otherwise create from llmConfig + modelId.
+   */
+  resolveModel(context, modelId) {
+    if (context?.model) {
+      return context.model;
+    }
+    if (!modelId || !context?.llmConfig) {
+      return null;
+    }
     const anthropic = createAnthropic({
-      baseURL: llmConfig.baseUrl,
+      baseURL: context.llmConfig.baseUrl,
       apiKey: "dummy",
-      headers: llmConfig.headers
+      headers: context.llmConfig.headers
     });
-    const result = await generateText({
-      model: anthropic(modelId),
+    return anthropic(modelId);
+  }
+  async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
+    const baseOptions = {
+      model,
       prompt,
       system,
       maxOutputTokens,
-      temperature
-    });
-    return { text: result.text };
+      temperature,
+      output: Output.object({ schema: JudgeResultSchema }),
+      stopWhen: stepCountIs(MAX_JUDGE_STEPS)
+    };
+    const { output } = workDir ? await generateText({
+      ...baseOptions,
+      tools: { read_file: createReadFileTool(workDir) }
+    }) : await generateText(baseOptions);
+    return output;
   }
 };
@@ -594,6 +687,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
 var llmJudgeEvaluator = new LlmJudgeEvaluator();
 var evaluators = {
   skill_was_called: new SkillWasCalledEvaluator(),
+  tool_called_with_param: new ToolCalledWithParamEvaluator(),
   build_passed: new BuildPassedEvaluator(),
   time_limit: new TimeEvaluator(),
   cost: new CostEvaluator(),
@@ -616,8 +710,8 @@ async function evaluateAssertions(input, assertions, context) {
       const evaluator = evaluators[assertion.type];
       if (!evaluator) {
         return {
-          id: randomUUID6(),
-          assertionId: randomUUID6(),
+          id: randomUUID7(),
+          assertionId: randomUUID7(),
           assertionType: assertion.type,
           assertionName: "Unknown assertion",
           status: "error" /* ERROR */,
@@ -641,6 +735,7 @@ export {
   BuildPassedEvaluator,
   CostAssertionSchema,
   CostEvaluator,
+  JudgeResultSchema,
   LLMBreakdownStatsSchema,
   LLMStepType,
   LLMTraceSchema,
@@ -653,6 +748,9 @@ export {
   TimeAssertionSchema,
   TimeEvaluator,
   TokenUsageSchema,
+  ToolCalledWithParamAssertionSchema,
+  ToolCalledWithParamEvaluator,
+  createReadFileTool,
   evaluateAssertions,
   formatTraceForJudge,
   getEvaluator,