@wix/eval-assertions 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -5,6 +5,13 @@ var SkillWasCalledAssertionSchema = z.object({
5
5
  /** Names of the skills that must have been called (matched against trace Skill tool args) */
6
6
  skillNames: z.array(z.string()).min(1)
7
7
  });
8
+ var ToolCalledWithParamAssertionSchema = z.object({
9
+ type: z.literal("tool_called_with_param"),
10
+ /** Name of the tool that must have been called */
11
+ toolName: z.string().min(1),
12
+ /** JSON string of key-value pairs for expected parameters (substring match) */
13
+ expectedParams: z.string().min(1)
14
+ });
8
15
  var BuildPassedAssertionSchema = z.object({
9
16
  type: z.literal("build_passed"),
10
17
  /** Command to run (default: "yarn build") */
@@ -37,6 +44,7 @@ var TimeAssertionSchema = z.object({
37
44
  });
38
45
  var AssertionSchema = z.union([
39
46
  SkillWasCalledAssertionSchema,
47
+ ToolCalledWithParamAssertionSchema,
40
48
  BuildPassedAssertionSchema,
41
49
  TimeAssertionSchema,
42
50
  CostAssertionSchema,
@@ -119,7 +127,7 @@ var AssertionResultSchema = z3.object({
119
127
  });
120
128
 
121
129
  // src/evaluators/index.ts
122
- import { randomUUID as randomUUID6 } from "crypto";
130
+ import { randomUUID as randomUUID7 } from "crypto";
123
131
 
124
132
  // src/evaluators/skill-was-called-evaluator.ts
125
133
  import { randomUUID } from "crypto";
@@ -198,15 +206,79 @@ var SkillWasCalledEvaluator = class extends AssertionEvaluator {
198
206
  }
199
207
  };
200
208
 
201
- // src/evaluators/build-passed-evaluator.ts
209
+ // src/evaluators/tool-called-with-param-evaluator.ts
202
210
  import { randomUUID as randomUUID2 } from "crypto";
211
+ var ASSERTION_TYPE = "tool_called_with_param";
212
+ var ASSERTION_NAME = "Tool called with param";
213
+ var containsAll = ({
214
+ actual,
215
+ expected
216
+ }) => Object.entries(expected).every(([key, val]) => {
217
+ const actualVal = actual[key];
218
+ if (actualVal === null || actualVal === void 0) return false;
219
+ const actualStr = typeof actualVal === "string" ? actualVal : JSON.stringify(actualVal);
220
+ return actualStr.includes(String(val));
221
+ });
222
+ var ToolCalledWithParamEvaluator = class extends AssertionEvaluator {
223
+ type = ASSERTION_TYPE;
224
+ evaluate(assertion, input, _context) {
225
+ const assertionId = randomUUID2();
226
+ const { toolName, expectedParams: expectedParamsStr } = assertion;
227
+ const buildResult = (status, message, expected2, actual) => ({
228
+ id: randomUUID2(),
229
+ assertionId,
230
+ assertionType: ASSERTION_TYPE,
231
+ assertionName: ASSERTION_NAME,
232
+ status,
233
+ message,
234
+ expected: expected2,
235
+ ...actual !== void 0 ? { actual } : {}
236
+ });
237
+ let expected;
238
+ try {
239
+ expected = JSON.parse(expectedParamsStr);
240
+ } catch {
241
+ return buildResult(
242
+ "failed" /* FAILED */,
243
+ `Tool "${toolName}" assertion has invalid expected params JSON`,
244
+ `${toolName}(invalid expected params)`,
245
+ "Invalid expected params JSON"
246
+ );
247
+ }
248
+ const expectedLabel = `${toolName}(${Object.entries(expected).map(([k, v]) => `${k}="${v}"`).join(", ")})`;
249
+ const steps = input.llmTrace?.steps ?? [];
250
+ const toolCalls = steps.filter((s) => s.toolName === toolName && s.toolArguments !== void 0).map((s) => {
251
+ try {
252
+ return JSON.parse(s.toolArguments);
253
+ } catch {
254
+ return null;
255
+ }
256
+ }).filter((call) => call !== null);
257
+ if (toolCalls.some((actual) => containsAll({ actual, expected }))) {
258
+ return buildResult(
259
+ "passed" /* PASSED */,
260
+ `Tool "${toolName}" was called with params matching ${expectedParamsStr}`,
261
+ expectedLabel
262
+ );
263
+ }
264
+ return buildResult(
265
+ "failed" /* FAILED */,
266
+ `Tool "${toolName}" was never called with params matching ${expectedParamsStr}`,
267
+ expectedLabel,
268
+ toolCalls.length > 0 ? `Found ${toolName} calls but params didn't match` : `No matching tool calls found`
269
+ );
270
+ }
271
+ };
272
+
273
+ // src/evaluators/build-passed-evaluator.ts
274
+ import { randomUUID as randomUUID3 } from "crypto";
203
275
  import { execSync } from "child_process";
204
276
  var DEFAULT_COMMAND = "yarn build";
205
277
  var DEFAULT_EXIT_CODE = 0;
206
278
  var BuildPassedEvaluator = class extends AssertionEvaluator {
207
279
  type = "build_passed";
208
280
  evaluate(assertion, _input, context) {
209
- const assertionId = randomUUID2();
281
+ const assertionId = randomUUID3();
210
282
  const workDir = context?.workDir;
211
283
  const command = assertion.command ?? DEFAULT_COMMAND;
212
284
  const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;
@@ -254,7 +326,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
254
326
  }
255
327
  createResult(assertionId, fields) {
256
328
  return {
257
- id: randomUUID2(),
329
+ id: randomUUID3(),
258
330
  assertionId,
259
331
  assertionType: "build_passed",
260
332
  assertionName: "Build passed",
@@ -279,7 +351,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
279
351
  };
280
352
 
281
353
  // src/evaluators/time-evaluator.ts
282
- import { randomUUID as randomUUID3 } from "crypto";
354
+ import { randomUUID as randomUUID4 } from "crypto";
283
355
  var TimeEvaluator = class extends AssertionEvaluator {
284
356
  type = "time_limit";
285
357
  evaluate(assertion, input) {
@@ -301,8 +373,8 @@ var TimeEvaluator = class extends AssertionEvaluator {
301
373
  }
302
374
  createResult(fields) {
303
375
  return {
304
- id: randomUUID3(),
305
- assertionId: randomUUID3(),
376
+ id: randomUUID4(),
377
+ assertionId: randomUUID4(),
306
378
  assertionType: "time_limit",
307
379
  assertionName: "Time limit",
308
380
  status: "failed" /* FAILED */,
@@ -312,12 +384,12 @@ var TimeEvaluator = class extends AssertionEvaluator {
312
384
  };
313
385
 
314
386
  // src/evaluators/cost-evaluator.ts
315
- import { randomUUID as randomUUID4 } from "crypto";
387
+ import { randomUUID as randomUUID5 } from "crypto";
316
388
  var CostEvaluator = class extends AssertionEvaluator {
317
389
  type = "cost";
318
390
  evaluate(assertion, input) {
319
- const assertionId = randomUUID4();
320
- const id = randomUUID4();
391
+ const assertionId = randomUUID5();
392
+ const id = randomUUID5();
321
393
  const assertionName = "Cost";
322
394
  const assertionType = "cost";
323
395
  const maxCostUsd = assertion.maxCostUsd;
@@ -349,10 +421,54 @@ var CostEvaluator = class extends AssertionEvaluator {
349
421
  }
350
422
  };
351
423
 
424
+ // src/tools/read-file-tool.ts
425
+ import { tool } from "ai";
426
+ import { z as z4 } from "zod";
427
+ import { readFile } from "fs/promises";
428
+ import path from "path";
429
+ function createReadFileTool(workDir) {
430
+ const resolvedWorkDir = path.resolve(workDir);
431
+ return tool({
432
+ description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
433
+ inputSchema: z4.object({
434
+ path: z4.string().describe("Relative file path in the workspace")
435
+ }),
436
+ execute: async ({
437
+ path: filePath
438
+ }) => {
439
+ const resolved = path.resolve(resolvedWorkDir, filePath);
440
+ if (!resolved.startsWith(resolvedWorkDir + path.sep)) {
441
+ return { error: `Access denied: path escapes workspace directory` };
442
+ }
443
+ try {
444
+ const content = await readFile(resolved, "utf-8");
445
+ return { path: filePath, content };
446
+ } catch {
447
+ return { error: `File not found: ${filePath}` };
448
+ }
449
+ }
450
+ });
451
+ }
452
+
352
453
  // src/evaluators/llm-judge-evaluator.ts
353
- import { randomUUID as randomUUID5 } from "crypto";
454
+ import { randomUUID as randomUUID6 } from "crypto";
354
455
  import { createAnthropic } from "@ai-sdk/anthropic";
355
- import { generateText, APICallError } from "ai";
456
+ import {
457
+ generateText,
458
+ Output,
459
+ APICallError,
460
+ NoObjectGeneratedError,
461
+ stepCountIs
462
+ } from "ai";
463
+ import { z as z5 } from "zod";
464
+ var JudgeResultSchema = z5.object({
465
+ text: z5.string().describe("A brief textual verdict of the test result"),
466
+ score: z5.number().min(0).max(100).describe(
467
+ "A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
468
+ ),
469
+ scoreReasoning: z5.string().describe("A concise explanation justifying the assigned score")
470
+ });
471
+ var MAX_JUDGE_STEPS = 20;
356
472
  function formatTraceForJudge(llmTrace) {
357
473
  if (!llmTrace?.steps?.length) {
358
474
  return "No trace available.";
@@ -423,40 +539,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
423
539
  - {{newFiles}}: list of new files that were created (or "No new files were created")
424
540
  - {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
425
541
 
426
- CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
427
- var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
428
-
429
- {
430
- "text": string,
431
- "score": number (0-100),
432
- "scoreReasoning": string
433
- }
434
-
435
- - text: A brief textual verdict of the test result.
436
- - score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
437
- - scoreReasoning: A concise explanation justifying the assigned score.
542
+ You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
438
543
 
439
- Your response must:
440
- - Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
441
- - Be valid and parseable by \`JSON.parse\`.
442
- - Use only double quotes for all keys and strings, as required by JSON.
443
-
444
- Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
544
+ CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
445
545
  var LlmJudgeEvaluator = class extends AssertionEvaluator {
446
546
  type = "llm_judge";
447
547
  async evaluate(assertion, input, context) {
448
- const assertionId = randomUUID5();
449
- const llmConfig = context?.llmConfig;
548
+ const assertionId = randomUUID6();
450
549
  const workDir = context?.workDir ?? "";
451
- const generateTextStub = context?.generateTextForLlmJudge;
452
550
  const output = input.outputText ?? "";
453
551
  const fileDiffs = input.fileDiffs ?? [];
454
552
  const changedPaths = fileDiffs.map((d) => d.path);
455
553
  const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
456
554
  const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
457
- const changedFiles = changedPaths.length > 0 ? changedPaths.map((path) => `- ${path}`).join("\n") : "No files were changed";
458
- const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
459
- const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
555
+ const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
556
+ const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
557
+ const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
460
558
  const trace = formatTraceForJudge(input.llmTrace);
461
559
  const ctx = {
462
560
  output,
@@ -468,92 +566,68 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
468
566
  };
469
567
  const replace = (s) => replacePlaceholders(s, ctx);
470
568
  const finalPrompt = replace(assertion.prompt);
471
- const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
472
569
  const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
473
570
  const maxOutputTokens = assertion.maxTokens ?? 1024;
474
571
  const temperature = assertion.temperature ?? 0;
475
- const modelUsed = assertion.model ?? context?.defaultJudgeModel;
476
- if (!modelUsed && !generateTextStub) {
477
- return {
478
- id: randomUUID5(),
479
- assertionId,
480
- assertionType: "llm_judge",
481
- assertionName: "LLM judge",
482
- status: "failed" /* FAILED */,
483
- message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
484
- expected: String(minScore)
485
- };
486
- }
487
- if (!generateTextStub && !llmConfig) {
572
+ const modelId = assertion.model ?? context?.defaultJudgeModel;
573
+ const model = this.resolveModel(context, modelId);
574
+ if (!model) {
575
+ const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
488
576
  return {
489
- id: randomUUID5(),
577
+ id: randomUUID6(),
490
578
  assertionId,
491
579
  assertionType: "llm_judge",
492
580
  assertionName: "LLM judge",
493
581
  status: "failed" /* FAILED */,
494
- message: "No llmConfig for llm_judge assertion (AI gateway required)",
582
+ message: reason,
495
583
  expected: String(minScore)
496
584
  };
497
585
  }
498
- const maxParseAttempts = 3;
499
- let lastParseError;
500
- let lastRawText;
586
+ const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
501
587
  try {
502
- for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {
503
- const result = generateTextStub ? await generateTextStub({
504
- prompt: finalPrompt,
505
- system: systemPrompt,
506
- maxOutputTokens,
507
- temperature
508
- }) : await this.callGenerateText(
509
- llmConfig,
510
- modelUsed,
511
- finalPrompt,
512
- systemPrompt,
513
- maxOutputTokens,
514
- temperature
515
- );
516
- lastRawText = result.text;
517
- try {
518
- const cleaned = stripMarkdownCodeBlock(result.text);
519
- const parsed = JSON.parse(cleaned);
520
- const judgeResult = validateJudgeResult(parsed);
521
- const passed = judgeResult.score >= minScore;
522
- return {
523
- id: randomUUID5(),
524
- assertionId,
525
- assertionType: "llm_judge",
526
- assertionName: "LLM judge",
527
- status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
528
- message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
529
- expected: String(minScore),
530
- actual: String(judgeResult.score),
531
- details: {
532
- score: judgeResult.score,
533
- scoreReasoning: judgeResult.scoreReasoning,
534
- text: judgeResult.text
535
- }
536
- };
537
- } catch (parseErr) {
538
- lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
539
- }
540
- }
588
+ const judgeResult = await this.callGenerateText(
589
+ model,
590
+ finalPrompt,
591
+ systemPrompt,
592
+ maxOutputTokens,
593
+ temperature,
594
+ workDir || void 0
595
+ );
596
+ const passed = judgeResult.score >= minScore;
541
597
  return {
542
- id: randomUUID5(),
598
+ id: randomUUID6(),
543
599
  assertionId,
544
600
  assertionType: "llm_judge",
545
601
  assertionName: "LLM judge",
546
- status: "failed" /* FAILED */,
547
- message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? "unknown"}`,
602
+ status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
603
+ message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
548
604
  expected: String(minScore),
549
- actual: void 0,
550
- details: { rawText: lastRawText?.slice(0, 500) }
605
+ actual: String(judgeResult.score),
606
+ details: {
607
+ score: judgeResult.score,
608
+ scoreReasoning: judgeResult.scoreReasoning,
609
+ text: judgeResult.text
610
+ }
551
611
  };
552
612
  } catch (err) {
613
+ if (NoObjectGeneratedError.isInstance(err)) {
614
+ return {
615
+ id: randomUUID6(),
616
+ assertionId,
617
+ assertionType: "llm_judge",
618
+ assertionName: "LLM judge",
619
+ status: "failed" /* FAILED */,
620
+ message: "LLM judge failed to produce valid structured output",
621
+ expected: String(minScore),
622
+ details: {
623
+ rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
624
+ }
625
+ };
626
+ }
553
627
  const message = err instanceof Error ? err.message : String(err);
554
628
  const details = {
555
629
  error: message,
556
- model: modelUsed
630
+ model: modelId
557
631
  };
558
632
  if (APICallError.isInstance(err)) {
559
633
  details.statusCode = err.statusCode;
@@ -562,7 +636,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
562
636
  details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
563
637
  }
564
638
  return {
565
- id: randomUUID5(),
639
+ id: randomUUID6(),
566
640
  assertionId,
567
641
  assertionType: "llm_judge",
568
642
  assertionName: "LLM judge",
@@ -573,20 +647,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
573
647
  };
574
648
  }
575
649
  }
576
- async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature) {
650
+ /**
651
+ * Resolve the LanguageModel to use: context.model (injected mock/override)
652
+ * takes precedence, otherwise create from llmConfig + modelId.
653
+ */
654
+ resolveModel(context, modelId) {
655
+ if (context?.model) {
656
+ return context.model;
657
+ }
658
+ if (!modelId || !context?.llmConfig) {
659
+ return null;
660
+ }
577
661
  const anthropic = createAnthropic({
578
- baseURL: llmConfig.baseUrl,
662
+ baseURL: context.llmConfig.baseUrl,
579
663
  apiKey: "dummy",
580
- headers: llmConfig.headers
664
+ headers: context.llmConfig.headers
581
665
  });
582
- const result = await generateText({
583
- model: anthropic(modelId),
666
+ return anthropic(modelId);
667
+ }
668
+ async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
669
+ const baseOptions = {
670
+ model,
584
671
  prompt,
585
672
  system,
586
673
  maxOutputTokens,
587
- temperature
588
- });
589
- return { text: result.text };
674
+ temperature,
675
+ output: Output.object({ schema: JudgeResultSchema }),
676
+ stopWhen: stepCountIs(MAX_JUDGE_STEPS)
677
+ };
678
+ const { output } = workDir ? await generateText({
679
+ ...baseOptions,
680
+ tools: { read_file: createReadFileTool(workDir) }
681
+ }) : await generateText(baseOptions);
682
+ return output;
590
683
  }
591
684
  };
592
685
 
@@ -594,6 +687,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
594
687
  var llmJudgeEvaluator = new LlmJudgeEvaluator();
595
688
  var evaluators = {
596
689
  skill_was_called: new SkillWasCalledEvaluator(),
690
+ tool_called_with_param: new ToolCalledWithParamEvaluator(),
597
691
  build_passed: new BuildPassedEvaluator(),
598
692
  time_limit: new TimeEvaluator(),
599
693
  cost: new CostEvaluator(),
@@ -616,8 +710,8 @@ async function evaluateAssertions(input, assertions, context) {
616
710
  const evaluator = evaluators[assertion.type];
617
711
  if (!evaluator) {
618
712
  return {
619
- id: randomUUID6(),
620
- assertionId: randomUUID6(),
713
+ id: randomUUID7(),
714
+ assertionId: randomUUID7(),
621
715
  assertionType: assertion.type,
622
716
  assertionName: "Unknown assertion",
623
717
  status: "error" /* ERROR */,
@@ -641,6 +735,7 @@ export {
641
735
  BuildPassedEvaluator,
642
736
  CostAssertionSchema,
643
737
  CostEvaluator,
738
+ JudgeResultSchema,
644
739
  LLMBreakdownStatsSchema,
645
740
  LLMStepType,
646
741
  LLMTraceSchema,
@@ -653,6 +748,9 @@ export {
653
748
  TimeAssertionSchema,
654
749
  TimeEvaluator,
655
750
  TokenUsageSchema,
751
+ ToolCalledWithParamAssertionSchema,
752
+ ToolCalledWithParamEvaluator,
753
+ createReadFileTool,
656
754
  evaluateAssertions,
657
755
  formatTraceForJudge,
658
756
  getEvaluator,