@wix/eval-assertions 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -149,11 +149,12 @@ Optional context for assertions:
149
149
  ```typescript
150
150
  interface AssertionContext {
151
151
  workDir?: string; // For build_passed
152
- llmConfig?: { // For llm_judge
152
+ llmConfig?: { // For llm_judge
153
153
  baseUrl: string;
154
154
  headers: Record<string, string>;
155
155
  };
156
- generateTextForLlmJudge?: (options) => Promise<{ text: string }>; // For testing
156
+ defaultJudgeModel?: string; // Default model for llm_judge
157
+ model?: LanguageModel; // Override model
157
158
  }
158
159
  ```
159
160
 
package/build/index.js CHANGED
@@ -1,7 +1,9 @@
1
1
  "use strict";
2
+ var __create = Object.create;
2
3
  var __defProp = Object.defineProperty;
3
4
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
5
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
6
8
  var __export = (target, all) => {
7
9
  for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
15
17
  }
16
18
  return to;
17
19
  };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
18
28
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
29
 
20
30
  // src/index.ts
@@ -26,6 +36,9 @@ __export(index_exports, {
26
36
  AssertionSchema: () => AssertionSchema,
27
37
  BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
28
38
  BuildPassedEvaluator: () => BuildPassedEvaluator,
39
+ CostAssertionSchema: () => CostAssertionSchema,
40
+ CostEvaluator: () => CostEvaluator,
41
+ JudgeResultSchema: () => JudgeResultSchema,
29
42
  LLMBreakdownStatsSchema: () => LLMBreakdownStatsSchema,
30
43
  LLMStepType: () => LLMStepType,
31
44
  LLMTraceSchema: () => LLMTraceSchema,
@@ -38,6 +51,7 @@ __export(index_exports, {
38
51
  TimeAssertionSchema: () => TimeAssertionSchema,
39
52
  TimeEvaluator: () => TimeEvaluator,
40
53
  TokenUsageSchema: () => TokenUsageSchema,
54
+ createReadFileTool: () => createReadFileTool,
41
55
  evaluateAssertions: () => evaluateAssertions,
42
56
  formatTraceForJudge: () => formatTraceForJudge,
43
57
  getEvaluator: () => getEvaluator,
@@ -62,6 +76,11 @@ var BuildPassedAssertionSchema = import_zod.z.object({
62
76
  /** Expected exit code (default: 0) */
63
77
  expectedExitCode: import_zod.z.number().int().optional()
64
78
  });
79
+ var CostAssertionSchema = import_zod.z.object({
80
+ type: import_zod.z.literal("cost"),
81
+ /** Maximum allowed cost in USD */
82
+ maxCostUsd: import_zod.z.number().positive()
83
+ });
65
84
  var LlmJudgeAssertionSchema = import_zod.z.object({
66
85
  type: import_zod.z.literal("llm_judge"),
67
86
  /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */
@@ -84,6 +103,7 @@ var AssertionSchema = import_zod.z.union([
84
103
  SkillWasCalledAssertionSchema,
85
104
  BuildPassedAssertionSchema,
86
105
  TimeAssertionSchema,
106
+ CostAssertionSchema,
87
107
  LlmJudgeAssertionSchema
88
108
  ]);
89
109
 
@@ -163,7 +183,7 @@ var AssertionResultSchema = import_zod3.z.object({
163
183
  });
164
184
 
165
185
  // src/evaluators/index.ts
166
- var import_crypto5 = require("crypto");
186
+ var import_crypto6 = require("crypto");
167
187
 
168
188
  // src/evaluators/skill-was-called-evaluator.ts
169
189
  var import_crypto = require("crypto");
@@ -355,10 +375,86 @@ var TimeEvaluator = class extends AssertionEvaluator {
355
375
  }
356
376
  };
357
377
 
358
- // src/evaluators/llm-judge-evaluator.ts
378
+ // src/evaluators/cost-evaluator.ts
359
379
  var import_crypto4 = require("crypto");
360
- var import_anthropic = require("@ai-sdk/anthropic");
380
+ var CostEvaluator = class extends AssertionEvaluator {
381
+ type = "cost";
382
+ evaluate(assertion, input) {
383
+ const assertionId = (0, import_crypto4.randomUUID)();
384
+ const id = (0, import_crypto4.randomUUID)();
385
+ const assertionName = "Cost";
386
+ const assertionType = "cost";
387
+ const maxCostUsd = assertion.maxCostUsd;
388
+ if (!input.llmTrace) {
389
+ return {
390
+ id,
391
+ assertionId,
392
+ assertionType,
393
+ assertionName,
394
+ status: "skipped" /* SKIPPED */,
395
+ message: "No LLM trace available to check cost"
396
+ };
397
+ }
398
+ const actualCostUsd = input.llmTrace.summary.totalCostUsd;
399
+ const formattedActual = actualCostUsd.toFixed(6);
400
+ const formattedMax = maxCostUsd.toFixed(6);
401
+ const passed = Number(formattedActual) <= Number(formattedMax);
402
+ return {
403
+ id,
404
+ assertionId,
405
+ assertionType,
406
+ assertionName,
407
+ status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
408
+ message: passed ? `Cost $${formattedActual} is within limit of $${formattedMax}` : `Cost $${formattedActual} exceeds limit of $${formattedMax}`,
409
+ expected: `<= $${formattedMax}`,
410
+ actual: `$${formattedActual}`,
411
+ details: { actualCostUsd, maxCostUsd }
412
+ };
413
+ }
414
+ };
415
+
416
+ // src/tools/read-file-tool.ts
361
417
  var import_ai = require("ai");
418
+ var import_zod4 = require("zod");
419
+ var import_promises = require("fs/promises");
420
+ var import_path = __toESM(require("path"));
421
+ function createReadFileTool(workDir) {
422
+ const resolvedWorkDir = import_path.default.resolve(workDir);
423
+ return (0, import_ai.tool)({
424
+ description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
425
+ inputSchema: import_zod4.z.object({
426
+ path: import_zod4.z.string().describe("Relative file path in the workspace")
427
+ }),
428
+ execute: async ({
429
+ path: filePath
430
+ }) => {
431
+ const resolved = import_path.default.resolve(resolvedWorkDir, filePath);
432
+ if (!resolved.startsWith(resolvedWorkDir + import_path.default.sep)) {
433
+ return { error: `Access denied: path escapes workspace directory` };
434
+ }
435
+ try {
436
+ const content = await (0, import_promises.readFile)(resolved, "utf-8");
437
+ return { path: filePath, content };
438
+ } catch {
439
+ return { error: `File not found: ${filePath}` };
440
+ }
441
+ }
442
+ });
443
+ }
444
+
445
+ // src/evaluators/llm-judge-evaluator.ts
446
+ var import_crypto5 = require("crypto");
447
+ var import_anthropic = require("@ai-sdk/anthropic");
448
+ var import_ai2 = require("ai");
449
+ var import_zod5 = require("zod");
450
+ var JudgeResultSchema = import_zod5.z.object({
451
+ text: import_zod5.z.string().describe("A brief textual verdict of the test result"),
452
+ score: import_zod5.z.number().min(0).max(100).describe(
453
+ "A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
454
+ ),
455
+ scoreReasoning: import_zod5.z.string().describe("A concise explanation justifying the assigned score")
456
+ });
457
+ var MAX_JUDGE_STEPS = 20;
362
458
  function formatTraceForJudge(llmTrace) {
363
459
  if (!llmTrace?.steps?.length) {
364
460
  return "No trace available.";
@@ -429,40 +525,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
429
525
  - {{newFiles}}: list of new files that were created (or "No new files were created")
430
526
  - {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
431
527
 
432
- CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
433
- var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
528
+ You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
434
529
 
435
- {
436
- "text": string,
437
- "score": number (0-100),
438
- "scoreReasoning": string
439
- }
440
-
441
- - text: A brief textual verdict of the test result.
442
- - score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
443
- - scoreReasoning: A concise explanation justifying the assigned score.
444
-
445
- Your response must:
446
- - Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
447
- - Be valid and parseable by \`JSON.parse\`.
448
- - Use only double quotes for all keys and strings, as required by JSON.
449
-
450
- Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
530
+ CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
451
531
  var LlmJudgeEvaluator = class extends AssertionEvaluator {
452
532
  type = "llm_judge";
453
533
  async evaluate(assertion, input, context) {
454
- const assertionId = (0, import_crypto4.randomUUID)();
455
- const llmConfig = context?.llmConfig;
534
+ const assertionId = (0, import_crypto5.randomUUID)();
456
535
  const workDir = context?.workDir ?? "";
457
- const generateTextStub = context?.generateTextForLlmJudge;
458
536
  const output = input.outputText ?? "";
459
537
  const fileDiffs = input.fileDiffs ?? [];
460
538
  const changedPaths = fileDiffs.map((d) => d.path);
461
539
  const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
462
540
  const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
463
- const changedFiles = changedPaths.length > 0 ? changedPaths.map((path) => `- ${path}`).join("\n") : "No files were changed";
464
- const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
465
- const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
541
+ const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
542
+ const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
543
+ const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
466
544
  const trace = formatTraceForJudge(input.llmTrace);
467
545
  const ctx = {
468
546
  output,
@@ -474,101 +552,77 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
474
552
  };
475
553
  const replace = (s) => replacePlaceholders(s, ctx);
476
554
  const finalPrompt = replace(assertion.prompt);
477
- const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
478
555
  const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
479
556
  const maxOutputTokens = assertion.maxTokens ?? 1024;
480
557
  const temperature = assertion.temperature ?? 0;
481
- const modelUsed = assertion.model ?? context?.defaultJudgeModel;
482
- if (!modelUsed && !generateTextStub) {
558
+ const modelId = assertion.model ?? context?.defaultJudgeModel;
559
+ const model = this.resolveModel(context, modelId);
560
+ if (!model) {
561
+ const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
483
562
  return {
484
- id: (0, import_crypto4.randomUUID)(),
563
+ id: (0, import_crypto5.randomUUID)(),
485
564
  assertionId,
486
565
  assertionType: "llm_judge",
487
566
  assertionName: "LLM judge",
488
567
  status: "failed" /* FAILED */,
489
- message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
568
+ message: reason,
490
569
  expected: String(minScore)
491
570
  };
492
571
  }
493
- if (!generateTextStub && !llmConfig) {
494
- return {
495
- id: (0, import_crypto4.randomUUID)(),
496
- assertionId,
497
- assertionType: "llm_judge",
498
- assertionName: "LLM judge",
499
- status: "failed" /* FAILED */,
500
- message: "No llmConfig for llm_judge assertion (AI gateway required)",
501
- expected: String(minScore)
502
- };
503
- }
504
- const maxParseAttempts = 3;
505
- let lastParseError;
506
- let lastRawText;
572
+ const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
507
573
  try {
508
- for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {
509
- const result = generateTextStub ? await generateTextStub({
510
- prompt: finalPrompt,
511
- system: systemPrompt,
512
- maxOutputTokens,
513
- temperature
514
- }) : await this.callGenerateText(
515
- llmConfig,
516
- modelUsed,
517
- finalPrompt,
518
- systemPrompt,
519
- maxOutputTokens,
520
- temperature
521
- );
522
- lastRawText = result.text;
523
- try {
524
- const cleaned = stripMarkdownCodeBlock(result.text);
525
- const parsed = JSON.parse(cleaned);
526
- const judgeResult = validateJudgeResult(parsed);
527
- const passed = judgeResult.score >= minScore;
528
- return {
529
- id: (0, import_crypto4.randomUUID)(),
530
- assertionId,
531
- assertionType: "llm_judge",
532
- assertionName: "LLM judge",
533
- status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
534
- message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
535
- expected: String(minScore),
536
- actual: String(judgeResult.score),
537
- details: {
538
- score: judgeResult.score,
539
- scoreReasoning: judgeResult.scoreReasoning,
540
- text: judgeResult.text
541
- }
542
- };
543
- } catch (parseErr) {
544
- lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
545
- }
546
- }
574
+ const judgeResult = await this.callGenerateText(
575
+ model,
576
+ finalPrompt,
577
+ systemPrompt,
578
+ maxOutputTokens,
579
+ temperature,
580
+ workDir || void 0
581
+ );
582
+ const passed = judgeResult.score >= minScore;
547
583
  return {
548
- id: (0, import_crypto4.randomUUID)(),
584
+ id: (0, import_crypto5.randomUUID)(),
549
585
  assertionId,
550
586
  assertionType: "llm_judge",
551
587
  assertionName: "LLM judge",
552
- status: "failed" /* FAILED */,
553
- message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? "unknown"}`,
588
+ status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
589
+ message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
554
590
  expected: String(minScore),
555
- actual: void 0,
556
- details: { rawText: lastRawText?.slice(0, 500) }
591
+ actual: String(judgeResult.score),
592
+ details: {
593
+ score: judgeResult.score,
594
+ scoreReasoning: judgeResult.scoreReasoning,
595
+ text: judgeResult.text
596
+ }
557
597
  };
558
598
  } catch (err) {
599
+ if (import_ai2.NoObjectGeneratedError.isInstance(err)) {
600
+ return {
601
+ id: (0, import_crypto5.randomUUID)(),
602
+ assertionId,
603
+ assertionType: "llm_judge",
604
+ assertionName: "LLM judge",
605
+ status: "failed" /* FAILED */,
606
+ message: "LLM judge failed to produce valid structured output",
607
+ expected: String(minScore),
608
+ details: {
609
+ rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
610
+ }
611
+ };
612
+ }
559
613
  const message = err instanceof Error ? err.message : String(err);
560
614
  const details = {
561
615
  error: message,
562
- model: modelUsed
616
+ model: modelId
563
617
  };
564
- if (import_ai.APICallError.isInstance(err)) {
618
+ if (import_ai2.APICallError.isInstance(err)) {
565
619
  details.statusCode = err.statusCode;
566
620
  details.url = err.url;
567
621
  details.isRetryable = err.isRetryable;
568
622
  details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
569
623
  }
570
624
  return {
571
- id: (0, import_crypto4.randomUUID)(),
625
+ id: (0, import_crypto5.randomUUID)(),
572
626
  assertionId,
573
627
  assertionType: "llm_judge",
574
628
  assertionName: "LLM judge",
@@ -579,20 +633,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
579
633
  };
580
634
  }
581
635
  }
582
- async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature) {
636
+ /**
637
+ * Resolve the LanguageModel to use: context.model (injected mock/override)
638
+ * takes precedence, otherwise create from llmConfig + modelId.
639
+ */
640
+ resolveModel(context, modelId) {
641
+ if (context?.model) {
642
+ return context.model;
643
+ }
644
+ if (!modelId || !context?.llmConfig) {
645
+ return null;
646
+ }
583
647
  const anthropic = (0, import_anthropic.createAnthropic)({
584
- baseURL: llmConfig.baseUrl,
648
+ baseURL: context.llmConfig.baseUrl,
585
649
  apiKey: "dummy",
586
- headers: llmConfig.headers
650
+ headers: context.llmConfig.headers
587
651
  });
588
- const result = await (0, import_ai.generateText)({
589
- model: anthropic(modelId),
652
+ return anthropic(modelId);
653
+ }
654
+ async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
655
+ const baseOptions = {
656
+ model,
590
657
  prompt,
591
658
  system,
592
659
  maxOutputTokens,
593
- temperature
594
- });
595
- return { text: result.text };
660
+ temperature,
661
+ output: import_ai2.Output.object({ schema: JudgeResultSchema }),
662
+ stopWhen: (0, import_ai2.stepCountIs)(MAX_JUDGE_STEPS)
663
+ };
664
+ const { output } = workDir ? await (0, import_ai2.generateText)({
665
+ ...baseOptions,
666
+ tools: { read_file: createReadFileTool(workDir) }
667
+ }) : await (0, import_ai2.generateText)(baseOptions);
668
+ return output;
596
669
  }
597
670
  };
598
671
 
@@ -602,6 +675,7 @@ var evaluators = {
602
675
  skill_was_called: new SkillWasCalledEvaluator(),
603
676
  build_passed: new BuildPassedEvaluator(),
604
677
  time_limit: new TimeEvaluator(),
678
+ cost: new CostEvaluator(),
605
679
  llm_judge: llmJudgeEvaluator,
606
680
  // Custom assertions use the same LLM-based evaluation as llm_judge
607
681
  custom: llmJudgeEvaluator
@@ -621,8 +695,8 @@ async function evaluateAssertions(input, assertions, context) {
621
695
  const evaluator = evaluators[assertion.type];
622
696
  if (!evaluator) {
623
697
  return {
624
- id: (0, import_crypto5.randomUUID)(),
625
- assertionId: (0, import_crypto5.randomUUID)(),
698
+ id: (0, import_crypto6.randomUUID)(),
699
+ assertionId: (0, import_crypto6.randomUUID)(),
626
700
  assertionType: assertion.type,
627
701
  assertionName: "Unknown assertion",
628
702
  status: "error" /* ERROR */,
@@ -645,6 +719,9 @@ async function evaluateAssertions(input, assertions, context) {
645
719
  AssertionSchema,
646
720
  BuildPassedAssertionSchema,
647
721
  BuildPassedEvaluator,
722
+ CostAssertionSchema,
723
+ CostEvaluator,
724
+ JudgeResultSchema,
648
725
  LLMBreakdownStatsSchema,
649
726
  LLMStepType,
650
727
  LLMTraceSchema,
@@ -657,6 +734,7 @@ async function evaluateAssertions(input, assertions, context) {
657
734
  TimeAssertionSchema,
658
735
  TimeEvaluator,
659
736
  TokenUsageSchema,
737
+ createReadFileTool,
660
738
  evaluateAssertions,
661
739
  formatTraceForJudge,
662
740
  getEvaluator,