@wix/eval-assertions 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -149,11 +149,12 @@ Optional context for assertions:
149
149
  ```typescript
150
150
  interface AssertionContext {
151
151
  workDir?: string; // For build_passed
152
- llmConfig?: { // For llm_judge
152
+ llmConfig?: { // For llm_judge
153
153
  baseUrl: string;
154
154
  headers: Record<string, string>;
155
155
  };
156
- generateTextForLlmJudge?: (options) => Promise<{ text: string }>; // For testing
156
+ defaultJudgeModel?: string; // Default model for llm_judge
157
+ model?: LanguageModel; // Override model
157
158
  }
158
159
  ```
159
160
 
package/build/index.js CHANGED
@@ -1,7 +1,9 @@
1
1
  "use strict";
2
+ var __create = Object.create;
2
3
  var __defProp = Object.defineProperty;
3
4
  var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
5
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
6
8
  var __export = (target, all) => {
7
9
  for (var name in all)
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
15
17
  }
16
18
  return to;
17
19
  };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
18
28
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
29
 
20
30
  // src/index.ts
@@ -28,6 +38,7 @@ __export(index_exports, {
28
38
  BuildPassedEvaluator: () => BuildPassedEvaluator,
29
39
  CostAssertionSchema: () => CostAssertionSchema,
30
40
  CostEvaluator: () => CostEvaluator,
41
+ JudgeResultSchema: () => JudgeResultSchema,
31
42
  LLMBreakdownStatsSchema: () => LLMBreakdownStatsSchema,
32
43
  LLMStepType: () => LLMStepType,
33
44
  LLMTraceSchema: () => LLMTraceSchema,
@@ -40,6 +51,9 @@ __export(index_exports, {
40
51
  TimeAssertionSchema: () => TimeAssertionSchema,
41
52
  TimeEvaluator: () => TimeEvaluator,
42
53
  TokenUsageSchema: () => TokenUsageSchema,
54
+ ToolCalledWithParamAssertionSchema: () => ToolCalledWithParamAssertionSchema,
55
+ ToolCalledWithParamEvaluator: () => ToolCalledWithParamEvaluator,
56
+ createReadFileTool: () => createReadFileTool,
43
57
  evaluateAssertions: () => evaluateAssertions,
44
58
  formatTraceForJudge: () => formatTraceForJudge,
45
59
  getEvaluator: () => getEvaluator,
@@ -57,6 +71,13 @@ var SkillWasCalledAssertionSchema = import_zod.z.object({
57
71
  /** Names of the skills that must have been called (matched against trace Skill tool args) */
58
72
  skillNames: import_zod.z.array(import_zod.z.string()).min(1)
59
73
  });
74
+ var ToolCalledWithParamAssertionSchema = import_zod.z.object({
75
+ type: import_zod.z.literal("tool_called_with_param"),
76
+ /** Name of the tool that must have been called */
77
+ toolName: import_zod.z.string().min(1),
78
+ /** JSON string of key-value pairs for expected parameters (substring match) */
79
+ expectedParams: import_zod.z.string().min(1)
80
+ });
60
81
  var BuildPassedAssertionSchema = import_zod.z.object({
61
82
  type: import_zod.z.literal("build_passed"),
62
83
  /** Command to run (default: "yarn build") */
@@ -89,6 +110,7 @@ var TimeAssertionSchema = import_zod.z.object({
89
110
  });
90
111
  var AssertionSchema = import_zod.z.union([
91
112
  SkillWasCalledAssertionSchema,
113
+ ToolCalledWithParamAssertionSchema,
92
114
  BuildPassedAssertionSchema,
93
115
  TimeAssertionSchema,
94
116
  CostAssertionSchema,
@@ -171,7 +193,7 @@ var AssertionResultSchema = import_zod3.z.object({
171
193
  });
172
194
 
173
195
  // src/evaluators/index.ts
174
- var import_crypto6 = require("crypto");
196
+ var import_crypto7 = require("crypto");
175
197
 
176
198
  // src/evaluators/skill-was-called-evaluator.ts
177
199
  var import_crypto = require("crypto");
@@ -250,15 +272,79 @@ var SkillWasCalledEvaluator = class extends AssertionEvaluator {
250
272
  }
251
273
  };
252
274
 
253
- // src/evaluators/build-passed-evaluator.ts
275
+ // src/evaluators/tool-called-with-param-evaluator.ts
254
276
  var import_crypto2 = require("crypto");
277
+ var ASSERTION_TYPE = "tool_called_with_param";
278
+ var ASSERTION_NAME = "Tool called with param";
279
+ var containsAll = ({
280
+ actual,
281
+ expected
282
+ }) => Object.entries(expected).every(([key, val]) => {
283
+ const actualVal = actual[key];
284
+ if (actualVal === null || actualVal === void 0) return false;
285
+ const actualStr = typeof actualVal === "string" ? actualVal : JSON.stringify(actualVal);
286
+ return actualStr.includes(String(val));
287
+ });
288
+ var ToolCalledWithParamEvaluator = class extends AssertionEvaluator {
289
+ type = ASSERTION_TYPE;
290
+ evaluate(assertion, input, _context) {
291
+ const assertionId = (0, import_crypto2.randomUUID)();
292
+ const { toolName, expectedParams: expectedParamsStr } = assertion;
293
+ const buildResult = (status, message, expected2, actual) => ({
294
+ id: (0, import_crypto2.randomUUID)(),
295
+ assertionId,
296
+ assertionType: ASSERTION_TYPE,
297
+ assertionName: ASSERTION_NAME,
298
+ status,
299
+ message,
300
+ expected: expected2,
301
+ ...actual !== void 0 ? { actual } : {}
302
+ });
303
+ let expected;
304
+ try {
305
+ expected = JSON.parse(expectedParamsStr);
306
+ } catch {
307
+ return buildResult(
308
+ "failed" /* FAILED */,
309
+ `Tool "${toolName}" assertion has invalid expected params JSON`,
310
+ `${toolName}(invalid expected params)`,
311
+ "Invalid expected params JSON"
312
+ );
313
+ }
314
+ const expectedLabel = `${toolName}(${Object.entries(expected).map(([k, v]) => `${k}="${v}"`).join(", ")})`;
315
+ const steps = input.llmTrace?.steps ?? [];
316
+ const toolCalls = steps.filter((s) => s.toolName === toolName && s.toolArguments !== void 0).map((s) => {
317
+ try {
318
+ return JSON.parse(s.toolArguments);
319
+ } catch {
320
+ return null;
321
+ }
322
+ }).filter((call) => call !== null);
323
+ if (toolCalls.some((actual) => containsAll({ actual, expected }))) {
324
+ return buildResult(
325
+ "passed" /* PASSED */,
326
+ `Tool "${toolName}" was called with params matching ${expectedParamsStr}`,
327
+ expectedLabel
328
+ );
329
+ }
330
+ return buildResult(
331
+ "failed" /* FAILED */,
332
+ `Tool "${toolName}" was never called with params matching ${expectedParamsStr}`,
333
+ expectedLabel,
334
+ toolCalls.length > 0 ? `Found ${toolName} calls but params didn't match` : `No matching tool calls found`
335
+ );
336
+ }
337
+ };
338
+
339
+ // src/evaluators/build-passed-evaluator.ts
340
+ var import_crypto3 = require("crypto");
255
341
  var import_child_process = require("child_process");
256
342
  var DEFAULT_COMMAND = "yarn build";
257
343
  var DEFAULT_EXIT_CODE = 0;
258
344
  var BuildPassedEvaluator = class extends AssertionEvaluator {
259
345
  type = "build_passed";
260
346
  evaluate(assertion, _input, context) {
261
- const assertionId = (0, import_crypto2.randomUUID)();
347
+ const assertionId = (0, import_crypto3.randomUUID)();
262
348
  const workDir = context?.workDir;
263
349
  const command = assertion.command ?? DEFAULT_COMMAND;
264
350
  const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;
@@ -306,7 +392,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
306
392
  }
307
393
  createResult(assertionId, fields) {
308
394
  return {
309
- id: (0, import_crypto2.randomUUID)(),
395
+ id: (0, import_crypto3.randomUUID)(),
310
396
  assertionId,
311
397
  assertionType: "build_passed",
312
398
  assertionName: "Build passed",
@@ -331,7 +417,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
331
417
  };
332
418
 
333
419
  // src/evaluators/time-evaluator.ts
334
- var import_crypto3 = require("crypto");
420
+ var import_crypto4 = require("crypto");
335
421
  var TimeEvaluator = class extends AssertionEvaluator {
336
422
  type = "time_limit";
337
423
  evaluate(assertion, input) {
@@ -353,8 +439,8 @@ var TimeEvaluator = class extends AssertionEvaluator {
353
439
  }
354
440
  createResult(fields) {
355
441
  return {
356
- id: (0, import_crypto3.randomUUID)(),
357
- assertionId: (0, import_crypto3.randomUUID)(),
442
+ id: (0, import_crypto4.randomUUID)(),
443
+ assertionId: (0, import_crypto4.randomUUID)(),
358
444
  assertionType: "time_limit",
359
445
  assertionName: "Time limit",
360
446
  status: "failed" /* FAILED */,
@@ -364,12 +450,12 @@ var TimeEvaluator = class extends AssertionEvaluator {
364
450
  };
365
451
 
366
452
  // src/evaluators/cost-evaluator.ts
367
- var import_crypto4 = require("crypto");
453
+ var import_crypto5 = require("crypto");
368
454
  var CostEvaluator = class extends AssertionEvaluator {
369
455
  type = "cost";
370
456
  evaluate(assertion, input) {
371
- const assertionId = (0, import_crypto4.randomUUID)();
372
- const id = (0, import_crypto4.randomUUID)();
457
+ const assertionId = (0, import_crypto5.randomUUID)();
458
+ const id = (0, import_crypto5.randomUUID)();
373
459
  const assertionName = "Cost";
374
460
  const assertionType = "cost";
375
461
  const maxCostUsd = assertion.maxCostUsd;
@@ -401,10 +487,48 @@ var CostEvaluator = class extends AssertionEvaluator {
401
487
  }
402
488
  };
403
489
 
490
+ // src/tools/read-file-tool.ts
491
+ var import_ai = require("ai");
492
+ var import_zod4 = require("zod");
493
+ var import_promises = require("fs/promises");
494
+ var import_path = __toESM(require("path"));
495
+ function createReadFileTool(workDir) {
496
+ const resolvedWorkDir = import_path.default.resolve(workDir);
497
+ return (0, import_ai.tool)({
498
+ description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
499
+ inputSchema: import_zod4.z.object({
500
+ path: import_zod4.z.string().describe("Relative file path in the workspace")
501
+ }),
502
+ execute: async ({
503
+ path: filePath
504
+ }) => {
505
+ const resolved = import_path.default.resolve(resolvedWorkDir, filePath);
506
+ if (!resolved.startsWith(resolvedWorkDir + import_path.default.sep)) {
507
+ return { error: `Access denied: path escapes workspace directory` };
508
+ }
509
+ try {
510
+ const content = await (0, import_promises.readFile)(resolved, "utf-8");
511
+ return { path: filePath, content };
512
+ } catch {
513
+ return { error: `File not found: ${filePath}` };
514
+ }
515
+ }
516
+ });
517
+ }
518
+
404
519
  // src/evaluators/llm-judge-evaluator.ts
405
- var import_crypto5 = require("crypto");
520
+ var import_crypto6 = require("crypto");
406
521
  var import_anthropic = require("@ai-sdk/anthropic");
407
- var import_ai = require("ai");
522
+ var import_ai2 = require("ai");
523
+ var import_zod5 = require("zod");
524
+ var JudgeResultSchema = import_zod5.z.object({
525
+ text: import_zod5.z.string().describe("A brief textual verdict of the test result"),
526
+ score: import_zod5.z.number().min(0).max(100).describe(
527
+ "A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
528
+ ),
529
+ scoreReasoning: import_zod5.z.string().describe("A concise explanation justifying the assigned score")
530
+ });
531
+ var MAX_JUDGE_STEPS = 20;
408
532
  function formatTraceForJudge(llmTrace) {
409
533
  if (!llmTrace?.steps?.length) {
410
534
  return "No trace available.";
@@ -475,40 +599,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
475
599
  - {{newFiles}}: list of new files that were created (or "No new files were created")
476
600
  - {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
477
601
 
478
- CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
479
- var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
480
-
481
- {
482
- "text": string,
483
- "score": number (0-100),
484
- "scoreReasoning": string
485
- }
486
-
487
- - text: A brief textual verdict of the test result.
488
- - score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
489
- - scoreReasoning: A concise explanation justifying the assigned score.
602
+ You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
490
603
 
491
- Your response must:
492
- - Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
493
- - Be valid and parseable by \`JSON.parse\`.
494
- - Use only double quotes for all keys and strings, as required by JSON.
495
-
496
- Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
604
+ CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
497
605
  var LlmJudgeEvaluator = class extends AssertionEvaluator {
498
606
  type = "llm_judge";
499
607
  async evaluate(assertion, input, context) {
500
- const assertionId = (0, import_crypto5.randomUUID)();
501
- const llmConfig = context?.llmConfig;
608
+ const assertionId = (0, import_crypto6.randomUUID)();
502
609
  const workDir = context?.workDir ?? "";
503
- const generateTextStub = context?.generateTextForLlmJudge;
504
610
  const output = input.outputText ?? "";
505
611
  const fileDiffs = input.fileDiffs ?? [];
506
612
  const changedPaths = fileDiffs.map((d) => d.path);
507
613
  const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
508
614
  const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
509
- const changedFiles = changedPaths.length > 0 ? changedPaths.map((path) => `- ${path}`).join("\n") : "No files were changed";
510
- const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
511
- const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
615
+ const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
616
+ const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
617
+ const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
512
618
  const trace = formatTraceForJudge(input.llmTrace);
513
619
  const ctx = {
514
620
  output,
@@ -520,101 +626,77 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
520
626
  };
521
627
  const replace = (s) => replacePlaceholders(s, ctx);
522
628
  const finalPrompt = replace(assertion.prompt);
523
- const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
524
629
  const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
525
630
  const maxOutputTokens = assertion.maxTokens ?? 1024;
526
631
  const temperature = assertion.temperature ?? 0;
527
- const modelUsed = assertion.model ?? context?.defaultJudgeModel;
528
- if (!modelUsed && !generateTextStub) {
632
+ const modelId = assertion.model ?? context?.defaultJudgeModel;
633
+ const model = this.resolveModel(context, modelId);
634
+ if (!model) {
635
+ const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
529
636
  return {
530
- id: (0, import_crypto5.randomUUID)(),
637
+ id: (0, import_crypto6.randomUUID)(),
531
638
  assertionId,
532
639
  assertionType: "llm_judge",
533
640
  assertionName: "LLM judge",
534
641
  status: "failed" /* FAILED */,
535
- message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
642
+ message: reason,
536
643
  expected: String(minScore)
537
644
  };
538
645
  }
539
- if (!generateTextStub && !llmConfig) {
540
- return {
541
- id: (0, import_crypto5.randomUUID)(),
542
- assertionId,
543
- assertionType: "llm_judge",
544
- assertionName: "LLM judge",
545
- status: "failed" /* FAILED */,
546
- message: "No llmConfig for llm_judge assertion (AI gateway required)",
547
- expected: String(minScore)
548
- };
549
- }
550
- const maxParseAttempts = 3;
551
- let lastParseError;
552
- let lastRawText;
646
+ const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
553
647
  try {
554
- for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {
555
- const result = generateTextStub ? await generateTextStub({
556
- prompt: finalPrompt,
557
- system: systemPrompt,
558
- maxOutputTokens,
559
- temperature
560
- }) : await this.callGenerateText(
561
- llmConfig,
562
- modelUsed,
563
- finalPrompt,
564
- systemPrompt,
565
- maxOutputTokens,
566
- temperature
567
- );
568
- lastRawText = result.text;
569
- try {
570
- const cleaned = stripMarkdownCodeBlock(result.text);
571
- const parsed = JSON.parse(cleaned);
572
- const judgeResult = validateJudgeResult(parsed);
573
- const passed = judgeResult.score >= minScore;
574
- return {
575
- id: (0, import_crypto5.randomUUID)(),
576
- assertionId,
577
- assertionType: "llm_judge",
578
- assertionName: "LLM judge",
579
- status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
580
- message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
581
- expected: String(minScore),
582
- actual: String(judgeResult.score),
583
- details: {
584
- score: judgeResult.score,
585
- scoreReasoning: judgeResult.scoreReasoning,
586
- text: judgeResult.text
587
- }
588
- };
589
- } catch (parseErr) {
590
- lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
591
- }
592
- }
648
+ const judgeResult = await this.callGenerateText(
649
+ model,
650
+ finalPrompt,
651
+ systemPrompt,
652
+ maxOutputTokens,
653
+ temperature,
654
+ workDir || void 0
655
+ );
656
+ const passed = judgeResult.score >= minScore;
593
657
  return {
594
- id: (0, import_crypto5.randomUUID)(),
658
+ id: (0, import_crypto6.randomUUID)(),
595
659
  assertionId,
596
660
  assertionType: "llm_judge",
597
661
  assertionName: "LLM judge",
598
- status: "failed" /* FAILED */,
599
- message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? "unknown"}`,
662
+ status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
663
+ message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
600
664
  expected: String(minScore),
601
- actual: void 0,
602
- details: { rawText: lastRawText?.slice(0, 500) }
665
+ actual: String(judgeResult.score),
666
+ details: {
667
+ score: judgeResult.score,
668
+ scoreReasoning: judgeResult.scoreReasoning,
669
+ text: judgeResult.text
670
+ }
603
671
  };
604
672
  } catch (err) {
673
+ if (import_ai2.NoObjectGeneratedError.isInstance(err)) {
674
+ return {
675
+ id: (0, import_crypto6.randomUUID)(),
676
+ assertionId,
677
+ assertionType: "llm_judge",
678
+ assertionName: "LLM judge",
679
+ status: "failed" /* FAILED */,
680
+ message: "LLM judge failed to produce valid structured output",
681
+ expected: String(minScore),
682
+ details: {
683
+ rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
684
+ }
685
+ };
686
+ }
605
687
  const message = err instanceof Error ? err.message : String(err);
606
688
  const details = {
607
689
  error: message,
608
- model: modelUsed
690
+ model: modelId
609
691
  };
610
- if (import_ai.APICallError.isInstance(err)) {
692
+ if (import_ai2.APICallError.isInstance(err)) {
611
693
  details.statusCode = err.statusCode;
612
694
  details.url = err.url;
613
695
  details.isRetryable = err.isRetryable;
614
696
  details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
615
697
  }
616
698
  return {
617
- id: (0, import_crypto5.randomUUID)(),
699
+ id: (0, import_crypto6.randomUUID)(),
618
700
  assertionId,
619
701
  assertionType: "llm_judge",
620
702
  assertionName: "LLM judge",
@@ -625,20 +707,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
625
707
  };
626
708
  }
627
709
  }
628
- async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature) {
710
+ /**
711
+ * Resolve the LanguageModel to use: context.model (injected mock/override)
712
+ * takes precedence, otherwise create from llmConfig + modelId.
713
+ */
714
+ resolveModel(context, modelId) {
715
+ if (context?.model) {
716
+ return context.model;
717
+ }
718
+ if (!modelId || !context?.llmConfig) {
719
+ return null;
720
+ }
629
721
  const anthropic = (0, import_anthropic.createAnthropic)({
630
- baseURL: llmConfig.baseUrl,
722
+ baseURL: context.llmConfig.baseUrl,
631
723
  apiKey: "dummy",
632
- headers: llmConfig.headers
724
+ headers: context.llmConfig.headers
633
725
  });
634
- const result = await (0, import_ai.generateText)({
635
- model: anthropic(modelId),
726
+ return anthropic(modelId);
727
+ }
728
+ async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
729
+ const baseOptions = {
730
+ model,
636
731
  prompt,
637
732
  system,
638
733
  maxOutputTokens,
639
- temperature
640
- });
641
- return { text: result.text };
734
+ temperature,
735
+ output: import_ai2.Output.object({ schema: JudgeResultSchema }),
736
+ stopWhen: (0, import_ai2.stepCountIs)(MAX_JUDGE_STEPS)
737
+ };
738
+ const { output } = workDir ? await (0, import_ai2.generateText)({
739
+ ...baseOptions,
740
+ tools: { read_file: createReadFileTool(workDir) }
741
+ }) : await (0, import_ai2.generateText)(baseOptions);
742
+ return output;
642
743
  }
643
744
  };
644
745
 
@@ -646,6 +747,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
646
747
  var llmJudgeEvaluator = new LlmJudgeEvaluator();
647
748
  var evaluators = {
648
749
  skill_was_called: new SkillWasCalledEvaluator(),
750
+ tool_called_with_param: new ToolCalledWithParamEvaluator(),
649
751
  build_passed: new BuildPassedEvaluator(),
650
752
  time_limit: new TimeEvaluator(),
651
753
  cost: new CostEvaluator(),
@@ -668,8 +770,8 @@ async function evaluateAssertions(input, assertions, context) {
668
770
  const evaluator = evaluators[assertion.type];
669
771
  if (!evaluator) {
670
772
  return {
671
- id: (0, import_crypto6.randomUUID)(),
672
- assertionId: (0, import_crypto6.randomUUID)(),
773
+ id: (0, import_crypto7.randomUUID)(),
774
+ assertionId: (0, import_crypto7.randomUUID)(),
673
775
  assertionType: assertion.type,
674
776
  assertionName: "Unknown assertion",
675
777
  status: "error" /* ERROR */,
@@ -694,6 +796,7 @@ async function evaluateAssertions(input, assertions, context) {
694
796
  BuildPassedEvaluator,
695
797
  CostAssertionSchema,
696
798
  CostEvaluator,
799
+ JudgeResultSchema,
697
800
  LLMBreakdownStatsSchema,
698
801
  LLMStepType,
699
802
  LLMTraceSchema,
@@ -706,6 +809,9 @@ async function evaluateAssertions(input, assertions, context) {
706
809
  TimeAssertionSchema,
707
810
  TimeEvaluator,
708
811
  TokenUsageSchema,
812
+ ToolCalledWithParamAssertionSchema,
813
+ ToolCalledWithParamEvaluator,
814
+ createReadFileTool,
709
815
  evaluateAssertions,
710
816
  formatTraceForJudge,
711
817
  getEvaluator,