@wix/eval-assertions 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/build/index.js +218 -112
- package/build/index.js.map +4 -4
- package/build/index.mjs +209 -111
- package/build/index.mjs.map +4 -4
- package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
- package/build/types/evaluators/index.d.ts +3 -2
- package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
- package/build/types/evaluators/tool-called-with-param-evaluator.d.ts +12 -0
- package/build/types/index.d.ts +3 -2
- package/build/types/tools/index.d.ts +1 -0
- package/build/types/tools/read-file-tool.d.ts +10 -0
- package/build/types/types/assertions.d.ts +14 -0
- package/build/types/types/index.d.ts +1 -1
- package/package.json +4 -3
package/build/index.mjs
CHANGED
|
@@ -5,6 +5,13 @@ var SkillWasCalledAssertionSchema = z.object({
|
|
|
5
5
|
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
6
6
|
skillNames: z.array(z.string()).min(1)
|
|
7
7
|
});
|
|
8
|
+
var ToolCalledWithParamAssertionSchema = z.object({
|
|
9
|
+
type: z.literal("tool_called_with_param"),
|
|
10
|
+
/** Name of the tool that must have been called */
|
|
11
|
+
toolName: z.string().min(1),
|
|
12
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
13
|
+
expectedParams: z.string().min(1)
|
|
14
|
+
});
|
|
8
15
|
var BuildPassedAssertionSchema = z.object({
|
|
9
16
|
type: z.literal("build_passed"),
|
|
10
17
|
/** Command to run (default: "yarn build") */
|
|
@@ -37,6 +44,7 @@ var TimeAssertionSchema = z.object({
|
|
|
37
44
|
});
|
|
38
45
|
var AssertionSchema = z.union([
|
|
39
46
|
SkillWasCalledAssertionSchema,
|
|
47
|
+
ToolCalledWithParamAssertionSchema,
|
|
40
48
|
BuildPassedAssertionSchema,
|
|
41
49
|
TimeAssertionSchema,
|
|
42
50
|
CostAssertionSchema,
|
|
@@ -119,7 +127,7 @@ var AssertionResultSchema = z3.object({
|
|
|
119
127
|
});
|
|
120
128
|
|
|
121
129
|
// src/evaluators/index.ts
|
|
122
|
-
import { randomUUID as
|
|
130
|
+
import { randomUUID as randomUUID7 } from "crypto";
|
|
123
131
|
|
|
124
132
|
// src/evaluators/skill-was-called-evaluator.ts
|
|
125
133
|
import { randomUUID } from "crypto";
|
|
@@ -198,15 +206,79 @@ var SkillWasCalledEvaluator = class extends AssertionEvaluator {
|
|
|
198
206
|
}
|
|
199
207
|
};
|
|
200
208
|
|
|
201
|
-
// src/evaluators/
|
|
209
|
+
// src/evaluators/tool-called-with-param-evaluator.ts
|
|
202
210
|
import { randomUUID as randomUUID2 } from "crypto";
|
|
211
|
+
var ASSERTION_TYPE = "tool_called_with_param";
|
|
212
|
+
var ASSERTION_NAME = "Tool called with param";
|
|
213
|
+
/**
 * Checks whether a tool call's arguments satisfy every expected parameter.
 *
 * Matching is a substring test on text: string values are compared as-is and
 * every other value is compared through its JSON serialisation. Object and
 * array expectations are therefore sensitive to key order and must appear in
 * the actual value's JSON exactly as `JSON.stringify` would render them.
 *
 * @param {object} args
 * @param {*} args.actual - Parsed arguments of one tool call.
 * @param {object} args.expected - Expected parameters, keyed by name.
 * @returns {boolean} `true` when every expected key is an own property of
 *   `actual` with a non-null value whose text contains the expected text.
 *   An empty `expected` matches anything.
 */
var containsAll = ({
  actual,
  expected
}) => {
  // Strings are compared verbatim; everything else via its JSON form so that
  // nested objects/arrays are comparable on both sides.
  const toText = (value) => typeof value === "string" ? value : JSON.stringify(value);
  return Object.entries(expected).every(([key, val]) => {
    // Only own properties count: looking up e.g. "constructor" through the
    // prototype chain would yield a function that cannot be serialised.
    if (actual === null || actual === void 0 || !Object.hasOwn(actual, key)) return false;
    const actualVal = actual[key];
    if (actualVal === null || actualVal === void 0) return false;
    const actualStr = toText(actualVal);
    // JSON.stringify returns undefined for non-serialisable values.
    if (typeof actualStr !== "string") return false;
    // Primitives keep their String() form; objects/arrays use JSON so they
    // line up with how the actual value was serialised above.
    const expectedStr = typeof val === "object" && val !== null ? JSON.stringify(val) : String(val);
    return actualStr.includes(expectedStr);
  });
};
|
|
222
|
+
/**
 * Evaluator for `tool_called_with_param` assertions: passes when at least one
 * trace step for the named tool carries arguments that satisfy every expected
 * parameter (see `containsAll` for the matching rule).
 */
var ToolCalledWithParamEvaluator = class extends AssertionEvaluator {
  type = ASSERTION_TYPE;
  /**
   * @param {object} assertion - Carries `toolName` and `expectedParams`
   *   (a JSON string that must decode to an object of key/value pairs).
   * @param {object} input - Run data; only `input.llmTrace?.steps` is read.
   * @param {*} _context - Unused.
   * @returns {object} An assertion result with status "passed" or "failed".
   */
  evaluate(assertion, input, _context) {
    const assertionId = randomUUID2();
    const { toolName, expectedParams: expectedParamsStr } = assertion;
    const buildResult = (status, message, expected2, actual) => ({
      id: randomUUID2(),
      assertionId,
      assertionType: ASSERTION_TYPE,
      assertionName: ASSERTION_NAME,
      status,
      message,
      expected: expected2,
      // `actual` is omitted entirely (rather than set to undefined) when absent.
      ...actual !== void 0 ? { actual } : {}
    });
    const invalidParamsResult = () => buildResult(
      "failed" /* FAILED */,
      `Tool "${toolName}" assertion has invalid expected params JSON`,
      `${toolName}(invalid expected params)`,
      "Invalid expected params JSON"
    );
    let expected;
    try {
      expected = JSON.parse(expectedParamsStr);
    } catch {
      return invalidParamsResult();
    }
    // Valid JSON that is not a key/value object must be rejected: `null`
    // would make Object.entries throw, and primitives/arrays have no named
    // params, which would let the assertion pass vacuously on any call.
    if (expected === null || typeof expected !== "object" || Array.isArray(expected)) {
      return invalidParamsResult();
    }
    // Nested values are shown as JSON instead of "[object Object]".
    const formatValue = (v) => typeof v === "object" && v !== null ? JSON.stringify(v) : String(v);
    const expectedLabel = `${toolName}(${Object.entries(expected).map(([k, v]) => `${k}="${formatValue(v)}"`).join(", ")})`;
    const steps = input.llmTrace?.steps ?? [];
    // Steps whose arguments are missing or not parseable as JSON are ignored.
    const toolCalls = steps.filter((s) => s.toolName === toolName && s.toolArguments !== void 0).map((s) => {
      try {
        return JSON.parse(s.toolArguments);
      } catch {
        return null;
      }
    }).filter((call) => call !== null);
    if (toolCalls.some((actual) => containsAll({ actual, expected }))) {
      return buildResult(
        "passed" /* PASSED */,
        `Tool "${toolName}" was called with params matching ${expectedParamsStr}`,
        expectedLabel
      );
    }
    return buildResult(
      "failed" /* FAILED */,
      `Tool "${toolName}" was never called with params matching ${expectedParamsStr}`,
      expectedLabel,
      toolCalls.length > 0 ? `Found ${toolName} calls but params didn't match` : `No matching tool calls found`
    );
  }
};
|
|
272
|
+
|
|
273
|
+
// src/evaluators/build-passed-evaluator.ts
|
|
274
|
+
import { randomUUID as randomUUID3 } from "crypto";
|
|
203
275
|
import { execSync } from "child_process";
|
|
204
276
|
var DEFAULT_COMMAND = "yarn build";
|
|
205
277
|
var DEFAULT_EXIT_CODE = 0;
|
|
206
278
|
var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
207
279
|
type = "build_passed";
|
|
208
280
|
evaluate(assertion, _input, context) {
|
|
209
|
-
const assertionId =
|
|
281
|
+
const assertionId = randomUUID3();
|
|
210
282
|
const workDir = context?.workDir;
|
|
211
283
|
const command = assertion.command ?? DEFAULT_COMMAND;
|
|
212
284
|
const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;
|
|
@@ -254,7 +326,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
|
254
326
|
}
|
|
255
327
|
createResult(assertionId, fields) {
|
|
256
328
|
return {
|
|
257
|
-
id:
|
|
329
|
+
id: randomUUID3(),
|
|
258
330
|
assertionId,
|
|
259
331
|
assertionType: "build_passed",
|
|
260
332
|
assertionName: "Build passed",
|
|
@@ -279,7 +351,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
|
279
351
|
};
|
|
280
352
|
|
|
281
353
|
// src/evaluators/time-evaluator.ts
|
|
282
|
-
import { randomUUID as
|
|
354
|
+
import { randomUUID as randomUUID4 } from "crypto";
|
|
283
355
|
var TimeEvaluator = class extends AssertionEvaluator {
|
|
284
356
|
type = "time_limit";
|
|
285
357
|
evaluate(assertion, input) {
|
|
@@ -301,8 +373,8 @@ var TimeEvaluator = class extends AssertionEvaluator {
|
|
|
301
373
|
}
|
|
302
374
|
createResult(fields) {
|
|
303
375
|
return {
|
|
304
|
-
id:
|
|
305
|
-
assertionId:
|
|
376
|
+
id: randomUUID4(),
|
|
377
|
+
assertionId: randomUUID4(),
|
|
306
378
|
assertionType: "time_limit",
|
|
307
379
|
assertionName: "Time limit",
|
|
308
380
|
status: "failed" /* FAILED */,
|
|
@@ -312,12 +384,12 @@ var TimeEvaluator = class extends AssertionEvaluator {
|
|
|
312
384
|
};
|
|
313
385
|
|
|
314
386
|
// src/evaluators/cost-evaluator.ts
|
|
315
|
-
import { randomUUID as
|
|
387
|
+
import { randomUUID as randomUUID5 } from "crypto";
|
|
316
388
|
var CostEvaluator = class extends AssertionEvaluator {
|
|
317
389
|
type = "cost";
|
|
318
390
|
evaluate(assertion, input) {
|
|
319
|
-
const assertionId =
|
|
320
|
-
const id =
|
|
391
|
+
const assertionId = randomUUID5();
|
|
392
|
+
const id = randomUUID5();
|
|
321
393
|
const assertionName = "Cost";
|
|
322
394
|
const assertionType = "cost";
|
|
323
395
|
const maxCostUsd = assertion.maxCostUsd;
|
|
@@ -349,10 +421,54 @@ var CostEvaluator = class extends AssertionEvaluator {
|
|
|
349
421
|
}
|
|
350
422
|
};
|
|
351
423
|
|
|
424
|
+
// src/tools/read-file-tool.ts
|
|
425
|
+
import { tool } from "ai";
|
|
426
|
+
import { z as z4 } from "zod";
|
|
427
|
+
import { readFile } from "fs/promises";
|
|
428
|
+
import path from "path";
|
|
429
|
+
/**
 * Creates a file-reading tool (built with `tool()`) that is confined to one
 * workspace directory.
 *
 * @param {string} workDir - Workspace root; resolved to an absolute path once.
 * @returns The tool definition. Its `execute` resolves to
 *   `{ path, content }` on success or `{ error }` on failure; it does not
 *   throw for denied or unreadable paths.
 */
function createReadFileTool(workDir) {
  const resolvedWorkDir = path.resolve(workDir);
  return tool({
    description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
    inputSchema: z4.object({
      path: z4.string().describe("Relative file path in the workspace")
    }),
    execute: async ({
      path: filePath
    }) => {
      const resolved = path.resolve(resolvedWorkDir, filePath);
      // Compare via path.relative rather than a string prefix: a prefix test
      // against `workDir + sep` rejects everything when the workspace is a
      // filesystem root. An empty relative path is the workspace directory
      // itself, which is not a readable file and stays denied.
      const relative = path.relative(resolvedWorkDir, resolved);
      const escapesWorkspace = relative === "" || relative === ".." || relative.startsWith(".." + path.sep) || path.isAbsolute(relative);
      if (escapesWorkspace) {
        return { error: `Access denied: path escapes workspace directory` };
      }
      // NOTE(review): this check is lexical only. A symlink inside the
      // workspace that points outside it is still followed by readFile;
      // resolving real paths first would close that gap.
      try {
        const content = await readFile(resolved, "utf-8");
        return { path: filePath, content };
      } catch (err) {
        // Only a missing path is reported as "not found"; other failures
        // (directory, permissions, ...) are surfaced with their error code.
        const code = err !== null && typeof err === "object" ? err.code : void 0;
        if (code === void 0 || code === "ENOENT" || code === "ENOTDIR") {
          return { error: `File not found: ${filePath}` };
        }
        return { error: `Failed to read file: ${filePath} (${code})` };
      }
    }
  });
}
|
|
452
|
+
|
|
352
453
|
// src/evaluators/llm-judge-evaluator.ts
|
|
353
|
-
import { randomUUID as
|
|
454
|
+
import { randomUUID as randomUUID6 } from "crypto";
|
|
354
455
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
355
|
-
import {
|
|
456
|
+
import {
|
|
457
|
+
generateText,
|
|
458
|
+
Output,
|
|
459
|
+
APICallError,
|
|
460
|
+
NoObjectGeneratedError,
|
|
461
|
+
stepCountIs
|
|
462
|
+
} from "ai";
|
|
463
|
+
import { z as z5 } from "zod";
|
|
464
|
+
var JudgeResultSchema = z5.object({
|
|
465
|
+
text: z5.string().describe("A brief textual verdict of the test result"),
|
|
466
|
+
score: z5.number().min(0).max(100).describe(
|
|
467
|
+
"A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
|
|
468
|
+
),
|
|
469
|
+
scoreReasoning: z5.string().describe("A concise explanation justifying the assigned score")
|
|
470
|
+
});
|
|
471
|
+
var MAX_JUDGE_STEPS = 20;
|
|
356
472
|
function formatTraceForJudge(llmTrace) {
|
|
357
473
|
if (!llmTrace?.steps?.length) {
|
|
358
474
|
return "No trace available.";
|
|
@@ -423,40 +539,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
|
|
|
423
539
|
- {{newFiles}}: list of new files that were created (or "No new files were created")
|
|
424
540
|
- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
|
|
425
541
|
|
|
426
|
-
|
|
427
|
-
var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
|
|
428
|
-
|
|
429
|
-
{
|
|
430
|
-
"text": string,
|
|
431
|
-
"score": number (0-100),
|
|
432
|
-
"scoreReasoning": string
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
- text: A brief textual verdict of the test result.
|
|
436
|
-
- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
|
|
437
|
-
- scoreReasoning: A concise explanation justifying the assigned score.
|
|
542
|
+
You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
|
|
438
543
|
|
|
439
|
-
|
|
440
|
-
- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
|
|
441
|
-
- Be valid and parseable by \`JSON.parse\`.
|
|
442
|
-
- Use only double quotes for all keys and strings, as required by JSON.
|
|
443
|
-
|
|
444
|
-
Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
|
|
544
|
+
CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
|
|
445
545
|
var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
446
546
|
type = "llm_judge";
|
|
447
547
|
async evaluate(assertion, input, context) {
|
|
448
|
-
const assertionId =
|
|
449
|
-
const llmConfig = context?.llmConfig;
|
|
548
|
+
const assertionId = randomUUID6();
|
|
450
549
|
const workDir = context?.workDir ?? "";
|
|
451
|
-
const generateTextStub = context?.generateTextForLlmJudge;
|
|
452
550
|
const output = input.outputText ?? "";
|
|
453
551
|
const fileDiffs = input.fileDiffs ?? [];
|
|
454
552
|
const changedPaths = fileDiffs.map((d) => d.path);
|
|
455
553
|
const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
|
|
456
554
|
const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
|
|
457
|
-
const changedFiles = changedPaths.length > 0 ? changedPaths.map((
|
|
458
|
-
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((
|
|
459
|
-
const newFiles = newPaths.length > 0 ? newPaths.map((
|
|
555
|
+
const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
|
|
556
|
+
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
|
|
557
|
+
const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
|
|
460
558
|
const trace = formatTraceForJudge(input.llmTrace);
|
|
461
559
|
const ctx = {
|
|
462
560
|
output,
|
|
@@ -468,92 +566,68 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
468
566
|
};
|
|
469
567
|
const replace = (s) => replacePlaceholders(s, ctx);
|
|
470
568
|
const finalPrompt = replace(assertion.prompt);
|
|
471
|
-
const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
|
|
472
569
|
const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
|
|
473
570
|
const maxOutputTokens = assertion.maxTokens ?? 1024;
|
|
474
571
|
const temperature = assertion.temperature ?? 0;
|
|
475
|
-
const
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
assertionId,
|
|
480
|
-
assertionType: "llm_judge",
|
|
481
|
-
assertionName: "LLM judge",
|
|
482
|
-
status: "failed" /* FAILED */,
|
|
483
|
-
message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
|
|
484
|
-
expected: String(minScore)
|
|
485
|
-
};
|
|
486
|
-
}
|
|
487
|
-
if (!generateTextStub && !llmConfig) {
|
|
572
|
+
const modelId = assertion.model ?? context?.defaultJudgeModel;
|
|
573
|
+
const model = this.resolveModel(context, modelId);
|
|
574
|
+
if (!model) {
|
|
575
|
+
const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
|
|
488
576
|
return {
|
|
489
|
-
id:
|
|
577
|
+
id: randomUUID6(),
|
|
490
578
|
assertionId,
|
|
491
579
|
assertionType: "llm_judge",
|
|
492
580
|
assertionName: "LLM judge",
|
|
493
581
|
status: "failed" /* FAILED */,
|
|
494
|
-
message:
|
|
582
|
+
message: reason,
|
|
495
583
|
expected: String(minScore)
|
|
496
584
|
};
|
|
497
585
|
}
|
|
498
|
-
const
|
|
499
|
-
let lastParseError;
|
|
500
|
-
let lastRawText;
|
|
586
|
+
const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
|
|
501
587
|
try {
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
finalPrompt,
|
|
512
|
-
systemPrompt,
|
|
513
|
-
maxOutputTokens,
|
|
514
|
-
temperature
|
|
515
|
-
);
|
|
516
|
-
lastRawText = result.text;
|
|
517
|
-
try {
|
|
518
|
-
const cleaned = stripMarkdownCodeBlock(result.text);
|
|
519
|
-
const parsed = JSON.parse(cleaned);
|
|
520
|
-
const judgeResult = validateJudgeResult(parsed);
|
|
521
|
-
const passed = judgeResult.score >= minScore;
|
|
522
|
-
return {
|
|
523
|
-
id: randomUUID5(),
|
|
524
|
-
assertionId,
|
|
525
|
-
assertionType: "llm_judge",
|
|
526
|
-
assertionName: "LLM judge",
|
|
527
|
-
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
528
|
-
message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
|
|
529
|
-
expected: String(minScore),
|
|
530
|
-
actual: String(judgeResult.score),
|
|
531
|
-
details: {
|
|
532
|
-
score: judgeResult.score,
|
|
533
|
-
scoreReasoning: judgeResult.scoreReasoning,
|
|
534
|
-
text: judgeResult.text
|
|
535
|
-
}
|
|
536
|
-
};
|
|
537
|
-
} catch (parseErr) {
|
|
538
|
-
lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
|
|
539
|
-
}
|
|
540
|
-
}
|
|
588
|
+
const judgeResult = await this.callGenerateText(
|
|
589
|
+
model,
|
|
590
|
+
finalPrompt,
|
|
591
|
+
systemPrompt,
|
|
592
|
+
maxOutputTokens,
|
|
593
|
+
temperature,
|
|
594
|
+
workDir || void 0
|
|
595
|
+
);
|
|
596
|
+
const passed = judgeResult.score >= minScore;
|
|
541
597
|
return {
|
|
542
|
-
id:
|
|
598
|
+
id: randomUUID6(),
|
|
543
599
|
assertionId,
|
|
544
600
|
assertionType: "llm_judge",
|
|
545
601
|
assertionName: "LLM judge",
|
|
546
|
-
status: "failed" /* FAILED */,
|
|
547
|
-
message:
|
|
602
|
+
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
603
|
+
message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
|
|
548
604
|
expected: String(minScore),
|
|
549
|
-
actual:
|
|
550
|
-
details: {
|
|
605
|
+
actual: String(judgeResult.score),
|
|
606
|
+
details: {
|
|
607
|
+
score: judgeResult.score,
|
|
608
|
+
scoreReasoning: judgeResult.scoreReasoning,
|
|
609
|
+
text: judgeResult.text
|
|
610
|
+
}
|
|
551
611
|
};
|
|
552
612
|
} catch (err) {
|
|
613
|
+
if (NoObjectGeneratedError.isInstance(err)) {
|
|
614
|
+
return {
|
|
615
|
+
id: randomUUID6(),
|
|
616
|
+
assertionId,
|
|
617
|
+
assertionType: "llm_judge",
|
|
618
|
+
assertionName: "LLM judge",
|
|
619
|
+
status: "failed" /* FAILED */,
|
|
620
|
+
message: "LLM judge failed to produce valid structured output",
|
|
621
|
+
expected: String(minScore),
|
|
622
|
+
details: {
|
|
623
|
+
rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
|
|
624
|
+
}
|
|
625
|
+
};
|
|
626
|
+
}
|
|
553
627
|
const message = err instanceof Error ? err.message : String(err);
|
|
554
628
|
const details = {
|
|
555
629
|
error: message,
|
|
556
|
-
model:
|
|
630
|
+
model: modelId
|
|
557
631
|
};
|
|
558
632
|
if (APICallError.isInstance(err)) {
|
|
559
633
|
details.statusCode = err.statusCode;
|
|
@@ -562,7 +636,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
562
636
|
details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
|
|
563
637
|
}
|
|
564
638
|
return {
|
|
565
|
-
id:
|
|
639
|
+
id: randomUUID6(),
|
|
566
640
|
assertionId,
|
|
567
641
|
assertionType: "llm_judge",
|
|
568
642
|
assertionName: "LLM judge",
|
|
@@ -573,20 +647,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
573
647
|
};
|
|
574
648
|
}
|
|
575
649
|
}
|
|
576
|
-
|
|
650
|
+
/**
|
|
651
|
+
* Resolve the LanguageModel to use: context.model (injected mock/override)
|
|
652
|
+
* takes precedence, otherwise create from llmConfig + modelId.
|
|
653
|
+
*/
|
|
654
|
+
resolveModel(context, modelId) {
|
|
655
|
+
if (context?.model) {
|
|
656
|
+
return context.model;
|
|
657
|
+
}
|
|
658
|
+
if (!modelId || !context?.llmConfig) {
|
|
659
|
+
return null;
|
|
660
|
+
}
|
|
577
661
|
const anthropic = createAnthropic({
|
|
578
|
-
baseURL: llmConfig.baseUrl,
|
|
662
|
+
baseURL: context.llmConfig.baseUrl,
|
|
579
663
|
apiKey: "dummy",
|
|
580
|
-
headers: llmConfig.headers
|
|
664
|
+
headers: context.llmConfig.headers
|
|
581
665
|
});
|
|
582
|
-
|
|
583
|
-
|
|
666
|
+
return anthropic(modelId);
|
|
667
|
+
}
|
|
668
|
+
async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
|
|
669
|
+
const baseOptions = {
|
|
670
|
+
model,
|
|
584
671
|
prompt,
|
|
585
672
|
system,
|
|
586
673
|
maxOutputTokens,
|
|
587
|
-
temperature
|
|
588
|
-
|
|
589
|
-
|
|
674
|
+
temperature,
|
|
675
|
+
output: Output.object({ schema: JudgeResultSchema }),
|
|
676
|
+
stopWhen: stepCountIs(MAX_JUDGE_STEPS)
|
|
677
|
+
};
|
|
678
|
+
const { output } = workDir ? await generateText({
|
|
679
|
+
...baseOptions,
|
|
680
|
+
tools: { read_file: createReadFileTool(workDir) }
|
|
681
|
+
}) : await generateText(baseOptions);
|
|
682
|
+
return output;
|
|
590
683
|
}
|
|
591
684
|
};
|
|
592
685
|
|
|
@@ -594,6 +687,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
594
687
|
var llmJudgeEvaluator = new LlmJudgeEvaluator();
|
|
595
688
|
var evaluators = {
|
|
596
689
|
skill_was_called: new SkillWasCalledEvaluator(),
|
|
690
|
+
tool_called_with_param: new ToolCalledWithParamEvaluator(),
|
|
597
691
|
build_passed: new BuildPassedEvaluator(),
|
|
598
692
|
time_limit: new TimeEvaluator(),
|
|
599
693
|
cost: new CostEvaluator(),
|
|
@@ -616,8 +710,8 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
616
710
|
const evaluator = evaluators[assertion.type];
|
|
617
711
|
if (!evaluator) {
|
|
618
712
|
return {
|
|
619
|
-
id:
|
|
620
|
-
assertionId:
|
|
713
|
+
id: randomUUID7(),
|
|
714
|
+
assertionId: randomUUID7(),
|
|
621
715
|
assertionType: assertion.type,
|
|
622
716
|
assertionName: "Unknown assertion",
|
|
623
717
|
status: "error" /* ERROR */,
|
|
@@ -641,6 +735,7 @@ export {
|
|
|
641
735
|
BuildPassedEvaluator,
|
|
642
736
|
CostAssertionSchema,
|
|
643
737
|
CostEvaluator,
|
|
738
|
+
JudgeResultSchema,
|
|
644
739
|
LLMBreakdownStatsSchema,
|
|
645
740
|
LLMStepType,
|
|
646
741
|
LLMTraceSchema,
|
|
@@ -653,6 +748,9 @@ export {
|
|
|
653
748
|
TimeAssertionSchema,
|
|
654
749
|
TimeEvaluator,
|
|
655
750
|
TokenUsageSchema,
|
|
751
|
+
ToolCalledWithParamAssertionSchema,
|
|
752
|
+
ToolCalledWithParamEvaluator,
|
|
753
|
+
createReadFileTool,
|
|
656
754
|
evaluateAssertions,
|
|
657
755
|
formatTraceForJudge,
|
|
658
756
|
getEvaluator,
|