@wix/eval-assertions 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/build/index.js +218 -112
- package/build/index.js.map +4 -4
- package/build/index.mjs +209 -111
- package/build/index.mjs.map +4 -4
- package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
- package/build/types/evaluators/index.d.ts +3 -2
- package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
- package/build/types/evaluators/tool-called-with-param-evaluator.d.ts +12 -0
- package/build/types/index.d.ts +3 -2
- package/build/types/tools/index.d.ts +1 -0
- package/build/types/tools/read-file-tool.d.ts +10 -0
- package/build/types/types/assertions.d.ts +14 -0
- package/build/types/types/index.d.ts +1 -1
- package/package.json +4 -3
package/README.md
CHANGED
|
@@ -149,11 +149,12 @@ Optional context for assertions:
|
|
|
149
149
|
```typescript
|
|
150
150
|
interface AssertionContext {
|
|
151
151
|
workDir?: string; // For build_passed
|
|
152
|
-
llmConfig?: { // For llm_judge
|
|
152
|
+
llmConfig?: { // For llm_judge
|
|
153
153
|
baseUrl: string;
|
|
154
154
|
headers: Record<string, string>;
|
|
155
155
|
};
|
|
156
|
-
|
|
156
|
+
defaultJudgeModel?: string; // Default model for llm_judge
|
|
157
|
+
model?: LanguageModel; // Override model
|
|
157
158
|
}
|
|
158
159
|
```
|
|
159
160
|
|
package/build/index.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
2
3
|
var __defProp = Object.defineProperty;
|
|
3
4
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
5
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
8
|
var __export = (target, all) => {
|
|
7
9
|
for (var name in all)
|
|
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
15
17
|
}
|
|
16
18
|
return to;
|
|
17
19
|
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
18
28
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
29
|
|
|
20
30
|
// src/index.ts
|
|
@@ -28,6 +38,7 @@ __export(index_exports, {
|
|
|
28
38
|
BuildPassedEvaluator: () => BuildPassedEvaluator,
|
|
29
39
|
CostAssertionSchema: () => CostAssertionSchema,
|
|
30
40
|
CostEvaluator: () => CostEvaluator,
|
|
41
|
+
JudgeResultSchema: () => JudgeResultSchema,
|
|
31
42
|
LLMBreakdownStatsSchema: () => LLMBreakdownStatsSchema,
|
|
32
43
|
LLMStepType: () => LLMStepType,
|
|
33
44
|
LLMTraceSchema: () => LLMTraceSchema,
|
|
@@ -40,6 +51,9 @@ __export(index_exports, {
|
|
|
40
51
|
TimeAssertionSchema: () => TimeAssertionSchema,
|
|
41
52
|
TimeEvaluator: () => TimeEvaluator,
|
|
42
53
|
TokenUsageSchema: () => TokenUsageSchema,
|
|
54
|
+
ToolCalledWithParamAssertionSchema: () => ToolCalledWithParamAssertionSchema,
|
|
55
|
+
ToolCalledWithParamEvaluator: () => ToolCalledWithParamEvaluator,
|
|
56
|
+
createReadFileTool: () => createReadFileTool,
|
|
43
57
|
evaluateAssertions: () => evaluateAssertions,
|
|
44
58
|
formatTraceForJudge: () => formatTraceForJudge,
|
|
45
59
|
getEvaluator: () => getEvaluator,
|
|
@@ -57,6 +71,13 @@ var SkillWasCalledAssertionSchema = import_zod.z.object({
|
|
|
57
71
|
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
58
72
|
skillNames: import_zod.z.array(import_zod.z.string()).min(1)
|
|
59
73
|
});
|
|
74
|
+
var ToolCalledWithParamAssertionSchema = import_zod.z.object({
|
|
75
|
+
type: import_zod.z.literal("tool_called_with_param"),
|
|
76
|
+
/** Name of the tool that must have been called */
|
|
77
|
+
toolName: import_zod.z.string().min(1),
|
|
78
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
79
|
+
expectedParams: import_zod.z.string().min(1)
|
|
80
|
+
});
|
|
60
81
|
var BuildPassedAssertionSchema = import_zod.z.object({
|
|
61
82
|
type: import_zod.z.literal("build_passed"),
|
|
62
83
|
/** Command to run (default: "yarn build") */
|
|
@@ -89,6 +110,7 @@ var TimeAssertionSchema = import_zod.z.object({
|
|
|
89
110
|
});
|
|
90
111
|
var AssertionSchema = import_zod.z.union([
|
|
91
112
|
SkillWasCalledAssertionSchema,
|
|
113
|
+
ToolCalledWithParamAssertionSchema,
|
|
92
114
|
BuildPassedAssertionSchema,
|
|
93
115
|
TimeAssertionSchema,
|
|
94
116
|
CostAssertionSchema,
|
|
@@ -171,7 +193,7 @@ var AssertionResultSchema = import_zod3.z.object({
|
|
|
171
193
|
});
|
|
172
194
|
|
|
173
195
|
// src/evaluators/index.ts
|
|
174
|
-
var
|
|
196
|
+
var import_crypto7 = require("crypto");
|
|
175
197
|
|
|
176
198
|
// src/evaluators/skill-was-called-evaluator.ts
|
|
177
199
|
var import_crypto = require("crypto");
|
|
@@ -250,15 +272,79 @@ var SkillWasCalledEvaluator = class extends AssertionEvaluator {
|
|
|
250
272
|
}
|
|
251
273
|
};
|
|
252
274
|
|
|
253
|
-
// src/evaluators/
|
|
275
|
+
// src/evaluators/tool-called-with-param-evaluator.ts
|
|
254
276
|
var import_crypto2 = require("crypto");
|
|
277
|
+
var ASSERTION_TYPE = "tool_called_with_param";
|
|
278
|
+
var ASSERTION_NAME = "Tool called with param";
|
|
279
|
+
/**
 * Checks whether one recorded tool call carries every expected parameter.
 *
 * Matching is a substring match per key: the actual value (used as-is when it
 * is a string, otherwise serialized with JSON.stringify) must contain the
 * expected value.
 *
 * @param {{actual: unknown, expected: Record<string, unknown>}} args
 *   actual   - the parsed arguments of a single tool call
 *   expected - key/value pairs that must all be present in `actual`
 * @returns {boolean} true when every expected key matches; an empty
 *   `expected` matches any call.
 */
var containsAll = ({
  actual,
  expected
}) => Object.entries(expected).every(([key, val]) => {
  if (actual === null || actual === void 0) return false;
  // Only own properties count as parameters. Looking a key such as "toString"
  // up through the prototype chain yields a function, which JSON.stringify
  // turns into undefined.
  if (!Object.prototype.hasOwnProperty.call(actual, key)) return false;
  const actualVal = actual[key];
  if (actualVal === null || actualVal === void 0) return false;
  const actualStr = typeof actualVal === "string" ? actualVal : JSON.stringify(actualVal);
  // JSON.stringify returns undefined for values that have no JSON form.
  if (typeof actualStr !== "string") return false;
  if (actualStr.includes(String(val))) return true;
  // String(object) is "[object Object]", which can never match serialized
  // arguments, so structured expectations are also compared in JSON form.
  return typeof val === "object" && val !== null && actualStr.includes(JSON.stringify(val));
});
|
|
288
|
+
/**
 * Evaluator for "tool_called_with_param" assertions. The assertion passes
 * when the trace contains at least one call to the named tool whose parsed
 * arguments satisfy `containsAll` for every expected parameter.
 */
var ToolCalledWithParamEvaluator = class extends AssertionEvaluator {
  type = ASSERTION_TYPE;
  /**
   * @param {{toolName: string, expectedParams: string}} assertion
   *   `expectedParams` is a JSON string that must decode to an object of
   *   key/value pairs.
   * @param {object} input Run data; only `input.llmTrace?.steps` is read.
   *   Each step's `toolArguments` is parsed with JSON.parse.
   * @param {unknown} _context Unused.
   * @returns {object} A result record with status "passed" or "failed".
   */
  evaluate(assertion, input, _context) {
    const assertionId = (0, import_crypto2.randomUUID)();
    const { toolName, expectedParams: expectedParamsStr } = assertion;
    const buildResult = (status, message, expected2, actual) => ({
      id: (0, import_crypto2.randomUUID)(),
      assertionId,
      assertionType: ASSERTION_TYPE,
      assertionName: ASSERTION_NAME,
      status,
      message,
      expected: expected2,
      ...actual !== void 0 ? { actual } : {}
    });
    const invalidExpectedParams = () => buildResult(
      "failed" /* FAILED */,
      `Tool "${toolName}" assertion has invalid expected params JSON`,
      `${toolName}(invalid expected params)`,
      "Invalid expected params JSON"
    );
    let expected;
    try {
      expected = JSON.parse(expectedParamsStr);
    } catch {
      return invalidExpectedParams();
    }
    // Valid JSON is not enough: `null` would make Object.entries throw, and a
    // number or boolean yields no entries, so the assertion would pass
    // vacuously. Arrays are not key/value pairs either. Report all of these
    // as a failed assertion.
    if (expected === null || typeof expected !== "object" || Array.isArray(expected)) {
      return invalidExpectedParams();
    }
    const expectedLabel = `${toolName}(${Object.entries(expected).map(([k, v]) => `${k}="${v}"`).join(", ")})`;
    const steps = input.llmTrace?.steps ?? [];
    // Keep only calls to the requested tool whose arguments parse as JSON;
    // unparsable arguments are dropped rather than failing the evaluation.
    const toolCalls = steps.filter((s) => s.toolName === toolName && s.toolArguments !== void 0).map((s) => {
      try {
        return JSON.parse(s.toolArguments);
      } catch {
        return null;
      }
    }).filter((call) => call !== null);
    if (toolCalls.some((actual) => containsAll({ actual, expected }))) {
      return buildResult(
        "passed" /* PASSED */,
        `Tool "${toolName}" was called with params matching ${expectedParamsStr}`,
        expectedLabel
      );
    }
    return buildResult(
      "failed" /* FAILED */,
      `Tool "${toolName}" was never called with params matching ${expectedParamsStr}`,
      expectedLabel,
      toolCalls.length > 0 ? `Found ${toolName} calls but params didn't match` : `No matching tool calls found`
    );
  }
};
|
|
338
|
+
|
|
339
|
+
// src/evaluators/build-passed-evaluator.ts
|
|
340
|
+
var import_crypto3 = require("crypto");
|
|
255
341
|
var import_child_process = require("child_process");
|
|
256
342
|
var DEFAULT_COMMAND = "yarn build";
|
|
257
343
|
var DEFAULT_EXIT_CODE = 0;
|
|
258
344
|
var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
259
345
|
type = "build_passed";
|
|
260
346
|
evaluate(assertion, _input, context) {
|
|
261
|
-
const assertionId = (0,
|
|
347
|
+
const assertionId = (0, import_crypto3.randomUUID)();
|
|
262
348
|
const workDir = context?.workDir;
|
|
263
349
|
const command = assertion.command ?? DEFAULT_COMMAND;
|
|
264
350
|
const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;
|
|
@@ -306,7 +392,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
|
306
392
|
}
|
|
307
393
|
createResult(assertionId, fields) {
|
|
308
394
|
return {
|
|
309
|
-
id: (0,
|
|
395
|
+
id: (0, import_crypto3.randomUUID)(),
|
|
310
396
|
assertionId,
|
|
311
397
|
assertionType: "build_passed",
|
|
312
398
|
assertionName: "Build passed",
|
|
@@ -331,7 +417,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
|
331
417
|
};
|
|
332
418
|
|
|
333
419
|
// src/evaluators/time-evaluator.ts
|
|
334
|
-
var
|
|
420
|
+
var import_crypto4 = require("crypto");
|
|
335
421
|
var TimeEvaluator = class extends AssertionEvaluator {
|
|
336
422
|
type = "time_limit";
|
|
337
423
|
evaluate(assertion, input) {
|
|
@@ -353,8 +439,8 @@ var TimeEvaluator = class extends AssertionEvaluator {
|
|
|
353
439
|
}
|
|
354
440
|
createResult(fields) {
|
|
355
441
|
return {
|
|
356
|
-
id: (0,
|
|
357
|
-
assertionId: (0,
|
|
442
|
+
id: (0, import_crypto4.randomUUID)(),
|
|
443
|
+
assertionId: (0, import_crypto4.randomUUID)(),
|
|
358
444
|
assertionType: "time_limit",
|
|
359
445
|
assertionName: "Time limit",
|
|
360
446
|
status: "failed" /* FAILED */,
|
|
@@ -364,12 +450,12 @@ var TimeEvaluator = class extends AssertionEvaluator {
|
|
|
364
450
|
};
|
|
365
451
|
|
|
366
452
|
// src/evaluators/cost-evaluator.ts
|
|
367
|
-
var
|
|
453
|
+
var import_crypto5 = require("crypto");
|
|
368
454
|
var CostEvaluator = class extends AssertionEvaluator {
|
|
369
455
|
type = "cost";
|
|
370
456
|
evaluate(assertion, input) {
|
|
371
|
-
const assertionId = (0,
|
|
372
|
-
const id = (0,
|
|
457
|
+
const assertionId = (0, import_crypto5.randomUUID)();
|
|
458
|
+
const id = (0, import_crypto5.randomUUID)();
|
|
373
459
|
const assertionName = "Cost";
|
|
374
460
|
const assertionType = "cost";
|
|
375
461
|
const maxCostUsd = assertion.maxCostUsd;
|
|
@@ -401,10 +487,48 @@ var CostEvaluator = class extends AssertionEvaluator {
|
|
|
401
487
|
}
|
|
402
488
|
};
|
|
403
489
|
|
|
490
|
+
// src/tools/read-file-tool.ts
|
|
491
|
+
var import_ai = require("ai");
|
|
492
|
+
var import_zod4 = require("zod");
|
|
493
|
+
var import_promises = require("fs/promises");
|
|
494
|
+
var import_path = __toESM(require("path"));
|
|
495
|
+
/**
 * Builds a `read_file` tool that returns the UTF-8 content of a file inside
 * `workDir`.
 *
 * The requested path is resolved against `workDir` and rejected when it
 * points outside it. Containment is checked twice: once lexically, and once
 * after resolving symlinks, so a link inside the workspace cannot expose a
 * file that lives outside it.
 *
 * @param {string} workDir Workspace directory; relative values are resolved
 *   with path.resolve.
 * @returns The tool definition produced by `tool(...)`. `execute` resolves to
 *   `{ path, content }` on success or `{ error }` on failure; it does not
 *   throw for denied, missing or unreadable paths.
 */
function createReadFileTool(workDir) {
  const resolvedWorkDir = import_path.default.resolve(workDir);
  const accessDenied = () => ({ error: `Access denied: path escapes workspace directory` });
  // True when `candidate` lies strictly below `rootDir`. path.relative is used
  // instead of a string-prefix test, which breaks when rootDir is the
  // filesystem root (root + separator never prefixes anything).
  const isInside = (rootDir, candidate) => {
    const rel = import_path.default.relative(rootDir, candidate);
    return rel !== "" && rel !== ".." && !rel.startsWith(`..${import_path.default.sep}`) && !import_path.default.isAbsolute(rel);
  };
  return (0, import_ai.tool)({
    description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
    inputSchema: import_zod4.z.object({
      path: import_zod4.z.string().describe("Relative file path in the workspace")
    }),
    execute: async ({
      path: filePath
    }) => {
      const resolved = import_path.default.resolve(resolvedWorkDir, filePath);
      if (!isInside(resolvedWorkDir, resolved)) {
        return accessDenied();
      }
      try {
        // Resolve symlinks on both sides before the final containment check.
        const [realWorkDir, realTarget] = await Promise.all([
          (0, import_promises.realpath)(resolvedWorkDir),
          (0, import_promises.realpath)(resolved)
        ]);
        if (!isInside(realWorkDir, realTarget)) {
          return accessDenied();
        }
        const content = await (0, import_promises.readFile)(realTarget, "utf-8");
        return { path: filePath, content };
      } catch (err) {
        const code = err !== null && typeof err === "object" ? err.code : void 0;
        if (code === "ENOENT" || code === "ENOTDIR") {
          return { error: `File not found: ${filePath}` };
        }
        // Directories, permission problems and similar are not "not found".
        return { error: `Failed to read file: ${filePath}${code ? ` (${code})` : ""}` };
      }
    }
  });
}
|
|
518
|
+
|
|
404
519
|
// src/evaluators/llm-judge-evaluator.ts
|
|
405
|
-
var
|
|
520
|
+
var import_crypto6 = require("crypto");
|
|
406
521
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
407
|
-
var
|
|
522
|
+
var import_ai2 = require("ai");
|
|
523
|
+
var import_zod5 = require("zod");
|
|
524
|
+
var JudgeResultSchema = import_zod5.z.object({
|
|
525
|
+
text: import_zod5.z.string().describe("A brief textual verdict of the test result"),
|
|
526
|
+
score: import_zod5.z.number().min(0).max(100).describe(
|
|
527
|
+
"A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
|
|
528
|
+
),
|
|
529
|
+
scoreReasoning: import_zod5.z.string().describe("A concise explanation justifying the assigned score")
|
|
530
|
+
});
|
|
531
|
+
var MAX_JUDGE_STEPS = 20;
|
|
408
532
|
function formatTraceForJudge(llmTrace) {
|
|
409
533
|
if (!llmTrace?.steps?.length) {
|
|
410
534
|
return "No trace available.";
|
|
@@ -475,40 +599,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
|
|
|
475
599
|
- {{newFiles}}: list of new files that were created (or "No new files were created")
|
|
476
600
|
- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
|
|
477
601
|
|
|
478
|
-
|
|
479
|
-
var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
|
|
480
|
-
|
|
481
|
-
{
|
|
482
|
-
"text": string,
|
|
483
|
-
"score": number (0-100),
|
|
484
|
-
"scoreReasoning": string
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
- text: A brief textual verdict of the test result.
|
|
488
|
-
- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
|
|
489
|
-
- scoreReasoning: A concise explanation justifying the assigned score.
|
|
602
|
+
You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
|
|
490
603
|
|
|
491
|
-
|
|
492
|
-
- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
|
|
493
|
-
- Be valid and parseable by \`JSON.parse\`.
|
|
494
|
-
- Use only double quotes for all keys and strings, as required by JSON.
|
|
495
|
-
|
|
496
|
-
Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
|
|
604
|
+
CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
|
|
497
605
|
var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
498
606
|
type = "llm_judge";
|
|
499
607
|
async evaluate(assertion, input, context) {
|
|
500
|
-
const assertionId = (0,
|
|
501
|
-
const llmConfig = context?.llmConfig;
|
|
608
|
+
const assertionId = (0, import_crypto6.randomUUID)();
|
|
502
609
|
const workDir = context?.workDir ?? "";
|
|
503
|
-
const generateTextStub = context?.generateTextForLlmJudge;
|
|
504
610
|
const output = input.outputText ?? "";
|
|
505
611
|
const fileDiffs = input.fileDiffs ?? [];
|
|
506
612
|
const changedPaths = fileDiffs.map((d) => d.path);
|
|
507
613
|
const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
|
|
508
614
|
const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
|
|
509
|
-
const changedFiles = changedPaths.length > 0 ? changedPaths.map((
|
|
510
|
-
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((
|
|
511
|
-
const newFiles = newPaths.length > 0 ? newPaths.map((
|
|
615
|
+
const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
|
|
616
|
+
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
|
|
617
|
+
const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
|
|
512
618
|
const trace = formatTraceForJudge(input.llmTrace);
|
|
513
619
|
const ctx = {
|
|
514
620
|
output,
|
|
@@ -520,101 +626,77 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
520
626
|
};
|
|
521
627
|
const replace = (s) => replacePlaceholders(s, ctx);
|
|
522
628
|
const finalPrompt = replace(assertion.prompt);
|
|
523
|
-
const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
|
|
524
629
|
const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
|
|
525
630
|
const maxOutputTokens = assertion.maxTokens ?? 1024;
|
|
526
631
|
const temperature = assertion.temperature ?? 0;
|
|
527
|
-
const
|
|
528
|
-
|
|
632
|
+
const modelId = assertion.model ?? context?.defaultJudgeModel;
|
|
633
|
+
const model = this.resolveModel(context, modelId);
|
|
634
|
+
if (!model) {
|
|
635
|
+
const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
|
|
529
636
|
return {
|
|
530
|
-
id: (0,
|
|
637
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
531
638
|
assertionId,
|
|
532
639
|
assertionType: "llm_judge",
|
|
533
640
|
assertionName: "LLM judge",
|
|
534
641
|
status: "failed" /* FAILED */,
|
|
535
|
-
message:
|
|
642
|
+
message: reason,
|
|
536
643
|
expected: String(minScore)
|
|
537
644
|
};
|
|
538
645
|
}
|
|
539
|
-
|
|
540
|
-
return {
|
|
541
|
-
id: (0, import_crypto5.randomUUID)(),
|
|
542
|
-
assertionId,
|
|
543
|
-
assertionType: "llm_judge",
|
|
544
|
-
assertionName: "LLM judge",
|
|
545
|
-
status: "failed" /* FAILED */,
|
|
546
|
-
message: "No llmConfig for llm_judge assertion (AI gateway required)",
|
|
547
|
-
expected: String(minScore)
|
|
548
|
-
};
|
|
549
|
-
}
|
|
550
|
-
const maxParseAttempts = 3;
|
|
551
|
-
let lastParseError;
|
|
552
|
-
let lastRawText;
|
|
646
|
+
const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
|
|
553
647
|
try {
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
finalPrompt,
|
|
564
|
-
systemPrompt,
|
|
565
|
-
maxOutputTokens,
|
|
566
|
-
temperature
|
|
567
|
-
);
|
|
568
|
-
lastRawText = result.text;
|
|
569
|
-
try {
|
|
570
|
-
const cleaned = stripMarkdownCodeBlock(result.text);
|
|
571
|
-
const parsed = JSON.parse(cleaned);
|
|
572
|
-
const judgeResult = validateJudgeResult(parsed);
|
|
573
|
-
const passed = judgeResult.score >= minScore;
|
|
574
|
-
return {
|
|
575
|
-
id: (0, import_crypto5.randomUUID)(),
|
|
576
|
-
assertionId,
|
|
577
|
-
assertionType: "llm_judge",
|
|
578
|
-
assertionName: "LLM judge",
|
|
579
|
-
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
580
|
-
message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
|
|
581
|
-
expected: String(minScore),
|
|
582
|
-
actual: String(judgeResult.score),
|
|
583
|
-
details: {
|
|
584
|
-
score: judgeResult.score,
|
|
585
|
-
scoreReasoning: judgeResult.scoreReasoning,
|
|
586
|
-
text: judgeResult.text
|
|
587
|
-
}
|
|
588
|
-
};
|
|
589
|
-
} catch (parseErr) {
|
|
590
|
-
lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
|
|
591
|
-
}
|
|
592
|
-
}
|
|
648
|
+
const judgeResult = await this.callGenerateText(
|
|
649
|
+
model,
|
|
650
|
+
finalPrompt,
|
|
651
|
+
systemPrompt,
|
|
652
|
+
maxOutputTokens,
|
|
653
|
+
temperature,
|
|
654
|
+
workDir || void 0
|
|
655
|
+
);
|
|
656
|
+
const passed = judgeResult.score >= minScore;
|
|
593
657
|
return {
|
|
594
|
-
id: (0,
|
|
658
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
595
659
|
assertionId,
|
|
596
660
|
assertionType: "llm_judge",
|
|
597
661
|
assertionName: "LLM judge",
|
|
598
|
-
status: "failed" /* FAILED */,
|
|
599
|
-
message:
|
|
662
|
+
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
663
|
+
message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
|
|
600
664
|
expected: String(minScore),
|
|
601
|
-
actual:
|
|
602
|
-
details: {
|
|
665
|
+
actual: String(judgeResult.score),
|
|
666
|
+
details: {
|
|
667
|
+
score: judgeResult.score,
|
|
668
|
+
scoreReasoning: judgeResult.scoreReasoning,
|
|
669
|
+
text: judgeResult.text
|
|
670
|
+
}
|
|
603
671
|
};
|
|
604
672
|
} catch (err) {
|
|
673
|
+
if (import_ai2.NoObjectGeneratedError.isInstance(err)) {
|
|
674
|
+
return {
|
|
675
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
676
|
+
assertionId,
|
|
677
|
+
assertionType: "llm_judge",
|
|
678
|
+
assertionName: "LLM judge",
|
|
679
|
+
status: "failed" /* FAILED */,
|
|
680
|
+
message: "LLM judge failed to produce valid structured output",
|
|
681
|
+
expected: String(minScore),
|
|
682
|
+
details: {
|
|
683
|
+
rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
|
|
684
|
+
}
|
|
685
|
+
};
|
|
686
|
+
}
|
|
605
687
|
const message = err instanceof Error ? err.message : String(err);
|
|
606
688
|
const details = {
|
|
607
689
|
error: message,
|
|
608
|
-
model:
|
|
690
|
+
model: modelId
|
|
609
691
|
};
|
|
610
|
-
if (
|
|
692
|
+
if (import_ai2.APICallError.isInstance(err)) {
|
|
611
693
|
details.statusCode = err.statusCode;
|
|
612
694
|
details.url = err.url;
|
|
613
695
|
details.isRetryable = err.isRetryable;
|
|
614
696
|
details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
|
|
615
697
|
}
|
|
616
698
|
return {
|
|
617
|
-
id: (0,
|
|
699
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
618
700
|
assertionId,
|
|
619
701
|
assertionType: "llm_judge",
|
|
620
702
|
assertionName: "LLM judge",
|
|
@@ -625,20 +707,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
625
707
|
};
|
|
626
708
|
}
|
|
627
709
|
}
|
|
628
|
-
|
|
710
|
+
/**
|
|
711
|
+
* Resolve the LanguageModel to use: context.model (injected mock/override)
|
|
712
|
+
* takes precedence, otherwise create from llmConfig + modelId.
|
|
713
|
+
*/
|
|
714
|
+
resolveModel(context, modelId) {
|
|
715
|
+
if (context?.model) {
|
|
716
|
+
return context.model;
|
|
717
|
+
}
|
|
718
|
+
if (!modelId || !context?.llmConfig) {
|
|
719
|
+
return null;
|
|
720
|
+
}
|
|
629
721
|
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
630
|
-
baseURL: llmConfig.baseUrl,
|
|
722
|
+
baseURL: context.llmConfig.baseUrl,
|
|
631
723
|
apiKey: "dummy",
|
|
632
|
-
headers: llmConfig.headers
|
|
724
|
+
headers: context.llmConfig.headers
|
|
633
725
|
});
|
|
634
|
-
|
|
635
|
-
|
|
726
|
+
return anthropic(modelId);
|
|
727
|
+
}
|
|
728
|
+
async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
|
|
729
|
+
const baseOptions = {
|
|
730
|
+
model,
|
|
636
731
|
prompt,
|
|
637
732
|
system,
|
|
638
733
|
maxOutputTokens,
|
|
639
|
-
temperature
|
|
640
|
-
|
|
641
|
-
|
|
734
|
+
temperature,
|
|
735
|
+
output: import_ai2.Output.object({ schema: JudgeResultSchema }),
|
|
736
|
+
stopWhen: (0, import_ai2.stepCountIs)(MAX_JUDGE_STEPS)
|
|
737
|
+
};
|
|
738
|
+
const { output } = workDir ? await (0, import_ai2.generateText)({
|
|
739
|
+
...baseOptions,
|
|
740
|
+
tools: { read_file: createReadFileTool(workDir) }
|
|
741
|
+
}) : await (0, import_ai2.generateText)(baseOptions);
|
|
742
|
+
return output;
|
|
642
743
|
}
|
|
643
744
|
};
|
|
644
745
|
|
|
@@ -646,6 +747,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
646
747
|
var llmJudgeEvaluator = new LlmJudgeEvaluator();
|
|
647
748
|
var evaluators = {
|
|
648
749
|
skill_was_called: new SkillWasCalledEvaluator(),
|
|
750
|
+
tool_called_with_param: new ToolCalledWithParamEvaluator(),
|
|
649
751
|
build_passed: new BuildPassedEvaluator(),
|
|
650
752
|
time_limit: new TimeEvaluator(),
|
|
651
753
|
cost: new CostEvaluator(),
|
|
@@ -668,8 +770,8 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
668
770
|
const evaluator = evaluators[assertion.type];
|
|
669
771
|
if (!evaluator) {
|
|
670
772
|
return {
|
|
671
|
-
id: (0,
|
|
672
|
-
assertionId: (0,
|
|
773
|
+
id: (0, import_crypto7.randomUUID)(),
|
|
774
|
+
assertionId: (0, import_crypto7.randomUUID)(),
|
|
673
775
|
assertionType: assertion.type,
|
|
674
776
|
assertionName: "Unknown assertion",
|
|
675
777
|
status: "error" /* ERROR */,
|
|
@@ -694,6 +796,7 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
694
796
|
BuildPassedEvaluator,
|
|
695
797
|
CostAssertionSchema,
|
|
696
798
|
CostEvaluator,
|
|
799
|
+
JudgeResultSchema,
|
|
697
800
|
LLMBreakdownStatsSchema,
|
|
698
801
|
LLMStepType,
|
|
699
802
|
LLMTraceSchema,
|
|
@@ -706,6 +809,9 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
706
809
|
TimeAssertionSchema,
|
|
707
810
|
TimeEvaluator,
|
|
708
811
|
TokenUsageSchema,
|
|
812
|
+
ToolCalledWithParamAssertionSchema,
|
|
813
|
+
ToolCalledWithParamEvaluator,
|
|
814
|
+
createReadFileTool,
|
|
709
815
|
evaluateAssertions,
|
|
710
816
|
formatTraceForJudge,
|
|
711
817
|
getEvaluator,
|