@wix/eval-assertions 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/build/index.js +181 -103
- package/build/index.js.map +4 -4
- package/build/index.mjs +172 -102
- package/build/index.mjs.map +4 -4
- package/build/types/evaluators/assertion-evaluator.d.ts +4 -17
- package/build/types/evaluators/cost-evaluator.d.ts +10 -0
- package/build/types/evaluators/index.d.ts +3 -2
- package/build/types/evaluators/llm-judge-evaluator.d.ts +11 -0
- package/build/types/index.d.ts +3 -2
- package/build/types/tools/index.d.ts +1 -0
- package/build/types/tools/read-file-tool.d.ts +10 -0
- package/build/types/types/assertions.d.ts +12 -0
- package/build/types/types/index.d.ts +1 -1
- package/package.json +4 -3
package/README.md
CHANGED
|
@@ -149,11 +149,12 @@ Optional context for assertions:
|
|
|
149
149
|
```typescript
|
|
150
150
|
interface AssertionContext {
|
|
151
151
|
workDir?: string; // For build_passed
|
|
152
|
-
llmConfig?: { // For llm_judge
|
|
152
|
+
llmConfig?: { // For llm_judge
|
|
153
153
|
baseUrl: string;
|
|
154
154
|
headers: Record<string, string>;
|
|
155
155
|
};
|
|
156
|
-
|
|
156
|
+
defaultJudgeModel?: string; // Default model for llm_judge
|
|
157
|
+
model?: LanguageModel; // Override model
|
|
157
158
|
}
|
|
158
159
|
```
|
|
159
160
|
|
package/build/index.js
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
2
3
|
var __defProp = Object.defineProperty;
|
|
3
4
|
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
5
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
8
|
var __export = (target, all) => {
|
|
7
9
|
for (var name in all)
|
|
@@ -15,6 +17,14 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
15
17
|
}
|
|
16
18
|
return to;
|
|
17
19
|
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
18
28
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
29
|
|
|
20
30
|
// src/index.ts
|
|
@@ -26,6 +36,9 @@ __export(index_exports, {
|
|
|
26
36
|
AssertionSchema: () => AssertionSchema,
|
|
27
37
|
BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
|
|
28
38
|
BuildPassedEvaluator: () => BuildPassedEvaluator,
|
|
39
|
+
CostAssertionSchema: () => CostAssertionSchema,
|
|
40
|
+
CostEvaluator: () => CostEvaluator,
|
|
41
|
+
JudgeResultSchema: () => JudgeResultSchema,
|
|
29
42
|
LLMBreakdownStatsSchema: () => LLMBreakdownStatsSchema,
|
|
30
43
|
LLMStepType: () => LLMStepType,
|
|
31
44
|
LLMTraceSchema: () => LLMTraceSchema,
|
|
@@ -38,6 +51,7 @@ __export(index_exports, {
|
|
|
38
51
|
TimeAssertionSchema: () => TimeAssertionSchema,
|
|
39
52
|
TimeEvaluator: () => TimeEvaluator,
|
|
40
53
|
TokenUsageSchema: () => TokenUsageSchema,
|
|
54
|
+
createReadFileTool: () => createReadFileTool,
|
|
41
55
|
evaluateAssertions: () => evaluateAssertions,
|
|
42
56
|
formatTraceForJudge: () => formatTraceForJudge,
|
|
43
57
|
getEvaluator: () => getEvaluator,
|
|
@@ -62,6 +76,11 @@ var BuildPassedAssertionSchema = import_zod.z.object({
|
|
|
62
76
|
/** Expected exit code (default: 0) */
|
|
63
77
|
expectedExitCode: import_zod.z.number().int().optional()
|
|
64
78
|
});
|
|
79
|
+
var CostAssertionSchema = import_zod.z.object({
|
|
80
|
+
type: import_zod.z.literal("cost"),
|
|
81
|
+
/** Maximum allowed cost in USD */
|
|
82
|
+
maxCostUsd: import_zod.z.number().positive()
|
|
83
|
+
});
|
|
65
84
|
var LlmJudgeAssertionSchema = import_zod.z.object({
|
|
66
85
|
type: import_zod.z.literal("llm_judge"),
|
|
67
86
|
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */
|
|
@@ -84,6 +103,7 @@ var AssertionSchema = import_zod.z.union([
|
|
|
84
103
|
SkillWasCalledAssertionSchema,
|
|
85
104
|
BuildPassedAssertionSchema,
|
|
86
105
|
TimeAssertionSchema,
|
|
106
|
+
CostAssertionSchema,
|
|
87
107
|
LlmJudgeAssertionSchema
|
|
88
108
|
]);
|
|
89
109
|
|
|
@@ -163,7 +183,7 @@ var AssertionResultSchema = import_zod3.z.object({
|
|
|
163
183
|
});
|
|
164
184
|
|
|
165
185
|
// src/evaluators/index.ts
|
|
166
|
-
var import_crypto5 = require("crypto");
|
|
186
|
+
var import_crypto6 = require("crypto");
|
|
167
187
|
|
|
168
188
|
// src/evaluators/skill-was-called-evaluator.ts
|
|
169
189
|
var import_crypto = require("crypto");
|
|
@@ -355,10 +375,86 @@ var TimeEvaluator = class extends AssertionEvaluator {
|
|
|
355
375
|
}
|
|
356
376
|
};
|
|
357
377
|
|
|
358
|
-
// src/evaluators/llm-judge-evaluator.ts
|
|
378
|
+
// src/evaluators/cost-evaluator.ts
|
|
359
379
|
var import_crypto4 = require("crypto");
|
|
360
|
-
var import_anthropic = require("@ai-sdk/anthropic");
|
|
380
|
+
var CostEvaluator = class extends AssertionEvaluator {
|
|
381
|
+
type = "cost";
|
|
382
|
+
evaluate(assertion, input) {
|
|
383
|
+
const assertionId = (0, import_crypto4.randomUUID)();
|
|
384
|
+
const id = (0, import_crypto4.randomUUID)();
|
|
385
|
+
const assertionName = "Cost";
|
|
386
|
+
const assertionType = "cost";
|
|
387
|
+
const maxCostUsd = assertion.maxCostUsd;
|
|
388
|
+
if (!input.llmTrace) {
|
|
389
|
+
return {
|
|
390
|
+
id,
|
|
391
|
+
assertionId,
|
|
392
|
+
assertionType,
|
|
393
|
+
assertionName,
|
|
394
|
+
status: "skipped" /* SKIPPED */,
|
|
395
|
+
message: "No LLM trace available to check cost"
|
|
396
|
+
};
|
|
397
|
+
}
|
|
398
|
+
const actualCostUsd = input.llmTrace.summary.totalCostUsd;
|
|
399
|
+
const formattedActual = actualCostUsd.toFixed(6);
|
|
400
|
+
const formattedMax = maxCostUsd.toFixed(6);
|
|
401
|
+
const passed = Number(formattedActual) <= Number(formattedMax);
|
|
402
|
+
return {
|
|
403
|
+
id,
|
|
404
|
+
assertionId,
|
|
405
|
+
assertionType,
|
|
406
|
+
assertionName,
|
|
407
|
+
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
408
|
+
message: passed ? `Cost $${formattedActual} is within limit of $${formattedMax}` : `Cost $${formattedActual} exceeds limit of $${formattedMax}`,
|
|
409
|
+
expected: `<= $${formattedMax}`,
|
|
410
|
+
actual: `$${formattedActual}`,
|
|
411
|
+
details: { actualCostUsd, maxCostUsd }
|
|
412
|
+
};
|
|
413
|
+
}
|
|
414
|
+
};
|
|
415
|
+
|
|
416
|
+
// src/tools/read-file-tool.ts
|
|
361
417
|
var import_ai = require("ai");
|
|
418
|
+
var import_zod4 = require("zod");
|
|
419
|
+
var import_promises = require("fs/promises");
|
|
420
|
+
var import_path = __toESM(require("path"));
|
|
421
|
+
function createReadFileTool(workDir) {
|
|
422
|
+
const resolvedWorkDir = import_path.default.resolve(workDir);
|
|
423
|
+
return (0, import_ai.tool)({
|
|
424
|
+
description: "Read the content of any file in the workspace by its relative path. Use this to inspect file contents when evaluating code changes.",
|
|
425
|
+
inputSchema: import_zod4.z.object({
|
|
426
|
+
path: import_zod4.z.string().describe("Relative file path in the workspace")
|
|
427
|
+
}),
|
|
428
|
+
execute: async ({
|
|
429
|
+
path: filePath
|
|
430
|
+
}) => {
|
|
431
|
+
const resolved = import_path.default.resolve(resolvedWorkDir, filePath);
|
|
432
|
+
if (!resolved.startsWith(resolvedWorkDir + import_path.default.sep)) {
|
|
433
|
+
return { error: `Access denied: path escapes workspace directory` };
|
|
434
|
+
}
|
|
435
|
+
try {
|
|
436
|
+
const content = await (0, import_promises.readFile)(resolved, "utf-8");
|
|
437
|
+
return { path: filePath, content };
|
|
438
|
+
} catch {
|
|
439
|
+
return { error: `File not found: ${filePath}` };
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
});
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// src/evaluators/llm-judge-evaluator.ts
|
|
446
|
+
var import_crypto5 = require("crypto");
|
|
447
|
+
var import_anthropic = require("@ai-sdk/anthropic");
|
|
448
|
+
var import_ai2 = require("ai");
|
|
449
|
+
var import_zod5 = require("zod");
|
|
450
|
+
var JudgeResultSchema = import_zod5.z.object({
|
|
451
|
+
text: import_zod5.z.string().describe("A brief textual verdict of the test result"),
|
|
452
|
+
score: import_zod5.z.number().min(0).max(100).describe(
|
|
453
|
+
"A number from 0 to 100 reflecting how well the answer meets the acceptance criteria"
|
|
454
|
+
),
|
|
455
|
+
scoreReasoning: import_zod5.z.string().describe("A concise explanation justifying the assigned score")
|
|
456
|
+
});
|
|
457
|
+
var MAX_JUDGE_STEPS = 20;
|
|
362
458
|
function formatTraceForJudge(llmTrace) {
|
|
363
459
|
if (!llmTrace?.steps?.length) {
|
|
364
460
|
return "No trace available.";
|
|
@@ -429,40 +525,22 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
|
|
|
429
525
|
- {{newFiles}}: list of new files that were created (or "No new files were created")
|
|
430
526
|
- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
|
|
431
527
|
|
|
432
|
-
|
|
433
|
-
var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
|
|
528
|
+
You have access to a read_file tool that lets you read the content of ANY file in the workspace (not just changed files). Use it to inspect file contents whenever you need to verify claims about code, check imports, review implementations, or validate that specific code patterns exist. Always read files before making judgments about their content \u2014 do not guess.
|
|
434
529
|
|
|
435
|
-
|
|
436
|
-
"text": string,
|
|
437
|
-
"score": number (0-100),
|
|
438
|
-
"scoreReasoning": string
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
- text: A brief textual verdict of the test result.
|
|
442
|
-
- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.
|
|
443
|
-
- scoreReasoning: A concise explanation justifying the assigned score.
|
|
444
|
-
|
|
445
|
-
Your response must:
|
|
446
|
-
- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.
|
|
447
|
-
- Be valid and parseable by \`JSON.parse\`.
|
|
448
|
-
- Use only double quotes for all keys and strings, as required by JSON.
|
|
449
|
-
|
|
450
|
-
Any response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;
|
|
530
|
+
CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above and the actual file contents (use the read_file tool). If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
|
|
451
531
|
var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
452
532
|
type = "llm_judge";
|
|
453
533
|
async evaluate(assertion, input, context) {
|
|
454
|
-
const assertionId = (0, import_crypto4.randomUUID)();
|
|
455
|
-
const llmConfig = context?.llmConfig;
|
|
534
|
+
const assertionId = (0, import_crypto5.randomUUID)();
|
|
456
535
|
const workDir = context?.workDir ?? "";
|
|
457
|
-
const generateTextStub = context?.generateTextForLlmJudge;
|
|
458
536
|
const output = input.outputText ?? "";
|
|
459
537
|
const fileDiffs = input.fileDiffs ?? [];
|
|
460
538
|
const changedPaths = fileDiffs.map((d) => d.path);
|
|
461
539
|
const modifiedPaths = fileDiffs.filter((d) => d.status === "modified").map((d) => d.path);
|
|
462
540
|
const newPaths = fileDiffs.filter((d) => d.status === "new").map((d) => d.path);
|
|
463
|
-
const changedFiles = changedPaths.length > 0 ? changedPaths.map((
|
|
464
|
-
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((
|
|
465
|
-
const newFiles = newPaths.length > 0 ? newPaths.map((
|
|
541
|
+
const changedFiles = changedPaths.length > 0 ? changedPaths.map((p) => `- ${p}`).join("\n") : "No files were changed";
|
|
542
|
+
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((p) => `- ${p}`).join("\n") : "No files were modified";
|
|
543
|
+
const newFiles = newPaths.length > 0 ? newPaths.map((p) => `- ${p}`).join("\n") : "No new files were created";
|
|
466
544
|
const trace = formatTraceForJudge(input.llmTrace);
|
|
467
545
|
const ctx = {
|
|
468
546
|
output,
|
|
@@ -474,101 +552,77 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
474
552
|
};
|
|
475
553
|
const replace = (s) => replacePlaceholders(s, ctx);
|
|
476
554
|
const finalPrompt = replace(assertion.prompt);
|
|
477
|
-
const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS : replace(DEFAULT_JUDGE_CONTEXT) + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
|
|
478
555
|
const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
|
|
479
556
|
const maxOutputTokens = assertion.maxTokens ?? 1024;
|
|
480
557
|
const temperature = assertion.temperature ?? 0;
|
|
481
|
-
const
|
|
482
|
-
|
|
558
|
+
const modelId = assertion.model ?? context?.defaultJudgeModel;
|
|
559
|
+
const model = this.resolveModel(context, modelId);
|
|
560
|
+
if (!model) {
|
|
561
|
+
const reason = !modelId && !context?.model ? "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel/model in context)" : "No llmConfig for llm_judge assertion (AI gateway required)";
|
|
483
562
|
return {
|
|
484
|
-
id: (0, import_crypto4.randomUUID)(),
|
|
563
|
+
id: (0, import_crypto5.randomUUID)(),
|
|
485
564
|
assertionId,
|
|
486
565
|
assertionType: "llm_judge",
|
|
487
566
|
assertionName: "LLM judge",
|
|
488
567
|
status: "failed" /* FAILED */,
|
|
489
|
-
message:
|
|
568
|
+
message: reason,
|
|
490
569
|
expected: String(minScore)
|
|
491
570
|
};
|
|
492
571
|
}
|
|
493
|
-
|
|
494
|
-
return {
|
|
495
|
-
id: (0, import_crypto4.randomUUID)(),
|
|
496
|
-
assertionId,
|
|
497
|
-
assertionType: "llm_judge",
|
|
498
|
-
assertionName: "LLM judge",
|
|
499
|
-
status: "failed" /* FAILED */,
|
|
500
|
-
message: "No llmConfig for llm_judge assertion (AI gateway required)",
|
|
501
|
-
expected: String(minScore)
|
|
502
|
-
};
|
|
503
|
-
}
|
|
504
|
-
const maxParseAttempts = 3;
|
|
505
|
-
let lastParseError;
|
|
506
|
-
let lastRawText;
|
|
572
|
+
const systemPrompt = assertion.systemPrompt != null && assertion.systemPrompt !== "" ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
|
|
507
573
|
try {
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
finalPrompt,
|
|
518
|
-
systemPrompt,
|
|
519
|
-
maxOutputTokens,
|
|
520
|
-
temperature
|
|
521
|
-
);
|
|
522
|
-
lastRawText = result.text;
|
|
523
|
-
try {
|
|
524
|
-
const cleaned = stripMarkdownCodeBlock(result.text);
|
|
525
|
-
const parsed = JSON.parse(cleaned);
|
|
526
|
-
const judgeResult = validateJudgeResult(parsed);
|
|
527
|
-
const passed = judgeResult.score >= minScore;
|
|
528
|
-
return {
|
|
529
|
-
id: (0, import_crypto4.randomUUID)(),
|
|
530
|
-
assertionId,
|
|
531
|
-
assertionType: "llm_judge",
|
|
532
|
-
assertionName: "LLM judge",
|
|
533
|
-
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
534
|
-
message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
|
|
535
|
-
expected: String(minScore),
|
|
536
|
-
actual: String(judgeResult.score),
|
|
537
|
-
details: {
|
|
538
|
-
score: judgeResult.score,
|
|
539
|
-
scoreReasoning: judgeResult.scoreReasoning,
|
|
540
|
-
text: judgeResult.text
|
|
541
|
-
}
|
|
542
|
-
};
|
|
543
|
-
} catch (parseErr) {
|
|
544
|
-
lastParseError = parseErr instanceof Error ? parseErr : new Error(String(parseErr));
|
|
545
|
-
}
|
|
546
|
-
}
|
|
574
|
+
const judgeResult = await this.callGenerateText(
|
|
575
|
+
model,
|
|
576
|
+
finalPrompt,
|
|
577
|
+
systemPrompt,
|
|
578
|
+
maxOutputTokens,
|
|
579
|
+
temperature,
|
|
580
|
+
workDir || void 0
|
|
581
|
+
);
|
|
582
|
+
const passed = judgeResult.score >= minScore;
|
|
547
583
|
return {
|
|
548
|
-
id: (0, import_crypto4.randomUUID)(),
|
|
584
|
+
id: (0, import_crypto5.randomUUID)(),
|
|
549
585
|
assertionId,
|
|
550
586
|
assertionType: "llm_judge",
|
|
551
587
|
assertionName: "LLM judge",
|
|
552
|
-
status: "failed" /* FAILED */,
|
|
553
|
-
message:
|
|
588
|
+
status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
|
|
589
|
+
message: passed ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}` : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,
|
|
554
590
|
expected: String(minScore),
|
|
555
|
-
actual:
|
|
556
|
-
details: {
|
|
591
|
+
actual: String(judgeResult.score),
|
|
592
|
+
details: {
|
|
593
|
+
score: judgeResult.score,
|
|
594
|
+
scoreReasoning: judgeResult.scoreReasoning,
|
|
595
|
+
text: judgeResult.text
|
|
596
|
+
}
|
|
557
597
|
};
|
|
558
598
|
} catch (err) {
|
|
599
|
+
if (import_ai2.NoObjectGeneratedError.isInstance(err)) {
|
|
600
|
+
return {
|
|
601
|
+
id: (0, import_crypto5.randomUUID)(),
|
|
602
|
+
assertionId,
|
|
603
|
+
assertionType: "llm_judge",
|
|
604
|
+
assertionName: "LLM judge",
|
|
605
|
+
status: "failed" /* FAILED */,
|
|
606
|
+
message: "LLM judge failed to produce valid structured output",
|
|
607
|
+
expected: String(minScore),
|
|
608
|
+
details: {
|
|
609
|
+
rawText: typeof err.text === "string" ? err.text.slice(0, 500) : void 0
|
|
610
|
+
}
|
|
611
|
+
};
|
|
612
|
+
}
|
|
559
613
|
const message = err instanceof Error ? err.message : String(err);
|
|
560
614
|
const details = {
|
|
561
615
|
error: message,
|
|
562
|
-
model:
|
|
616
|
+
model: modelId
|
|
563
617
|
};
|
|
564
|
-
if (
|
|
618
|
+
if (import_ai2.APICallError.isInstance(err)) {
|
|
565
619
|
details.statusCode = err.statusCode;
|
|
566
620
|
details.url = err.url;
|
|
567
621
|
details.isRetryable = err.isRetryable;
|
|
568
622
|
details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
|
|
569
623
|
}
|
|
570
624
|
return {
|
|
571
|
-
id: (0, import_crypto4.randomUUID)(),
|
|
625
|
+
id: (0, import_crypto5.randomUUID)(),
|
|
572
626
|
assertionId,
|
|
573
627
|
assertionType: "llm_judge",
|
|
574
628
|
assertionName: "LLM judge",
|
|
@@ -579,20 +633,39 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
579
633
|
};
|
|
580
634
|
}
|
|
581
635
|
}
|
|
582
|
-
|
|
636
|
+
/**
|
|
637
|
+
* Resolve the LanguageModel to use: context.model (injected mock/override)
|
|
638
|
+
* takes precedence, otherwise create from llmConfig + modelId.
|
|
639
|
+
*/
|
|
640
|
+
resolveModel(context, modelId) {
|
|
641
|
+
if (context?.model) {
|
|
642
|
+
return context.model;
|
|
643
|
+
}
|
|
644
|
+
if (!modelId || !context?.llmConfig) {
|
|
645
|
+
return null;
|
|
646
|
+
}
|
|
583
647
|
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
584
|
-
baseURL: llmConfig.baseUrl,
|
|
648
|
+
baseURL: context.llmConfig.baseUrl,
|
|
585
649
|
apiKey: "dummy",
|
|
586
|
-
headers: llmConfig.headers
|
|
650
|
+
headers: context.llmConfig.headers
|
|
587
651
|
});
|
|
588
|
-
|
|
589
|
-
|
|
652
|
+
return anthropic(modelId);
|
|
653
|
+
}
|
|
654
|
+
async callGenerateText(model, prompt, system, maxOutputTokens, temperature, workDir) {
|
|
655
|
+
const baseOptions = {
|
|
656
|
+
model,
|
|
590
657
|
prompt,
|
|
591
658
|
system,
|
|
592
659
|
maxOutputTokens,
|
|
593
|
-
temperature
|
|
594
|
-
|
|
595
|
-
|
|
660
|
+
temperature,
|
|
661
|
+
output: import_ai2.Output.object({ schema: JudgeResultSchema }),
|
|
662
|
+
stopWhen: (0, import_ai2.stepCountIs)(MAX_JUDGE_STEPS)
|
|
663
|
+
};
|
|
664
|
+
const { output } = workDir ? await (0, import_ai2.generateText)({
|
|
665
|
+
...baseOptions,
|
|
666
|
+
tools: { read_file: createReadFileTool(workDir) }
|
|
667
|
+
}) : await (0, import_ai2.generateText)(baseOptions);
|
|
668
|
+
return output;
|
|
596
669
|
}
|
|
597
670
|
};
|
|
598
671
|
|
|
@@ -602,6 +675,7 @@ var evaluators = {
|
|
|
602
675
|
skill_was_called: new SkillWasCalledEvaluator(),
|
|
603
676
|
build_passed: new BuildPassedEvaluator(),
|
|
604
677
|
time_limit: new TimeEvaluator(),
|
|
678
|
+
cost: new CostEvaluator(),
|
|
605
679
|
llm_judge: llmJudgeEvaluator,
|
|
606
680
|
// Custom assertions use the same LLM-based evaluation as llm_judge
|
|
607
681
|
custom: llmJudgeEvaluator
|
|
@@ -621,8 +695,8 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
621
695
|
const evaluator = evaluators[assertion.type];
|
|
622
696
|
if (!evaluator) {
|
|
623
697
|
return {
|
|
624
|
-
id: (0, import_crypto5.randomUUID)(),
|
|
625
|
-
assertionId: (0, import_crypto5.randomUUID)(),
|
|
698
|
+
id: (0, import_crypto6.randomUUID)(),
|
|
699
|
+
assertionId: (0, import_crypto6.randomUUID)(),
|
|
626
700
|
assertionType: assertion.type,
|
|
627
701
|
assertionName: "Unknown assertion",
|
|
628
702
|
status: "error" /* ERROR */,
|
|
@@ -645,6 +719,9 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
645
719
|
AssertionSchema,
|
|
646
720
|
BuildPassedAssertionSchema,
|
|
647
721
|
BuildPassedEvaluator,
|
|
722
|
+
CostAssertionSchema,
|
|
723
|
+
CostEvaluator,
|
|
724
|
+
JudgeResultSchema,
|
|
648
725
|
LLMBreakdownStatsSchema,
|
|
649
726
|
LLMStepType,
|
|
650
727
|
LLMTraceSchema,
|
|
@@ -657,6 +734,7 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
657
734
|
TimeAssertionSchema,
|
|
658
735
|
TimeEvaluator,
|
|
659
736
|
TokenUsageSchema,
|
|
737
|
+
createReadFileTool,
|
|
660
738
|
evaluateAssertions,
|
|
661
739
|
formatTraceForJudge,
|
|
662
740
|
getEvaluator,
|