@wix/eval-assertions 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/build/index.js +57 -9
- package/build/index.js.map +3 -3
- package/build/index.mjs +56 -9
- package/build/index.mjs.map +3 -3
- package/build/types/evaluators/index.d.ts +1 -1
- package/build/types/evaluators/llm-judge-evaluator.d.ts +21 -3
- package/build/types/index.d.ts +1 -1
- package/build/types/types/assertions.d.ts +1 -0
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -104,7 +104,7 @@ Runs a command in the working directory and checks the exit code. When the comma
|
|
|
104
104
|
|
|
105
105
|
### llm_judge
|
|
106
106
|
|
|
107
|
-
Uses an LLM to evaluate the output with a customizable prompt. The
|
|
107
|
+
Uses an LLM to evaluate the output with a customizable prompt. The judge has a `readFile` tool that lets it inspect the content of changed files on demand (in-memory lookup, no filesystem access). The default system prompt instructs the judge to be strict on factual verification and to use the `readFile` tool before scoring. When the judge returns invalid JSON, the evaluator retries up to 3 times before failing.
|
|
108
108
|
|
|
109
109
|
```typescript
|
|
110
110
|
{
|
|
@@ -118,14 +118,18 @@ Uses an LLM to evaluate the output with a customizable prompt. The default syste
|
|
|
118
118
|
}
|
|
119
119
|
```
|
|
120
120
|
|
|
121
|
-
**
|
|
121
|
+
**readFile tool:** When the run has at least one changed file outside `.claude/`, the judge LLM automatically receives a `readFile` tool that can read the content of any changed file by path (when there are no such files, no tool is attached). The tool performs an in-memory lookup against the `fileDiffs` array — no filesystem access. Files under `.claude/` are pre-filtered and not accessible to the tool.
|
|
122
122
|
|
|
123
123
|
**Available placeholders in prompts:**
|
|
124
124
|
- `{{output}}` - The agent's final output text
|
|
125
125
|
- `{{cwd}}` - Working directory path
|
|
126
|
-
- `{{changedFiles}}` - List of files
|
|
126
|
+
- `{{changedFiles}}` - List of changed file paths (excludes `.claude/` files). The judge uses `readFile` to inspect contents.
|
|
127
|
+
- `{{modifiedFiles}}` - List of existing files that were modified (paths only)
|
|
128
|
+
- `{{newFiles}}` - List of new files that were created (paths only)
|
|
127
129
|
- `{{trace}}` - Formatted LLM trace showing tool calls and completions
|
|
128
130
|
|
|
131
|
+
> **Note:** Files under `.claude/` (e.g., skill definitions) are automatically excluded from all file placeholders and the `readFile` tool to reduce noise in the judge prompt.
|
|
132
|
+
|
|
129
133
|
## Types
|
|
130
134
|
|
|
131
135
|
### EvaluationInput
|
package/build/index.js
CHANGED
|
@@ -36,6 +36,7 @@ __export(index_exports, {
|
|
|
36
36
|
SkillWasCalledAssertionSchema: () => SkillWasCalledAssertionSchema,
|
|
37
37
|
SkillWasCalledEvaluator: () => SkillWasCalledEvaluator,
|
|
38
38
|
TokenUsageSchema: () => TokenUsageSchema,
|
|
39
|
+
buildReadFileTool: () => buildReadFileTool,
|
|
39
40
|
evaluateAssertions: () => evaluateAssertions,
|
|
40
41
|
formatTraceForJudge: () => formatTraceForJudge,
|
|
41
42
|
getEvaluator: () => getEvaluator,
|
|
@@ -318,6 +319,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
|
318
319
|
var import_crypto3 = require("crypto");
|
|
319
320
|
var import_anthropic = require("@ai-sdk/anthropic");
|
|
320
321
|
var import_ai = require("ai");
|
|
322
|
+
var import_zod4 = require("zod");
|
|
321
323
|
function formatTraceForJudge(llmTrace) {
|
|
322
324
|
if (!llmTrace?.steps?.length) {
|
|
323
325
|
return "No trace available.";
|
|
@@ -345,6 +347,31 @@ function formatTraceForJudge(llmTrace) {
|
|
|
345
347
|
}
|
|
346
348
|
return lines.join("\n");
|
|
347
349
|
}
|
|
350
|
+
var readFileInputSchema = import_zod4.z.object({
|
|
351
|
+
path: import_zod4.z.string().describe("Relative file path from the changed files list")
|
|
352
|
+
});
|
|
353
|
+
/**
 * Build the `readFile` tool definition handed to the judge model.
 *
 * The tool is a pure in-memory lookup over `fileDiffs`: it never touches the
 * filesystem. Each entry is read for its `path` and (optional) `content`.
 *
 * Lookup order:
 *   1. exact match on `path` (original behavior);
 *   2. fallback match after trimming whitespace and stripping leading `./`
 *      segments on both sides, because models frequently ask for
 *      `./src/a.ts` when the list shows `src/a.ts`.
 *
 * @param {Array<{path: string, content?: string}>} fileDiffs - Changed files available to the tool.
 * @returns {{description: string, inputSchema: object, execute: function}} Tool definition;
 *   `execute({ path })` resolves to `{ path, content }` on a hit or `{ error }` on a miss.
 */
function buildReadFileTool(fileDiffs) {
  // Canonical form used only for the fallback comparison.
  const normalize = (p) => String(p).trim().replace(/^(?:\.\/)+/, "");
  return {
    description: "Read the content of a changed file by its path. Only files that were changed during the scenario run are available.",
    inputSchema: readFileInputSchema,
    execute: async ({ path }) => {
      // Exact match first so existing callers see identical results.
      const diff =
        fileDiffs.find((d) => d.path === path) ??
        fileDiffs.find((d) => normalize(d.path) === normalize(path));
      if (!diff) {
        const paths = fileDiffs.map((d) => d.path);
        // Cap the listing so a huge change set does not flood the judge context.
        const MAX_LISTED = 20;
        const preview =
          paths.length <= MAX_LISTED
            ? paths.join(", ")
            : `${paths.slice(0, MAX_LISTED).join(", ")} ... and ${paths.length - MAX_LISTED} more`;
        return {
          error: `File not found: ${path}. Available files: ${preview}`
        };
      }
      return {
        path: diff.path,
        // Diffs may carry no content; return a readable marker instead of undefined.
        content: diff.content ?? "(content not available)"
      };
    }
  };
}
|
|
374
|
+
var MAX_JUDGE_STEPS = 15;
|
|
348
375
|
/**
 * Substitute the `{{output}}`, `{{cwd}}`, `{{changedFiles}}`, `{{modifiedFiles}}`,
 * `{{newFiles}}` and `{{trace}}` placeholders in a prompt template.
 *
 * Every occurrence of each placeholder is replaced; any other `{{...}}` text is
 * left untouched.
 *
 * @param {string} str - Prompt template containing zero or more placeholders.
 * @param {{output: string, cwd: string, changedFiles: string, modifiedFiles: string, newFiles: string, trace: string}} ctx
 *   Values to insert, keyed by placeholder name.
 * @returns {string} The template with placeholders replaced.
 */
function replacePlaceholders(str, ctx) {
  // A replacer function is used on purpose: a replacement *string* would have
  // `$&`, `$$`, `$1`, etc. interpreted as special patterns, corrupting agent
  // output or traces that contain `$`. Doing it in a single pass also keeps
  // placeholder-looking text inside an inserted value (e.g. output containing
  // "{{cwd}}") from being expanded a second time.
  return str.replace(
    /\{\{(output|cwd|changedFiles|modifiedFiles|newFiles|trace)\}\}/g,
    (_match, key) => String(ctx[key])
  );
}
|
|
@@ -384,11 +411,13 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
|
|
|
384
411
|
|
|
385
412
|
- {{output}}: the agent's final output
|
|
386
413
|
- {{cwd}}: working directory
|
|
387
|
-
- {{changedFiles}}: list of
|
|
414
|
+
- {{changedFiles}}: list of files changed (or "No files were changed"). Use the readFile tool to inspect the content of any file you need to evaluate.
|
|
388
415
|
- {{modifiedFiles}}: list of existing files that were modified (or "No files were modified")
|
|
389
416
|
- {{newFiles}}: list of new files that were created (or "No new files were created")
|
|
390
417
|
- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
|
|
391
418
|
|
|
419
|
+
You have a readFile tool available. Use it to read the actual content of changed files before scoring.
|
|
420
|
+
|
|
392
421
|
CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
|
|
393
422
|
var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
|
|
394
423
|
|
|
@@ -417,10 +446,13 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
417
446
|
const generateTextStub = context?.generateTextForLlmJudge;
|
|
418
447
|
const output = input.outputText ?? "";
|
|
419
448
|
const fileDiffs = input.fileDiffs ?? [];
|
|
420
|
-
const
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
const
|
|
449
|
+
const filteredDiffs = fileDiffs.filter(
|
|
450
|
+
(d) => !d.path.startsWith(".claude/")
|
|
451
|
+
);
|
|
452
|
+
const changedPaths = filteredDiffs.map((d) => d.path);
|
|
453
|
+
const modifiedPaths = filteredDiffs.filter((d) => d.status === "modified").map((d) => d.path);
|
|
454
|
+
const newPaths = filteredDiffs.filter((d) => d.status === "new").map((d) => d.path);
|
|
455
|
+
const changedFiles = this.formatChangedFilesPaths(changedPaths);
|
|
424
456
|
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
|
|
425
457
|
const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
|
|
426
458
|
const trace = formatTraceForJudge(input.llmTrace);
|
|
@@ -434,7 +466,10 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
434
466
|
};
|
|
435
467
|
const replace = (s) => replacePlaceholders(s, ctx);
|
|
436
468
|
const finalPrompt = replace(assertion.prompt);
|
|
437
|
-
const
|
|
469
|
+
const hasCustomSystem = assertion.systemPrompt != null && assertion.systemPrompt !== "";
|
|
470
|
+
const baseSystem = hasCustomSystem ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
|
|
471
|
+
const readFileHint = filteredDiffs.length > 0 ? "\n\nYou have a readFile tool available. Use it to read the actual content of changed files before scoring." : "";
|
|
472
|
+
const systemPrompt = baseSystem + (hasCustomSystem ? readFileHint : "") + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
|
|
438
473
|
const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
|
|
439
474
|
const maxOutputTokens = assertion.maxTokens ?? 1024;
|
|
440
475
|
const temperature = assertion.temperature ?? 0;
|
|
@@ -465,7 +500,8 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
465
500
|
finalPrompt,
|
|
466
501
|
systemPrompt,
|
|
467
502
|
maxOutputTokens,
|
|
468
|
-
temperature
|
|
503
|
+
temperature,
|
|
504
|
+
filteredDiffs
|
|
469
505
|
);
|
|
470
506
|
lastRawText = result.text;
|
|
471
507
|
try {
|
|
@@ -506,7 +542,10 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
506
542
|
} catch (err) {
|
|
507
543
|
const message = err instanceof Error ? err.message : String(err);
|
|
508
544
|
const modelUsed = assertion.model ?? DEFAULT_MODEL;
|
|
509
|
-
const details = {
|
|
545
|
+
const details = {
|
|
546
|
+
error: message,
|
|
547
|
+
model: modelUsed
|
|
548
|
+
};
|
|
510
549
|
if (import_ai.APICallError.isInstance(err)) {
|
|
511
550
|
details.statusCode = err.statusCode;
|
|
512
551
|
details.url = err.url;
|
|
@@ -525,16 +564,24 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
525
564
|
};
|
|
526
565
|
}
|
|
527
566
|
}
|
|
528
|
-
|
|
567
|
+
formatChangedFilesPaths(paths) {
|
|
568
|
+
return paths.length > 0 ? paths.map((p) => `- ${p}`).join("\n") : "No files were changed";
|
|
569
|
+
}
|
|
570
|
+
async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature, fileDiffs) {
|
|
529
571
|
const anthropic = (0, import_anthropic.createAnthropic)({
|
|
530
572
|
baseURL: llmConfig.baseUrl,
|
|
531
573
|
apiKey: "dummy",
|
|
532
574
|
headers: llmConfig.headers
|
|
533
575
|
});
|
|
576
|
+
const hasFiles = fileDiffs.length > 0;
|
|
534
577
|
const result = await (0, import_ai.generateText)({
|
|
535
578
|
model: anthropic(modelId),
|
|
536
579
|
prompt,
|
|
537
580
|
system,
|
|
581
|
+
...hasFiles && {
|
|
582
|
+
tools: { readFile: buildReadFileTool(fileDiffs) },
|
|
583
|
+
maxSteps: MAX_JUDGE_STEPS
|
|
584
|
+
},
|
|
538
585
|
maxOutputTokens,
|
|
539
586
|
temperature
|
|
540
587
|
});
|
|
@@ -600,6 +647,7 @@ async function evaluateAssertions(input, assertions, context) {
|
|
|
600
647
|
SkillWasCalledAssertionSchema,
|
|
601
648
|
SkillWasCalledEvaluator,
|
|
602
649
|
TokenUsageSchema,
|
|
650
|
+
buildReadFileTool,
|
|
603
651
|
evaluateAssertions,
|
|
604
652
|
formatTraceForJudge,
|
|
605
653
|
getEvaluator,
|
package/build/index.js.map
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../src/index.ts", "../src/types/assertions.ts", "../src/types/trace.ts", "../src/types/result.ts", "../src/evaluators/index.ts", "../src/evaluators/skill-was-called-evaluator.ts", "../src/evaluators/assertion-evaluator.ts", "../src/evaluators/build-passed-evaluator.ts", "../src/evaluators/llm-judge-evaluator.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * @wix/eval-assertions\n *\n * Assertion framework for AI agent evaluations.\n * Supports skill invocation checks, build validation, and LLM-based judging.\n */\n\n// Types\nexport {\n // Assertion schemas and types\n AssertionSchema,\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n type Assertion,\n type SkillWasCalledAssertion,\n type BuildPassedAssertion,\n type LlmJudgeAssertion,\n // Trace types\n LLMTraceSchema,\n LLMTraceStepSchema,\n LLMTraceSummarySchema,\n LLMBreakdownStatsSchema,\n TokenUsageSchema,\n LLMStepType,\n type LLMTrace,\n type LLMTraceStep,\n type LLMTraceSummary,\n type LLMBreakdownStats,\n type TokenUsage,\n // Result types\n AssertionResultSchema,\n AssertionResultStatus,\n type AssertionResult,\n // Input types\n type EvaluationInput,\n type FileDiff,\n} from \"./types/index.js\";\n\n// Evaluators\nexport {\n evaluateAssertions,\n registerEvaluator,\n getEvaluator,\n AssertionEvaluator,\n SkillWasCalledEvaluator,\n BuildPassedEvaluator,\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type AssertionContext,\n type LlmConfig,\n type LlmJudgeGenerateTextOptions,\n type JudgeResult,\n} from \"./evaluators/index.js\";\n", "import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof 
SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. 
claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = 
z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } 
from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. 
workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): Set<string> {\n const 
calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? 
`Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that 
implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? 
DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? 
String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n `type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if 
(step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? 
match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\nconst DEFAULT_MODEL = \"claude-haiku-4-5-20251001\";\n\n/** Default judge context (run data + placeholders); used when assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of all files changed (or \"No files were changed\")\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. 
Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.\n * Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? 
[];\n\n // Compute file lists by status\n const changedPaths = fileDiffs.map((d) => d.path);\n const modifiedPaths = fileDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = fileDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles =\n changedPaths.length > 0\n ? changedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were changed\";\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n const systemPrompt =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\"\n ? replace(assertion.systemPrompt) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS\n : replace(DEFAULT_JUDGE_CONTEXT) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? 
await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n assertion.model ?? DEFAULT_MODEL,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const modelUsed = assertion.model ?? DEFAULT_MODEL;\n const details: Record<string, unknown> = { error: message, model: modelUsed };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? 
err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
|
|
5
|
-
"mappings": ";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,iBAAkB;AAQX,IAAM,gCAAgC,aAAE,OAAO;AAAA,EACpD,MAAM,aAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,aAAE,OAAO;AAAA,EACjD,MAAM,aAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AASM,IAAM,0BAA0B,aAAE,OAAO;AAAA,EAC9C,MAAM,aAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,aAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,aAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,aAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AASM,IAAM,kBAAkB,aAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC9DD,IAAAA,cAAkB;AAKX,IAAM,mBAAmB,cAAE,OAAO;AAAA,EACvC,QAAQ,cAAE,OAAO;AAAA,EACjB,YAAY,cAAE,OAAO;AAAA,EACrB,OAAO,cAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqB,cAAE,OAAO;AAAA,EACzC,IAAI,cAAE,OAAO;AAAA,EACb,YAAY,cAAE,OAAO;AAAA,EACrB,MAAM,cAAE,KAAK,WAAW;AAAA,EACxB,OAAO,cAAE,OAAO;AAAA,EAChB,UAAU,cAAE,OAAO;AAAA,EACnB,WAAW,cAAE,OAAO;AAAA,EACpB,YAAY,cAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAAS,cAAE,OAAO;AAAA,EAClB,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAc,cAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAAS,cAAE,QAAQ;AAAA,EACnB,OAAO,cAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0B,cAAE,OAAO;AAAA,EAC9C,OAAO,cAAE,OAAO;AAAA,EAChB,YAAY,cAAE,OAAO;AAAA,EACrB,QAAQ,cAAE,OAAO;AAAA,EACjB,SAAS,cAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,YAAY,cAAE,OAAO;AAAA,EACrB,iBAAiB,cAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAc,cAAE,OAAO;AAAA,EACvB,mBAAmB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBA
AgB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAY,cAAE,MAAM,cAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiB,cAAE,OAAO;AAAA,EACrC,IAAI,cAAE,OAAO;AAAA,EACb,OAAO,cAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,IAAAC,cAAkB;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,IAAI,cAAE,OAAO;AAAA,EACb,aAAa,cAAE,OAAO;AAAA,EACtB,eAAe,cAAE,OAAO;AAAA,EACxB,eAAe,cAAE,OAAO;AAAA,EACxB,QAAQ,cAAE,KAAK,qBAAqB;AAAA,EACpC,SAAS,cAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQ,cAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAAS,cAAE,OAAO,cAAE,OAAO,GAAG,cAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAe,cAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,IAAAC,iBAA2B;;;ACK3B,oBAA2B;;;ACyCpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD1CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,kBAAc,0BAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sB
AAsB,aAAa;AACxE,WAAO;AAAA,MACL,QAAI,0BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,IAAAC,iBAA2B;AAC3B,2BAAyB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,kBAAc,2BAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,yCAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,QAAI,2BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACxHA,IAAAC,iBAA2B;AAC3B,uBAAgC;AAChC,gBAA2C;AAapC,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MACl
B,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAcO,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAC1B,IAAM,gBAAgB;AAGtB,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAW9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,kBAAc,2BAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAGtC,UAAM,eAAe,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AAChD,UAAM,gBAAgB,UACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,UACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eACJ,aAAa,SAAS,IAClB,aAAa,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACzD;AACN,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK
,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAC5C,UAAM,eACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB,KACzD,QAAQ,UAAU,YAAY,IAC9B,SACA,kCACA,QAAQ,qBAAqB,IAC7B,SACA;AAEN,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAE7C,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA,UAAU,SAAS;AAAA,UACnB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,QAAI,2BAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,YAAY,UAAU,SAAS;AACrC,YAAM,UAAmC,EAAE,OAAO,SAAS,OAAO,UAAU;AAE5E,UAAI,uBAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AA
ClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aAC2B;AAC3B,UAAM,gBAAY,kCAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AACD,UAAM,SAAS,UAAM,wBAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;AJzUA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,QAAI,2BAAW;AAAA,UACf,iBAAa,2BAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
|
|
6
|
-
"names": ["import_zod", "LLMStepType", "import_zod", "AssertionResultStatus", "import_crypto", "message", "import_crypto", "import_crypto"]
|
|
4
|
+
"sourcesContent": ["/**\n * @wix/eval-assertions\n *\n * Assertion framework for AI agent evaluations.\n * Supports skill invocation checks, build validation, and LLM-based judging.\n */\n\n// Types\nexport {\n // Assertion schemas and types\n AssertionSchema,\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n type Assertion,\n type SkillWasCalledAssertion,\n type BuildPassedAssertion,\n type LlmJudgeAssertion,\n // Trace types\n LLMTraceSchema,\n LLMTraceStepSchema,\n LLMTraceSummarySchema,\n LLMBreakdownStatsSchema,\n TokenUsageSchema,\n LLMStepType,\n type LLMTrace,\n type LLMTraceStep,\n type LLMTraceSummary,\n type LLMBreakdownStats,\n type TokenUsage,\n // Result types\n AssertionResultSchema,\n AssertionResultStatus,\n type AssertionResult,\n // Input types\n type EvaluationInput,\n type FileDiff,\n} from \"./types/index.js\";\n\n// Evaluators\nexport {\n evaluateAssertions,\n registerEvaluator,\n getEvaluator,\n AssertionEvaluator,\n SkillWasCalledEvaluator,\n BuildPassedEvaluator,\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type AssertionContext,\n type LlmConfig,\n type LlmJudgeGenerateTextOptions,\n type JudgeResult,\n buildReadFileTool,\n} from \"./evaluators/index.js\";\n", "import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof 
SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * The judge has a readFile tool to inspect changed file contents on demand.\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. 
claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = 
z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } 
from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. 
workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n buildReadFileTool,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): 
Set<string> {\n const calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? 
`Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that 
implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? 
DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? 
String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n FileDiff,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { Tool } from \"ai\";\nimport { z } from \"zod\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n 
`type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if (step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\nexport interface ReadFileResult {\n path?: string;\n content?: string;\n error?: string;\n}\n\nconst readFileInputSchema = z.object({\n path: z.string().describe(\"Relative file path from the changed files list\"),\n});\n\ntype ReadFileInput = z.infer<typeof readFileInputSchema>;\n\n/**\n * Build a readFile tool that looks up content from in-memory file diffs.\n * Scoped to only the pre-filtered diffs (no .claude/ files).\n */\nexport function buildReadFileTool(\n fileDiffs: FileDiff[],\n): Tool<ReadFileInput, ReadFileResult> {\n return {\n description:\n \"Read the content of a changed file by its path. \" +\n \"Only files that were changed during the scenario run are available.\",\n inputSchema: readFileInputSchema,\n execute: async ({ path }) => {\n const diff = fileDiffs.find((d) => d.path === path);\n if (!diff) {\n const paths = fileDiffs.map((d) => d.path);\n const MAX_LISTED = 20;\n const preview =\n paths.length <= MAX_LISTED\n ? paths.join(\", \")\n : `${paths.slice(0, MAX_LISTED).join(\", \")} ... and ${paths.length - MAX_LISTED} more`;\n return {\n error: `File not found: ${path}. Available files: ${preview}`,\n };\n }\n return {\n path: diff.path,\n content: diff.content ?? \"(content not available)\",\n };\n },\n };\n}\n\n/** Max tool-call round trips for the judge before it must produce a verdict. 
*/\nconst MAX_JUDGE_STEPS = 15;\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\nconst DEFAULT_MODEL = \"claude-haiku-4-5-20251001\";\n\n/** Default judge context (run data + placeholders); used when 
assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of files changed (or \"No files were changed\"). Use the readFile tool to inspect the content of any file you need to evaluate.\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nYou have a readFile tool available. Use it to read the actual content of changed files before scoring.\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. 
Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * and returns a score 0-100. The real LLM call gets a readFile tool to\n * inspect changed files on demand. Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? [];\n\n const filteredDiffs = fileDiffs.filter(\n (d) => !d.path.startsWith(\".claude/\"),\n );\n\n const changedPaths = filteredDiffs.map((d) => d.path);\n const modifiedPaths = filteredDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = filteredDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles = this.formatChangedFilesPaths(changedPaths);\n\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n\n const hasCustomSystem =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\";\n const baseSystem = hasCustomSystem\n ? 
replace(assertion.systemPrompt!)\n : replace(DEFAULT_JUDGE_CONTEXT);\n\n const readFileHint =\n filteredDiffs.length > 0\n ? \"\\n\\nYou have a readFile tool available. Use it to read the actual content of changed files before scoring.\"\n : \"\";\n const systemPrompt =\n baseSystem +\n (hasCustomSystem ? readFileHint : \"\") +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n assertion.model ?? DEFAULT_MODEL,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n filteredDiffs,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? 
`Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const modelUsed = assertion.model ?? DEFAULT_MODEL;\n const details: Record<string, unknown> = {\n error: message,\n model: modelUsed,\n };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private formatChangedFilesPaths(paths: string[]): string {\n return paths.length > 0\n ? 
paths.map((p) => `- ${p}`).join(\"\\n\")\n : \"No files were changed\";\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n fileDiffs: FileDiff[],\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n\n const hasFiles = fileDiffs.length > 0;\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n ...(hasFiles && {\n tools: { readFile: buildReadFileTool(fileDiffs) },\n maxSteps: MAX_JUDGE_STEPS,\n }),\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
|
|
5
|
+
"mappings": ";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,iBAAkB;AAQX,IAAM,gCAAgC,aAAE,OAAO;AAAA,EACpD,MAAM,aAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,aAAE,OAAO;AAAA,EACjD,MAAM,aAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AAUM,IAAM,0BAA0B,aAAE,OAAO;AAAA,EAC9C,MAAM,aAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,aAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,aAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,aAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AASM,IAAM,kBAAkB,aAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC/DD,IAAAA,cAAkB;AAKX,IAAM,mBAAmB,cAAE,OAAO;AAAA,EACvC,QAAQ,cAAE,OAAO;AAAA,EACjB,YAAY,cAAE,OAAO;AAAA,EACrB,OAAO,cAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqB,cAAE,OAAO;AAAA,EACzC,IAAI,cAAE,OAAO;AAAA,EACb,YAAY,cAAE,OAAO;AAAA,EACrB,MAAM,cAAE,KAAK,WAAW;AAAA,EACxB,OAAO,cAAE,OAAO;AAAA,EAChB,UAAU,cAAE,OAAO;AAAA,EACnB,WAAW,cAAE,OAAO;AAAA,EACpB,YAAY,cAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAAS,cAAE,OAAO;AAAA,EAClB,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAc,cAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAAS,cAAE,QAAQ;AAAA,EACnB,OAAO,cAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0B,cAAE,OAAO;AAAA,EAC9C,OAAO,cAAE,OAAO;AAAA,EAChB,YAAY,cAAE,OAAO;AAAA,EACrB,QAAQ,cAAE,OAAO;AAAA,EACjB,SAAS,cAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,YAAY,cAAE,OAAO;AAAA,EACrB,iBAAiB,cAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAc,cAAE,OAAO;AAAA,EACvB,mBAAmB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1
E,gBAAgB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAY,cAAE,MAAM,cAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiB,cAAE,OAAO;AAAA,EACrC,IAAI,cAAE,OAAO;AAAA,EACb,OAAO,cAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,IAAAC,cAAkB;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,IAAI,cAAE,OAAO;AAAA,EACb,aAAa,cAAE,OAAO;AAAA,EACtB,eAAe,cAAE,OAAO;AAAA,EACxB,eAAe,cAAE,OAAO;AAAA,EACxB,QAAQ,cAAE,KAAK,qBAAqB;AAAA,EACpC,SAAS,cAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQ,cAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAAS,cAAE,OAAO,cAAE,OAAO,GAAG,cAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAe,cAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,IAAAC,iBAA2B;;;ACK3B,oBAA2B;;;ACyCpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD1CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,kBAAc,0BAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YA
AY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,QAAI,0BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,IAAAC,iBAA2B;AAC3B,2BAAyB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,kBAAc,2BAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,yCAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,QAAI,2BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACvHA,IAAAC,iBAA2B;AAC3B,uBAAgC;AAChC,gBAA2C;AAE3C,IAAAC,cAAkB;AAaX,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SA
AS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAQA,IAAM,sBAAsB,cAAE,OAAO;AAAA,EACnC,MAAM,cAAE,OAAO,EAAE,SAAS,gDAAgD;AAC5E,CAAC;AAQM,SAAS,kBACd,WACqC;AACrC,SAAO;AAAA,IACL,aACE;AAAA,IAEF,aAAa;AAAA,IACb,SAAS,OAAO,EAAE,KAAK,MAAM;AAC3B,YAAM,OAAO,UAAU,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI;AAClD,UAAI,CAAC,MAAM;AACT,cAAM,QAAQ,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AACzC,cAAM,aAAa;AACnB,cAAM,UACJ,MAAM,UAAU,aACZ,MAAM,KAAK,IAAI,IACf,GAAG,MAAM,MAAM,GAAG,UAAU,EAAE,KAAK,IAAI,CAAC,YAAY,MAAM,SAAS,UAAU;AACnF,eAAO;AAAA,UACL,OAAO,mBAAmB,IAAI,sBAAsB,OAAO;AAAA,QAC7D;AAAA,MACF;AACA,aAAO;AAAA,QACL,MAAM,KAAK;AAAA,QACX,SAAS,KAAK,WAAW;AAAA,MAC3B;AAAA,IACF;AAAA,EACF;AACF;AAGA,IAAM,kBAAkB;AAcjB,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAC1B,IAAM,gBAAgB;AAGtB,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAa9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAA
A;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,kBAAc,2BAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAEtC,UAAM,gBAAgB,UAAU;AAAA,MAC9B,CAAC,MAAM,CAAC,EAAE,KAAK,WAAW,UAAU;AAAA,IACtC;AAEA,UAAM,eAAe,cAAc,IAAI,CAAC,MAAM,EAAE,IAAI;AACpD,UAAM,gBAAgB,cACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,cACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eAAe,KAAK,wBAAwB,YAAY;AAE9D,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAE5C,UAAM,kBACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB;AAC/D,UAAM,aAAa,kBACf,QAAQ,UAAU,YAAa,IAC/B,QAAQ,qBAAqB;AAEjC,UAAM,eACJ,cAAc,SAAS,IACnB,+GACA;AACN,UAAM,eACJ,cACC,kBAAkB,eAAe,MAClC,SACA;AAEF,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAE7C,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA,UAAU,SAAS;AAAA,UACnB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,QAAI,2BAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,
IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,YAAY,UAAU,SAAS;AACrC,YAAM,UAAmC;AAAA,QACvC,OAAO;AAAA,QACP,OAAO;AAAA,MACT;AAEA,UAAI,uBAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,wBAAwB,OAAyB;AACvD,WAAO,MAAM,SAAS,IAClB,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,IACpC;AAAA,EACN;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aACA,WAC2B;AAC3B,UAAM,gBAAY,kCAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AAED,UAAM,WAAW,UAAU,SAAS;AACpC,UAAM,SAAS,UAAM,wBAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA,GAAI,YAAY;AAAA,QACd,OAAO,EAAE,UAAU,kBAAkB,SAAS,EAAE;AAAA,QAChD,UAAU;AAAA,MACZ;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;AJxZA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C
,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,QAAI,2BAAW;AAAA,UACf,iBAAa,2BAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
|
|
6
|
+
"names": ["import_zod", "LLMStepType", "import_zod", "AssertionResultStatus", "import_crypto", "message", "import_crypto", "import_crypto", "import_zod"]
|
|
7
7
|
}
|
package/build/index.mjs
CHANGED
|
@@ -270,6 +270,7 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
|
|
|
270
270
|
import { randomUUID as randomUUID3 } from "crypto";
|
|
271
271
|
import { createAnthropic } from "@ai-sdk/anthropic";
|
|
272
272
|
import { generateText, APICallError } from "ai";
|
|
273
|
+
import { z as z4 } from "zod";
|
|
273
274
|
function formatTraceForJudge(llmTrace) {
|
|
274
275
|
if (!llmTrace?.steps?.length) {
|
|
275
276
|
return "No trace available.";
|
|
@@ -297,6 +298,31 @@ function formatTraceForJudge(llmTrace) {
|
|
|
297
298
|
}
|
|
298
299
|
return lines.join("\n");
|
|
299
300
|
}
|
|
301
|
+
var readFileInputSchema = z4.object({
|
|
302
|
+
path: z4.string().describe("Relative file path from the changed files list")
|
|
303
|
+
});
|
|
304
|
+
function buildReadFileTool(fileDiffs) {
|
|
305
|
+
return {
|
|
306
|
+
description: "Read the content of a changed file by its path. Only files that were changed during the scenario run are available.",
|
|
307
|
+
inputSchema: readFileInputSchema,
|
|
308
|
+
execute: async ({ path }) => {
|
|
309
|
+
const diff = fileDiffs.find((d) => d.path === path);
|
|
310
|
+
if (!diff) {
|
|
311
|
+
const paths = fileDiffs.map((d) => d.path);
|
|
312
|
+
const MAX_LISTED = 20;
|
|
313
|
+
const preview = paths.length <= MAX_LISTED ? paths.join(", ") : `${paths.slice(0, MAX_LISTED).join(", ")} ... and ${paths.length - MAX_LISTED} more`;
|
|
314
|
+
return {
|
|
315
|
+
error: `File not found: ${path}. Available files: ${preview}`
|
|
316
|
+
};
|
|
317
|
+
}
|
|
318
|
+
return {
|
|
319
|
+
path: diff.path,
|
|
320
|
+
content: diff.content ?? "(content not available)"
|
|
321
|
+
};
|
|
322
|
+
}
|
|
323
|
+
};
|
|
324
|
+
}
|
|
325
|
+
var MAX_JUDGE_STEPS = 15;
|
|
300
326
|
function replacePlaceholders(str, ctx) {
|
|
301
327
|
return str.replace(/\{\{output\}\}/g, ctx.output).replace(/\{\{cwd\}\}/g, ctx.cwd).replace(/\{\{changedFiles\}\}/g, ctx.changedFiles).replace(/\{\{modifiedFiles\}\}/g, ctx.modifiedFiles).replace(/\{\{newFiles\}\}/g, ctx.newFiles).replace(/\{\{trace\}\}/g, ctx.trace);
|
|
302
328
|
}
|
|
@@ -336,11 +362,13 @@ var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data
|
|
|
336
362
|
|
|
337
363
|
- {{output}}: the agent's final output
|
|
338
364
|
- {{cwd}}: working directory
|
|
339
|
-
- {{changedFiles}}: list of
|
|
365
|
+
- {{changedFiles}}: list of files changed (or "No files were changed"). Use the readFile tool to inspect the content of any file you need to evaluate.
|
|
340
366
|
- {{modifiedFiles}}: list of existing files that were modified (or "No files were modified")
|
|
341
367
|
- {{newFiles}}: list of new files that were created (or "No new files were created")
|
|
342
368
|
- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times
|
|
343
369
|
|
|
370
|
+
You have a readFile tool available. Use it to read the actual content of changed files before scoring.
|
|
371
|
+
|
|
344
372
|
CRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;
|
|
345
373
|
var JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:
|
|
346
374
|
|
|
@@ -369,10 +397,13 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
369
397
|
const generateTextStub = context?.generateTextForLlmJudge;
|
|
370
398
|
const output = input.outputText ?? "";
|
|
371
399
|
const fileDiffs = input.fileDiffs ?? [];
|
|
372
|
-
const
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
const
|
|
400
|
+
const filteredDiffs = fileDiffs.filter(
|
|
401
|
+
(d) => !d.path.startsWith(".claude/")
|
|
402
|
+
);
|
|
403
|
+
const changedPaths = filteredDiffs.map((d) => d.path);
|
|
404
|
+
const modifiedPaths = filteredDiffs.filter((d) => d.status === "modified").map((d) => d.path);
|
|
405
|
+
const newPaths = filteredDiffs.filter((d) => d.status === "new").map((d) => d.path);
|
|
406
|
+
const changedFiles = this.formatChangedFilesPaths(changedPaths);
|
|
376
407
|
const modifiedFiles = modifiedPaths.length > 0 ? modifiedPaths.map((path) => `- ${path}`).join("\n") : "No files were modified";
|
|
377
408
|
const newFiles = newPaths.length > 0 ? newPaths.map((path) => `- ${path}`).join("\n") : "No new files were created";
|
|
378
409
|
const trace = formatTraceForJudge(input.llmTrace);
|
|
@@ -386,7 +417,10 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
386
417
|
};
|
|
387
418
|
const replace = (s) => replacePlaceholders(s, ctx);
|
|
388
419
|
const finalPrompt = replace(assertion.prompt);
|
|
389
|
-
const
|
|
420
|
+
const hasCustomSystem = assertion.systemPrompt != null && assertion.systemPrompt !== "";
|
|
421
|
+
const baseSystem = hasCustomSystem ? replace(assertion.systemPrompt) : replace(DEFAULT_JUDGE_CONTEXT);
|
|
422
|
+
const readFileHint = filteredDiffs.length > 0 ? "\n\nYou have a readFile tool available. Use it to read the actual content of changed files before scoring." : "";
|
|
423
|
+
const systemPrompt = baseSystem + (hasCustomSystem ? readFileHint : "") + "\n\n" + JSON_OUTPUT_FORMAT_INSTRUCTIONS;
|
|
390
424
|
const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
|
|
391
425
|
const maxOutputTokens = assertion.maxTokens ?? 1024;
|
|
392
426
|
const temperature = assertion.temperature ?? 0;
|
|
@@ -417,7 +451,8 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
417
451
|
finalPrompt,
|
|
418
452
|
systemPrompt,
|
|
419
453
|
maxOutputTokens,
|
|
420
|
-
temperature
|
|
454
|
+
temperature,
|
|
455
|
+
filteredDiffs
|
|
421
456
|
);
|
|
422
457
|
lastRawText = result.text;
|
|
423
458
|
try {
|
|
@@ -458,7 +493,10 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
458
493
|
} catch (err) {
|
|
459
494
|
const message = err instanceof Error ? err.message : String(err);
|
|
460
495
|
const modelUsed = assertion.model ?? DEFAULT_MODEL;
|
|
461
|
-
const details = {
|
|
496
|
+
const details = {
|
|
497
|
+
error: message,
|
|
498
|
+
model: modelUsed
|
|
499
|
+
};
|
|
462
500
|
if (APICallError.isInstance(err)) {
|
|
463
501
|
details.statusCode = err.statusCode;
|
|
464
502
|
details.url = err.url;
|
|
@@ -477,16 +515,24 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
|
|
|
477
515
|
};
|
|
478
516
|
}
|
|
479
517
|
}
|
|
480
|
-
|
|
518
|
+
formatChangedFilesPaths(paths) {
|
|
519
|
+
return paths.length > 0 ? paths.map((p) => `- ${p}`).join("\n") : "No files were changed";
|
|
520
|
+
}
|
|
521
|
+
async callGenerateText(llmConfig, modelId, prompt, system, maxOutputTokens, temperature, fileDiffs) {
|
|
481
522
|
const anthropic = createAnthropic({
|
|
482
523
|
baseURL: llmConfig.baseUrl,
|
|
483
524
|
apiKey: "dummy",
|
|
484
525
|
headers: llmConfig.headers
|
|
485
526
|
});
|
|
527
|
+
const hasFiles = fileDiffs.length > 0;
|
|
486
528
|
const result = await generateText({
|
|
487
529
|
model: anthropic(modelId),
|
|
488
530
|
prompt,
|
|
489
531
|
system,
|
|
532
|
+
...hasFiles && {
|
|
533
|
+
tools: { readFile: buildReadFileTool(fileDiffs) },
|
|
534
|
+
maxSteps: MAX_JUDGE_STEPS
|
|
535
|
+
},
|
|
490
536
|
maxOutputTokens,
|
|
491
537
|
temperature
|
|
492
538
|
});
|
|
@@ -551,6 +597,7 @@ export {
|
|
|
551
597
|
SkillWasCalledAssertionSchema,
|
|
552
598
|
SkillWasCalledEvaluator,
|
|
553
599
|
TokenUsageSchema,
|
|
600
|
+
buildReadFileTool,
|
|
554
601
|
evaluateAssertions,
|
|
555
602
|
formatTraceForJudge,
|
|
556
603
|
getEvaluator,
|
package/build/index.mjs.map
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../src/types/assertions.ts", "../src/types/trace.ts", "../src/types/result.ts", "../src/evaluators/index.ts", "../src/evaluators/skill-was-called-evaluator.ts", "../src/evaluators/assertion-evaluator.ts", "../src/evaluators/build-passed-evaluator.ts", "../src/evaluators/llm-judge-evaluator.ts"],
|
|
4
|
-
"sourcesContent": ["import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. 
claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = 
z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } 
from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. 
workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): Set<string> {\n const 
calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? 
`Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that 
implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? 
DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? 
String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n `type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if 
(step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? 
match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\nconst DEFAULT_MODEL = \"claude-haiku-4-5-20251001\";\n\n/** Default judge context (run data + placeholders); used when assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of all files changed (or \"No files were changed\")\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. 
Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.\n * Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? 
[];\n\n // Compute file lists by status\n const changedPaths = fileDiffs.map((d) => d.path);\n const modifiedPaths = fileDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = fileDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles =\n changedPaths.length > 0\n ? changedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were changed\";\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n const systemPrompt =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\"\n ? replace(assertion.systemPrompt) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS\n : replace(DEFAULT_JUDGE_CONTEXT) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? 
await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n assertion.model ?? DEFAULT_MODEL,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const modelUsed = assertion.model ?? DEFAULT_MODEL;\n const details: Record<string, unknown> = { error: message, model: modelUsed };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? 
err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
|
|
5
|
-
"mappings": ";AAAA,SAAS,SAAS;AAQX,IAAM,gCAAgC,EAAE,OAAO;AAAA,EACpD,MAAM,EAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,EAAE,OAAO;AAAA,EACjD,MAAM,EAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AASM,IAAM,0BAA0B,EAAE,OAAO;AAAA,EAC9C,MAAM,EAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,EAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,EAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AASM,IAAM,kBAAkB,EAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC9DD,SAAS,KAAAA,UAAS;AAKX,IAAM,mBAAmBA,GAAE,OAAO;AAAA,EACvC,QAAQA,GAAE,OAAO;AAAA,EACjB,YAAYA,GAAE,OAAO;AAAA,EACrB,OAAOA,GAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqBD,GAAE,OAAO;AAAA,EACzC,IAAIA,GAAE,OAAO;AAAA,EACb,YAAYA,GAAE,OAAO;AAAA,EACrB,MAAMA,GAAE,KAAK,WAAW;AAAA,EACxB,OAAOA,GAAE,OAAO;AAAA,EAChB,UAAUA,GAAE,OAAO;AAAA,EACnB,WAAWA,GAAE,OAAO;AAAA,EACpB,YAAYA,GAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAASA,GAAE,OAAO;AAAA,EAClB,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAcA,GAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAASA,GAAE,QAAQ;AAAA,EACnB,OAAOA,GAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,OAAOA,GAAE,OAAO;AAAA,EAChB,YAAYA,GAAE,OAAO;AAAA,EACrB,QAAQA,GAAE,OAAO;AAAA,EACjB,SAASA,GAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwBA,GAAE,OAAO;AAAA,EAC5C,YAAYA,GAAE,OAAO;AAAA,EACrB,iBAAiBA,GAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAcA,GAAE,OAAO;AAAA,EACvB,mBAAmBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBAAgBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAYA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiBA,GAAE,OAAO
;AAAA,EACrC,IAAIA,GAAE,OAAO;AAAA,EACb,OAAOA,GAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,SAAS,KAAAE,UAAS;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,IAAIA,GAAE,OAAO;AAAA,EACb,aAAaA,GAAE,OAAO;AAAA,EACtB,eAAeA,GAAE,OAAO;AAAA,EACxB,eAAeA,GAAE,OAAO;AAAA,EACxB,QAAQA,GAAE,KAAK,qBAAqB;AAAA,EACpC,SAASA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAASA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAeA,GAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,SAAS,cAAAC,mBAAkB;;;ACK3B,SAAS,kBAAkB;;;ACyCpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD1CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,cAAc,WAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,IAAI,WAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MA
Cf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,gBAAgB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,cAAcC,YAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,eAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,IAAIA,YAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACxHA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,cAAc,oBAAoB;AAapC,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KA
AK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAcO,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAC1B,IAAM,gBAAgB;AAGtB,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAW9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,cAAcC,YAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAGtC,UAAM,eAAe,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AAChD,UAAM,gBAAgB,UACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,UACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eACJ,aAAa,SAAS,IAClB,aAAa,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACzD;AACN,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,
SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAC5C,UAAM,eACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB,KACzD,QAAQ,UAAU,YAAY,IAC9B,SACA,kCACA,QAAQ,qBAAqB,IAC7B,SACA;AAEN,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAE7C,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA,UAAU,SAAS;AAAA,UACnB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,IAAIA,YAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,YAAY,UAAU,SAAS;AACrC,YAAM,UAAmC,EAAE,OAAO,SAAS,OAAO,UAAU;AAE5E,UAAI,aAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAA
I,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aAC2B;AAC3B,UAAM,YAAY,gBAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AACD,UAAM,SAAS,MAAM,aAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;AJzUA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,IAAIC,YAAW;AAAA,UACf,aAAaA,YAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
|
|
6
|
-
"names": ["z", "LLMStepType", "z", "AssertionResultStatus", "z", "randomUUID", "message", "randomUUID", "randomUUID", "randomUUID", "randomUUID", "randomUUID"]
|
|
4
|
+
"sourcesContent": ["import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * The judge has a readFile tool to inspect changed file contents on demand.\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge 
(e.g. claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = 
z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } 
from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. 
workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n buildReadFileTool,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): 
Set<string> {\n const calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? 
`Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that 
implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? 
DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? 
String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n FileDiff,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { Tool } from \"ai\";\nimport { z } from \"zod\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n 
`type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if (step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\nexport interface ReadFileResult {\n path?: string;\n content?: string;\n error?: string;\n}\n\nconst readFileInputSchema = z.object({\n path: z.string().describe(\"Relative file path from the changed files list\"),\n});\n\ntype ReadFileInput = z.infer<typeof readFileInputSchema>;\n\n/**\n * Build a readFile tool that looks up content from in-memory file diffs.\n * Scoped to only the pre-filtered diffs (no .claude/ files).\n */\nexport function buildReadFileTool(\n fileDiffs: FileDiff[],\n): Tool<ReadFileInput, ReadFileResult> {\n return {\n description:\n \"Read the content of a changed file by its path. \" +\n \"Only files that were changed during the scenario run are available.\",\n inputSchema: readFileInputSchema,\n execute: async ({ path }) => {\n const diff = fileDiffs.find((d) => d.path === path);\n if (!diff) {\n const paths = fileDiffs.map((d) => d.path);\n const MAX_LISTED = 20;\n const preview =\n paths.length <= MAX_LISTED\n ? paths.join(\", \")\n : `${paths.slice(0, MAX_LISTED).join(\", \")} ... and ${paths.length - MAX_LISTED} more`;\n return {\n error: `File not found: ${path}. Available files: ${preview}`,\n };\n }\n return {\n path: diff.path,\n content: diff.content ?? \"(content not available)\",\n };\n },\n };\n}\n\n/** Max tool-call round trips for the judge before it must produce a verdict. 
*/\nconst MAX_JUDGE_STEPS = 15;\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\nconst DEFAULT_MODEL = \"claude-haiku-4-5-20251001\";\n\n/** Default judge context (run data + placeholders); used when 
assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of files changed (or \"No files were changed\"). Use the readFile tool to inspect the content of any file you need to evaluate.\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nYou have a readFile tool available. Use it to read the actual content of changed files before scoring.\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. 
Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * and returns a score 0-100. The real LLM call gets a readFile tool to\n * inspect changed files on demand. Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? [];\n\n const filteredDiffs = fileDiffs.filter(\n (d) => !d.path.startsWith(\".claude/\"),\n );\n\n const changedPaths = filteredDiffs.map((d) => d.path);\n const modifiedPaths = filteredDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = filteredDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles = this.formatChangedFilesPaths(changedPaths);\n\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n\n const hasCustomSystem =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\";\n const baseSystem = hasCustomSystem\n ? 
replace(assertion.systemPrompt!)\n : replace(DEFAULT_JUDGE_CONTEXT);\n\n const readFileHint =\n filteredDiffs.length > 0\n ? \"\\n\\nYou have a readFile tool available. Use it to read the actual content of changed files before scoring.\"\n : \"\";\n const systemPrompt =\n baseSystem +\n (hasCustomSystem ? readFileHint : \"\") +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n assertion.model ?? DEFAULT_MODEL,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n filteredDiffs,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? 
`Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const modelUsed = assertion.model ?? DEFAULT_MODEL;\n const details: Record<string, unknown> = {\n error: message,\n model: modelUsed,\n };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private formatChangedFilesPaths(paths: string[]): string {\n return paths.length > 0\n ? 
paths.map((p) => `- ${p}`).join(\"\\n\")\n : \"No files were changed\";\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n fileDiffs: FileDiff[],\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n\n const hasFiles = fileDiffs.length > 0;\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n ...(hasFiles && {\n tools: { readFile: buildReadFileTool(fileDiffs) },\n maxSteps: MAX_JUDGE_STEPS,\n }),\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
|
|
5
|
+
"mappings": ";AAAA,SAAS,SAAS;AAQX,IAAM,gCAAgC,EAAE,OAAO;AAAA,EACpD,MAAM,EAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,EAAE,OAAO;AAAA,EACjD,MAAM,EAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AAUM,IAAM,0BAA0B,EAAE,OAAO;AAAA,EAC9C,MAAM,EAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,EAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,EAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AASM,IAAM,kBAAkB,EAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC/DD,SAAS,KAAAA,UAAS;AAKX,IAAM,mBAAmBA,GAAE,OAAO;AAAA,EACvC,QAAQA,GAAE,OAAO;AAAA,EACjB,YAAYA,GAAE,OAAO;AAAA,EACrB,OAAOA,GAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqBD,GAAE,OAAO;AAAA,EACzC,IAAIA,GAAE,OAAO;AAAA,EACb,YAAYA,GAAE,OAAO;AAAA,EACrB,MAAMA,GAAE,KAAK,WAAW;AAAA,EACxB,OAAOA,GAAE,OAAO;AAAA,EAChB,UAAUA,GAAE,OAAO;AAAA,EACnB,WAAWA,GAAE,OAAO;AAAA,EACpB,YAAYA,GAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAASA,GAAE,OAAO;AAAA,EAClB,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAcA,GAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAASA,GAAE,QAAQ;AAAA,EACnB,OAAOA,GAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,OAAOA,GAAE,OAAO;AAAA,EAChB,YAAYA,GAAE,OAAO;AAAA,EACrB,QAAQA,GAAE,OAAO;AAAA,EACjB,SAASA,GAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwBA,GAAE,OAAO;AAAA,EAC5C,YAAYA,GAAE,OAAO;AAAA,EACrB,iBAAiBA,GAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAcA,GAAE,OAAO;AAAA,EACvB,mBAAmBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBAAgBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAYA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiBA,GAAE,OAAO
;AAAA,EACrC,IAAIA,GAAE,OAAO;AAAA,EACb,OAAOA,GAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,SAAS,KAAAE,UAAS;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,IAAIA,GAAE,OAAO;AAAA,EACb,aAAaA,GAAE,OAAO;AAAA,EACtB,eAAeA,GAAE,OAAO;AAAA,EACxB,eAAeA,GAAE,OAAO;AAAA,EACxB,QAAQA,GAAE,KAAK,qBAAqB;AAAA,EACpC,SAASA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAASA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAeA,GAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,SAAS,cAAAC,mBAAkB;;;ACK3B,SAAS,kBAAkB;;;ACyCpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD1CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,cAAc,WAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,IAAI,WAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MA
Cf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,gBAAgB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,cAAcC,YAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,eAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,IAAIA,YAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACvHA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,cAAc,oBAAoB;AAE3C,SAAS,KAAAC,UAAS;AAaX,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KA
AK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAQA,IAAM,sBAAsBC,GAAE,OAAO;AAAA,EACnC,MAAMA,GAAE,OAAO,EAAE,SAAS,gDAAgD;AAC5E,CAAC;AAQM,SAAS,kBACd,WACqC;AACrC,SAAO;AAAA,IACL,aACE;AAAA,IAEF,aAAa;AAAA,IACb,SAAS,OAAO,EAAE,KAAK,MAAM;AAC3B,YAAM,OAAO,UAAU,KAAK,CAAC,MAAM,EAAE,SAAS,IAAI;AAClD,UAAI,CAAC,MAAM;AACT,cAAM,QAAQ,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AACzC,cAAM,aAAa;AACnB,cAAM,UACJ,MAAM,UAAU,aACZ,MAAM,KAAK,IAAI,IACf,GAAG,MAAM,MAAM,GAAG,UAAU,EAAE,KAAK,IAAI,CAAC,YAAY,MAAM,SAAS,UAAU;AACnF,eAAO;AAAA,UACL,OAAO,mBAAmB,IAAI,sBAAsB,OAAO;AAAA,QAC7D;AAAA,MACF;AACA,aAAO;AAAA,QACL,MAAM,KAAK;AAAA,QACX,SAAS,KAAK,WAAW;AAAA,MAC3B;AAAA,IACF;AAAA,EACF;AACF;AAGA,IAAM,kBAAkB;AAcjB,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAC1B,IAAM,gBAAgB;AAGtB,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAa9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,o
BAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,cAAcC,YAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAEtC,UAAM,gBAAgB,UAAU;AAAA,MAC9B,CAAC,MAAM,CAAC,EAAE,KAAK,WAAW,UAAU;AAAA,IACtC;AAEA,UAAM,eAAe,cAAc,IAAI,CAAC,MAAM,EAAE,IAAI;AACpD,UAAM,gBAAgB,cACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,cACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eAAe,KAAK,wBAAwB,YAAY;AAE9D,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAE5C,UAAM,kBACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB;AAC/D,UAAM,aAAa,kBACf,QAAQ,UAAU,YAAa,IAC/B,QAAQ,qBAAqB;AAEjC,UAAM,eACJ,cAAc,SAAS,IACnB,+GACA;AACN,UAAM,eACJ,cACC,kBAAkB,eAAe,MAClC,SACA;AAEF,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAE7C,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA,UAAU,SAAS;AAAA,UACnB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,IAAIA,YAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE
,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,YAAY,UAAU,SAAS;AACrC,YAAM,UAAmC;AAAA,QACvC,OAAO;AAAA,QACP,OAAO;AAAA,MACT;AAEA,UAAI,aAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEQ,wBAAwB,OAAyB;AACvD,WAAO,MAAM,SAAS,IAClB,MAAM,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,EAAE,KAAK,IAAI,IACpC;AAAA,EACN;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aACA,WAC2B;AAC3B,UAAM,YAAY,gBAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AAED,UAAM,WAAW,UAAU,SAAS;AACpC,UAAM,SAAS,MAAM,aAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA,GAAI,YAAY;AAAA,QACd,OAAO,EAAE,UAAU,kBAAkB,SAAS,EAAE;AAAA,QAChD,UAAU;AAAA,MACZ;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;AJxZA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,IAAIC,YAAW;AAAA,UACf,aAAaA,Y
AAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
|
|
6
|
+
"names": ["z", "LLMStepType", "z", "AssertionResultStatus", "z", "randomUUID", "message", "randomUUID", "randomUUID", "randomUUID", "z", "z", "randomUUID", "randomUUID"]
|
|
7
7
|
}
|
|
@@ -29,4 +29,4 @@ export { AssertionEvaluator } from "./assertion-evaluator.js";
|
|
|
29
29
|
export type { AssertionContext, LlmConfig, LlmJudgeGenerateTextOptions, } from "./assertion-evaluator.js";
|
|
30
30
|
export { SkillWasCalledEvaluator } from "./skill-was-called-evaluator.js";
|
|
31
31
|
export { BuildPassedEvaluator } from "./build-passed-evaluator.js";
|
|
32
|
-
export { LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, } from "./llm-judge-evaluator.js";
|
|
32
|
+
export { LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, buildReadFileTool, } from "./llm-judge-evaluator.js";
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import type { LlmJudgeAssertion, AssertionResult, LLMTrace, EvaluationInput } from "../types/index.js";
|
|
1
|
+
import type { LlmJudgeAssertion, AssertionResult, LLMTrace, EvaluationInput, FileDiff } from "../types/index.js";
|
|
2
|
+
import type { Tool } from "ai";
|
|
3
|
+
import { z } from "zod";
|
|
2
4
|
import type { AssertionContext } from "./assertion-evaluator.js";
|
|
3
5
|
import { AssertionEvaluator } from "./assertion-evaluator.js";
|
|
4
6
|
export interface JudgeResult {
|
|
@@ -10,6 +12,20 @@ export interface JudgeResult {
|
|
|
10
12
|
* Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).
|
|
11
13
|
*/
|
|
12
14
|
export declare function formatTraceForJudge(llmTrace: LLMTrace | undefined): string;
|
|
15
|
+
export interface ReadFileResult {
|
|
16
|
+
path?: string;
|
|
17
|
+
content?: string;
|
|
18
|
+
error?: string;
|
|
19
|
+
}
|
|
20
|
+
declare const readFileInputSchema: z.ZodObject<{
|
|
21
|
+
path: z.ZodString;
|
|
22
|
+
}, z.core.$strip>;
|
|
23
|
+
type ReadFileInput = z.infer<typeof readFileInputSchema>;
|
|
24
|
+
/**
|
|
25
|
+
* Build a readFile tool that looks up content from in-memory file diffs.
|
|
26
|
+
* Scoped to only the pre-filtered diffs (no .claude/ files).
|
|
27
|
+
*/
|
|
28
|
+
export declare function buildReadFileTool(fileDiffs: FileDiff[]): Tool<ReadFileInput, ReadFileResult>;
|
|
13
29
|
/**
|
|
14
30
|
* Context object for placeholder replacement.
|
|
15
31
|
*/
|
|
@@ -30,11 +46,13 @@ export declare function stripMarkdownCodeBlock(text: string): string;
|
|
|
30
46
|
export declare function validateJudgeResult(parsed: unknown): JudgeResult;
|
|
31
47
|
/**
|
|
32
48
|
* Evaluator for "llm_judge" assertion: an LLM judges the scenario output
|
|
33
|
-
*
|
|
34
|
-
* Passes if score >= minScore.
|
|
49
|
+
* and returns a score 0-100. The real LLM call gets a readFile tool to
|
|
50
|
+
* inspect changed files on demand. Passes if score >= minScore.
|
|
35
51
|
*/
|
|
36
52
|
export declare class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {
|
|
37
53
|
readonly type: "llm_judge";
|
|
38
54
|
evaluate(assertion: LlmJudgeAssertion, input: EvaluationInput, context?: AssertionContext): Promise<AssertionResult>;
|
|
55
|
+
private formatChangedFilesPaths;
|
|
39
56
|
private callGenerateText;
|
|
40
57
|
}
|
|
58
|
+
export {};
|
package/build/types/index.d.ts
CHANGED
|
@@ -5,4 +5,4 @@
|
|
|
5
5
|
* Supports skill invocation checks, build validation, and LLM-based judging.
|
|
6
6
|
*/
|
|
7
7
|
export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
|
|
8
|
-
export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, } from "./evaluators/index.js";
|
|
8
|
+
export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, buildReadFileTool, } from "./evaluators/index.js";
|
|
@@ -22,6 +22,7 @@ export declare const BuildPassedAssertionSchema: z.ZodObject<{
|
|
|
22
22
|
export type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;
|
|
23
23
|
/**
|
|
24
24
|
* Assertion: an LLM judges the scenario output (score 0-100).
|
|
25
|
+
* The judge has a readFile tool to inspect changed file contents on demand.
|
|
25
26
|
* Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.
|
|
26
27
|
* Passes if judge score >= minScore.
|
|
27
28
|
*/
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wix/eval-assertions",
|
|
3
|
-
"version": "0.12.0",
|
|
3
|
+
"version": "0.13.0",
|
|
4
4
|
"description": "Assertion framework for AI agent evaluations - supports skill invocation checks, build validation, and LLM-based judging",
|
|
5
5
|
"files": [
|
|
6
6
|
"build"
|
|
@@ -60,5 +60,5 @@
|
|
|
60
60
|
],
|
|
61
61
|
"license": "MIT",
|
|
62
62
|
"author": "Wix",
|
|
63
|
-
"falconPackageHash": "
|
|
63
|
+
"falconPackageHash": "c3b2ca9dc605b30db0161d25ae6b79b55cf54f33f9217b0cad10393c"
|
|
64
64
|
}
|