@wix/eval-assertions 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -35,6 +35,8 @@ __export(index_exports, {
35
35
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
36
  SkillWasCalledAssertionSchema: () => SkillWasCalledAssertionSchema,
37
37
  SkillWasCalledEvaluator: () => SkillWasCalledEvaluator,
38
+ TimeAssertionSchema: () => TimeAssertionSchema,
39
+ TimeEvaluator: () => TimeEvaluator,
38
40
  TokenUsageSchema: () => TokenUsageSchema,
39
41
  evaluateAssertions: () => evaluateAssertions,
40
42
  formatTraceForJudge: () => formatTraceForJudge,
@@ -73,9 +75,15 @@ var LlmJudgeAssertionSchema = import_zod.z.object({
73
75
  maxTokens: import_zod.z.number().int().optional(),
74
76
  temperature: import_zod.z.number().min(0).max(1).optional()
75
77
  });
78
+ var TimeAssertionSchema = import_zod.z.object({
79
+ type: import_zod.z.literal("time_limit"),
80
+ /** Maximum allowed duration in milliseconds */
81
+ maxDurationMs: import_zod.z.number().int().positive()
82
+ });
76
83
  var AssertionSchema = import_zod.z.union([
77
84
  SkillWasCalledAssertionSchema,
78
85
  BuildPassedAssertionSchema,
86
+ TimeAssertionSchema,
79
87
  LlmJudgeAssertionSchema
80
88
  ]);
81
89
 
@@ -155,7 +163,7 @@ var AssertionResultSchema = import_zod3.z.object({
155
163
  });
156
164
 
157
165
  // src/evaluators/index.ts
158
- var import_crypto4 = require("crypto");
166
+ var import_crypto5 = require("crypto");
159
167
 
160
168
  // src/evaluators/skill-was-called-evaluator.ts
161
169
  var import_crypto = require("crypto");
@@ -314,8 +322,41 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
314
322
  }
315
323
  };
316
324
 
317
- // src/evaluators/llm-judge-evaluator.ts
325
+ // src/evaluators/time-evaluator.ts
318
326
  var import_crypto3 = require("crypto");
327
+ var TimeEvaluator = class extends AssertionEvaluator {
328
+ type = "time_limit";
329
+ evaluate(assertion, input) {
330
+ const maxDurationMs = assertion.maxDurationMs;
331
+ if (input.durationMs == null) {
332
+ return this.createResult({
333
+ status: "failed" /* FAILED */,
334
+ message: "No duration data available for time assertion",
335
+ expected: `<= ${maxDurationMs}ms`
336
+ });
337
+ }
338
+ const passed = input.durationMs <= maxDurationMs;
339
+ return this.createResult({
340
+ status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
341
+ message: passed ? `Completed in ${input.durationMs}ms (limit: ${maxDurationMs}ms)` : `Exceeded time limit: ${input.durationMs}ms > ${maxDurationMs}ms`,
342
+ expected: `<= ${maxDurationMs}ms`,
343
+ actual: `${input.durationMs}ms`
344
+ });
345
+ }
346
+ createResult(fields) {
347
+ return {
348
+ id: (0, import_crypto3.randomUUID)(),
349
+ assertionId: (0, import_crypto3.randomUUID)(),
350
+ assertionType: "time_limit",
351
+ assertionName: "Time limit",
352
+ status: "failed" /* FAILED */,
353
+ ...fields
354
+ };
355
+ }
356
+ };
357
+
358
+ // src/evaluators/llm-judge-evaluator.ts
359
+ var import_crypto4 = require("crypto");
319
360
  var import_anthropic = require("@ai-sdk/anthropic");
320
361
  var import_ai = require("ai");
321
362
  function formatTraceForJudge(llmTrace) {
@@ -379,7 +420,6 @@ function validateJudgeResult(parsed) {
379
420
  };
380
421
  }
381
422
  var DEFAULT_MIN_SCORE = 70;
382
- var DEFAULT_MODEL = "claude-haiku-4-5-20251001";
383
423
  var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:
384
424
 
385
425
  - {{output}}: the agent's final output
@@ -411,7 +451,7 @@ Any response that includes extra content or deviates from the specified format w
411
451
  var LlmJudgeEvaluator = class extends AssertionEvaluator {
412
452
  type = "llm_judge";
413
453
  async evaluate(assertion, input, context) {
414
- const assertionId = (0, import_crypto3.randomUUID)();
454
+ const assertionId = (0, import_crypto4.randomUUID)();
415
455
  const llmConfig = context?.llmConfig;
416
456
  const workDir = context?.workDir ?? "";
417
457
  const generateTextStub = context?.generateTextForLlmJudge;
@@ -438,9 +478,21 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
438
478
  const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
439
479
  const maxOutputTokens = assertion.maxTokens ?? 1024;
440
480
  const temperature = assertion.temperature ?? 0;
481
+ const modelUsed = assertion.model ?? context?.defaultJudgeModel;
482
+ if (!modelUsed && !generateTextStub) {
483
+ return {
484
+ id: (0, import_crypto4.randomUUID)(),
485
+ assertionId,
486
+ assertionType: "llm_judge",
487
+ assertionName: "LLM judge",
488
+ status: "failed" /* FAILED */,
489
+ message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
490
+ expected: String(minScore)
491
+ };
492
+ }
441
493
  if (!generateTextStub && !llmConfig) {
442
494
  return {
443
- id: (0, import_crypto3.randomUUID)(),
495
+ id: (0, import_crypto4.randomUUID)(),
444
496
  assertionId,
445
497
  assertionType: "llm_judge",
446
498
  assertionName: "LLM judge",
@@ -461,7 +513,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
461
513
  temperature
462
514
  }) : await this.callGenerateText(
463
515
  llmConfig,
464
- assertion.model ?? DEFAULT_MODEL,
516
+ modelUsed,
465
517
  finalPrompt,
466
518
  systemPrompt,
467
519
  maxOutputTokens,
@@ -474,7 +526,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
474
526
  const judgeResult = validateJudgeResult(parsed);
475
527
  const passed = judgeResult.score >= minScore;
476
528
  return {
477
- id: (0, import_crypto3.randomUUID)(),
529
+ id: (0, import_crypto4.randomUUID)(),
478
530
  assertionId,
479
531
  assertionType: "llm_judge",
480
532
  assertionName: "LLM judge",
@@ -493,7 +545,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
493
545
  }
494
546
  }
495
547
  return {
496
- id: (0, import_crypto3.randomUUID)(),
548
+ id: (0, import_crypto4.randomUUID)(),
497
549
  assertionId,
498
550
  assertionType: "llm_judge",
499
551
  assertionName: "LLM judge",
@@ -505,8 +557,10 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
505
557
  };
506
558
  } catch (err) {
507
559
  const message = err instanceof Error ? err.message : String(err);
508
- const modelUsed = assertion.model ?? DEFAULT_MODEL;
509
- const details = { error: message, model: modelUsed };
560
+ const details = {
561
+ error: message,
562
+ model: modelUsed
563
+ };
510
564
  if (import_ai.APICallError.isInstance(err)) {
511
565
  details.statusCode = err.statusCode;
512
566
  details.url = err.url;
@@ -514,7 +568,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
514
568
  details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
515
569
  }
516
570
  return {
517
- id: (0, import_crypto3.randomUUID)(),
571
+ id: (0, import_crypto4.randomUUID)(),
518
572
  assertionId,
519
573
  assertionType: "llm_judge",
520
574
  assertionName: "LLM judge",
@@ -547,6 +601,7 @@ var llmJudgeEvaluator = new LlmJudgeEvaluator();
547
601
  var evaluators = {
548
602
  skill_was_called: new SkillWasCalledEvaluator(),
549
603
  build_passed: new BuildPassedEvaluator(),
604
+ time_limit: new TimeEvaluator(),
550
605
  llm_judge: llmJudgeEvaluator,
551
606
  // Custom assertions use the same LLM-based evaluation as llm_judge
552
607
  custom: llmJudgeEvaluator
@@ -566,8 +621,8 @@ async function evaluateAssertions(input, assertions, context) {
566
621
  const evaluator = evaluators[assertion.type];
567
622
  if (!evaluator) {
568
623
  return {
569
- id: (0, import_crypto4.randomUUID)(),
570
- assertionId: (0, import_crypto4.randomUUID)(),
624
+ id: (0, import_crypto5.randomUUID)(),
625
+ assertionId: (0, import_crypto5.randomUUID)(),
571
626
  assertionType: assertion.type,
572
627
  assertionName: "Unknown assertion",
573
628
  status: "error" /* ERROR */,
@@ -599,6 +654,8 @@ async function evaluateAssertions(input, assertions, context) {
599
654
  LlmJudgeEvaluator,
600
655
  SkillWasCalledAssertionSchema,
601
656
  SkillWasCalledEvaluator,
657
+ TimeAssertionSchema,
658
+ TimeEvaluator,
602
659
  TokenUsageSchema,
603
660
  evaluateAssertions,
604
661
  formatTraceForJudge,
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "version": 3,
3
- "sources": ["../src/index.ts", "../src/types/assertions.ts", "../src/types/trace.ts", "../src/types/result.ts", "../src/evaluators/index.ts", "../src/evaluators/skill-was-called-evaluator.ts", "../src/evaluators/assertion-evaluator.ts", "../src/evaluators/build-passed-evaluator.ts", "../src/evaluators/llm-judge-evaluator.ts"],
4
- "sourcesContent": ["/**\n * @wix/eval-assertions\n *\n * Assertion framework for AI agent evaluations.\n * Supports skill invocation checks, build validation, and LLM-based judging.\n */\n\n// Types\nexport {\n // Assertion schemas and types\n AssertionSchema,\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n type Assertion,\n type SkillWasCalledAssertion,\n type BuildPassedAssertion,\n type LlmJudgeAssertion,\n // Trace types\n LLMTraceSchema,\n LLMTraceStepSchema,\n LLMTraceSummarySchema,\n LLMBreakdownStatsSchema,\n TokenUsageSchema,\n LLMStepType,\n type LLMTrace,\n type LLMTraceStep,\n type LLMTraceSummary,\n type LLMBreakdownStats,\n type TokenUsage,\n // Result types\n AssertionResultSchema,\n AssertionResultStatus,\n type AssertionResult,\n // Input types\n type EvaluationInput,\n type FileDiff,\n} from \"./types/index.js\";\n\n// Evaluators\nexport {\n evaluateAssertions,\n registerEvaluator,\n getEvaluator,\n AssertionEvaluator,\n SkillWasCalledEvaluator,\n BuildPassedEvaluator,\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type AssertionContext,\n type LlmConfig,\n type LlmJudgeGenerateTextOptions,\n type JudgeResult,\n} from \"./evaluators/index.js\";\n", "import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): Set<string> {\n const calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? `Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n `type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if (step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\nconst DEFAULT_MODEL = \"claude-haiku-4-5-20251001\";\n\n/** Default judge context (run data + placeholders); used when assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of all files changed (or \"No files were changed\")\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.\n * Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? [];\n\n // Compute file lists by status\n const changedPaths = fileDiffs.map((d) => d.path);\n const modifiedPaths = fileDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = fileDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles =\n changedPaths.length > 0\n ? changedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were changed\";\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n const systemPrompt =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\"\n ? replace(assertion.systemPrompt) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS\n : replace(DEFAULT_JUDGE_CONTEXT) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n assertion.model ?? DEFAULT_MODEL,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const modelUsed = assertion.model ?? DEFAULT_MODEL;\n const details: Record<string, unknown> = { error: message, model: modelUsed };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
5
- "mappings": ";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,iBAAkB;AAQX,IAAM,gCAAgC,aAAE,OAAO;AAAA,EACpD,MAAM,aAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,aAAE,OAAO;AAAA,EACjD,MAAM,aAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AASM,IAAM,0BAA0B,aAAE,OAAO;AAAA,EAC9C,MAAM,aAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,aAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,aAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,aAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AASM,IAAM,kBAAkB,aAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC9DD,IAAAA,cAAkB;AAKX,IAAM,mBAAmB,cAAE,OAAO;AAAA,EACvC,QAAQ,cAAE,OAAO;AAAA,EACjB,YAAY,cAAE,OAAO;AAAA,EACrB,OAAO,cAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqB,cAAE,OAAO;AAAA,EACzC,IAAI,cAAE,OAAO;AAAA,EACb,YAAY,cAAE,OAAO;AAAA,EACrB,MAAM,cAAE,KAAK,WAAW;AAAA,EACxB,OAAO,cAAE,OAAO;AAAA,EAChB,UAAU,cAAE,OAAO;AAAA,EACnB,WAAW,cAAE,OAAO;AAAA,EACpB,YAAY,cAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAAS,cAAE,OAAO;AAAA,EAClB,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAc,cAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAAS,cAAE,QAAQ;AAAA,EACnB,OAAO,cAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0B,cAAE,OAAO;AAAA,EAC9C,OAAO,cAAE,OAAO;AAAA,EAChB,YAAY,cAAE,OAAO;AAAA,EACrB,QAAQ,cAAE,OAAO;AAAA,EACjB,SAAS,cAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,YAAY,cAAE,OAAO;AAAA,EACrB,iBAAiB,cAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAc,cAAE,OAAO;AAAA,EACvB,mBAAmB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBAAgB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAY,cAAE,MAAM,cAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiB,cAAE,OAAO;AAAA,EACrC,IAAI,cAAE,OAAO;AAAA,EACb,OAAO,cAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,IAAAC,cAAkB;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,IAAI,cAAE,OAAO;AAAA,EACb,aAAa,cAAE,OAAO;AAAA,EACtB,eAAe,cAAE,OAAO;AAAA,EACxB,eAAe,cAAE,OAAO;AAAA,EACxB,QAAQ,cAAE,KAAK,qBAAqB;AAAA,EACpC,SAAS,cAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQ,cAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAAS,cAAE,OAAO,cAAE,OAAO,GAAG,cAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAe,cAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,IAAAC,iBAA2B;;;ACK3B,oBAA2B;;;ACyCpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD1CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,kBAAc,0BAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,QAAI,0BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,IAAAC,iBAA2B;AAC3B,2BAAyB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,kBAAc,2BAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,yCAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,QAAI,2BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACxHA,IAAAC,iBAA2B;AAC3B,uBAAgC;AAChC,gBAA2C;AAapC,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAcO,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAC1B,IAAM,gBAAgB;AAGtB,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAW9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,kBAAc,2BAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAGtC,UAAM,eAAe,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AAChD,UAAM,gBAAgB,UACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,UACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eACJ,aAAa,SAAS,IAClB,aAAa,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACzD;AACN,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAC5C,UAAM,eACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB,KACzD,QAAQ,UAAU,YAAY,IAC9B,SACA,kCACA,QAAQ,qBAAqB,IAC7B,SACA;AAEN,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAE7C,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA,UAAU,SAAS;AAAA,UACnB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,QAAI,2BAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,YAAY,UAAU,SAAS;AACrC,YAAM,UAAmC,EAAE,OAAO,SAAS,OAAO,UAAU;AAE5E,UAAI,uBAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aAC2B;AAC3B,UAAM,gBAAY,kCAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AACD,UAAM,SAAS,UAAM,wBAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;AJzUA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,QAAI,2BAAW;AAAA,UACf,iBAAa,2BAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
6
- "names": ["import_zod", "LLMStepType", "import_zod", "AssertionResultStatus", "import_crypto", "message", "import_crypto", "import_crypto"]
3
+ "sources": ["../src/index.ts", "../src/types/assertions.ts", "../src/types/trace.ts", "../src/types/result.ts", "../src/evaluators/index.ts", "../src/evaluators/skill-was-called-evaluator.ts", "../src/evaluators/assertion-evaluator.ts", "../src/evaluators/build-passed-evaluator.ts", "../src/evaluators/time-evaluator.ts", "../src/evaluators/llm-judge-evaluator.ts"],
4
+ "sourcesContent": ["/**\n * @wix/eval-assertions\n *\n * Assertion framework for AI agent evaluations.\n * Supports skill invocation checks, build validation, and LLM-based judging.\n */\n\n// Types\nexport {\n // Assertion schemas and types\n AssertionSchema,\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n TimeAssertionSchema,\n LlmJudgeAssertionSchema,\n type Assertion,\n type SkillWasCalledAssertion,\n type BuildPassedAssertion,\n type TimeAssertion,\n type LlmJudgeAssertion,\n // Trace types\n LLMTraceSchema,\n LLMTraceStepSchema,\n LLMTraceSummarySchema,\n LLMBreakdownStatsSchema,\n TokenUsageSchema,\n LLMStepType,\n type LLMTrace,\n type LLMTraceStep,\n type LLMTraceSummary,\n type LLMBreakdownStats,\n type TokenUsage,\n // Result types\n AssertionResultSchema,\n AssertionResultStatus,\n type AssertionResult,\n // Input types\n type EvaluationInput,\n type FileDiff,\n} from \"./types/index.js\";\n\n// Evaluators\nexport {\n evaluateAssertions,\n registerEvaluator,\n getEvaluator,\n AssertionEvaluator,\n SkillWasCalledEvaluator,\n BuildPassedEvaluator,\n TimeEvaluator,\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type AssertionContext,\n type LlmConfig,\n type LlmJudgeGenerateTextOptions,\n type JudgeResult,\n} from \"./evaluators/index.js\";\n", "import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Assertion: scenario must complete within a maximum duration.\n * Deterministic check against the scenario execution time.\n */\nexport const TimeAssertionSchema = z.object({\n type: z.literal(\"time_limit\"),\n /** Maximum allowed duration in milliseconds */\n maxDurationMs: z.number().int().positive(),\n});\n\nexport type TimeAssertion = z.infer<typeof TimeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n TimeAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { TimeEvaluator } from \"./time-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n time_limit: new TimeEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport { TimeEvaluator } from \"./time-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): Set<string> {\n const calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? `Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /** Default model for llm_judge when assertion.model is not set. Caller provides this. */\n defaultJudgeModel?: string;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n TimeAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Evaluator for \"time_limit\" assertion: passes if the scenario completed\n * within the configured maximum duration (maxDurationMs).\n */\nexport class TimeEvaluator extends AssertionEvaluator<TimeAssertion> {\n readonly type = \"time_limit\" as const;\n\n evaluate(assertion: TimeAssertion, input: EvaluationInput): AssertionResult {\n const maxDurationMs = assertion.maxDurationMs;\n\n if (input.durationMs == null) {\n return this.createResult({\n status: AssertionResultStatus.FAILED,\n message: \"No duration data available for time assertion\",\n expected: `<= ${maxDurationMs}ms`,\n });\n }\n\n const passed = input.durationMs <= maxDurationMs;\n\n return this.createResult({\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Completed in ${input.durationMs}ms (limit: ${maxDurationMs}ms)`\n : `Exceeded time limit: ${input.durationMs}ms > ${maxDurationMs}ms`,\n expected: `<= ${maxDurationMs}ms`,\n actual: `${input.durationMs}ms`,\n });\n }\n\n private createResult(fields: Partial<AssertionResult>): AssertionResult {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: \"time_limit\",\n assertionName: \"Time limit\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n `type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if (step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\n\n/** Default judge context (run data + placeholders); used when assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of all files changed (or \"No files were changed\")\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.\n * Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? [];\n\n // Compute file lists by status\n const changedPaths = fileDiffs.map((d) => d.path);\n const modifiedPaths = fileDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = fileDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles =\n changedPaths.length > 0\n ? changedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were changed\";\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n const systemPrompt =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\"\n ? replace(assertion.systemPrompt) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS\n : replace(DEFAULT_JUDGE_CONTEXT) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n const modelUsed = assertion.model ?? context?.defaultJudgeModel;\n\n if (!modelUsed && !generateTextStub) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message:\n \"No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)\",\n expected: String(minScore),\n };\n }\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n modelUsed!,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const details: Record<string, unknown> = {\n error: message,\n model: modelUsed,\n };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
5
+ "mappings": ";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,iBAAkB;AAQX,IAAM,gCAAgC,aAAE,OAAO;AAAA,EACpD,MAAM,aAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,aAAE,MAAM,aAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,aAAE,OAAO;AAAA,EACjD,MAAM,aAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AASM,IAAM,0BAA0B,aAAE,OAAO;AAAA,EAC9C,MAAM,aAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,aAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,aAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,aAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,aAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,aAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AAQM,IAAM,sBAAsB,aAAE,OAAO;AAAA,EAC1C,MAAM,aAAE,QAAQ,YAAY;AAAA;AAAA,EAE5B,eAAe,aAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC3C,CAAC;AASM,IAAM,kBAAkB,aAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC3ED,IAAAA,cAAkB;AAKX,IAAM,mBAAmB,cAAE,OAAO;AAAA,EACvC,QAAQ,cAAE,OAAO;AAAA,EACjB,YAAY,cAAE,OAAO;AAAA,EACrB,OAAO,cAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqB,cAAE,OAAO;AAAA,EACzC,IAAI,cAAE,OAAO;AAAA,EACb,YAAY,cAAE,OAAO;AAAA,EACrB,MAAM,cAAE,KAAK,WAAW;AAAA,EACxB,OAAO,cAAE,OAAO;AAAA,EAChB,UAAU,cAAE,OAAO;AAAA,EACnB,WAAW,cAAE,OAAO;AAAA,EACpB,YAAY,cAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAAS,cAAE,OAAO;AAAA,EAClB,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAc,cAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAe,cAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAAS,cAAE,QAAQ;AAAA,EACnB,OAAO,cAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0B,cAAE,OAAO;AAAA,EAC9C,OAAO,cAAE,OAAO;AAAA,EAChB,YAAY,cAAE,OAAO;AAAA,EACrB,QAAQ,cAAE,OAAO;AAAA,EACjB,SAAS,cAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,YAAY,cAAE,OAAO;AAAA,EACrB,iBAAiB,cAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAc,cAAE,OAAO;AAAA,EACvB,mBAAmB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBAAgB,cAAE,OAAO,cAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAY,cAAE,MAAM,cAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiB,cAAE,OAAO;AAAA,EACrC,IAAI,cAAE,OAAO;AAAA,EACb,OAAO,cAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,IAAAC,cAAkB;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwB,cAAE,OAAO;AAAA,EAC5C,IAAI,cAAE,OAAO;AAAA,EACb,aAAa,cAAE,OAAO;AAAA,EACtB,eAAe,cAAE,OAAO;AAAA,EACxB,eAAe,cAAE,OAAO;AAAA,EACxB,QAAQ,cAAE,KAAK,qBAAqB;AAAA,EACpC,SAAS,cAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQ,cAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAU,cAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAAS,cAAE,OAAO,cAAE,OAAO,GAAG,cAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAe,cAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,IAAAC,iBAA2B;;;ACK3B,oBAA2B;;;AC2CpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD5CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,kBAAc,0BAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,QAAI,0BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,QAAI,0BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,IAAAC,iBAA2B;AAC3B,2BAAyB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,kBAAc,2BAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,yCAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,QAAI,2BAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACzHA,IAAAC,iBAA2B;AAOpB,IAAM,gBAAN,cAA4B,mBAAkC;AAAA,EAC1D,OAAO;AAAA,EAEhB,SAAS,WAA0B,OAAyC;AAC1E,UAAM,gBAAgB,UAAU;AAEhC,QAAI,MAAM,cAAc,MAAM;AAC5B,aAAO,KAAK,aAAa;AAAA,QACvB;AAAA,QACA,SAAS;AAAA,QACT,UAAU,MAAM,aAAa;AAAA,MAC/B,CAAC;AAAA,IACH;AAEA,UAAM,SAAS,MAAM,cAAc;AAEnC,WAAO,KAAK,aAAa;AAAA,MACvB,QAAQ;AAAA,MAGR,SAAS,SACL,gBAAgB,MAAM,UAAU,cAAc,aAAa,QAC3D,wBAAwB,MAAM,UAAU,QAAQ,aAAa;AAAA,MACjE,UAAU,MAAM,aAAa;AAAA,MAC7B,QAAQ,GAAG,MAAM,UAAU;AAAA,IAC7B,CAAC;AAAA,EACH;AAAA,EAEQ,aAAa,QAAmD;AACtE,WAAO;AAAA,MACL,QAAI,2BAAW;AAAA,MACf,iBAAa,2BAAW;AAAA,MACxB,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AACF;;;AC5CA,IAAAC,iBAA2B;AAC3B,uBAAgC;AAChC,gBAA2C;AAapC,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAcO,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAG1B,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAW9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,kBAAc,2BAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAGtC,UAAM,eAAe,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AAChD,UAAM,gBAAgB,UACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,UACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eACJ,aAAa,SAAS,IAClB,aAAa,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACzD;AACN,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAC5C,UAAM,eACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB,KACzD,QAAQ,UAAU,YAAY,IAC9B,SACA,kCACA,QAAQ,qBAAqB,IAC7B,SACA;AAEN,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAC7C,UAAM,YAAY,UAAU,SAAS,SAAS;AAE9C,QAAI,CAAC,aAAa,CAAC,kBAAkB;AACnC,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SACE;AAAA,QACF,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,QAAI,2BAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,UAAmC;AAAA,QACvC,OAAO;AAAA,QACP,OAAO;AAAA,MACT;AAEA,UAAI,uBAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,QAAI,2BAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aAC2B;AAC3B,UAAM,gBAAY,kCAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AACD,UAAM,SAAS,UAAM,wBAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;ALvVA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,YAAY,IAAI,cAAc;AAAA,EAC9B,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,QAAI,2BAAW;AAAA,UACf,iBAAa,2BAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
6
+ "names": ["import_zod", "LLMStepType", "import_zod", "AssertionResultStatus", "import_crypto", "message", "import_crypto", "import_crypto", "import_crypto"]
7
7
  }
package/build/index.mjs CHANGED
@@ -25,9 +25,15 @@ var LlmJudgeAssertionSchema = z.object({
25
25
  maxTokens: z.number().int().optional(),
26
26
  temperature: z.number().min(0).max(1).optional()
27
27
  });
28
+ var TimeAssertionSchema = z.object({
29
+ type: z.literal("time_limit"),
30
+ /** Maximum allowed duration in milliseconds */
31
+ maxDurationMs: z.number().int().positive()
32
+ });
28
33
  var AssertionSchema = z.union([
29
34
  SkillWasCalledAssertionSchema,
30
35
  BuildPassedAssertionSchema,
36
+ TimeAssertionSchema,
31
37
  LlmJudgeAssertionSchema
32
38
  ]);
33
39
 
@@ -107,7 +113,7 @@ var AssertionResultSchema = z3.object({
107
113
  });
108
114
 
109
115
  // src/evaluators/index.ts
110
- import { randomUUID as randomUUID4 } from "crypto";
116
+ import { randomUUID as randomUUID5 } from "crypto";
111
117
 
112
118
  // src/evaluators/skill-was-called-evaluator.ts
113
119
  import { randomUUID } from "crypto";
@@ -266,8 +272,41 @@ var BuildPassedEvaluator = class extends AssertionEvaluator {
266
272
  }
267
273
  };
268
274
 
269
- // src/evaluators/llm-judge-evaluator.ts
275
+ // src/evaluators/time-evaluator.ts
270
276
  import { randomUUID as randomUUID3 } from "crypto";
277
+ var TimeEvaluator = class extends AssertionEvaluator {
278
+ type = "time_limit";
279
+ evaluate(assertion, input) {
280
+ const maxDurationMs = assertion.maxDurationMs;
281
+ if (input.durationMs == null) {
282
+ return this.createResult({
283
+ status: "failed" /* FAILED */,
284
+ message: "No duration data available for time assertion",
285
+ expected: `<= ${maxDurationMs}ms`
286
+ });
287
+ }
288
+ const passed = input.durationMs <= maxDurationMs;
289
+ return this.createResult({
290
+ status: passed ? "passed" /* PASSED */ : "failed" /* FAILED */,
291
+ message: passed ? `Completed in ${input.durationMs}ms (limit: ${maxDurationMs}ms)` : `Exceeded time limit: ${input.durationMs}ms > ${maxDurationMs}ms`,
292
+ expected: `<= ${maxDurationMs}ms`,
293
+ actual: `${input.durationMs}ms`
294
+ });
295
+ }
296
+ createResult(fields) {
297
+ return {
298
+ id: randomUUID3(),
299
+ assertionId: randomUUID3(),
300
+ assertionType: "time_limit",
301
+ assertionName: "Time limit",
302
+ status: "failed" /* FAILED */,
303
+ ...fields
304
+ };
305
+ }
306
+ };
307
+
308
+ // src/evaluators/llm-judge-evaluator.ts
309
+ import { randomUUID as randomUUID4 } from "crypto";
271
310
  import { createAnthropic } from "@ai-sdk/anthropic";
272
311
  import { generateText, APICallError } from "ai";
273
312
  function formatTraceForJudge(llmTrace) {
@@ -331,7 +370,6 @@ function validateJudgeResult(parsed) {
331
370
  };
332
371
  }
333
372
  var DEFAULT_MIN_SCORE = 70;
334
- var DEFAULT_MODEL = "claude-haiku-4-5-20251001";
335
373
  var DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:
336
374
 
337
375
  - {{output}}: the agent's final output
@@ -363,7 +401,7 @@ Any response that includes extra content or deviates from the specified format w
363
401
  var LlmJudgeEvaluator = class extends AssertionEvaluator {
364
402
  type = "llm_judge";
365
403
  async evaluate(assertion, input, context) {
366
- const assertionId = randomUUID3();
404
+ const assertionId = randomUUID4();
367
405
  const llmConfig = context?.llmConfig;
368
406
  const workDir = context?.workDir ?? "";
369
407
  const generateTextStub = context?.generateTextForLlmJudge;
@@ -390,9 +428,21 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
390
428
  const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;
391
429
  const maxOutputTokens = assertion.maxTokens ?? 1024;
392
430
  const temperature = assertion.temperature ?? 0;
431
+ const modelUsed = assertion.model ?? context?.defaultJudgeModel;
432
+ if (!modelUsed && !generateTextStub) {
433
+ return {
434
+ id: randomUUID4(),
435
+ assertionId,
436
+ assertionType: "llm_judge",
437
+ assertionName: "LLM judge",
438
+ status: "failed" /* FAILED */,
439
+ message: "No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)",
440
+ expected: String(minScore)
441
+ };
442
+ }
393
443
  if (!generateTextStub && !llmConfig) {
394
444
  return {
395
- id: randomUUID3(),
445
+ id: randomUUID4(),
396
446
  assertionId,
397
447
  assertionType: "llm_judge",
398
448
  assertionName: "LLM judge",
@@ -413,7 +463,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
413
463
  temperature
414
464
  }) : await this.callGenerateText(
415
465
  llmConfig,
416
- assertion.model ?? DEFAULT_MODEL,
466
+ modelUsed,
417
467
  finalPrompt,
418
468
  systemPrompt,
419
469
  maxOutputTokens,
@@ -426,7 +476,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
426
476
  const judgeResult = validateJudgeResult(parsed);
427
477
  const passed = judgeResult.score >= minScore;
428
478
  return {
429
- id: randomUUID3(),
479
+ id: randomUUID4(),
430
480
  assertionId,
431
481
  assertionType: "llm_judge",
432
482
  assertionName: "LLM judge",
@@ -445,7 +495,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
445
495
  }
446
496
  }
447
497
  return {
448
- id: randomUUID3(),
498
+ id: randomUUID4(),
449
499
  assertionId,
450
500
  assertionType: "llm_judge",
451
501
  assertionName: "LLM judge",
@@ -457,8 +507,10 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
457
507
  };
458
508
  } catch (err) {
459
509
  const message = err instanceof Error ? err.message : String(err);
460
- const modelUsed = assertion.model ?? DEFAULT_MODEL;
461
- const details = { error: message, model: modelUsed };
510
+ const details = {
511
+ error: message,
512
+ model: modelUsed
513
+ };
462
514
  if (APICallError.isInstance(err)) {
463
515
  details.statusCode = err.statusCode;
464
516
  details.url = err.url;
@@ -466,7 +518,7 @@ var LlmJudgeEvaluator = class extends AssertionEvaluator {
466
518
  details.responseBody = typeof err.responseBody === "string" ? err.responseBody.slice(0, 2e3) : err.responseBody;
467
519
  }
468
520
  return {
469
- id: randomUUID3(),
521
+ id: randomUUID4(),
470
522
  assertionId,
471
523
  assertionType: "llm_judge",
472
524
  assertionName: "LLM judge",
@@ -499,6 +551,7 @@ var llmJudgeEvaluator = new LlmJudgeEvaluator();
499
551
  var evaluators = {
500
552
  skill_was_called: new SkillWasCalledEvaluator(),
501
553
  build_passed: new BuildPassedEvaluator(),
554
+ time_limit: new TimeEvaluator(),
502
555
  llm_judge: llmJudgeEvaluator,
503
556
  // Custom assertions use the same LLM-based evaluation as llm_judge
504
557
  custom: llmJudgeEvaluator
@@ -518,8 +571,8 @@ async function evaluateAssertions(input, assertions, context) {
518
571
  const evaluator = evaluators[assertion.type];
519
572
  if (!evaluator) {
520
573
  return {
521
- id: randomUUID4(),
522
- assertionId: randomUUID4(),
574
+ id: randomUUID5(),
575
+ assertionId: randomUUID5(),
523
576
  assertionType: assertion.type,
524
577
  assertionName: "Unknown assertion",
525
578
  status: "error" /* ERROR */,
@@ -550,6 +603,8 @@ export {
550
603
  LlmJudgeEvaluator,
551
604
  SkillWasCalledAssertionSchema,
552
605
  SkillWasCalledEvaluator,
606
+ TimeAssertionSchema,
607
+ TimeEvaluator,
553
608
  TokenUsageSchema,
554
609
  evaluateAssertions,
555
610
  formatTraceForJudge,
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "version": 3,
3
- "sources": ["../src/types/assertions.ts", "../src/types/trace.ts", "../src/types/result.ts", "../src/evaluators/index.ts", "../src/evaluators/skill-was-called-evaluator.ts", "../src/evaluators/assertion-evaluator.ts", "../src/evaluators/build-passed-evaluator.ts", "../src/evaluators/llm-judge-evaluator.ts"],
4
- "sourcesContent": ["import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): Set<string> {\n const calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? `Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n `type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if (step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\nconst DEFAULT_MODEL = \"claude-haiku-4-5-20251001\";\n\n/** Default judge context (run data + placeholders); used when assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of all files changed (or \"No files were changed\")\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.\n * Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? [];\n\n // Compute file lists by status\n const changedPaths = fileDiffs.map((d) => d.path);\n const modifiedPaths = fileDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = fileDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles =\n changedPaths.length > 0\n ? changedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were changed\";\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n const systemPrompt =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\"\n ? replace(assertion.systemPrompt) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS\n : replace(DEFAULT_JUDGE_CONTEXT) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n assertion.model ?? DEFAULT_MODEL,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const modelUsed = assertion.model ?? DEFAULT_MODEL;\n const details: Record<string, unknown> = { error: message, model: modelUsed };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
5
- "mappings": ";AAAA,SAAS,SAAS;AAQX,IAAM,gCAAgC,EAAE,OAAO;AAAA,EACpD,MAAM,EAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,EAAE,OAAO;AAAA,EACjD,MAAM,EAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AASM,IAAM,0BAA0B,EAAE,OAAO;AAAA,EAC9C,MAAM,EAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,EAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,EAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AASM,IAAM,kBAAkB,EAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC9DD,SAAS,KAAAA,UAAS;AAKX,IAAM,mBAAmBA,GAAE,OAAO;AAAA,EACvC,QAAQA,GAAE,OAAO;AAAA,EACjB,YAAYA,GAAE,OAAO;AAAA,EACrB,OAAOA,GAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqBD,GAAE,OAAO;AAAA,EACzC,IAAIA,GAAE,OAAO;AAAA,EACb,YAAYA,GAAE,OAAO;AAAA,EACrB,MAAMA,GAAE,KAAK,WAAW;AAAA,EACxB,OAAOA,GAAE,OAAO;AAAA,EAChB,UAAUA,GAAE,OAAO;AAAA,EACnB,WAAWA,GAAE,OAAO;AAAA,EACpB,YAAYA,GAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAASA,GAAE,OAAO;AAAA,EAClB,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAcA,GAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAASA,GAAE,QAAQ;AAAA,EACnB,OAAOA,GAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,OAAOA,GAAE,OAAO;AAAA,EAChB,YAAYA,GAAE,OAAO;AAAA,EACrB,QAAQA,GAAE,OAAO;AAAA,EACjB,SAASA,GAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwBA,GAAE,OAAO;AAAA,EAC5C,YAAYA,GAAE,OAAO;AAAA,EACrB,iBAAiBA,GAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAcA,GAAE,OAAO;AAAA,EACvB,mBAAmBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBAAgBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAYA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiBA,GAAE,OAAO;AAAA,EACrC,IAAIA,GAAE,OAAO;AAAA,EACb,OAAOA,GAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,SAAS,KAAAE,UAAS;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,IAAIA,GAAE,OAAO;AAAA,EACb,aAAaA,GAAE,OAAO;AAAA,EACtB,eAAeA,GAAE,OAAO;AAAA,EACxB,eAAeA,GAAE,OAAO;AAAA,EACxB,QAAQA,GAAE,KAAK,qBAAqB;AAAA,EACpC,SAASA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAASA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAeA,GAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,SAAS,cAAAC,mBAAkB;;;ACK3B,SAAS,kBAAkB;;;ACyCpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD1CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,cAAc,WAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,IAAI,WAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,gBAAgB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,cAAcC,YAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,eAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,IAAIA,YAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACxHA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,cAAc,oBAAoB;AAapC,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAcO,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAC1B,IAAM,gBAAgB;AAGtB,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAW9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,cAAcC,YAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAGtC,UAAM,eAAe,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AAChD,UAAM,gBAAgB,UACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,UACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eACJ,aAAa,SAAS,IAClB,aAAa,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACzD;AACN,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAC5C,UAAM,eACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB,KACzD,QAAQ,UAAU,YAAY,IAC9B,SACA,kCACA,QAAQ,qBAAqB,IAC7B,SACA;AAEN,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAE7C,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA,UAAU,SAAS;AAAA,UACnB;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,IAAIA,YAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,YAAY,UAAU,SAAS;AACrC,YAAM,UAAmC,EAAE,OAAO,SAAS,OAAO,UAAU;AAE5E,UAAI,aAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aAC2B;AAC3B,UAAM,YAAY,gBAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AACD,UAAM,SAAS,MAAM,aAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;AJzUA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,IAAIC,YAAW;AAAA,UACf,aAAaA,YAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
6
- "names": ["z", "LLMStepType", "z", "AssertionResultStatus", "z", "randomUUID", "message", "randomUUID", "randomUUID", "randomUUID", "randomUUID", "randomUUID"]
3
+ "sources": ["../src/types/assertions.ts", "../src/types/trace.ts", "../src/types/result.ts", "../src/evaluators/index.ts", "../src/evaluators/skill-was-called-evaluator.ts", "../src/evaluators/assertion-evaluator.ts", "../src/evaluators/build-passed-evaluator.ts", "../src/evaluators/time-evaluator.ts", "../src/evaluators/llm-judge-evaluator.ts"],
4
+ "sourcesContent": ["import { z } from \"zod\";\n\n/**\n * Assertion: the agent must have invoked one or more skills during the run.\n * Checked by inspecting the LLM trace for \"Skill\" tool uses with the given skills.\n * When multiple skills are in one assertion, they are treated as a group (1 assertion).\n * Each skill in the group must have been called for the assertion to pass.\n */\nexport const SkillWasCalledAssertionSchema = z.object({\n type: z.literal(\"skill_was_called\"),\n /** Names of the skills that must have been called (matched against trace Skill tool args) */\n skillNames: z.array(z.string()).min(1),\n});\n\nexport type SkillWasCalledAssertion = z.infer<\n typeof SkillWasCalledAssertionSchema\n>;\n\n/**\n * Assertion: a build command must exit with the expected code (default 0).\n * Runs the command in the scenario working directory.\n */\nexport const BuildPassedAssertionSchema = z.object({\n type: z.literal(\"build_passed\"),\n /** Command to run (default: \"yarn build\") */\n command: z.string().optional(),\n /** Expected exit code (default: 0) */\n expectedExitCode: z.number().int().optional(),\n});\n\nexport type BuildPassedAssertion = z.infer<typeof BuildPassedAssertionSchema>;\n\n/**\n * Assertion: an LLM judges the scenario output (score 0-100).\n * Prompt can use {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}}.\n * Passes if judge score >= minScore.\n */\nexport const LlmJudgeAssertionSchema = z.object({\n type: z.literal(\"llm_judge\"),\n /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{modifiedFiles}}, {{newFiles}}, {{trace}} */\n prompt: z.string(),\n /** Optional system prompt for the judge (default asks for JSON with score) */\n systemPrompt: z.string().optional(),\n /** Minimum score to pass (0-100, default 70) */\n minScore: z.number().int().min(0).max(100).optional(),\n /** Model for the judge (e.g. claude-3-5-haiku) */\n model: z.string().optional(),\n maxTokens: z.number().int().optional(),\n temperature: z.number().min(0).max(1).optional(),\n});\n\nexport type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;\n\n/**\n * Assertion: scenario must complete within a maximum duration.\n * Deterministic check against the scenario execution time.\n */\nexport const TimeAssertionSchema = z.object({\n type: z.literal(\"time_limit\"),\n /** Maximum allowed duration in milliseconds */\n maxDurationMs: z.number().int().positive(),\n});\n\nexport type TimeAssertion = z.infer<typeof TimeAssertionSchema>;\n\n/**\n * Union of all assertion types.\n * Each assertion has a type and type-specific data.\n * Uses z.union (not z.discriminatedUnion) for Zod v4 compatibility when used as array element.\n */\nexport const AssertionSchema = z.union([\n SkillWasCalledAssertionSchema,\n BuildPassedAssertionSchema,\n TimeAssertionSchema,\n LlmJudgeAssertionSchema,\n]);\n\nexport type Assertion = z.infer<typeof AssertionSchema>;\n", "import { z } from \"zod\";\n\n/**\n * Token usage schema.\n */\nexport const TokenUsageSchema = z.object({\n prompt: z.number(),\n completion: z.number(),\n total: z.number(),\n});\n\nexport type TokenUsage = z.infer<typeof TokenUsageSchema>;\n\n/**\n * LLM step type enum.\n */\nexport enum LLMStepType {\n COMPLETION = \"completion\",\n TOOL_USE = \"tool_use\",\n TOOL_RESULT = \"tool_result\",\n THINKING = \"thinking\",\n}\n\n/**\n * LLM trace step schema.\n */\nexport const LLMTraceStepSchema = z.object({\n id: z.string(),\n stepNumber: z.number(),\n type: z.enum(LLMStepType),\n model: z.string(),\n provider: z.string(),\n startedAt: z.string(),\n durationMs: z.number(),\n tokenUsage: TokenUsageSchema,\n costUsd: z.number(),\n toolName: z.string().optional(),\n toolArguments: z.string().optional(),\n inputPreview: z.string().optional(),\n outputPreview: z.string().optional(),\n success: z.boolean(),\n error: z.string().optional(),\n});\n\nexport type LLMTraceStep = z.infer<typeof LLMTraceStepSchema>;\n\n/**\n * LLM breakdown stats schema.\n */\nexport const LLMBreakdownStatsSchema = z.object({\n count: z.number(),\n durationMs: z.number(),\n tokens: z.number(),\n costUsd: z.number(),\n});\n\nexport type LLMBreakdownStats = z.infer<typeof LLMBreakdownStatsSchema>;\n\n/**\n * LLM trace summary schema.\n */\nexport const LLMTraceSummarySchema = z.object({\n totalSteps: z.number(),\n totalDurationMs: z.number(),\n totalTokens: TokenUsageSchema,\n totalCostUsd: z.number(),\n stepTypeBreakdown: z.record(z.string(), LLMBreakdownStatsSchema).optional(),\n modelBreakdown: z.record(z.string(), LLMBreakdownStatsSchema),\n modelsUsed: z.array(z.string()),\n});\n\nexport type LLMTraceSummary = z.infer<typeof LLMTraceSummarySchema>;\n\n/**\n * LLM trace schema.\n */\nexport const LLMTraceSchema = z.object({\n id: z.string(),\n steps: z.array(LLMTraceStepSchema),\n summary: LLMTraceSummarySchema,\n});\n\nexport type LLMTrace = z.infer<typeof LLMTraceSchema>;\n", "import { z } from \"zod\";\nimport { LLMTraceStepSchema } from \"./trace.js\";\n\n/**\n * Assertion result status enum.\n */\nexport enum AssertionResultStatus {\n PASSED = \"passed\",\n FAILED = \"failed\",\n SKIPPED = \"skipped\",\n ERROR = \"error\",\n}\n\n/**\n * Assertion result schema.\n */\nexport const AssertionResultSchema = z.object({\n id: z.string(),\n assertionId: z.string(),\n assertionType: z.string(),\n assertionName: z.string(),\n status: z.enum(AssertionResultStatus),\n message: z.string().optional(),\n expected: z.string().optional(),\n actual: z.string().optional(),\n duration: z.number().optional(),\n details: z.record(z.string(), z.unknown()).optional(),\n llmTraceSteps: z.array(LLMTraceStepSchema).optional(),\n});\n\nexport type AssertionResult = z.infer<typeof AssertionResultSchema>;\n", "import type { Assertion, AssertionResult } from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nimport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nimport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nimport { TimeEvaluator } from \"./time-evaluator.js\";\nimport { LlmJudgeEvaluator } from \"./llm-judge-evaluator.js\";\nimport type { EvaluationInput } from \"../types/index.js\";\n\nconst llmJudgeEvaluator = new LlmJudgeEvaluator();\n\nconst evaluators: Record<string, AssertionEvaluator> = {\n skill_was_called: new SkillWasCalledEvaluator(),\n build_passed: new BuildPassedEvaluator(),\n time_limit: new TimeEvaluator(),\n llm_judge: llmJudgeEvaluator,\n // Custom assertions use the same LLM-based evaluation as llm_judge\n custom: llmJudgeEvaluator,\n};\n\n/**\n * Register a custom assertion evaluator.\n *\n * @param type - The assertion type identifier\n * @param evaluator - The evaluator instance\n */\nexport function registerEvaluator(\n type: string,\n evaluator: AssertionEvaluator,\n): void {\n evaluators[type] = evaluator;\n}\n\n/**\n * Get a registered evaluator by type.\n *\n * @param type - The assertion type identifier\n * @returns The evaluator or undefined if not found\n */\nexport function getEvaluator(type: string): AssertionEvaluator | undefined {\n return evaluators[type];\n}\n\n/**\n * Evaluate all assertions against the input.\n *\n * @param input - Evaluation input (includes outputText, llmTrace, fileDiffs)\n * @param assertions - List of assertions to evaluate\n * @param context - Optional context (e.g. workDir for build_passed, llmConfig for llm_judge)\n * @returns Array of assertion results; empty if no assertions\n */\nexport async function evaluateAssertions(\n input: EvaluationInput,\n assertions: Assertion[],\n context?: AssertionContext,\n): Promise<AssertionResult[]> {\n if (assertions.length === 0) {\n return [];\n }\n return Promise.all(\n assertions.map(async (assertion) => {\n const evaluator = evaluators[assertion.type];\n if (!evaluator) {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: assertion.type,\n assertionName: \"Unknown assertion\",\n status: AssertionResultStatus.ERROR,\n message: `Unsupported assertion type: ${assertion.type}`,\n duration: 0,\n };\n }\n const startMs = Date.now();\n const result = await evaluator.evaluate(assertion, input, context);\n const durationMs = Date.now() - startMs;\n return { ...result, duration: durationMs };\n }),\n );\n}\n\n// Re-export evaluator classes and types\nexport { AssertionEvaluator } from \"./assertion-evaluator.js\";\nexport type {\n AssertionContext,\n LlmConfig,\n LlmJudgeGenerateTextOptions,\n} from \"./assertion-evaluator.js\";\nexport { SkillWasCalledEvaluator } from \"./skill-was-called-evaluator.js\";\nexport { BuildPassedEvaluator } from \"./build-passed-evaluator.js\";\nexport { TimeEvaluator } from \"./time-evaluator.js\";\nexport {\n LlmJudgeEvaluator,\n formatTraceForJudge,\n replacePlaceholders,\n stripMarkdownCodeBlock,\n validateJudgeResult,\n type JudgeResult,\n} from \"./llm-judge-evaluator.js\";\n", "import type {\n SkillWasCalledAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Collect all skill names that were called in the LLM trace.\n */\nfunction collectCalledSkillNames(llmTrace: LLMTrace): Set<string> {\n const calledSkills = new Set<string>();\n for (const step of llmTrace.steps) {\n if (step.toolName !== \"Skill\") {\n continue;\n }\n let args: unknown;\n try {\n args = step.toolArguments\n ? (JSON.parse(step.toolArguments) as unknown)\n : undefined;\n } catch {\n continue;\n }\n if (args !== null && typeof args === \"object\") {\n const obj = args as Record<string, unknown>;\n if (typeof obj.skill === \"string\") {\n calledSkills.add(obj.skill);\n }\n }\n }\n return calledSkills;\n}\n\n/**\n * Evaluator for \"skill_was_called\" assertion: the LLM trace must contain steps\n * where the \"Skill\" tool was used with ALL expected skills (by name).\n *\n * Multiple skills in one assertion are treated as a group \u2014 all must be called\n * for the assertion to pass. To check skills independently, add separate assertions.\n */\nexport class SkillWasCalledEvaluator extends AssertionEvaluator<SkillWasCalledAssertion> {\n readonly type = \"skill_was_called\" as const;\n\n evaluate(\n assertion: SkillWasCalledAssertion,\n input: EvaluationInput,\n // eslint-disable-next-line @typescript-eslint/no-unused-vars -- context not used for skill_was_called\n _context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const expectedSkills = assertion.skillNames;\n const expectedLabel = expectedSkills.join(\", \");\n\n const llmTrace: LLMTrace | undefined = input.llmTrace;\n if (!llmTrace?.steps?.length) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message: \"No LLM trace steps to check for skill invocation\",\n expected: expectedLabel,\n };\n }\n\n const calledSkills = collectCalledSkillNames(llmTrace);\n const missingSkills = expectedSkills.filter(\n (name) => !calledSkills.has(name),\n );\n\n if (missingSkills.length === 0) {\n const message =\n expectedSkills.length === 1\n ? `Skill \"${expectedSkills[0]}\" was called`\n : `All skills were called: ${expectedLabel}`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.PASSED,\n message,\n expected: expectedLabel,\n };\n }\n\n const missingLabel = missingSkills.join(\", \");\n const message =\n expectedSkills.length === 1\n ? `Skill \"${missingSkills[0]}\" was not called`\n : `Missing skills: ${missingLabel} (expected all of: ${expectedLabel})`;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"skill_was_called\",\n assertionName: \"Skill was called\",\n status: AssertionResultStatus.FAILED,\n message,\n expected: expectedLabel,\n };\n }\n}\n", "import type {\n Assertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\n\n/**\n * Options passed to the LLM for llm_judge. Used by the optional stub for testing.\n */\nexport interface LlmJudgeGenerateTextOptions {\n prompt: string;\n system: string;\n maxOutputTokens: number;\n temperature: number;\n}\n\n/**\n * Configuration for LLM calls (used by llm_judge assertion).\n */\nexport interface LlmConfig {\n /** Base URL for the AI API (e.g., 'https://api.anthropic.com') */\n baseUrl: string;\n /** Headers to include in API requests (e.g., API key) */\n headers: Record<string, string>;\n}\n\n/**\n * Optional context passed when evaluating assertions.\n */\nexport interface AssertionContext {\n /** Working directory for the scenario (used by build_passed) */\n workDir?: string;\n /** LLM configuration (used by llm_judge) */\n llmConfig?: LlmConfig;\n /** Default model for llm_judge when assertion.model is not set. Caller provides this. */\n defaultJudgeModel?: string;\n /**\n * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.\n * Used only in tests to avoid hitting the API.\n */\n generateTextForLlmJudge?: (\n options: LlmJudgeGenerateTextOptions,\n ) => Promise<{ text: string }>;\n}\n\n/**\n * Abstract base for assertion evaluators.\n * Each assertion type has a concrete class that implements evaluate().\n * evaluate() may return a Promise for async assertions (e.g. llm_judge).\n */\nexport abstract class AssertionEvaluator<T extends Assertion = Assertion> {\n abstract readonly type: T[\"type\"];\n\n abstract evaluate(\n assertion: T,\n input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult | Promise<AssertionResult>;\n}\n", "import type {\n BuildPassedAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { execSync } from \"child_process\";\nimport type { AssertionContext } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nconst DEFAULT_COMMAND = \"yarn build\";\nconst DEFAULT_EXIT_CODE = 0;\n\n/**\n * Evaluator for \"build_passed\" assertion: runs a build command in the scenario\n * working directory and passes if the command exits with the expected code (default 0).\n */\nexport class BuildPassedEvaluator extends AssertionEvaluator<BuildPassedAssertion> {\n readonly type = \"build_passed\" as const;\n\n evaluate(\n assertion: BuildPassedAssertion,\n _input: EvaluationInput,\n context?: AssertionContext,\n ): AssertionResult {\n const assertionId = randomUUID();\n const workDir = context?.workDir;\n const command = assertion.command ?? DEFAULT_COMMAND;\n const expectedExitCode = assertion.expectedExitCode ?? DEFAULT_EXIT_CODE;\n\n if (!workDir) {\n return this.createResult(assertionId, {\n status: AssertionResultStatus.FAILED,\n message: \"No working directory provided for build_passed assertion\",\n expected: String(expectedExitCode),\n });\n }\n\n let exitCode: number | null = null;\n let errorMessage: string | null = null;\n let stdout: string | undefined;\n let stderr: string | undefined;\n\n console.log(`[build_passed] Running \"${command}\" in: ${workDir}`);\n\n try {\n execSync(command, {\n cwd: workDir,\n encoding: \"utf-8\",\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n exitCode = 0;\n } catch (err) {\n const error = err as Error & {\n status?: number;\n code?: number;\n stdout?: string | Buffer;\n stderr?: string | Buffer;\n };\n exitCode =\n typeof error.status === \"number\"\n ? error.status\n : typeof error.code === \"number\"\n ? error.code\n : null;\n errorMessage = error.message;\n stdout = this.bufferToString(error.stdout);\n stderr = this.bufferToString(error.stderr);\n }\n\n const passed = exitCode !== null && exitCode === expectedExitCode;\n\n const details: Record<string, unknown> = { workDir, command };\n if (stdout !== undefined && stdout !== \"\") {\n details.stdout = stdout;\n }\n if (stderr !== undefined && stderr !== \"\") {\n details.stderr = stderr;\n }\n\n return this.createResult(assertionId, {\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: this.formatMessage(exitCode, expectedExitCode, errorMessage),\n expected: String(expectedExitCode),\n actual: exitCode !== null ? String(exitCode) : undefined,\n details,\n });\n }\n\n private createResult(\n assertionId: string,\n fields: Partial<AssertionResult>,\n ): AssertionResult {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"build_passed\",\n assertionName: \"Build passed\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n\n private bufferToString(\n value: string | Buffer | undefined,\n ): string | undefined {\n if (value === undefined || value === null) return undefined;\n if (typeof value === \"string\") return value;\n return value.toString(\"utf-8\");\n }\n\n private formatMessage(\n exitCode: number | null,\n expectedExitCode: number,\n errorMessage: string | null,\n ): string {\n if (exitCode === null) {\n return `Build failed: ${errorMessage}`;\n }\n if (exitCode === expectedExitCode) {\n return `Build passed (exit code ${exitCode})`;\n }\n return `Build exited with ${exitCode}, expected ${expectedExitCode}`;\n }\n}\n", "import type {\n TimeAssertion,\n AssertionResult,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\n/**\n * Evaluator for \"time_limit\" assertion: passes if the scenario completed\n * within the configured maximum duration (maxDurationMs).\n */\nexport class TimeEvaluator extends AssertionEvaluator<TimeAssertion> {\n readonly type = \"time_limit\" as const;\n\n evaluate(assertion: TimeAssertion, input: EvaluationInput): AssertionResult {\n const maxDurationMs = assertion.maxDurationMs;\n\n if (input.durationMs == null) {\n return this.createResult({\n status: AssertionResultStatus.FAILED,\n message: \"No duration data available for time assertion\",\n expected: `<= ${maxDurationMs}ms`,\n });\n }\n\n const passed = input.durationMs <= maxDurationMs;\n\n return this.createResult({\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Completed in ${input.durationMs}ms (limit: ${maxDurationMs}ms)`\n : `Exceeded time limit: ${input.durationMs}ms > ${maxDurationMs}ms`,\n expected: `<= ${maxDurationMs}ms`,\n actual: `${input.durationMs}ms`,\n });\n }\n\n private createResult(fields: Partial<AssertionResult>): AssertionResult {\n return {\n id: randomUUID(),\n assertionId: randomUUID(),\n assertionType: \"time_limit\",\n assertionName: \"Time limit\",\n status: AssertionResultStatus.FAILED,\n ...fields,\n };\n }\n}\n", "import type {\n LlmJudgeAssertion,\n AssertionResult,\n LLMTrace,\n EvaluationInput,\n} from \"../types/index.js\";\nimport { AssertionResultStatus } from \"../types/index.js\";\nimport { randomUUID } from \"crypto\";\nimport { createAnthropic } from \"@ai-sdk/anthropic\";\nimport { generateText, APICallError } from \"ai\";\nimport type { AssertionContext, LlmConfig } from \"./assertion-evaluator.js\";\nimport { AssertionEvaluator } from \"./assertion-evaluator.js\";\n\nexport interface JudgeResult {\n text: string;\n score: number;\n scoreReasoning: string;\n}\n\n/**\n * Format LLM trace as readable text for the judge (step number, type, tool name/args, output preview).\n */\nexport function formatTraceForJudge(llmTrace: LLMTrace | undefined): string {\n if (!llmTrace?.steps?.length) {\n return \"No trace available.\";\n }\n const lines: string[] = [];\n for (const step of llmTrace.steps) {\n const parts: string[] = [\n `Step ${step.stepNumber}`,\n `type: ${step.type}`,\n `duration: ${step.durationMs}ms`,\n ];\n if (step.toolName) {\n parts.push(`tool: ${step.toolName}`);\n if (step.toolArguments) {\n parts.push(`args: ${step.toolArguments}`);\n }\n }\n if (step.outputPreview) {\n parts.push(`output: ${step.outputPreview}`);\n }\n if (step.error) {\n parts.push(`error: ${step.error}`);\n }\n lines.push(parts.join(\", \"));\n }\n return lines.join(\"\\n\");\n}\n\n/**\n * Context object for placeholder replacement.\n */\nexport interface PlaceholderContext {\n output: string;\n cwd: string;\n changedFiles: string;\n modifiedFiles: string;\n newFiles: string;\n trace: string;\n}\n\nexport function replacePlaceholders(\n str: string,\n ctx: PlaceholderContext,\n): string {\n return str\n .replace(/\\{\\{output\\}\\}/g, ctx.output)\n .replace(/\\{\\{cwd\\}\\}/g, ctx.cwd)\n .replace(/\\{\\{changedFiles\\}\\}/g, ctx.changedFiles)\n .replace(/\\{\\{modifiedFiles\\}\\}/g, ctx.modifiedFiles)\n .replace(/\\{\\{newFiles\\}\\}/g, ctx.newFiles)\n .replace(/\\{\\{trace\\}\\}/g, ctx.trace);\n}\n\n/**\n * Strip markdown code fences (e.g. ```json ... ```) from LLM output,\n * returning only the inner content for JSON parsing.\n */\nexport function stripMarkdownCodeBlock(text: string): string {\n const trimmed = text.trim();\n const match = trimmed.match(/^```(?:\\w+)?\\s*\\n?([\\s\\S]*?)\\n?\\s*```$/);\n return match ? match[1].trim() : trimmed;\n}\n\nexport function validateJudgeResult(parsed: unknown): JudgeResult {\n if (parsed === null || typeof parsed !== \"object\") {\n throw new Error(\"Judge result is not an object\");\n }\n const obj = parsed as Record<string, unknown>;\n if (typeof obj.text !== \"string\") {\n throw new Error(\"Judge result does not contain a valid text field\");\n }\n if (typeof obj.score !== \"number\") {\n throw new Error(\"Judge result does not contain a valid score field\");\n }\n if (obj.score < 0 || obj.score > 100) {\n throw new Error(\"Judge result score is not between 0 and 100\");\n }\n if (typeof obj.scoreReasoning !== \"string\") {\n throw new Error(\n \"Judge result does not contain a valid scoreReasoning field\",\n );\n }\n return {\n text: obj.text,\n score: obj.score,\n scoreReasoning: obj.scoreReasoning,\n };\n}\n\nconst DEFAULT_MIN_SCORE = 70;\n\n/** Default judge context (run data + placeholders); used when assertion.systemPrompt is empty. */\nconst DEFAULT_JUDGE_CONTEXT = `You are judging a scenario run. The ACTUAL run data is provided below \u2014 use it to verify facts:\n\n- {{output}}: the agent's final output\n- {{cwd}}: working directory\n- {{changedFiles}}: list of all files changed (or \"No files were changed\")\n- {{modifiedFiles}}: list of existing files that were modified (or \"No files were modified\")\n- {{newFiles}}: list of new files that were created (or \"No new files were created\")\n- {{trace}}: step-by-step trace (tool calls, completions) so you can check e.g. which tools were called and how many times\n\nCRITICAL: When the user asks you to verify a specific fact, compare it strictly against the actual data above. If the expected outcome does NOT match the actual outcome, you MUST give a score of 0 or near 0. Do not be lenient \u2014 factual mismatches are failures.`;\n\nconst JSON_OUTPUT_FORMAT_INSTRUCTIONS = `You must respond only with a valid JSON object that conforms exactly to the following structure:\n\n{\n \"text\": string,\n \"score\": number (0-100),\n \"scoreReasoning\": string\n}\n\n- text: A brief textual verdict of the test result.\n- score: A number from 0 to 100 that reflects how well the answer meets the acceptance criteria.\n- scoreReasoning: A concise explanation justifying the assigned score.\n\nYour response must:\n- Contain only the JSON object above \u2014 no introductory text, no code formatting (e.g., no triple backticks), and no trailing comments.\n- Be valid and parseable by \\`JSON.parse\\`.\n- Use only double quotes for all keys and strings, as required by JSON.\n\nAny response that includes extra content or deviates from the specified format will cause parsing to fail. Follow these instructions exactly.`;\n\n/**\n * Evaluator for \"llm_judge\" assertion: an LLM judges the scenario output\n * (prompt with {{output}}, {{cwd}}, {{changedFiles}}, {{trace}}) and returns a score 0-100.\n * Passes if score >= minScore.\n */\nexport class LlmJudgeEvaluator extends AssertionEvaluator<LlmJudgeAssertion> {\n readonly type = \"llm_judge\" as const;\n\n async evaluate(\n assertion: LlmJudgeAssertion,\n input: EvaluationInput,\n context?: AssertionContext,\n ): Promise<AssertionResult> {\n const assertionId = randomUUID();\n const llmConfig = context?.llmConfig;\n const workDir = context?.workDir ?? \"\";\n const generateTextStub = context?.generateTextForLlmJudge;\n\n const output = input.outputText ?? \"\";\n const fileDiffs = input.fileDiffs ?? [];\n\n // Compute file lists by status\n const changedPaths = fileDiffs.map((d) => d.path);\n const modifiedPaths = fileDiffs\n .filter((d) => d.status === \"modified\")\n .map((d) => d.path);\n const newPaths = fileDiffs\n .filter((d) => d.status === \"new\")\n .map((d) => d.path);\n\n const changedFiles =\n changedPaths.length > 0\n ? changedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were changed\";\n const modifiedFiles =\n modifiedPaths.length > 0\n ? modifiedPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No files were modified\";\n const newFiles =\n newPaths.length > 0\n ? newPaths.map((path: string) => `- ${path}`).join(\"\\n\")\n : \"No new files were created\";\n\n const trace = formatTraceForJudge(input.llmTrace);\n const ctx: PlaceholderContext = {\n output,\n cwd: workDir,\n changedFiles,\n modifiedFiles,\n newFiles,\n trace,\n };\n const replace = (s: string) => replacePlaceholders(s, ctx);\n\n const finalPrompt = replace(assertion.prompt);\n const systemPrompt =\n assertion.systemPrompt != null && assertion.systemPrompt !== \"\"\n ? replace(assertion.systemPrompt) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS\n : replace(DEFAULT_JUDGE_CONTEXT) +\n \"\\n\\n\" +\n JSON_OUTPUT_FORMAT_INSTRUCTIONS;\n\n const minScore = assertion.minScore ?? DEFAULT_MIN_SCORE;\n const maxOutputTokens = assertion.maxTokens ?? 1024;\n const temperature = assertion.temperature ?? 0;\n const modelUsed = assertion.model ?? context?.defaultJudgeModel;\n\n if (!modelUsed && !generateTextStub) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message:\n \"No model configured for llm_judge assertion (set model on assertion or provide defaultJudgeModel in context)\",\n expected: String(minScore),\n };\n }\n\n if (!generateTextStub && !llmConfig) {\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: \"No llmConfig for llm_judge assertion (AI gateway required)\",\n expected: String(minScore),\n };\n }\n\n const maxParseAttempts = 3;\n let lastParseError: Error | undefined;\n let lastRawText: string | undefined;\n\n try {\n for (let attempt = 1; attempt <= maxParseAttempts; attempt++) {\n const result = generateTextStub\n ? await generateTextStub({\n prompt: finalPrompt,\n system: systemPrompt,\n maxOutputTokens,\n temperature,\n })\n : await this.callGenerateText(\n llmConfig!,\n modelUsed!,\n finalPrompt,\n systemPrompt,\n maxOutputTokens,\n temperature,\n );\n\n lastRawText = result.text;\n try {\n const cleaned = stripMarkdownCodeBlock(result.text);\n const parsed = JSON.parse(cleaned);\n const judgeResult = validateJudgeResult(parsed);\n const passed = judgeResult.score >= minScore;\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: passed\n ? AssertionResultStatus.PASSED\n : AssertionResultStatus.FAILED,\n message: passed\n ? `Judge score ${judgeResult.score} >= ${minScore}: ${judgeResult.text}`\n : `Judge score ${judgeResult.score} < ${minScore}: ${judgeResult.text}`,\n expected: String(minScore),\n actual: String(judgeResult.score),\n details: {\n score: judgeResult.score,\n scoreReasoning: judgeResult.scoreReasoning,\n text: judgeResult.text,\n },\n };\n } catch (parseErr) {\n lastParseError =\n parseErr instanceof Error ? parseErr : new Error(String(parseErr));\n }\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `Failed to parse judge response after ${maxParseAttempts} attempts: ${lastParseError?.message ?? \"unknown\"}`,\n expected: String(minScore),\n actual: undefined,\n details: { rawText: lastRawText?.slice(0, 500) },\n };\n } catch (err) {\n const message = err instanceof Error ? err.message : String(err);\n const details: Record<string, unknown> = {\n error: message,\n model: modelUsed,\n };\n\n if (APICallError.isInstance(err)) {\n details.statusCode = err.statusCode;\n details.url = err.url;\n details.isRetryable = err.isRetryable;\n details.responseBody =\n typeof err.responseBody === \"string\"\n ? err.responseBody.slice(0, 2000)\n : err.responseBody;\n }\n\n return {\n id: randomUUID(),\n assertionId,\n assertionType: \"llm_judge\",\n assertionName: \"LLM judge\",\n status: AssertionResultStatus.FAILED,\n message: `LLM judge call failed: ${message}`,\n expected: String(minScore),\n details,\n };\n }\n }\n\n private async callGenerateText(\n llmConfig: LlmConfig,\n modelId: string,\n prompt: string,\n system: string,\n maxOutputTokens: number,\n temperature: number,\n ): Promise<{ text: string }> {\n const anthropic = createAnthropic({\n baseURL: llmConfig.baseUrl,\n apiKey: \"dummy\",\n headers: llmConfig.headers,\n });\n const result = await generateText({\n model: anthropic(modelId),\n prompt,\n system,\n maxOutputTokens,\n temperature,\n });\n return { text: result.text };\n }\n}\n"],
5
+ "mappings": ";AAAA,SAAS,SAAS;AAQX,IAAM,gCAAgC,EAAE,OAAO;AAAA,EACpD,MAAM,EAAE,QAAQ,kBAAkB;AAAA;AAAA,EAElC,YAAY,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,IAAI,CAAC;AACvC,CAAC;AAUM,IAAM,6BAA6B,EAAE,OAAO;AAAA,EACjD,MAAM,EAAE,QAAQ,cAAc;AAAA;AAAA,EAE9B,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAE7B,kBAAkB,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC9C,CAAC;AASM,IAAM,0BAA0B,EAAE,OAAO;AAAA,EAC9C,MAAM,EAAE,QAAQ,WAAW;AAAA;AAAA,EAE3B,QAAQ,EAAE,OAAO;AAAA;AAAA,EAEjB,cAAc,EAAE,OAAO,EAAE,SAAS;AAAA;AAAA,EAElC,UAAU,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,GAAG,EAAE,SAAS;AAAA;AAAA,EAEpD,OAAO,EAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,WAAW,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAAA,EACrC,aAAa,EAAE,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,SAAS;AACjD,CAAC;AAQM,IAAM,sBAAsB,EAAE,OAAO;AAAA,EAC1C,MAAM,EAAE,QAAQ,YAAY;AAAA;AAAA,EAE5B,eAAe,EAAE,OAAO,EAAE,IAAI,EAAE,SAAS;AAC3C,CAAC;AASM,IAAM,kBAAkB,EAAE,MAAM;AAAA,EACrC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;;;AC3ED,SAAS,KAAAA,UAAS;AAKX,IAAM,mBAAmBA,GAAE,OAAO;AAAA,EACvC,QAAQA,GAAE,OAAO;AAAA,EACjB,YAAYA,GAAE,OAAO;AAAA,EACrB,OAAOA,GAAE,OAAO;AAClB,CAAC;AAOM,IAAK,cAAL,kBAAKC,iBAAL;AACL,EAAAA,aAAA,gBAAa;AACb,EAAAA,aAAA,cAAW;AACX,EAAAA,aAAA,iBAAc;AACd,EAAAA,aAAA,cAAW;AAJD,SAAAA;AAAA,GAAA;AAUL,IAAM,qBAAqBD,GAAE,OAAO;AAAA,EACzC,IAAIA,GAAE,OAAO;AAAA,EACb,YAAYA,GAAE,OAAO;AAAA,EACrB,MAAMA,GAAE,KAAK,WAAW;AAAA,EACxB,OAAOA,GAAE,OAAO;AAAA,EAChB,UAAUA,GAAE,OAAO;AAAA,EACnB,WAAWA,GAAE,OAAO;AAAA,EACpB,YAAYA,GAAE,OAAO;AAAA,EACrB,YAAY;AAAA,EACZ,SAASA,GAAE,OAAO;AAAA,EAClB,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,cAAcA,GAAE,OAAO,EAAE,SAAS;AAAA,EAClC,eAAeA,GAAE,OAAO,EAAE,SAAS;AAAA,EACnC,SAASA,GAAE,QAAQ;AAAA,EACnB,OAAOA,GAAE,OAAO,EAAE,SAAS;AAC7B,CAAC;AAOM,IAAM,0BAA0BA,GAAE,OAAO;AAAA,EAC9C,OAAOA,GAAE,OAAO;AAAA,EAChB,YAAYA,GAAE,OAAO;AAAA,EACrB,QAAQA,GAAE,OAAO;AAAA,EACjB,SAASA,GAAE,OAAO;AACpB,CAAC;AAOM,IAAM,wBAAwBA,GAAE,OAAO;AAAA,EAC5C,YAAYA,GAAE,OAAO;AAAA,EACrB,iBAAiBA,GAAE,OAAO;AAAA,EAC1B,aAAa;AAAA,EACb,cAAcA,GAAE,OAAO;AAAA,EACvB,mBAAmBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB,EAAE,SAAS;AAAA,EAC1E,gBAAgBA,GAAE,OAAOA,GAAE,OAAO,GAAG,uBAAuB;AAAA,EAC5D,YAAYA,GAAE,MAAMA,GAAE,OAAO,CAAC;AAChC,CAAC;AAOM,IAAM,iBAAiBA,GAAE,OAAO;AAAA,EACrC,IAAIA,GAAE,OAAO;AAAA,EACb,OAAOA,GAAE,MAAM,kBAAkB;AAAA,EACjC,SAAS;AACX,CAAC;;;AChFD,SAAS,KAAAE,UAAS;AAMX,IAAK,wBAAL,kBAAKC,2BAAL;AACL,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,YAAS;AACT,EAAAA,uBAAA,aAAU;AACV,EAAAA,uBAAA,WAAQ;AAJE,SAAAA;AAAA,GAAA;AAUL,IAAM,wBAAwBC,GAAE,OAAO;AAAA,EAC5C,IAAIA,GAAE,OAAO;AAAA,EACb,aAAaA,GAAE,OAAO;AAAA,EACtB,eAAeA,GAAE,OAAO;AAAA,EACxB,eAAeA,GAAE,OAAO;AAAA,EACxB,QAAQA,GAAE,KAAK,qBAAqB;AAAA,EACpC,SAASA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,QAAQA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,UAAUA,GAAE,OAAO,EAAE,SAAS;AAAA,EAC9B,SAASA,GAAE,OAAOA,GAAE,OAAO,GAAGA,GAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACpD,eAAeA,GAAE,MAAM,kBAAkB,EAAE,SAAS;AACtD,CAAC;;;AC1BD,SAAS,cAAAC,mBAAkB;;;ACK3B,SAAS,kBAAkB;;;AC2CpB,IAAe,qBAAf,MAAmE;AAQ1E;;;AD5CA,SAAS,wBAAwB,UAAiC;AAChE,QAAM,eAAe,oBAAI,IAAY;AACrC,aAAW,QAAQ,SAAS,OAAO;AACjC,QAAI,KAAK,aAAa,SAAS;AAC7B;AAAA,IACF;AACA,QAAI;AACJ,QAAI;AACF,aAAO,KAAK,gBACP,KAAK,MAAM,KAAK,aAAa,IAC9B;AAAA,IACN,QAAQ;AACN;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,OAAO,SAAS,UAAU;AAC7C,YAAM,MAAM;AACZ,UAAI,OAAO,IAAI,UAAU,UAAU;AACjC,qBAAa,IAAI,IAAI,KAAK;AAAA,MAC5B;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AASO,IAAM,0BAAN,cAAsC,mBAA4C;AAAA,EAC9E,OAAO;AAAA,EAEhB,SACE,WACA,OAEA,UACiB;AACjB,UAAM,cAAc,WAAW;AAC/B,UAAM,iBAAiB,UAAU;AACjC,UAAM,gBAAgB,eAAe,KAAK,IAAI;AAE9C,UAAM,WAAiC,MAAM;AAC7C,QAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,wBAAwB,QAAQ;AACrD,UAAM,gBAAgB,eAAe;AAAA,MACnC,CAAC,SAAS,CAAC,aAAa,IAAI,IAAI;AAAA,IAClC;AAEA,QAAI,cAAc,WAAW,GAAG;AAC9B,YAAMC,WACJ,eAAe,WAAW,IACtB,UAAU,eAAe,CAAC,CAAC,iBAC3B,2BAA2B,aAAa;AAC9C,aAAO;AAAA,QACL,IAAI,WAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAAA;AAAA,QACA,UAAU;AAAA,MACZ;AAAA,IACF;AAEA,UAAM,eAAe,cAAc,KAAK,IAAI;AAC5C,UAAM,UACJ,eAAe,WAAW,IACtB,UAAU,cAAc,CAAC,CAAC,qBAC1B,mBAAmB,YAAY,sBAAsB,aAAa;AACxE,WAAO;AAAA,MACL,IAAI,WAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA;AAAA,MACA,UAAU;AAAA,IACZ;AAAA,EACF;AACF;;;AErGA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,gBAAgB;AAIzB,IAAM,kBAAkB;AACxB,IAAM,oBAAoB;AAMnB,IAAM,uBAAN,cAAmC,mBAAyC;AAAA,EACxE,OAAO;AAAA,EAEhB,SACE,WACA,QACA,SACiB;AACjB,UAAM,cAAcC,YAAW;AAC/B,UAAM,UAAU,SAAS;AACzB,UAAM,UAAU,UAAU,WAAW;AACrC,UAAM,mBAAmB,UAAU,oBAAoB;AAEvD,QAAI,CAAC,SAAS;AACZ,aAAO,KAAK,aAAa,aAAa;AAAA,QACpC;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,gBAAgB;AAAA,MACnC,CAAC;AAAA,IACH;AAEA,QAAI,WAA0B;AAC9B,QAAI,eAA8B;AAClC,QAAI;AACJ,QAAI;AAEJ,YAAQ,IAAI,2BAA2B,OAAO,SAAS,OAAO,EAAE;AAEhE,QAAI;AACF,eAAS,SAAS;AAAA,QAChB,KAAK;AAAA,QACL,UAAU;AAAA,QACV,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,MAClC,CAAC;AACD,iBAAW;AAAA,IACb,SAAS,KAAK;AACZ,YAAM,QAAQ;AAMd,iBACE,OAAO,MAAM,WAAW,WACpB,MAAM,SACN,OAAO,MAAM,SAAS,WACpB,MAAM,OACN;AACR,qBAAe,MAAM;AACrB,eAAS,KAAK,eAAe,MAAM,MAAM;AACzC,eAAS,KAAK,eAAe,MAAM,MAAM;AAAA,IAC3C;AAEA,UAAM,SAAS,aAAa,QAAQ,aAAa;AAEjD,UAAM,UAAmC,EAAE,SAAS,QAAQ;AAC5D,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AACA,QAAI,WAAW,UAAa,WAAW,IAAI;AACzC,cAAQ,SAAS;AAAA,IACnB;AAEA,WAAO,KAAK,aAAa,aAAa;AAAA,MACpC,QAAQ;AAAA,MAGR,SAAS,KAAK,cAAc,UAAU,kBAAkB,YAAY;AAAA,MACpE,UAAU,OAAO,gBAAgB;AAAA,MACjC,QAAQ,aAAa,OAAO,OAAO,QAAQ,IAAI;AAAA,MAC/C;AAAA,IACF,CAAC;AAAA,EACH;AAAA,EAEQ,aACN,aACA,QACiB;AACjB,WAAO;AAAA,MACL,IAAIA,YAAW;AAAA,MACf;AAAA,MACA,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AAAA,EAEQ,eACN,OACoB;AACpB,QAAI,UAAU,UAAa,UAAU,KAAM,QAAO;AAClD,QAAI,OAAO,UAAU,SAAU,QAAO;AACtC,WAAO,MAAM,SAAS,OAAO;AAAA,EAC/B;AAAA,EAEQ,cACN,UACA,kBACA,cACQ;AACR,QAAI,aAAa,MAAM;AACrB,aAAO,iBAAiB,YAAY;AAAA,IACtC;AACA,QAAI,aAAa,kBAAkB;AACjC,aAAO,2BAA2B,QAAQ;AAAA,IAC5C;AACA,WAAO,qBAAqB,QAAQ,cAAc,gBAAgB;AAAA,EACpE;AACF;;;ACzHA,SAAS,cAAAC,mBAAkB;AAOpB,IAAM,gBAAN,cAA4B,mBAAkC;AAAA,EAC1D,OAAO;AAAA,EAEhB,SAAS,WAA0B,OAAyC;AAC1E,UAAM,gBAAgB,UAAU;AAEhC,QAAI,MAAM,cAAc,MAAM;AAC5B,aAAO,KAAK,aAAa;AAAA,QACvB;AAAA,QACA,SAAS;AAAA,QACT,UAAU,MAAM,aAAa;AAAA,MAC/B,CAAC;AAAA,IACH;AAEA,UAAM,SAAS,MAAM,cAAc;AAEnC,WAAO,KAAK,aAAa;AAAA,MACvB,QAAQ;AAAA,MAGR,SAAS,SACL,gBAAgB,MAAM,UAAU,cAAc,aAAa,QAC3D,wBAAwB,MAAM,UAAU,QAAQ,aAAa;AAAA,MACjE,UAAU,MAAM,aAAa;AAAA,MAC7B,QAAQ,GAAG,MAAM,UAAU;AAAA,IAC7B,CAAC;AAAA,EACH;AAAA,EAEQ,aAAa,QAAmD;AACtE,WAAO;AAAA,MACL,IAAIC,YAAW;AAAA,MACf,aAAaA,YAAW;AAAA,MACxB,eAAe;AAAA,MACf,eAAe;AAAA,MACf;AAAA,MACA,GAAG;AAAA,IACL;AAAA,EACF;AACF;;;AC5CA,SAAS,cAAAC,mBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,cAAc,oBAAoB;AAapC,SAAS,oBAAoB,UAAwC;AAC1E,MAAI,CAAC,UAAU,OAAO,QAAQ;AAC5B,WAAO;AAAA,EACT;AACA,QAAM,QAAkB,CAAC;AACzB,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,QAAkB;AAAA,MACtB,QAAQ,KAAK,UAAU;AAAA,MACvB,SAAS,KAAK,IAAI;AAAA,MAClB,aAAa,KAAK,UAAU;AAAA,IAC9B;AACA,QAAI,KAAK,UAAU;AACjB,YAAM,KAAK,SAAS,KAAK,QAAQ,EAAE;AACnC,UAAI,KAAK,eAAe;AACtB,cAAM,KAAK,SAAS,KAAK,aAAa,EAAE;AAAA,MAC1C;AAAA,IACF;AACA,QAAI,KAAK,eAAe;AACtB,YAAM,KAAK,WAAW,KAAK,aAAa,EAAE;AAAA,IAC5C;AACA,QAAI,KAAK,OAAO;AACd,YAAM,KAAK,UAAU,KAAK,KAAK,EAAE;AAAA,IACnC;AACA,UAAM,KAAK,MAAM,KAAK,IAAI,CAAC;AAAA,EAC7B;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAcO,SAAS,oBACd,KACA,KACQ;AACR,SAAO,IACJ,QAAQ,mBAAmB,IAAI,MAAM,EACrC,QAAQ,gBAAgB,IAAI,GAAG,EAC/B,QAAQ,yBAAyB,IAAI,YAAY,EACjD,QAAQ,0BAA0B,IAAI,aAAa,EACnD,QAAQ,qBAAqB,IAAI,QAAQ,EACzC,QAAQ,kBAAkB,IAAI,KAAK;AACxC;AAMO,SAAS,uBAAuB,MAAsB;AAC3D,QAAM,UAAU,KAAK,KAAK;AAC1B,QAAM,QAAQ,QAAQ,MAAM,wCAAwC;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;AAEO,SAAS,oBAAoB,QAA8B;AAChE,MAAI,WAAW,QAAQ,OAAO,WAAW,UAAU;AACjD,UAAM,IAAI,MAAM,+BAA+B;AAAA,EACjD;AACA,QAAM,MAAM;AACZ,MAAI,OAAO,IAAI,SAAS,UAAU;AAChC,UAAM,IAAI,MAAM,kDAAkD;AAAA,EACpE;AACA,MAAI,OAAO,IAAI,UAAU,UAAU;AACjC,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACrE;AACA,MAAI,IAAI,QAAQ,KAAK,IAAI,QAAQ,KAAK;AACpC,UAAM,IAAI,MAAM,6CAA6C;AAAA,EAC/D;AACA,MAAI,OAAO,IAAI,mBAAmB,UAAU;AAC1C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,OAAO,IAAI;AAAA,IACX,gBAAgB,IAAI;AAAA,EACtB;AACF;AAEA,IAAM,oBAAoB;AAG1B,IAAM,wBAAwB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAW9B,IAAM,kCAAkC;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAwBjC,IAAM,oBAAN,cAAgC,mBAAsC;AAAA,EAClE,OAAO;AAAA,EAEhB,MAAM,SACJ,WACA,OACA,SAC0B;AAC1B,UAAM,cAAcC,YAAW;AAC/B,UAAM,YAAY,SAAS;AAC3B,UAAM,UAAU,SAAS,WAAW;AACpC,UAAM,mBAAmB,SAAS;AAElC,UAAM,SAAS,MAAM,cAAc;AACnC,UAAM,YAAY,MAAM,aAAa,CAAC;AAGtC,UAAM,eAAe,UAAU,IAAI,CAAC,MAAM,EAAE,IAAI;AAChD,UAAM,gBAAgB,UACnB,OAAO,CAAC,MAAM,EAAE,WAAW,UAAU,EACrC,IAAI,CAAC,MAAM,EAAE,IAAI;AACpB,UAAM,WAAW,UACd,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,EAChC,IAAI,CAAC,MAAM,EAAE,IAAI;AAEpB,UAAM,eACJ,aAAa,SAAS,IAClB,aAAa,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACzD;AACN,UAAM,gBACJ,cAAc,SAAS,IACnB,cAAc,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IAC1D;AACN,UAAM,WACJ,SAAS,SAAS,IACd,SAAS,IAAI,CAAC,SAAiB,KAAK,IAAI,EAAE,EAAE,KAAK,IAAI,IACrD;AAEN,UAAM,QAAQ,oBAAoB,MAAM,QAAQ;AAChD,UAAM,MAA0B;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,MACL;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF;AACA,UAAM,UAAU,CAAC,MAAc,oBAAoB,GAAG,GAAG;AAEzD,UAAM,cAAc,QAAQ,UAAU,MAAM;AAC5C,UAAM,eACJ,UAAU,gBAAgB,QAAQ,UAAU,iBAAiB,KACzD,QAAQ,UAAU,YAAY,IAC9B,SACA,kCACA,QAAQ,qBAAqB,IAC7B,SACA;AAEN,UAAM,WAAW,UAAU,YAAY;AACvC,UAAM,kBAAkB,UAAU,aAAa;AAC/C,UAAM,cAAc,UAAU,eAAe;AAC7C,UAAM,YAAY,UAAU,SAAS,SAAS;AAE9C,QAAI,CAAC,aAAa,CAAC,kBAAkB;AACnC,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SACE;AAAA,QACF,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,QAAI,CAAC,oBAAoB,CAAC,WAAW;AACnC,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS;AAAA,QACT,UAAU,OAAO,QAAQ;AAAA,MAC3B;AAAA,IACF;AAEA,UAAM,mBAAmB;AACzB,QAAI;AACJ,QAAI;AAEJ,QAAI;AACF,eAAS,UAAU,GAAG,WAAW,kBAAkB,WAAW;AAC5D,cAAM,SAAS,mBACX,MAAM,iBAAiB;AAAA,UACrB,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC,IACD,MAAM,KAAK;AAAA,UACT;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,UACA;AAAA,QACF;AAEJ,sBAAc,OAAO;AACrB,YAAI;AACF,gBAAM,UAAU,uBAAuB,OAAO,IAAI;AAClD,gBAAM,SAAS,KAAK,MAAM,OAAO;AACjC,gBAAM,cAAc,oBAAoB,MAAM;AAC9C,gBAAM,SAAS,YAAY,SAAS;AACpC,iBAAO;AAAA,YACL,IAAIA,YAAW;AAAA,YACf;AAAA,YACA,eAAe;AAAA,YACf,eAAe;AAAA,YACf,QAAQ;AAAA,YAGR,SAAS,SACL,eAAe,YAAY,KAAK,OAAO,QAAQ,KAAK,YAAY,IAAI,KACpE,eAAe,YAAY,KAAK,MAAM,QAAQ,KAAK,YAAY,IAAI;AAAA,YACvE,UAAU,OAAO,QAAQ;AAAA,YACzB,QAAQ,OAAO,YAAY,KAAK;AAAA,YAChC,SAAS;AAAA,cACP,OAAO,YAAY;AAAA,cACnB,gBAAgB,YAAY;AAAA,cAC5B,MAAM,YAAY;AAAA,YACpB;AAAA,UACF;AAAA,QACF,SAAS,UAAU;AACjB,2BACE,oBAAoB,QAAQ,WAAW,IAAI,MAAM,OAAO,QAAQ,CAAC;AAAA,QACrE;AAAA,MACF;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,wCAAwC,gBAAgB,cAAc,gBAAgB,WAAW,SAAS;AAAA,QACnH,UAAU,OAAO,QAAQ;AAAA,QACzB,QAAQ;AAAA,QACR,SAAS,EAAE,SAAS,aAAa,MAAM,GAAG,GAAG,EAAE;AAAA,MACjD;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,UAAU,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC/D,YAAM,UAAmC;AAAA,QACvC,OAAO;AAAA,QACP,OAAO;AAAA,MACT;AAEA,UAAI,aAAa,WAAW,GAAG,GAAG;AAChC,gBAAQ,aAAa,IAAI;AACzB,gBAAQ,MAAM,IAAI;AAClB,gBAAQ,cAAc,IAAI;AAC1B,gBAAQ,eACN,OAAO,IAAI,iBAAiB,WACxB,IAAI,aAAa,MAAM,GAAG,GAAI,IAC9B,IAAI;AAAA,MACZ;AAEA,aAAO;AAAA,QACL,IAAIA,YAAW;AAAA,QACf;AAAA,QACA,eAAe;AAAA,QACf,eAAe;AAAA,QACf;AAAA,QACA,SAAS,0BAA0B,OAAO;AAAA,QAC1C,UAAU,OAAO,QAAQ;AAAA,QACzB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAc,iBACZ,WACA,SACA,QACA,QACA,iBACA,aAC2B;AAC3B,UAAM,YAAY,gBAAgB;AAAA,MAChC,SAAS,UAAU;AAAA,MACnB,QAAQ;AAAA,MACR,SAAS,UAAU;AAAA,IACrB,CAAC;AACD,UAAM,SAAS,MAAM,aAAa;AAAA,MAChC,OAAO,UAAU,OAAO;AAAA,MACxB;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,IACF,CAAC;AACD,WAAO,EAAE,MAAM,OAAO,KAAK;AAAA,EAC7B;AACF;;;ALvVA,IAAM,oBAAoB,IAAI,kBAAkB;AAEhD,IAAM,aAAiD;AAAA,EACrD,kBAAkB,IAAI,wBAAwB;AAAA,EAC9C,cAAc,IAAI,qBAAqB;AAAA,EACvC,YAAY,IAAI,cAAc;AAAA,EAC9B,WAAW;AAAA;AAAA,EAEX,QAAQ;AACV;AAQO,SAAS,kBACd,MACA,WACM;AACN,aAAW,IAAI,IAAI;AACrB;AAQO,SAAS,aAAa,MAA8C;AACzE,SAAO,WAAW,IAAI;AACxB;AAUA,eAAsB,mBACpB,OACA,YACA,SAC4B;AAC5B,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO,CAAC;AAAA,EACV;AACA,SAAO,QAAQ;AAAA,IACb,WAAW,IAAI,OAAO,cAAc;AAClC,YAAM,YAAY,WAAW,UAAU,IAAI;AAC3C,UAAI,CAAC,WAAW;AACd,eAAO;AAAA,UACL,IAAIC,YAAW;AAAA,UACf,aAAaA,YAAW;AAAA,UACxB,eAAe,UAAU;AAAA,UACzB,eAAe;AAAA,UACf;AAAA,UACA,SAAS,+BAA+B,UAAU,IAAI;AAAA,UACtD,UAAU;AAAA,QACZ;AAAA,MACF;AACA,YAAM,UAAU,KAAK,IAAI;AACzB,YAAM,SAAS,MAAM,UAAU,SAAS,WAAW,OAAO,OAAO;AACjE,YAAM,aAAa,KAAK,IAAI,IAAI;AAChC,aAAO,EAAE,GAAG,QAAQ,UAAU,WAAW;AAAA,IAC3C,CAAC;AAAA,EACH;AACF;",
6
+ "names": ["z", "LLMStepType", "z", "AssertionResultStatus", "z", "randomUUID", "message", "randomUUID", "randomUUID", "randomUUID", "randomUUID", "randomUUID", "randomUUID", "randomUUID"]
7
7
  }
@@ -25,6 +25,8 @@ export interface AssertionContext {
25
25
  workDir?: string;
26
26
  /** LLM configuration (used by llm_judge) */
27
27
  llmConfig?: LlmConfig;
28
+ /** Default model for llm_judge when assertion.model is not set. Caller provides this. */
29
+ defaultJudgeModel?: string;
28
30
  /**
29
31
  * Optional stub for llm_judge: when set, the evaluator uses this instead of the real AI call.
30
32
  * Used only in tests to avoid hitting the API.
@@ -29,4 +29,5 @@ export { AssertionEvaluator } from "./assertion-evaluator.js";
29
29
  export type { AssertionContext, LlmConfig, LlmJudgeGenerateTextOptions, } from "./assertion-evaluator.js";
30
30
  export { SkillWasCalledEvaluator } from "./skill-was-called-evaluator.js";
31
31
  export { BuildPassedEvaluator } from "./build-passed-evaluator.js";
32
+ export { TimeEvaluator } from "./time-evaluator.js";
32
33
  export { LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type JudgeResult, } from "./llm-judge-evaluator.js";
@@ -0,0 +1,11 @@
1
+ import type { TimeAssertion, AssertionResult, EvaluationInput } from "../types/index.js";
2
+ import { AssertionEvaluator } from "./assertion-evaluator.js";
3
+ /**
4
+ * Evaluator for "time_limit" assertion: passes if the scenario completed
5
+ * within the configured maximum duration (maxDurationMs).
6
+ */
7
+ export declare class TimeEvaluator extends AssertionEvaluator<TimeAssertion> {
8
+ readonly type: "time_limit";
9
+ evaluate(assertion: TimeAssertion, input: EvaluationInput): AssertionResult;
10
+ private createResult;
11
+ }
@@ -4,5 +4,5 @@
4
4
  * Assertion framework for AI agent evaluations.
5
5
  * Supports skill invocation checks, build validation, and LLM-based judging.
6
6
  */
7
- export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
8
- export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, } from "./evaluators/index.js";
7
+ export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type LlmJudgeAssertion, LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, AssertionResultSchema, AssertionResultStatus, type AssertionResult, type EvaluationInput, type FileDiff, } from "./types/index.js";
8
+ export { evaluateAssertions, registerEvaluator, getEvaluator, AssertionEvaluator, SkillWasCalledEvaluator, BuildPassedEvaluator, TimeEvaluator, LlmJudgeEvaluator, formatTraceForJudge, replacePlaceholders, stripMarkdownCodeBlock, validateJudgeResult, type AssertionContext, type LlmConfig, type LlmJudgeGenerateTextOptions, type JudgeResult, } from "./evaluators/index.js";
@@ -35,6 +35,15 @@ export declare const LlmJudgeAssertionSchema: z.ZodObject<{
35
35
  temperature: z.ZodOptional<z.ZodNumber>;
36
36
  }, z.core.$strip>;
37
37
  export type LlmJudgeAssertion = z.infer<typeof LlmJudgeAssertionSchema>;
38
+ /**
39
+ * Assertion: scenario must complete within a maximum duration.
40
+ * Deterministic check against the scenario execution time.
41
+ */
42
+ export declare const TimeAssertionSchema: z.ZodObject<{
43
+ type: z.ZodLiteral<"time_limit">;
44
+ maxDurationMs: z.ZodNumber;
45
+ }, z.core.$strip>;
46
+ export type TimeAssertion = z.infer<typeof TimeAssertionSchema>;
38
47
  /**
39
48
  * Union of all assertion types.
40
49
  * Each assertion has a type and type-specific data.
@@ -47,6 +56,9 @@ export declare const AssertionSchema: z.ZodUnion<readonly [z.ZodObject<{
47
56
  type: z.ZodLiteral<"build_passed">;
48
57
  command: z.ZodOptional<z.ZodString>;
49
58
  expectedExitCode: z.ZodOptional<z.ZodNumber>;
59
+ }, z.core.$strip>, z.ZodObject<{
60
+ type: z.ZodLiteral<"time_limit">;
61
+ maxDurationMs: z.ZodNumber;
50
62
  }, z.core.$strip>, z.ZodObject<{
51
63
  type: z.ZodLiteral<"llm_judge">;
52
64
  prompt: z.ZodString;
@@ -1,4 +1,4 @@
1
- export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type LlmJudgeAssertion, } from "./assertions.js";
1
+ export { AssertionSchema, SkillWasCalledAssertionSchema, BuildPassedAssertionSchema, TimeAssertionSchema, LlmJudgeAssertionSchema, type Assertion, type SkillWasCalledAssertion, type BuildPassedAssertion, type TimeAssertion, type LlmJudgeAssertion, } from "./assertions.js";
2
2
  export { LLMTraceSchema, LLMTraceStepSchema, LLMTraceSummarySchema, LLMBreakdownStatsSchema, TokenUsageSchema, LLMStepType, type LLMTrace, type LLMTraceStep, type LLMTraceSummary, type LLMBreakdownStats, type TokenUsage, } from "./trace.js";
3
3
  export { AssertionResultSchema, AssertionResultStatus, type AssertionResult, } from "./result.js";
4
4
  export { type EvaluationInput, type FileDiff, type FileStatus, } from "./input.js";
@@ -23,4 +23,6 @@ export interface EvaluationInput {
23
23
  llmTrace?: LLMTrace;
24
24
  /** List of files that were modified during the evaluation */
25
25
  fileDiffs?: FileDiff[];
26
+ /** Scenario execution duration in milliseconds */
27
+ durationMs?: number;
26
28
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@wix/eval-assertions",
3
- "version": "0.14.0",
3
+ "version": "0.16.0",
4
4
  "description": "Assertion framework for AI agent evaluations - supports skill invocation checks, build validation, and LLM-based judging",
5
5
  "files": [
6
6
  "build"
@@ -60,5 +60,5 @@
60
60
  ],
61
61
  "license": "MIT",
62
62
  "author": "Wix",
63
- "falconPackageHash": "135086b1ca08c74e4be4c2b5d49b26b93ee079701fc7c8d3bff1ac52"
63
+ "falconPackageHash": "9d0a90b3fcf13f9ce2aa735a208d8e96d027956486330ac73dba07d1"
64
64
  }