@wix/evalforge-types 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -27,9 +27,11 @@ __export(index_exports, {
27
27
  ApiCallSchema: () => ApiCallSchema,
28
28
  AssertionResultSchema: () => AssertionResultSchema,
29
29
  AssertionResultStatus: () => AssertionResultStatus,
30
+ AssertionSchema: () => AssertionSchema,
30
31
  BaseEntitySchema: () => BaseEntitySchema,
31
32
  BaseTestSchema: () => BaseTestSchema,
32
33
  BuildCheckTestSchema: () => BuildCheckTestSchema,
34
+ BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
33
35
  CommandExecutionSchema: () => CommandExecutionSchema,
34
36
  CommandExecutionTestSchema: () => CommandExecutionTestSchema,
35
37
  CreateAgentInputSchema: () => CreateAgentInputSchema,
@@ -71,6 +73,7 @@ __export(index_exports, {
71
73
  LeanEvaluationResultSchema: () => LeanEvaluationResultSchema,
72
74
  LiveTraceEventSchema: () => LiveTraceEventSchema,
73
75
  LiveTraceEventType: () => LiveTraceEventType,
76
+ LlmJudgeAssertionSchema: () => LlmJudgeAssertionSchema,
74
77
  LocalProjectConfigSchema: () => LocalProjectConfigSchema,
75
78
  MCPServerConfigSchema: () => MCPServerConfigSchema,
76
79
  MetaSiteConfigSchema: () => MetaSiteConfigSchema,
@@ -86,6 +89,7 @@ __export(index_exports, {
86
89
  SkillMetadataSchema: () => SkillMetadataSchema,
87
90
  SkillSchema: () => SkillSchema,
88
91
  SkillVersionSchema: () => SkillVersionSchema,
92
+ SkillWasCalledAssertionSchema: () => SkillWasCalledAssertionSchema,
89
93
  SkillsGroupSchema: () => SkillsGroupSchema,
90
94
  TRACE_EVENT_PREFIX: () => TRACE_EVENT_PREFIX,
91
95
  TargetSchema: () => TargetSchema,
@@ -492,34 +496,67 @@ var TestSchema = import_zod17.z.discriminatedUnion("type", [
492
496
  PlaywrightNLTestSchema
493
497
  ]);
494
498
 
495
- // src/scenario/environment.ts
499
+ // src/scenario/assertions.ts
496
500
  var import_zod18 = require("zod");
497
- var LocalProjectConfigSchema = import_zod18.z.object({
501
+ var SkillWasCalledAssertionSchema = import_zod18.z.object({
502
+ type: import_zod18.z.literal("skill_was_called"),
503
+ /** Name of the skill that must have been called (matched against trace Skill tool args) */
504
+ skillName: import_zod18.z.string()
505
+ });
506
+ var BuildPassedAssertionSchema = import_zod18.z.object({
507
+ type: import_zod18.z.literal("build_passed"),
508
+ /** Command to run (default: "yarn build") */
509
+ command: import_zod18.z.string().optional(),
510
+ /** Expected exit code (default: 0) */
511
+ expectedExitCode: import_zod18.z.number().int().optional()
512
+ });
513
+ var LlmJudgeAssertionSchema = import_zod18.z.object({
514
+ type: import_zod18.z.literal("llm_judge"),
515
+ /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
516
+ prompt: import_zod18.z.string(),
517
+ /** Optional system prompt for the judge (default asks for JSON with score) */
518
+ systemPrompt: import_zod18.z.string().optional(),
519
+ /** Minimum score to pass (0–100, default 70) */
520
+ minScore: import_zod18.z.number().int().min(0).max(100).optional(),
521
+ /** Model for the judge (e.g. claude-3-5-haiku) */
522
+ model: import_zod18.z.string().optional(),
523
+ maxTokens: import_zod18.z.number().int().optional(),
524
+ temperature: import_zod18.z.number().min(0).max(1).optional()
525
+ });
526
+ var AssertionSchema = import_zod18.z.union([
527
+ SkillWasCalledAssertionSchema,
528
+ BuildPassedAssertionSchema,
529
+ LlmJudgeAssertionSchema
530
+ ]);
531
+
532
+ // src/scenario/environment.ts
533
+ var import_zod19 = require("zod");
534
+ var LocalProjectConfigSchema = import_zod19.z.object({
498
535
  /** Template ID to use for the local project */
499
- templateId: import_zod18.z.string().optional(),
536
+ templateId: import_zod19.z.string().optional(),
500
537
  /** Files to create in the project */
501
- files: import_zod18.z.array(
502
- import_zod18.z.object({
503
- path: import_zod18.z.string().min(1),
504
- content: import_zod18.z.string().min(1)
538
+ files: import_zod19.z.array(
539
+ import_zod19.z.object({
540
+ path: import_zod19.z.string().min(1),
541
+ content: import_zod19.z.string().min(1)
505
542
  })
506
543
  ).optional()
507
544
  });
508
- var MetaSiteConfigSchema = import_zod18.z.object({
509
- configurations: import_zod18.z.array(
510
- import_zod18.z.object({
511
- name: import_zod18.z.string().min(1),
512
- apiCalls: import_zod18.z.array(
513
- import_zod18.z.object({
514
- url: import_zod18.z.string().url(),
515
- method: import_zod18.z.enum(["POST", "PUT"]),
516
- body: import_zod18.z.string()
545
+ var MetaSiteConfigSchema = import_zod19.z.object({
546
+ configurations: import_zod19.z.array(
547
+ import_zod19.z.object({
548
+ name: import_zod19.z.string().min(1),
549
+ apiCalls: import_zod19.z.array(
550
+ import_zod19.z.object({
551
+ url: import_zod19.z.string().url(),
552
+ method: import_zod19.z.enum(["POST", "PUT"]),
553
+ body: import_zod19.z.string()
517
554
  })
518
555
  )
519
556
  })
520
557
  ).optional()
521
558
  });
522
- var EnvironmentSchema = import_zod18.z.object({
559
+ var EnvironmentSchema = import_zod19.z.object({
523
560
  /** Local project configuration */
524
561
  localProject: LocalProjectConfigSchema.optional(),
525
562
  /** Meta site configuration */
@@ -527,18 +564,20 @@ var EnvironmentSchema = import_zod18.z.object({
527
564
  });
528
565
 
529
566
  // src/scenario/test-scenario.ts
530
- var import_zod19 = require("zod");
531
- var ExpectedFileSchema = import_zod19.z.object({
567
+ var import_zod20 = require("zod");
568
+ var ExpectedFileSchema = import_zod20.z.object({
532
569
  /** Relative path where the file should be created */
533
- path: import_zod19.z.string(),
570
+ path: import_zod20.z.string(),
534
571
  /** Optional expected content */
535
- content: import_zod19.z.string().optional()
572
+ content: import_zod20.z.string().optional()
536
573
  });
537
574
  var TestScenarioSchema = TenantEntitySchema.extend({
538
575
  /** The prompt sent to the agent to trigger the task */
539
- triggerPrompt: import_zod19.z.string().min(10),
576
+ triggerPrompt: import_zod20.z.string().min(10),
540
577
  /** ID of the template to use for this scenario */
541
- templateId: import_zod19.z.string().optional()
578
+ templateId: import_zod20.z.string().optional(),
579
+ /** Assertions to evaluate for this scenario */
580
+ assertions: import_zod20.z.array(AssertionSchema).optional()
542
581
  });
543
582
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
544
583
  id: true,
@@ -549,10 +588,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
549
588
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
550
589
 
551
590
  // src/suite/test-suite.ts
552
- var import_zod20 = require("zod");
591
+ var import_zod21 = require("zod");
553
592
  var TestSuiteSchema = TenantEntitySchema.extend({
554
593
  /** IDs of test scenarios in this suite */
555
- scenarioIds: import_zod20.z.array(import_zod20.z.string())
594
+ scenarioIds: import_zod21.z.array(import_zod21.z.string())
556
595
  });
557
596
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
558
597
  id: true,
@@ -563,21 +602,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
563
602
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
564
603
 
565
604
  // src/evaluation/metrics.ts
566
- var import_zod21 = require("zod");
567
- var TokenUsageSchema = import_zod21.z.object({
568
- prompt: import_zod21.z.number(),
569
- completion: import_zod21.z.number(),
570
- total: import_zod21.z.number()
571
- });
572
- var EvalMetricsSchema = import_zod21.z.object({
573
- totalAssertions: import_zod21.z.number(),
574
- passed: import_zod21.z.number(),
575
- failed: import_zod21.z.number(),
576
- skipped: import_zod21.z.number(),
577
- errors: import_zod21.z.number(),
578
- passRate: import_zod21.z.number(),
579
- avgDuration: import_zod21.z.number(),
580
- totalDuration: import_zod21.z.number()
605
+ var import_zod22 = require("zod");
606
+ var TokenUsageSchema = import_zod22.z.object({
607
+ prompt: import_zod22.z.number(),
608
+ completion: import_zod22.z.number(),
609
+ total: import_zod22.z.number()
610
+ });
611
+ var EvalMetricsSchema = import_zod22.z.object({
612
+ totalAssertions: import_zod22.z.number(),
613
+ passed: import_zod22.z.number(),
614
+ failed: import_zod22.z.number(),
615
+ skipped: import_zod22.z.number(),
616
+ errors: import_zod22.z.number(),
617
+ passRate: import_zod22.z.number(),
618
+ avgDuration: import_zod22.z.number(),
619
+ totalDuration: import_zod22.z.number()
581
620
  });
582
621
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
583
622
  EvalStatus2["PENDING"] = "pending";
@@ -587,7 +626,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
587
626
  EvalStatus2["CANCELLED"] = "cancelled";
588
627
  return EvalStatus2;
589
628
  })(EvalStatus || {});
590
- var EvalStatusSchema = import_zod21.z.enum(EvalStatus);
629
+ var EvalStatusSchema = import_zod22.z.enum(EvalStatus);
591
630
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
592
631
  LLMStepType2["COMPLETION"] = "completion";
593
632
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -595,52 +634,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
595
634
  LLMStepType2["THINKING"] = "thinking";
596
635
  return LLMStepType2;
597
636
  })(LLMStepType || {});
598
- var LLMTraceStepSchema = import_zod21.z.object({
599
- id: import_zod21.z.string(),
600
- stepNumber: import_zod21.z.number(),
601
- type: import_zod21.z.enum(LLMStepType),
602
- model: import_zod21.z.string(),
603
- provider: import_zod21.z.string(),
604
- startedAt: import_zod21.z.string(),
605
- durationMs: import_zod21.z.number(),
637
+ var LLMTraceStepSchema = import_zod22.z.object({
638
+ id: import_zod22.z.string(),
639
+ stepNumber: import_zod22.z.number(),
640
+ type: import_zod22.z.enum(LLMStepType),
641
+ model: import_zod22.z.string(),
642
+ provider: import_zod22.z.string(),
643
+ startedAt: import_zod22.z.string(),
644
+ durationMs: import_zod22.z.number(),
606
645
  tokenUsage: TokenUsageSchema,
607
- costUsd: import_zod21.z.number(),
608
- toolName: import_zod21.z.string().optional(),
609
- toolArguments: import_zod21.z.string().optional(),
610
- inputPreview: import_zod21.z.string().optional(),
611
- outputPreview: import_zod21.z.string().optional(),
612
- success: import_zod21.z.boolean(),
613
- error: import_zod21.z.string().optional()
614
- });
615
- var LLMBreakdownStatsSchema = import_zod21.z.object({
616
- count: import_zod21.z.number(),
617
- durationMs: import_zod21.z.number(),
618
- tokens: import_zod21.z.number(),
619
- costUsd: import_zod21.z.number()
620
- });
621
- var LLMTraceSummarySchema = import_zod21.z.object({
622
- totalSteps: import_zod21.z.number(),
623
- totalDurationMs: import_zod21.z.number(),
646
+ costUsd: import_zod22.z.number(),
647
+ toolName: import_zod22.z.string().optional(),
648
+ toolArguments: import_zod22.z.string().optional(),
649
+ inputPreview: import_zod22.z.string().optional(),
650
+ outputPreview: import_zod22.z.string().optional(),
651
+ success: import_zod22.z.boolean(),
652
+ error: import_zod22.z.string().optional()
653
+ });
654
+ var LLMBreakdownStatsSchema = import_zod22.z.object({
655
+ count: import_zod22.z.number(),
656
+ durationMs: import_zod22.z.number(),
657
+ tokens: import_zod22.z.number(),
658
+ costUsd: import_zod22.z.number()
659
+ });
660
+ var LLMTraceSummarySchema = import_zod22.z.object({
661
+ totalSteps: import_zod22.z.number(),
662
+ totalDurationMs: import_zod22.z.number(),
624
663
  totalTokens: TokenUsageSchema,
625
- totalCostUsd: import_zod21.z.number(),
626
- stepTypeBreakdown: import_zod21.z.record(import_zod21.z.string(), LLMBreakdownStatsSchema).optional(),
627
- modelBreakdown: import_zod21.z.record(import_zod21.z.string(), LLMBreakdownStatsSchema),
628
- modelsUsed: import_zod21.z.array(import_zod21.z.string())
629
- });
630
- var LLMTraceSchema = import_zod21.z.object({
631
- id: import_zod21.z.string(),
632
- steps: import_zod21.z.array(LLMTraceStepSchema),
664
+ totalCostUsd: import_zod22.z.number(),
665
+ stepTypeBreakdown: import_zod22.z.record(import_zod22.z.string(), LLMBreakdownStatsSchema).optional(),
666
+ modelBreakdown: import_zod22.z.record(import_zod22.z.string(), LLMBreakdownStatsSchema),
667
+ modelsUsed: import_zod22.z.array(import_zod22.z.string())
668
+ });
669
+ var LLMTraceSchema = import_zod22.z.object({
670
+ id: import_zod22.z.string(),
671
+ steps: import_zod22.z.array(LLMTraceStepSchema),
633
672
  summary: LLMTraceSummarySchema
634
673
  });
635
674
 
636
675
  // src/evaluation/eval-result.ts
637
- var import_zod24 = require("zod");
676
+ var import_zod25 = require("zod");
638
677
 
639
678
  // src/evaluation/eval-run.ts
640
- var import_zod23 = require("zod");
679
+ var import_zod24 = require("zod");
641
680
 
642
681
  // src/evaluation/live-trace.ts
643
- var import_zod22 = require("zod");
682
+ var import_zod23 = require("zod");
644
683
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
645
684
  LiveTraceEventType2["THINKING"] = "thinking";
646
685
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -649,31 +688,31 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
649
688
  LiveTraceEventType2["DIAGNOSTIC"] = "diagnostic";
650
689
  return LiveTraceEventType2;
651
690
  })(LiveTraceEventType || {});
652
- var LiveTraceEventSchema = import_zod22.z.object({
691
+ var LiveTraceEventSchema = import_zod23.z.object({
653
692
  /** The evaluation run ID */
654
- evalRunId: import_zod22.z.string(),
693
+ evalRunId: import_zod23.z.string(),
655
694
  /** The scenario ID being executed */
656
- scenarioId: import_zod22.z.string(),
695
+ scenarioId: import_zod23.z.string(),
657
696
  /** The scenario name for display */
658
- scenarioName: import_zod22.z.string(),
697
+ scenarioName: import_zod23.z.string(),
659
698
  /** The target ID (skill, agent, etc.) */
660
- targetId: import_zod22.z.string(),
699
+ targetId: import_zod23.z.string(),
661
700
  /** The target name for display */
662
- targetName: import_zod22.z.string(),
701
+ targetName: import_zod23.z.string(),
663
702
  /** Step number in the current scenario execution */
664
- stepNumber: import_zod22.z.number(),
703
+ stepNumber: import_zod23.z.number(),
665
704
  /** Type of trace event */
666
- type: import_zod22.z.enum(LiveTraceEventType),
705
+ type: import_zod23.z.enum(LiveTraceEventType),
667
706
  /** Tool name if this is a tool_use event */
668
- toolName: import_zod22.z.string().optional(),
707
+ toolName: import_zod23.z.string().optional(),
669
708
  /** Tool arguments preview (truncated JSON) */
670
- toolArgs: import_zod22.z.string().optional(),
709
+ toolArgs: import_zod23.z.string().optional(),
671
710
  /** Output preview (truncated text) */
672
- outputPreview: import_zod22.z.string().optional(),
711
+ outputPreview: import_zod23.z.string().optional(),
673
712
  /** Timestamp when this event occurred */
674
- timestamp: import_zod22.z.string(),
713
+ timestamp: import_zod23.z.string(),
675
714
  /** Whether this is the final event for this scenario */
676
- isComplete: import_zod22.z.boolean()
715
+ isComplete: import_zod23.z.boolean()
677
716
  });
678
717
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
679
718
  function parseTraceEventLine(line) {
@@ -701,14 +740,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
701
740
  TriggerType2["MANUAL"] = "MANUAL";
702
741
  return TriggerType2;
703
742
  })(TriggerType || {});
704
- var TriggerMetadataSchema = import_zod23.z.object({
705
- version: import_zod23.z.string().optional(),
706
- resourceUpdated: import_zod23.z.array(import_zod23.z.string()).optional()
743
+ var TriggerMetadataSchema = import_zod24.z.object({
744
+ version: import_zod24.z.string().optional(),
745
+ resourceUpdated: import_zod24.z.array(import_zod24.z.string()).optional()
707
746
  });
708
- var TriggerSchema = import_zod23.z.object({
709
- id: import_zod23.z.string(),
747
+ var TriggerSchema = import_zod24.z.object({
748
+ id: import_zod24.z.string(),
710
749
  metadata: TriggerMetadataSchema.optional(),
711
- type: import_zod23.z.enum(TriggerType)
750
+ type: import_zod24.z.enum(TriggerType)
712
751
  });
713
752
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
714
753
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -726,89 +765,89 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
726
765
  FailureSeverity2["LOW"] = "low";
727
766
  return FailureSeverity2;
728
767
  })(FailureSeverity || {});
729
- var DiffLineTypeSchema = import_zod23.z.enum(["added", "removed", "unchanged"]);
730
- var DiffLineSchema = import_zod23.z.object({
768
+ var DiffLineTypeSchema = import_zod24.z.enum(["added", "removed", "unchanged"]);
769
+ var DiffLineSchema = import_zod24.z.object({
731
770
  type: DiffLineTypeSchema,
732
- content: import_zod23.z.string(),
733
- lineNumber: import_zod23.z.number()
734
- });
735
- var DiffContentSchema = import_zod23.z.object({
736
- path: import_zod23.z.string(),
737
- expected: import_zod23.z.string(),
738
- actual: import_zod23.z.string(),
739
- diffLines: import_zod23.z.array(DiffLineSchema)
740
- });
741
- var CommandExecutionSchema = import_zod23.z.object({
742
- command: import_zod23.z.string(),
743
- exitCode: import_zod23.z.number(),
744
- output: import_zod23.z.string().optional(),
745
- duration: import_zod23.z.number()
746
- });
747
- var FileModificationSchema = import_zod23.z.object({
748
- path: import_zod23.z.string(),
749
- action: import_zod23.z.enum(["created", "modified", "deleted"])
750
- });
751
- var ApiCallSchema = import_zod23.z.object({
752
- endpoint: import_zod23.z.string(),
753
- tokensUsed: import_zod23.z.number(),
754
- duration: import_zod23.z.number()
755
- });
756
- var ExecutionTraceSchema = import_zod23.z.object({
757
- commands: import_zod23.z.array(CommandExecutionSchema),
758
- filesModified: import_zod23.z.array(FileModificationSchema),
759
- apiCalls: import_zod23.z.array(ApiCallSchema),
760
- totalDuration: import_zod23.z.number()
761
- });
762
- var FailureAnalysisSchema = import_zod23.z.object({
763
- category: import_zod23.z.enum(FailureCategory),
764
- severity: import_zod23.z.enum(FailureSeverity),
765
- summary: import_zod23.z.string(),
766
- details: import_zod23.z.string(),
767
- rootCause: import_zod23.z.string(),
768
- suggestedFix: import_zod23.z.string(),
769
- relatedAssertions: import_zod23.z.array(import_zod23.z.string()),
770
- codeSnippet: import_zod23.z.string().optional(),
771
- similarIssues: import_zod23.z.array(import_zod23.z.string()).optional(),
772
- patternId: import_zod23.z.string().optional(),
771
+ content: import_zod24.z.string(),
772
+ lineNumber: import_zod24.z.number()
773
+ });
774
+ var DiffContentSchema = import_zod24.z.object({
775
+ path: import_zod24.z.string(),
776
+ expected: import_zod24.z.string(),
777
+ actual: import_zod24.z.string(),
778
+ diffLines: import_zod24.z.array(DiffLineSchema)
779
+ });
780
+ var CommandExecutionSchema = import_zod24.z.object({
781
+ command: import_zod24.z.string(),
782
+ exitCode: import_zod24.z.number(),
783
+ output: import_zod24.z.string().optional(),
784
+ duration: import_zod24.z.number()
785
+ });
786
+ var FileModificationSchema = import_zod24.z.object({
787
+ path: import_zod24.z.string(),
788
+ action: import_zod24.z.enum(["created", "modified", "deleted"])
789
+ });
790
+ var ApiCallSchema = import_zod24.z.object({
791
+ endpoint: import_zod24.z.string(),
792
+ tokensUsed: import_zod24.z.number(),
793
+ duration: import_zod24.z.number()
794
+ });
795
+ var ExecutionTraceSchema = import_zod24.z.object({
796
+ commands: import_zod24.z.array(CommandExecutionSchema),
797
+ filesModified: import_zod24.z.array(FileModificationSchema),
798
+ apiCalls: import_zod24.z.array(ApiCallSchema),
799
+ totalDuration: import_zod24.z.number()
800
+ });
801
+ var FailureAnalysisSchema = import_zod24.z.object({
802
+ category: import_zod24.z.enum(FailureCategory),
803
+ severity: import_zod24.z.enum(FailureSeverity),
804
+ summary: import_zod24.z.string(),
805
+ details: import_zod24.z.string(),
806
+ rootCause: import_zod24.z.string(),
807
+ suggestedFix: import_zod24.z.string(),
808
+ relatedAssertions: import_zod24.z.array(import_zod24.z.string()),
809
+ codeSnippet: import_zod24.z.string().optional(),
810
+ similarIssues: import_zod24.z.array(import_zod24.z.string()).optional(),
811
+ patternId: import_zod24.z.string().optional(),
773
812
  // Extended fields for detailed debugging
774
813
  diff: DiffContentSchema.optional(),
775
814
  executionTrace: ExecutionTraceSchema.optional()
776
815
  });
777
816
  var EvalRunSchema = TenantEntitySchema.extend({
778
817
  /** Agent ID for this run */
779
- agentId: import_zod23.z.string().optional(),
818
+ agentId: import_zod24.z.string().optional(),
780
819
  /** Skills group ID for this run */
781
- skillsGroupId: import_zod23.z.string().optional(),
820
+ skillsGroupId: import_zod24.z.string().optional(),
782
821
  /** Scenario IDs to run */
783
- scenarioIds: import_zod23.z.array(import_zod23.z.string()),
822
+ scenarioIds: import_zod24.z.array(import_zod24.z.string()),
784
823
  /** Current status */
785
824
  status: EvalStatusSchema,
786
825
  /** Progress percentage (0-100) */
787
- progress: import_zod23.z.number(),
788
- /** Results for each scenario/target combination */
789
- results: import_zod23.z.array(EvalRunResultSchema),
826
+ progress: import_zod24.z.number(),
827
+ /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
828
+ results: import_zod24.z.array(import_zod24.z.lazy(() => EvalRunResultSchema)),
790
829
  /** Aggregated metrics across all results */
791
830
  aggregateMetrics: EvalMetricsSchema,
792
831
  /** Failure analyses */
793
- failureAnalyses: import_zod23.z.array(FailureAnalysisSchema).optional(),
832
+ failureAnalyses: import_zod24.z.array(FailureAnalysisSchema).optional(),
794
833
  /** Aggregated LLM trace summary */
795
834
  llmTraceSummary: LLMTraceSummarySchema.optional(),
796
835
  /** What triggered this run */
797
836
  trigger: TriggerSchema.optional(),
798
837
  /** When the run started (set when evaluation is triggered) */
799
- startedAt: import_zod23.z.string().optional(),
838
+ startedAt: import_zod24.z.string().optional(),
800
839
  /** When the run completed */
801
- completedAt: import_zod23.z.string().optional(),
840
+ completedAt: import_zod24.z.string().optional(),
802
841
  /** Live trace events captured during execution (for playback on results page) */
803
- liveTraceEvents: import_zod23.z.array(LiveTraceEventSchema).optional(),
842
+ liveTraceEvents: import_zod24.z.array(LiveTraceEventSchema).optional(),
804
843
  /** Remote job ID for tracking execution in Dev Machines */
805
- jobId: import_zod23.z.string().optional(),
844
+ jobId: import_zod24.z.string().optional(),
806
845
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
807
- jobStatus: import_zod23.z.string().optional(),
846
+ jobStatus: import_zod24.z.string().optional(),
808
847
  /** Remote job error message if the job failed */
809
- jobError: import_zod23.z.string().optional(),
848
+ jobError: import_zod24.z.string().optional(),
810
849
  /** Timestamp of the last job status check */
811
- jobStatusCheckedAt: import_zod23.z.string().optional()
850
+ jobStatusCheckedAt: import_zod24.z.string().optional()
812
851
  });
813
852
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
814
853
  id: true,
@@ -821,28 +860,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
821
860
  startedAt: true,
822
861
  completedAt: true
823
862
  });
824
- var EvaluationProgressSchema = import_zod23.z.object({
825
- runId: import_zod23.z.string(),
826
- targetId: import_zod23.z.string(),
827
- totalScenarios: import_zod23.z.number(),
828
- completedScenarios: import_zod23.z.number(),
829
- scenarioProgress: import_zod23.z.array(
830
- import_zod23.z.object({
831
- scenarioId: import_zod23.z.string(),
832
- currentStep: import_zod23.z.string(),
833
- error: import_zod23.z.string().optional()
863
+ var EvaluationProgressSchema = import_zod24.z.object({
864
+ runId: import_zod24.z.string(),
865
+ targetId: import_zod24.z.string(),
866
+ totalScenarios: import_zod24.z.number(),
867
+ completedScenarios: import_zod24.z.number(),
868
+ scenarioProgress: import_zod24.z.array(
869
+ import_zod24.z.object({
870
+ scenarioId: import_zod24.z.string(),
871
+ currentStep: import_zod24.z.string(),
872
+ error: import_zod24.z.string().optional()
834
873
  })
835
874
  ),
836
- createdAt: import_zod23.z.number()
875
+ createdAt: import_zod24.z.number()
837
876
  });
838
- var EvaluationLogSchema = import_zod23.z.object({
839
- runId: import_zod23.z.string(),
840
- scenarioId: import_zod23.z.string(),
841
- log: import_zod23.z.object({
842
- level: import_zod23.z.enum(["info", "error", "debug"]),
843
- message: import_zod23.z.string().optional(),
844
- args: import_zod23.z.array(import_zod23.z.any()).optional(),
845
- error: import_zod23.z.string().optional()
877
+ var EvaluationLogSchema = import_zod24.z.object({
878
+ runId: import_zod24.z.string(),
879
+ scenarioId: import_zod24.z.string(),
880
+ log: import_zod24.z.object({
881
+ level: import_zod24.z.enum(["info", "error", "debug"]),
882
+ message: import_zod24.z.string().optional(),
883
+ args: import_zod24.z.array(import_zod24.z.any()).optional(),
884
+ error: import_zod24.z.string().optional()
846
885
  })
847
886
  });
848
887
  var LLM_TIMEOUT = 12e4;
@@ -855,90 +894,89 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
855
894
  AssertionResultStatus2["ERROR"] = "error";
856
895
  return AssertionResultStatus2;
857
896
  })(AssertionResultStatus || {});
858
- var AssertionResultSchema = import_zod24.z.object({
859
- id: import_zod24.z.string(),
860
- assertionId: import_zod24.z.string(),
861
- assertionType: import_zod24.z.string(),
862
- assertionName: import_zod24.z.string(),
863
- status: import_zod24.z.enum(AssertionResultStatus),
864
- message: import_zod24.z.string().optional(),
865
- expected: import_zod24.z.string().optional(),
866
- actual: import_zod24.z.string().optional(),
867
- duration: import_zod24.z.number().optional(),
868
- details: import_zod24.z.record(import_zod24.z.string(), import_zod24.z.unknown()).optional(),
869
- llmTraceSteps: import_zod24.z.array(LLMTraceStepSchema).optional()
870
- });
871
- var EvalRunResultSchema = import_zod24.z.object({
872
- id: import_zod24.z.string(),
873
- targetId: import_zod24.z.string(),
874
- targetName: import_zod24.z.string().optional(),
875
- scenarioId: import_zod24.z.string(),
876
- scenarioName: import_zod24.z.string(),
897
+ var AssertionResultSchema = import_zod25.z.object({
898
+ id: import_zod25.z.string(),
899
+ assertionId: import_zod25.z.string(),
900
+ assertionType: import_zod25.z.string(),
901
+ assertionName: import_zod25.z.string(),
902
+ status: import_zod25.z.enum(AssertionResultStatus),
903
+ message: import_zod25.z.string().optional(),
904
+ expected: import_zod25.z.string().optional(),
905
+ actual: import_zod25.z.string().optional(),
906
+ duration: import_zod25.z.number().optional(),
907
+ details: import_zod25.z.record(import_zod25.z.string(), import_zod25.z.unknown()).optional(),
908
+ llmTraceSteps: import_zod25.z.array(LLMTraceStepSchema).optional()
909
+ });
910
+ var EvalRunResultSchema = import_zod25.z.object({
911
+ id: import_zod25.z.string(),
912
+ targetId: import_zod25.z.string(),
913
+ targetName: import_zod25.z.string().optional(),
914
+ scenarioId: import_zod25.z.string(),
915
+ scenarioName: import_zod25.z.string(),
877
916
  modelConfig: ModelConfigSchema.optional(),
878
- assertionResults: import_zod24.z.array(AssertionResultSchema),
917
+ assertionResults: import_zod25.z.array(AssertionResultSchema),
879
918
  metrics: EvalMetricsSchema.optional(),
880
- passed: import_zod24.z.number(),
881
- failed: import_zod24.z.number(),
882
- passRate: import_zod24.z.number(),
883
- duration: import_zod24.z.number(),
884
- outputText: import_zod24.z.string().optional(),
885
- files: import_zod24.z.array(ExpectedFileSchema).optional(),
886
- /** File diffs showing changes made by the agent during execution */
887
- fileDiffs: import_zod24.z.array(DiffContentSchema).optional(),
888
- startedAt: import_zod24.z.string().optional(),
889
- completedAt: import_zod24.z.string().optional(),
919
+ passed: import_zod25.z.number(),
920
+ failed: import_zod25.z.number(),
921
+ passRate: import_zod25.z.number(),
922
+ duration: import_zod25.z.number(),
923
+ outputText: import_zod25.z.string().optional(),
924
+ files: import_zod25.z.array(ExpectedFileSchema).optional(),
925
+ fileDiffs: import_zod25.z.array(DiffContentSchema).optional(),
926
+ startedAt: import_zod25.z.string().optional(),
927
+ completedAt: import_zod25.z.string().optional(),
890
928
  llmTrace: LLMTraceSchema.optional()
891
929
  });
892
- var PromptResultSchema = import_zod24.z.object({
893
- text: import_zod24.z.string(),
894
- files: import_zod24.z.array(import_zod24.z.unknown()).optional(),
895
- finishReason: import_zod24.z.string().optional(),
896
- reasoning: import_zod24.z.string().optional(),
897
- reasoningDetails: import_zod24.z.unknown().optional(),
898
- toolCalls: import_zod24.z.array(import_zod24.z.unknown()).optional(),
899
- toolResults: import_zod24.z.array(import_zod24.z.unknown()).optional(),
900
- warnings: import_zod24.z.array(import_zod24.z.unknown()).optional(),
901
- sources: import_zod24.z.array(import_zod24.z.unknown()).optional(),
902
- steps: import_zod24.z.array(import_zod24.z.unknown()),
903
- generationTimeMs: import_zod24.z.number(),
904
- prompt: import_zod24.z.string(),
905
- systemPrompt: import_zod24.z.string(),
906
- usage: import_zod24.z.object({
907
- totalTokens: import_zod24.z.number().optional(),
908
- totalMicrocentsSpent: import_zod24.z.number().optional()
930
+ var PromptResultSchema = import_zod25.z.object({
931
+ text: import_zod25.z.string(),
932
+ files: import_zod25.z.array(import_zod25.z.unknown()).optional(),
933
+ finishReason: import_zod25.z.string().optional(),
934
+ reasoning: import_zod25.z.string().optional(),
935
+ reasoningDetails: import_zod25.z.unknown().optional(),
936
+ toolCalls: import_zod25.z.array(import_zod25.z.unknown()).optional(),
937
+ toolResults: import_zod25.z.array(import_zod25.z.unknown()).optional(),
938
+ warnings: import_zod25.z.array(import_zod25.z.unknown()).optional(),
939
+ sources: import_zod25.z.array(import_zod25.z.unknown()).optional(),
940
+ steps: import_zod25.z.array(import_zod25.z.unknown()),
941
+ generationTimeMs: import_zod25.z.number(),
942
+ prompt: import_zod25.z.string(),
943
+ systemPrompt: import_zod25.z.string(),
944
+ usage: import_zod25.z.object({
945
+ totalTokens: import_zod25.z.number().optional(),
946
+ totalMicrocentsSpent: import_zod25.z.number().optional()
909
947
  })
910
948
  });
911
- var EvaluationResultSchema = import_zod24.z.object({
912
- id: import_zod24.z.string(),
913
- runId: import_zod24.z.string(),
914
- timestamp: import_zod24.z.number(),
949
+ var EvaluationResultSchema = import_zod25.z.object({
950
+ id: import_zod25.z.string(),
951
+ runId: import_zod25.z.string(),
952
+ timestamp: import_zod25.z.number(),
915
953
  promptResult: PromptResultSchema,
916
- testResults: import_zod24.z.array(import_zod24.z.unknown()),
917
- tags: import_zod24.z.array(import_zod24.z.string()).optional(),
918
- feedback: import_zod24.z.string().optional(),
919
- score: import_zod24.z.number(),
920
- suiteId: import_zod24.z.string().optional()
921
- });
922
- var LeanEvaluationResultSchema = import_zod24.z.object({
923
- id: import_zod24.z.string(),
924
- runId: import_zod24.z.string(),
925
- timestamp: import_zod24.z.number(),
926
- tags: import_zod24.z.array(import_zod24.z.string()).optional(),
927
- scenarioId: import_zod24.z.string(),
928
- scenarioVersion: import_zod24.z.number().optional(),
929
- targetId: import_zod24.z.string(),
930
- targetVersion: import_zod24.z.number().optional(),
931
- suiteId: import_zod24.z.string().optional(),
932
- score: import_zod24.z.number(),
933
- time: import_zod24.z.number().optional(),
934
- microcentsSpent: import_zod24.z.number().optional()
954
+ testResults: import_zod25.z.array(import_zod25.z.unknown()),
955
+ tags: import_zod25.z.array(import_zod25.z.string()).optional(),
956
+ feedback: import_zod25.z.string().optional(),
957
+ score: import_zod25.z.number(),
958
+ suiteId: import_zod25.z.string().optional()
959
+ });
960
+ var LeanEvaluationResultSchema = import_zod25.z.object({
961
+ id: import_zod25.z.string(),
962
+ runId: import_zod25.z.string(),
963
+ timestamp: import_zod25.z.number(),
964
+ tags: import_zod25.z.array(import_zod25.z.string()).optional(),
965
+ scenarioId: import_zod25.z.string(),
966
+ scenarioVersion: import_zod25.z.number().optional(),
967
+ targetId: import_zod25.z.string(),
968
+ targetVersion: import_zod25.z.number().optional(),
969
+ suiteId: import_zod25.z.string().optional(),
970
+ score: import_zod25.z.number(),
971
+ time: import_zod25.z.number().optional(),
972
+ microcentsSpent: import_zod25.z.number().optional()
935
973
  });
936
974
 
937
975
  // src/project/project.ts
938
- var import_zod25 = require("zod");
976
+ var import_zod26 = require("zod");
939
977
  var ProjectSchema = BaseEntitySchema.extend({
940
- appId: import_zod25.z.string().optional().describe("The ID of the app in Dev Center"),
941
- appSecret: import_zod25.z.string().optional().describe("The secret of the app in Dev Center")
978
+ appId: import_zod26.z.string().optional().describe("The ID of the app in Dev Center"),
979
+ appSecret: import_zod26.z.string().optional().describe("The secret of the app in Dev Center")
942
980
  });
943
981
  var CreateProjectInputSchema = ProjectSchema.omit({
944
982
  id: true,
@@ -949,10 +987,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
949
987
  var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
950
988
 
951
989
  // src/template/template.ts
952
- var import_zod26 = require("zod");
990
+ var import_zod27 = require("zod");
953
991
  var TemplateSchema = TenantEntitySchema.extend({
954
992
  /** URL to download the template from */
955
- downloadUrl: import_zod26.z.url()
993
+ downloadUrl: import_zod27.z.url()
956
994
  });
957
995
  var CreateTemplateInputSchema = TemplateSchema.omit({
958
996
  id: true,
@@ -970,9 +1008,11 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
970
1008
  ApiCallSchema,
971
1009
  AssertionResultSchema,
972
1010
  AssertionResultStatus,
1011
+ AssertionSchema,
973
1012
  BaseEntitySchema,
974
1013
  BaseTestSchema,
975
1014
  BuildCheckTestSchema,
1015
+ BuildPassedAssertionSchema,
976
1016
  CommandExecutionSchema,
977
1017
  CommandExecutionTestSchema,
978
1018
  CreateAgentInputSchema,
@@ -1014,6 +1054,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1014
1054
  LeanEvaluationResultSchema,
1015
1055
  LiveTraceEventSchema,
1016
1056
  LiveTraceEventType,
1057
+ LlmJudgeAssertionSchema,
1017
1058
  LocalProjectConfigSchema,
1018
1059
  MCPServerConfigSchema,
1019
1060
  MetaSiteConfigSchema,
@@ -1029,6 +1070,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1029
1070
  SkillMetadataSchema,
1030
1071
  SkillSchema,
1031
1072
  SkillVersionSchema,
1073
+ SkillWasCalledAssertionSchema,
1032
1074
  SkillsGroupSchema,
1033
1075
  TRACE_EVENT_PREFIX,
1034
1076
  TargetSchema,