@wix/evalforge-types 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -26,6 +26,8 @@ __export(index_exports, {
26
26
  AllowedCommands: () => AllowedCommands,
27
27
  ApiCallSchema: () => ApiCallSchema,
28
28
  AssertionConfigSchema: () => AssertionConfigSchema,
29
+ AssertionParameterSchema: () => AssertionParameterSchema,
30
+ AssertionParameterTypeSchema: () => AssertionParameterTypeSchema,
29
31
  AssertionResultSchema: () => AssertionResultSchema,
30
32
  AssertionResultStatus: () => AssertionResultStatus,
31
33
  AssertionSchema: () => AssertionSchema,
@@ -92,6 +94,9 @@ __export(index_exports, {
92
94
  ProjectSchema: () => ProjectSchema,
93
95
  PromptResultSchema: () => PromptResultSchema,
94
96
  SKILL_FOLDER_NAME_REGEX: () => SKILL_FOLDER_NAME_REGEX,
97
+ SYSTEM_ASSERTIONS: () => SYSTEM_ASSERTIONS,
98
+ SYSTEM_ASSERTION_IDS: () => SYSTEM_ASSERTION_IDS,
99
+ ScenarioAssertionLinkSchema: () => ScenarioAssertionLinkSchema,
95
100
  SiteConfigTestSchema: () => SiteConfigTestSchema,
96
101
  SkillMetadataSchema: () => SkillMetadataSchema,
97
102
  SkillSchema: () => SkillSchema,
@@ -130,6 +135,9 @@ __export(index_exports, {
130
135
  getBuildPassedConfig: () => getBuildPassedConfig,
131
136
  getLlmJudgeConfig: () => getLlmJudgeConfig,
132
137
  getSkillWasCalledConfig: () => getSkillWasCalledConfig,
138
+ getSystemAssertion: () => getSystemAssertion,
139
+ getSystemAssertions: () => getSystemAssertions,
140
+ isSystemAssertionId: () => isSystemAssertionId,
133
141
  isValidSkillFolderName: () => isValidSkillFolderName,
134
142
  parseTraceEventLine: () => parseTraceEventLine,
135
143
  validateAssertionConfig: () => validateAssertionConfig
@@ -592,22 +600,147 @@ var EnvironmentSchema = import_zod19.z.object({
592
600
  });
593
601
 
594
602
  // src/scenario/test-scenario.ts
603
+ var import_zod21 = require("zod");
604
+
605
+ // src/assertion/assertion.ts
595
606
  var import_zod20 = require("zod");
596
- var ExpectedFileSchema = import_zod20.z.object({
607
+ var AssertionTypeSchema = import_zod20.z.enum([
608
+ "skill_was_called",
609
+ "build_passed",
610
+ "llm_judge"
611
+ ]);
612
+ var AssertionParameterTypeSchema = import_zod20.z.enum([
613
+ "string",
614
+ "number",
615
+ "boolean"
616
+ ]);
617
+ var AssertionParameterSchema = import_zod20.z.object({
618
+ /** Parameter name (used as key in params object) */
619
+ name: import_zod20.z.string().min(1),
620
+ /** Display label for the parameter */
621
+ label: import_zod20.z.string().min(1),
622
+ /** Parameter type */
623
+ type: AssertionParameterTypeSchema,
624
+ /** Whether this parameter is required */
625
+ required: import_zod20.z.boolean(),
626
+ /** Default value (optional, used when not provided) */
627
+ defaultValue: import_zod20.z.union([import_zod20.z.string(), import_zod20.z.number(), import_zod20.z.boolean()]).optional(),
628
+ /** If true, parameter is hidden by default behind "Show advanced options" */
629
+ advanced: import_zod20.z.boolean().optional()
630
+ });
631
+ var ScenarioAssertionLinkSchema = import_zod20.z.object({
632
+ /** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
633
+ assertionId: import_zod20.z.string(),
634
+ /** Parameter values for this assertion in this scenario */
635
+ params: import_zod20.z.record(
636
+ import_zod20.z.string(),
637
+ import_zod20.z.union([import_zod20.z.string(), import_zod20.z.number(), import_zod20.z.boolean(), import_zod20.z.null()])
638
+ ).optional()
639
+ });
640
+ var SkillWasCalledConfigSchema = import_zod20.z.object({
641
+ /** Name of the skill that must have been called */
642
+ skillName: import_zod20.z.string().min(1)
643
+ });
644
+ var BuildPassedConfigSchema = import_zod20.z.strictObject({
645
+ /** Command to run (default: "yarn build") */
646
+ command: import_zod20.z.string().optional(),
647
+ /** Expected exit code (default: 0) */
648
+ expectedExitCode: import_zod20.z.number().int().optional()
649
+ });
650
+ var LlmJudgeConfigSchema = import_zod20.z.object({
651
+ /**
652
+ * Prompt template with placeholders:
653
+ * - {{output}}: agent's final output
654
+ * - {{cwd}}: working directory
655
+ * - {{changedFiles}}: all files changed (new, modified)
656
+ * - {{modifiedFiles}}: only existing files that were modified
657
+ * - {{newFiles}}: only new files that were created
658
+ * - {{trace}}: step-by-step trace of tool calls
659
+ * - Custom parameters defined in the parameters array
660
+ */
661
+ prompt: import_zod20.z.string().min(1),
662
+ /** Optional system prompt for the judge */
663
+ systemPrompt: import_zod20.z.string().optional(),
664
+ /** Minimum score to pass (0-100, default 70) */
665
+ minScore: import_zod20.z.number().int().min(0).max(100).optional(),
666
+ /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
667
+ model: import_zod20.z.string().optional(),
668
+ /** Max output tokens */
669
+ maxTokens: import_zod20.z.number().int().optional(),
670
+ /** Temperature (0-1) */
671
+ temperature: import_zod20.z.number().min(0).max(1).optional(),
672
+ /** User-defined parameters for this assertion */
673
+ parameters: import_zod20.z.array(AssertionParameterSchema).optional()
674
+ });
675
+ var AssertionConfigSchema = import_zod20.z.union([
676
+ LlmJudgeConfigSchema,
677
+ // requires prompt - check first
678
+ SkillWasCalledConfigSchema,
679
+ // requires skillName
680
+ BuildPassedConfigSchema,
681
+ // all optional, uses strictObject to reject unknown keys
682
+ import_zod20.z.object({})
683
+ // fallback empty config
684
+ ]);
685
+ var CustomAssertionSchema = TenantEntitySchema.extend({
686
+ /** The assertion type */
687
+ type: AssertionTypeSchema,
688
+ /** Type-specific configuration */
689
+ config: AssertionConfigSchema
690
+ });
691
+ var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
692
+ id: true,
693
+ createdAt: true,
694
+ updatedAt: true,
695
+ deleted: true
696
+ });
697
+ var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
698
+ function validateAssertionConfig(type, config) {
699
+ switch (type) {
700
+ case "skill_was_called":
701
+ return SkillWasCalledConfigSchema.safeParse(config).success;
702
+ case "build_passed":
703
+ return BuildPassedConfigSchema.safeParse(config).success;
704
+ case "llm_judge":
705
+ return LlmJudgeConfigSchema.safeParse(config).success;
706
+ default:
707
+ return false;
708
+ }
709
+ }
710
+ function getSkillWasCalledConfig(assertion) {
711
+ if (assertion.type !== "skill_was_called") return null;
712
+ const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
713
+ return result.success ? result.data : null;
714
+ }
715
+ function getBuildPassedConfig(assertion) {
716
+ if (assertion.type !== "build_passed") return null;
717
+ const result = BuildPassedConfigSchema.safeParse(assertion.config);
718
+ return result.success ? result.data : null;
719
+ }
720
+ function getLlmJudgeConfig(assertion) {
721
+ if (assertion.type !== "llm_judge") return null;
722
+ const result = LlmJudgeConfigSchema.safeParse(assertion.config);
723
+ return result.success ? result.data : null;
724
+ }
725
+
726
+ // src/scenario/test-scenario.ts
727
+ var ExpectedFileSchema = import_zod21.z.object({
597
728
  /** Relative path where the file should be created */
598
- path: import_zod20.z.string(),
729
+ path: import_zod21.z.string(),
599
730
  /** Optional expected content */
600
- content: import_zod20.z.string().optional()
731
+ content: import_zod21.z.string().optional()
601
732
  });
602
733
  var TestScenarioSchema = TenantEntitySchema.extend({
603
734
  /** The prompt sent to the agent to trigger the task */
604
- triggerPrompt: import_zod20.z.string().min(10),
735
+ triggerPrompt: import_zod21.z.string().min(10),
605
736
  /** ID of the template to use for this scenario (null = no template) */
606
- templateId: import_zod20.z.string().nullish(),
737
+ templateId: import_zod21.z.string().nullish(),
607
738
  /** Inline assertions to evaluate for this scenario (legacy) */
608
- assertions: import_zod20.z.array(AssertionSchema).optional(),
609
- /** IDs of saved assertions to evaluate (from assertions table) */
610
- assertionIds: import_zod20.z.array(import_zod20.z.string()).optional()
739
+ assertions: import_zod21.z.array(AssertionSchema).optional(),
740
+ /** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
741
+ assertionIds: import_zod21.z.array(import_zod21.z.string()).optional(),
742
+ /** Linked assertions with per-scenario parameter values */
743
+ assertionLinks: import_zod21.z.array(ScenarioAssertionLinkSchema).optional()
611
744
  });
612
745
  var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
613
746
  id: true,
@@ -618,10 +751,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
618
751
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
619
752
 
620
753
  // src/suite/test-suite.ts
621
- var import_zod21 = require("zod");
754
+ var import_zod22 = require("zod");
622
755
  var TestSuiteSchema = TenantEntitySchema.extend({
623
756
  /** IDs of test scenarios in this suite */
624
- scenarioIds: import_zod21.z.array(import_zod21.z.string())
757
+ scenarioIds: import_zod22.z.array(import_zod22.z.string())
625
758
  });
626
759
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
627
760
  id: true,
@@ -632,21 +765,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
632
765
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
633
766
 
634
767
  // src/evaluation/metrics.ts
635
- var import_zod22 = require("zod");
636
- var TokenUsageSchema = import_zod22.z.object({
637
- prompt: import_zod22.z.number(),
638
- completion: import_zod22.z.number(),
639
- total: import_zod22.z.number()
640
- });
641
- var EvalMetricsSchema = import_zod22.z.object({
642
- totalAssertions: import_zod22.z.number(),
643
- passed: import_zod22.z.number(),
644
- failed: import_zod22.z.number(),
645
- skipped: import_zod22.z.number(),
646
- errors: import_zod22.z.number(),
647
- passRate: import_zod22.z.number(),
648
- avgDuration: import_zod22.z.number(),
649
- totalDuration: import_zod22.z.number()
768
+ var import_zod23 = require("zod");
769
+ var TokenUsageSchema = import_zod23.z.object({
770
+ prompt: import_zod23.z.number(),
771
+ completion: import_zod23.z.number(),
772
+ total: import_zod23.z.number()
773
+ });
774
+ var EvalMetricsSchema = import_zod23.z.object({
775
+ totalAssertions: import_zod23.z.number(),
776
+ passed: import_zod23.z.number(),
777
+ failed: import_zod23.z.number(),
778
+ skipped: import_zod23.z.number(),
779
+ errors: import_zod23.z.number(),
780
+ passRate: import_zod23.z.number(),
781
+ avgDuration: import_zod23.z.number(),
782
+ totalDuration: import_zod23.z.number()
650
783
  });
651
784
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
652
785
  EvalStatus2["PENDING"] = "pending";
@@ -656,7 +789,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
656
789
  EvalStatus2["CANCELLED"] = "cancelled";
657
790
  return EvalStatus2;
658
791
  })(EvalStatus || {});
659
- var EvalStatusSchema = import_zod22.z.enum(EvalStatus);
792
+ var EvalStatusSchema = import_zod23.z.enum(EvalStatus);
660
793
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
661
794
  LLMStepType2["COMPLETION"] = "completion";
662
795
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -664,52 +797,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
664
797
  LLMStepType2["THINKING"] = "thinking";
665
798
  return LLMStepType2;
666
799
  })(LLMStepType || {});
667
- var LLMTraceStepSchema = import_zod22.z.object({
668
- id: import_zod22.z.string(),
669
- stepNumber: import_zod22.z.number(),
670
- type: import_zod22.z.enum(LLMStepType),
671
- model: import_zod22.z.string(),
672
- provider: import_zod22.z.string(),
673
- startedAt: import_zod22.z.string(),
674
- durationMs: import_zod22.z.number(),
800
+ var LLMTraceStepSchema = import_zod23.z.object({
801
+ id: import_zod23.z.string(),
802
+ stepNumber: import_zod23.z.number(),
803
+ type: import_zod23.z.enum(LLMStepType),
804
+ model: import_zod23.z.string(),
805
+ provider: import_zod23.z.string(),
806
+ startedAt: import_zod23.z.string(),
807
+ durationMs: import_zod23.z.number(),
675
808
  tokenUsage: TokenUsageSchema,
676
- costUsd: import_zod22.z.number(),
677
- toolName: import_zod22.z.string().optional(),
678
- toolArguments: import_zod22.z.string().optional(),
679
- inputPreview: import_zod22.z.string().optional(),
680
- outputPreview: import_zod22.z.string().optional(),
681
- success: import_zod22.z.boolean(),
682
- error: import_zod22.z.string().optional()
683
- });
684
- var LLMBreakdownStatsSchema = import_zod22.z.object({
685
- count: import_zod22.z.number(),
686
- durationMs: import_zod22.z.number(),
687
- tokens: import_zod22.z.number(),
688
- costUsd: import_zod22.z.number()
689
- });
690
- var LLMTraceSummarySchema = import_zod22.z.object({
691
- totalSteps: import_zod22.z.number(),
692
- totalDurationMs: import_zod22.z.number(),
809
+ costUsd: import_zod23.z.number(),
810
+ toolName: import_zod23.z.string().optional(),
811
+ toolArguments: import_zod23.z.string().optional(),
812
+ inputPreview: import_zod23.z.string().optional(),
813
+ outputPreview: import_zod23.z.string().optional(),
814
+ success: import_zod23.z.boolean(),
815
+ error: import_zod23.z.string().optional()
816
+ });
817
+ var LLMBreakdownStatsSchema = import_zod23.z.object({
818
+ count: import_zod23.z.number(),
819
+ durationMs: import_zod23.z.number(),
820
+ tokens: import_zod23.z.number(),
821
+ costUsd: import_zod23.z.number()
822
+ });
823
+ var LLMTraceSummarySchema = import_zod23.z.object({
824
+ totalSteps: import_zod23.z.number(),
825
+ totalDurationMs: import_zod23.z.number(),
693
826
  totalTokens: TokenUsageSchema,
694
- totalCostUsd: import_zod22.z.number(),
695
- stepTypeBreakdown: import_zod22.z.record(import_zod22.z.string(), LLMBreakdownStatsSchema).optional(),
696
- modelBreakdown: import_zod22.z.record(import_zod22.z.string(), LLMBreakdownStatsSchema),
697
- modelsUsed: import_zod22.z.array(import_zod22.z.string())
698
- });
699
- var LLMTraceSchema = import_zod22.z.object({
700
- id: import_zod22.z.string(),
701
- steps: import_zod22.z.array(LLMTraceStepSchema),
827
+ totalCostUsd: import_zod23.z.number(),
828
+ stepTypeBreakdown: import_zod23.z.record(import_zod23.z.string(), LLMBreakdownStatsSchema).optional(),
829
+ modelBreakdown: import_zod23.z.record(import_zod23.z.string(), LLMBreakdownStatsSchema),
830
+ modelsUsed: import_zod23.z.array(import_zod23.z.string())
831
+ });
832
+ var LLMTraceSchema = import_zod23.z.object({
833
+ id: import_zod23.z.string(),
834
+ steps: import_zod23.z.array(LLMTraceStepSchema),
702
835
  summary: LLMTraceSummarySchema
703
836
  });
704
837
 
705
838
  // src/evaluation/eval-result.ts
706
- var import_zod25 = require("zod");
839
+ var import_zod26 = require("zod");
707
840
 
708
841
  // src/evaluation/eval-run.ts
709
- var import_zod24 = require("zod");
842
+ var import_zod25 = require("zod");
710
843
 
711
844
  // src/evaluation/live-trace.ts
712
- var import_zod23 = require("zod");
845
+ var import_zod24 = require("zod");
713
846
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
714
847
  LiveTraceEventType2["THINKING"] = "thinking";
715
848
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -723,37 +856,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
723
856
  LiveTraceEventType2["USER"] = "user";
724
857
  return LiveTraceEventType2;
725
858
  })(LiveTraceEventType || {});
726
- var LiveTraceEventSchema = import_zod23.z.object({
859
+ var LiveTraceEventSchema = import_zod24.z.object({
727
860
  /** The evaluation run ID */
728
- evalRunId: import_zod23.z.string(),
861
+ evalRunId: import_zod24.z.string(),
729
862
  /** The scenario ID being executed */
730
- scenarioId: import_zod23.z.string(),
863
+ scenarioId: import_zod24.z.string(),
731
864
  /** The scenario name for display */
732
- scenarioName: import_zod23.z.string(),
865
+ scenarioName: import_zod24.z.string(),
733
866
  /** The target ID (skill, agent, etc.) */
734
- targetId: import_zod23.z.string(),
867
+ targetId: import_zod24.z.string(),
735
868
  /** The target name for display */
736
- targetName: import_zod23.z.string(),
869
+ targetName: import_zod24.z.string(),
737
870
  /** Step number in the current scenario execution */
738
- stepNumber: import_zod23.z.number(),
871
+ stepNumber: import_zod24.z.number(),
739
872
  /** Type of trace event */
740
- type: import_zod23.z.enum(LiveTraceEventType),
873
+ type: import_zod24.z.enum(LiveTraceEventType),
741
874
  /** Tool name if this is a tool_use event */
742
- toolName: import_zod23.z.string().optional(),
875
+ toolName: import_zod24.z.string().optional(),
743
876
  /** Tool arguments preview (truncated JSON) */
744
- toolArgs: import_zod23.z.string().optional(),
877
+ toolArgs: import_zod24.z.string().optional(),
745
878
  /** Output preview (truncated text) */
746
- outputPreview: import_zod23.z.string().optional(),
879
+ outputPreview: import_zod24.z.string().optional(),
747
880
  /** File path for file operations */
748
- filePath: import_zod23.z.string().optional(),
881
+ filePath: import_zod24.z.string().optional(),
749
882
  /** Elapsed time in milliseconds for progress events */
750
- elapsedMs: import_zod23.z.number().optional(),
883
+ elapsedMs: import_zod24.z.number().optional(),
751
884
  /** Thinking/reasoning text from Claude */
752
- thinking: import_zod23.z.string().optional(),
885
+ thinking: import_zod24.z.string().optional(),
753
886
  /** Timestamp when this event occurred */
754
- timestamp: import_zod23.z.string(),
887
+ timestamp: import_zod24.z.string(),
755
888
  /** Whether this is the final event for this scenario */
756
- isComplete: import_zod23.z.boolean()
889
+ isComplete: import_zod24.z.boolean()
757
890
  });
758
891
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
759
892
  function parseTraceEventLine(line) {
@@ -781,14 +914,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
781
914
  TriggerType2["MANUAL"] = "MANUAL";
782
915
  return TriggerType2;
783
916
  })(TriggerType || {});
784
- var TriggerMetadataSchema = import_zod24.z.object({
785
- version: import_zod24.z.string().optional(),
786
- resourceUpdated: import_zod24.z.array(import_zod24.z.string()).optional()
917
+ var TriggerMetadataSchema = import_zod25.z.object({
918
+ version: import_zod25.z.string().optional(),
919
+ resourceUpdated: import_zod25.z.array(import_zod25.z.string()).optional()
787
920
  });
788
- var TriggerSchema = import_zod24.z.object({
789
- id: import_zod24.z.string(),
921
+ var TriggerSchema = import_zod25.z.object({
922
+ id: import_zod25.z.string(),
790
923
  metadata: TriggerMetadataSchema.optional(),
791
- type: import_zod24.z.enum(TriggerType)
924
+ type: import_zod25.z.enum(TriggerType)
792
925
  });
793
926
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
794
927
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -806,28 +939,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
806
939
  FailureSeverity2["LOW"] = "low";
807
940
  return FailureSeverity2;
808
941
  })(FailureSeverity || {});
809
- var DiffLineTypeSchema = import_zod24.z.enum(["added", "removed", "unchanged"]);
810
- var DiffLineSchema = import_zod24.z.object({
942
+ var DiffLineTypeSchema = import_zod25.z.enum(["added", "removed", "unchanged"]);
943
+ var DiffLineSchema = import_zod25.z.object({
811
944
  type: DiffLineTypeSchema,
812
- content: import_zod24.z.string(),
813
- lineNumber: import_zod24.z.number()
814
- });
815
- var DiffContentSchema = import_zod24.z.object({
816
- path: import_zod24.z.string(),
817
- expected: import_zod24.z.string(),
818
- actual: import_zod24.z.string(),
819
- diffLines: import_zod24.z.array(DiffLineSchema),
820
- renamedFrom: import_zod24.z.string().optional()
821
- });
822
- var CommandExecutionSchema = import_zod24.z.object({
823
- command: import_zod24.z.string(),
824
- exitCode: import_zod24.z.number(),
825
- output: import_zod24.z.string().optional(),
826
- duration: import_zod24.z.number()
827
- });
828
- var FileModificationSchema = import_zod24.z.object({
829
- path: import_zod24.z.string(),
830
- action: import_zod24.z.enum(["created", "modified", "deleted"])
945
+ content: import_zod25.z.string(),
946
+ lineNumber: import_zod25.z.number()
947
+ });
948
+ var DiffContentSchema = import_zod25.z.object({
949
+ path: import_zod25.z.string(),
950
+ expected: import_zod25.z.string(),
951
+ actual: import_zod25.z.string(),
952
+ diffLines: import_zod25.z.array(DiffLineSchema),
953
+ renamedFrom: import_zod25.z.string().optional()
954
+ });
955
+ var CommandExecutionSchema = import_zod25.z.object({
956
+ command: import_zod25.z.string(),
957
+ exitCode: import_zod25.z.number(),
958
+ output: import_zod25.z.string().optional(),
959
+ duration: import_zod25.z.number()
960
+ });
961
+ var FileModificationSchema = import_zod25.z.object({
962
+ path: import_zod25.z.string(),
963
+ action: import_zod25.z.enum(["created", "modified", "deleted"])
831
964
  });
832
965
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
833
966
  TemplateFileStatus2["NEW"] = "new";
@@ -835,75 +968,75 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
835
968
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
836
969
  return TemplateFileStatus2;
837
970
  })(TemplateFileStatus || {});
838
- var TemplateFileSchema = import_zod24.z.object({
971
+ var TemplateFileSchema = import_zod25.z.object({
839
972
  /** Relative path within the template */
840
- path: import_zod24.z.string(),
973
+ path: import_zod25.z.string(),
841
974
  /** Full file content after execution */
842
- content: import_zod24.z.string(),
975
+ content: import_zod25.z.string(),
843
976
  /** File status (new, modified, unchanged) */
844
- status: import_zod24.z.enum(["new", "modified", "unchanged"])
845
- });
846
- var ApiCallSchema = import_zod24.z.object({
847
- endpoint: import_zod24.z.string(),
848
- tokensUsed: import_zod24.z.number(),
849
- duration: import_zod24.z.number()
850
- });
851
- var ExecutionTraceSchema = import_zod24.z.object({
852
- commands: import_zod24.z.array(CommandExecutionSchema),
853
- filesModified: import_zod24.z.array(FileModificationSchema),
854
- apiCalls: import_zod24.z.array(ApiCallSchema),
855
- totalDuration: import_zod24.z.number()
856
- });
857
- var FailureAnalysisSchema = import_zod24.z.object({
858
- category: import_zod24.z.enum(FailureCategory),
859
- severity: import_zod24.z.enum(FailureSeverity),
860
- summary: import_zod24.z.string(),
861
- details: import_zod24.z.string(),
862
- rootCause: import_zod24.z.string(),
863
- suggestedFix: import_zod24.z.string(),
864
- relatedAssertions: import_zod24.z.array(import_zod24.z.string()),
865
- codeSnippet: import_zod24.z.string().optional(),
866
- similarIssues: import_zod24.z.array(import_zod24.z.string()).optional(),
867
- patternId: import_zod24.z.string().optional(),
977
+ status: import_zod25.z.enum(["new", "modified", "unchanged"])
978
+ });
979
+ var ApiCallSchema = import_zod25.z.object({
980
+ endpoint: import_zod25.z.string(),
981
+ tokensUsed: import_zod25.z.number(),
982
+ duration: import_zod25.z.number()
983
+ });
984
+ var ExecutionTraceSchema = import_zod25.z.object({
985
+ commands: import_zod25.z.array(CommandExecutionSchema),
986
+ filesModified: import_zod25.z.array(FileModificationSchema),
987
+ apiCalls: import_zod25.z.array(ApiCallSchema),
988
+ totalDuration: import_zod25.z.number()
989
+ });
990
+ var FailureAnalysisSchema = import_zod25.z.object({
991
+ category: import_zod25.z.enum(FailureCategory),
992
+ severity: import_zod25.z.enum(FailureSeverity),
993
+ summary: import_zod25.z.string(),
994
+ details: import_zod25.z.string(),
995
+ rootCause: import_zod25.z.string(),
996
+ suggestedFix: import_zod25.z.string(),
997
+ relatedAssertions: import_zod25.z.array(import_zod25.z.string()),
998
+ codeSnippet: import_zod25.z.string().optional(),
999
+ similarIssues: import_zod25.z.array(import_zod25.z.string()).optional(),
1000
+ patternId: import_zod25.z.string().optional(),
868
1001
  // Extended fields for detailed debugging
869
1002
  diff: DiffContentSchema.optional(),
870
1003
  executionTrace: ExecutionTraceSchema.optional()
871
1004
  });
872
1005
  var EvalRunSchema = TenantEntitySchema.extend({
873
1006
  /** Agent ID for this run */
874
- agentId: import_zod24.z.string().optional(),
1007
+ agentId: import_zod25.z.string().optional(),
875
1008
  /** Skills group ID for this run */
876
- skillsGroupId: import_zod24.z.string().optional(),
1009
+ skillsGroupId: import_zod25.z.string().optional(),
877
1010
  /** Scenario IDs to run */
878
- scenarioIds: import_zod24.z.array(import_zod24.z.string()),
1011
+ scenarioIds: import_zod25.z.array(import_zod25.z.string()),
879
1012
  /** Current status */
880
1013
  status: EvalStatusSchema,
881
1014
  /** Progress percentage (0-100) */
882
- progress: import_zod24.z.number(),
1015
+ progress: import_zod25.z.number(),
883
1016
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
884
- results: import_zod24.z.array(import_zod24.z.lazy(() => EvalRunResultSchema)),
1017
+ results: import_zod25.z.array(import_zod25.z.lazy(() => EvalRunResultSchema)),
885
1018
  /** Aggregated metrics across all results */
886
1019
  aggregateMetrics: EvalMetricsSchema,
887
1020
  /** Failure analyses */
888
- failureAnalyses: import_zod24.z.array(FailureAnalysisSchema).optional(),
1021
+ failureAnalyses: import_zod25.z.array(FailureAnalysisSchema).optional(),
889
1022
  /** Aggregated LLM trace summary */
890
1023
  llmTraceSummary: LLMTraceSummarySchema.optional(),
891
1024
  /** What triggered this run */
892
1025
  trigger: TriggerSchema.optional(),
893
1026
  /** When the run started (set when evaluation is triggered) */
894
- startedAt: import_zod24.z.string().optional(),
1027
+ startedAt: import_zod25.z.string().optional(),
895
1028
  /** When the run completed */
896
- completedAt: import_zod24.z.string().optional(),
1029
+ completedAt: import_zod25.z.string().optional(),
897
1030
  /** Live trace events captured during execution (for playback on results page) */
898
- liveTraceEvents: import_zod24.z.array(LiveTraceEventSchema).optional(),
1031
+ liveTraceEvents: import_zod25.z.array(LiveTraceEventSchema).optional(),
899
1032
  /** Remote job ID for tracking execution in Dev Machines */
900
- jobId: import_zod24.z.string().optional(),
1033
+ jobId: import_zod25.z.string().optional(),
901
1034
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
902
- jobStatus: import_zod24.z.string().optional(),
1035
+ jobStatus: import_zod25.z.string().optional(),
903
1036
  /** Remote job error message if the job failed */
904
- jobError: import_zod24.z.string().optional(),
1037
+ jobError: import_zod25.z.string().optional(),
905
1038
  /** Timestamp of the last job status check */
906
- jobStatusCheckedAt: import_zod24.z.string().optional()
1039
+ jobStatusCheckedAt: import_zod25.z.string().optional()
907
1040
  });
908
1041
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
909
1042
  id: true,
@@ -916,28 +1049,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
916
1049
  startedAt: true,
917
1050
  completedAt: true
918
1051
  });
919
- var EvaluationProgressSchema = import_zod24.z.object({
920
- runId: import_zod24.z.string(),
921
- targetId: import_zod24.z.string(),
922
- totalScenarios: import_zod24.z.number(),
923
- completedScenarios: import_zod24.z.number(),
924
- scenarioProgress: import_zod24.z.array(
925
- import_zod24.z.object({
926
- scenarioId: import_zod24.z.string(),
927
- currentStep: import_zod24.z.string(),
928
- error: import_zod24.z.string().optional()
1052
+ var EvaluationProgressSchema = import_zod25.z.object({
1053
+ runId: import_zod25.z.string(),
1054
+ targetId: import_zod25.z.string(),
1055
+ totalScenarios: import_zod25.z.number(),
1056
+ completedScenarios: import_zod25.z.number(),
1057
+ scenarioProgress: import_zod25.z.array(
1058
+ import_zod25.z.object({
1059
+ scenarioId: import_zod25.z.string(),
1060
+ currentStep: import_zod25.z.string(),
1061
+ error: import_zod25.z.string().optional()
929
1062
  })
930
1063
  ),
931
- createdAt: import_zod24.z.number()
1064
+ createdAt: import_zod25.z.number()
932
1065
  });
933
- var EvaluationLogSchema = import_zod24.z.object({
934
- runId: import_zod24.z.string(),
935
- scenarioId: import_zod24.z.string(),
936
- log: import_zod24.z.object({
937
- level: import_zod24.z.enum(["info", "error", "debug"]),
938
- message: import_zod24.z.string().optional(),
939
- args: import_zod24.z.array(import_zod24.z.any()).optional(),
940
- error: import_zod24.z.string().optional()
1066
+ var EvaluationLogSchema = import_zod25.z.object({
1067
+ runId: import_zod25.z.string(),
1068
+ scenarioId: import_zod25.z.string(),
1069
+ log: import_zod25.z.object({
1070
+ level: import_zod25.z.enum(["info", "error", "debug"]),
1071
+ message: import_zod25.z.string().optional(),
1072
+ args: import_zod25.z.array(import_zod25.z.any()).optional(),
1073
+ error: import_zod25.z.string().optional()
941
1074
  })
942
1075
  });
943
1076
  var LLM_TIMEOUT = 12e4;
@@ -950,91 +1083,91 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
950
1083
  AssertionResultStatus2["ERROR"] = "error";
951
1084
  return AssertionResultStatus2;
952
1085
  })(AssertionResultStatus || {});
953
- var AssertionResultSchema = import_zod25.z.object({
954
- id: import_zod25.z.string(),
955
- assertionId: import_zod25.z.string(),
956
- assertionType: import_zod25.z.string(),
957
- assertionName: import_zod25.z.string(),
958
- status: import_zod25.z.enum(AssertionResultStatus),
959
- message: import_zod25.z.string().optional(),
960
- expected: import_zod25.z.string().optional(),
961
- actual: import_zod25.z.string().optional(),
962
- duration: import_zod25.z.number().optional(),
963
- details: import_zod25.z.record(import_zod25.z.string(), import_zod25.z.unknown()).optional(),
964
- llmTraceSteps: import_zod25.z.array(LLMTraceStepSchema).optional()
965
- });
966
- var EvalRunResultSchema = import_zod25.z.object({
967
- id: import_zod25.z.string(),
968
- targetId: import_zod25.z.string(),
969
- targetName: import_zod25.z.string().optional(),
970
- scenarioId: import_zod25.z.string(),
971
- scenarioName: import_zod25.z.string(),
1086
+ var AssertionResultSchema = import_zod26.z.object({
1087
+ id: import_zod26.z.string(),
1088
+ assertionId: import_zod26.z.string(),
1089
+ assertionType: import_zod26.z.string(),
1090
+ assertionName: import_zod26.z.string(),
1091
+ status: import_zod26.z.enum(AssertionResultStatus),
1092
+ message: import_zod26.z.string().optional(),
1093
+ expected: import_zod26.z.string().optional(),
1094
+ actual: import_zod26.z.string().optional(),
1095
+ duration: import_zod26.z.number().optional(),
1096
+ details: import_zod26.z.record(import_zod26.z.string(), import_zod26.z.unknown()).optional(),
1097
+ llmTraceSteps: import_zod26.z.array(LLMTraceStepSchema).optional()
1098
+ });
1099
+ var EvalRunResultSchema = import_zod26.z.object({
1100
+ id: import_zod26.z.string(),
1101
+ targetId: import_zod26.z.string(),
1102
+ targetName: import_zod26.z.string().optional(),
1103
+ scenarioId: import_zod26.z.string(),
1104
+ scenarioName: import_zod26.z.string(),
972
1105
  modelConfig: ModelConfigSchema.optional(),
973
- assertionResults: import_zod25.z.array(AssertionResultSchema),
1106
+ assertionResults: import_zod26.z.array(AssertionResultSchema),
974
1107
  metrics: EvalMetricsSchema.optional(),
975
- passed: import_zod25.z.number(),
976
- failed: import_zod25.z.number(),
977
- passRate: import_zod25.z.number(),
978
- duration: import_zod25.z.number(),
979
- outputText: import_zod25.z.string().optional(),
980
- files: import_zod25.z.array(ExpectedFileSchema).optional(),
981
- fileDiffs: import_zod25.z.array(DiffContentSchema).optional(),
1108
+ passed: import_zod26.z.number(),
1109
+ failed: import_zod26.z.number(),
1110
+ passRate: import_zod26.z.number(),
1111
+ duration: import_zod26.z.number(),
1112
+ outputText: import_zod26.z.string().optional(),
1113
+ files: import_zod26.z.array(ExpectedFileSchema).optional(),
1114
+ fileDiffs: import_zod26.z.array(DiffContentSchema).optional(),
982
1115
  /** Full template files after execution with status indicators */
983
- templateFiles: import_zod25.z.array(TemplateFileSchema).optional(),
984
- startedAt: import_zod25.z.string().optional(),
985
- completedAt: import_zod25.z.string().optional(),
1116
+ templateFiles: import_zod26.z.array(TemplateFileSchema).optional(),
1117
+ startedAt: import_zod26.z.string().optional(),
1118
+ completedAt: import_zod26.z.string().optional(),
986
1119
  llmTrace: LLMTraceSchema.optional()
987
1120
  });
988
- var PromptResultSchema = import_zod25.z.object({
989
- text: import_zod25.z.string(),
990
- files: import_zod25.z.array(import_zod25.z.unknown()).optional(),
991
- finishReason: import_zod25.z.string().optional(),
992
- reasoning: import_zod25.z.string().optional(),
993
- reasoningDetails: import_zod25.z.unknown().optional(),
994
- toolCalls: import_zod25.z.array(import_zod25.z.unknown()).optional(),
995
- toolResults: import_zod25.z.array(import_zod25.z.unknown()).optional(),
996
- warnings: import_zod25.z.array(import_zod25.z.unknown()).optional(),
997
- sources: import_zod25.z.array(import_zod25.z.unknown()).optional(),
998
- steps: import_zod25.z.array(import_zod25.z.unknown()),
999
- generationTimeMs: import_zod25.z.number(),
1000
- prompt: import_zod25.z.string(),
1001
- systemPrompt: import_zod25.z.string(),
1002
- usage: import_zod25.z.object({
1003
- totalTokens: import_zod25.z.number().optional(),
1004
- totalMicrocentsSpent: import_zod25.z.number().optional()
1121
+ var PromptResultSchema = import_zod26.z.object({
1122
+ text: import_zod26.z.string(),
1123
+ files: import_zod26.z.array(import_zod26.z.unknown()).optional(),
1124
+ finishReason: import_zod26.z.string().optional(),
1125
+ reasoning: import_zod26.z.string().optional(),
1126
+ reasoningDetails: import_zod26.z.unknown().optional(),
1127
+ toolCalls: import_zod26.z.array(import_zod26.z.unknown()).optional(),
1128
+ toolResults: import_zod26.z.array(import_zod26.z.unknown()).optional(),
1129
+ warnings: import_zod26.z.array(import_zod26.z.unknown()).optional(),
1130
+ sources: import_zod26.z.array(import_zod26.z.unknown()).optional(),
1131
+ steps: import_zod26.z.array(import_zod26.z.unknown()),
1132
+ generationTimeMs: import_zod26.z.number(),
1133
+ prompt: import_zod26.z.string(),
1134
+ systemPrompt: import_zod26.z.string(),
1135
+ usage: import_zod26.z.object({
1136
+ totalTokens: import_zod26.z.number().optional(),
1137
+ totalMicrocentsSpent: import_zod26.z.number().optional()
1005
1138
  })
1006
1139
  });
1007
- var EvaluationResultSchema = import_zod25.z.object({
1008
- id: import_zod25.z.string(),
1009
- runId: import_zod25.z.string(),
1010
- timestamp: import_zod25.z.number(),
1140
+ var EvaluationResultSchema = import_zod26.z.object({
1141
+ id: import_zod26.z.string(),
1142
+ runId: import_zod26.z.string(),
1143
+ timestamp: import_zod26.z.number(),
1011
1144
  promptResult: PromptResultSchema,
1012
- testResults: import_zod25.z.array(import_zod25.z.unknown()),
1013
- tags: import_zod25.z.array(import_zod25.z.string()).optional(),
1014
- feedback: import_zod25.z.string().optional(),
1015
- score: import_zod25.z.number(),
1016
- suiteId: import_zod25.z.string().optional()
1017
- });
1018
- var LeanEvaluationResultSchema = import_zod25.z.object({
1019
- id: import_zod25.z.string(),
1020
- runId: import_zod25.z.string(),
1021
- timestamp: import_zod25.z.number(),
1022
- tags: import_zod25.z.array(import_zod25.z.string()).optional(),
1023
- scenarioId: import_zod25.z.string(),
1024
- scenarioVersion: import_zod25.z.number().optional(),
1025
- targetId: import_zod25.z.string(),
1026
- targetVersion: import_zod25.z.number().optional(),
1027
- suiteId: import_zod25.z.string().optional(),
1028
- score: import_zod25.z.number(),
1029
- time: import_zod25.z.number().optional(),
1030
- microcentsSpent: import_zod25.z.number().optional()
1145
+ testResults: import_zod26.z.array(import_zod26.z.unknown()),
1146
+ tags: import_zod26.z.array(import_zod26.z.string()).optional(),
1147
+ feedback: import_zod26.z.string().optional(),
1148
+ score: import_zod26.z.number(),
1149
+ suiteId: import_zod26.z.string().optional()
1150
+ });
1151
+ var LeanEvaluationResultSchema = import_zod26.z.object({
1152
+ id: import_zod26.z.string(),
1153
+ runId: import_zod26.z.string(),
1154
+ timestamp: import_zod26.z.number(),
1155
+ tags: import_zod26.z.array(import_zod26.z.string()).optional(),
1156
+ scenarioId: import_zod26.z.string(),
1157
+ scenarioVersion: import_zod26.z.number().optional(),
1158
+ targetId: import_zod26.z.string(),
1159
+ targetVersion: import_zod26.z.number().optional(),
1160
+ suiteId: import_zod26.z.string().optional(),
1161
+ score: import_zod26.z.number(),
1162
+ time: import_zod26.z.number().optional(),
1163
+ microcentsSpent: import_zod26.z.number().optional()
1031
1164
  });
1032
1165
 
1033
1166
  // src/project/project.ts
1034
- var import_zod26 = require("zod");
1167
+ var import_zod27 = require("zod");
1035
1168
  var ProjectSchema = BaseEntitySchema.extend({
1036
- appId: import_zod26.z.string().optional().describe("The ID of the app in Dev Center"),
1037
- appSecret: import_zod26.z.string().optional().describe("The secret of the app in Dev Center")
1169
+ appId: import_zod27.z.string().optional().describe("The ID of the app in Dev Center"),
1170
+ appSecret: import_zod27.z.string().optional().describe("The secret of the app in Dev Center")
1038
1171
  });
1039
1172
  var CreateProjectInputSchema = ProjectSchema.omit({
1040
1173
  id: true,
@@ -1045,10 +1178,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
1045
1178
  var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
1046
1179
 
1047
1180
  // src/template/template.ts
1048
- var import_zod27 = require("zod");
1181
+ var import_zod28 = require("zod");
1049
1182
  var TemplateSchema = TenantEntitySchema.extend({
1050
1183
  /** URL to download the template from */
1051
- downloadUrl: import_zod27.z.url()
1184
+ downloadUrl: import_zod28.z.url()
1052
1185
  });
1053
1186
  var CreateTemplateInputSchema = TemplateSchema.omit({
1054
1187
  id: true,
@@ -1058,89 +1191,107 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
1058
1191
  });
1059
1192
  var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1060
1193
 
1061
- // src/assertion/assertion.ts
1062
- var import_zod28 = require("zod");
1063
- var AssertionTypeSchema = import_zod28.z.enum([
1064
- "skill_was_called",
1065
- "build_passed",
1066
- "llm_judge",
1067
- "custom"
1068
- ]);
1069
- var SkillWasCalledConfigSchema = import_zod28.z.object({
1070
- /** Name of the skill that must have been called */
1071
- skillName: import_zod28.z.string().min(1)
1072
- });
1073
- var BuildPassedConfigSchema = import_zod28.z.strictObject({
1074
- /** Command to run (default: "yarn build") */
1075
- command: import_zod28.z.string().optional(),
1076
- /** Expected exit code (default: 0) */
1077
- expectedExitCode: import_zod28.z.number().int().optional()
1078
- });
1079
- var LlmJudgeConfigSchema = import_zod28.z.object({
1080
- /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
1081
- prompt: import_zod28.z.string().min(1),
1082
- /** Optional system prompt for the judge */
1083
- systemPrompt: import_zod28.z.string().optional(),
1084
- /** Minimum score to pass (0-100, default 70) */
1085
- minScore: import_zod28.z.number().int().min(0).max(100).optional(),
1086
- /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
1087
- model: import_zod28.z.string().optional(),
1088
- /** Max output tokens */
1089
- maxTokens: import_zod28.z.number().int().optional(),
1090
- /** Temperature (0-1) */
1091
- temperature: import_zod28.z.number().min(0).max(1).optional()
1092
- });
1093
- var AssertionConfigSchema = import_zod28.z.union([
1094
- LlmJudgeConfigSchema,
1095
- // requires prompt - check first
1096
- SkillWasCalledConfigSchema,
1097
- // requires skillName
1098
- BuildPassedConfigSchema,
1099
- // all optional, uses strictObject to reject unknown keys
1100
- import_zod28.z.object({})
1101
- // fallback empty config
1102
- ]);
1103
- var CustomAssertionSchema = TenantEntitySchema.extend({
1104
- /** The assertion type */
1105
- type: AssertionTypeSchema,
1106
- /** Type-specific configuration */
1107
- config: AssertionConfigSchema
1108
- });
1109
- var CreateCustomAssertionInputSchema = CustomAssertionSchema.omit({
1110
- id: true,
1111
- createdAt: true,
1112
- updatedAt: true,
1113
- deleted: true
1114
- });
1115
- var UpdateCustomAssertionInputSchema = CreateCustomAssertionInputSchema.partial();
1116
- function validateAssertionConfig(type, config) {
1117
- switch (type) {
1118
- case "skill_was_called":
1119
- return SkillWasCalledConfigSchema.safeParse(config).success;
1120
- case "build_passed":
1121
- return BuildPassedConfigSchema.safeParse(config).success;
1122
- case "llm_judge":
1123
- case "custom":
1124
- return LlmJudgeConfigSchema.safeParse(config).success;
1125
- default:
1126
- return false;
1127
- }
1128
- }
1129
- function getSkillWasCalledConfig(assertion) {
1130
- if (assertion.type !== "skill_was_called") return null;
1131
- const result = SkillWasCalledConfigSchema.safeParse(assertion.config);
1132
- return result.success ? result.data : null;
1194
+ // src/assertion/system-assertions.ts
1195
+ var SYSTEM_ASSERTION_IDS = {
1196
+ SKILL_WAS_CALLED: "system:skill_was_called",
1197
+ BUILD_PASSED: "system:build_passed",
1198
+ LLM_JUDGE: "system:llm_judge"
1199
+ };
1200
+ function isSystemAssertionId(id) {
1201
+ return id.startsWith("system:");
1133
1202
  }
1134
- function getBuildPassedConfig(assertion) {
1135
- if (assertion.type !== "build_passed") return null;
1136
- const result = BuildPassedConfigSchema.safeParse(assertion.config);
1137
- return result.success ? result.data : null;
1203
+ var SYSTEM_ASSERTIONS = {
1204
+ [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
1205
+ id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
1206
+ name: "Skill Was Called",
1207
+ description: "Check if a specific skill was invoked during the agent run",
1208
+ type: "skill_was_called",
1209
+ parameters: [
1210
+ {
1211
+ name: "skillName",
1212
+ label: "Skill Name",
1213
+ type: "string",
1214
+ required: true
1215
+ }
1216
+ ]
1217
+ },
1218
+ [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
1219
+ id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
1220
+ name: "Build Passed",
1221
+ description: "Run a build command and verify it exits with expected code",
1222
+ type: "build_passed",
1223
+ parameters: [
1224
+ {
1225
+ name: "command",
1226
+ label: "Build Command",
1227
+ type: "string",
1228
+ required: false,
1229
+ defaultValue: "yarn build"
1230
+ },
1231
+ {
1232
+ name: "expectedExitCode",
1233
+ label: "Expected Exit Code",
1234
+ type: "number",
1235
+ required: false,
1236
+ defaultValue: 0
1237
+ },
1238
+ {
1239
+ name: "maxBuildTime",
1240
+ label: "Max Build Time (ms)",
1241
+ type: "number",
1242
+ required: false,
1243
+ advanced: true
1244
+ },
1245
+ {
1246
+ name: "maxMemory",
1247
+ label: "Max Memory (MB)",
1248
+ type: "number",
1249
+ required: false,
1250
+ advanced: true
1251
+ }
1252
+ ]
1253
+ },
1254
+ [SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
1255
+ id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
1256
+ name: "LLM Judge",
1257
+ description: "LLM evaluates the output and assigns a score (0-100)",
1258
+ type: "llm_judge",
1259
+ parameters: [
1260
+ {
1261
+ name: "prompt",
1262
+ label: "Judge Prompt",
1263
+ type: "string",
1264
+ required: true,
1265
+ defaultValue: "Verify the output meets the acceptance criteria."
1266
+ },
1267
+ {
1268
+ name: "systemPrompt",
1269
+ label: "System Prompt (optional)",
1270
+ type: "string",
1271
+ required: false,
1272
+ defaultValue: `You are judging a scenario run. Use these values:
1273
+ - {{output}}: the agent's final output
1274
+ - {{cwd}}: working directory
1275
+ - {{changedFiles}}: list of files changed (or "No files were changed")
1276
+ - {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
1277
+
1278
+ Judge how well the output meets the acceptance criteria stated in the user prompt.`
1279
+ },
1280
+ {
1281
+ name: "minScore",
1282
+ label: "Minimum Score (0-100)",
1283
+ type: "number",
1284
+ required: false,
1285
+ defaultValue: 70
1286
+ }
1287
+ ]
1288
+ }
1289
+ };
1290
+ function getSystemAssertions() {
1291
+ return Object.values(SYSTEM_ASSERTIONS);
1138
1292
  }
1139
- function getLlmJudgeConfig(assertion) {
1140
- if (assertion.type !== "llm_judge" && assertion.type !== "custom")
1141
- return null;
1142
- const result = LlmJudgeConfigSchema.safeParse(assertion.config);
1143
- return result.success ? result.data : null;
1293
+ function getSystemAssertion(id) {
1294
+ return SYSTEM_ASSERTIONS[id];
1144
1295
  }
1145
1296
  // Annotate the CommonJS export names for ESM import in node:
1146
1297
  0 && (module.exports = {
@@ -1150,6 +1301,8 @@ function getLlmJudgeConfig(assertion) {
1150
1301
  AllowedCommands,
1151
1302
  ApiCallSchema,
1152
1303
  AssertionConfigSchema,
1304
+ AssertionParameterSchema,
1305
+ AssertionParameterTypeSchema,
1153
1306
  AssertionResultSchema,
1154
1307
  AssertionResultStatus,
1155
1308
  AssertionSchema,
@@ -1216,6 +1369,9 @@ function getLlmJudgeConfig(assertion) {
1216
1369
  ProjectSchema,
1217
1370
  PromptResultSchema,
1218
1371
  SKILL_FOLDER_NAME_REGEX,
1372
+ SYSTEM_ASSERTIONS,
1373
+ SYSTEM_ASSERTION_IDS,
1374
+ ScenarioAssertionLinkSchema,
1219
1375
  SiteConfigTestSchema,
1220
1376
  SkillMetadataSchema,
1221
1377
  SkillSchema,
@@ -1254,6 +1410,9 @@ function getLlmJudgeConfig(assertion) {
1254
1410
  getBuildPassedConfig,
1255
1411
  getLlmJudgeConfig,
1256
1412
  getSkillWasCalledConfig,
1413
+ getSystemAssertion,
1414
+ getSystemAssertions,
1415
+ isSystemAssertionId,
1257
1416
  isValidSkillFolderName,
1258
1417
  parseTraceEventLine,
1259
1418
  validateAssertionConfig