@wix/evalforge-types 0.71.0 → 0.73.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -21,6 +21,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
21
21
  var index_exports = {};
22
22
  __export(index_exports, {
23
23
  AGENT_TYPE_LABELS: () => AGENT_TYPE_LABELS,
24
+ ALLOWED_BUILD_COMMANDS: () => ALLOWED_BUILD_COMMANDS,
24
25
  ALL_AVAILABLE_MODEL_IDS: () => ALL_AVAILABLE_MODEL_IDS,
25
26
  AVAILABLE_CLAUDE_MODEL_IDS: () => AVAILABLE_CLAUDE_MODEL_IDS,
26
27
  AVAILABLE_OPENAI_MODEL_IDS: () => AVAILABLE_OPENAI_MODEL_IDS,
@@ -54,6 +55,7 @@ __export(index_exports, {
54
55
  BatchSummarySchema: () => BatchSummarySchema,
55
56
  BuildCheckTestSchema: () => BuildCheckTestSchema,
56
57
  BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
58
+ BuildPassedCommandStringSchema: () => BuildPassedCommandStringSchema,
57
59
  BuildPassedConfigSchema: () => BuildPassedConfigSchema,
58
60
  BulkImportResultItemSchema: () => BulkImportResultItemSchema,
59
61
  BulkImportResultSchema: () => BulkImportResultSchema,
@@ -81,6 +83,7 @@ __export(index_exports, {
81
83
  CreateTemplateInputSchema: () => CreateTemplateInputSchema,
82
84
  CreateTestScenarioInputSchema: () => CreateTestScenarioInputSchema,
83
85
  CreateTestSuiteInputSchema: () => CreateTestSuiteInputSchema,
86
+ DEFAULT_BUILD_PASSED_COMMAND: () => DEFAULT_BUILD_PASSED_COMMAND,
84
87
  DEFAULT_EVALUATOR_SYSTEM_PROMPT: () => DEFAULT_EVALUATOR_SYSTEM_PROMPT,
85
88
  DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
86
89
  DiffContentSchema: () => DiffContentSchema,
@@ -178,6 +181,7 @@ __export(index_exports, {
178
181
  ToolTestSchema: () => ToolTestSchema,
179
182
  ToolUseBlockSchema: () => ToolUseBlockSchema,
180
183
  TriggerMetadataSchema: () => TriggerMetadataSchema,
184
+ TriggerPromptImageSchema: () => TriggerPromptImageSchema,
181
185
  TriggerSchema: () => TriggerSchema,
182
186
  TriggerType: () => TriggerType,
183
187
  UpdateAgentInputSchema: () => UpdateAgentInputSchema,
@@ -197,12 +201,15 @@ __export(index_exports, {
197
201
  formatTraceEventLine: () => formatTraceEventLine,
198
202
  getSystemAssertion: () => getSystemAssertion,
199
203
  getSystemAssertions: () => getSystemAssertions,
204
+ isAllowedBuildCommandString: () => isAllowedBuildCommandString,
200
205
  isSystemAssertionId: () => isSystemAssertionId,
201
206
  isValidSkillFolderName: () => isValidSkillFolderName,
202
207
  normalizeBatchAssertionLink: () => normalizeBatchAssertionLink,
203
208
  normalizeModelId: () => normalizeModelId,
209
+ parseBuildCommandToArgv: () => parseBuildCommandToArgv,
204
210
  parseTraceEventLine: () => parseTraceEventLine,
205
- validateAssertionConfig: () => validateAssertionConfig
211
+ validateAssertionConfig: () => validateAssertionConfig,
212
+ validateBuildPassedParamsInAssertionLinks: () => validateBuildPassedParamsInAssertionLinks
206
213
  });
207
214
  module.exports = __toCommonJS(index_exports);
208
215
 
@@ -792,11 +799,42 @@ var EnvironmentSchema = import_zod21.z.object({
792
799
  });
793
800
 
794
801
  // src/scenario/test-scenario.ts
795
- var import_zod23 = require("zod");
802
+ var import_zod24 = require("zod");
796
803
 
797
804
  // src/assertion/assertion.ts
805
+ var import_zod23 = require("zod");
806
+
807
+ // src/assertion/build-passed-command.ts
798
808
  var import_zod22 = require("zod");
799
- var AssertionTypeSchema = import_zod22.z.enum([
809
+ var ALLOWED_BUILD_COMMANDS = [
810
+ "yarn build",
811
+ "npm run build",
812
+ "pnpm run build",
813
+ "pnpm build"
814
+ ];
815
+ var DEFAULT_BUILD_PASSED_COMMAND = "yarn build";
816
+ var BUILD_COMMAND_ARGV = {
817
+ "yarn build": ["yarn", "build"],
818
+ "npm run build": ["npm", "run", "build"],
819
+ "pnpm run build": ["pnpm", "run", "build"],
820
+ "pnpm build": ["pnpm", "build"]
821
+ };
822
+ function isAllowedBuildCommandString(command) {
823
+ const trimmed = command.trim();
824
+ return ALLOWED_BUILD_COMMANDS.includes(trimmed);
825
+ }
826
+ function parseBuildCommandToArgv(command) {
827
+ const trimmed = command.trim();
828
+ if (!(trimmed in BUILD_COMMAND_ARGV)) {
829
+ return null;
830
+ }
831
+ return BUILD_COMMAND_ARGV[trimmed];
832
+ }
833
+ var enumTuple = ALLOWED_BUILD_COMMANDS;
834
+ var BuildPassedCommandStringSchema = import_zod22.z.enum(enumTuple);
835
+
836
+ // src/assertion/assertion.ts
837
+ var AssertionTypeSchema = import_zod23.z.enum([
800
838
  "skill_was_called",
801
839
  "tool_called_with_param",
802
840
  "build_passed",
@@ -805,61 +843,61 @@ var AssertionTypeSchema = import_zod22.z.enum([
805
843
  "llm_judge",
806
844
  "api_call"
807
845
  ]);
808
- var AssertionParameterTypeSchema = import_zod22.z.enum([
846
+ var AssertionParameterTypeSchema = import_zod23.z.enum([
809
847
  "string",
810
848
  "number",
811
849
  "boolean"
812
850
  ]);
813
- var AssertionParameterSchema = import_zod22.z.object({
851
+ var AssertionParameterSchema = import_zod23.z.object({
814
852
  /** Parameter name (used as key in params object) */
815
- name: import_zod22.z.string().min(1),
853
+ name: import_zod23.z.string().min(1),
816
854
  /** Display label for the parameter */
817
- label: import_zod22.z.string().min(1),
855
+ label: import_zod23.z.string().min(1),
818
856
  /** Parameter type */
819
857
  type: AssertionParameterTypeSchema,
820
858
  /** Whether this parameter is required */
821
- required: import_zod22.z.boolean(),
859
+ required: import_zod23.z.boolean(),
822
860
  /** Default value (optional, used when not provided) */
823
- defaultValue: import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean()]).optional(),
861
+ defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
824
862
  /** If true, parameter is hidden by default behind "Show advanced options" */
825
- advanced: import_zod22.z.boolean().optional()
863
+ advanced: import_zod23.z.boolean().optional()
826
864
  });
827
- var ScenarioAssertionLinkSchema = import_zod22.z.object({
865
+ var ScenarioAssertionLinkSchema = import_zod23.z.object({
828
866
  /** ID of the system assertion (e.g., 'system:skill_was_called') */
829
- assertionId: import_zod22.z.string(),
867
+ assertionId: import_zod23.z.string(),
830
868
  /** Parameter values for this assertion in this scenario */
831
- params: import_zod22.z.record(
832
- import_zod22.z.string(),
833
- import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean(), import_zod22.z.null()])
869
+ params: import_zod23.z.record(
870
+ import_zod23.z.string(),
871
+ import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
834
872
  ).optional()
835
873
  });
836
- var SkillWasCalledConfigSchema = import_zod22.z.object({
874
+ var SkillWasCalledConfigSchema = import_zod23.z.object({
837
875
  /** Names of the skills that must have been called */
838
- skillNames: import_zod22.z.array(import_zod22.z.string().min(1)).min(1)
876
+ skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
839
877
  });
840
- var CostConfigSchema = import_zod22.z.strictObject({
878
+ var CostConfigSchema = import_zod23.z.strictObject({
841
879
  /** Maximum allowed cost in USD */
842
- maxCostUsd: import_zod22.z.number().positive()
880
+ maxCostUsd: import_zod23.z.number().positive()
843
881
  });
844
- var ToolCalledWithParamConfigSchema = import_zod22.z.strictObject({
882
+ var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
845
883
  /** Name of the tool that must have been called */
846
- toolName: import_zod22.z.string().min(1),
884
+ toolName: import_zod23.z.string().min(1),
847
885
  /** JSON string of key-value pairs for expected parameters (substring match). Optional — when omitted, only checks tool presence. */
848
- expectedParams: import_zod22.z.string().min(1).optional(),
886
+ expectedParams: import_zod23.z.string().min(1).optional(),
849
887
  /** If true, the matching tool call must also have succeeded (step.success === true) */
850
- requireSuccess: import_zod22.z.boolean().optional()
888
+ requireSuccess: import_zod23.z.boolean().optional()
851
889
  });
852
- var BuildPassedConfigSchema = import_zod22.z.strictObject({
853
- /** Command to run (default: "yarn build") */
854
- command: import_zod22.z.string().optional(),
890
+ var BuildPassedConfigSchema = import_zod23.z.strictObject({
891
+ /** Allowlisted command only (default at runtime: "yarn build") */
892
+ command: BuildPassedCommandStringSchema.optional(),
855
893
  /** Expected exit code (default: 0) */
856
- expectedExitCode: import_zod22.z.number().int().optional()
894
+ expectedExitCode: import_zod23.z.number().int().optional()
857
895
  });
858
- var TimeConfigSchema = import_zod22.z.strictObject({
896
+ var TimeConfigSchema = import_zod23.z.strictObject({
859
897
  /** Maximum allowed duration in milliseconds */
860
- maxDurationMs: import_zod22.z.number().int().positive()
898
+ maxDurationMs: import_zod23.z.number().int().positive()
861
899
  });
862
- var LlmJudgeConfigSchema = import_zod22.z.object({
900
+ var LlmJudgeConfigSchema = import_zod23.z.object({
863
901
  /**
864
902
  * Prompt template with placeholders:
865
903
  * - {{output}}: agent's final output
@@ -870,65 +908,65 @@ var LlmJudgeConfigSchema = import_zod22.z.object({
870
908
  * - {{trace}}: step-by-step trace of tool calls
871
909
  * - Custom parameters defined in the parameters array
872
910
  */
873
- prompt: import_zod22.z.string().min(1),
911
+ prompt: import_zod23.z.string().min(1),
874
912
  /** Minimum score to pass (0-10, default 7) */
875
- minScore: import_zod22.z.number().int().min(0).max(10).optional(),
913
+ minScore: import_zod23.z.number().int().min(0).max(10).optional(),
876
914
  /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
877
- model: import_zod22.z.string().optional(),
915
+ model: import_zod23.z.string().optional(),
878
916
  /** Max output tokens */
879
- maxTokens: import_zod22.z.number().int().optional(),
917
+ maxTokens: import_zod23.z.number().int().optional(),
880
918
  /** Temperature (0-1) */
881
- temperature: import_zod22.z.number().min(0).max(1).optional(),
919
+ temperature: import_zod23.z.number().min(0).max(1).optional(),
882
920
  /** User-defined parameters for this assertion */
883
- parameters: import_zod22.z.array(AssertionParameterSchema).optional()
921
+ parameters: import_zod23.z.array(AssertionParameterSchema).optional()
884
922
  });
885
- var ApiCallConfigSchema = import_zod22.z.strictObject({
923
+ var ApiCallConfigSchema = import_zod23.z.strictObject({
886
924
  /** URL to call */
887
- url: import_zod22.z.string().min(1),
925
+ url: import_zod23.z.string().min(1),
888
926
  /** HTTP method (default GET) */
889
- method: import_zod22.z.enum(["GET", "POST"]).optional(),
927
+ method: import_zod23.z.enum(["GET", "POST"]).optional(),
890
928
  /** Request body (JSON string, for POST requests) */
891
- requestBody: import_zod22.z.string().optional(),
929
+ requestBody: import_zod23.z.string().optional(),
892
930
  /** Expected JSON response to validate against (subset match — extra fields in actual are OK) */
893
- expectedResponse: import_zod22.z.string().min(1),
931
+ expectedResponse: import_zod23.z.string().min(1),
894
932
  /** Request headers as JSON string of key-value pairs */
895
- requestHeaders: import_zod22.z.string().optional(),
933
+ requestHeaders: import_zod23.z.string().optional(),
896
934
  /** Request timeout in milliseconds (default 30000) */
897
- timeoutMs: import_zod22.z.number().int().positive().optional()
935
+ timeoutMs: import_zod23.z.number().int().positive().optional()
898
936
  });
899
937
  var AssertionBaseFields = {
900
938
  /** When true, the assertion's pass/fail logic is inverted (NOT operator). */
901
- negate: import_zod22.z.boolean().optional()
939
+ negate: import_zod23.z.boolean().optional()
902
940
  };
903
941
  var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
904
- type: import_zod22.z.literal("skill_was_called"),
942
+ type: import_zod23.z.literal("skill_was_called"),
905
943
  ...AssertionBaseFields
906
944
  });
907
945
  var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
908
- type: import_zod22.z.literal("tool_called_with_param"),
946
+ type: import_zod23.z.literal("tool_called_with_param"),
909
947
  ...AssertionBaseFields
910
948
  });
911
949
  var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
912
- type: import_zod22.z.literal("build_passed"),
950
+ type: import_zod23.z.literal("build_passed"),
913
951
  ...AssertionBaseFields
914
952
  });
915
953
  var CostAssertionSchema = CostConfigSchema.extend({
916
- type: import_zod22.z.literal("cost"),
954
+ type: import_zod23.z.literal("cost"),
917
955
  ...AssertionBaseFields
918
956
  });
919
957
  var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
920
- type: import_zod22.z.literal("llm_judge"),
958
+ type: import_zod23.z.literal("llm_judge"),
921
959
  ...AssertionBaseFields
922
960
  });
923
961
  var ApiCallAssertionSchema = ApiCallConfigSchema.extend({
924
- type: import_zod22.z.literal("api_call"),
962
+ type: import_zod23.z.literal("api_call"),
925
963
  ...AssertionBaseFields
926
964
  });
927
965
  var TimeAssertionSchema = TimeConfigSchema.extend({
928
- type: import_zod22.z.literal("time_limit"),
966
+ type: import_zod23.z.literal("time_limit"),
929
967
  ...AssertionBaseFields
930
968
  });
931
- var AssertionSchema = import_zod22.z.union([
969
+ var AssertionSchema = import_zod23.z.union([
932
970
  SkillWasCalledAssertionSchema,
933
971
  ToolCalledWithParamAssertionSchema,
934
972
  BuildPassedAssertionSchema,
@@ -937,7 +975,7 @@ var AssertionSchema = import_zod22.z.union([
937
975
  LlmJudgeAssertionSchema,
938
976
  ApiCallAssertionSchema
939
977
  ]);
940
- var AssertionConfigSchema = import_zod22.z.union([
978
+ var AssertionConfigSchema = import_zod23.z.union([
941
979
  LlmJudgeConfigSchema,
942
980
  // requires prompt - check first
943
981
  SkillWasCalledConfigSchema,
@@ -952,7 +990,7 @@ var AssertionConfigSchema = import_zod22.z.union([
952
990
  // requires maxCostUsd, uses strictObject
953
991
  BuildPassedConfigSchema,
954
992
  // all optional, uses strictObject to reject unknown keys
955
- import_zod22.z.object({})
993
+ import_zod23.z.object({})
956
994
  // fallback empty config
957
995
  ]);
958
996
  function validateAssertionConfig(type, config) {
@@ -976,52 +1014,322 @@ function validateAssertionConfig(type, config) {
976
1014
  }
977
1015
  }
978
1016
 
1017
+ // src/assertion/system-assertions.ts
1018
+ var SYSTEM_ASSERTION_IDS = {
1019
+ SKILL_WAS_CALLED: "system:skill_was_called",
1020
+ TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
1021
+ BUILD_PASSED: "system:build_passed",
1022
+ TIME_LIMIT: "system:time_limit",
1023
+ COST: "system:cost",
1024
+ LLM_JUDGE: "system:llm_judge",
1025
+ API_CALL: "system:api_call"
1026
+ };
1027
+ function isSystemAssertionId(id) {
1028
+ return id.startsWith("system:");
1029
+ }
1030
+ var SYSTEM_ASSERTIONS = {
1031
+ [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
1032
+ id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
1033
+ name: "Skill Was Called",
1034
+ description: "Check that one or more skills were invoked during the agent run",
1035
+ type: "skill_was_called",
1036
+ parameters: [
1037
+ {
1038
+ name: "skillNames",
1039
+ label: "Skills",
1040
+ type: "string",
1041
+ required: true
1042
+ },
1043
+ {
1044
+ name: "negate",
1045
+ label: "Negate (NOT operator)",
1046
+ type: "boolean",
1047
+ required: false,
1048
+ defaultValue: false
1049
+ }
1050
+ ]
1051
+ },
1052
+ [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
1053
+ id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
1054
+ name: "Tool Called With Param",
1055
+ description: "Check that a tool was called with expected parameters (tool name is substring matched)",
1056
+ type: "tool_called_with_param",
1057
+ parameters: [
1058
+ {
1059
+ name: "toolName",
1060
+ label: "Tool Name",
1061
+ type: "string",
1062
+ required: true
1063
+ },
1064
+ {
1065
+ name: "expectedParams",
1066
+ label: "Expected Parameters (JSON, substring match)",
1067
+ type: "string",
1068
+ required: false
1069
+ },
1070
+ {
1071
+ name: "requireSuccess",
1072
+ label: "Require Successful Call",
1073
+ type: "boolean",
1074
+ required: false,
1075
+ defaultValue: false,
1076
+ advanced: true
1077
+ },
1078
+ {
1079
+ name: "negate",
1080
+ label: "Negate (NOT operator)",
1081
+ type: "boolean",
1082
+ required: false,
1083
+ defaultValue: false
1084
+ }
1085
+ ]
1086
+ },
1087
+ [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
1088
+ id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
1089
+ name: "Build Passed",
1090
+ description: "Run a build command and verify it exits with expected code",
1091
+ type: "build_passed",
1092
+ parameters: [
1093
+ {
1094
+ name: "command",
1095
+ label: "Build Command",
1096
+ type: "string",
1097
+ required: false,
1098
+ defaultValue: "yarn build"
1099
+ },
1100
+ {
1101
+ name: "expectedExitCode",
1102
+ label: "Expected Exit Code",
1103
+ type: "number",
1104
+ required: false,
1105
+ defaultValue: 0
1106
+ },
1107
+ {
1108
+ name: "maxBuildTime",
1109
+ label: "Max Build Time (ms)",
1110
+ type: "number",
1111
+ required: false,
1112
+ advanced: true
1113
+ },
1114
+ {
1115
+ name: "maxMemory",
1116
+ label: "Max Memory (MB)",
1117
+ type: "number",
1118
+ required: false,
1119
+ advanced: true
1120
+ }
1121
+ ]
1122
+ },
1123
+ [SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
1124
+ id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
1125
+ name: "Time Limit",
1126
+ description: "Check that the scenario completed within a maximum duration",
1127
+ type: "time_limit",
1128
+ parameters: [
1129
+ {
1130
+ name: "maxDurationMs",
1131
+ label: "Max Duration (ms)",
1132
+ type: "number",
1133
+ required: true,
1134
+ defaultValue: 3e5
1135
+ }
1136
+ ]
1137
+ },
1138
+ [SYSTEM_ASSERTION_IDS.COST]: {
1139
+ id: SYSTEM_ASSERTION_IDS.COST,
1140
+ name: "Cost",
1141
+ description: "Check that the scenario LLM execution cost stays within a USD threshold",
1142
+ type: "cost",
1143
+ parameters: [
1144
+ {
1145
+ name: "maxCostUsd",
1146
+ label: "Max Cost (USD)",
1147
+ type: "number",
1148
+ required: true,
1149
+ defaultValue: 1
1150
+ }
1151
+ ]
1152
+ },
1153
+ [SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
1154
+ id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
1155
+ name: "LLM Judge",
1156
+ description: "LLM evaluates the output and assigns a score (0-10)",
1157
+ type: "llm_judge",
1158
+ parameters: [
1159
+ {
1160
+ name: "prompt",
1161
+ label: "Judge Prompt",
1162
+ type: "string",
1163
+ required: true,
1164
+ defaultValue: "Verify the output meets the acceptance criteria."
1165
+ },
1166
+ {
1167
+ name: "minScore",
1168
+ label: "Minimum Score (0-10)",
1169
+ type: "number",
1170
+ required: false,
1171
+ defaultValue: 7
1172
+ },
1173
+ {
1174
+ name: "model",
1175
+ label: "Model",
1176
+ type: "string",
1177
+ required: false
1178
+ }
1179
+ ]
1180
+ },
1181
+ [SYSTEM_ASSERTION_IDS.API_CALL]: {
1182
+ id: SYSTEM_ASSERTION_IDS.API_CALL,
1183
+ name: "API Call",
1184
+ description: "Call an API endpoint and verify the response contains expected data",
1185
+ type: "api_call",
1186
+ parameters: [
1187
+ {
1188
+ name: "url",
1189
+ label: "URL",
1190
+ type: "string",
1191
+ required: true
1192
+ },
1193
+ {
1194
+ name: "method",
1195
+ label: "HTTP Method",
1196
+ type: "string",
1197
+ required: false,
1198
+ defaultValue: "GET"
1199
+ },
1200
+ {
1201
+ name: "requestBody",
1202
+ label: "Request Body (JSON)",
1203
+ type: "string",
1204
+ required: false
1205
+ },
1206
+ {
1207
+ name: "expectedResponse",
1208
+ label: "Expected Response (JSON)",
1209
+ type: "string",
1210
+ required: true
1211
+ },
1212
+ {
1213
+ name: "requestHeaders",
1214
+ label: "Headers (JSON)",
1215
+ type: "string",
1216
+ required: false,
1217
+ advanced: true
1218
+ },
1219
+ {
1220
+ name: "timeoutMs",
1221
+ label: "Timeout (ms)",
1222
+ type: "number",
1223
+ required: false,
1224
+ defaultValue: 3e4,
1225
+ advanced: true
1226
+ }
1227
+ ]
1228
+ }
1229
+ };
1230
+ function getSystemAssertions() {
1231
+ return Object.values(SYSTEM_ASSERTIONS);
1232
+ }
1233
+ function getSystemAssertion(id) {
1234
+ return SYSTEM_ASSERTIONS[id];
1235
+ }
1236
+
979
1237
  // src/scenario/test-scenario.ts
980
- var ExpectedFileSchema = import_zod23.z.object({
1238
+ var MAX_IMAGE_BASE64_LENGTH = 4 * Math.ceil(2 * 1024 * 1024 / 3);
1239
+ var TriggerPromptImageSchema = import_zod24.z.object({
1240
+ /** Base64-encoded image data (no data URL prefix) */
1241
+ base64: import_zod24.z.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
1242
+ /** MIME type of the image */
1243
+ mediaType: import_zod24.z.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
1244
+ /** Original filename of the image */
1245
+ name: import_zod24.z.string()
1246
+ });
1247
+ var ExpectedFileSchema = import_zod24.z.object({
981
1248
  /** Relative path where the file should be created */
982
- path: import_zod23.z.string(),
1249
+ path: import_zod24.z.string(),
983
1250
  /** Optional expected content */
984
- content: import_zod23.z.string().optional()
1251
+ content: import_zod24.z.string().optional()
985
1252
  });
986
1253
  var TestScenarioSchema = TenantEntitySchema.extend({
987
1254
  /** The prompt sent to the agent to trigger the task */
988
- triggerPrompt: import_zod23.z.string().min(10),
1255
+ triggerPrompt: import_zod24.z.string().min(10),
989
1256
  /** ID of the template to use for this scenario (null = no template) */
990
- templateId: import_zod23.z.string().nullish(),
1257
+ templateId: import_zod24.z.string().nullish(),
991
1258
  /** Inline assertions to evaluate for this scenario (legacy) */
992
- assertions: import_zod23.z.array(AssertionSchema).optional(),
1259
+ assertions: import_zod24.z.array(AssertionSchema).optional(),
993
1260
  /** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
994
- assertionIds: import_zod23.z.array(import_zod23.z.string()).optional(),
1261
+ assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
995
1262
  /** Linked assertions with per-scenario parameter values */
996
- assertionLinks: import_zod23.z.array(ScenarioAssertionLinkSchema).optional(),
1263
+ assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional(),
997
1264
  /** Tags for categorisation and filtering */
998
- tags: import_zod23.z.array(import_zod23.z.string()).optional()
999
- });
1000
- var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
1265
+ tags: import_zod24.z.array(import_zod24.z.string()).optional(),
1266
+ /** Base64-encoded images attached to the trigger prompt (max 3) */
1267
+ triggerPromptImages: import_zod24.z.array(TriggerPromptImageSchema).max(3).optional()
1268
+ });
1269
+ function validateBuildPassedParamsInAssertionLinks(links, ctx) {
1270
+ if (!links) return;
1271
+ for (let i = 0; i < links.length; i++) {
1272
+ const link = links[i];
1273
+ if (link.assertionId !== SYSTEM_ASSERTION_IDS.BUILD_PASSED) continue;
1274
+ const cmd = link.params?.command;
1275
+ if (cmd === void 0 || cmd === null) continue;
1276
+ if (typeof cmd !== "string") {
1277
+ ctx.addIssue({
1278
+ code: import_zod24.z.ZodIssueCode.custom,
1279
+ message: "build_passed command must be a string",
1280
+ path: ["assertionLinks", i, "params", "command"]
1281
+ });
1282
+ continue;
1283
+ }
1284
+ if (!isAllowedBuildCommandString(cmd)) {
1285
+ ctx.addIssue({
1286
+ code: import_zod24.z.ZodIssueCode.custom,
1287
+ message: "Invalid build_passed command. Allowed: yarn build, npm run build, pnpm run build, pnpm build",
1288
+ path: ["assertionLinks", i, "params", "command"]
1289
+ });
1290
+ }
1291
+ }
1292
+ }
1293
+ var TestScenarioCreateBaseSchema = TestScenarioSchema.omit({
1001
1294
  id: true,
1002
1295
  createdAt: true,
1003
1296
  updatedAt: true,
1004
1297
  deleted: true
1005
1298
  });
1006
- var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
1299
+ var CreateTestScenarioInputSchema = TestScenarioCreateBaseSchema.superRefine((data, ctx) => {
1300
+ validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
1301
+ });
1302
+ var UpdateTestScenarioInputSchema = TestScenarioCreateBaseSchema.partial().superRefine((data, ctx) => {
1303
+ if (data.assertionLinks !== void 0) {
1304
+ validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
1305
+ }
1306
+ });
1007
1307
 
1008
1308
  // src/scenario/batch-import.ts
1009
- var import_zod24 = require("zod");
1309
+ var import_zod25 = require("zod");
1010
1310
  var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
1011
- var BatchAssertionLinkSchema = import_zod24.z.union([
1012
- import_zod24.z.string().min(1),
1311
+ var BatchAssertionLinkSchema = import_zod25.z.union([
1312
+ import_zod25.z.string().min(1),
1013
1313
  ScenarioAssertionLinkSchema
1014
1314
  ]);
1015
- var BatchScenarioEntrySchema = import_zod24.z.object({
1016
- name: import_zod24.z.string().min(1, "name: Required"),
1017
- description: import_zod24.z.string().optional().default(""),
1018
- triggerPrompt: import_zod24.z.string().min(10, "triggerPrompt: Must be at least 10 characters"),
1019
- templateId: import_zod24.z.string().nullish(),
1020
- tags: import_zod24.z.array(import_zod24.z.string()).optional(),
1021
- assertionLinks: import_zod24.z.array(BatchAssertionLinkSchema).optional()
1315
+ var BatchScenarioEntrySchema = import_zod25.z.object({
1316
+ name: import_zod25.z.string().min(1, "name: Required"),
1317
+ description: import_zod25.z.string().optional().default(""),
1318
+ triggerPrompt: import_zod25.z.string().min(10, "triggerPrompt: Must be at least 10 characters"),
1319
+ templateId: import_zod25.z.string().nullish(),
1320
+ tags: import_zod25.z.array(import_zod25.z.string()).optional(),
1321
+ assertionLinks: import_zod25.z.array(BatchAssertionLinkSchema).optional()
1322
+ }).superRefine((data, ctx) => {
1323
+ if (!data.assertionLinks) return;
1324
+ const objectLinks = data.assertionLinks.filter(
1325
+ (link) => typeof link !== "string"
1326
+ );
1327
+ if (objectLinks.length > 0) {
1328
+ validateBuildPassedParamsInAssertionLinks(objectLinks, ctx);
1329
+ }
1022
1330
  });
1023
- var BatchImportPayloadSchema = import_zod24.z.object({
1024
- scenarios: import_zod24.z.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
1331
+ var BatchImportPayloadSchema = import_zod25.z.object({
1332
+ scenarios: import_zod25.z.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
1025
1333
  });
1026
1334
  var BATCH_IMPORT_LIMITS = {
1027
1335
  MAX_SCENARIOS: 100,
@@ -1043,29 +1351,29 @@ function normalizeBatchAssertionLink(link) {
1043
1351
  }
1044
1352
  return link;
1045
1353
  }
1046
- var BatchResultItemSchema = import_zod24.z.object({
1047
- index: import_zod24.z.number(),
1048
- name: import_zod24.z.string(),
1049
- status: import_zod24.z.enum(["valid", "invalid"]),
1050
- id: import_zod24.z.string().nullable().optional(),
1051
- errors: import_zod24.z.array(import_zod24.z.string()).optional()
1052
- });
1053
- var BatchSummarySchema = import_zod24.z.object({
1054
- total: import_zod24.z.number(),
1055
- valid: import_zod24.z.number(),
1056
- invalid: import_zod24.z.number(),
1057
- created: import_zod24.z.number()
1058
- });
1059
- var BatchImportResponseSchema = import_zod24.z.object({
1354
+ var BatchResultItemSchema = import_zod25.z.object({
1355
+ index: import_zod25.z.number(),
1356
+ name: import_zod25.z.string(),
1357
+ status: import_zod25.z.enum(["valid", "invalid"]),
1358
+ id: import_zod25.z.string().nullable().optional(),
1359
+ errors: import_zod25.z.array(import_zod25.z.string()).optional()
1360
+ });
1361
+ var BatchSummarySchema = import_zod25.z.object({
1362
+ total: import_zod25.z.number(),
1363
+ valid: import_zod25.z.number(),
1364
+ invalid: import_zod25.z.number(),
1365
+ created: import_zod25.z.number()
1366
+ });
1367
+ var BatchImportResponseSchema = import_zod25.z.object({
1060
1368
  summary: BatchSummarySchema,
1061
- results: import_zod24.z.array(BatchResultItemSchema)
1369
+ results: import_zod25.z.array(BatchResultItemSchema)
1062
1370
  });
1063
1371
 
1064
1372
  // src/suite/test-suite.ts
1065
- var import_zod25 = require("zod");
1373
+ var import_zod26 = require("zod");
1066
1374
  var TestSuiteSchema = TenantEntitySchema.extend({
1067
1375
  /** IDs of test scenarios in this suite */
1068
- scenarioIds: import_zod25.z.array(import_zod25.z.string())
1376
+ scenarioIds: import_zod26.z.array(import_zod26.z.string())
1069
1377
  });
1070
1378
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1071
1379
  id: true,
@@ -1076,21 +1384,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1076
1384
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
1077
1385
 
1078
1386
  // src/evaluation/metrics.ts
1079
- var import_zod26 = require("zod");
1080
- var TokenUsageSchema = import_zod26.z.object({
1081
- prompt: import_zod26.z.number(),
1082
- completion: import_zod26.z.number(),
1083
- total: import_zod26.z.number()
1084
- });
1085
- var EvalMetricsSchema = import_zod26.z.object({
1086
- totalAssertions: import_zod26.z.number(),
1087
- passed: import_zod26.z.number(),
1088
- failed: import_zod26.z.number(),
1089
- skipped: import_zod26.z.number(),
1090
- errors: import_zod26.z.number(),
1091
- passRate: import_zod26.z.number(),
1092
- avgDuration: import_zod26.z.number(),
1093
- totalDuration: import_zod26.z.number()
1387
+ var import_zod27 = require("zod");
1388
+ var TokenUsageSchema = import_zod27.z.object({
1389
+ prompt: import_zod27.z.number(),
1390
+ completion: import_zod27.z.number(),
1391
+ total: import_zod27.z.number()
1392
+ });
1393
+ var EvalMetricsSchema = import_zod27.z.object({
1394
+ totalAssertions: import_zod27.z.number(),
1395
+ passed: import_zod27.z.number(),
1396
+ failed: import_zod27.z.number(),
1397
+ skipped: import_zod27.z.number(),
1398
+ errors: import_zod27.z.number(),
1399
+ passRate: import_zod27.z.number(),
1400
+ avgDuration: import_zod27.z.number(),
1401
+ totalDuration: import_zod27.z.number()
1094
1402
  });
1095
1403
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1096
1404
  EvalStatus2["PENDING"] = "pending";
@@ -1100,7 +1408,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1100
1408
  EvalStatus2["CANCELLED"] = "cancelled";
1101
1409
  return EvalStatus2;
1102
1410
  })(EvalStatus || {});
1103
- var EvalStatusSchema = import_zod26.z.enum(EvalStatus);
1411
+ var EvalStatusSchema = import_zod27.z.enum(EvalStatus);
1104
1412
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1105
1413
  LLMStepType2["COMPLETION"] = "completion";
1106
1414
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -1108,54 +1416,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1108
1416
  LLMStepType2["THINKING"] = "thinking";
1109
1417
  return LLMStepType2;
1110
1418
  })(LLMStepType || {});
1111
- var LLMTraceStepSchema = import_zod26.z.object({
1112
- id: import_zod26.z.string(),
1113
- stepNumber: import_zod26.z.number(),
1114
- type: import_zod26.z.enum(LLMStepType),
1115
- model: import_zod26.z.string(),
1116
- provider: import_zod26.z.string(),
1117
- startedAt: import_zod26.z.string(),
1118
- durationMs: import_zod26.z.number(),
1419
+ var LLMTraceStepSchema = import_zod27.z.object({
1420
+ id: import_zod27.z.string(),
1421
+ stepNumber: import_zod27.z.number(),
1422
+ type: import_zod27.z.enum(LLMStepType),
1423
+ model: import_zod27.z.string(),
1424
+ provider: import_zod27.z.string(),
1425
+ startedAt: import_zod27.z.string(),
1426
+ durationMs: import_zod27.z.number(),
1119
1427
  tokenUsage: TokenUsageSchema,
1120
- costUsd: import_zod26.z.number(),
1121
- toolName: import_zod26.z.string().optional(),
1122
- toolArguments: import_zod26.z.string().optional(),
1123
- inputPreview: import_zod26.z.string().optional(),
1124
- outputPreview: import_zod26.z.string().optional(),
1125
- success: import_zod26.z.boolean(),
1126
- error: import_zod26.z.string().optional(),
1127
- turnIndex: import_zod26.z.number().optional()
1128
- });
1129
- var LLMBreakdownStatsSchema = import_zod26.z.object({
1130
- count: import_zod26.z.number(),
1131
- durationMs: import_zod26.z.number(),
1132
- tokens: import_zod26.z.number(),
1133
- costUsd: import_zod26.z.number()
1134
- });
1135
- var LLMTraceSummarySchema = import_zod26.z.object({
1136
- totalSteps: import_zod26.z.number(),
1137
- totalTurns: import_zod26.z.number().optional(),
1138
- totalDurationMs: import_zod26.z.number(),
1428
+ costUsd: import_zod27.z.number(),
1429
+ toolName: import_zod27.z.string().optional(),
1430
+ toolArguments: import_zod27.z.string().optional(),
1431
+ inputPreview: import_zod27.z.string().optional(),
1432
+ outputPreview: import_zod27.z.string().optional(),
1433
+ success: import_zod27.z.boolean(),
1434
+ error: import_zod27.z.string().optional(),
1435
+ turnIndex: import_zod27.z.number().optional()
1436
+ });
1437
+ var LLMBreakdownStatsSchema = import_zod27.z.object({
1438
+ count: import_zod27.z.number(),
1439
+ durationMs: import_zod27.z.number(),
1440
+ tokens: import_zod27.z.number(),
1441
+ costUsd: import_zod27.z.number()
1442
+ });
1443
+ var LLMTraceSummarySchema = import_zod27.z.object({
1444
+ totalSteps: import_zod27.z.number(),
1445
+ totalTurns: import_zod27.z.number().optional(),
1446
+ totalDurationMs: import_zod27.z.number(),
1139
1447
  totalTokens: TokenUsageSchema,
1140
- totalCostUsd: import_zod26.z.number(),
1141
- stepTypeBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema).optional(),
1142
- modelBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema),
1143
- modelsUsed: import_zod26.z.array(import_zod26.z.string())
1144
- });
1145
- var LLMTraceSchema = import_zod26.z.object({
1146
- id: import_zod26.z.string(),
1147
- steps: import_zod26.z.array(LLMTraceStepSchema),
1448
+ totalCostUsd: import_zod27.z.number(),
1449
+ stepTypeBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema).optional(),
1450
+ modelBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema),
1451
+ modelsUsed: import_zod27.z.array(import_zod27.z.string())
1452
+ });
1453
+ var LLMTraceSchema = import_zod27.z.object({
1454
+ id: import_zod27.z.string(),
1455
+ steps: import_zod27.z.array(LLMTraceStepSchema),
1148
1456
  summary: LLMTraceSummarySchema
1149
1457
  });
1150
1458
 
1151
1459
  // src/evaluation/eval-result.ts
1152
- var import_zod30 = require("zod");
1460
+ var import_zod31 = require("zod");
1153
1461
 
1154
1462
  // src/evaluation/eval-run.ts
1155
- var import_zod28 = require("zod");
1463
+ var import_zod29 = require("zod");
1156
1464
 
1157
1465
  // src/evaluation/live-trace.ts
1158
- var import_zod27 = require("zod");
1466
+ var import_zod28 = require("zod");
1159
1467
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1160
1468
  LiveTraceEventType2["THINKING"] = "thinking";
1161
1469
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -1169,37 +1477,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1169
1477
  LiveTraceEventType2["USER"] = "user";
1170
1478
  return LiveTraceEventType2;
1171
1479
  })(LiveTraceEventType || {});
1172
- var LiveTraceEventSchema = import_zod27.z.object({
1480
+ var LiveTraceEventSchema = import_zod28.z.object({
1173
1481
  /** The evaluation run ID */
1174
- evalRunId: import_zod27.z.string(),
1482
+ evalRunId: import_zod28.z.string(),
1175
1483
  /** The scenario ID being executed */
1176
- scenarioId: import_zod27.z.string(),
1484
+ scenarioId: import_zod28.z.string(),
1177
1485
  /** The scenario name for display */
1178
- scenarioName: import_zod27.z.string(),
1486
+ scenarioName: import_zod28.z.string(),
1179
1487
  /** The target ID (skill, agent, etc.) */
1180
- targetId: import_zod27.z.string(),
1488
+ targetId: import_zod28.z.string(),
1181
1489
  /** The target name for display */
1182
- targetName: import_zod27.z.string(),
1490
+ targetName: import_zod28.z.string(),
1183
1491
  /** Step number in the current scenario execution */
1184
- stepNumber: import_zod27.z.number(),
1492
+ stepNumber: import_zod28.z.number(),
1185
1493
  /** Type of trace event */
1186
- type: import_zod27.z.enum(LiveTraceEventType),
1494
+ type: import_zod28.z.enum(LiveTraceEventType),
1187
1495
  /** Tool name if this is a tool_use event */
1188
- toolName: import_zod27.z.string().optional(),
1496
+ toolName: import_zod28.z.string().optional(),
1189
1497
  /** Tool arguments preview (truncated JSON) */
1190
- toolArgs: import_zod27.z.string().optional(),
1498
+ toolArgs: import_zod28.z.string().optional(),
1191
1499
  /** Output preview (truncated text) */
1192
- outputPreview: import_zod27.z.string().optional(),
1500
+ outputPreview: import_zod28.z.string().optional(),
1193
1501
  /** File path for file operations */
1194
- filePath: import_zod27.z.string().optional(),
1502
+ filePath: import_zod28.z.string().optional(),
1195
1503
  /** Elapsed time in milliseconds for progress events */
1196
- elapsedMs: import_zod27.z.number().optional(),
1504
+ elapsedMs: import_zod28.z.number().optional(),
1197
1505
  /** Thinking/reasoning text from Claude */
1198
- thinking: import_zod27.z.string().optional(),
1506
+ thinking: import_zod28.z.string().optional(),
1199
1507
  /** Timestamp when this event occurred */
1200
- timestamp: import_zod27.z.string(),
1508
+ timestamp: import_zod28.z.string(),
1201
1509
  /** Whether this is the final event for this scenario */
1202
- isComplete: import_zod27.z.boolean()
1510
+ isComplete: import_zod28.z.boolean()
1203
1511
  });
1204
1512
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
1205
1513
  function parseTraceEventLine(line) {
@@ -1228,40 +1536,40 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
1228
1536
  TriggerType2["SCHEDULED"] = "SCHEDULED";
1229
1537
  return TriggerType2;
1230
1538
  })(TriggerType || {});
1231
- var TriggerMetadataSchema = import_zod28.z.object({
1232
- version: import_zod28.z.string().optional(),
1233
- resourceUpdated: import_zod28.z.array(import_zod28.z.string()).optional(),
1234
- scheduleId: import_zod28.z.string().optional()
1539
+ var TriggerMetadataSchema = import_zod29.z.object({
1540
+ version: import_zod29.z.string().optional(),
1541
+ resourceUpdated: import_zod29.z.array(import_zod29.z.string()).optional(),
1542
+ scheduleId: import_zod29.z.string().optional()
1235
1543
  });
1236
- var TriggerSchema = import_zod28.z.object({
1237
- id: import_zod28.z.string(),
1544
+ var TriggerSchema = import_zod29.z.object({
1545
+ id: import_zod29.z.string(),
1238
1546
  metadata: TriggerMetadataSchema.optional(),
1239
- type: import_zod28.z.nativeEnum(TriggerType)
1547
+ type: import_zod29.z.nativeEnum(TriggerType)
1240
1548
  });
1241
- var DiffLineTypeSchema = import_zod28.z.enum(["added", "removed", "unchanged"]);
1242
- var DiffLineSchema = import_zod28.z.object({
1549
+ var DiffLineTypeSchema = import_zod29.z.enum(["added", "removed", "unchanged"]);
1550
+ var DiffLineSchema = import_zod29.z.object({
1243
1551
  type: DiffLineTypeSchema,
1244
- content: import_zod28.z.string(),
1245
- lineNumber: import_zod28.z.number()
1246
- });
1247
- var DiffContentSchema = import_zod28.z.object({
1248
- path: import_zod28.z.string(),
1249
- expected: import_zod28.z.string(),
1250
- actual: import_zod28.z.string(),
1251
- diffLines: import_zod28.z.array(DiffLineSchema),
1252
- renamedFrom: import_zod28.z.string().optional(),
1552
+ content: import_zod29.z.string(),
1553
+ lineNumber: import_zod29.z.number()
1554
+ });
1555
+ var DiffContentSchema = import_zod29.z.object({
1556
+ path: import_zod29.z.string(),
1557
+ expected: import_zod29.z.string(),
1558
+ actual: import_zod29.z.string(),
1559
+ diffLines: import_zod29.z.array(DiffLineSchema),
1560
+ renamedFrom: import_zod29.z.string().optional(),
1253
1561
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1254
- isInfrastructure: import_zod28.z.boolean().optional()
1562
+ isInfrastructure: import_zod29.z.boolean().optional()
1255
1563
  });
1256
- var CommandExecutionSchema = import_zod28.z.object({
1257
- command: import_zod28.z.string(),
1258
- exitCode: import_zod28.z.number(),
1259
- output: import_zod28.z.string().optional(),
1260
- duration: import_zod28.z.number()
1564
+ var CommandExecutionSchema = import_zod29.z.object({
1565
+ command: import_zod29.z.string(),
1566
+ exitCode: import_zod29.z.number(),
1567
+ output: import_zod29.z.string().optional(),
1568
+ duration: import_zod29.z.number()
1261
1569
  });
1262
- var FileModificationSchema = import_zod28.z.object({
1263
- path: import_zod28.z.string(),
1264
- action: import_zod28.z.enum(["created", "modified", "deleted"])
1570
+ var FileModificationSchema = import_zod29.z.object({
1571
+ path: import_zod29.z.string(),
1572
+ action: import_zod29.z.enum(["created", "modified", "deleted"])
1265
1573
  });
1266
1574
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1267
1575
  TemplateFileStatus2["NEW"] = "new";
@@ -1269,62 +1577,62 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1269
1577
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
1270
1578
  return TemplateFileStatus2;
1271
1579
  })(TemplateFileStatus || {});
1272
- var TemplateFileSchema = import_zod28.z.object({
1580
+ var TemplateFileSchema = import_zod29.z.object({
1273
1581
  /** Relative path within the template */
1274
- path: import_zod28.z.string(),
1582
+ path: import_zod29.z.string(),
1275
1583
  /** Full file content after execution */
1276
- content: import_zod28.z.string(),
1584
+ content: import_zod29.z.string(),
1277
1585
  /** File status (new, modified, unchanged) */
1278
- status: import_zod28.z.enum(["new", "modified", "unchanged"]),
1586
+ status: import_zod29.z.enum(["new", "modified", "unchanged"]),
1279
1587
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1280
- isInfrastructure: import_zod28.z.boolean().optional()
1588
+ isInfrastructure: import_zod29.z.boolean().optional()
1281
1589
  });
1282
- var ApiCallSchema = import_zod28.z.object({
1283
- endpoint: import_zod28.z.string(),
1284
- tokensUsed: import_zod28.z.number(),
1285
- duration: import_zod28.z.number()
1590
+ var ApiCallSchema = import_zod29.z.object({
1591
+ endpoint: import_zod29.z.string(),
1592
+ tokensUsed: import_zod29.z.number(),
1593
+ duration: import_zod29.z.number()
1286
1594
  });
1287
- var ExecutionTraceSchema = import_zod28.z.object({
1288
- commands: import_zod28.z.array(CommandExecutionSchema),
1289
- filesModified: import_zod28.z.array(FileModificationSchema),
1290
- apiCalls: import_zod28.z.array(ApiCallSchema),
1291
- totalDuration: import_zod28.z.number()
1595
+ var ExecutionTraceSchema = import_zod29.z.object({
1596
+ commands: import_zod29.z.array(CommandExecutionSchema),
1597
+ filesModified: import_zod29.z.array(FileModificationSchema),
1598
+ apiCalls: import_zod29.z.array(ApiCallSchema),
1599
+ totalDuration: import_zod29.z.number()
1292
1600
  });
1293
- var RunAnalysisFindingSchema = import_zod28.z.object({
1294
- category: import_zod28.z.enum([
1601
+ var RunAnalysisFindingSchema = import_zod29.z.object({
1602
+ category: import_zod29.z.enum([
1295
1603
  "failure_pattern",
1296
1604
  "cost_waste",
1297
1605
  "flakiness",
1298
1606
  "inefficiency",
1299
1607
  "positive"
1300
1608
  ]),
1301
- severity: import_zod28.z.enum(["high", "medium", "low"]),
1302
- description: import_zod28.z.string(),
1303
- affectedScenarios: import_zod28.z.array(import_zod28.z.string()),
1304
- recommendation: import_zod28.z.string().optional()
1609
+ severity: import_zod29.z.enum(["high", "medium", "low"]),
1610
+ description: import_zod29.z.string(),
1611
+ affectedScenarios: import_zod29.z.array(import_zod29.z.string()),
1612
+ recommendation: import_zod29.z.string().optional()
1305
1613
  });
1306
- var RunAnalysisSchema = import_zod28.z.object({
1307
- generatedAt: import_zod28.z.string(),
1308
- summary: import_zod28.z.string(),
1309
- findings: import_zod28.z.array(RunAnalysisFindingSchema)
1614
+ var RunAnalysisSchema = import_zod29.z.object({
1615
+ generatedAt: import_zod29.z.string(),
1616
+ summary: import_zod29.z.string(),
1617
+ findings: import_zod29.z.array(RunAnalysisFindingSchema)
1310
1618
  });
1311
1619
  var EvalRunSchema = TenantEntitySchema.extend({
1312
1620
  /** Agent ID for this run */
1313
- agentId: import_zod28.z.string().optional(),
1621
+ agentId: import_zod29.z.string().optional(),
1314
1622
  /** Preset ID that originated this run (optional) */
1315
- presetId: import_zod28.z.string().optional(),
1623
+ presetId: import_zod29.z.string().optional(),
1316
1624
  /** Skill IDs for this run */
1317
- skillIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1625
+ skillIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1318
1626
  /** Map of skillId to skillVersionId for this run */
1319
- skillVersions: import_zod28.z.record(import_zod28.z.string(), import_zod28.z.string()).optional(),
1627
+ skillVersions: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.string()).optional(),
1320
1628
  /** Scenario IDs to run (always present — resolved server-side from tags when needed) */
1321
- scenarioIds: import_zod28.z.array(import_zod28.z.string()),
1629
+ scenarioIds: import_zod29.z.array(import_zod29.z.string()),
1322
1630
  /** Current status */
1323
1631
  status: EvalStatusSchema,
1324
1632
  /** Progress percentage (0-100) */
1325
- progress: import_zod28.z.number(),
1633
+ progress: import_zod29.z.number(),
1326
1634
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
1327
- results: import_zod28.z.array(import_zod28.z.lazy(() => EvalRunResultSchema)),
1635
+ results: import_zod29.z.array(import_zod29.z.lazy(() => EvalRunResultSchema)),
1328
1636
  /** Aggregated metrics across all results */
1329
1637
  aggregateMetrics: EvalMetricsSchema,
1330
1638
  /** Aggregated LLM trace summary */
@@ -1332,41 +1640,41 @@ var EvalRunSchema = TenantEntitySchema.extend({
1332
1640
  /** What triggered this run */
1333
1641
  trigger: TriggerSchema.optional(),
1334
1642
  /** When the run started (set when evaluation is triggered) */
1335
- startedAt: import_zod28.z.string().optional(),
1643
+ startedAt: import_zod29.z.string().optional(),
1336
1644
  /** When the run completed */
1337
- completedAt: import_zod28.z.string().optional(),
1645
+ completedAt: import_zod29.z.string().optional(),
1338
1646
  /** Live trace events captured during execution (for playback on results page) */
1339
- liveTraceEvents: import_zod28.z.array(LiveTraceEventSchema).optional(),
1647
+ liveTraceEvents: import_zod29.z.array(LiveTraceEventSchema).optional(),
1340
1648
  /** Remote job ID for tracking execution in Dev Machines */
1341
- jobId: import_zod28.z.string().optional(),
1649
+ jobId: import_zod29.z.string().optional(),
1342
1650
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
1343
- jobStatus: import_zod28.z.string().optional(),
1651
+ jobStatus: import_zod29.z.string().optional(),
1344
1652
  /** Remote job error message if the job failed */
1345
- jobError: import_zod28.z.string().optional(),
1653
+ jobError: import_zod29.z.string().optional(),
1346
1654
  /** Timestamp of the last job status check */
1347
- jobStatusCheckedAt: import_zod28.z.string().optional(),
1655
+ jobStatusCheckedAt: import_zod29.z.string().optional(),
1348
1656
  /** MCP server IDs to enable for this run (optional) */
1349
- mcpIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1657
+ mcpIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1350
1658
  /** Sub-agent IDs to enable for this run (optional) */
1351
- subAgentIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1659
+ subAgentIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1352
1660
  /** Rule IDs to enable for this run (optional) */
1353
- ruleIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1661
+ ruleIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1354
1662
  /** Tags used to select scenarios for this run (for traceability) */
1355
- tags: import_zod28.z.array(import_zod28.z.string()).optional(),
1663
+ tags: import_zod29.z.array(import_zod29.z.string()).optional(),
1356
1664
  /** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
1357
- runsPerScenario: import_zod28.z.number().int().min(1).max(20).optional(),
1665
+ runsPerScenario: import_zod29.z.number().int().min(1).max(20).optional(),
1358
1666
  /** Snapshot of agent configuration captured at run creation time */
1359
- agentSnapshot: import_zod28.z.object({
1360
- name: import_zod28.z.string().optional(),
1667
+ agentSnapshot: import_zod29.z.object({
1668
+ name: import_zod29.z.string().optional(),
1361
1669
  agentType: AgentTypeSchema.optional(),
1362
1670
  runCommand: AgentRunCommandSchema.optional(),
1363
- systemPrompt: import_zod28.z.string().nullable().optional(),
1671
+ systemPrompt: import_zod29.z.string().nullable().optional(),
1364
1672
  modelConfig: ModelConfigSchema.optional()
1365
1673
  }).optional(),
1366
1674
  /** UUID linking all runs in a comparison group */
1367
- comparisonGroupId: import_zod28.z.string().optional(),
1675
+ comparisonGroupId: import_zod29.z.string().optional(),
1368
1676
  /** Human-readable label for this variant (e.g., "MCP: Wix Stores") */
1369
- comparisonLabel: import_zod28.z.string().optional(),
1677
+ comparisonLabel: import_zod29.z.string().optional(),
1370
1678
  /** LLM-generated analysis of the completed run */
1371
1679
  runAnalysis: RunAnalysisSchema.optional()
1372
1680
  });
@@ -1384,60 +1692,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
1384
1692
  agentSnapshot: true
1385
1693
  }).extend({
1386
1694
  /** Optional on input — backend resolves from tags when not provided */
1387
- scenarioIds: import_zod28.z.array(import_zod28.z.string()).optional()
1695
+ scenarioIds: import_zod29.z.array(import_zod29.z.string()).optional()
1388
1696
  }).refine(
1389
1697
  (data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
1390
1698
  { message: "Either scenarioIds or tags must be provided" }
1391
1699
  );
1392
- var EvaluationProgressSchema = import_zod28.z.object({
1393
- runId: import_zod28.z.string(),
1394
- targetId: import_zod28.z.string(),
1395
- totalScenarios: import_zod28.z.number(),
1396
- completedScenarios: import_zod28.z.number(),
1397
- scenarioProgress: import_zod28.z.array(
1398
- import_zod28.z.object({
1399
- scenarioId: import_zod28.z.string(),
1400
- currentStep: import_zod28.z.string(),
1401
- error: import_zod28.z.string().optional()
1700
+ var EvaluationProgressSchema = import_zod29.z.object({
1701
+ runId: import_zod29.z.string(),
1702
+ targetId: import_zod29.z.string(),
1703
+ totalScenarios: import_zod29.z.number(),
1704
+ completedScenarios: import_zod29.z.number(),
1705
+ scenarioProgress: import_zod29.z.array(
1706
+ import_zod29.z.object({
1707
+ scenarioId: import_zod29.z.string(),
1708
+ currentStep: import_zod29.z.string(),
1709
+ error: import_zod29.z.string().optional()
1402
1710
  })
1403
1711
  ),
1404
- createdAt: import_zod28.z.number()
1405
- });
1406
- var EvaluationLogSchema = import_zod28.z.object({
1407
- runId: import_zod28.z.string(),
1408
- scenarioId: import_zod28.z.string(),
1409
- log: import_zod28.z.object({
1410
- level: import_zod28.z.enum(["info", "error", "debug"]),
1411
- message: import_zod28.z.string().optional(),
1412
- args: import_zod28.z.array(import_zod28.z.any()).optional(),
1413
- error: import_zod28.z.string().optional()
1712
+ createdAt: import_zod29.z.number()
1713
+ });
1714
+ var EvaluationLogSchema = import_zod29.z.object({
1715
+ runId: import_zod29.z.string(),
1716
+ scenarioId: import_zod29.z.string(),
1717
+ log: import_zod29.z.object({
1718
+ level: import_zod29.z.enum(["info", "error", "debug"]),
1719
+ message: import_zod29.z.string().optional(),
1720
+ args: import_zod29.z.array(import_zod29.z.any()).optional(),
1721
+ error: import_zod29.z.string().optional()
1414
1722
  })
1415
1723
  });
1416
1724
  var LLM_TIMEOUT = 12e4;
1417
1725
 
1418
1726
  // src/evaluation/conversation.ts
1419
- var import_zod29 = require("zod");
1420
- var TextBlockSchema = import_zod29.z.object({
1421
- type: import_zod29.z.literal("text"),
1422
- text: import_zod29.z.string()
1423
- });
1424
- var ThinkingBlockSchema = import_zod29.z.object({
1425
- type: import_zod29.z.literal("thinking"),
1426
- thinking: import_zod29.z.string()
1427
- });
1428
- var ToolUseBlockSchema = import_zod29.z.object({
1429
- type: import_zod29.z.literal("tool_use"),
1430
- toolName: import_zod29.z.string(),
1431
- toolId: import_zod29.z.string(),
1432
- input: import_zod29.z.unknown()
1433
- });
1434
- var ToolResultBlockSchema = import_zod29.z.object({
1435
- type: import_zod29.z.literal("tool_result"),
1436
- toolUseId: import_zod29.z.string(),
1437
- content: import_zod29.z.string(),
1438
- isError: import_zod29.z.boolean().optional()
1439
- });
1440
- var ConversationBlockSchema = import_zod29.z.discriminatedUnion("type", [
1727
+ var import_zod30 = require("zod");
1728
+ var TextBlockSchema = import_zod30.z.object({
1729
+ type: import_zod30.z.literal("text"),
1730
+ text: import_zod30.z.string()
1731
+ });
1732
+ var ThinkingBlockSchema = import_zod30.z.object({
1733
+ type: import_zod30.z.literal("thinking"),
1734
+ thinking: import_zod30.z.string()
1735
+ });
1736
+ var ToolUseBlockSchema = import_zod30.z.object({
1737
+ type: import_zod30.z.literal("tool_use"),
1738
+ toolName: import_zod30.z.string(),
1739
+ toolId: import_zod30.z.string(),
1740
+ input: import_zod30.z.unknown()
1741
+ });
1742
+ var ToolResultBlockSchema = import_zod30.z.object({
1743
+ type: import_zod30.z.literal("tool_result"),
1744
+ toolUseId: import_zod30.z.string(),
1745
+ content: import_zod30.z.string(),
1746
+ isError: import_zod30.z.boolean().optional()
1747
+ });
1748
+ var ConversationBlockSchema = import_zod30.z.discriminatedUnion("type", [
1441
1749
  TextBlockSchema,
1442
1750
  ThinkingBlockSchema,
1443
1751
  ToolUseBlockSchema,
@@ -1448,18 +1756,18 @@ var ConversationMessageRoles = [
1448
1756
  "user",
1449
1757
  "system"
1450
1758
  ];
1451
- var ConversationMessageSchema = import_zod29.z.object({
1452
- role: import_zod29.z.enum(ConversationMessageRoles),
1453
- content: import_zod29.z.array(ConversationBlockSchema),
1454
- timestamp: import_zod29.z.string()
1759
+ var ConversationMessageSchema = import_zod30.z.object({
1760
+ role: import_zod30.z.enum(ConversationMessageRoles),
1761
+ content: import_zod30.z.array(ConversationBlockSchema),
1762
+ timestamp: import_zod30.z.string()
1455
1763
  });
1456
- var ScenarioConversationSchema = import_zod29.z.object({
1457
- id: import_zod29.z.string(),
1458
- projectId: import_zod29.z.string(),
1459
- evalRunId: import_zod29.z.string(),
1460
- resultId: import_zod29.z.string(),
1461
- messages: import_zod29.z.array(ConversationMessageSchema),
1462
- createdAt: import_zod29.z.string()
1764
+ var ScenarioConversationSchema = import_zod30.z.object({
1765
+ id: import_zod30.z.string(),
1766
+ projectId: import_zod30.z.string(),
1767
+ evalRunId: import_zod30.z.string(),
1768
+ resultId: import_zod30.z.string(),
1769
+ messages: import_zod30.z.array(ConversationMessageSchema),
1770
+ createdAt: import_zod30.z.string()
1463
1771
  });
1464
1772
 
1465
1773
  // src/evaluation/eval-result.ts
@@ -1470,98 +1778,98 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
1470
1778
  AssertionResultStatus2["ERROR"] = "error";
1471
1779
  return AssertionResultStatus2;
1472
1780
  })(AssertionResultStatus || {});
1473
- var AssertionResultSchema = import_zod30.z.object({
1474
- id: import_zod30.z.string(),
1475
- assertionId: import_zod30.z.string(),
1476
- assertionType: import_zod30.z.string(),
1477
- assertionName: import_zod30.z.string(),
1478
- status: import_zod30.z.enum(AssertionResultStatus),
1479
- message: import_zod30.z.string().optional(),
1480
- expected: import_zod30.z.string().optional(),
1481
- actual: import_zod30.z.string().optional(),
1482
- duration: import_zod30.z.number().optional(),
1483
- details: import_zod30.z.record(import_zod30.z.string(), import_zod30.z.unknown()).optional(),
1484
- llmTraceSteps: import_zod30.z.array(LLMTraceStepSchema).optional()
1485
- });
1486
- var EvalRunResultSchema = import_zod30.z.object({
1487
- id: import_zod30.z.string(),
1488
- targetId: import_zod30.z.string(),
1489
- targetName: import_zod30.z.string().optional(),
1781
+ var AssertionResultSchema = import_zod31.z.object({
1782
+ id: import_zod31.z.string(),
1783
+ assertionId: import_zod31.z.string(),
1784
+ assertionType: import_zod31.z.string(),
1785
+ assertionName: import_zod31.z.string(),
1786
+ status: import_zod31.z.enum(AssertionResultStatus),
1787
+ message: import_zod31.z.string().optional(),
1788
+ expected: import_zod31.z.string().optional(),
1789
+ actual: import_zod31.z.string().optional(),
1790
+ duration: import_zod31.z.number().optional(),
1791
+ details: import_zod31.z.record(import_zod31.z.string(), import_zod31.z.unknown()).optional(),
1792
+ llmTraceSteps: import_zod31.z.array(LLMTraceStepSchema).optional()
1793
+ });
1794
+ var EvalRunResultSchema = import_zod31.z.object({
1795
+ id: import_zod31.z.string(),
1796
+ targetId: import_zod31.z.string(),
1797
+ targetName: import_zod31.z.string().optional(),
1490
1798
  /** SkillVersion ID used for this evaluation (for version tracking) */
1491
- skillVersionId: import_zod30.z.string().optional(),
1799
+ skillVersionId: import_zod31.z.string().optional(),
1492
1800
  /** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
1493
- skillVersion: import_zod30.z.string().optional(),
1494
- scenarioId: import_zod30.z.string(),
1495
- scenarioName: import_zod30.z.string(),
1801
+ skillVersion: import_zod31.z.string().optional(),
1802
+ scenarioId: import_zod31.z.string(),
1803
+ scenarioName: import_zod31.z.string(),
1496
1804
  /** Snapshot of the trigger prompt used during the run (prevents stale display after edits) */
1497
- triggerPrompt: import_zod30.z.string().optional(),
1805
+ triggerPrompt: import_zod31.z.string().optional(),
1498
1806
  modelConfig: ModelConfigSchema.optional(),
1499
- assertionResults: import_zod30.z.array(AssertionResultSchema),
1807
+ assertionResults: import_zod31.z.array(AssertionResultSchema),
1500
1808
  metrics: EvalMetricsSchema.optional(),
1501
- passed: import_zod30.z.number(),
1502
- failed: import_zod30.z.number(),
1503
- passRate: import_zod30.z.number(),
1504
- duration: import_zod30.z.number(),
1505
- outputText: import_zod30.z.string().optional(),
1506
- files: import_zod30.z.array(ExpectedFileSchema).optional(),
1507
- fileDiffs: import_zod30.z.array(DiffContentSchema).optional(),
1809
+ passed: import_zod31.z.number(),
1810
+ failed: import_zod31.z.number(),
1811
+ passRate: import_zod31.z.number(),
1812
+ duration: import_zod31.z.number(),
1813
+ outputText: import_zod31.z.string().optional(),
1814
+ files: import_zod31.z.array(ExpectedFileSchema).optional(),
1815
+ fileDiffs: import_zod31.z.array(DiffContentSchema).optional(),
1508
1816
  /** Full template files after execution with status indicators */
1509
- templateFiles: import_zod30.z.array(TemplateFileSchema).optional(),
1510
- startedAt: import_zod30.z.string().optional(),
1511
- completedAt: import_zod30.z.string().optional(),
1817
+ templateFiles: import_zod31.z.array(TemplateFileSchema).optional(),
1818
+ startedAt: import_zod31.z.string().optional(),
1819
+ completedAt: import_zod31.z.string().optional(),
1512
1820
  llmTrace: LLMTraceSchema.optional(),
1513
1821
  /** Full conversation messages (only present in transit; stripped before DB storage) */
1514
- conversation: import_zod30.z.array(ConversationMessageSchema).optional(),
1822
+ conversation: import_zod31.z.array(ConversationMessageSchema).optional(),
1515
1823
  /** 0-based iteration index when a scenario is run multiple times within a single eval run */
1516
- iterationIndex: import_zod30.z.number().int().min(0).optional()
1517
- });
1518
- var PromptResultSchema = import_zod30.z.object({
1519
- text: import_zod30.z.string(),
1520
- files: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1521
- finishReason: import_zod30.z.string().optional(),
1522
- reasoning: import_zod30.z.string().optional(),
1523
- reasoningDetails: import_zod30.z.unknown().optional(),
1524
- toolCalls: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1525
- toolResults: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1526
- warnings: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1527
- sources: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1528
- steps: import_zod30.z.array(import_zod30.z.unknown()),
1529
- generationTimeMs: import_zod30.z.number(),
1530
- prompt: import_zod30.z.string(),
1531
- systemPrompt: import_zod30.z.string(),
1532
- usage: import_zod30.z.object({
1533
- totalTokens: import_zod30.z.number().optional(),
1534
- totalMicrocentsSpent: import_zod30.z.number().optional()
1824
+ iterationIndex: import_zod31.z.number().int().min(0).optional()
1825
+ });
1826
+ var PromptResultSchema = import_zod31.z.object({
1827
+ text: import_zod31.z.string(),
1828
+ files: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1829
+ finishReason: import_zod31.z.string().optional(),
1830
+ reasoning: import_zod31.z.string().optional(),
1831
+ reasoningDetails: import_zod31.z.unknown().optional(),
1832
+ toolCalls: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1833
+ toolResults: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1834
+ warnings: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1835
+ sources: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1836
+ steps: import_zod31.z.array(import_zod31.z.unknown()),
1837
+ generationTimeMs: import_zod31.z.number(),
1838
+ prompt: import_zod31.z.string(),
1839
+ systemPrompt: import_zod31.z.string(),
1840
+ usage: import_zod31.z.object({
1841
+ totalTokens: import_zod31.z.number().optional(),
1842
+ totalMicrocentsSpent: import_zod31.z.number().optional()
1535
1843
  })
1536
1844
  });
1537
- var EvaluationResultSchema = import_zod30.z.object({
1538
- id: import_zod30.z.string(),
1539
- runId: import_zod30.z.string(),
1540
- timestamp: import_zod30.z.number(),
1845
+ var EvaluationResultSchema = import_zod31.z.object({
1846
+ id: import_zod31.z.string(),
1847
+ runId: import_zod31.z.string(),
1848
+ timestamp: import_zod31.z.number(),
1541
1849
  promptResult: PromptResultSchema,
1542
- testResults: import_zod30.z.array(import_zod30.z.unknown()),
1543
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
1544
- feedback: import_zod30.z.string().optional(),
1545
- score: import_zod30.z.number(),
1546
- suiteId: import_zod30.z.string().optional()
1547
- });
1548
- var LeanEvaluationResultSchema = import_zod30.z.object({
1549
- id: import_zod30.z.string(),
1550
- runId: import_zod30.z.string(),
1551
- timestamp: import_zod30.z.number(),
1552
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
1553
- scenarioId: import_zod30.z.string(),
1554
- scenarioVersion: import_zod30.z.number().optional(),
1555
- targetId: import_zod30.z.string(),
1556
- targetVersion: import_zod30.z.number().optional(),
1557
- suiteId: import_zod30.z.string().optional(),
1558
- score: import_zod30.z.number(),
1559
- time: import_zod30.z.number().optional(),
1560
- microcentsSpent: import_zod30.z.number().optional()
1850
+ testResults: import_zod31.z.array(import_zod31.z.unknown()),
1851
+ tags: import_zod31.z.array(import_zod31.z.string()).optional(),
1852
+ feedback: import_zod31.z.string().optional(),
1853
+ score: import_zod31.z.number(),
1854
+ suiteId: import_zod31.z.string().optional()
1855
+ });
1856
+ var LeanEvaluationResultSchema = import_zod31.z.object({
1857
+ id: import_zod31.z.string(),
1858
+ runId: import_zod31.z.string(),
1859
+ timestamp: import_zod31.z.number(),
1860
+ tags: import_zod31.z.array(import_zod31.z.string()).optional(),
1861
+ scenarioId: import_zod31.z.string(),
1862
+ scenarioVersion: import_zod31.z.number().optional(),
1863
+ targetId: import_zod31.z.string(),
1864
+ targetVersion: import_zod31.z.number().optional(),
1865
+ suiteId: import_zod31.z.string().optional(),
1866
+ score: import_zod31.z.number(),
1867
+ time: import_zod31.z.number().optional(),
1868
+ microcentsSpent: import_zod31.z.number().optional()
1561
1869
  });
1562
1870
 
1563
1871
  // src/evaluation/eval-run-folder.ts
1564
- var import_zod31 = require("zod");
1872
+ var import_zod32 = require("zod");
1565
1873
  var EvalRunFolderSchema = TenantEntitySchema.extend({});
1566
1874
  var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1567
1875
  id: true,
@@ -1575,26 +1883,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1575
1883
  updatedAt: true,
1576
1884
  deleted: true
1577
1885
  }).partial();
1578
- var EvalRunFolderMembershipSchema = import_zod31.z.object({
1579
- folderId: import_zod31.z.string(),
1580
- evalRunId: import_zod31.z.string(),
1581
- projectId: import_zod31.z.string(),
1582
- createdAt: import_zod31.z.string()
1886
+ var EvalRunFolderMembershipSchema = import_zod32.z.object({
1887
+ folderId: import_zod32.z.string(),
1888
+ evalRunId: import_zod32.z.string(),
1889
+ projectId: import_zod32.z.string(),
1890
+ createdAt: import_zod32.z.string()
1583
1891
  });
1584
1892
 
1585
1893
  // src/project/project.ts
1586
- var import_zod32 = require("zod");
1894
+ var import_zod33 = require("zod");
1587
1895
  var ProjectSchema = BaseEntitySchema.extend({
1588
- appId: import_zod32.z.string().optional().describe("The ID of the app in Dev Center"),
1589
- scenarioTags: import_zod32.z.array(import_zod32.z.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1896
+ appId: import_zod33.z.string().optional().describe("The ID of the app in Dev Center"),
1897
+ scenarioTags: import_zod33.z.array(import_zod33.z.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1590
1898
  /** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
1591
- wixAuthToken: import_zod32.z.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1899
+ wixAuthToken: import_zod33.z.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1592
1900
  /** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
1593
- base44AuthFile: import_zod32.z.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1901
+ base44AuthFile: import_zod33.z.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1594
1902
  /** Resolved at runtime from the encrypted Wix auth token */
1595
- wixAuthEmail: import_zod32.z.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1903
+ wixAuthEmail: import_zod33.z.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1596
1904
  /** Resolved at runtime from the encrypted Base44 auth file */
1597
- base44AuthEmail: import_zod32.z.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1905
+ base44AuthEmail: import_zod33.z.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1598
1906
  });
1599
1907
  var CreateProjectInputSchema = ProjectSchema.omit({
1600
1908
  id: true,
@@ -1620,7 +1928,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
1620
1928
  var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1621
1929
 
1622
1930
  // src/schedule/eval-schedule.ts
1623
- var import_zod33 = require("zod");
1931
+ var import_zod34 = require("zod");
1624
1932
  var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1625
1933
  FrequencyType2["DAILY"] = "daily";
1626
1934
  FrequencyType2["WEEKDAY"] = "weekday";
@@ -1630,29 +1938,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1630
1938
  })(FrequencyType || {});
1631
1939
  var EvalScheduleSchema = TenantEntitySchema.extend({
1632
1940
  /** Whether the schedule is active */
1633
- enabled: import_zod33.z.boolean(),
1941
+ enabled: import_zod34.z.boolean(),
1634
1942
  /** Test suite to run */
1635
- suiteId: import_zod33.z.string(),
1943
+ suiteId: import_zod34.z.string(),
1636
1944
  /** Preset that provides agent + entities for this schedule */
1637
- presetId: import_zod33.z.string(),
1945
+ presetId: import_zod34.z.string(),
1638
1946
  /** How often to run */
1639
- frequencyType: import_zod33.z.nativeEnum(FrequencyType),
1947
+ frequencyType: import_zod34.z.nativeEnum(FrequencyType),
1640
1948
  /** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
1641
- timeOfDay: import_zod33.z.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1949
+ timeOfDay: import_zod34.z.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1642
1950
  /** Day of week (0=Sun, 6=Sat) for weekly schedules */
1643
- dayOfWeek: import_zod33.z.number().min(0).max(6).optional(),
1951
+ dayOfWeek: import_zod34.z.number().min(0).max(6).optional(),
1644
1952
  /** Day of month (1-31) for monthly schedules */
1645
- dayOfMonth: import_zod33.z.number().min(1).max(31).optional(),
1953
+ dayOfMonth: import_zod34.z.number().min(1).max(31).optional(),
1646
1954
  /** IANA timezone (e.g., 'America/New_York') */
1647
- timezone: import_zod33.z.string(),
1955
+ timezone: import_zod34.z.string(),
1648
1956
  /** ID of the last eval run created by this schedule */
1649
- lastRunId: import_zod33.z.string().optional(),
1957
+ lastRunId: import_zod34.z.string().optional(),
1650
1958
  /** Denormalized status of the last run */
1651
- lastRunStatus: import_zod33.z.string().optional(),
1959
+ lastRunStatus: import_zod34.z.string().optional(),
1652
1960
  /** ISO timestamp of the last run */
1653
- lastRunAt: import_zod33.z.string().optional(),
1961
+ lastRunAt: import_zod34.z.string().optional(),
1654
1962
  /** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
1655
- nextRunAt: import_zod33.z.string().optional()
1963
+ nextRunAt: import_zod34.z.string().optional()
1656
1964
  });
1657
1965
  function isValidTimezone(tz) {
1658
1966
  try {
@@ -1665,14 +1973,14 @@ function isValidTimezone(tz) {
1665
1973
  function validateScheduleFields(data, ctx, options) {
1666
1974
  if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
1667
1975
  ctx.addIssue({
1668
- code: import_zod33.z.ZodIssueCode.custom,
1976
+ code: import_zod34.z.ZodIssueCode.custom,
1669
1977
  message: "dayOfWeek is required for weekly schedules",
1670
1978
  path: ["dayOfWeek"]
1671
1979
  });
1672
1980
  }
1673
1981
  if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
1674
1982
  ctx.addIssue({
1675
- code: import_zod33.z.ZodIssueCode.custom,
1983
+ code: import_zod34.z.ZodIssueCode.custom,
1676
1984
  message: "dayOfMonth is required for monthly schedules",
1677
1985
  path: ["dayOfMonth"]
1678
1986
  });
@@ -1680,7 +1988,7 @@ function validateScheduleFields(data, ctx, options) {
1680
1988
  const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
1681
1989
  if (shouldValidateTz && !isValidTimezone(data.timezone)) {
1682
1990
  ctx.addIssue({
1683
- code: import_zod33.z.ZodIssueCode.custom,
1991
+ code: import_zod34.z.ZodIssueCode.custom,
1684
1992
  message: "Invalid IANA timezone",
1685
1993
  path: ["timezone"]
1686
1994
  });
@@ -1703,229 +2011,10 @@ var CreateEvalScheduleInputSchema = BaseCreateScheduleSchema.superRefine((data,
1703
2011
  var UpdateEvalScheduleInputSchema = BaseCreateScheduleSchema.partial().superRefine((data, ctx) => {
1704
2012
  validateScheduleFields(data, ctx, { partial: true });
1705
2013
  });
1706
-
1707
- // src/assertion/system-assertions.ts
1708
- var SYSTEM_ASSERTION_IDS = {
1709
- SKILL_WAS_CALLED: "system:skill_was_called",
1710
- TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
1711
- BUILD_PASSED: "system:build_passed",
1712
- TIME_LIMIT: "system:time_limit",
1713
- COST: "system:cost",
1714
- LLM_JUDGE: "system:llm_judge",
1715
- API_CALL: "system:api_call"
1716
- };
1717
- function isSystemAssertionId(id) {
1718
- return id.startsWith("system:");
1719
- }
1720
- var SYSTEM_ASSERTIONS = {
1721
- [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
1722
- id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
1723
- name: "Skill Was Called",
1724
- description: "Check that one or more skills were invoked during the agent run",
1725
- type: "skill_was_called",
1726
- parameters: [
1727
- {
1728
- name: "skillNames",
1729
- label: "Skills",
1730
- type: "string",
1731
- required: true
1732
- },
1733
- {
1734
- name: "negate",
1735
- label: "Negate (NOT operator)",
1736
- type: "boolean",
1737
- required: false,
1738
- defaultValue: false
1739
- }
1740
- ]
1741
- },
1742
- [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
1743
- id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
1744
- name: "Tool Called With Param",
1745
- description: "Check that a tool was called with expected parameters (tool name is substring matched)",
1746
- type: "tool_called_with_param",
1747
- parameters: [
1748
- {
1749
- name: "toolName",
1750
- label: "Tool Name",
1751
- type: "string",
1752
- required: true
1753
- },
1754
- {
1755
- name: "expectedParams",
1756
- label: "Expected Parameters (JSON, substring match)",
1757
- type: "string",
1758
- required: false
1759
- },
1760
- {
1761
- name: "requireSuccess",
1762
- label: "Require Successful Call",
1763
- type: "boolean",
1764
- required: false,
1765
- defaultValue: false,
1766
- advanced: true
1767
- },
1768
- {
1769
- name: "negate",
1770
- label: "Negate (NOT operator)",
1771
- type: "boolean",
1772
- required: false,
1773
- defaultValue: false
1774
- }
1775
- ]
1776
- },
1777
- [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
1778
- id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
1779
- name: "Build Passed",
1780
- description: "Run a build command and verify it exits with expected code",
1781
- type: "build_passed",
1782
- parameters: [
1783
- {
1784
- name: "command",
1785
- label: "Build Command",
1786
- type: "string",
1787
- required: false,
1788
- defaultValue: "yarn build"
1789
- },
1790
- {
1791
- name: "expectedExitCode",
1792
- label: "Expected Exit Code",
1793
- type: "number",
1794
- required: false,
1795
- defaultValue: 0
1796
- },
1797
- {
1798
- name: "maxBuildTime",
1799
- label: "Max Build Time (ms)",
1800
- type: "number",
1801
- required: false,
1802
- advanced: true
1803
- },
1804
- {
1805
- name: "maxMemory",
1806
- label: "Max Memory (MB)",
1807
- type: "number",
1808
- required: false,
1809
- advanced: true
1810
- }
1811
- ]
1812
- },
1813
- [SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
1814
- id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
1815
- name: "Time Limit",
1816
- description: "Check that the scenario completed within a maximum duration",
1817
- type: "time_limit",
1818
- parameters: [
1819
- {
1820
- name: "maxDurationMs",
1821
- label: "Max Duration (ms)",
1822
- type: "number",
1823
- required: true,
1824
- defaultValue: 3e5
1825
- }
1826
- ]
1827
- },
1828
- [SYSTEM_ASSERTION_IDS.COST]: {
1829
- id: SYSTEM_ASSERTION_IDS.COST,
1830
- name: "Cost",
1831
- description: "Check that the scenario LLM execution cost stays within a USD threshold",
1832
- type: "cost",
1833
- parameters: [
1834
- {
1835
- name: "maxCostUsd",
1836
- label: "Max Cost (USD)",
1837
- type: "number",
1838
- required: true,
1839
- defaultValue: 1
1840
- }
1841
- ]
1842
- },
1843
- [SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
1844
- id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
1845
- name: "LLM Judge",
1846
- description: "LLM evaluates the output and assigns a score (0-10)",
1847
- type: "llm_judge",
1848
- parameters: [
1849
- {
1850
- name: "prompt",
1851
- label: "Judge Prompt",
1852
- type: "string",
1853
- required: true,
1854
- defaultValue: "Verify the output meets the acceptance criteria."
1855
- },
1856
- {
1857
- name: "minScore",
1858
- label: "Minimum Score (0-10)",
1859
- type: "number",
1860
- required: false,
1861
- defaultValue: 7
1862
- },
1863
- {
1864
- name: "model",
1865
- label: "Model",
1866
- type: "string",
1867
- required: false
1868
- }
1869
- ]
1870
- },
1871
- [SYSTEM_ASSERTION_IDS.API_CALL]: {
1872
- id: SYSTEM_ASSERTION_IDS.API_CALL,
1873
- name: "API Call",
1874
- description: "Call an API endpoint and verify the response contains expected data",
1875
- type: "api_call",
1876
- parameters: [
1877
- {
1878
- name: "url",
1879
- label: "URL",
1880
- type: "string",
1881
- required: true
1882
- },
1883
- {
1884
- name: "method",
1885
- label: "HTTP Method",
1886
- type: "string",
1887
- required: false,
1888
- defaultValue: "GET"
1889
- },
1890
- {
1891
- name: "requestBody",
1892
- label: "Request Body (JSON)",
1893
- type: "string",
1894
- required: false
1895
- },
1896
- {
1897
- name: "expectedResponse",
1898
- label: "Expected Response (JSON)",
1899
- type: "string",
1900
- required: true
1901
- },
1902
- {
1903
- name: "requestHeaders",
1904
- label: "Headers (JSON)",
1905
- type: "string",
1906
- required: false,
1907
- advanced: true
1908
- },
1909
- {
1910
- name: "timeoutMs",
1911
- label: "Timeout (ms)",
1912
- type: "number",
1913
- required: false,
1914
- defaultValue: 3e4,
1915
- advanced: true
1916
- }
1917
- ]
1918
- }
1919
- };
1920
- function getSystemAssertions() {
1921
- return Object.values(SYSTEM_ASSERTIONS);
1922
- }
1923
- function getSystemAssertion(id) {
1924
- return SYSTEM_ASSERTIONS[id];
1925
- }
1926
2014
  // Annotate the CommonJS export names for ESM import in node:
1927
2015
  0 && (module.exports = {
1928
2016
  AGENT_TYPE_LABELS,
2017
+ ALLOWED_BUILD_COMMANDS,
1929
2018
  ALL_AVAILABLE_MODEL_IDS,
1930
2019
  AVAILABLE_CLAUDE_MODEL_IDS,
1931
2020
  AVAILABLE_OPENAI_MODEL_IDS,
@@ -1959,6 +2048,7 @@ function getSystemAssertion(id) {
1959
2048
  BatchSummarySchema,
1960
2049
  BuildCheckTestSchema,
1961
2050
  BuildPassedAssertionSchema,
2051
+ BuildPassedCommandStringSchema,
1962
2052
  BuildPassedConfigSchema,
1963
2053
  BulkImportResultItemSchema,
1964
2054
  BulkImportResultSchema,
@@ -1986,6 +2076,7 @@ function getSystemAssertion(id) {
1986
2076
  CreateTemplateInputSchema,
1987
2077
  CreateTestScenarioInputSchema,
1988
2078
  CreateTestSuiteInputSchema,
2079
+ DEFAULT_BUILD_PASSED_COMMAND,
1989
2080
  DEFAULT_EVALUATOR_SYSTEM_PROMPT,
1990
2081
  DEFAULT_JUDGE_MODEL,
1991
2082
  DiffContentSchema,
@@ -2083,6 +2174,7 @@ function getSystemAssertion(id) {
2083
2174
  ToolTestSchema,
2084
2175
  ToolUseBlockSchema,
2085
2176
  TriggerMetadataSchema,
2177
+ TriggerPromptImageSchema,
2086
2178
  TriggerSchema,
2087
2179
  TriggerType,
2088
2180
  UpdateAgentInputSchema,
@@ -2102,11 +2194,14 @@ function getSystemAssertion(id) {
2102
2194
  formatTraceEventLine,
2103
2195
  getSystemAssertion,
2104
2196
  getSystemAssertions,
2197
+ isAllowedBuildCommandString,
2105
2198
  isSystemAssertionId,
2106
2199
  isValidSkillFolderName,
2107
2200
  normalizeBatchAssertionLink,
2108
2201
  normalizeModelId,
2202
+ parseBuildCommandToArgv,
2109
2203
  parseTraceEventLine,
2110
- validateAssertionConfig
2204
+ validateAssertionConfig,
2205
+ validateBuildPassedParamsInAssertionLinks
2111
2206
  });
2112
2207
  //# sourceMappingURL=index.js.map