@wix/evalforge-types 0.72.0 → 0.74.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -21,6 +21,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
21
21
  var index_exports = {};
22
22
  __export(index_exports, {
23
23
  AGENT_TYPE_LABELS: () => AGENT_TYPE_LABELS,
24
+ ALLOWED_BUILD_COMMANDS: () => ALLOWED_BUILD_COMMANDS,
24
25
  ALL_AVAILABLE_MODEL_IDS: () => ALL_AVAILABLE_MODEL_IDS,
25
26
  AVAILABLE_CLAUDE_MODEL_IDS: () => AVAILABLE_CLAUDE_MODEL_IDS,
26
27
  AVAILABLE_OPENAI_MODEL_IDS: () => AVAILABLE_OPENAI_MODEL_IDS,
@@ -54,6 +55,7 @@ __export(index_exports, {
54
55
  BatchSummarySchema: () => BatchSummarySchema,
55
56
  BuildCheckTestSchema: () => BuildCheckTestSchema,
56
57
  BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
58
+ BuildPassedCommandStringSchema: () => BuildPassedCommandStringSchema,
57
59
  BuildPassedConfigSchema: () => BuildPassedConfigSchema,
58
60
  BulkImportResultItemSchema: () => BulkImportResultItemSchema,
59
61
  BulkImportResultSchema: () => BulkImportResultSchema,
@@ -81,6 +83,7 @@ __export(index_exports, {
81
83
  CreateTemplateInputSchema: () => CreateTemplateInputSchema,
82
84
  CreateTestScenarioInputSchema: () => CreateTestScenarioInputSchema,
83
85
  CreateTestSuiteInputSchema: () => CreateTestSuiteInputSchema,
86
+ DEFAULT_BUILD_PASSED_COMMAND: () => DEFAULT_BUILD_PASSED_COMMAND,
84
87
  DEFAULT_EVALUATOR_SYSTEM_PROMPT: () => DEFAULT_EVALUATOR_SYSTEM_PROMPT,
85
88
  DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
86
89
  DiffContentSchema: () => DiffContentSchema,
@@ -198,12 +201,15 @@ __export(index_exports, {
198
201
  formatTraceEventLine: () => formatTraceEventLine,
199
202
  getSystemAssertion: () => getSystemAssertion,
200
203
  getSystemAssertions: () => getSystemAssertions,
204
+ isAllowedBuildCommandString: () => isAllowedBuildCommandString,
201
205
  isSystemAssertionId: () => isSystemAssertionId,
202
206
  isValidSkillFolderName: () => isValidSkillFolderName,
203
207
  normalizeBatchAssertionLink: () => normalizeBatchAssertionLink,
204
208
  normalizeModelId: () => normalizeModelId,
209
+ parseBuildCommandToArgv: () => parseBuildCommandToArgv,
205
210
  parseTraceEventLine: () => parseTraceEventLine,
206
- validateAssertionConfig: () => validateAssertionConfig
211
+ validateAssertionConfig: () => validateAssertionConfig,
212
+ validateBuildPassedParamsInAssertionLinks: () => validateBuildPassedParamsInAssertionLinks
207
213
  });
208
214
  module.exports = __toCommonJS(index_exports);
209
215
 
@@ -793,11 +799,42 @@ var EnvironmentSchema = import_zod21.z.object({
793
799
  });
794
800
 
795
801
  // src/scenario/test-scenario.ts
796
- var import_zod23 = require("zod");
802
+ var import_zod24 = require("zod");
797
803
 
798
804
  // src/assertion/assertion.ts
805
+ var import_zod23 = require("zod");
806
+
807
+ // src/assertion/build-passed-command.ts
799
808
  var import_zod22 = require("zod");
800
- var AssertionTypeSchema = import_zod22.z.enum([
809
/**
 * Build commands accepted for the `build_passed` assertion.
 * The order is significant for consumers that turn this list into an enum.
 */
var ALLOWED_BUILD_COMMANDS = [
  "yarn build",
  "npm run build",
  "pnpm run build",
  "pnpm build"
];

/** Command used when a `build_passed` assertion does not specify one. */
var DEFAULT_BUILD_PASSED_COMMAND = "yarn build";

/**
 * Pre-split argv for every allowlisted command, keyed by the exact command
 * string. Keeping the argv explicit avoids tokenising user-supplied text.
 */
var BUILD_COMMAND_ARGV = {
  "yarn build": ["yarn", "build"],
  "npm run build": ["npm", "run", "build"],
  "pnpm run build": ["pnpm", "run", "build"],
  "pnpm build": ["pnpm", "build"]
};

/**
 * Reports whether `command` (ignoring surrounding whitespace) is one of
 * ALLOWED_BUILD_COMMANDS.
 *
 * @param {unknown} command - Candidate command string.
 * @returns {boolean} `true` only for an allowlisted string; non-string input
 *   yields `false` instead of throwing.
 */
function isAllowedBuildCommandString(command) {
  if (typeof command !== "string") {
    return false;
  }
  return ALLOWED_BUILD_COMMANDS.includes(command.trim());
}

/**
 * Resolves an allowlisted build command to its argv form.
 *
 * @param {unknown} command - Candidate command string; surrounding whitespace
 *   is ignored.
 * @returns {string[] | null} A fresh argv array for an allowlisted command,
 *   or `null` for anything else (including non-string input).
 */
function parseBuildCommandToArgv(command) {
  if (typeof command !== "string") {
    return null;
  }
  const trimmed = command.trim();
  // Own-property check: the `in` operator also matches names inherited from
  // Object.prototype ("constructor", "toString", "__proto__", ...), which
  // would otherwise be returned as if they were argv arrays.
  if (!Object.prototype.hasOwnProperty.call(BUILD_COMMAND_ARGV, trimmed)) {
    return null;
  }
  // Return a copy so callers that mutate the result (shift/push) cannot
  // corrupt the shared lookup table.
  return [...BUILD_COMMAND_ARGV[trimmed]];
}
833
+ var enumTuple = ALLOWED_BUILD_COMMANDS;
834
+ var BuildPassedCommandStringSchema = import_zod22.z.enum(enumTuple);
835
+
836
+ // src/assertion/assertion.ts
837
+ var AssertionTypeSchema = import_zod23.z.enum([
801
838
  "skill_was_called",
802
839
  "tool_called_with_param",
803
840
  "build_passed",
@@ -806,61 +843,61 @@ var AssertionTypeSchema = import_zod22.z.enum([
806
843
  "llm_judge",
807
844
  "api_call"
808
845
  ]);
809
- var AssertionParameterTypeSchema = import_zod22.z.enum([
846
+ var AssertionParameterTypeSchema = import_zod23.z.enum([
810
847
  "string",
811
848
  "number",
812
849
  "boolean"
813
850
  ]);
814
- var AssertionParameterSchema = import_zod22.z.object({
851
+ var AssertionParameterSchema = import_zod23.z.object({
815
852
  /** Parameter name (used as key in params object) */
816
- name: import_zod22.z.string().min(1),
853
+ name: import_zod23.z.string().min(1),
817
854
  /** Display label for the parameter */
818
- label: import_zod22.z.string().min(1),
855
+ label: import_zod23.z.string().min(1),
819
856
  /** Parameter type */
820
857
  type: AssertionParameterTypeSchema,
821
858
  /** Whether this parameter is required */
822
- required: import_zod22.z.boolean(),
859
+ required: import_zod23.z.boolean(),
823
860
  /** Default value (optional, used when not provided) */
824
- defaultValue: import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean()]).optional(),
861
+ defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
825
862
  /** If true, parameter is hidden by default behind "Show advanced options" */
826
- advanced: import_zod22.z.boolean().optional()
863
+ advanced: import_zod23.z.boolean().optional()
827
864
  });
828
- var ScenarioAssertionLinkSchema = import_zod22.z.object({
865
+ var ScenarioAssertionLinkSchema = import_zod23.z.object({
829
866
  /** ID of the system assertion (e.g., 'system:skill_was_called') */
830
- assertionId: import_zod22.z.string(),
867
+ assertionId: import_zod23.z.string(),
831
868
  /** Parameter values for this assertion in this scenario */
832
- params: import_zod22.z.record(
833
- import_zod22.z.string(),
834
- import_zod22.z.union([import_zod22.z.string(), import_zod22.z.number(), import_zod22.z.boolean(), import_zod22.z.null()])
869
+ params: import_zod23.z.record(
870
+ import_zod23.z.string(),
871
+ import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
835
872
  ).optional()
836
873
  });
837
- var SkillWasCalledConfigSchema = import_zod22.z.object({
874
+ var SkillWasCalledConfigSchema = import_zod23.z.object({
838
875
  /** Names of the skills that must have been called */
839
- skillNames: import_zod22.z.array(import_zod22.z.string().min(1)).min(1)
876
+ skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
840
877
  });
841
- var CostConfigSchema = import_zod22.z.strictObject({
878
+ var CostConfigSchema = import_zod23.z.strictObject({
842
879
  /** Maximum allowed cost in USD */
843
- maxCostUsd: import_zod22.z.number().positive()
880
+ maxCostUsd: import_zod23.z.number().positive()
844
881
  });
845
- var ToolCalledWithParamConfigSchema = import_zod22.z.strictObject({
882
+ var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
846
883
  /** Name of the tool that must have been called */
847
- toolName: import_zod22.z.string().min(1),
884
+ toolName: import_zod23.z.string().min(1),
848
885
  /** JSON string of key-value pairs for expected parameters (substring match). Optional — when omitted, only checks tool presence. */
849
- expectedParams: import_zod22.z.string().min(1).optional(),
886
+ expectedParams: import_zod23.z.string().min(1).optional(),
850
887
  /** If true, the matching tool call must also have succeeded (step.success === true) */
851
- requireSuccess: import_zod22.z.boolean().optional()
888
+ requireSuccess: import_zod23.z.boolean().optional()
852
889
  });
853
- var BuildPassedConfigSchema = import_zod22.z.strictObject({
854
- /** Command to run (default: "yarn build") */
855
- command: import_zod22.z.string().optional(),
890
+ var BuildPassedConfigSchema = import_zod23.z.strictObject({
891
+ /** Allowlisted command only (default at runtime: "yarn build") */
892
+ command: BuildPassedCommandStringSchema.optional(),
856
893
  /** Expected exit code (default: 0) */
857
- expectedExitCode: import_zod22.z.number().int().optional()
894
+ expectedExitCode: import_zod23.z.number().int().optional()
858
895
  });
859
- var TimeConfigSchema = import_zod22.z.strictObject({
896
+ var TimeConfigSchema = import_zod23.z.strictObject({
860
897
  /** Maximum allowed duration in milliseconds */
861
- maxDurationMs: import_zod22.z.number().int().positive()
898
+ maxDurationMs: import_zod23.z.number().int().positive()
862
899
  });
863
- var LlmJudgeConfigSchema = import_zod22.z.object({
900
+ var LlmJudgeConfigSchema = import_zod23.z.object({
864
901
  /**
865
902
  * Prompt template with placeholders:
866
903
  * - {{output}}: agent's final output
@@ -871,65 +908,65 @@ var LlmJudgeConfigSchema = import_zod22.z.object({
871
908
  * - {{trace}}: step-by-step trace of tool calls
872
909
  * - Custom parameters defined in the parameters array
873
910
  */
874
- prompt: import_zod22.z.string().min(1),
911
+ prompt: import_zod23.z.string().min(1),
875
912
  /** Minimum score to pass (0-10, default 7) */
876
- minScore: import_zod22.z.number().int().min(0).max(10).optional(),
913
+ minScore: import_zod23.z.number().int().min(0).max(10).optional(),
877
914
  /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
878
- model: import_zod22.z.string().optional(),
915
+ model: import_zod23.z.string().optional(),
879
916
  /** Max output tokens */
880
- maxTokens: import_zod22.z.number().int().optional(),
917
+ maxTokens: import_zod23.z.number().int().optional(),
881
918
  /** Temperature (0-1) */
882
- temperature: import_zod22.z.number().min(0).max(1).optional(),
919
+ temperature: import_zod23.z.number().min(0).max(1).optional(),
883
920
  /** User-defined parameters for this assertion */
884
- parameters: import_zod22.z.array(AssertionParameterSchema).optional()
921
+ parameters: import_zod23.z.array(AssertionParameterSchema).optional()
885
922
  });
886
- var ApiCallConfigSchema = import_zod22.z.strictObject({
923
+ var ApiCallConfigSchema = import_zod23.z.strictObject({
887
924
  /** URL to call */
888
- url: import_zod22.z.string().min(1),
925
+ url: import_zod23.z.string().min(1),
889
926
  /** HTTP method (default GET) */
890
- method: import_zod22.z.enum(["GET", "POST"]).optional(),
927
+ method: import_zod23.z.enum(["GET", "POST"]).optional(),
891
928
  /** Request body (JSON string, for POST requests) */
892
- requestBody: import_zod22.z.string().optional(),
929
+ requestBody: import_zod23.z.string().optional(),
893
930
  /** Expected JSON response to validate against (subset match — extra fields in actual are OK) */
894
- expectedResponse: import_zod22.z.string().min(1),
931
+ expectedResponse: import_zod23.z.string().min(1),
895
932
  /** Request headers as JSON string of key-value pairs */
896
- requestHeaders: import_zod22.z.string().optional(),
933
+ requestHeaders: import_zod23.z.string().optional(),
897
934
  /** Request timeout in milliseconds (default 30000) */
898
- timeoutMs: import_zod22.z.number().int().positive().optional()
935
+ timeoutMs: import_zod23.z.number().int().positive().optional()
899
936
  });
900
937
  var AssertionBaseFields = {
901
938
  /** When true, the assertion's pass/fail logic is inverted (NOT operator). */
902
- negate: import_zod22.z.boolean().optional()
939
+ negate: import_zod23.z.boolean().optional()
903
940
  };
904
941
  var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
905
- type: import_zod22.z.literal("skill_was_called"),
942
+ type: import_zod23.z.literal("skill_was_called"),
906
943
  ...AssertionBaseFields
907
944
  });
908
945
  var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
909
- type: import_zod22.z.literal("tool_called_with_param"),
946
+ type: import_zod23.z.literal("tool_called_with_param"),
910
947
  ...AssertionBaseFields
911
948
  });
912
949
  var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
913
- type: import_zod22.z.literal("build_passed"),
950
+ type: import_zod23.z.literal("build_passed"),
914
951
  ...AssertionBaseFields
915
952
  });
916
953
  var CostAssertionSchema = CostConfigSchema.extend({
917
- type: import_zod22.z.literal("cost"),
954
+ type: import_zod23.z.literal("cost"),
918
955
  ...AssertionBaseFields
919
956
  });
920
957
  var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
921
- type: import_zod22.z.literal("llm_judge"),
958
+ type: import_zod23.z.literal("llm_judge"),
922
959
  ...AssertionBaseFields
923
960
  });
924
961
  var ApiCallAssertionSchema = ApiCallConfigSchema.extend({
925
- type: import_zod22.z.literal("api_call"),
962
+ type: import_zod23.z.literal("api_call"),
926
963
  ...AssertionBaseFields
927
964
  });
928
965
  var TimeAssertionSchema = TimeConfigSchema.extend({
929
- type: import_zod22.z.literal("time_limit"),
966
+ type: import_zod23.z.literal("time_limit"),
930
967
  ...AssertionBaseFields
931
968
  });
932
- var AssertionSchema = import_zod22.z.union([
969
+ var AssertionSchema = import_zod23.z.union([
933
970
  SkillWasCalledAssertionSchema,
934
971
  ToolCalledWithParamAssertionSchema,
935
972
  BuildPassedAssertionSchema,
@@ -938,7 +975,7 @@ var AssertionSchema = import_zod22.z.union([
938
975
  LlmJudgeAssertionSchema,
939
976
  ApiCallAssertionSchema
940
977
  ]);
941
- var AssertionConfigSchema = import_zod22.z.union([
978
+ var AssertionConfigSchema = import_zod23.z.union([
942
979
  LlmJudgeConfigSchema,
943
980
  // requires prompt - check first
944
981
  SkillWasCalledConfigSchema,
@@ -953,7 +990,7 @@ var AssertionConfigSchema = import_zod22.z.union([
953
990
  // requires maxCostUsd, uses strictObject
954
991
  BuildPassedConfigSchema,
955
992
  // all optional, uses strictObject to reject unknown keys
956
- import_zod22.z.object({})
993
+ import_zod23.z.object({})
957
994
  // fallback empty config
958
995
  ]);
959
996
  function validateAssertionConfig(type, config) {
@@ -977,63 +1014,322 @@ function validateAssertionConfig(type, config) {
977
1014
  }
978
1015
  }
979
1016
 
1017
+ // src/assertion/system-assertions.ts
1018
+ var SYSTEM_ASSERTION_IDS = {
1019
+ SKILL_WAS_CALLED: "system:skill_was_called",
1020
+ TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
1021
+ BUILD_PASSED: "system:build_passed",
1022
+ TIME_LIMIT: "system:time_limit",
1023
+ COST: "system:cost",
1024
+ LLM_JUDGE: "system:llm_judge",
1025
+ API_CALL: "system:api_call"
1026
+ };
1027
+ function isSystemAssertionId(id) {
1028
+ return id.startsWith("system:");
1029
+ }
1030
+ var SYSTEM_ASSERTIONS = {
1031
+ [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
1032
+ id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
1033
+ name: "Skill Was Called",
1034
+ description: "Check that one or more skills were invoked during the agent run",
1035
+ type: "skill_was_called",
1036
+ parameters: [
1037
+ {
1038
+ name: "skillNames",
1039
+ label: "Skills",
1040
+ type: "string",
1041
+ required: true
1042
+ },
1043
+ {
1044
+ name: "negate",
1045
+ label: "Negate (NOT operator)",
1046
+ type: "boolean",
1047
+ required: false,
1048
+ defaultValue: false
1049
+ }
1050
+ ]
1051
+ },
1052
+ [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
1053
+ id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
1054
+ name: "Tool Called With Param",
1055
+ description: "Check that a tool was called with expected parameters (tool name is substring matched)",
1056
+ type: "tool_called_with_param",
1057
+ parameters: [
1058
+ {
1059
+ name: "toolName",
1060
+ label: "Tool Name",
1061
+ type: "string",
1062
+ required: true
1063
+ },
1064
+ {
1065
+ name: "expectedParams",
1066
+ label: "Expected Parameters (JSON, substring match)",
1067
+ type: "string",
1068
+ required: false
1069
+ },
1070
+ {
1071
+ name: "requireSuccess",
1072
+ label: "Require Successful Call",
1073
+ type: "boolean",
1074
+ required: false,
1075
+ defaultValue: false,
1076
+ advanced: true
1077
+ },
1078
+ {
1079
+ name: "negate",
1080
+ label: "Negate (NOT operator)",
1081
+ type: "boolean",
1082
+ required: false,
1083
+ defaultValue: false
1084
+ }
1085
+ ]
1086
+ },
1087
+ [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
1088
+ id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
1089
+ name: "Build Passed",
1090
+ description: "Run a build command and verify it exits with expected code",
1091
+ type: "build_passed",
1092
+ parameters: [
1093
+ {
1094
+ name: "command",
1095
+ label: "Build Command",
1096
+ type: "string",
1097
+ required: false,
1098
+ defaultValue: "yarn build"
1099
+ },
1100
+ {
1101
+ name: "expectedExitCode",
1102
+ label: "Expected Exit Code",
1103
+ type: "number",
1104
+ required: false,
1105
+ defaultValue: 0
1106
+ },
1107
+ {
1108
+ name: "maxBuildTime",
1109
+ label: "Max Build Time (ms)",
1110
+ type: "number",
1111
+ required: false,
1112
+ advanced: true
1113
+ },
1114
+ {
1115
+ name: "maxMemory",
1116
+ label: "Max Memory (MB)",
1117
+ type: "number",
1118
+ required: false,
1119
+ advanced: true
1120
+ }
1121
+ ]
1122
+ },
1123
+ [SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
1124
+ id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
1125
+ name: "Time Limit",
1126
+ description: "Check that the scenario completed within a maximum duration",
1127
+ type: "time_limit",
1128
+ parameters: [
1129
+ {
1130
+ name: "maxDurationMs",
1131
+ label: "Max Duration (ms)",
1132
+ type: "number",
1133
+ required: true,
1134
+ defaultValue: 3e5
1135
+ }
1136
+ ]
1137
+ },
1138
+ [SYSTEM_ASSERTION_IDS.COST]: {
1139
+ id: SYSTEM_ASSERTION_IDS.COST,
1140
+ name: "Cost",
1141
+ description: "Check that the scenario LLM execution cost stays within a USD threshold",
1142
+ type: "cost",
1143
+ parameters: [
1144
+ {
1145
+ name: "maxCostUsd",
1146
+ label: "Max Cost (USD)",
1147
+ type: "number",
1148
+ required: true,
1149
+ defaultValue: 1
1150
+ }
1151
+ ]
1152
+ },
1153
+ [SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
1154
+ id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
1155
+ name: "LLM Judge",
1156
+ description: "LLM evaluates the output and assigns a score (0-10)",
1157
+ type: "llm_judge",
1158
+ parameters: [
1159
+ {
1160
+ name: "prompt",
1161
+ label: "Judge Prompt",
1162
+ type: "string",
1163
+ required: true,
1164
+ defaultValue: "Verify the output meets the acceptance criteria."
1165
+ },
1166
+ {
1167
+ name: "minScore",
1168
+ label: "Minimum Score (0-10)",
1169
+ type: "number",
1170
+ required: false,
1171
+ defaultValue: 7
1172
+ },
1173
+ {
1174
+ name: "model",
1175
+ label: "Model",
1176
+ type: "string",
1177
+ required: false
1178
+ }
1179
+ ]
1180
+ },
1181
+ [SYSTEM_ASSERTION_IDS.API_CALL]: {
1182
+ id: SYSTEM_ASSERTION_IDS.API_CALL,
1183
+ name: "API Call",
1184
+ description: "Call an API endpoint and verify the response contains expected data",
1185
+ type: "api_call",
1186
+ parameters: [
1187
+ {
1188
+ name: "url",
1189
+ label: "URL",
1190
+ type: "string",
1191
+ required: true
1192
+ },
1193
+ {
1194
+ name: "method",
1195
+ label: "HTTP Method",
1196
+ type: "string",
1197
+ required: false,
1198
+ defaultValue: "GET"
1199
+ },
1200
+ {
1201
+ name: "requestBody",
1202
+ label: "Request Body (JSON)",
1203
+ type: "string",
1204
+ required: false
1205
+ },
1206
+ {
1207
+ name: "expectedResponse",
1208
+ label: "Expected Response (JSON)",
1209
+ type: "string",
1210
+ required: true
1211
+ },
1212
+ {
1213
+ name: "requestHeaders",
1214
+ label: "Headers (JSON)",
1215
+ type: "string",
1216
+ required: false,
1217
+ advanced: true
1218
+ },
1219
+ {
1220
+ name: "timeoutMs",
1221
+ label: "Timeout (ms)",
1222
+ type: "number",
1223
+ required: false,
1224
+ defaultValue: 3e4,
1225
+ advanced: true
1226
+ }
1227
+ ]
1228
+ }
1229
+ };
1230
+ function getSystemAssertions() {
1231
+ return Object.values(SYSTEM_ASSERTIONS);
1232
+ }
1233
/**
 * Looks up a built-in assertion definition by its ID.
 *
 * @param {string} id - Assertion ID, e.g. "system:build_passed".
 * @returns {object | undefined} The matching entry of SYSTEM_ASSERTIONS, or
 *   `undefined` when no such entry exists.
 */
function getSystemAssertion(id) {
  // Own-property check: a bare bracket lookup would also resolve members
  // inherited from Object.prototype (e.g. "constructor", "toString") and
  // hand them back as if they were assertion definitions.
  if (!Object.prototype.hasOwnProperty.call(SYSTEM_ASSERTIONS, id)) {
    return void 0;
  }
  return SYSTEM_ASSERTIONS[id];
}
1236
+
980
1237
  // src/scenario/test-scenario.ts
981
1238
  var MAX_IMAGE_BASE64_LENGTH = 4 * Math.ceil(2 * 1024 * 1024 / 3);
982
- var TriggerPromptImageSchema = import_zod23.z.object({
1239
+ var TriggerPromptImageSchema = import_zod24.z.object({
983
1240
  /** Base64-encoded image data (no data URL prefix) */
984
- base64: import_zod23.z.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
1241
+ base64: import_zod24.z.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
985
1242
  /** MIME type of the image */
986
- mediaType: import_zod23.z.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
1243
+ mediaType: import_zod24.z.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
987
1244
  /** Original filename of the image */
988
- name: import_zod23.z.string()
1245
+ name: import_zod24.z.string()
989
1246
  });
990
- var ExpectedFileSchema = import_zod23.z.object({
1247
+ var ExpectedFileSchema = import_zod24.z.object({
991
1248
  /** Relative path where the file should be created */
992
- path: import_zod23.z.string(),
1249
+ path: import_zod24.z.string(),
993
1250
  /** Optional expected content */
994
- content: import_zod23.z.string().optional()
1251
+ content: import_zod24.z.string().optional()
995
1252
  });
996
1253
  var TestScenarioSchema = TenantEntitySchema.extend({
997
1254
  /** The prompt sent to the agent to trigger the task */
998
- triggerPrompt: import_zod23.z.string().min(10),
1255
+ triggerPrompt: import_zod24.z.string().min(10),
999
1256
  /** ID of the template to use for this scenario (null = no template) */
1000
- templateId: import_zod23.z.string().nullish(),
1257
+ templateId: import_zod24.z.string().nullish(),
1001
1258
  /** Inline assertions to evaluate for this scenario (legacy) */
1002
- assertions: import_zod23.z.array(AssertionSchema).optional(),
1259
+ assertions: import_zod24.z.array(AssertionSchema).optional(),
1003
1260
  /** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
1004
- assertionIds: import_zod23.z.array(import_zod23.z.string()).optional(),
1261
+ assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
1005
1262
  /** Linked assertions with per-scenario parameter values */
1006
- assertionLinks: import_zod23.z.array(ScenarioAssertionLinkSchema).optional(),
1263
+ assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional(),
1007
1264
  /** Tags for categorisation and filtering */
1008
- tags: import_zod23.z.array(import_zod23.z.string()).optional(),
1265
+ tags: import_zod24.z.array(import_zod24.z.string()).optional(),
1009
1266
  /** Base64-encoded images attached to the trigger prompt (max 3) */
1010
- triggerPromptImages: import_zod23.z.array(TriggerPromptImageSchema).max(3).optional()
1011
- });
1012
- var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
1267
+ triggerPromptImages: import_zod24.z.array(TriggerPromptImageSchema).max(3).optional()
1268
+ });
1269
/**
 * Checks the `command` param of every `system:build_passed` assertion link
 * and reports violations on the refinement context.
 *
 * A link is skipped when it targets another assertion or when its `command`
 * is absent (`undefined`/`null`). Otherwise the command must be a string that
 * passes `isAllowedBuildCommandString`.
 *
 * @param {Array<{assertionId: string, params?: Record<string, unknown>}> | null | undefined} links
 *   Assertion links to inspect; a falsy value is treated as "nothing to check".
 * @param {{addIssue: (issue: object) => void}} ctx - Refinement context that
 *   collects issues. Each issue path is
 *   `["assertionLinks", <index in links>, "params", "command"]`.
 * @returns {void}
 */
function validateBuildPassedParamsInAssertionLinks(links, ctx) {
  if (!links) return;
  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    // Tolerate holes/null entries rather than throwing on property access.
    if (!link || link.assertionId !== SYSTEM_ASSERTION_IDS.BUILD_PASSED) continue;
    const cmd = link.params?.command;
    if (cmd === void 0 || cmd === null) continue;
    const path = ["assertionLinks", i, "params", "command"];
    if (typeof cmd !== "string") {
      ctx.addIssue({
        code: import_zod24.z.ZodIssueCode.custom,
        message: "build_passed command must be a string",
        path
      });
      continue;
    }
    if (!isAllowedBuildCommandString(cmd)) {
      ctx.addIssue({
        code: import_zod24.z.ZodIssueCode.custom,
        // Built from the allowlist itself so the message cannot drift from
        // the commands that are actually accepted.
        message: `Invalid build_passed command. Allowed: ${ALLOWED_BUILD_COMMANDS.join(", ")}`,
        path
      });
    }
  }
}
1293
+ var TestScenarioCreateBaseSchema = TestScenarioSchema.omit({
1013
1294
  id: true,
1014
1295
  createdAt: true,
1015
1296
  updatedAt: true,
1016
1297
  deleted: true
1017
1298
  });
1018
- var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
1299
+ var CreateTestScenarioInputSchema = TestScenarioCreateBaseSchema.superRefine((data, ctx) => {
1300
+ validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
1301
+ });
1302
+ var UpdateTestScenarioInputSchema = TestScenarioCreateBaseSchema.partial().superRefine((data, ctx) => {
1303
+ if (data.assertionLinks !== void 0) {
1304
+ validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
1305
+ }
1306
+ });
1019
1307
 
1020
1308
  // src/scenario/batch-import.ts
1021
- var import_zod24 = require("zod");
1309
+ var import_zod25 = require("zod");
1022
1310
  var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
1023
- var BatchAssertionLinkSchema = import_zod24.z.union([
1024
- import_zod24.z.string().min(1),
1311
+ var BatchAssertionLinkSchema = import_zod25.z.union([
1312
+ import_zod25.z.string().min(1),
1025
1313
  ScenarioAssertionLinkSchema
1026
1314
  ]);
1027
- var BatchScenarioEntrySchema = import_zod24.z.object({
1028
- name: import_zod24.z.string().min(1, "name: Required"),
1029
- description: import_zod24.z.string().optional().default(""),
1030
- triggerPrompt: import_zod24.z.string().min(10, "triggerPrompt: Must be at least 10 characters"),
1031
- templateId: import_zod24.z.string().nullish(),
1032
- tags: import_zod24.z.array(import_zod24.z.string()).optional(),
1033
- assertionLinks: import_zod24.z.array(BatchAssertionLinkSchema).optional()
1315
/**
 * Schema for one scenario entry of a batch import payload.
 *
 * `assertionLinks` entries may be either a bare string or a full
 * ScenarioAssertionLinkSchema object. After the field-level checks, the
 * `build_passed` command params of the object links are validated.
 */
var BatchScenarioEntrySchema = import_zod25.z.object({
  name: import_zod25.z.string().min(1, "name: Required"),
  description: import_zod25.z.string().optional().default(""),
  triggerPrompt: import_zod25.z.string().min(10, "triggerPrompt: Must be at least 10 characters"),
  templateId: import_zod25.z.string().nullish(),
  tags: import_zod25.z.array(import_zod25.z.string()).optional(),
  assertionLinks: import_zod25.z.array(BatchAssertionLinkSchema).optional()
}).superRefine((data, ctx) => {
  if (!data.assertionLinks) return;
  // Keep every link at its original position so that issue paths
  // (["assertionLinks", index, ...]) point at the element the user actually
  // submitted. Filtering string links out first would shift the indices of
  // the remaining object links. String links carry no params, so a
  // params-less placeholder is skipped by the validator.
  const positionalLinks = data.assertionLinks.map(
    (link) => typeof link === "string" ? { assertionId: link } : link
  );
  validateBuildPassedParamsInAssertionLinks(positionalLinks, ctx);
});
1035
- var BatchImportPayloadSchema = import_zod24.z.object({
1036
- scenarios: import_zod24.z.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
1331
+ var BatchImportPayloadSchema = import_zod25.z.object({
1332
+ scenarios: import_zod25.z.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
1037
1333
  });
1038
1334
  var BATCH_IMPORT_LIMITS = {
1039
1335
  MAX_SCENARIOS: 100,
@@ -1055,29 +1351,29 @@ function normalizeBatchAssertionLink(link) {
1055
1351
  }
1056
1352
  return link;
1057
1353
  }
1058
- var BatchResultItemSchema = import_zod24.z.object({
1059
- index: import_zod24.z.number(),
1060
- name: import_zod24.z.string(),
1061
- status: import_zod24.z.enum(["valid", "invalid"]),
1062
- id: import_zod24.z.string().nullable().optional(),
1063
- errors: import_zod24.z.array(import_zod24.z.string()).optional()
1064
- });
1065
- var BatchSummarySchema = import_zod24.z.object({
1066
- total: import_zod24.z.number(),
1067
- valid: import_zod24.z.number(),
1068
- invalid: import_zod24.z.number(),
1069
- created: import_zod24.z.number()
1070
- });
1071
- var BatchImportResponseSchema = import_zod24.z.object({
1354
+ var BatchResultItemSchema = import_zod25.z.object({
1355
+ index: import_zod25.z.number(),
1356
+ name: import_zod25.z.string(),
1357
+ status: import_zod25.z.enum(["valid", "invalid"]),
1358
+ id: import_zod25.z.string().nullable().optional(),
1359
+ errors: import_zod25.z.array(import_zod25.z.string()).optional()
1360
+ });
1361
+ var BatchSummarySchema = import_zod25.z.object({
1362
+ total: import_zod25.z.number(),
1363
+ valid: import_zod25.z.number(),
1364
+ invalid: import_zod25.z.number(),
1365
+ created: import_zod25.z.number()
1366
+ });
1367
+ var BatchImportResponseSchema = import_zod25.z.object({
1072
1368
  summary: BatchSummarySchema,
1073
- results: import_zod24.z.array(BatchResultItemSchema)
1369
+ results: import_zod25.z.array(BatchResultItemSchema)
1074
1370
  });
1075
1371
 
1076
1372
  // src/suite/test-suite.ts
1077
- var import_zod25 = require("zod");
1373
+ var import_zod26 = require("zod");
1078
1374
  var TestSuiteSchema = TenantEntitySchema.extend({
1079
1375
  /** IDs of test scenarios in this suite */
1080
- scenarioIds: import_zod25.z.array(import_zod25.z.string())
1376
+ scenarioIds: import_zod26.z.array(import_zod26.z.string())
1081
1377
  });
1082
1378
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1083
1379
  id: true,
@@ -1088,21 +1384,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
1088
1384
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
1089
1385
 
1090
1386
  // src/evaluation/metrics.ts
1091
- var import_zod26 = require("zod");
1092
- var TokenUsageSchema = import_zod26.z.object({
1093
- prompt: import_zod26.z.number(),
1094
- completion: import_zod26.z.number(),
1095
- total: import_zod26.z.number()
1096
- });
1097
- var EvalMetricsSchema = import_zod26.z.object({
1098
- totalAssertions: import_zod26.z.number(),
1099
- passed: import_zod26.z.number(),
1100
- failed: import_zod26.z.number(),
1101
- skipped: import_zod26.z.number(),
1102
- errors: import_zod26.z.number(),
1103
- passRate: import_zod26.z.number(),
1104
- avgDuration: import_zod26.z.number(),
1105
- totalDuration: import_zod26.z.number()
1387
+ var import_zod27 = require("zod");
1388
+ var TokenUsageSchema = import_zod27.z.object({
1389
+ prompt: import_zod27.z.number(),
1390
+ completion: import_zod27.z.number(),
1391
+ total: import_zod27.z.number()
1392
+ });
1393
+ var EvalMetricsSchema = import_zod27.z.object({
1394
+ totalAssertions: import_zod27.z.number(),
1395
+ passed: import_zod27.z.number(),
1396
+ failed: import_zod27.z.number(),
1397
+ skipped: import_zod27.z.number(),
1398
+ errors: import_zod27.z.number(),
1399
+ passRate: import_zod27.z.number(),
1400
+ avgDuration: import_zod27.z.number(),
1401
+ totalDuration: import_zod27.z.number()
1106
1402
  });
1107
1403
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1108
1404
  EvalStatus2["PENDING"] = "pending";
@@ -1112,7 +1408,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
1112
1408
  EvalStatus2["CANCELLED"] = "cancelled";
1113
1409
  return EvalStatus2;
1114
1410
  })(EvalStatus || {});
1115
- var EvalStatusSchema = import_zod26.z.enum(EvalStatus);
1411
+ var EvalStatusSchema = import_zod27.z.enum(EvalStatus);
1116
1412
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1117
1413
  LLMStepType2["COMPLETION"] = "completion";
1118
1414
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -1120,54 +1416,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
1120
1416
  LLMStepType2["THINKING"] = "thinking";
1121
1417
  return LLMStepType2;
1122
1418
  })(LLMStepType || {});
1123
- var LLMTraceStepSchema = import_zod26.z.object({
1124
- id: import_zod26.z.string(),
1125
- stepNumber: import_zod26.z.number(),
1126
- type: import_zod26.z.enum(LLMStepType),
1127
- model: import_zod26.z.string(),
1128
- provider: import_zod26.z.string(),
1129
- startedAt: import_zod26.z.string(),
1130
- durationMs: import_zod26.z.number(),
1419
+ var LLMTraceStepSchema = import_zod27.z.object({
1420
+ id: import_zod27.z.string(),
1421
+ stepNumber: import_zod27.z.number(),
1422
+ type: import_zod27.z.enum(LLMStepType),
1423
+ model: import_zod27.z.string(),
1424
+ provider: import_zod27.z.string(),
1425
+ startedAt: import_zod27.z.string(),
1426
+ durationMs: import_zod27.z.number(),
1131
1427
  tokenUsage: TokenUsageSchema,
1132
- costUsd: import_zod26.z.number(),
1133
- toolName: import_zod26.z.string().optional(),
1134
- toolArguments: import_zod26.z.string().optional(),
1135
- inputPreview: import_zod26.z.string().optional(),
1136
- outputPreview: import_zod26.z.string().optional(),
1137
- success: import_zod26.z.boolean(),
1138
- error: import_zod26.z.string().optional(),
1139
- turnIndex: import_zod26.z.number().optional()
1140
- });
1141
- var LLMBreakdownStatsSchema = import_zod26.z.object({
1142
- count: import_zod26.z.number(),
1143
- durationMs: import_zod26.z.number(),
1144
- tokens: import_zod26.z.number(),
1145
- costUsd: import_zod26.z.number()
1146
- });
1147
- var LLMTraceSummarySchema = import_zod26.z.object({
1148
- totalSteps: import_zod26.z.number(),
1149
- totalTurns: import_zod26.z.number().optional(),
1150
- totalDurationMs: import_zod26.z.number(),
1428
+ costUsd: import_zod27.z.number(),
1429
+ toolName: import_zod27.z.string().optional(),
1430
+ toolArguments: import_zod27.z.string().optional(),
1431
+ inputPreview: import_zod27.z.string().optional(),
1432
+ outputPreview: import_zod27.z.string().optional(),
1433
+ success: import_zod27.z.boolean(),
1434
+ error: import_zod27.z.string().optional(),
1435
+ turnIndex: import_zod27.z.number().optional()
1436
+ });
1437
+ var LLMBreakdownStatsSchema = import_zod27.z.object({
1438
+ count: import_zod27.z.number(),
1439
+ durationMs: import_zod27.z.number(),
1440
+ tokens: import_zod27.z.number(),
1441
+ costUsd: import_zod27.z.number()
1442
+ });
1443
+ var LLMTraceSummarySchema = import_zod27.z.object({
1444
+ totalSteps: import_zod27.z.number(),
1445
+ totalTurns: import_zod27.z.number().optional(),
1446
+ totalDurationMs: import_zod27.z.number(),
1151
1447
  totalTokens: TokenUsageSchema,
1152
- totalCostUsd: import_zod26.z.number(),
1153
- stepTypeBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema).optional(),
1154
- modelBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema),
1155
- modelsUsed: import_zod26.z.array(import_zod26.z.string())
1156
- });
1157
- var LLMTraceSchema = import_zod26.z.object({
1158
- id: import_zod26.z.string(),
1159
- steps: import_zod26.z.array(LLMTraceStepSchema),
1448
+ totalCostUsd: import_zod27.z.number(),
1449
+ stepTypeBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema).optional(),
1450
+ modelBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema),
1451
+ modelsUsed: import_zod27.z.array(import_zod27.z.string())
1452
+ });
1453
+ var LLMTraceSchema = import_zod27.z.object({
1454
+ id: import_zod27.z.string(),
1455
+ steps: import_zod27.z.array(LLMTraceStepSchema),
1160
1456
  summary: LLMTraceSummarySchema
1161
1457
  });
1162
1458
 
1163
1459
  // src/evaluation/eval-result.ts
1164
- var import_zod30 = require("zod");
1460
+ var import_zod31 = require("zod");
1165
1461
 
1166
1462
  // src/evaluation/eval-run.ts
1167
- var import_zod28 = require("zod");
1463
+ var import_zod29 = require("zod");
1168
1464
 
1169
1465
  // src/evaluation/live-trace.ts
1170
- var import_zod27 = require("zod");
1466
+ var import_zod28 = require("zod");
1171
1467
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1172
1468
  LiveTraceEventType2["THINKING"] = "thinking";
1173
1469
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -1181,37 +1477,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
1181
1477
  LiveTraceEventType2["USER"] = "user";
1182
1478
  return LiveTraceEventType2;
1183
1479
  })(LiveTraceEventType || {});
1184
- var LiveTraceEventSchema = import_zod27.z.object({
1480
+ var LiveTraceEventSchema = import_zod28.z.object({
1185
1481
  /** The evaluation run ID */
1186
- evalRunId: import_zod27.z.string(),
1482
+ evalRunId: import_zod28.z.string(),
1187
1483
  /** The scenario ID being executed */
1188
- scenarioId: import_zod27.z.string(),
1484
+ scenarioId: import_zod28.z.string(),
1189
1485
  /** The scenario name for display */
1190
- scenarioName: import_zod27.z.string(),
1486
+ scenarioName: import_zod28.z.string(),
1191
1487
  /** The target ID (skill, agent, etc.) */
1192
- targetId: import_zod27.z.string(),
1488
+ targetId: import_zod28.z.string(),
1193
1489
  /** The target name for display */
1194
- targetName: import_zod27.z.string(),
1490
+ targetName: import_zod28.z.string(),
1195
1491
  /** Step number in the current scenario execution */
1196
- stepNumber: import_zod27.z.number(),
1492
+ stepNumber: import_zod28.z.number(),
1197
1493
  /** Type of trace event */
1198
- type: import_zod27.z.enum(LiveTraceEventType),
1494
+ type: import_zod28.z.enum(LiveTraceEventType),
1199
1495
  /** Tool name if this is a tool_use event */
1200
- toolName: import_zod27.z.string().optional(),
1496
+ toolName: import_zod28.z.string().optional(),
1201
1497
  /** Tool arguments preview (truncated JSON) */
1202
- toolArgs: import_zod27.z.string().optional(),
1498
+ toolArgs: import_zod28.z.string().optional(),
1203
1499
  /** Output preview (truncated text) */
1204
- outputPreview: import_zod27.z.string().optional(),
1500
+ outputPreview: import_zod28.z.string().optional(),
1205
1501
  /** File path for file operations */
1206
- filePath: import_zod27.z.string().optional(),
1502
+ filePath: import_zod28.z.string().optional(),
1207
1503
  /** Elapsed time in milliseconds for progress events */
1208
- elapsedMs: import_zod27.z.number().optional(),
1504
+ elapsedMs: import_zod28.z.number().optional(),
1209
1505
  /** Thinking/reasoning text from Claude */
1210
- thinking: import_zod27.z.string().optional(),
1506
+ thinking: import_zod28.z.string().optional(),
1211
1507
  /** Timestamp when this event occurred */
1212
- timestamp: import_zod27.z.string(),
1508
+ timestamp: import_zod28.z.string(),
1213
1509
  /** Whether this is the final event for this scenario */
1214
- isComplete: import_zod27.z.boolean()
1510
+ isComplete: import_zod28.z.boolean()
1215
1511
  });
1216
1512
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
1217
1513
  function parseTraceEventLine(line) {
@@ -1240,40 +1536,40 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
1240
1536
  TriggerType2["SCHEDULED"] = "SCHEDULED";
1241
1537
  return TriggerType2;
1242
1538
  })(TriggerType || {});
1243
- var TriggerMetadataSchema = import_zod28.z.object({
1244
- version: import_zod28.z.string().optional(),
1245
- resourceUpdated: import_zod28.z.array(import_zod28.z.string()).optional(),
1246
- scheduleId: import_zod28.z.string().optional()
1539
+ var TriggerMetadataSchema = import_zod29.z.object({
1540
+ version: import_zod29.z.string().optional(),
1541
+ resourceUpdated: import_zod29.z.array(import_zod29.z.string()).optional(),
1542
+ scheduleId: import_zod29.z.string().optional()
1247
1543
  });
1248
- var TriggerSchema = import_zod28.z.object({
1249
- id: import_zod28.z.string(),
1544
+ var TriggerSchema = import_zod29.z.object({
1545
+ id: import_zod29.z.string(),
1250
1546
  metadata: TriggerMetadataSchema.optional(),
1251
- type: import_zod28.z.nativeEnum(TriggerType)
1547
+ type: import_zod29.z.nativeEnum(TriggerType)
1252
1548
  });
1253
- var DiffLineTypeSchema = import_zod28.z.enum(["added", "removed", "unchanged"]);
1254
- var DiffLineSchema = import_zod28.z.object({
1549
+ var DiffLineTypeSchema = import_zod29.z.enum(["added", "removed", "unchanged"]);
1550
+ var DiffLineSchema = import_zod29.z.object({
1255
1551
  type: DiffLineTypeSchema,
1256
- content: import_zod28.z.string(),
1257
- lineNumber: import_zod28.z.number()
1258
- });
1259
- var DiffContentSchema = import_zod28.z.object({
1260
- path: import_zod28.z.string(),
1261
- expected: import_zod28.z.string(),
1262
- actual: import_zod28.z.string(),
1263
- diffLines: import_zod28.z.array(DiffLineSchema),
1264
- renamedFrom: import_zod28.z.string().optional(),
1552
+ content: import_zod29.z.string(),
1553
+ lineNumber: import_zod29.z.number()
1554
+ });
1555
+ var DiffContentSchema = import_zod29.z.object({
1556
+ path: import_zod29.z.string(),
1557
+ expected: import_zod29.z.string(),
1558
+ actual: import_zod29.z.string(),
1559
+ diffLines: import_zod29.z.array(DiffLineSchema),
1560
+ renamedFrom: import_zod29.z.string().optional(),
1265
1561
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1266
- isInfrastructure: import_zod28.z.boolean().optional()
1562
+ isInfrastructure: import_zod29.z.boolean().optional()
1267
1563
  });
1268
- var CommandExecutionSchema = import_zod28.z.object({
1269
- command: import_zod28.z.string(),
1270
- exitCode: import_zod28.z.number(),
1271
- output: import_zod28.z.string().optional(),
1272
- duration: import_zod28.z.number()
1564
+ var CommandExecutionSchema = import_zod29.z.object({
1565
+ command: import_zod29.z.string(),
1566
+ exitCode: import_zod29.z.number(),
1567
+ output: import_zod29.z.string().optional(),
1568
+ duration: import_zod29.z.number()
1273
1569
  });
1274
- var FileModificationSchema = import_zod28.z.object({
1275
- path: import_zod28.z.string(),
1276
- action: import_zod28.z.enum(["created", "modified", "deleted"])
1570
+ var FileModificationSchema = import_zod29.z.object({
1571
+ path: import_zod29.z.string(),
1572
+ action: import_zod29.z.enum(["created", "modified", "deleted"])
1277
1573
  });
1278
1574
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1279
1575
  TemplateFileStatus2["NEW"] = "new";
@@ -1281,62 +1577,62 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1281
1577
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
1282
1578
  return TemplateFileStatus2;
1283
1579
  })(TemplateFileStatus || {});
1284
- var TemplateFileSchema = import_zod28.z.object({
1580
+ var TemplateFileSchema = import_zod29.z.object({
1285
1581
  /** Relative path within the template */
1286
- path: import_zod28.z.string(),
1582
+ path: import_zod29.z.string(),
1287
1583
  /** Full file content after execution */
1288
- content: import_zod28.z.string(),
1584
+ content: import_zod29.z.string(),
1289
1585
  /** File status (new, modified, unchanged) */
1290
- status: import_zod28.z.enum(["new", "modified", "unchanged"]),
1586
+ status: import_zod29.z.enum(["new", "modified", "unchanged"]),
1291
1587
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1292
- isInfrastructure: import_zod28.z.boolean().optional()
1588
+ isInfrastructure: import_zod29.z.boolean().optional()
1293
1589
  });
1294
- var ApiCallSchema = import_zod28.z.object({
1295
- endpoint: import_zod28.z.string(),
1296
- tokensUsed: import_zod28.z.number(),
1297
- duration: import_zod28.z.number()
1590
+ var ApiCallSchema = import_zod29.z.object({
1591
+ endpoint: import_zod29.z.string(),
1592
+ tokensUsed: import_zod29.z.number(),
1593
+ duration: import_zod29.z.number()
1298
1594
  });
1299
- var ExecutionTraceSchema = import_zod28.z.object({
1300
- commands: import_zod28.z.array(CommandExecutionSchema),
1301
- filesModified: import_zod28.z.array(FileModificationSchema),
1302
- apiCalls: import_zod28.z.array(ApiCallSchema),
1303
- totalDuration: import_zod28.z.number()
1595
+ var ExecutionTraceSchema = import_zod29.z.object({
1596
+ commands: import_zod29.z.array(CommandExecutionSchema),
1597
+ filesModified: import_zod29.z.array(FileModificationSchema),
1598
+ apiCalls: import_zod29.z.array(ApiCallSchema),
1599
+ totalDuration: import_zod29.z.number()
1304
1600
  });
1305
- var RunAnalysisFindingSchema = import_zod28.z.object({
1306
- category: import_zod28.z.enum([
1601
+ var RunAnalysisFindingSchema = import_zod29.z.object({
1602
+ category: import_zod29.z.enum([
1307
1603
  "failure_pattern",
1308
1604
  "cost_waste",
1309
1605
  "flakiness",
1310
1606
  "inefficiency",
1311
1607
  "positive"
1312
1608
  ]),
1313
- severity: import_zod28.z.enum(["high", "medium", "low"]),
1314
- description: import_zod28.z.string(),
1315
- affectedScenarios: import_zod28.z.array(import_zod28.z.string()),
1316
- recommendation: import_zod28.z.string().optional()
1609
+ severity: import_zod29.z.enum(["high", "medium", "low"]),
1610
+ description: import_zod29.z.string(),
1611
+ affectedScenarios: import_zod29.z.array(import_zod29.z.string()),
1612
+ recommendation: import_zod29.z.string().optional()
1317
1613
  });
1318
- var RunAnalysisSchema = import_zod28.z.object({
1319
- generatedAt: import_zod28.z.string(),
1320
- summary: import_zod28.z.string(),
1321
- findings: import_zod28.z.array(RunAnalysisFindingSchema)
1614
+ var RunAnalysisSchema = import_zod29.z.object({
1615
+ generatedAt: import_zod29.z.string(),
1616
+ summary: import_zod29.z.string(),
1617
+ findings: import_zod29.z.array(RunAnalysisFindingSchema)
1322
1618
  });
1323
1619
  var EvalRunSchema = TenantEntitySchema.extend({
1324
1620
  /** Agent ID for this run */
1325
- agentId: import_zod28.z.string().optional(),
1621
+ agentId: import_zod29.z.string().optional(),
1326
1622
  /** Preset ID that originated this run (optional) */
1327
- presetId: import_zod28.z.string().optional(),
1623
+ presetId: import_zod29.z.string().optional(),
1328
1624
  /** Skill IDs for this run */
1329
- skillIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1625
+ skillIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1330
1626
  /** Map of skillId to skillVersionId for this run */
1331
- skillVersions: import_zod28.z.record(import_zod28.z.string(), import_zod28.z.string()).optional(),
1627
+ skillVersions: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.string()).optional(),
1332
1628
  /** Scenario IDs to run (always present — resolved server-side from tags when needed) */
1333
- scenarioIds: import_zod28.z.array(import_zod28.z.string()),
1629
+ scenarioIds: import_zod29.z.array(import_zod29.z.string()),
1334
1630
  /** Current status */
1335
1631
  status: EvalStatusSchema,
1336
1632
  /** Progress percentage (0-100) */
1337
- progress: import_zod28.z.number(),
1633
+ progress: import_zod29.z.number(),
1338
1634
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
1339
- results: import_zod28.z.array(import_zod28.z.lazy(() => EvalRunResultSchema)),
1635
+ results: import_zod29.z.array(import_zod29.z.lazy(() => EvalRunResultSchema)),
1340
1636
  /** Aggregated metrics across all results */
1341
1637
  aggregateMetrics: EvalMetricsSchema,
1342
1638
  /** Aggregated LLM trace summary */
@@ -1344,41 +1640,41 @@ var EvalRunSchema = TenantEntitySchema.extend({
1344
1640
  /** What triggered this run */
1345
1641
  trigger: TriggerSchema.optional(),
1346
1642
  /** When the run started (set when evaluation is triggered) */
1347
- startedAt: import_zod28.z.string().optional(),
1643
+ startedAt: import_zod29.z.string().optional(),
1348
1644
  /** When the run completed */
1349
- completedAt: import_zod28.z.string().optional(),
1645
+ completedAt: import_zod29.z.string().optional(),
1350
1646
  /** Live trace events captured during execution (for playback on results page) */
1351
- liveTraceEvents: import_zod28.z.array(LiveTraceEventSchema).optional(),
1647
+ liveTraceEvents: import_zod29.z.array(LiveTraceEventSchema).optional(),
1352
1648
  /** Remote job ID for tracking execution in Dev Machines */
1353
- jobId: import_zod28.z.string().optional(),
1649
+ jobId: import_zod29.z.string().optional(),
1354
1650
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
1355
- jobStatus: import_zod28.z.string().optional(),
1651
+ jobStatus: import_zod29.z.string().optional(),
1356
1652
  /** Remote job error message if the job failed */
1357
- jobError: import_zod28.z.string().optional(),
1653
+ jobError: import_zod29.z.string().optional(),
1358
1654
  /** Timestamp of the last job status check */
1359
- jobStatusCheckedAt: import_zod28.z.string().optional(),
1655
+ jobStatusCheckedAt: import_zod29.z.string().optional(),
1360
1656
  /** MCP server IDs to enable for this run (optional) */
1361
- mcpIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1657
+ mcpIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1362
1658
  /** Sub-agent IDs to enable for this run (optional) */
1363
- subAgentIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1659
+ subAgentIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1364
1660
  /** Rule IDs to enable for this run (optional) */
1365
- ruleIds: import_zod28.z.array(import_zod28.z.string()).optional(),
1661
+ ruleIds: import_zod29.z.array(import_zod29.z.string()).optional(),
1366
1662
  /** Tags used to select scenarios for this run (for traceability) */
1367
- tags: import_zod28.z.array(import_zod28.z.string()).optional(),
1663
+ tags: import_zod29.z.array(import_zod29.z.string()).optional(),
1368
1664
  /** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
1369
- runsPerScenario: import_zod28.z.number().int().min(1).max(20).optional(),
1665
+ runsPerScenario: import_zod29.z.number().int().min(1).max(20).optional(),
1370
1666
  /** Snapshot of agent configuration captured at run creation time */
1371
- agentSnapshot: import_zod28.z.object({
1372
- name: import_zod28.z.string().optional(),
1667
+ agentSnapshot: import_zod29.z.object({
1668
+ name: import_zod29.z.string().optional(),
1373
1669
  agentType: AgentTypeSchema.optional(),
1374
1670
  runCommand: AgentRunCommandSchema.optional(),
1375
- systemPrompt: import_zod28.z.string().nullable().optional(),
1671
+ systemPrompt: import_zod29.z.string().nullable().optional(),
1376
1672
  modelConfig: ModelConfigSchema.optional()
1377
1673
  }).optional(),
1378
1674
  /** UUID linking all runs in a comparison group */
1379
- comparisonGroupId: import_zod28.z.string().optional(),
1675
+ comparisonGroupId: import_zod29.z.string().optional(),
1380
1676
  /** Human-readable label for this variant (e.g., "MCP: Wix Stores") */
1381
- comparisonLabel: import_zod28.z.string().optional(),
1677
+ comparisonLabel: import_zod29.z.string().optional(),
1382
1678
  /** LLM-generated analysis of the completed run */
1383
1679
  runAnalysis: RunAnalysisSchema.optional()
1384
1680
  });
@@ -1396,60 +1692,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
1396
1692
  agentSnapshot: true
1397
1693
  }).extend({
1398
1694
  /** Optional on input — backend resolves from tags when not provided */
1399
- scenarioIds: import_zod28.z.array(import_zod28.z.string()).optional()
1695
+ scenarioIds: import_zod29.z.array(import_zod29.z.string()).optional()
1400
1696
  }).refine(
1401
1697
  (data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
1402
1698
  { message: "Either scenarioIds or tags must be provided" }
1403
1699
  );
1404
- var EvaluationProgressSchema = import_zod28.z.object({
1405
- runId: import_zod28.z.string(),
1406
- targetId: import_zod28.z.string(),
1407
- totalScenarios: import_zod28.z.number(),
1408
- completedScenarios: import_zod28.z.number(),
1409
- scenarioProgress: import_zod28.z.array(
1410
- import_zod28.z.object({
1411
- scenarioId: import_zod28.z.string(),
1412
- currentStep: import_zod28.z.string(),
1413
- error: import_zod28.z.string().optional()
1700
+ var EvaluationProgressSchema = import_zod29.z.object({
1701
+ runId: import_zod29.z.string(),
1702
+ targetId: import_zod29.z.string(),
1703
+ totalScenarios: import_zod29.z.number(),
1704
+ completedScenarios: import_zod29.z.number(),
1705
+ scenarioProgress: import_zod29.z.array(
1706
+ import_zod29.z.object({
1707
+ scenarioId: import_zod29.z.string(),
1708
+ currentStep: import_zod29.z.string(),
1709
+ error: import_zod29.z.string().optional()
1414
1710
  })
1415
1711
  ),
1416
- createdAt: import_zod28.z.number()
1417
- });
1418
- var EvaluationLogSchema = import_zod28.z.object({
1419
- runId: import_zod28.z.string(),
1420
- scenarioId: import_zod28.z.string(),
1421
- log: import_zod28.z.object({
1422
- level: import_zod28.z.enum(["info", "error", "debug"]),
1423
- message: import_zod28.z.string().optional(),
1424
- args: import_zod28.z.array(import_zod28.z.any()).optional(),
1425
- error: import_zod28.z.string().optional()
1712
+ createdAt: import_zod29.z.number()
1713
+ });
1714
+ var EvaluationLogSchema = import_zod29.z.object({
1715
+ runId: import_zod29.z.string(),
1716
+ scenarioId: import_zod29.z.string(),
1717
+ log: import_zod29.z.object({
1718
+ level: import_zod29.z.enum(["info", "error", "debug"]),
1719
+ message: import_zod29.z.string().optional(),
1720
+ args: import_zod29.z.array(import_zod29.z.any()).optional(),
1721
+ error: import_zod29.z.string().optional()
1426
1722
  })
1427
1723
  });
1428
1724
  var LLM_TIMEOUT = 12e4;
1429
1725
 
1430
1726
  // src/evaluation/conversation.ts
1431
- var import_zod29 = require("zod");
1432
- var TextBlockSchema = import_zod29.z.object({
1433
- type: import_zod29.z.literal("text"),
1434
- text: import_zod29.z.string()
1435
- });
1436
- var ThinkingBlockSchema = import_zod29.z.object({
1437
- type: import_zod29.z.literal("thinking"),
1438
- thinking: import_zod29.z.string()
1439
- });
1440
- var ToolUseBlockSchema = import_zod29.z.object({
1441
- type: import_zod29.z.literal("tool_use"),
1442
- toolName: import_zod29.z.string(),
1443
- toolId: import_zod29.z.string(),
1444
- input: import_zod29.z.unknown()
1445
- });
1446
- var ToolResultBlockSchema = import_zod29.z.object({
1447
- type: import_zod29.z.literal("tool_result"),
1448
- toolUseId: import_zod29.z.string(),
1449
- content: import_zod29.z.string(),
1450
- isError: import_zod29.z.boolean().optional()
1451
- });
1452
- var ConversationBlockSchema = import_zod29.z.discriminatedUnion("type", [
1727
+ var import_zod30 = require("zod");
1728
+ var TextBlockSchema = import_zod30.z.object({
1729
+ type: import_zod30.z.literal("text"),
1730
+ text: import_zod30.z.string()
1731
+ });
1732
+ var ThinkingBlockSchema = import_zod30.z.object({
1733
+ type: import_zod30.z.literal("thinking"),
1734
+ thinking: import_zod30.z.string()
1735
+ });
1736
+ var ToolUseBlockSchema = import_zod30.z.object({
1737
+ type: import_zod30.z.literal("tool_use"),
1738
+ toolName: import_zod30.z.string(),
1739
+ toolId: import_zod30.z.string(),
1740
+ input: import_zod30.z.unknown()
1741
+ });
1742
+ var ToolResultBlockSchema = import_zod30.z.object({
1743
+ type: import_zod30.z.literal("tool_result"),
1744
+ toolUseId: import_zod30.z.string(),
1745
+ content: import_zod30.z.string(),
1746
+ isError: import_zod30.z.boolean().optional()
1747
+ });
1748
+ var ConversationBlockSchema = import_zod30.z.discriminatedUnion("type", [
1453
1749
  TextBlockSchema,
1454
1750
  ThinkingBlockSchema,
1455
1751
  ToolUseBlockSchema,
@@ -1460,18 +1756,18 @@ var ConversationMessageRoles = [
1460
1756
  "user",
1461
1757
  "system"
1462
1758
  ];
1463
- var ConversationMessageSchema = import_zod29.z.object({
1464
- role: import_zod29.z.enum(ConversationMessageRoles),
1465
- content: import_zod29.z.array(ConversationBlockSchema),
1466
- timestamp: import_zod29.z.string()
1759
+ var ConversationMessageSchema = import_zod30.z.object({
1760
+ role: import_zod30.z.enum(ConversationMessageRoles),
1761
+ content: import_zod30.z.array(ConversationBlockSchema),
1762
+ timestamp: import_zod30.z.string()
1467
1763
  });
1468
- var ScenarioConversationSchema = import_zod29.z.object({
1469
- id: import_zod29.z.string(),
1470
- projectId: import_zod29.z.string(),
1471
- evalRunId: import_zod29.z.string(),
1472
- resultId: import_zod29.z.string(),
1473
- messages: import_zod29.z.array(ConversationMessageSchema),
1474
- createdAt: import_zod29.z.string()
1764
+ var ScenarioConversationSchema = import_zod30.z.object({
1765
+ id: import_zod30.z.string(),
1766
+ projectId: import_zod30.z.string(),
1767
+ evalRunId: import_zod30.z.string(),
1768
+ resultId: import_zod30.z.string(),
1769
+ messages: import_zod30.z.array(ConversationMessageSchema),
1770
+ createdAt: import_zod30.z.string()
1475
1771
  });
1476
1772
 
1477
1773
  // src/evaluation/eval-result.ts
@@ -1482,98 +1778,98 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
1482
1778
  AssertionResultStatus2["ERROR"] = "error";
1483
1779
  return AssertionResultStatus2;
1484
1780
  })(AssertionResultStatus || {});
1485
- var AssertionResultSchema = import_zod30.z.object({
1486
- id: import_zod30.z.string(),
1487
- assertionId: import_zod30.z.string(),
1488
- assertionType: import_zod30.z.string(),
1489
- assertionName: import_zod30.z.string(),
1490
- status: import_zod30.z.enum(AssertionResultStatus),
1491
- message: import_zod30.z.string().optional(),
1492
- expected: import_zod30.z.string().optional(),
1493
- actual: import_zod30.z.string().optional(),
1494
- duration: import_zod30.z.number().optional(),
1495
- details: import_zod30.z.record(import_zod30.z.string(), import_zod30.z.unknown()).optional(),
1496
- llmTraceSteps: import_zod30.z.array(LLMTraceStepSchema).optional()
1497
- });
1498
- var EvalRunResultSchema = import_zod30.z.object({
1499
- id: import_zod30.z.string(),
1500
- targetId: import_zod30.z.string(),
1501
- targetName: import_zod30.z.string().optional(),
1781
+ var AssertionResultSchema = import_zod31.z.object({
1782
+ id: import_zod31.z.string(),
1783
+ assertionId: import_zod31.z.string(),
1784
+ assertionType: import_zod31.z.string(),
1785
+ assertionName: import_zod31.z.string(),
1786
+ status: import_zod31.z.enum(AssertionResultStatus),
1787
+ message: import_zod31.z.string().optional(),
1788
+ expected: import_zod31.z.string().optional(),
1789
+ actual: import_zod31.z.string().optional(),
1790
+ duration: import_zod31.z.number().optional(),
1791
+ details: import_zod31.z.record(import_zod31.z.string(), import_zod31.z.unknown()).optional(),
1792
+ llmTraceSteps: import_zod31.z.array(LLMTraceStepSchema).optional()
1793
+ });
1794
+ var EvalRunResultSchema = import_zod31.z.object({
1795
+ id: import_zod31.z.string(),
1796
+ targetId: import_zod31.z.string(),
1797
+ targetName: import_zod31.z.string().optional(),
1502
1798
  /** SkillVersion ID used for this evaluation (for version tracking) */
1503
- skillVersionId: import_zod30.z.string().optional(),
1799
+ skillVersionId: import_zod31.z.string().optional(),
1504
1800
  /** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
1505
- skillVersion: import_zod30.z.string().optional(),
1506
- scenarioId: import_zod30.z.string(),
1507
- scenarioName: import_zod30.z.string(),
1801
+ skillVersion: import_zod31.z.string().optional(),
1802
+ scenarioId: import_zod31.z.string(),
1803
+ scenarioName: import_zod31.z.string(),
1508
1804
  /** Snapshot of the trigger prompt used during the run (prevents stale display after edits) */
1509
- triggerPrompt: import_zod30.z.string().optional(),
1805
+ triggerPrompt: import_zod31.z.string().optional(),
1510
1806
  modelConfig: ModelConfigSchema.optional(),
1511
- assertionResults: import_zod30.z.array(AssertionResultSchema),
1807
+ assertionResults: import_zod31.z.array(AssertionResultSchema),
1512
1808
  metrics: EvalMetricsSchema.optional(),
1513
- passed: import_zod30.z.number(),
1514
- failed: import_zod30.z.number(),
1515
- passRate: import_zod30.z.number(),
1516
- duration: import_zod30.z.number(),
1517
- outputText: import_zod30.z.string().optional(),
1518
- files: import_zod30.z.array(ExpectedFileSchema).optional(),
1519
- fileDiffs: import_zod30.z.array(DiffContentSchema).optional(),
1809
+ passed: import_zod31.z.number(),
1810
+ failed: import_zod31.z.number(),
1811
+ passRate: import_zod31.z.number(),
1812
+ duration: import_zod31.z.number(),
1813
+ outputText: import_zod31.z.string().optional(),
1814
+ files: import_zod31.z.array(ExpectedFileSchema).optional(),
1815
+ fileDiffs: import_zod31.z.array(DiffContentSchema).optional(),
1520
1816
  /** Full template files after execution with status indicators */
1521
- templateFiles: import_zod30.z.array(TemplateFileSchema).optional(),
1522
- startedAt: import_zod30.z.string().optional(),
1523
- completedAt: import_zod30.z.string().optional(),
1817
+ templateFiles: import_zod31.z.array(TemplateFileSchema).optional(),
1818
+ startedAt: import_zod31.z.string().optional(),
1819
+ completedAt: import_zod31.z.string().optional(),
1524
1820
  llmTrace: LLMTraceSchema.optional(),
1525
1821
  /** Full conversation messages (only present in transit; stripped before DB storage) */
1526
- conversation: import_zod30.z.array(ConversationMessageSchema).optional(),
1822
+ conversation: import_zod31.z.array(ConversationMessageSchema).optional(),
1527
1823
  /** 0-based iteration index when a scenario is run multiple times within a single eval run */
1528
- iterationIndex: import_zod30.z.number().int().min(0).optional()
1529
- });
1530
- var PromptResultSchema = import_zod30.z.object({
1531
- text: import_zod30.z.string(),
1532
- files: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1533
- finishReason: import_zod30.z.string().optional(),
1534
- reasoning: import_zod30.z.string().optional(),
1535
- reasoningDetails: import_zod30.z.unknown().optional(),
1536
- toolCalls: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1537
- toolResults: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1538
- warnings: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1539
- sources: import_zod30.z.array(import_zod30.z.unknown()).optional(),
1540
- steps: import_zod30.z.array(import_zod30.z.unknown()),
1541
- generationTimeMs: import_zod30.z.number(),
1542
- prompt: import_zod30.z.string(),
1543
- systemPrompt: import_zod30.z.string(),
1544
- usage: import_zod30.z.object({
1545
- totalTokens: import_zod30.z.number().optional(),
1546
- totalMicrocentsSpent: import_zod30.z.number().optional()
1824
+ iterationIndex: import_zod31.z.number().int().min(0).optional()
1825
+ });
1826
+ var PromptResultSchema = import_zod31.z.object({
1827
+ text: import_zod31.z.string(),
1828
+ files: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1829
+ finishReason: import_zod31.z.string().optional(),
1830
+ reasoning: import_zod31.z.string().optional(),
1831
+ reasoningDetails: import_zod31.z.unknown().optional(),
1832
+ toolCalls: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1833
+ toolResults: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1834
+ warnings: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1835
+ sources: import_zod31.z.array(import_zod31.z.unknown()).optional(),
1836
+ steps: import_zod31.z.array(import_zod31.z.unknown()),
1837
+ generationTimeMs: import_zod31.z.number(),
1838
+ prompt: import_zod31.z.string(),
1839
+ systemPrompt: import_zod31.z.string(),
1840
+ usage: import_zod31.z.object({
1841
+ totalTokens: import_zod31.z.number().optional(),
1842
+ totalMicrocentsSpent: import_zod31.z.number().optional()
1547
1843
  })
1548
1844
  });
1549
- var EvaluationResultSchema = import_zod30.z.object({
1550
- id: import_zod30.z.string(),
1551
- runId: import_zod30.z.string(),
1552
- timestamp: import_zod30.z.number(),
1845
+ var EvaluationResultSchema = import_zod31.z.object({
1846
+ id: import_zod31.z.string(),
1847
+ runId: import_zod31.z.string(),
1848
+ timestamp: import_zod31.z.number(),
1553
1849
  promptResult: PromptResultSchema,
1554
- testResults: import_zod30.z.array(import_zod30.z.unknown()),
1555
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
1556
- feedback: import_zod30.z.string().optional(),
1557
- score: import_zod30.z.number(),
1558
- suiteId: import_zod30.z.string().optional()
1559
- });
1560
- var LeanEvaluationResultSchema = import_zod30.z.object({
1561
- id: import_zod30.z.string(),
1562
- runId: import_zod30.z.string(),
1563
- timestamp: import_zod30.z.number(),
1564
- tags: import_zod30.z.array(import_zod30.z.string()).optional(),
1565
- scenarioId: import_zod30.z.string(),
1566
- scenarioVersion: import_zod30.z.number().optional(),
1567
- targetId: import_zod30.z.string(),
1568
- targetVersion: import_zod30.z.number().optional(),
1569
- suiteId: import_zod30.z.string().optional(),
1570
- score: import_zod30.z.number(),
1571
- time: import_zod30.z.number().optional(),
1572
- microcentsSpent: import_zod30.z.number().optional()
1850
+ testResults: import_zod31.z.array(import_zod31.z.unknown()),
1851
+ tags: import_zod31.z.array(import_zod31.z.string()).optional(),
1852
+ feedback: import_zod31.z.string().optional(),
1853
+ score: import_zod31.z.number(),
1854
+ suiteId: import_zod31.z.string().optional()
1855
+ });
1856
+ var LeanEvaluationResultSchema = import_zod31.z.object({
1857
+ id: import_zod31.z.string(),
1858
+ runId: import_zod31.z.string(),
1859
+ timestamp: import_zod31.z.number(),
1860
+ tags: import_zod31.z.array(import_zod31.z.string()).optional(),
1861
+ scenarioId: import_zod31.z.string(),
1862
+ scenarioVersion: import_zod31.z.number().optional(),
1863
+ targetId: import_zod31.z.string(),
1864
+ targetVersion: import_zod31.z.number().optional(),
1865
+ suiteId: import_zod31.z.string().optional(),
1866
+ score: import_zod31.z.number(),
1867
+ time: import_zod31.z.number().optional(),
1868
+ microcentsSpent: import_zod31.z.number().optional()
1573
1869
  });
1574
1870
 
1575
1871
  // src/evaluation/eval-run-folder.ts
1576
- var import_zod31 = require("zod");
1872
+ var import_zod32 = require("zod");
1577
1873
  var EvalRunFolderSchema = TenantEntitySchema.extend({});
1578
1874
  var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1579
1875
  id: true,
@@ -1587,26 +1883,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1587
1883
  updatedAt: true,
1588
1884
  deleted: true
1589
1885
  }).partial();
1590
- var EvalRunFolderMembershipSchema = import_zod31.z.object({
1591
- folderId: import_zod31.z.string(),
1592
- evalRunId: import_zod31.z.string(),
1593
- projectId: import_zod31.z.string(),
1594
- createdAt: import_zod31.z.string()
1886
+ var EvalRunFolderMembershipSchema = import_zod32.z.object({
1887
+ folderId: import_zod32.z.string(),
1888
+ evalRunId: import_zod32.z.string(),
1889
+ projectId: import_zod32.z.string(),
1890
+ createdAt: import_zod32.z.string()
1595
1891
  });
1596
1892
 
1597
1893
  // src/project/project.ts
1598
- var import_zod32 = require("zod");
1894
+ var import_zod33 = require("zod");
1599
1895
  var ProjectSchema = BaseEntitySchema.extend({
1600
- appId: import_zod32.z.string().optional().describe("The ID of the app in Dev Center"),
1601
- scenarioTags: import_zod32.z.array(import_zod32.z.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1896
+ appId: import_zod33.z.string().optional().describe("The ID of the app in Dev Center"),
1897
+ scenarioTags: import_zod33.z.array(import_zod33.z.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1602
1898
  /** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
1603
- wixAuthToken: import_zod32.z.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1899
+ wixAuthToken: import_zod33.z.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1604
1900
  /** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
1605
- base44AuthFile: import_zod32.z.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1901
+ base44AuthFile: import_zod33.z.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1606
1902
  /** Resolved at runtime from the encrypted Wix auth token */
1607
- wixAuthEmail: import_zod32.z.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1903
+ wixAuthEmail: import_zod33.z.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1608
1904
  /** Resolved at runtime from the encrypted Base44 auth file */
1609
- base44AuthEmail: import_zod32.z.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1905
+ base44AuthEmail: import_zod33.z.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1610
1906
  });
1611
1907
  var CreateProjectInputSchema = ProjectSchema.omit({
1612
1908
  id: true,
@@ -1615,6 +1911,10 @@ var CreateProjectInputSchema = ProjectSchema.omit({
1615
1911
  deleted: true,
1616
1912
  wixAuthEmail: true,
1617
1913
  base44AuthEmail: true
1914
+ }).extend({
1915
+ appId: import_zod33.z.string().describe(
1916
+ "Required: The ID of the app in Dev Center for credential scoping"
1917
+ )
1618
1918
  });
1619
1919
  var UpdateProjectInputSchema = CreateProjectInputSchema.partial();
1620
1920
 
@@ -1632,7 +1932,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
1632
1932
  var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1633
1933
 
1634
1934
  // src/schedule/eval-schedule.ts
1635
- var import_zod33 = require("zod");
1935
+ var import_zod34 = require("zod");
1636
1936
  var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1637
1937
  FrequencyType2["DAILY"] = "daily";
1638
1938
  FrequencyType2["WEEKDAY"] = "weekday";
@@ -1642,29 +1942,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1642
1942
  })(FrequencyType || {});
1643
1943
  var EvalScheduleSchema = TenantEntitySchema.extend({
1644
1944
  /** Whether the schedule is active */
1645
- enabled: import_zod33.z.boolean(),
1945
+ enabled: import_zod34.z.boolean(),
1646
1946
  /** Test suite to run */
1647
- suiteId: import_zod33.z.string(),
1947
+ suiteId: import_zod34.z.string(),
1648
1948
  /** Preset that provides agent + entities for this schedule */
1649
- presetId: import_zod33.z.string(),
1949
+ presetId: import_zod34.z.string(),
1650
1950
  /** How often to run */
1651
- frequencyType: import_zod33.z.nativeEnum(FrequencyType),
1951
+ frequencyType: import_zod34.z.nativeEnum(FrequencyType),
1652
1952
  /** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
1653
- timeOfDay: import_zod33.z.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1953
+ timeOfDay: import_zod34.z.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1654
1954
  /** Day of week (0=Sun, 6=Sat) for weekly schedules */
1655
- dayOfWeek: import_zod33.z.number().min(0).max(6).optional(),
1955
+ dayOfWeek: import_zod34.z.number().min(0).max(6).optional(),
1656
1956
  /** Day of month (1-31) for monthly schedules */
1657
- dayOfMonth: import_zod33.z.number().min(1).max(31).optional(),
1957
+ dayOfMonth: import_zod34.z.number().min(1).max(31).optional(),
1658
1958
  /** IANA timezone (e.g., 'America/New_York') */
1659
- timezone: import_zod33.z.string(),
1959
+ timezone: import_zod34.z.string(),
1660
1960
  /** ID of the last eval run created by this schedule */
1661
- lastRunId: import_zod33.z.string().optional(),
1961
+ lastRunId: import_zod34.z.string().optional(),
1662
1962
  /** Denormalized status of the last run */
1663
- lastRunStatus: import_zod33.z.string().optional(),
1963
+ lastRunStatus: import_zod34.z.string().optional(),
1664
1964
  /** ISO timestamp of the last run */
1665
- lastRunAt: import_zod33.z.string().optional(),
1965
+ lastRunAt: import_zod34.z.string().optional(),
1666
1966
  /** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
1667
- nextRunAt: import_zod33.z.string().optional()
1967
+ nextRunAt: import_zod34.z.string().optional()
1668
1968
  });
1669
1969
  function isValidTimezone(tz) {
1670
1970
  try {
@@ -1677,14 +1977,14 @@ function isValidTimezone(tz) {
1677
1977
  function validateScheduleFields(data, ctx, options) {
1678
1978
  if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
1679
1979
  ctx.addIssue({
1680
- code: import_zod33.z.ZodIssueCode.custom,
1980
+ code: import_zod34.z.ZodIssueCode.custom,
1681
1981
  message: "dayOfWeek is required for weekly schedules",
1682
1982
  path: ["dayOfWeek"]
1683
1983
  });
1684
1984
  }
1685
1985
  if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
1686
1986
  ctx.addIssue({
1687
- code: import_zod33.z.ZodIssueCode.custom,
1987
+ code: import_zod34.z.ZodIssueCode.custom,
1688
1988
  message: "dayOfMonth is required for monthly schedules",
1689
1989
  path: ["dayOfMonth"]
1690
1990
  });
@@ -1692,7 +1992,7 @@ function validateScheduleFields(data, ctx, options) {
1692
1992
  const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
1693
1993
  if (shouldValidateTz && !isValidTimezone(data.timezone)) {
1694
1994
  ctx.addIssue({
1695
- code: import_zod33.z.ZodIssueCode.custom,
1995
+ code: import_zod34.z.ZodIssueCode.custom,
1696
1996
  message: "Invalid IANA timezone",
1697
1997
  path: ["timezone"]
1698
1998
  });
@@ -1715,229 +2015,10 @@ var CreateEvalScheduleInputSchema = BaseCreateScheduleSchema.superRefine((data,
1715
2015
  var UpdateEvalScheduleInputSchema = BaseCreateScheduleSchema.partial().superRefine((data, ctx) => {
1716
2016
  validateScheduleFields(data, ctx, { partial: true });
1717
2017
  });
1718
-
1719
- // src/assertion/system-assertions.ts
1720
- var SYSTEM_ASSERTION_IDS = {
1721
- SKILL_WAS_CALLED: "system:skill_was_called",
1722
- TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
1723
- BUILD_PASSED: "system:build_passed",
1724
- TIME_LIMIT: "system:time_limit",
1725
- COST: "system:cost",
1726
- LLM_JUDGE: "system:llm_judge",
1727
- API_CALL: "system:api_call"
1728
- };
1729
- function isSystemAssertionId(id) {
1730
- return id.startsWith("system:");
1731
- }
1732
- var SYSTEM_ASSERTIONS = {
1733
- [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
1734
- id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
1735
- name: "Skill Was Called",
1736
- description: "Check that one or more skills were invoked during the agent run",
1737
- type: "skill_was_called",
1738
- parameters: [
1739
- {
1740
- name: "skillNames",
1741
- label: "Skills",
1742
- type: "string",
1743
- required: true
1744
- },
1745
- {
1746
- name: "negate",
1747
- label: "Negate (NOT operator)",
1748
- type: "boolean",
1749
- required: false,
1750
- defaultValue: false
1751
- }
1752
- ]
1753
- },
1754
- [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
1755
- id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
1756
- name: "Tool Called With Param",
1757
- description: "Check that a tool was called with expected parameters (tool name is substring matched)",
1758
- type: "tool_called_with_param",
1759
- parameters: [
1760
- {
1761
- name: "toolName",
1762
- label: "Tool Name",
1763
- type: "string",
1764
- required: true
1765
- },
1766
- {
1767
- name: "expectedParams",
1768
- label: "Expected Parameters (JSON, substring match)",
1769
- type: "string",
1770
- required: false
1771
- },
1772
- {
1773
- name: "requireSuccess",
1774
- label: "Require Successful Call",
1775
- type: "boolean",
1776
- required: false,
1777
- defaultValue: false,
1778
- advanced: true
1779
- },
1780
- {
1781
- name: "negate",
1782
- label: "Negate (NOT operator)",
1783
- type: "boolean",
1784
- required: false,
1785
- defaultValue: false
1786
- }
1787
- ]
1788
- },
1789
- [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
1790
- id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
1791
- name: "Build Passed",
1792
- description: "Run a build command and verify it exits with expected code",
1793
- type: "build_passed",
1794
- parameters: [
1795
- {
1796
- name: "command",
1797
- label: "Build Command",
1798
- type: "string",
1799
- required: false,
1800
- defaultValue: "yarn build"
1801
- },
1802
- {
1803
- name: "expectedExitCode",
1804
- label: "Expected Exit Code",
1805
- type: "number",
1806
- required: false,
1807
- defaultValue: 0
1808
- },
1809
- {
1810
- name: "maxBuildTime",
1811
- label: "Max Build Time (ms)",
1812
- type: "number",
1813
- required: false,
1814
- advanced: true
1815
- },
1816
- {
1817
- name: "maxMemory",
1818
- label: "Max Memory (MB)",
1819
- type: "number",
1820
- required: false,
1821
- advanced: true
1822
- }
1823
- ]
1824
- },
1825
- [SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
1826
- id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
1827
- name: "Time Limit",
1828
- description: "Check that the scenario completed within a maximum duration",
1829
- type: "time_limit",
1830
- parameters: [
1831
- {
1832
- name: "maxDurationMs",
1833
- label: "Max Duration (ms)",
1834
- type: "number",
1835
- required: true,
1836
- defaultValue: 3e5
1837
- }
1838
- ]
1839
- },
1840
- [SYSTEM_ASSERTION_IDS.COST]: {
1841
- id: SYSTEM_ASSERTION_IDS.COST,
1842
- name: "Cost",
1843
- description: "Check that the scenario LLM execution cost stays within a USD threshold",
1844
- type: "cost",
1845
- parameters: [
1846
- {
1847
- name: "maxCostUsd",
1848
- label: "Max Cost (USD)",
1849
- type: "number",
1850
- required: true,
1851
- defaultValue: 1
1852
- }
1853
- ]
1854
- },
1855
- [SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
1856
- id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
1857
- name: "LLM Judge",
1858
- description: "LLM evaluates the output and assigns a score (0-10)",
1859
- type: "llm_judge",
1860
- parameters: [
1861
- {
1862
- name: "prompt",
1863
- label: "Judge Prompt",
1864
- type: "string",
1865
- required: true,
1866
- defaultValue: "Verify the output meets the acceptance criteria."
1867
- },
1868
- {
1869
- name: "minScore",
1870
- label: "Minimum Score (0-10)",
1871
- type: "number",
1872
- required: false,
1873
- defaultValue: 7
1874
- },
1875
- {
1876
- name: "model",
1877
- label: "Model",
1878
- type: "string",
1879
- required: false
1880
- }
1881
- ]
1882
- },
1883
- [SYSTEM_ASSERTION_IDS.API_CALL]: {
1884
- id: SYSTEM_ASSERTION_IDS.API_CALL,
1885
- name: "API Call",
1886
- description: "Call an API endpoint and verify the response contains expected data",
1887
- type: "api_call",
1888
- parameters: [
1889
- {
1890
- name: "url",
1891
- label: "URL",
1892
- type: "string",
1893
- required: true
1894
- },
1895
- {
1896
- name: "method",
1897
- label: "HTTP Method",
1898
- type: "string",
1899
- required: false,
1900
- defaultValue: "GET"
1901
- },
1902
- {
1903
- name: "requestBody",
1904
- label: "Request Body (JSON)",
1905
- type: "string",
1906
- required: false
1907
- },
1908
- {
1909
- name: "expectedResponse",
1910
- label: "Expected Response (JSON)",
1911
- type: "string",
1912
- required: true
1913
- },
1914
- {
1915
- name: "requestHeaders",
1916
- label: "Headers (JSON)",
1917
- type: "string",
1918
- required: false,
1919
- advanced: true
1920
- },
1921
- {
1922
- name: "timeoutMs",
1923
- label: "Timeout (ms)",
1924
- type: "number",
1925
- required: false,
1926
- defaultValue: 3e4,
1927
- advanced: true
1928
- }
1929
- ]
1930
- }
1931
- };
1932
- function getSystemAssertions() {
1933
- return Object.values(SYSTEM_ASSERTIONS);
1934
- }
1935
- function getSystemAssertion(id) {
1936
- return SYSTEM_ASSERTIONS[id];
1937
- }
1938
2018
  // Annotate the CommonJS export names for ESM import in node:
1939
2019
  0 && (module.exports = {
1940
2020
  AGENT_TYPE_LABELS,
2021
+ ALLOWED_BUILD_COMMANDS,
1941
2022
  ALL_AVAILABLE_MODEL_IDS,
1942
2023
  AVAILABLE_CLAUDE_MODEL_IDS,
1943
2024
  AVAILABLE_OPENAI_MODEL_IDS,
@@ -1971,6 +2052,7 @@ function getSystemAssertion(id) {
1971
2052
  BatchSummarySchema,
1972
2053
  BuildCheckTestSchema,
1973
2054
  BuildPassedAssertionSchema,
2055
+ BuildPassedCommandStringSchema,
1974
2056
  BuildPassedConfigSchema,
1975
2057
  BulkImportResultItemSchema,
1976
2058
  BulkImportResultSchema,
@@ -1998,6 +2080,7 @@ function getSystemAssertion(id) {
1998
2080
  CreateTemplateInputSchema,
1999
2081
  CreateTestScenarioInputSchema,
2000
2082
  CreateTestSuiteInputSchema,
2083
+ DEFAULT_BUILD_PASSED_COMMAND,
2001
2084
  DEFAULT_EVALUATOR_SYSTEM_PROMPT,
2002
2085
  DEFAULT_JUDGE_MODEL,
2003
2086
  DiffContentSchema,
@@ -2115,11 +2198,14 @@ function getSystemAssertion(id) {
2115
2198
  formatTraceEventLine,
2116
2199
  getSystemAssertion,
2117
2200
  getSystemAssertions,
2201
+ isAllowedBuildCommandString,
2118
2202
  isSystemAssertionId,
2119
2203
  isValidSkillFolderName,
2120
2204
  normalizeBatchAssertionLink,
2121
2205
  normalizeModelId,
2206
+ parseBuildCommandToArgv,
2122
2207
  parseTraceEventLine,
2123
- validateAssertionConfig
2208
+ validateAssertionConfig,
2209
+ validateBuildPassedParamsInAssertionLinks
2124
2210
  });
2125
2211
  //# sourceMappingURL=index.js.map