@wix/evalforge-types 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -797,11 +797,67 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
797
797
  });
798
798
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
799
799
 
800
- // src/suite/test-suite.ts
800
+ // src/scenario/batch-import.ts
801
801
  import { z as z24 } from "zod";
802
+ var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
803
+ var BatchAssertionLinkSchema = z24.union([
804
+ z24.string().min(1),
805
+ ScenarioAssertionLinkSchema
806
+ ]);
807
+ var BatchScenarioEntrySchema = z24.object({
808
+ name: z24.string().min(1, "name: Required"),
809
+ description: z24.string().optional().default(""),
810
+ triggerPrompt: z24.string().min(10, "triggerPrompt: Must be at least 10 characters"),
811
+ templateId: z24.string().nullish(),
812
+ tags: z24.array(z24.string()).optional(),
813
+ assertionLinks: z24.array(BatchAssertionLinkSchema).optional()
814
+ });
815
+ var BatchImportPayloadSchema = z24.object({
816
+ scenarios: z24.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
817
+ });
818
+ var BATCH_IMPORT_LIMITS = {
819
+ MAX_SCENARIOS: 100,
820
+ MAX_PAYLOAD_BYTES: 1048576
821
+ // 1 MB
822
+ };
823
+ function classifyAssertionRef(ref) {
824
+ if (ref.startsWith("system:")) {
825
+ return { type: "system", value: ref };
826
+ }
827
+ if (UUID_REGEX.test(ref)) {
828
+ return { type: "uuid", value: ref };
829
+ }
830
+ return { type: "name", value: ref };
831
+ }
832
+ function normalizeBatchAssertionLink(link) {
833
+ if (typeof link === "string") {
834
+ return { assertionId: link };
835
+ }
836
+ return link;
837
+ }
838
+ var BatchResultItemSchema = z24.object({
839
+ index: z24.number(),
840
+ name: z24.string(),
841
+ status: z24.enum(["valid", "invalid"]),
842
+ id: z24.string().nullable().optional(),
843
+ errors: z24.array(z24.string()).optional()
844
+ });
845
+ var BatchSummarySchema = z24.object({
846
+ total: z24.number(),
847
+ valid: z24.number(),
848
+ invalid: z24.number(),
849
+ created: z24.number()
850
+ });
851
+ var BatchImportResponseSchema = z24.object({
852
+ summary: BatchSummarySchema,
853
+ results: z24.array(BatchResultItemSchema)
854
+ });
855
+
856
+ // src/suite/test-suite.ts
857
+ import { z as z25 } from "zod";
802
858
  var TestSuiteSchema = TenantEntitySchema.extend({
803
859
  /** IDs of test scenarios in this suite */
804
- scenarioIds: z24.array(z24.string())
860
+ scenarioIds: z25.array(z25.string())
805
861
  });
806
862
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
807
863
  id: true,
@@ -812,21 +868,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
812
868
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
813
869
 
814
870
  // src/evaluation/metrics.ts
815
- import { z as z25 } from "zod";
816
- var TokenUsageSchema = z25.object({
817
- prompt: z25.number(),
818
- completion: z25.number(),
819
- total: z25.number()
820
- });
821
- var EvalMetricsSchema = z25.object({
822
- totalAssertions: z25.number(),
823
- passed: z25.number(),
824
- failed: z25.number(),
825
- skipped: z25.number(),
826
- errors: z25.number(),
827
- passRate: z25.number(),
828
- avgDuration: z25.number(),
829
- totalDuration: z25.number()
871
+ import { z as z26 } from "zod";
872
+ var TokenUsageSchema = z26.object({
873
+ prompt: z26.number(),
874
+ completion: z26.number(),
875
+ total: z26.number()
876
+ });
877
+ var EvalMetricsSchema = z26.object({
878
+ totalAssertions: z26.number(),
879
+ passed: z26.number(),
880
+ failed: z26.number(),
881
+ skipped: z26.number(),
882
+ errors: z26.number(),
883
+ passRate: z26.number(),
884
+ avgDuration: z26.number(),
885
+ totalDuration: z26.number()
830
886
  });
831
887
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
832
888
  EvalStatus2["PENDING"] = "pending";
@@ -836,7 +892,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
836
892
  EvalStatus2["CANCELLED"] = "cancelled";
837
893
  return EvalStatus2;
838
894
  })(EvalStatus || {});
839
- var EvalStatusSchema = z25.enum(EvalStatus);
895
+ var EvalStatusSchema = z26.enum(EvalStatus);
840
896
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
841
897
  LLMStepType2["COMPLETION"] = "completion";
842
898
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -844,54 +900,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
844
900
  LLMStepType2["THINKING"] = "thinking";
845
901
  return LLMStepType2;
846
902
  })(LLMStepType || {});
847
- var LLMTraceStepSchema = z25.object({
848
- id: z25.string(),
849
- stepNumber: z25.number(),
850
- type: z25.enum(LLMStepType),
851
- model: z25.string(),
852
- provider: z25.string(),
853
- startedAt: z25.string(),
854
- durationMs: z25.number(),
903
+ var LLMTraceStepSchema = z26.object({
904
+ id: z26.string(),
905
+ stepNumber: z26.number(),
906
+ type: z26.enum(LLMStepType),
907
+ model: z26.string(),
908
+ provider: z26.string(),
909
+ startedAt: z26.string(),
910
+ durationMs: z26.number(),
855
911
  tokenUsage: TokenUsageSchema,
856
- costUsd: z25.number(),
857
- toolName: z25.string().optional(),
858
- toolArguments: z25.string().optional(),
859
- inputPreview: z25.string().optional(),
860
- outputPreview: z25.string().optional(),
861
- success: z25.boolean(),
862
- error: z25.string().optional(),
863
- turnIndex: z25.number().optional()
864
- });
865
- var LLMBreakdownStatsSchema = z25.object({
866
- count: z25.number(),
867
- durationMs: z25.number(),
868
- tokens: z25.number(),
869
- costUsd: z25.number()
870
- });
871
- var LLMTraceSummarySchema = z25.object({
872
- totalSteps: z25.number(),
873
- totalTurns: z25.number().optional(),
874
- totalDurationMs: z25.number(),
912
+ costUsd: z26.number(),
913
+ toolName: z26.string().optional(),
914
+ toolArguments: z26.string().optional(),
915
+ inputPreview: z26.string().optional(),
916
+ outputPreview: z26.string().optional(),
917
+ success: z26.boolean(),
918
+ error: z26.string().optional(),
919
+ turnIndex: z26.number().optional()
920
+ });
921
+ var LLMBreakdownStatsSchema = z26.object({
922
+ count: z26.number(),
923
+ durationMs: z26.number(),
924
+ tokens: z26.number(),
925
+ costUsd: z26.number()
926
+ });
927
+ var LLMTraceSummarySchema = z26.object({
928
+ totalSteps: z26.number(),
929
+ totalTurns: z26.number().optional(),
930
+ totalDurationMs: z26.number(),
875
931
  totalTokens: TokenUsageSchema,
876
- totalCostUsd: z25.number(),
877
- stepTypeBreakdown: z25.record(z25.string(), LLMBreakdownStatsSchema).optional(),
878
- modelBreakdown: z25.record(z25.string(), LLMBreakdownStatsSchema),
879
- modelsUsed: z25.array(z25.string())
880
- });
881
- var LLMTraceSchema = z25.object({
882
- id: z25.string(),
883
- steps: z25.array(LLMTraceStepSchema),
932
+ totalCostUsd: z26.number(),
933
+ stepTypeBreakdown: z26.record(z26.string(), LLMBreakdownStatsSchema).optional(),
934
+ modelBreakdown: z26.record(z26.string(), LLMBreakdownStatsSchema),
935
+ modelsUsed: z26.array(z26.string())
936
+ });
937
+ var LLMTraceSchema = z26.object({
938
+ id: z26.string(),
939
+ steps: z26.array(LLMTraceStepSchema),
884
940
  summary: LLMTraceSummarySchema
885
941
  });
886
942
 
887
943
  // src/evaluation/eval-result.ts
888
- import { z as z29 } from "zod";
944
+ import { z as z30 } from "zod";
889
945
 
890
946
  // src/evaluation/eval-run.ts
891
- import { z as z27 } from "zod";
947
+ import { z as z28 } from "zod";
892
948
 
893
949
  // src/evaluation/live-trace.ts
894
- import { z as z26 } from "zod";
950
+ import { z as z27 } from "zod";
895
951
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
896
952
  LiveTraceEventType2["THINKING"] = "thinking";
897
953
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -905,37 +961,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
905
961
  LiveTraceEventType2["USER"] = "user";
906
962
  return LiveTraceEventType2;
907
963
  })(LiveTraceEventType || {});
908
- var LiveTraceEventSchema = z26.object({
964
+ var LiveTraceEventSchema = z27.object({
909
965
  /** The evaluation run ID */
910
- evalRunId: z26.string(),
966
+ evalRunId: z27.string(),
911
967
  /** The scenario ID being executed */
912
- scenarioId: z26.string(),
968
+ scenarioId: z27.string(),
913
969
  /** The scenario name for display */
914
- scenarioName: z26.string(),
970
+ scenarioName: z27.string(),
915
971
  /** The target ID (skill, agent, etc.) */
916
- targetId: z26.string(),
972
+ targetId: z27.string(),
917
973
  /** The target name for display */
918
- targetName: z26.string(),
974
+ targetName: z27.string(),
919
975
  /** Step number in the current scenario execution */
920
- stepNumber: z26.number(),
976
+ stepNumber: z27.number(),
921
977
  /** Type of trace event */
922
- type: z26.enum(LiveTraceEventType),
978
+ type: z27.enum(LiveTraceEventType),
923
979
  /** Tool name if this is a tool_use event */
924
- toolName: z26.string().optional(),
980
+ toolName: z27.string().optional(),
925
981
  /** Tool arguments preview (truncated JSON) */
926
- toolArgs: z26.string().optional(),
982
+ toolArgs: z27.string().optional(),
927
983
  /** Output preview (truncated text) */
928
- outputPreview: z26.string().optional(),
984
+ outputPreview: z27.string().optional(),
929
985
  /** File path for file operations */
930
- filePath: z26.string().optional(),
986
+ filePath: z27.string().optional(),
931
987
  /** Elapsed time in milliseconds for progress events */
932
- elapsedMs: z26.number().optional(),
988
+ elapsedMs: z27.number().optional(),
933
989
  /** Thinking/reasoning text from Claude */
934
- thinking: z26.string().optional(),
990
+ thinking: z27.string().optional(),
935
991
  /** Timestamp when this event occurred */
936
- timestamp: z26.string(),
992
+ timestamp: z27.string(),
937
993
  /** Whether this is the final event for this scenario */
938
- isComplete: z26.boolean()
994
+ isComplete: z27.boolean()
939
995
  });
940
996
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
941
997
  function parseTraceEventLine(line) {
@@ -964,15 +1020,15 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
964
1020
  TriggerType2["SCHEDULED"] = "SCHEDULED";
965
1021
  return TriggerType2;
966
1022
  })(TriggerType || {});
967
- var TriggerMetadataSchema = z27.object({
968
- version: z27.string().optional(),
969
- resourceUpdated: z27.array(z27.string()).optional(),
970
- scheduleId: z27.string().optional()
1023
+ var TriggerMetadataSchema = z28.object({
1024
+ version: z28.string().optional(),
1025
+ resourceUpdated: z28.array(z28.string()).optional(),
1026
+ scheduleId: z28.string().optional()
971
1027
  });
972
- var TriggerSchema = z27.object({
973
- id: z27.string(),
1028
+ var TriggerSchema = z28.object({
1029
+ id: z28.string(),
974
1030
  metadata: TriggerMetadataSchema.optional(),
975
- type: z27.nativeEnum(TriggerType)
1031
+ type: z28.nativeEnum(TriggerType)
976
1032
  });
977
1033
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
978
1034
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -990,30 +1046,30 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
990
1046
  FailureSeverity2["LOW"] = "low";
991
1047
  return FailureSeverity2;
992
1048
  })(FailureSeverity || {});
993
- var DiffLineTypeSchema = z27.enum(["added", "removed", "unchanged"]);
994
- var DiffLineSchema = z27.object({
1049
+ var DiffLineTypeSchema = z28.enum(["added", "removed", "unchanged"]);
1050
+ var DiffLineSchema = z28.object({
995
1051
  type: DiffLineTypeSchema,
996
- content: z27.string(),
997
- lineNumber: z27.number()
998
- });
999
- var DiffContentSchema = z27.object({
1000
- path: z27.string(),
1001
- expected: z27.string(),
1002
- actual: z27.string(),
1003
- diffLines: z27.array(DiffLineSchema),
1004
- renamedFrom: z27.string().optional(),
1052
+ content: z28.string(),
1053
+ lineNumber: z28.number()
1054
+ });
1055
+ var DiffContentSchema = z28.object({
1056
+ path: z28.string(),
1057
+ expected: z28.string(),
1058
+ actual: z28.string(),
1059
+ diffLines: z28.array(DiffLineSchema),
1060
+ renamedFrom: z28.string().optional(),
1005
1061
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1006
- isInfrastructure: z27.boolean().optional()
1062
+ isInfrastructure: z28.boolean().optional()
1007
1063
  });
1008
- var CommandExecutionSchema = z27.object({
1009
- command: z27.string(),
1010
- exitCode: z27.number(),
1011
- output: z27.string().optional(),
1012
- duration: z27.number()
1064
+ var CommandExecutionSchema = z28.object({
1065
+ command: z28.string(),
1066
+ exitCode: z28.number(),
1067
+ output: z28.string().optional(),
1068
+ duration: z28.number()
1013
1069
  });
1014
- var FileModificationSchema = z27.object({
1015
- path: z27.string(),
1016
- action: z27.enum(["created", "modified", "deleted"])
1070
+ var FileModificationSchema = z28.object({
1071
+ path: z28.string(),
1072
+ action: z28.enum(["created", "modified", "deleted"])
1017
1073
  });
1018
1074
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1019
1075
  TemplateFileStatus2["NEW"] = "new";
@@ -1021,89 +1077,91 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1021
1077
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
1022
1078
  return TemplateFileStatus2;
1023
1079
  })(TemplateFileStatus || {});
1024
- var TemplateFileSchema = z27.object({
1080
+ var TemplateFileSchema = z28.object({
1025
1081
  /** Relative path within the template */
1026
- path: z27.string(),
1082
+ path: z28.string(),
1027
1083
  /** Full file content after execution */
1028
- content: z27.string(),
1084
+ content: z28.string(),
1029
1085
  /** File status (new, modified, unchanged) */
1030
- status: z27.enum(["new", "modified", "unchanged"]),
1086
+ status: z28.enum(["new", "modified", "unchanged"]),
1031
1087
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1032
- isInfrastructure: z27.boolean().optional()
1033
- });
1034
- var ApiCallSchema = z27.object({
1035
- endpoint: z27.string(),
1036
- tokensUsed: z27.number(),
1037
- duration: z27.number()
1038
- });
1039
- var ExecutionTraceSchema = z27.object({
1040
- commands: z27.array(CommandExecutionSchema),
1041
- filesModified: z27.array(FileModificationSchema),
1042
- apiCalls: z27.array(ApiCallSchema),
1043
- totalDuration: z27.number()
1044
- });
1045
- var FailureAnalysisSchema = z27.object({
1046
- category: z27.enum(FailureCategory),
1047
- severity: z27.enum(FailureSeverity),
1048
- summary: z27.string(),
1049
- details: z27.string(),
1050
- rootCause: z27.string(),
1051
- suggestedFix: z27.string(),
1052
- relatedAssertions: z27.array(z27.string()),
1053
- codeSnippet: z27.string().optional(),
1054
- similarIssues: z27.array(z27.string()).optional(),
1055
- patternId: z27.string().optional(),
1088
+ isInfrastructure: z28.boolean().optional()
1089
+ });
1090
+ var ApiCallSchema = z28.object({
1091
+ endpoint: z28.string(),
1092
+ tokensUsed: z28.number(),
1093
+ duration: z28.number()
1094
+ });
1095
+ var ExecutionTraceSchema = z28.object({
1096
+ commands: z28.array(CommandExecutionSchema),
1097
+ filesModified: z28.array(FileModificationSchema),
1098
+ apiCalls: z28.array(ApiCallSchema),
1099
+ totalDuration: z28.number()
1100
+ });
1101
+ var FailureAnalysisSchema = z28.object({
1102
+ category: z28.enum(FailureCategory),
1103
+ severity: z28.enum(FailureSeverity),
1104
+ summary: z28.string(),
1105
+ details: z28.string(),
1106
+ rootCause: z28.string(),
1107
+ suggestedFix: z28.string(),
1108
+ relatedAssertions: z28.array(z28.string()),
1109
+ codeSnippet: z28.string().optional(),
1110
+ similarIssues: z28.array(z28.string()).optional(),
1111
+ patternId: z28.string().optional(),
1056
1112
  // Extended fields for detailed debugging
1057
1113
  diff: DiffContentSchema.optional(),
1058
1114
  executionTrace: ExecutionTraceSchema.optional()
1059
1115
  });
1060
1116
  var EvalRunSchema = TenantEntitySchema.extend({
1061
1117
  /** Agent ID for this run */
1062
- agentId: z27.string().optional(),
1118
+ agentId: z28.string().optional(),
1063
1119
  /** Preset ID that originated this run (optional) */
1064
- presetId: z27.string().optional(),
1120
+ presetId: z28.string().optional(),
1065
1121
  /** Skill IDs for this run */
1066
- skillIds: z27.array(z27.string()).optional(),
1122
+ skillIds: z28.array(z28.string()).optional(),
1067
1123
  /** Map of skillId to skillVersionId for this run */
1068
- skillVersions: z27.record(z27.string(), z27.string()).optional(),
1124
+ skillVersions: z28.record(z28.string(), z28.string()).optional(),
1069
1125
  /** Scenario IDs to run (always present — resolved server-side from tags when needed) */
1070
- scenarioIds: z27.array(z27.string()),
1126
+ scenarioIds: z28.array(z28.string()),
1071
1127
  /** Current status */
1072
1128
  status: EvalStatusSchema,
1073
1129
  /** Progress percentage (0-100) */
1074
- progress: z27.number(),
1130
+ progress: z28.number(),
1075
1131
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
1076
- results: z27.array(z27.lazy(() => EvalRunResultSchema)),
1132
+ results: z28.array(z28.lazy(() => EvalRunResultSchema)),
1077
1133
  /** Aggregated metrics across all results */
1078
1134
  aggregateMetrics: EvalMetricsSchema,
1079
1135
  /** Failure analyses */
1080
- failureAnalyses: z27.array(FailureAnalysisSchema).optional(),
1136
+ failureAnalyses: z28.array(FailureAnalysisSchema).optional(),
1081
1137
  /** Aggregated LLM trace summary */
1082
1138
  llmTraceSummary: LLMTraceSummarySchema.optional(),
1083
1139
  /** What triggered this run */
1084
1140
  trigger: TriggerSchema.optional(),
1085
1141
  /** When the run started (set when evaluation is triggered) */
1086
- startedAt: z27.string().optional(),
1142
+ startedAt: z28.string().optional(),
1087
1143
  /** When the run completed */
1088
- completedAt: z27.string().optional(),
1144
+ completedAt: z28.string().optional(),
1089
1145
  /** Live trace events captured during execution (for playback on results page) */
1090
- liveTraceEvents: z27.array(LiveTraceEventSchema).optional(),
1146
+ liveTraceEvents: z28.array(LiveTraceEventSchema).optional(),
1091
1147
  /** Remote job ID for tracking execution in Dev Machines */
1092
- jobId: z27.string().optional(),
1148
+ jobId: z28.string().optional(),
1093
1149
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
1094
- jobStatus: z27.string().optional(),
1150
+ jobStatus: z28.string().optional(),
1095
1151
  /** Remote job error message if the job failed */
1096
- jobError: z27.string().optional(),
1152
+ jobError: z28.string().optional(),
1097
1153
  /** Timestamp of the last job status check */
1098
- jobStatusCheckedAt: z27.string().optional(),
1154
+ jobStatusCheckedAt: z28.string().optional(),
1099
1155
  /** MCP server IDs to enable for this run (optional) */
1100
- mcpIds: z27.array(z27.string()).optional(),
1156
+ mcpIds: z28.array(z28.string()).optional(),
1101
1157
  /** Sub-agent IDs to enable for this run (optional) */
1102
- subAgentIds: z27.array(z27.string()).optional(),
1158
+ subAgentIds: z28.array(z28.string()).optional(),
1103
1159
  /** Rule IDs to enable for this run (optional) */
1104
- ruleIds: z27.array(z27.string()).optional(),
1160
+ ruleIds: z28.array(z28.string()).optional(),
1105
1161
  /** Tags used to select scenarios for this run (for traceability) */
1106
- tags: z27.array(z27.string()).optional()
1162
+ tags: z28.array(z28.string()).optional(),
1163
+ /** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
1164
+ runsPerScenario: z28.number().int().min(1).max(20).optional()
1107
1165
  });
1108
1166
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
1109
1167
  id: true,
@@ -1118,60 +1176,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
1118
1176
  scenarioIds: true
1119
1177
  }).extend({
1120
1178
  /** Optional on input — backend resolves from tags when not provided */
1121
- scenarioIds: z27.array(z27.string()).optional()
1179
+ scenarioIds: z28.array(z28.string()).optional()
1122
1180
  }).refine(
1123
1181
  (data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
1124
1182
  { message: "Either scenarioIds or tags must be provided" }
1125
1183
  );
1126
- var EvaluationProgressSchema = z27.object({
1127
- runId: z27.string(),
1128
- targetId: z27.string(),
1129
- totalScenarios: z27.number(),
1130
- completedScenarios: z27.number(),
1131
- scenarioProgress: z27.array(
1132
- z27.object({
1133
- scenarioId: z27.string(),
1134
- currentStep: z27.string(),
1135
- error: z27.string().optional()
1184
+ var EvaluationProgressSchema = z28.object({
1185
+ runId: z28.string(),
1186
+ targetId: z28.string(),
1187
+ totalScenarios: z28.number(),
1188
+ completedScenarios: z28.number(),
1189
+ scenarioProgress: z28.array(
1190
+ z28.object({
1191
+ scenarioId: z28.string(),
1192
+ currentStep: z28.string(),
1193
+ error: z28.string().optional()
1136
1194
  })
1137
1195
  ),
1138
- createdAt: z27.number()
1139
- });
1140
- var EvaluationLogSchema = z27.object({
1141
- runId: z27.string(),
1142
- scenarioId: z27.string(),
1143
- log: z27.object({
1144
- level: z27.enum(["info", "error", "debug"]),
1145
- message: z27.string().optional(),
1146
- args: z27.array(z27.any()).optional(),
1147
- error: z27.string().optional()
1196
+ createdAt: z28.number()
1197
+ });
1198
+ var EvaluationLogSchema = z28.object({
1199
+ runId: z28.string(),
1200
+ scenarioId: z28.string(),
1201
+ log: z28.object({
1202
+ level: z28.enum(["info", "error", "debug"]),
1203
+ message: z28.string().optional(),
1204
+ args: z28.array(z28.any()).optional(),
1205
+ error: z28.string().optional()
1148
1206
  })
1149
1207
  });
1150
1208
  var LLM_TIMEOUT = 12e4;
1151
1209
 
1152
1210
  // src/evaluation/conversation.ts
1153
- import { z as z28 } from "zod";
1154
- var TextBlockSchema = z28.object({
1155
- type: z28.literal("text"),
1156
- text: z28.string()
1157
- });
1158
- var ThinkingBlockSchema = z28.object({
1159
- type: z28.literal("thinking"),
1160
- thinking: z28.string()
1161
- });
1162
- var ToolUseBlockSchema = z28.object({
1163
- type: z28.literal("tool_use"),
1164
- toolName: z28.string(),
1165
- toolId: z28.string(),
1166
- input: z28.unknown()
1167
- });
1168
- var ToolResultBlockSchema = z28.object({
1169
- type: z28.literal("tool_result"),
1170
- toolUseId: z28.string(),
1171
- content: z28.string(),
1172
- isError: z28.boolean().optional()
1173
- });
1174
- var ConversationBlockSchema = z28.discriminatedUnion("type", [
1211
+ import { z as z29 } from "zod";
1212
+ var TextBlockSchema = z29.object({
1213
+ type: z29.literal("text"),
1214
+ text: z29.string()
1215
+ });
1216
+ var ThinkingBlockSchema = z29.object({
1217
+ type: z29.literal("thinking"),
1218
+ thinking: z29.string()
1219
+ });
1220
+ var ToolUseBlockSchema = z29.object({
1221
+ type: z29.literal("tool_use"),
1222
+ toolName: z29.string(),
1223
+ toolId: z29.string(),
1224
+ input: z29.unknown()
1225
+ });
1226
+ var ToolResultBlockSchema = z29.object({
1227
+ type: z29.literal("tool_result"),
1228
+ toolUseId: z29.string(),
1229
+ content: z29.string(),
1230
+ isError: z29.boolean().optional()
1231
+ });
1232
+ var ConversationBlockSchema = z29.discriminatedUnion("type", [
1175
1233
  TextBlockSchema,
1176
1234
  ThinkingBlockSchema,
1177
1235
  ToolUseBlockSchema,
@@ -1182,18 +1240,18 @@ var ConversationMessageRoles = [
1182
1240
  "user",
1183
1241
  "system"
1184
1242
  ];
1185
- var ConversationMessageSchema = z28.object({
1186
- role: z28.enum(ConversationMessageRoles),
1187
- content: z28.array(ConversationBlockSchema),
1188
- timestamp: z28.string()
1243
+ var ConversationMessageSchema = z29.object({
1244
+ role: z29.enum(ConversationMessageRoles),
1245
+ content: z29.array(ConversationBlockSchema),
1246
+ timestamp: z29.string()
1189
1247
  });
1190
- var ScenarioConversationSchema = z28.object({
1191
- id: z28.string(),
1192
- projectId: z28.string(),
1193
- evalRunId: z28.string(),
1194
- resultId: z28.string(),
1195
- messages: z28.array(ConversationMessageSchema),
1196
- createdAt: z28.string()
1248
+ var ScenarioConversationSchema = z29.object({
1249
+ id: z29.string(),
1250
+ projectId: z29.string(),
1251
+ evalRunId: z29.string(),
1252
+ resultId: z29.string(),
1253
+ messages: z29.array(ConversationMessageSchema),
1254
+ createdAt: z29.string()
1197
1255
  });
1198
1256
 
1199
1257
  // src/evaluation/eval-result.ts
@@ -1204,94 +1262,96 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
1204
1262
  AssertionResultStatus2["ERROR"] = "error";
1205
1263
  return AssertionResultStatus2;
1206
1264
  })(AssertionResultStatus || {});
1207
- var AssertionResultSchema = z29.object({
1208
- id: z29.string(),
1209
- assertionId: z29.string(),
1210
- assertionType: z29.string(),
1211
- assertionName: z29.string(),
1212
- status: z29.enum(AssertionResultStatus),
1213
- message: z29.string().optional(),
1214
- expected: z29.string().optional(),
1215
- actual: z29.string().optional(),
1216
- duration: z29.number().optional(),
1217
- details: z29.record(z29.string(), z29.unknown()).optional(),
1218
- llmTraceSteps: z29.array(LLMTraceStepSchema).optional()
1219
- });
1220
- var EvalRunResultSchema = z29.object({
1221
- id: z29.string(),
1222
- targetId: z29.string(),
1223
- targetName: z29.string().optional(),
1265
+ var AssertionResultSchema = z30.object({
1266
+ id: z30.string(),
1267
+ assertionId: z30.string(),
1268
+ assertionType: z30.string(),
1269
+ assertionName: z30.string(),
1270
+ status: z30.enum(AssertionResultStatus),
1271
+ message: z30.string().optional(),
1272
+ expected: z30.string().optional(),
1273
+ actual: z30.string().optional(),
1274
+ duration: z30.number().optional(),
1275
+ details: z30.record(z30.string(), z30.unknown()).optional(),
1276
+ llmTraceSteps: z30.array(LLMTraceStepSchema).optional()
1277
+ });
1278
+ var EvalRunResultSchema = z30.object({
1279
+ id: z30.string(),
1280
+ targetId: z30.string(),
1281
+ targetName: z30.string().optional(),
1224
1282
  /** SkillVersion ID used for this evaluation (for version tracking) */
1225
- skillVersionId: z29.string().optional(),
1283
+ skillVersionId: z30.string().optional(),
1226
1284
  /** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
1227
- skillVersion: z29.string().optional(),
1228
- scenarioId: z29.string(),
1229
- scenarioName: z29.string(),
1285
+ skillVersion: z30.string().optional(),
1286
+ scenarioId: z30.string(),
1287
+ scenarioName: z30.string(),
1230
1288
  modelConfig: ModelConfigSchema.optional(),
1231
- assertionResults: z29.array(AssertionResultSchema),
1289
+ assertionResults: z30.array(AssertionResultSchema),
1232
1290
  metrics: EvalMetricsSchema.optional(),
1233
- passed: z29.number(),
1234
- failed: z29.number(),
1235
- passRate: z29.number(),
1236
- duration: z29.number(),
1237
- outputText: z29.string().optional(),
1238
- files: z29.array(ExpectedFileSchema).optional(),
1239
- fileDiffs: z29.array(DiffContentSchema).optional(),
1291
+ passed: z30.number(),
1292
+ failed: z30.number(),
1293
+ passRate: z30.number(),
1294
+ duration: z30.number(),
1295
+ outputText: z30.string().optional(),
1296
+ files: z30.array(ExpectedFileSchema).optional(),
1297
+ fileDiffs: z30.array(DiffContentSchema).optional(),
1240
1298
  /** Full template files after execution with status indicators */
1241
- templateFiles: z29.array(TemplateFileSchema).optional(),
1242
- startedAt: z29.string().optional(),
1243
- completedAt: z29.string().optional(),
1299
+ templateFiles: z30.array(TemplateFileSchema).optional(),
1300
+ startedAt: z30.string().optional(),
1301
+ completedAt: z30.string().optional(),
1244
1302
  llmTrace: LLMTraceSchema.optional(),
1245
1303
  /** Full conversation messages (only present in transit; stripped before DB storage) */
1246
- conversation: z29.array(ConversationMessageSchema).optional()
1247
- });
1248
- var PromptResultSchema = z29.object({
1249
- text: z29.string(),
1250
- files: z29.array(z29.unknown()).optional(),
1251
- finishReason: z29.string().optional(),
1252
- reasoning: z29.string().optional(),
1253
- reasoningDetails: z29.unknown().optional(),
1254
- toolCalls: z29.array(z29.unknown()).optional(),
1255
- toolResults: z29.array(z29.unknown()).optional(),
1256
- warnings: z29.array(z29.unknown()).optional(),
1257
- sources: z29.array(z29.unknown()).optional(),
1258
- steps: z29.array(z29.unknown()),
1259
- generationTimeMs: z29.number(),
1260
- prompt: z29.string(),
1261
- systemPrompt: z29.string(),
1262
- usage: z29.object({
1263
- totalTokens: z29.number().optional(),
1264
- totalMicrocentsSpent: z29.number().optional()
1304
+ conversation: z30.array(ConversationMessageSchema).optional(),
1305
+ /** 0-based iteration index when a scenario is run multiple times within a single eval run */
1306
+ iterationIndex: z30.number().int().min(0).optional()
1307
+ });
1308
+ var PromptResultSchema = z30.object({
1309
+ text: z30.string(),
1310
+ files: z30.array(z30.unknown()).optional(),
1311
+ finishReason: z30.string().optional(),
1312
+ reasoning: z30.string().optional(),
1313
+ reasoningDetails: z30.unknown().optional(),
1314
+ toolCalls: z30.array(z30.unknown()).optional(),
1315
+ toolResults: z30.array(z30.unknown()).optional(),
1316
+ warnings: z30.array(z30.unknown()).optional(),
1317
+ sources: z30.array(z30.unknown()).optional(),
1318
+ steps: z30.array(z30.unknown()),
1319
+ generationTimeMs: z30.number(),
1320
+ prompt: z30.string(),
1321
+ systemPrompt: z30.string(),
1322
+ usage: z30.object({
1323
+ totalTokens: z30.number().optional(),
1324
+ totalMicrocentsSpent: z30.number().optional()
1265
1325
  })
1266
1326
  });
1267
- var EvaluationResultSchema = z29.object({
1268
- id: z29.string(),
1269
- runId: z29.string(),
1270
- timestamp: z29.number(),
1327
+ var EvaluationResultSchema = z30.object({
1328
+ id: z30.string(),
1329
+ runId: z30.string(),
1330
+ timestamp: z30.number(),
1271
1331
  promptResult: PromptResultSchema,
1272
- testResults: z29.array(z29.unknown()),
1273
- tags: z29.array(z29.string()).optional(),
1274
- feedback: z29.string().optional(),
1275
- score: z29.number(),
1276
- suiteId: z29.string().optional()
1277
- });
1278
- var LeanEvaluationResultSchema = z29.object({
1279
- id: z29.string(),
1280
- runId: z29.string(),
1281
- timestamp: z29.number(),
1282
- tags: z29.array(z29.string()).optional(),
1283
- scenarioId: z29.string(),
1284
- scenarioVersion: z29.number().optional(),
1285
- targetId: z29.string(),
1286
- targetVersion: z29.number().optional(),
1287
- suiteId: z29.string().optional(),
1288
- score: z29.number(),
1289
- time: z29.number().optional(),
1290
- microcentsSpent: z29.number().optional()
1332
+ testResults: z30.array(z30.unknown()),
1333
+ tags: z30.array(z30.string()).optional(),
1334
+ feedback: z30.string().optional(),
1335
+ score: z30.number(),
1336
+ suiteId: z30.string().optional()
1337
+ });
1338
+ var LeanEvaluationResultSchema = z30.object({
1339
+ id: z30.string(),
1340
+ runId: z30.string(),
1341
+ timestamp: z30.number(),
1342
+ tags: z30.array(z30.string()).optional(),
1343
+ scenarioId: z30.string(),
1344
+ scenarioVersion: z30.number().optional(),
1345
+ targetId: z30.string(),
1346
+ targetVersion: z30.number().optional(),
1347
+ suiteId: z30.string().optional(),
1348
+ score: z30.number(),
1349
+ time: z30.number().optional(),
1350
+ microcentsSpent: z30.number().optional()
1291
1351
  });
1292
1352
 
1293
1353
  // src/evaluation/eval-run-folder.ts
1294
- import { z as z30 } from "zod";
1354
+ import { z as z31 } from "zod";
1295
1355
  var EvalRunFolderSchema = TenantEntitySchema.extend({});
1296
1356
  var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1297
1357
  id: true,
@@ -1305,26 +1365,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1305
1365
  updatedAt: true,
1306
1366
  deleted: true
1307
1367
  }).partial();
1308
- var EvalRunFolderMembershipSchema = z30.object({
1309
- folderId: z30.string(),
1310
- evalRunId: z30.string(),
1311
- projectId: z30.string(),
1312
- createdAt: z30.string()
1368
+ var EvalRunFolderMembershipSchema = z31.object({
1369
+ folderId: z31.string(),
1370
+ evalRunId: z31.string(),
1371
+ projectId: z31.string(),
1372
+ createdAt: z31.string()
1313
1373
  });
1314
1374
 
1315
1375
  // src/project/project.ts
1316
- import { z as z31 } from "zod";
1376
+ import { z as z32 } from "zod";
1317
1377
  var ProjectSchema = BaseEntitySchema.extend({
1318
- appId: z31.string().optional().describe("The ID of the app in Dev Center"),
1319
- scenarioTags: z31.array(z31.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1378
+ appId: z32.string().optional().describe("The ID of the app in Dev Center"),
1379
+ scenarioTags: z32.array(z32.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1320
1380
  /** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
1321
- wixAuthToken: z31.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1381
+ wixAuthToken: z32.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1322
1382
  /** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
1323
- base44AuthFile: z31.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1383
+ base44AuthFile: z32.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1324
1384
  /** Resolved at runtime from the encrypted Wix auth token */
1325
- wixAuthEmail: z31.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1385
+ wixAuthEmail: z32.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1326
1386
  /** Resolved at runtime from the encrypted Base44 auth file */
1327
- base44AuthEmail: z31.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1387
+ base44AuthEmail: z32.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1328
1388
  });
1329
1389
  var CreateProjectInputSchema = ProjectSchema.omit({
1330
1390
  id: true,
@@ -1350,7 +1410,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
1350
1410
  var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1351
1411
 
1352
1412
  // src/schedule/eval-schedule.ts
1353
- import { z as z32 } from "zod";
1413
+ import { z as z33 } from "zod";
1354
1414
  var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1355
1415
  FrequencyType2["DAILY"] = "daily";
1356
1416
  FrequencyType2["WEEKDAY"] = "weekday";
@@ -1360,29 +1420,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1360
1420
  })(FrequencyType || {});
1361
1421
  var EvalScheduleSchema = TenantEntitySchema.extend({
1362
1422
  /** Whether the schedule is active */
1363
- enabled: z32.boolean(),
1423
+ enabled: z33.boolean(),
1364
1424
  /** Test suite to run */
1365
- suiteId: z32.string(),
1425
+ suiteId: z33.string(),
1366
1426
  /** Preset that provides agent + entities for this schedule */
1367
- presetId: z32.string(),
1427
+ presetId: z33.string(),
1368
1428
  /** How often to run */
1369
- frequencyType: z32.nativeEnum(FrequencyType),
1429
+ frequencyType: z33.nativeEnum(FrequencyType),
1370
1430
  /** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
1371
- timeOfDay: z32.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1431
+ timeOfDay: z33.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1372
1432
  /** Day of week (0=Sun, 6=Sat) for weekly schedules */
1373
- dayOfWeek: z32.number().min(0).max(6).optional(),
1433
+ dayOfWeek: z33.number().min(0).max(6).optional(),
1374
1434
  /** Day of month (1-31) for monthly schedules */
1375
- dayOfMonth: z32.number().min(1).max(31).optional(),
1435
+ dayOfMonth: z33.number().min(1).max(31).optional(),
1376
1436
  /** IANA timezone (e.g., 'America/New_York') */
1377
- timezone: z32.string(),
1437
+ timezone: z33.string(),
1378
1438
  /** ID of the last eval run created by this schedule */
1379
- lastRunId: z32.string().optional(),
1439
+ lastRunId: z33.string().optional(),
1380
1440
  /** Denormalized status of the last run */
1381
- lastRunStatus: z32.string().optional(),
1441
+ lastRunStatus: z33.string().optional(),
1382
1442
  /** ISO timestamp of the last run */
1383
- lastRunAt: z32.string().optional(),
1443
+ lastRunAt: z33.string().optional(),
1384
1444
  /** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
1385
- nextRunAt: z32.string().optional()
1445
+ nextRunAt: z33.string().optional()
1386
1446
  });
1387
1447
  function isValidTimezone(tz) {
1388
1448
  try {
@@ -1395,14 +1455,14 @@ function isValidTimezone(tz) {
1395
1455
  function validateScheduleFields(data, ctx, options) {
1396
1456
  if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
1397
1457
  ctx.addIssue({
1398
- code: z32.ZodIssueCode.custom,
1458
+ code: z33.ZodIssueCode.custom,
1399
1459
  message: "dayOfWeek is required for weekly schedules",
1400
1460
  path: ["dayOfWeek"]
1401
1461
  });
1402
1462
  }
1403
1463
  if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
1404
1464
  ctx.addIssue({
1405
- code: z32.ZodIssueCode.custom,
1465
+ code: z33.ZodIssueCode.custom,
1406
1466
  message: "dayOfMonth is required for monthly schedules",
1407
1467
  path: ["dayOfMonth"]
1408
1468
  });
@@ -1410,7 +1470,7 @@ function validateScheduleFields(data, ctx, options) {
1410
1470
  const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
1411
1471
  if (shouldValidateTz && !isValidTimezone(data.timezone)) {
1412
1472
  ctx.addIssue({
1413
- code: z32.ZodIssueCode.custom,
1473
+ code: z33.ZodIssueCode.custom,
1414
1474
  message: "Invalid IANA timezone",
1415
1475
  path: ["timezone"]
1416
1476
  });
@@ -1677,8 +1737,15 @@ export {
1677
1737
  AssertionResultStatus,
1678
1738
  AssertionSchema,
1679
1739
  AssertionTypeSchema,
1740
+ BATCH_IMPORT_LIMITS,
1680
1741
  BaseEntitySchema,
1681
1742
  BaseTestSchema,
1743
+ BatchAssertionLinkSchema,
1744
+ BatchImportPayloadSchema,
1745
+ BatchImportResponseSchema,
1746
+ BatchResultItemSchema,
1747
+ BatchScenarioEntrySchema,
1748
+ BatchSummarySchema,
1682
1749
  BuildCheckTestSchema,
1683
1750
  BuildPassedAssertionSchema,
1684
1751
  BuildPassedConfigSchema,
@@ -1821,11 +1888,13 @@ export {
1821
1888
  UpdateTestScenarioInputSchema,
1822
1889
  UpdateTestSuiteInputSchema,
1823
1890
  VitestTestSchema,
1891
+ classifyAssertionRef,
1824
1892
  formatTraceEventLine,
1825
1893
  getSystemAssertion,
1826
1894
  getSystemAssertions,
1827
1895
  isSystemAssertionId,
1828
1896
  isValidSkillFolderName,
1897
+ normalizeBatchAssertionLink,
1829
1898
  normalizeModelId,
1830
1899
  parseTraceEventLine,
1831
1900
  validateAssertionConfig