@wix/evalforge-types 0.65.0 → 0.66.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -797,11 +797,67 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
797
797
  });
798
798
  var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
799
799
 
800
- // src/suite/test-suite.ts
800
+ // src/scenario/batch-import.ts
801
801
  import { z as z24 } from "zod";
802
+ var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
803
+ var BatchAssertionLinkSchema = z24.union([
804
+ z24.string().min(1),
805
+ ScenarioAssertionLinkSchema
806
+ ]);
807
+ var BatchScenarioEntrySchema = z24.object({
808
+ name: z24.string().min(1, "name: Required"),
809
+ description: z24.string().optional().default(""),
810
+ triggerPrompt: z24.string().min(10, "triggerPrompt: Must be at least 10 characters"),
811
+ templateId: z24.string().nullish(),
812
+ tags: z24.array(z24.string()).optional(),
813
+ assertionLinks: z24.array(BatchAssertionLinkSchema).optional()
814
+ });
815
+ var BatchImportPayloadSchema = z24.object({
816
+ scenarios: z24.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
817
+ });
818
+ var BATCH_IMPORT_LIMITS = {
819
+ MAX_SCENARIOS: 100,
820
+ MAX_PAYLOAD_BYTES: 1048576
821
+ // 1 MB
822
+ };
823
+ function classifyAssertionRef(ref) {
824
+ if (ref.startsWith("system:")) {
825
+ return { type: "system", value: ref };
826
+ }
827
+ if (UUID_REGEX.test(ref)) {
828
+ return { type: "uuid", value: ref };
829
+ }
830
+ return { type: "name", value: ref };
831
+ }
832
+ function normalizeBatchAssertionLink(link) {
833
+ if (typeof link === "string") {
834
+ return { assertionId: link };
835
+ }
836
+ return link;
837
+ }
838
+ var BatchResultItemSchema = z24.object({
839
+ index: z24.number(),
840
+ name: z24.string(),
841
+ status: z24.enum(["valid", "invalid"]),
842
+ id: z24.string().nullable().optional(),
843
+ errors: z24.array(z24.string()).optional()
844
+ });
845
+ var BatchSummarySchema = z24.object({
846
+ total: z24.number(),
847
+ valid: z24.number(),
848
+ invalid: z24.number(),
849
+ created: z24.number()
850
+ });
851
+ var BatchImportResponseSchema = z24.object({
852
+ summary: BatchSummarySchema,
853
+ results: z24.array(BatchResultItemSchema)
854
+ });
855
+
856
+ // src/suite/test-suite.ts
857
+ import { z as z25 } from "zod";
802
858
  var TestSuiteSchema = TenantEntitySchema.extend({
803
859
  /** IDs of test scenarios in this suite */
804
- scenarioIds: z24.array(z24.string())
860
+ scenarioIds: z25.array(z25.string())
805
861
  });
806
862
  var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
807
863
  id: true,
@@ -812,21 +868,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
812
868
  var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
813
869
 
814
870
  // src/evaluation/metrics.ts
815
- import { z as z25 } from "zod";
816
- var TokenUsageSchema = z25.object({
817
- prompt: z25.number(),
818
- completion: z25.number(),
819
- total: z25.number()
820
- });
821
- var EvalMetricsSchema = z25.object({
822
- totalAssertions: z25.number(),
823
- passed: z25.number(),
824
- failed: z25.number(),
825
- skipped: z25.number(),
826
- errors: z25.number(),
827
- passRate: z25.number(),
828
- avgDuration: z25.number(),
829
- totalDuration: z25.number()
871
+ import { z as z26 } from "zod";
872
+ var TokenUsageSchema = z26.object({
873
+ prompt: z26.number(),
874
+ completion: z26.number(),
875
+ total: z26.number()
876
+ });
877
+ var EvalMetricsSchema = z26.object({
878
+ totalAssertions: z26.number(),
879
+ passed: z26.number(),
880
+ failed: z26.number(),
881
+ skipped: z26.number(),
882
+ errors: z26.number(),
883
+ passRate: z26.number(),
884
+ avgDuration: z26.number(),
885
+ totalDuration: z26.number()
830
886
  });
831
887
  var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
832
888
  EvalStatus2["PENDING"] = "pending";
@@ -836,7 +892,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
836
892
  EvalStatus2["CANCELLED"] = "cancelled";
837
893
  return EvalStatus2;
838
894
  })(EvalStatus || {});
839
- var EvalStatusSchema = z25.enum(EvalStatus);
895
+ var EvalStatusSchema = z26.enum(EvalStatus);
840
896
  var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
841
897
  LLMStepType2["COMPLETION"] = "completion";
842
898
  LLMStepType2["TOOL_USE"] = "tool_use";
@@ -844,54 +900,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
844
900
  LLMStepType2["THINKING"] = "thinking";
845
901
  return LLMStepType2;
846
902
  })(LLMStepType || {});
847
- var LLMTraceStepSchema = z25.object({
848
- id: z25.string(),
849
- stepNumber: z25.number(),
850
- type: z25.enum(LLMStepType),
851
- model: z25.string(),
852
- provider: z25.string(),
853
- startedAt: z25.string(),
854
- durationMs: z25.number(),
903
+ var LLMTraceStepSchema = z26.object({
904
+ id: z26.string(),
905
+ stepNumber: z26.number(),
906
+ type: z26.enum(LLMStepType),
907
+ model: z26.string(),
908
+ provider: z26.string(),
909
+ startedAt: z26.string(),
910
+ durationMs: z26.number(),
855
911
  tokenUsage: TokenUsageSchema,
856
- costUsd: z25.number(),
857
- toolName: z25.string().optional(),
858
- toolArguments: z25.string().optional(),
859
- inputPreview: z25.string().optional(),
860
- outputPreview: z25.string().optional(),
861
- success: z25.boolean(),
862
- error: z25.string().optional(),
863
- turnIndex: z25.number().optional()
864
- });
865
- var LLMBreakdownStatsSchema = z25.object({
866
- count: z25.number(),
867
- durationMs: z25.number(),
868
- tokens: z25.number(),
869
- costUsd: z25.number()
870
- });
871
- var LLMTraceSummarySchema = z25.object({
872
- totalSteps: z25.number(),
873
- totalTurns: z25.number().optional(),
874
- totalDurationMs: z25.number(),
912
+ costUsd: z26.number(),
913
+ toolName: z26.string().optional(),
914
+ toolArguments: z26.string().optional(),
915
+ inputPreview: z26.string().optional(),
916
+ outputPreview: z26.string().optional(),
917
+ success: z26.boolean(),
918
+ error: z26.string().optional(),
919
+ turnIndex: z26.number().optional()
920
+ });
921
+ var LLMBreakdownStatsSchema = z26.object({
922
+ count: z26.number(),
923
+ durationMs: z26.number(),
924
+ tokens: z26.number(),
925
+ costUsd: z26.number()
926
+ });
927
+ var LLMTraceSummarySchema = z26.object({
928
+ totalSteps: z26.number(),
929
+ totalTurns: z26.number().optional(),
930
+ totalDurationMs: z26.number(),
875
931
  totalTokens: TokenUsageSchema,
876
- totalCostUsd: z25.number(),
877
- stepTypeBreakdown: z25.record(z25.string(), LLMBreakdownStatsSchema).optional(),
878
- modelBreakdown: z25.record(z25.string(), LLMBreakdownStatsSchema),
879
- modelsUsed: z25.array(z25.string())
880
- });
881
- var LLMTraceSchema = z25.object({
882
- id: z25.string(),
883
- steps: z25.array(LLMTraceStepSchema),
932
+ totalCostUsd: z26.number(),
933
+ stepTypeBreakdown: z26.record(z26.string(), LLMBreakdownStatsSchema).optional(),
934
+ modelBreakdown: z26.record(z26.string(), LLMBreakdownStatsSchema),
935
+ modelsUsed: z26.array(z26.string())
936
+ });
937
+ var LLMTraceSchema = z26.object({
938
+ id: z26.string(),
939
+ steps: z26.array(LLMTraceStepSchema),
884
940
  summary: LLMTraceSummarySchema
885
941
  });
886
942
 
887
943
  // src/evaluation/eval-result.ts
888
- import { z as z29 } from "zod";
944
+ import { z as z30 } from "zod";
889
945
 
890
946
  // src/evaluation/eval-run.ts
891
- import { z as z27 } from "zod";
947
+ import { z as z28 } from "zod";
892
948
 
893
949
  // src/evaluation/live-trace.ts
894
- import { z as z26 } from "zod";
950
+ import { z as z27 } from "zod";
895
951
  var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
896
952
  LiveTraceEventType2["THINKING"] = "thinking";
897
953
  LiveTraceEventType2["TOOL_USE"] = "tool_use";
@@ -905,37 +961,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
905
961
  LiveTraceEventType2["USER"] = "user";
906
962
  return LiveTraceEventType2;
907
963
  })(LiveTraceEventType || {});
908
- var LiveTraceEventSchema = z26.object({
964
+ var LiveTraceEventSchema = z27.object({
909
965
  /** The evaluation run ID */
910
- evalRunId: z26.string(),
966
+ evalRunId: z27.string(),
911
967
  /** The scenario ID being executed */
912
- scenarioId: z26.string(),
968
+ scenarioId: z27.string(),
913
969
  /** The scenario name for display */
914
- scenarioName: z26.string(),
970
+ scenarioName: z27.string(),
915
971
  /** The target ID (skill, agent, etc.) */
916
- targetId: z26.string(),
972
+ targetId: z27.string(),
917
973
  /** The target name for display */
918
- targetName: z26.string(),
974
+ targetName: z27.string(),
919
975
  /** Step number in the current scenario execution */
920
- stepNumber: z26.number(),
976
+ stepNumber: z27.number(),
921
977
  /** Type of trace event */
922
- type: z26.enum(LiveTraceEventType),
978
+ type: z27.enum(LiveTraceEventType),
923
979
  /** Tool name if this is a tool_use event */
924
- toolName: z26.string().optional(),
980
+ toolName: z27.string().optional(),
925
981
  /** Tool arguments preview (truncated JSON) */
926
- toolArgs: z26.string().optional(),
982
+ toolArgs: z27.string().optional(),
927
983
  /** Output preview (truncated text) */
928
- outputPreview: z26.string().optional(),
984
+ outputPreview: z27.string().optional(),
929
985
  /** File path for file operations */
930
- filePath: z26.string().optional(),
986
+ filePath: z27.string().optional(),
931
987
  /** Elapsed time in milliseconds for progress events */
932
- elapsedMs: z26.number().optional(),
988
+ elapsedMs: z27.number().optional(),
933
989
  /** Thinking/reasoning text from Claude */
934
- thinking: z26.string().optional(),
990
+ thinking: z27.string().optional(),
935
991
  /** Timestamp when this event occurred */
936
- timestamp: z26.string(),
992
+ timestamp: z27.string(),
937
993
  /** Whether this is the final event for this scenario */
938
- isComplete: z26.boolean()
994
+ isComplete: z27.boolean()
939
995
  });
940
996
  var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
941
997
  function parseTraceEventLine(line) {
@@ -964,15 +1020,15 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
964
1020
  TriggerType2["SCHEDULED"] = "SCHEDULED";
965
1021
  return TriggerType2;
966
1022
  })(TriggerType || {});
967
- var TriggerMetadataSchema = z27.object({
968
- version: z27.string().optional(),
969
- resourceUpdated: z27.array(z27.string()).optional(),
970
- scheduleId: z27.string().optional()
1023
+ var TriggerMetadataSchema = z28.object({
1024
+ version: z28.string().optional(),
1025
+ resourceUpdated: z28.array(z28.string()).optional(),
1026
+ scheduleId: z28.string().optional()
971
1027
  });
972
- var TriggerSchema = z27.object({
973
- id: z27.string(),
1028
+ var TriggerSchema = z28.object({
1029
+ id: z28.string(),
974
1030
  metadata: TriggerMetadataSchema.optional(),
975
- type: z27.nativeEnum(TriggerType)
1031
+ type: z28.nativeEnum(TriggerType)
976
1032
  });
977
1033
  var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
978
1034
  FailureCategory2["MISSING_FILE"] = "missing_file";
@@ -990,30 +1046,30 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
990
1046
  FailureSeverity2["LOW"] = "low";
991
1047
  return FailureSeverity2;
992
1048
  })(FailureSeverity || {});
993
- var DiffLineTypeSchema = z27.enum(["added", "removed", "unchanged"]);
994
- var DiffLineSchema = z27.object({
1049
+ var DiffLineTypeSchema = z28.enum(["added", "removed", "unchanged"]);
1050
+ var DiffLineSchema = z28.object({
995
1051
  type: DiffLineTypeSchema,
996
- content: z27.string(),
997
- lineNumber: z27.number()
998
- });
999
- var DiffContentSchema = z27.object({
1000
- path: z27.string(),
1001
- expected: z27.string(),
1002
- actual: z27.string(),
1003
- diffLines: z27.array(DiffLineSchema),
1004
- renamedFrom: z27.string().optional(),
1052
+ content: z28.string(),
1053
+ lineNumber: z28.number()
1054
+ });
1055
+ var DiffContentSchema = z28.object({
1056
+ path: z28.string(),
1057
+ expected: z28.string(),
1058
+ actual: z28.string(),
1059
+ diffLines: z28.array(DiffLineSchema),
1060
+ renamedFrom: z28.string().optional(),
1005
1061
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1006
- isInfrastructure: z27.boolean().optional()
1062
+ isInfrastructure: z28.boolean().optional()
1007
1063
  });
1008
- var CommandExecutionSchema = z27.object({
1009
- command: z27.string(),
1010
- exitCode: z27.number(),
1011
- output: z27.string().optional(),
1012
- duration: z27.number()
1064
+ var CommandExecutionSchema = z28.object({
1065
+ command: z28.string(),
1066
+ exitCode: z28.number(),
1067
+ output: z28.string().optional(),
1068
+ duration: z28.number()
1013
1069
  });
1014
- var FileModificationSchema = z27.object({
1015
- path: z27.string(),
1016
- action: z27.enum(["created", "modified", "deleted"])
1070
+ var FileModificationSchema = z28.object({
1071
+ path: z28.string(),
1072
+ action: z28.enum(["created", "modified", "deleted"])
1017
1073
  });
1018
1074
  var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1019
1075
  TemplateFileStatus2["NEW"] = "new";
@@ -1021,89 +1077,89 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
1021
1077
  TemplateFileStatus2["UNCHANGED"] = "unchanged";
1022
1078
  return TemplateFileStatus2;
1023
1079
  })(TemplateFileStatus || {});
1024
- var TemplateFileSchema = z27.object({
1080
+ var TemplateFileSchema = z28.object({
1025
1081
  /** Relative path within the template */
1026
- path: z27.string(),
1082
+ path: z28.string(),
1027
1083
  /** Full file content after execution */
1028
- content: z27.string(),
1084
+ content: z28.string(),
1029
1085
  /** File status (new, modified, unchanged) */
1030
- status: z27.enum(["new", "modified", "unchanged"]),
1086
+ status: z28.enum(["new", "modified", "unchanged"]),
1031
1087
  /** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
1032
- isInfrastructure: z27.boolean().optional()
1033
- });
1034
- var ApiCallSchema = z27.object({
1035
- endpoint: z27.string(),
1036
- tokensUsed: z27.number(),
1037
- duration: z27.number()
1038
- });
1039
- var ExecutionTraceSchema = z27.object({
1040
- commands: z27.array(CommandExecutionSchema),
1041
- filesModified: z27.array(FileModificationSchema),
1042
- apiCalls: z27.array(ApiCallSchema),
1043
- totalDuration: z27.number()
1044
- });
1045
- var FailureAnalysisSchema = z27.object({
1046
- category: z27.enum(FailureCategory),
1047
- severity: z27.enum(FailureSeverity),
1048
- summary: z27.string(),
1049
- details: z27.string(),
1050
- rootCause: z27.string(),
1051
- suggestedFix: z27.string(),
1052
- relatedAssertions: z27.array(z27.string()),
1053
- codeSnippet: z27.string().optional(),
1054
- similarIssues: z27.array(z27.string()).optional(),
1055
- patternId: z27.string().optional(),
1088
+ isInfrastructure: z28.boolean().optional()
1089
+ });
1090
+ var ApiCallSchema = z28.object({
1091
+ endpoint: z28.string(),
1092
+ tokensUsed: z28.number(),
1093
+ duration: z28.number()
1094
+ });
1095
+ var ExecutionTraceSchema = z28.object({
1096
+ commands: z28.array(CommandExecutionSchema),
1097
+ filesModified: z28.array(FileModificationSchema),
1098
+ apiCalls: z28.array(ApiCallSchema),
1099
+ totalDuration: z28.number()
1100
+ });
1101
+ var FailureAnalysisSchema = z28.object({
1102
+ category: z28.enum(FailureCategory),
1103
+ severity: z28.enum(FailureSeverity),
1104
+ summary: z28.string(),
1105
+ details: z28.string(),
1106
+ rootCause: z28.string(),
1107
+ suggestedFix: z28.string(),
1108
+ relatedAssertions: z28.array(z28.string()),
1109
+ codeSnippet: z28.string().optional(),
1110
+ similarIssues: z28.array(z28.string()).optional(),
1111
+ patternId: z28.string().optional(),
1056
1112
  // Extended fields for detailed debugging
1057
1113
  diff: DiffContentSchema.optional(),
1058
1114
  executionTrace: ExecutionTraceSchema.optional()
1059
1115
  });
1060
1116
  var EvalRunSchema = TenantEntitySchema.extend({
1061
1117
  /** Agent ID for this run */
1062
- agentId: z27.string().optional(),
1118
+ agentId: z28.string().optional(),
1063
1119
  /** Preset ID that originated this run (optional) */
1064
- presetId: z27.string().optional(),
1120
+ presetId: z28.string().optional(),
1065
1121
  /** Skill IDs for this run */
1066
- skillIds: z27.array(z27.string()).optional(),
1122
+ skillIds: z28.array(z28.string()).optional(),
1067
1123
  /** Map of skillId to skillVersionId for this run */
1068
- skillVersions: z27.record(z27.string(), z27.string()).optional(),
1124
+ skillVersions: z28.record(z28.string(), z28.string()).optional(),
1069
1125
  /** Scenario IDs to run (always present — resolved server-side from tags when needed) */
1070
- scenarioIds: z27.array(z27.string()),
1126
+ scenarioIds: z28.array(z28.string()),
1071
1127
  /** Current status */
1072
1128
  status: EvalStatusSchema,
1073
1129
  /** Progress percentage (0-100) */
1074
- progress: z27.number(),
1130
+ progress: z28.number(),
1075
1131
  /** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
1076
- results: z27.array(z27.lazy(() => EvalRunResultSchema)),
1132
+ results: z28.array(z28.lazy(() => EvalRunResultSchema)),
1077
1133
  /** Aggregated metrics across all results */
1078
1134
  aggregateMetrics: EvalMetricsSchema,
1079
1135
  /** Failure analyses */
1080
- failureAnalyses: z27.array(FailureAnalysisSchema).optional(),
1136
+ failureAnalyses: z28.array(FailureAnalysisSchema).optional(),
1081
1137
  /** Aggregated LLM trace summary */
1082
1138
  llmTraceSummary: LLMTraceSummarySchema.optional(),
1083
1139
  /** What triggered this run */
1084
1140
  trigger: TriggerSchema.optional(),
1085
1141
  /** When the run started (set when evaluation is triggered) */
1086
- startedAt: z27.string().optional(),
1142
+ startedAt: z28.string().optional(),
1087
1143
  /** When the run completed */
1088
- completedAt: z27.string().optional(),
1144
+ completedAt: z28.string().optional(),
1089
1145
  /** Live trace events captured during execution (for playback on results page) */
1090
- liveTraceEvents: z27.array(LiveTraceEventSchema).optional(),
1146
+ liveTraceEvents: z28.array(LiveTraceEventSchema).optional(),
1091
1147
  /** Remote job ID for tracking execution in Dev Machines */
1092
- jobId: z27.string().optional(),
1148
+ jobId: z28.string().optional(),
1093
1149
  /** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
1094
- jobStatus: z27.string().optional(),
1150
+ jobStatus: z28.string().optional(),
1095
1151
  /** Remote job error message if the job failed */
1096
- jobError: z27.string().optional(),
1152
+ jobError: z28.string().optional(),
1097
1153
  /** Timestamp of the last job status check */
1098
- jobStatusCheckedAt: z27.string().optional(),
1154
+ jobStatusCheckedAt: z28.string().optional(),
1099
1155
  /** MCP server IDs to enable for this run (optional) */
1100
- mcpIds: z27.array(z27.string()).optional(),
1156
+ mcpIds: z28.array(z28.string()).optional(),
1101
1157
  /** Sub-agent IDs to enable for this run (optional) */
1102
- subAgentIds: z27.array(z27.string()).optional(),
1158
+ subAgentIds: z28.array(z28.string()).optional(),
1103
1159
  /** Rule IDs to enable for this run (optional) */
1104
- ruleIds: z27.array(z27.string()).optional(),
1160
+ ruleIds: z28.array(z28.string()).optional(),
1105
1161
  /** Tags used to select scenarios for this run (for traceability) */
1106
- tags: z27.array(z27.string()).optional()
1162
+ tags: z28.array(z28.string()).optional()
1107
1163
  });
1108
1164
  var CreateEvalRunInputSchema = EvalRunSchema.omit({
1109
1165
  id: true,
@@ -1118,60 +1174,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
1118
1174
  scenarioIds: true
1119
1175
  }).extend({
1120
1176
  /** Optional on input — backend resolves from tags when not provided */
1121
- scenarioIds: z27.array(z27.string()).optional()
1177
+ scenarioIds: z28.array(z28.string()).optional()
1122
1178
  }).refine(
1123
1179
  (data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
1124
1180
  { message: "Either scenarioIds or tags must be provided" }
1125
1181
  );
1126
- var EvaluationProgressSchema = z27.object({
1127
- runId: z27.string(),
1128
- targetId: z27.string(),
1129
- totalScenarios: z27.number(),
1130
- completedScenarios: z27.number(),
1131
- scenarioProgress: z27.array(
1132
- z27.object({
1133
- scenarioId: z27.string(),
1134
- currentStep: z27.string(),
1135
- error: z27.string().optional()
1182
+ var EvaluationProgressSchema = z28.object({
1183
+ runId: z28.string(),
1184
+ targetId: z28.string(),
1185
+ totalScenarios: z28.number(),
1186
+ completedScenarios: z28.number(),
1187
+ scenarioProgress: z28.array(
1188
+ z28.object({
1189
+ scenarioId: z28.string(),
1190
+ currentStep: z28.string(),
1191
+ error: z28.string().optional()
1136
1192
  })
1137
1193
  ),
1138
- createdAt: z27.number()
1139
- });
1140
- var EvaluationLogSchema = z27.object({
1141
- runId: z27.string(),
1142
- scenarioId: z27.string(),
1143
- log: z27.object({
1144
- level: z27.enum(["info", "error", "debug"]),
1145
- message: z27.string().optional(),
1146
- args: z27.array(z27.any()).optional(),
1147
- error: z27.string().optional()
1194
+ createdAt: z28.number()
1195
+ });
1196
+ var EvaluationLogSchema = z28.object({
1197
+ runId: z28.string(),
1198
+ scenarioId: z28.string(),
1199
+ log: z28.object({
1200
+ level: z28.enum(["info", "error", "debug"]),
1201
+ message: z28.string().optional(),
1202
+ args: z28.array(z28.any()).optional(),
1203
+ error: z28.string().optional()
1148
1204
  })
1149
1205
  });
1150
1206
  var LLM_TIMEOUT = 12e4;
1151
1207
 
1152
1208
  // src/evaluation/conversation.ts
1153
- import { z as z28 } from "zod";
1154
- var TextBlockSchema = z28.object({
1155
- type: z28.literal("text"),
1156
- text: z28.string()
1157
- });
1158
- var ThinkingBlockSchema = z28.object({
1159
- type: z28.literal("thinking"),
1160
- thinking: z28.string()
1161
- });
1162
- var ToolUseBlockSchema = z28.object({
1163
- type: z28.literal("tool_use"),
1164
- toolName: z28.string(),
1165
- toolId: z28.string(),
1166
- input: z28.unknown()
1167
- });
1168
- var ToolResultBlockSchema = z28.object({
1169
- type: z28.literal("tool_result"),
1170
- toolUseId: z28.string(),
1171
- content: z28.string(),
1172
- isError: z28.boolean().optional()
1173
- });
1174
- var ConversationBlockSchema = z28.discriminatedUnion("type", [
1209
+ import { z as z29 } from "zod";
1210
+ var TextBlockSchema = z29.object({
1211
+ type: z29.literal("text"),
1212
+ text: z29.string()
1213
+ });
1214
+ var ThinkingBlockSchema = z29.object({
1215
+ type: z29.literal("thinking"),
1216
+ thinking: z29.string()
1217
+ });
1218
+ var ToolUseBlockSchema = z29.object({
1219
+ type: z29.literal("tool_use"),
1220
+ toolName: z29.string(),
1221
+ toolId: z29.string(),
1222
+ input: z29.unknown()
1223
+ });
1224
+ var ToolResultBlockSchema = z29.object({
1225
+ type: z29.literal("tool_result"),
1226
+ toolUseId: z29.string(),
1227
+ content: z29.string(),
1228
+ isError: z29.boolean().optional()
1229
+ });
1230
+ var ConversationBlockSchema = z29.discriminatedUnion("type", [
1175
1231
  TextBlockSchema,
1176
1232
  ThinkingBlockSchema,
1177
1233
  ToolUseBlockSchema,
@@ -1182,18 +1238,18 @@ var ConversationMessageRoles = [
1182
1238
  "user",
1183
1239
  "system"
1184
1240
  ];
1185
- var ConversationMessageSchema = z28.object({
1186
- role: z28.enum(ConversationMessageRoles),
1187
- content: z28.array(ConversationBlockSchema),
1188
- timestamp: z28.string()
1241
+ var ConversationMessageSchema = z29.object({
1242
+ role: z29.enum(ConversationMessageRoles),
1243
+ content: z29.array(ConversationBlockSchema),
1244
+ timestamp: z29.string()
1189
1245
  });
1190
- var ScenarioConversationSchema = z28.object({
1191
- id: z28.string(),
1192
- projectId: z28.string(),
1193
- evalRunId: z28.string(),
1194
- resultId: z28.string(),
1195
- messages: z28.array(ConversationMessageSchema),
1196
- createdAt: z28.string()
1246
+ var ScenarioConversationSchema = z29.object({
1247
+ id: z29.string(),
1248
+ projectId: z29.string(),
1249
+ evalRunId: z29.string(),
1250
+ resultId: z29.string(),
1251
+ messages: z29.array(ConversationMessageSchema),
1252
+ createdAt: z29.string()
1197
1253
  });
1198
1254
 
1199
1255
  // src/evaluation/eval-result.ts
@@ -1204,94 +1260,94 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
1204
1260
  AssertionResultStatus2["ERROR"] = "error";
1205
1261
  return AssertionResultStatus2;
1206
1262
  })(AssertionResultStatus || {});
1207
- var AssertionResultSchema = z29.object({
1208
- id: z29.string(),
1209
- assertionId: z29.string(),
1210
- assertionType: z29.string(),
1211
- assertionName: z29.string(),
1212
- status: z29.enum(AssertionResultStatus),
1213
- message: z29.string().optional(),
1214
- expected: z29.string().optional(),
1215
- actual: z29.string().optional(),
1216
- duration: z29.number().optional(),
1217
- details: z29.record(z29.string(), z29.unknown()).optional(),
1218
- llmTraceSteps: z29.array(LLMTraceStepSchema).optional()
1219
- });
1220
- var EvalRunResultSchema = z29.object({
1221
- id: z29.string(),
1222
- targetId: z29.string(),
1223
- targetName: z29.string().optional(),
1263
+ var AssertionResultSchema = z30.object({
1264
+ id: z30.string(),
1265
+ assertionId: z30.string(),
1266
+ assertionType: z30.string(),
1267
+ assertionName: z30.string(),
1268
+ status: z30.enum(AssertionResultStatus),
1269
+ message: z30.string().optional(),
1270
+ expected: z30.string().optional(),
1271
+ actual: z30.string().optional(),
1272
+ duration: z30.number().optional(),
1273
+ details: z30.record(z30.string(), z30.unknown()).optional(),
1274
+ llmTraceSteps: z30.array(LLMTraceStepSchema).optional()
1275
+ });
1276
+ var EvalRunResultSchema = z30.object({
1277
+ id: z30.string(),
1278
+ targetId: z30.string(),
1279
+ targetName: z30.string().optional(),
1224
1280
  /** SkillVersion ID used for this evaluation (for version tracking) */
1225
- skillVersionId: z29.string().optional(),
1281
+ skillVersionId: z30.string().optional(),
1226
1282
  /** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
1227
- skillVersion: z29.string().optional(),
1228
- scenarioId: z29.string(),
1229
- scenarioName: z29.string(),
1283
+ skillVersion: z30.string().optional(),
1284
+ scenarioId: z30.string(),
1285
+ scenarioName: z30.string(),
1230
1286
  modelConfig: ModelConfigSchema.optional(),
1231
- assertionResults: z29.array(AssertionResultSchema),
1287
+ assertionResults: z30.array(AssertionResultSchema),
1232
1288
  metrics: EvalMetricsSchema.optional(),
1233
- passed: z29.number(),
1234
- failed: z29.number(),
1235
- passRate: z29.number(),
1236
- duration: z29.number(),
1237
- outputText: z29.string().optional(),
1238
- files: z29.array(ExpectedFileSchema).optional(),
1239
- fileDiffs: z29.array(DiffContentSchema).optional(),
1289
+ passed: z30.number(),
1290
+ failed: z30.number(),
1291
+ passRate: z30.number(),
1292
+ duration: z30.number(),
1293
+ outputText: z30.string().optional(),
1294
+ files: z30.array(ExpectedFileSchema).optional(),
1295
+ fileDiffs: z30.array(DiffContentSchema).optional(),
1240
1296
  /** Full template files after execution with status indicators */
1241
- templateFiles: z29.array(TemplateFileSchema).optional(),
1242
- startedAt: z29.string().optional(),
1243
- completedAt: z29.string().optional(),
1297
+ templateFiles: z30.array(TemplateFileSchema).optional(),
1298
+ startedAt: z30.string().optional(),
1299
+ completedAt: z30.string().optional(),
1244
1300
  llmTrace: LLMTraceSchema.optional(),
1245
1301
  /** Full conversation messages (only present in transit; stripped before DB storage) */
1246
- conversation: z29.array(ConversationMessageSchema).optional()
1247
- });
1248
- var PromptResultSchema = z29.object({
1249
- text: z29.string(),
1250
- files: z29.array(z29.unknown()).optional(),
1251
- finishReason: z29.string().optional(),
1252
- reasoning: z29.string().optional(),
1253
- reasoningDetails: z29.unknown().optional(),
1254
- toolCalls: z29.array(z29.unknown()).optional(),
1255
- toolResults: z29.array(z29.unknown()).optional(),
1256
- warnings: z29.array(z29.unknown()).optional(),
1257
- sources: z29.array(z29.unknown()).optional(),
1258
- steps: z29.array(z29.unknown()),
1259
- generationTimeMs: z29.number(),
1260
- prompt: z29.string(),
1261
- systemPrompt: z29.string(),
1262
- usage: z29.object({
1263
- totalTokens: z29.number().optional(),
1264
- totalMicrocentsSpent: z29.number().optional()
1302
+ conversation: z30.array(ConversationMessageSchema).optional()
1303
+ });
1304
+ var PromptResultSchema = z30.object({
1305
+ text: z30.string(),
1306
+ files: z30.array(z30.unknown()).optional(),
1307
+ finishReason: z30.string().optional(),
1308
+ reasoning: z30.string().optional(),
1309
+ reasoningDetails: z30.unknown().optional(),
1310
+ toolCalls: z30.array(z30.unknown()).optional(),
1311
+ toolResults: z30.array(z30.unknown()).optional(),
1312
+ warnings: z30.array(z30.unknown()).optional(),
1313
+ sources: z30.array(z30.unknown()).optional(),
1314
+ steps: z30.array(z30.unknown()),
1315
+ generationTimeMs: z30.number(),
1316
+ prompt: z30.string(),
1317
+ systemPrompt: z30.string(),
1318
+ usage: z30.object({
1319
+ totalTokens: z30.number().optional(),
1320
+ totalMicrocentsSpent: z30.number().optional()
1265
1321
  })
1266
1322
  });
1267
- var EvaluationResultSchema = z29.object({
1268
- id: z29.string(),
1269
- runId: z29.string(),
1270
- timestamp: z29.number(),
1323
+ var EvaluationResultSchema = z30.object({
1324
+ id: z30.string(),
1325
+ runId: z30.string(),
1326
+ timestamp: z30.number(),
1271
1327
  promptResult: PromptResultSchema,
1272
- testResults: z29.array(z29.unknown()),
1273
- tags: z29.array(z29.string()).optional(),
1274
- feedback: z29.string().optional(),
1275
- score: z29.number(),
1276
- suiteId: z29.string().optional()
1277
- });
1278
- var LeanEvaluationResultSchema = z29.object({
1279
- id: z29.string(),
1280
- runId: z29.string(),
1281
- timestamp: z29.number(),
1282
- tags: z29.array(z29.string()).optional(),
1283
- scenarioId: z29.string(),
1284
- scenarioVersion: z29.number().optional(),
1285
- targetId: z29.string(),
1286
- targetVersion: z29.number().optional(),
1287
- suiteId: z29.string().optional(),
1288
- score: z29.number(),
1289
- time: z29.number().optional(),
1290
- microcentsSpent: z29.number().optional()
1328
+ testResults: z30.array(z30.unknown()),
1329
+ tags: z30.array(z30.string()).optional(),
1330
+ feedback: z30.string().optional(),
1331
+ score: z30.number(),
1332
+ suiteId: z30.string().optional()
1333
+ });
1334
+ var LeanEvaluationResultSchema = z30.object({
1335
+ id: z30.string(),
1336
+ runId: z30.string(),
1337
+ timestamp: z30.number(),
1338
+ tags: z30.array(z30.string()).optional(),
1339
+ scenarioId: z30.string(),
1340
+ scenarioVersion: z30.number().optional(),
1341
+ targetId: z30.string(),
1342
+ targetVersion: z30.number().optional(),
1343
+ suiteId: z30.string().optional(),
1344
+ score: z30.number(),
1345
+ time: z30.number().optional(),
1346
+ microcentsSpent: z30.number().optional()
1291
1347
  });
1292
1348
 
1293
1349
  // src/evaluation/eval-run-folder.ts
1294
- import { z as z30 } from "zod";
1350
+ import { z as z31 } from "zod";
1295
1351
  var EvalRunFolderSchema = TenantEntitySchema.extend({});
1296
1352
  var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1297
1353
  id: true,
@@ -1305,26 +1361,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
1305
1361
  updatedAt: true,
1306
1362
  deleted: true
1307
1363
  }).partial();
1308
- var EvalRunFolderMembershipSchema = z30.object({
1309
- folderId: z30.string(),
1310
- evalRunId: z30.string(),
1311
- projectId: z30.string(),
1312
- createdAt: z30.string()
1364
+ var EvalRunFolderMembershipSchema = z31.object({
1365
+ folderId: z31.string(),
1366
+ evalRunId: z31.string(),
1367
+ projectId: z31.string(),
1368
+ createdAt: z31.string()
1313
1369
  });
1314
1370
 
1315
1371
  // src/project/project.ts
1316
- import { z as z31 } from "zod";
1372
+ import { z as z32 } from "zod";
1317
1373
  var ProjectSchema = BaseEntitySchema.extend({
1318
- appId: z31.string().optional().describe("The ID of the app in Dev Center"),
1319
- scenarioTags: z31.array(z31.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1374
+ appId: z32.string().optional().describe("The ID of the app in Dev Center"),
1375
+ scenarioTags: z32.array(z32.string()).optional().describe("Project-level tag vocabulary for scenarios"),
1320
1376
  /** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
1321
- wixAuthToken: z31.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1377
+ wixAuthToken: z32.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
1322
1378
  /** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
1323
- base44AuthFile: z31.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1379
+ base44AuthFile: z32.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
1324
1380
  /** Resolved at runtime from the encrypted Wix auth token */
1325
- wixAuthEmail: z31.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1381
+ wixAuthEmail: z32.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
1326
1382
  /** Resolved at runtime from the encrypted Base44 auth file */
1327
- base44AuthEmail: z31.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1383
+ base44AuthEmail: z32.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
1328
1384
  });
1329
1385
  var CreateProjectInputSchema = ProjectSchema.omit({
1330
1386
  id: true,
@@ -1350,7 +1406,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
1350
1406
  var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
1351
1407
 
1352
1408
  // src/schedule/eval-schedule.ts
1353
- import { z as z32 } from "zod";
1409
+ import { z as z33 } from "zod";
1354
1410
  var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1355
1411
  FrequencyType2["DAILY"] = "daily";
1356
1412
  FrequencyType2["WEEKDAY"] = "weekday";
@@ -1360,29 +1416,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
1360
1416
  })(FrequencyType || {});
1361
1417
  var EvalScheduleSchema = TenantEntitySchema.extend({
1362
1418
  /** Whether the schedule is active */
1363
- enabled: z32.boolean(),
1419
+ enabled: z33.boolean(),
1364
1420
  /** Test suite to run */
1365
- suiteId: z32.string(),
1421
+ suiteId: z33.string(),
1366
1422
  /** Preset that provides agent + entities for this schedule */
1367
- presetId: z32.string(),
1423
+ presetId: z33.string(),
1368
1424
  /** How often to run */
1369
- frequencyType: z32.nativeEnum(FrequencyType),
1425
+ frequencyType: z33.nativeEnum(FrequencyType),
1370
1426
  /** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
1371
- timeOfDay: z32.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1427
+ timeOfDay: z33.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
1372
1428
  /** Day of week (0=Sun, 6=Sat) for weekly schedules */
1373
- dayOfWeek: z32.number().min(0).max(6).optional(),
1429
+ dayOfWeek: z33.number().min(0).max(6).optional(),
1374
1430
  /** Day of month (1-31) for monthly schedules */
1375
- dayOfMonth: z32.number().min(1).max(31).optional(),
1431
+ dayOfMonth: z33.number().min(1).max(31).optional(),
1376
1432
  /** IANA timezone (e.g., 'America/New_York') */
1377
- timezone: z32.string(),
1433
+ timezone: z33.string(),
1378
1434
  /** ID of the last eval run created by this schedule */
1379
- lastRunId: z32.string().optional(),
1435
+ lastRunId: z33.string().optional(),
1380
1436
  /** Denormalized status of the last run */
1381
- lastRunStatus: z32.string().optional(),
1437
+ lastRunStatus: z33.string().optional(),
1382
1438
  /** ISO timestamp of the last run */
1383
- lastRunAt: z32.string().optional(),
1439
+ lastRunAt: z33.string().optional(),
1384
1440
  /** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
1385
- nextRunAt: z32.string().optional()
1441
+ nextRunAt: z33.string().optional()
1386
1442
  });
1387
1443
  function isValidTimezone(tz) {
1388
1444
  try {
@@ -1395,14 +1451,14 @@ function isValidTimezone(tz) {
1395
1451
  function validateScheduleFields(data, ctx, options) {
1396
1452
  if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
1397
1453
  ctx.addIssue({
1398
- code: z32.ZodIssueCode.custom,
1454
+ code: z33.ZodIssueCode.custom,
1399
1455
  message: "dayOfWeek is required for weekly schedules",
1400
1456
  path: ["dayOfWeek"]
1401
1457
  });
1402
1458
  }
1403
1459
  if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
1404
1460
  ctx.addIssue({
1405
- code: z32.ZodIssueCode.custom,
1461
+ code: z33.ZodIssueCode.custom,
1406
1462
  message: "dayOfMonth is required for monthly schedules",
1407
1463
  path: ["dayOfMonth"]
1408
1464
  });
@@ -1410,7 +1466,7 @@ function validateScheduleFields(data, ctx, options) {
1410
1466
  const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
1411
1467
  if (shouldValidateTz && !isValidTimezone(data.timezone)) {
1412
1468
  ctx.addIssue({
1413
- code: z32.ZodIssueCode.custom,
1469
+ code: z33.ZodIssueCode.custom,
1414
1470
  message: "Invalid IANA timezone",
1415
1471
  path: ["timezone"]
1416
1472
  });
@@ -1677,8 +1733,15 @@ export {
1677
1733
  AssertionResultStatus,
1678
1734
  AssertionSchema,
1679
1735
  AssertionTypeSchema,
1736
+ BATCH_IMPORT_LIMITS,
1680
1737
  BaseEntitySchema,
1681
1738
  BaseTestSchema,
1739
+ BatchAssertionLinkSchema,
1740
+ BatchImportPayloadSchema,
1741
+ BatchImportResponseSchema,
1742
+ BatchResultItemSchema,
1743
+ BatchScenarioEntrySchema,
1744
+ BatchSummarySchema,
1682
1745
  BuildCheckTestSchema,
1683
1746
  BuildPassedAssertionSchema,
1684
1747
  BuildPassedConfigSchema,
@@ -1821,11 +1884,13 @@ export {
1821
1884
  UpdateTestScenarioInputSchema,
1822
1885
  UpdateTestSuiteInputSchema,
1823
1886
  VitestTestSchema,
1887
+ classifyAssertionRef,
1824
1888
  formatTraceEventLine,
1825
1889
  getSystemAssertion,
1826
1890
  getSystemAssertions,
1827
1891
  isSystemAssertionId,
1828
1892
  isValidSkillFolderName,
1893
+ normalizeBatchAssertionLink,
1829
1894
  normalizeModelId,
1830
1895
  parseTraceEventLine,
1831
1896
  validateAssertionConfig