@wix/evalforge-types 0.71.0 → 0.73.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.js +715 -620
- package/build/index.js.map +4 -4
- package/build/index.mjs +707 -619
- package/build/index.mjs.map +4 -4
- package/build/types/agent/adapter.d.ts +3 -1
- package/build/types/assertion/assertion.d.ts +26 -6
- package/build/types/assertion/build-passed-command.d.ts +25 -0
- package/build/types/assertion/index.d.ts +1 -0
- package/build/types/scenario/test-scenario.d.ts +64 -3
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -21,6 +21,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
21
21
|
var index_exports = {};
|
|
22
22
|
__export(index_exports, {
|
|
23
23
|
AGENT_TYPE_LABELS: () => AGENT_TYPE_LABELS,
|
|
24
|
+
ALLOWED_BUILD_COMMANDS: () => ALLOWED_BUILD_COMMANDS,
|
|
24
25
|
ALL_AVAILABLE_MODEL_IDS: () => ALL_AVAILABLE_MODEL_IDS,
|
|
25
26
|
AVAILABLE_CLAUDE_MODEL_IDS: () => AVAILABLE_CLAUDE_MODEL_IDS,
|
|
26
27
|
AVAILABLE_OPENAI_MODEL_IDS: () => AVAILABLE_OPENAI_MODEL_IDS,
|
|
@@ -54,6 +55,7 @@ __export(index_exports, {
|
|
|
54
55
|
BatchSummarySchema: () => BatchSummarySchema,
|
|
55
56
|
BuildCheckTestSchema: () => BuildCheckTestSchema,
|
|
56
57
|
BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
|
|
58
|
+
BuildPassedCommandStringSchema: () => BuildPassedCommandStringSchema,
|
|
57
59
|
BuildPassedConfigSchema: () => BuildPassedConfigSchema,
|
|
58
60
|
BulkImportResultItemSchema: () => BulkImportResultItemSchema,
|
|
59
61
|
BulkImportResultSchema: () => BulkImportResultSchema,
|
|
@@ -81,6 +83,7 @@ __export(index_exports, {
|
|
|
81
83
|
CreateTemplateInputSchema: () => CreateTemplateInputSchema,
|
|
82
84
|
CreateTestScenarioInputSchema: () => CreateTestScenarioInputSchema,
|
|
83
85
|
CreateTestSuiteInputSchema: () => CreateTestSuiteInputSchema,
|
|
86
|
+
DEFAULT_BUILD_PASSED_COMMAND: () => DEFAULT_BUILD_PASSED_COMMAND,
|
|
84
87
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT: () => DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
85
88
|
DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
|
|
86
89
|
DiffContentSchema: () => DiffContentSchema,
|
|
@@ -178,6 +181,7 @@ __export(index_exports, {
|
|
|
178
181
|
ToolTestSchema: () => ToolTestSchema,
|
|
179
182
|
ToolUseBlockSchema: () => ToolUseBlockSchema,
|
|
180
183
|
TriggerMetadataSchema: () => TriggerMetadataSchema,
|
|
184
|
+
TriggerPromptImageSchema: () => TriggerPromptImageSchema,
|
|
181
185
|
TriggerSchema: () => TriggerSchema,
|
|
182
186
|
TriggerType: () => TriggerType,
|
|
183
187
|
UpdateAgentInputSchema: () => UpdateAgentInputSchema,
|
|
@@ -197,12 +201,15 @@ __export(index_exports, {
|
|
|
197
201
|
formatTraceEventLine: () => formatTraceEventLine,
|
|
198
202
|
getSystemAssertion: () => getSystemAssertion,
|
|
199
203
|
getSystemAssertions: () => getSystemAssertions,
|
|
204
|
+
isAllowedBuildCommandString: () => isAllowedBuildCommandString,
|
|
200
205
|
isSystemAssertionId: () => isSystemAssertionId,
|
|
201
206
|
isValidSkillFolderName: () => isValidSkillFolderName,
|
|
202
207
|
normalizeBatchAssertionLink: () => normalizeBatchAssertionLink,
|
|
203
208
|
normalizeModelId: () => normalizeModelId,
|
|
209
|
+
parseBuildCommandToArgv: () => parseBuildCommandToArgv,
|
|
204
210
|
parseTraceEventLine: () => parseTraceEventLine,
|
|
205
|
-
validateAssertionConfig: () => validateAssertionConfig
|
|
211
|
+
validateAssertionConfig: () => validateAssertionConfig,
|
|
212
|
+
validateBuildPassedParamsInAssertionLinks: () => validateBuildPassedParamsInAssertionLinks
|
|
206
213
|
});
|
|
207
214
|
module.exports = __toCommonJS(index_exports);
|
|
208
215
|
|
|
@@ -792,11 +799,42 @@ var EnvironmentSchema = import_zod21.z.object({
|
|
|
792
799
|
});
|
|
793
800
|
|
|
794
801
|
// src/scenario/test-scenario.ts
|
|
795
|
-
var
|
|
802
|
+
var import_zod24 = require("zod");
|
|
796
803
|
|
|
797
804
|
// src/assertion/assertion.ts
|
|
805
|
+
var import_zod23 = require("zod");
|
|
806
|
+
|
|
807
|
+
// src/assertion/build-passed-command.ts
|
|
798
808
|
var import_zod22 = require("zod");
|
|
799
|
-
var
|
|
809
|
+
var ALLOWED_BUILD_COMMANDS = [
|
|
810
|
+
"yarn build",
|
|
811
|
+
"npm run build",
|
|
812
|
+
"pnpm run build",
|
|
813
|
+
"pnpm build"
|
|
814
|
+
];
|
|
815
|
+
var DEFAULT_BUILD_PASSED_COMMAND = "yarn build";
|
|
816
|
+
var BUILD_COMMAND_ARGV = {
|
|
817
|
+
"yarn build": ["yarn", "build"],
|
|
818
|
+
"npm run build": ["npm", "run", "build"],
|
|
819
|
+
"pnpm run build": ["pnpm", "run", "build"],
|
|
820
|
+
"pnpm build": ["pnpm", "build"]
|
|
821
|
+
};
|
|
822
|
+
function isAllowedBuildCommandString(command) {
|
|
823
|
+
const trimmed = command.trim();
|
|
824
|
+
return ALLOWED_BUILD_COMMANDS.includes(trimmed);
|
|
825
|
+
}
|
|
826
|
+
function parseBuildCommandToArgv(command) {
|
|
827
|
+
const trimmed = command.trim();
|
|
828
|
+
if (!(trimmed in BUILD_COMMAND_ARGV)) {
|
|
829
|
+
return null;
|
|
830
|
+
}
|
|
831
|
+
return BUILD_COMMAND_ARGV[trimmed];
|
|
832
|
+
}
|
|
833
|
+
var enumTuple = ALLOWED_BUILD_COMMANDS;
|
|
834
|
+
var BuildPassedCommandStringSchema = import_zod22.z.enum(enumTuple);
|
|
835
|
+
|
|
836
|
+
// src/assertion/assertion.ts
|
|
837
|
+
var AssertionTypeSchema = import_zod23.z.enum([
|
|
800
838
|
"skill_was_called",
|
|
801
839
|
"tool_called_with_param",
|
|
802
840
|
"build_passed",
|
|
@@ -805,61 +843,61 @@ var AssertionTypeSchema = import_zod22.z.enum([
|
|
|
805
843
|
"llm_judge",
|
|
806
844
|
"api_call"
|
|
807
845
|
]);
|
|
808
|
-
var AssertionParameterTypeSchema =
|
|
846
|
+
var AssertionParameterTypeSchema = import_zod23.z.enum([
|
|
809
847
|
"string",
|
|
810
848
|
"number",
|
|
811
849
|
"boolean"
|
|
812
850
|
]);
|
|
813
|
-
var AssertionParameterSchema =
|
|
851
|
+
var AssertionParameterSchema = import_zod23.z.object({
|
|
814
852
|
/** Parameter name (used as key in params object) */
|
|
815
|
-
name:
|
|
853
|
+
name: import_zod23.z.string().min(1),
|
|
816
854
|
/** Display label for the parameter */
|
|
817
|
-
label:
|
|
855
|
+
label: import_zod23.z.string().min(1),
|
|
818
856
|
/** Parameter type */
|
|
819
857
|
type: AssertionParameterTypeSchema,
|
|
820
858
|
/** Whether this parameter is required */
|
|
821
|
-
required:
|
|
859
|
+
required: import_zod23.z.boolean(),
|
|
822
860
|
/** Default value (optional, used when not provided) */
|
|
823
|
-
defaultValue:
|
|
861
|
+
defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
|
|
824
862
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
825
|
-
advanced:
|
|
863
|
+
advanced: import_zod23.z.boolean().optional()
|
|
826
864
|
});
|
|
827
|
-
var ScenarioAssertionLinkSchema =
|
|
865
|
+
var ScenarioAssertionLinkSchema = import_zod23.z.object({
|
|
828
866
|
/** ID of the system assertion (e.g., 'system:skill_was_called') */
|
|
829
|
-
assertionId:
|
|
867
|
+
assertionId: import_zod23.z.string(),
|
|
830
868
|
/** Parameter values for this assertion in this scenario */
|
|
831
|
-
params:
|
|
832
|
-
|
|
833
|
-
|
|
869
|
+
params: import_zod23.z.record(
|
|
870
|
+
import_zod23.z.string(),
|
|
871
|
+
import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
|
|
834
872
|
).optional()
|
|
835
873
|
});
|
|
836
|
-
var SkillWasCalledConfigSchema =
|
|
874
|
+
var SkillWasCalledConfigSchema = import_zod23.z.object({
|
|
837
875
|
/** Names of the skills that must have been called */
|
|
838
|
-
skillNames:
|
|
876
|
+
skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
|
|
839
877
|
});
|
|
840
|
-
var CostConfigSchema =
|
|
878
|
+
var CostConfigSchema = import_zod23.z.strictObject({
|
|
841
879
|
/** Maximum allowed cost in USD */
|
|
842
|
-
maxCostUsd:
|
|
880
|
+
maxCostUsd: import_zod23.z.number().positive()
|
|
843
881
|
});
|
|
844
|
-
var ToolCalledWithParamConfigSchema =
|
|
882
|
+
var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
|
|
845
883
|
/** Name of the tool that must have been called */
|
|
846
|
-
toolName:
|
|
884
|
+
toolName: import_zod23.z.string().min(1),
|
|
847
885
|
/** JSON string of key-value pairs for expected parameters (substring match). Optional — when omitted, only checks tool presence. */
|
|
848
|
-
expectedParams:
|
|
886
|
+
expectedParams: import_zod23.z.string().min(1).optional(),
|
|
849
887
|
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
850
|
-
requireSuccess:
|
|
888
|
+
requireSuccess: import_zod23.z.boolean().optional()
|
|
851
889
|
});
|
|
852
|
-
var BuildPassedConfigSchema =
|
|
853
|
-
/**
|
|
854
|
-
command:
|
|
890
|
+
var BuildPassedConfigSchema = import_zod23.z.strictObject({
|
|
891
|
+
/** Allowlisted command only (default at runtime: "yarn build") */
|
|
892
|
+
command: BuildPassedCommandStringSchema.optional(),
|
|
855
893
|
/** Expected exit code (default: 0) */
|
|
856
|
-
expectedExitCode:
|
|
894
|
+
expectedExitCode: import_zod23.z.number().int().optional()
|
|
857
895
|
});
|
|
858
|
-
var TimeConfigSchema =
|
|
896
|
+
var TimeConfigSchema = import_zod23.z.strictObject({
|
|
859
897
|
/** Maximum allowed duration in milliseconds */
|
|
860
|
-
maxDurationMs:
|
|
898
|
+
maxDurationMs: import_zod23.z.number().int().positive()
|
|
861
899
|
});
|
|
862
|
-
var LlmJudgeConfigSchema =
|
|
900
|
+
var LlmJudgeConfigSchema = import_zod23.z.object({
|
|
863
901
|
/**
|
|
864
902
|
* Prompt template with placeholders:
|
|
865
903
|
* - {{output}}: agent's final output
|
|
@@ -870,65 +908,65 @@ var LlmJudgeConfigSchema = import_zod22.z.object({
|
|
|
870
908
|
* - {{trace}}: step-by-step trace of tool calls
|
|
871
909
|
* - Custom parameters defined in the parameters array
|
|
872
910
|
*/
|
|
873
|
-
prompt:
|
|
911
|
+
prompt: import_zod23.z.string().min(1),
|
|
874
912
|
/** Minimum score to pass (0-10, default 7) */
|
|
875
|
-
minScore:
|
|
913
|
+
minScore: import_zod23.z.number().int().min(0).max(10).optional(),
|
|
876
914
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
877
|
-
model:
|
|
915
|
+
model: import_zod23.z.string().optional(),
|
|
878
916
|
/** Max output tokens */
|
|
879
|
-
maxTokens:
|
|
917
|
+
maxTokens: import_zod23.z.number().int().optional(),
|
|
880
918
|
/** Temperature (0-1) */
|
|
881
|
-
temperature:
|
|
919
|
+
temperature: import_zod23.z.number().min(0).max(1).optional(),
|
|
882
920
|
/** User-defined parameters for this assertion */
|
|
883
|
-
parameters:
|
|
921
|
+
parameters: import_zod23.z.array(AssertionParameterSchema).optional()
|
|
884
922
|
});
|
|
885
|
-
var ApiCallConfigSchema =
|
|
923
|
+
var ApiCallConfigSchema = import_zod23.z.strictObject({
|
|
886
924
|
/** URL to call */
|
|
887
|
-
url:
|
|
925
|
+
url: import_zod23.z.string().min(1),
|
|
888
926
|
/** HTTP method (default GET) */
|
|
889
|
-
method:
|
|
927
|
+
method: import_zod23.z.enum(["GET", "POST"]).optional(),
|
|
890
928
|
/** Request body (JSON string, for POST requests) */
|
|
891
|
-
requestBody:
|
|
929
|
+
requestBody: import_zod23.z.string().optional(),
|
|
892
930
|
/** Expected JSON response to validate against (subset match — extra fields in actual are OK) */
|
|
893
|
-
expectedResponse:
|
|
931
|
+
expectedResponse: import_zod23.z.string().min(1),
|
|
894
932
|
/** Request headers as JSON string of key-value pairs */
|
|
895
|
-
requestHeaders:
|
|
933
|
+
requestHeaders: import_zod23.z.string().optional(),
|
|
896
934
|
/** Request timeout in milliseconds (default 30000) */
|
|
897
|
-
timeoutMs:
|
|
935
|
+
timeoutMs: import_zod23.z.number().int().positive().optional()
|
|
898
936
|
});
|
|
899
937
|
var AssertionBaseFields = {
|
|
900
938
|
/** When true, the assertion's pass/fail logic is inverted (NOT operator). */
|
|
901
|
-
negate:
|
|
939
|
+
negate: import_zod23.z.boolean().optional()
|
|
902
940
|
};
|
|
903
941
|
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
904
|
-
type:
|
|
942
|
+
type: import_zod23.z.literal("skill_was_called"),
|
|
905
943
|
...AssertionBaseFields
|
|
906
944
|
});
|
|
907
945
|
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
908
|
-
type:
|
|
946
|
+
type: import_zod23.z.literal("tool_called_with_param"),
|
|
909
947
|
...AssertionBaseFields
|
|
910
948
|
});
|
|
911
949
|
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
912
|
-
type:
|
|
950
|
+
type: import_zod23.z.literal("build_passed"),
|
|
913
951
|
...AssertionBaseFields
|
|
914
952
|
});
|
|
915
953
|
var CostAssertionSchema = CostConfigSchema.extend({
|
|
916
|
-
type:
|
|
954
|
+
type: import_zod23.z.literal("cost"),
|
|
917
955
|
...AssertionBaseFields
|
|
918
956
|
});
|
|
919
957
|
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
920
|
-
type:
|
|
958
|
+
type: import_zod23.z.literal("llm_judge"),
|
|
921
959
|
...AssertionBaseFields
|
|
922
960
|
});
|
|
923
961
|
var ApiCallAssertionSchema = ApiCallConfigSchema.extend({
|
|
924
|
-
type:
|
|
962
|
+
type: import_zod23.z.literal("api_call"),
|
|
925
963
|
...AssertionBaseFields
|
|
926
964
|
});
|
|
927
965
|
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
928
|
-
type:
|
|
966
|
+
type: import_zod23.z.literal("time_limit"),
|
|
929
967
|
...AssertionBaseFields
|
|
930
968
|
});
|
|
931
|
-
var AssertionSchema =
|
|
969
|
+
var AssertionSchema = import_zod23.z.union([
|
|
932
970
|
SkillWasCalledAssertionSchema,
|
|
933
971
|
ToolCalledWithParamAssertionSchema,
|
|
934
972
|
BuildPassedAssertionSchema,
|
|
@@ -937,7 +975,7 @@ var AssertionSchema = import_zod22.z.union([
|
|
|
937
975
|
LlmJudgeAssertionSchema,
|
|
938
976
|
ApiCallAssertionSchema
|
|
939
977
|
]);
|
|
940
|
-
var AssertionConfigSchema =
|
|
978
|
+
var AssertionConfigSchema = import_zod23.z.union([
|
|
941
979
|
LlmJudgeConfigSchema,
|
|
942
980
|
// requires prompt - check first
|
|
943
981
|
SkillWasCalledConfigSchema,
|
|
@@ -952,7 +990,7 @@ var AssertionConfigSchema = import_zod22.z.union([
|
|
|
952
990
|
// requires maxCostUsd, uses strictObject
|
|
953
991
|
BuildPassedConfigSchema,
|
|
954
992
|
// all optional, uses strictObject to reject unknown keys
|
|
955
|
-
|
|
993
|
+
import_zod23.z.object({})
|
|
956
994
|
// fallback empty config
|
|
957
995
|
]);
|
|
958
996
|
function validateAssertionConfig(type, config) {
|
|
@@ -976,52 +1014,322 @@ function validateAssertionConfig(type, config) {
|
|
|
976
1014
|
}
|
|
977
1015
|
}
|
|
978
1016
|
|
|
1017
|
+
// src/assertion/system-assertions.ts
|
|
1018
|
+
var SYSTEM_ASSERTION_IDS = {
|
|
1019
|
+
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1020
|
+
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
1021
|
+
BUILD_PASSED: "system:build_passed",
|
|
1022
|
+
TIME_LIMIT: "system:time_limit",
|
|
1023
|
+
COST: "system:cost",
|
|
1024
|
+
LLM_JUDGE: "system:llm_judge",
|
|
1025
|
+
API_CALL: "system:api_call"
|
|
1026
|
+
};
|
|
1027
|
+
function isSystemAssertionId(id) {
|
|
1028
|
+
return id.startsWith("system:");
|
|
1029
|
+
}
|
|
1030
|
+
var SYSTEM_ASSERTIONS = {
|
|
1031
|
+
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1032
|
+
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1033
|
+
name: "Skill Was Called",
|
|
1034
|
+
description: "Check that one or more skills were invoked during the agent run",
|
|
1035
|
+
type: "skill_was_called",
|
|
1036
|
+
parameters: [
|
|
1037
|
+
{
|
|
1038
|
+
name: "skillNames",
|
|
1039
|
+
label: "Skills",
|
|
1040
|
+
type: "string",
|
|
1041
|
+
required: true
|
|
1042
|
+
},
|
|
1043
|
+
{
|
|
1044
|
+
name: "negate",
|
|
1045
|
+
label: "Negate (NOT operator)",
|
|
1046
|
+
type: "boolean",
|
|
1047
|
+
required: false,
|
|
1048
|
+
defaultValue: false
|
|
1049
|
+
}
|
|
1050
|
+
]
|
|
1051
|
+
},
|
|
1052
|
+
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
1053
|
+
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
1054
|
+
name: "Tool Called With Param",
|
|
1055
|
+
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
1056
|
+
type: "tool_called_with_param",
|
|
1057
|
+
parameters: [
|
|
1058
|
+
{
|
|
1059
|
+
name: "toolName",
|
|
1060
|
+
label: "Tool Name",
|
|
1061
|
+
type: "string",
|
|
1062
|
+
required: true
|
|
1063
|
+
},
|
|
1064
|
+
{
|
|
1065
|
+
name: "expectedParams",
|
|
1066
|
+
label: "Expected Parameters (JSON, substring match)",
|
|
1067
|
+
type: "string",
|
|
1068
|
+
required: false
|
|
1069
|
+
},
|
|
1070
|
+
{
|
|
1071
|
+
name: "requireSuccess",
|
|
1072
|
+
label: "Require Successful Call",
|
|
1073
|
+
type: "boolean",
|
|
1074
|
+
required: false,
|
|
1075
|
+
defaultValue: false,
|
|
1076
|
+
advanced: true
|
|
1077
|
+
},
|
|
1078
|
+
{
|
|
1079
|
+
name: "negate",
|
|
1080
|
+
label: "Negate (NOT operator)",
|
|
1081
|
+
type: "boolean",
|
|
1082
|
+
required: false,
|
|
1083
|
+
defaultValue: false
|
|
1084
|
+
}
|
|
1085
|
+
]
|
|
1086
|
+
},
|
|
1087
|
+
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1088
|
+
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1089
|
+
name: "Build Passed",
|
|
1090
|
+
description: "Run a build command and verify it exits with expected code",
|
|
1091
|
+
type: "build_passed",
|
|
1092
|
+
parameters: [
|
|
1093
|
+
{
|
|
1094
|
+
name: "command",
|
|
1095
|
+
label: "Build Command",
|
|
1096
|
+
type: "string",
|
|
1097
|
+
required: false,
|
|
1098
|
+
defaultValue: "yarn build"
|
|
1099
|
+
},
|
|
1100
|
+
{
|
|
1101
|
+
name: "expectedExitCode",
|
|
1102
|
+
label: "Expected Exit Code",
|
|
1103
|
+
type: "number",
|
|
1104
|
+
required: false,
|
|
1105
|
+
defaultValue: 0
|
|
1106
|
+
},
|
|
1107
|
+
{
|
|
1108
|
+
name: "maxBuildTime",
|
|
1109
|
+
label: "Max Build Time (ms)",
|
|
1110
|
+
type: "number",
|
|
1111
|
+
required: false,
|
|
1112
|
+
advanced: true
|
|
1113
|
+
},
|
|
1114
|
+
{
|
|
1115
|
+
name: "maxMemory",
|
|
1116
|
+
label: "Max Memory (MB)",
|
|
1117
|
+
type: "number",
|
|
1118
|
+
required: false,
|
|
1119
|
+
advanced: true
|
|
1120
|
+
}
|
|
1121
|
+
]
|
|
1122
|
+
},
|
|
1123
|
+
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
1124
|
+
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
1125
|
+
name: "Time Limit",
|
|
1126
|
+
description: "Check that the scenario completed within a maximum duration",
|
|
1127
|
+
type: "time_limit",
|
|
1128
|
+
parameters: [
|
|
1129
|
+
{
|
|
1130
|
+
name: "maxDurationMs",
|
|
1131
|
+
label: "Max Duration (ms)",
|
|
1132
|
+
type: "number",
|
|
1133
|
+
required: true,
|
|
1134
|
+
defaultValue: 3e5
|
|
1135
|
+
}
|
|
1136
|
+
]
|
|
1137
|
+
},
|
|
1138
|
+
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
1139
|
+
id: SYSTEM_ASSERTION_IDS.COST,
|
|
1140
|
+
name: "Cost",
|
|
1141
|
+
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
1142
|
+
type: "cost",
|
|
1143
|
+
parameters: [
|
|
1144
|
+
{
|
|
1145
|
+
name: "maxCostUsd",
|
|
1146
|
+
label: "Max Cost (USD)",
|
|
1147
|
+
type: "number",
|
|
1148
|
+
required: true,
|
|
1149
|
+
defaultValue: 1
|
|
1150
|
+
}
|
|
1151
|
+
]
|
|
1152
|
+
},
|
|
1153
|
+
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
1154
|
+
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
1155
|
+
name: "LLM Judge",
|
|
1156
|
+
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
1157
|
+
type: "llm_judge",
|
|
1158
|
+
parameters: [
|
|
1159
|
+
{
|
|
1160
|
+
name: "prompt",
|
|
1161
|
+
label: "Judge Prompt",
|
|
1162
|
+
type: "string",
|
|
1163
|
+
required: true,
|
|
1164
|
+
defaultValue: "Verify the output meets the acceptance criteria."
|
|
1165
|
+
},
|
|
1166
|
+
{
|
|
1167
|
+
name: "minScore",
|
|
1168
|
+
label: "Minimum Score (0-10)",
|
|
1169
|
+
type: "number",
|
|
1170
|
+
required: false,
|
|
1171
|
+
defaultValue: 7
|
|
1172
|
+
},
|
|
1173
|
+
{
|
|
1174
|
+
name: "model",
|
|
1175
|
+
label: "Model",
|
|
1176
|
+
type: "string",
|
|
1177
|
+
required: false
|
|
1178
|
+
}
|
|
1179
|
+
]
|
|
1180
|
+
},
|
|
1181
|
+
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
1182
|
+
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
1183
|
+
name: "API Call",
|
|
1184
|
+
description: "Call an API endpoint and verify the response contains expected data",
|
|
1185
|
+
type: "api_call",
|
|
1186
|
+
parameters: [
|
|
1187
|
+
{
|
|
1188
|
+
name: "url",
|
|
1189
|
+
label: "URL",
|
|
1190
|
+
type: "string",
|
|
1191
|
+
required: true
|
|
1192
|
+
},
|
|
1193
|
+
{
|
|
1194
|
+
name: "method",
|
|
1195
|
+
label: "HTTP Method",
|
|
1196
|
+
type: "string",
|
|
1197
|
+
required: false,
|
|
1198
|
+
defaultValue: "GET"
|
|
1199
|
+
},
|
|
1200
|
+
{
|
|
1201
|
+
name: "requestBody",
|
|
1202
|
+
label: "Request Body (JSON)",
|
|
1203
|
+
type: "string",
|
|
1204
|
+
required: false
|
|
1205
|
+
},
|
|
1206
|
+
{
|
|
1207
|
+
name: "expectedResponse",
|
|
1208
|
+
label: "Expected Response (JSON)",
|
|
1209
|
+
type: "string",
|
|
1210
|
+
required: true
|
|
1211
|
+
},
|
|
1212
|
+
{
|
|
1213
|
+
name: "requestHeaders",
|
|
1214
|
+
label: "Headers (JSON)",
|
|
1215
|
+
type: "string",
|
|
1216
|
+
required: false,
|
|
1217
|
+
advanced: true
|
|
1218
|
+
},
|
|
1219
|
+
{
|
|
1220
|
+
name: "timeoutMs",
|
|
1221
|
+
label: "Timeout (ms)",
|
|
1222
|
+
type: "number",
|
|
1223
|
+
required: false,
|
|
1224
|
+
defaultValue: 3e4,
|
|
1225
|
+
advanced: true
|
|
1226
|
+
}
|
|
1227
|
+
]
|
|
1228
|
+
}
|
|
1229
|
+
};
|
|
1230
|
+
function getSystemAssertions() {
|
|
1231
|
+
return Object.values(SYSTEM_ASSERTIONS);
|
|
1232
|
+
}
|
|
1233
|
+
function getSystemAssertion(id) {
|
|
1234
|
+
return SYSTEM_ASSERTIONS[id];
|
|
1235
|
+
}
|
|
1236
|
+
|
|
979
1237
|
// src/scenario/test-scenario.ts
|
|
980
|
-
var
|
|
1238
|
+
var MAX_IMAGE_BASE64_LENGTH = 4 * Math.ceil(2 * 1024 * 1024 / 3);
|
|
1239
|
+
var TriggerPromptImageSchema = import_zod24.z.object({
|
|
1240
|
+
/** Base64-encoded image data (no data URL prefix) */
|
|
1241
|
+
base64: import_zod24.z.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
|
|
1242
|
+
/** MIME type of the image */
|
|
1243
|
+
mediaType: import_zod24.z.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
|
|
1244
|
+
/** Original filename of the image */
|
|
1245
|
+
name: import_zod24.z.string()
|
|
1246
|
+
});
|
|
1247
|
+
var ExpectedFileSchema = import_zod24.z.object({
|
|
981
1248
|
/** Relative path where the file should be created */
|
|
982
|
-
path:
|
|
1249
|
+
path: import_zod24.z.string(),
|
|
983
1250
|
/** Optional expected content */
|
|
984
|
-
content:
|
|
1251
|
+
content: import_zod24.z.string().optional()
|
|
985
1252
|
});
|
|
986
1253
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
987
1254
|
/** The prompt sent to the agent to trigger the task */
|
|
988
|
-
triggerPrompt:
|
|
1255
|
+
triggerPrompt: import_zod24.z.string().min(10),
|
|
989
1256
|
/** ID of the template to use for this scenario (null = no template) */
|
|
990
|
-
templateId:
|
|
1257
|
+
templateId: import_zod24.z.string().nullish(),
|
|
991
1258
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
992
|
-
assertions:
|
|
1259
|
+
assertions: import_zod24.z.array(AssertionSchema).optional(),
|
|
993
1260
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
994
|
-
assertionIds:
|
|
1261
|
+
assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
|
|
995
1262
|
/** Linked assertions with per-scenario parameter values */
|
|
996
|
-
assertionLinks:
|
|
1263
|
+
assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional(),
|
|
997
1264
|
/** Tags for categorisation and filtering */
|
|
998
|
-
tags:
|
|
999
|
-
|
|
1000
|
-
|
|
1265
|
+
tags: import_zod24.z.array(import_zod24.z.string()).optional(),
|
|
1266
|
+
/** Base64-encoded images attached to the trigger prompt (max 3) */
|
|
1267
|
+
triggerPromptImages: import_zod24.z.array(TriggerPromptImageSchema).max(3).optional()
|
|
1268
|
+
});
|
|
1269
|
+
function validateBuildPassedParamsInAssertionLinks(links, ctx) {
|
|
1270
|
+
if (!links) return;
|
|
1271
|
+
for (let i = 0; i < links.length; i++) {
|
|
1272
|
+
const link = links[i];
|
|
1273
|
+
if (link.assertionId !== SYSTEM_ASSERTION_IDS.BUILD_PASSED) continue;
|
|
1274
|
+
const cmd = link.params?.command;
|
|
1275
|
+
if (cmd === void 0 || cmd === null) continue;
|
|
1276
|
+
if (typeof cmd !== "string") {
|
|
1277
|
+
ctx.addIssue({
|
|
1278
|
+
code: import_zod24.z.ZodIssueCode.custom,
|
|
1279
|
+
message: "build_passed command must be a string",
|
|
1280
|
+
path: ["assertionLinks", i, "params", "command"]
|
|
1281
|
+
});
|
|
1282
|
+
continue;
|
|
1283
|
+
}
|
|
1284
|
+
if (!isAllowedBuildCommandString(cmd)) {
|
|
1285
|
+
ctx.addIssue({
|
|
1286
|
+
code: import_zod24.z.ZodIssueCode.custom,
|
|
1287
|
+
message: "Invalid build_passed command. Allowed: yarn build, npm run build, pnpm run build, pnpm build",
|
|
1288
|
+
path: ["assertionLinks", i, "params", "command"]
|
|
1289
|
+
});
|
|
1290
|
+
}
|
|
1291
|
+
}
|
|
1292
|
+
}
|
|
1293
|
+
var TestScenarioCreateBaseSchema = TestScenarioSchema.omit({
|
|
1001
1294
|
id: true,
|
|
1002
1295
|
createdAt: true,
|
|
1003
1296
|
updatedAt: true,
|
|
1004
1297
|
deleted: true
|
|
1005
1298
|
});
|
|
1006
|
-
var
|
|
1299
|
+
var CreateTestScenarioInputSchema = TestScenarioCreateBaseSchema.superRefine((data, ctx) => {
|
|
1300
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1301
|
+
});
|
|
1302
|
+
var UpdateTestScenarioInputSchema = TestScenarioCreateBaseSchema.partial().superRefine((data, ctx) => {
|
|
1303
|
+
if (data.assertionLinks !== void 0) {
|
|
1304
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1305
|
+
}
|
|
1306
|
+
});
|
|
1007
1307
|
|
|
1008
1308
|
// src/scenario/batch-import.ts
|
|
1009
|
-
var
|
|
1309
|
+
var import_zod25 = require("zod");
|
|
1010
1310
|
var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
|
1011
|
-
var BatchAssertionLinkSchema =
|
|
1012
|
-
|
|
1311
|
+
var BatchAssertionLinkSchema = import_zod25.z.union([
|
|
1312
|
+
import_zod25.z.string().min(1),
|
|
1013
1313
|
ScenarioAssertionLinkSchema
|
|
1014
1314
|
]);
|
|
1015
|
-
var BatchScenarioEntrySchema =
|
|
1016
|
-
name:
|
|
1017
|
-
description:
|
|
1018
|
-
triggerPrompt:
|
|
1019
|
-
templateId:
|
|
1020
|
-
tags:
|
|
1021
|
-
assertionLinks:
|
|
1315
|
+
var BatchScenarioEntrySchema = import_zod25.z.object({
|
|
1316
|
+
name: import_zod25.z.string().min(1, "name: Required"),
|
|
1317
|
+
description: import_zod25.z.string().optional().default(""),
|
|
1318
|
+
triggerPrompt: import_zod25.z.string().min(10, "triggerPrompt: Must be at least 10 characters"),
|
|
1319
|
+
templateId: import_zod25.z.string().nullish(),
|
|
1320
|
+
tags: import_zod25.z.array(import_zod25.z.string()).optional(),
|
|
1321
|
+
assertionLinks: import_zod25.z.array(BatchAssertionLinkSchema).optional()
|
|
1322
|
+
}).superRefine((data, ctx) => {
|
|
1323
|
+
if (!data.assertionLinks) return;
|
|
1324
|
+
const objectLinks = data.assertionLinks.filter(
|
|
1325
|
+
(link) => typeof link !== "string"
|
|
1326
|
+
);
|
|
1327
|
+
if (objectLinks.length > 0) {
|
|
1328
|
+
validateBuildPassedParamsInAssertionLinks(objectLinks, ctx);
|
|
1329
|
+
}
|
|
1022
1330
|
});
|
|
1023
|
-
var BatchImportPayloadSchema =
|
|
1024
|
-
scenarios:
|
|
1331
|
+
var BatchImportPayloadSchema = import_zod25.z.object({
|
|
1332
|
+
scenarios: import_zod25.z.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
|
|
1025
1333
|
});
|
|
1026
1334
|
var BATCH_IMPORT_LIMITS = {
|
|
1027
1335
|
MAX_SCENARIOS: 100,
|
|
@@ -1043,29 +1351,29 @@ function normalizeBatchAssertionLink(link) {
|
|
|
1043
1351
|
}
|
|
1044
1352
|
return link;
|
|
1045
1353
|
}
|
|
1046
|
-
var BatchResultItemSchema =
|
|
1047
|
-
index:
|
|
1048
|
-
name:
|
|
1049
|
-
status:
|
|
1050
|
-
id:
|
|
1051
|
-
errors:
|
|
1052
|
-
});
|
|
1053
|
-
var BatchSummarySchema =
|
|
1054
|
-
total:
|
|
1055
|
-
valid:
|
|
1056
|
-
invalid:
|
|
1057
|
-
created:
|
|
1058
|
-
});
|
|
1059
|
-
var BatchImportResponseSchema =
|
|
1354
|
+
var BatchResultItemSchema = import_zod25.z.object({
|
|
1355
|
+
index: import_zod25.z.number(),
|
|
1356
|
+
name: import_zod25.z.string(),
|
|
1357
|
+
status: import_zod25.z.enum(["valid", "invalid"]),
|
|
1358
|
+
id: import_zod25.z.string().nullable().optional(),
|
|
1359
|
+
errors: import_zod25.z.array(import_zod25.z.string()).optional()
|
|
1360
|
+
});
|
|
1361
|
+
var BatchSummarySchema = import_zod25.z.object({
|
|
1362
|
+
total: import_zod25.z.number(),
|
|
1363
|
+
valid: import_zod25.z.number(),
|
|
1364
|
+
invalid: import_zod25.z.number(),
|
|
1365
|
+
created: import_zod25.z.number()
|
|
1366
|
+
});
|
|
1367
|
+
var BatchImportResponseSchema = import_zod25.z.object({
|
|
1060
1368
|
summary: BatchSummarySchema,
|
|
1061
|
-
results:
|
|
1369
|
+
results: import_zod25.z.array(BatchResultItemSchema)
|
|
1062
1370
|
});
|
|
1063
1371
|
|
|
1064
1372
|
// src/suite/test-suite.ts
|
|
1065
|
-
var
|
|
1373
|
+
var import_zod26 = require("zod");
|
|
1066
1374
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1067
1375
|
/** IDs of test scenarios in this suite */
|
|
1068
|
-
scenarioIds:
|
|
1376
|
+
scenarioIds: import_zod26.z.array(import_zod26.z.string())
|
|
1069
1377
|
});
|
|
1070
1378
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1071
1379
|
id: true,
|
|
@@ -1076,21 +1384,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1076
1384
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1077
1385
|
|
|
1078
1386
|
// src/evaluation/metrics.ts
|
|
1079
|
-
var
|
|
1080
|
-
var TokenUsageSchema =
|
|
1081
|
-
prompt:
|
|
1082
|
-
completion:
|
|
1083
|
-
total:
|
|
1084
|
-
});
|
|
1085
|
-
var EvalMetricsSchema =
|
|
1086
|
-
totalAssertions:
|
|
1087
|
-
passed:
|
|
1088
|
-
failed:
|
|
1089
|
-
skipped:
|
|
1090
|
-
errors:
|
|
1091
|
-
passRate:
|
|
1092
|
-
avgDuration:
|
|
1093
|
-
totalDuration:
|
|
1387
|
+
var import_zod27 = require("zod");
|
|
1388
|
+
var TokenUsageSchema = import_zod27.z.object({
|
|
1389
|
+
prompt: import_zod27.z.number(),
|
|
1390
|
+
completion: import_zod27.z.number(),
|
|
1391
|
+
total: import_zod27.z.number()
|
|
1392
|
+
});
|
|
1393
|
+
var EvalMetricsSchema = import_zod27.z.object({
|
|
1394
|
+
totalAssertions: import_zod27.z.number(),
|
|
1395
|
+
passed: import_zod27.z.number(),
|
|
1396
|
+
failed: import_zod27.z.number(),
|
|
1397
|
+
skipped: import_zod27.z.number(),
|
|
1398
|
+
errors: import_zod27.z.number(),
|
|
1399
|
+
passRate: import_zod27.z.number(),
|
|
1400
|
+
avgDuration: import_zod27.z.number(),
|
|
1401
|
+
totalDuration: import_zod27.z.number()
|
|
1094
1402
|
});
|
|
1095
1403
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1096
1404
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1100,7 +1408,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1100
1408
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1101
1409
|
return EvalStatus2;
|
|
1102
1410
|
})(EvalStatus || {});
|
|
1103
|
-
var EvalStatusSchema =
|
|
1411
|
+
var EvalStatusSchema = import_zod27.z.enum(EvalStatus);
|
|
1104
1412
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1105
1413
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1106
1414
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1108,54 +1416,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1108
1416
|
LLMStepType2["THINKING"] = "thinking";
|
|
1109
1417
|
return LLMStepType2;
|
|
1110
1418
|
})(LLMStepType || {});
|
|
1111
|
-
var LLMTraceStepSchema =
|
|
1112
|
-
id:
|
|
1113
|
-
stepNumber:
|
|
1114
|
-
type:
|
|
1115
|
-
model:
|
|
1116
|
-
provider:
|
|
1117
|
-
startedAt:
|
|
1118
|
-
durationMs:
|
|
1419
|
+
var LLMTraceStepSchema = import_zod27.z.object({
|
|
1420
|
+
id: import_zod27.z.string(),
|
|
1421
|
+
stepNumber: import_zod27.z.number(),
|
|
1422
|
+
type: import_zod27.z.enum(LLMStepType),
|
|
1423
|
+
model: import_zod27.z.string(),
|
|
1424
|
+
provider: import_zod27.z.string(),
|
|
1425
|
+
startedAt: import_zod27.z.string(),
|
|
1426
|
+
durationMs: import_zod27.z.number(),
|
|
1119
1427
|
tokenUsage: TokenUsageSchema,
|
|
1120
|
-
costUsd:
|
|
1121
|
-
toolName:
|
|
1122
|
-
toolArguments:
|
|
1123
|
-
inputPreview:
|
|
1124
|
-
outputPreview:
|
|
1125
|
-
success:
|
|
1126
|
-
error:
|
|
1127
|
-
turnIndex:
|
|
1128
|
-
});
|
|
1129
|
-
var LLMBreakdownStatsSchema =
|
|
1130
|
-
count:
|
|
1131
|
-
durationMs:
|
|
1132
|
-
tokens:
|
|
1133
|
-
costUsd:
|
|
1134
|
-
});
|
|
1135
|
-
var LLMTraceSummarySchema =
|
|
1136
|
-
totalSteps:
|
|
1137
|
-
totalTurns:
|
|
1138
|
-
totalDurationMs:
|
|
1428
|
+
costUsd: import_zod27.z.number(),
|
|
1429
|
+
toolName: import_zod27.z.string().optional(),
|
|
1430
|
+
toolArguments: import_zod27.z.string().optional(),
|
|
1431
|
+
inputPreview: import_zod27.z.string().optional(),
|
|
1432
|
+
outputPreview: import_zod27.z.string().optional(),
|
|
1433
|
+
success: import_zod27.z.boolean(),
|
|
1434
|
+
error: import_zod27.z.string().optional(),
|
|
1435
|
+
turnIndex: import_zod27.z.number().optional()
|
|
1436
|
+
});
|
|
1437
|
+
var LLMBreakdownStatsSchema = import_zod27.z.object({
|
|
1438
|
+
count: import_zod27.z.number(),
|
|
1439
|
+
durationMs: import_zod27.z.number(),
|
|
1440
|
+
tokens: import_zod27.z.number(),
|
|
1441
|
+
costUsd: import_zod27.z.number()
|
|
1442
|
+
});
|
|
1443
|
+
var LLMTraceSummarySchema = import_zod27.z.object({
|
|
1444
|
+
totalSteps: import_zod27.z.number(),
|
|
1445
|
+
totalTurns: import_zod27.z.number().optional(),
|
|
1446
|
+
totalDurationMs: import_zod27.z.number(),
|
|
1139
1447
|
totalTokens: TokenUsageSchema,
|
|
1140
|
-
totalCostUsd:
|
|
1141
|
-
stepTypeBreakdown:
|
|
1142
|
-
modelBreakdown:
|
|
1143
|
-
modelsUsed:
|
|
1144
|
-
});
|
|
1145
|
-
var LLMTraceSchema =
|
|
1146
|
-
id:
|
|
1147
|
-
steps:
|
|
1448
|
+
totalCostUsd: import_zod27.z.number(),
|
|
1449
|
+
stepTypeBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
1450
|
+
modelBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema),
|
|
1451
|
+
modelsUsed: import_zod27.z.array(import_zod27.z.string())
|
|
1452
|
+
});
|
|
1453
|
+
var LLMTraceSchema = import_zod27.z.object({
|
|
1454
|
+
id: import_zod27.z.string(),
|
|
1455
|
+
steps: import_zod27.z.array(LLMTraceStepSchema),
|
|
1148
1456
|
summary: LLMTraceSummarySchema
|
|
1149
1457
|
});
|
|
1150
1458
|
|
|
1151
1459
|
// src/evaluation/eval-result.ts
|
|
1152
|
-
var
|
|
1460
|
+
var import_zod31 = require("zod");
|
|
1153
1461
|
|
|
1154
1462
|
// src/evaluation/eval-run.ts
|
|
1155
|
-
var
|
|
1463
|
+
var import_zod29 = require("zod");
|
|
1156
1464
|
|
|
1157
1465
|
// src/evaluation/live-trace.ts
|
|
1158
|
-
var
|
|
1466
|
+
var import_zod28 = require("zod");
|
|
1159
1467
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1160
1468
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1161
1469
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1169,37 +1477,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1169
1477
|
LiveTraceEventType2["USER"] = "user";
|
|
1170
1478
|
return LiveTraceEventType2;
|
|
1171
1479
|
})(LiveTraceEventType || {});
|
|
1172
|
-
var LiveTraceEventSchema =
|
|
1480
|
+
var LiveTraceEventSchema = import_zod28.z.object({
|
|
1173
1481
|
/** The evaluation run ID */
|
|
1174
|
-
evalRunId:
|
|
1482
|
+
evalRunId: import_zod28.z.string(),
|
|
1175
1483
|
/** The scenario ID being executed */
|
|
1176
|
-
scenarioId:
|
|
1484
|
+
scenarioId: import_zod28.z.string(),
|
|
1177
1485
|
/** The scenario name for display */
|
|
1178
|
-
scenarioName:
|
|
1486
|
+
scenarioName: import_zod28.z.string(),
|
|
1179
1487
|
/** The target ID (skill, agent, etc.) */
|
|
1180
|
-
targetId:
|
|
1488
|
+
targetId: import_zod28.z.string(),
|
|
1181
1489
|
/** The target name for display */
|
|
1182
|
-
targetName:
|
|
1490
|
+
targetName: import_zod28.z.string(),
|
|
1183
1491
|
/** Step number in the current scenario execution */
|
|
1184
|
-
stepNumber:
|
|
1492
|
+
stepNumber: import_zod28.z.number(),
|
|
1185
1493
|
/** Type of trace event */
|
|
1186
|
-
type:
|
|
1494
|
+
type: import_zod28.z.enum(LiveTraceEventType),
|
|
1187
1495
|
/** Tool name if this is a tool_use event */
|
|
1188
|
-
toolName:
|
|
1496
|
+
toolName: import_zod28.z.string().optional(),
|
|
1189
1497
|
/** Tool arguments preview (truncated JSON) */
|
|
1190
|
-
toolArgs:
|
|
1498
|
+
toolArgs: import_zod28.z.string().optional(),
|
|
1191
1499
|
/** Output preview (truncated text) */
|
|
1192
|
-
outputPreview:
|
|
1500
|
+
outputPreview: import_zod28.z.string().optional(),
|
|
1193
1501
|
/** File path for file operations */
|
|
1194
|
-
filePath:
|
|
1502
|
+
filePath: import_zod28.z.string().optional(),
|
|
1195
1503
|
/** Elapsed time in milliseconds for progress events */
|
|
1196
|
-
elapsedMs:
|
|
1504
|
+
elapsedMs: import_zod28.z.number().optional(),
|
|
1197
1505
|
/** Thinking/reasoning text from Claude */
|
|
1198
|
-
thinking:
|
|
1506
|
+
thinking: import_zod28.z.string().optional(),
|
|
1199
1507
|
/** Timestamp when this event occurred */
|
|
1200
|
-
timestamp:
|
|
1508
|
+
timestamp: import_zod28.z.string(),
|
|
1201
1509
|
/** Whether this is the final event for this scenario */
|
|
1202
|
-
isComplete:
|
|
1510
|
+
isComplete: import_zod28.z.boolean()
|
|
1203
1511
|
});
|
|
1204
1512
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1205
1513
|
function parseTraceEventLine(line) {
|
|
@@ -1228,40 +1536,40 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1228
1536
|
TriggerType2["SCHEDULED"] = "SCHEDULED";
|
|
1229
1537
|
return TriggerType2;
|
|
1230
1538
|
})(TriggerType || {});
|
|
1231
|
-
var TriggerMetadataSchema =
|
|
1232
|
-
version:
|
|
1233
|
-
resourceUpdated:
|
|
1234
|
-
scheduleId:
|
|
1539
|
+
var TriggerMetadataSchema = import_zod29.z.object({
|
|
1540
|
+
version: import_zod29.z.string().optional(),
|
|
1541
|
+
resourceUpdated: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1542
|
+
scheduleId: import_zod29.z.string().optional()
|
|
1235
1543
|
});
|
|
1236
|
-
var TriggerSchema =
|
|
1237
|
-
id:
|
|
1544
|
+
var TriggerSchema = import_zod29.z.object({
|
|
1545
|
+
id: import_zod29.z.string(),
|
|
1238
1546
|
metadata: TriggerMetadataSchema.optional(),
|
|
1239
|
-
type:
|
|
1547
|
+
type: import_zod29.z.nativeEnum(TriggerType)
|
|
1240
1548
|
});
|
|
1241
|
-
var DiffLineTypeSchema =
|
|
1242
|
-
var DiffLineSchema =
|
|
1549
|
+
var DiffLineTypeSchema = import_zod29.z.enum(["added", "removed", "unchanged"]);
|
|
1550
|
+
var DiffLineSchema = import_zod29.z.object({
|
|
1243
1551
|
type: DiffLineTypeSchema,
|
|
1244
|
-
content:
|
|
1245
|
-
lineNumber:
|
|
1246
|
-
});
|
|
1247
|
-
var DiffContentSchema =
|
|
1248
|
-
path:
|
|
1249
|
-
expected:
|
|
1250
|
-
actual:
|
|
1251
|
-
diffLines:
|
|
1252
|
-
renamedFrom:
|
|
1552
|
+
content: import_zod29.z.string(),
|
|
1553
|
+
lineNumber: import_zod29.z.number()
|
|
1554
|
+
});
|
|
1555
|
+
var DiffContentSchema = import_zod29.z.object({
|
|
1556
|
+
path: import_zod29.z.string(),
|
|
1557
|
+
expected: import_zod29.z.string(),
|
|
1558
|
+
actual: import_zod29.z.string(),
|
|
1559
|
+
diffLines: import_zod29.z.array(DiffLineSchema),
|
|
1560
|
+
renamedFrom: import_zod29.z.string().optional(),
|
|
1253
1561
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1254
|
-
isInfrastructure:
|
|
1562
|
+
isInfrastructure: import_zod29.z.boolean().optional()
|
|
1255
1563
|
});
|
|
1256
|
-
var CommandExecutionSchema =
|
|
1257
|
-
command:
|
|
1258
|
-
exitCode:
|
|
1259
|
-
output:
|
|
1260
|
-
duration:
|
|
1564
|
+
var CommandExecutionSchema = import_zod29.z.object({
|
|
1565
|
+
command: import_zod29.z.string(),
|
|
1566
|
+
exitCode: import_zod29.z.number(),
|
|
1567
|
+
output: import_zod29.z.string().optional(),
|
|
1568
|
+
duration: import_zod29.z.number()
|
|
1261
1569
|
});
|
|
1262
|
-
var FileModificationSchema =
|
|
1263
|
-
path:
|
|
1264
|
-
action:
|
|
1570
|
+
var FileModificationSchema = import_zod29.z.object({
|
|
1571
|
+
path: import_zod29.z.string(),
|
|
1572
|
+
action: import_zod29.z.enum(["created", "modified", "deleted"])
|
|
1265
1573
|
});
|
|
1266
1574
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1267
1575
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1269,62 +1577,62 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1269
1577
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1270
1578
|
return TemplateFileStatus2;
|
|
1271
1579
|
})(TemplateFileStatus || {});
|
|
1272
|
-
var TemplateFileSchema =
|
|
1580
|
+
var TemplateFileSchema = import_zod29.z.object({
|
|
1273
1581
|
/** Relative path within the template */
|
|
1274
|
-
path:
|
|
1582
|
+
path: import_zod29.z.string(),
|
|
1275
1583
|
/** Full file content after execution */
|
|
1276
|
-
content:
|
|
1584
|
+
content: import_zod29.z.string(),
|
|
1277
1585
|
/** File status (new, modified, unchanged) */
|
|
1278
|
-
status:
|
|
1586
|
+
status: import_zod29.z.enum(["new", "modified", "unchanged"]),
|
|
1279
1587
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1280
|
-
isInfrastructure:
|
|
1588
|
+
isInfrastructure: import_zod29.z.boolean().optional()
|
|
1281
1589
|
});
|
|
1282
|
-
var ApiCallSchema =
|
|
1283
|
-
endpoint:
|
|
1284
|
-
tokensUsed:
|
|
1285
|
-
duration:
|
|
1590
|
+
var ApiCallSchema = import_zod29.z.object({
|
|
1591
|
+
endpoint: import_zod29.z.string(),
|
|
1592
|
+
tokensUsed: import_zod29.z.number(),
|
|
1593
|
+
duration: import_zod29.z.number()
|
|
1286
1594
|
});
|
|
1287
|
-
var ExecutionTraceSchema =
|
|
1288
|
-
commands:
|
|
1289
|
-
filesModified:
|
|
1290
|
-
apiCalls:
|
|
1291
|
-
totalDuration:
|
|
1595
|
+
var ExecutionTraceSchema = import_zod29.z.object({
|
|
1596
|
+
commands: import_zod29.z.array(CommandExecutionSchema),
|
|
1597
|
+
filesModified: import_zod29.z.array(FileModificationSchema),
|
|
1598
|
+
apiCalls: import_zod29.z.array(ApiCallSchema),
|
|
1599
|
+
totalDuration: import_zod29.z.number()
|
|
1292
1600
|
});
|
|
1293
|
-
var RunAnalysisFindingSchema =
|
|
1294
|
-
category:
|
|
1601
|
+
var RunAnalysisFindingSchema = import_zod29.z.object({
|
|
1602
|
+
category: import_zod29.z.enum([
|
|
1295
1603
|
"failure_pattern",
|
|
1296
1604
|
"cost_waste",
|
|
1297
1605
|
"flakiness",
|
|
1298
1606
|
"inefficiency",
|
|
1299
1607
|
"positive"
|
|
1300
1608
|
]),
|
|
1301
|
-
severity:
|
|
1302
|
-
description:
|
|
1303
|
-
affectedScenarios:
|
|
1304
|
-
recommendation:
|
|
1609
|
+
severity: import_zod29.z.enum(["high", "medium", "low"]),
|
|
1610
|
+
description: import_zod29.z.string(),
|
|
1611
|
+
affectedScenarios: import_zod29.z.array(import_zod29.z.string()),
|
|
1612
|
+
recommendation: import_zod29.z.string().optional()
|
|
1305
1613
|
});
|
|
1306
|
-
var RunAnalysisSchema =
|
|
1307
|
-
generatedAt:
|
|
1308
|
-
summary:
|
|
1309
|
-
findings:
|
|
1614
|
+
var RunAnalysisSchema = import_zod29.z.object({
|
|
1615
|
+
generatedAt: import_zod29.z.string(),
|
|
1616
|
+
summary: import_zod29.z.string(),
|
|
1617
|
+
findings: import_zod29.z.array(RunAnalysisFindingSchema)
|
|
1310
1618
|
});
|
|
1311
1619
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1312
1620
|
/** Agent ID for this run */
|
|
1313
|
-
agentId:
|
|
1621
|
+
agentId: import_zod29.z.string().optional(),
|
|
1314
1622
|
/** Preset ID that originated this run (optional) */
|
|
1315
|
-
presetId:
|
|
1623
|
+
presetId: import_zod29.z.string().optional(),
|
|
1316
1624
|
/** Skill IDs for this run */
|
|
1317
|
-
skillIds:
|
|
1625
|
+
skillIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1318
1626
|
/** Map of skillId to skillVersionId for this run */
|
|
1319
|
-
skillVersions:
|
|
1627
|
+
skillVersions: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.string()).optional(),
|
|
1320
1628
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
1321
|
-
scenarioIds:
|
|
1629
|
+
scenarioIds: import_zod29.z.array(import_zod29.z.string()),
|
|
1322
1630
|
/** Current status */
|
|
1323
1631
|
status: EvalStatusSchema,
|
|
1324
1632
|
/** Progress percentage (0-100) */
|
|
1325
|
-
progress:
|
|
1633
|
+
progress: import_zod29.z.number(),
|
|
1326
1634
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1327
|
-
results:
|
|
1635
|
+
results: import_zod29.z.array(import_zod29.z.lazy(() => EvalRunResultSchema)),
|
|
1328
1636
|
/** Aggregated metrics across all results */
|
|
1329
1637
|
aggregateMetrics: EvalMetricsSchema,
|
|
1330
1638
|
/** Aggregated LLM trace summary */
|
|
@@ -1332,41 +1640,41 @@ var EvalRunSchema = TenantEntitySchema.extend({
|
|
|
1332
1640
|
/** What triggered this run */
|
|
1333
1641
|
trigger: TriggerSchema.optional(),
|
|
1334
1642
|
/** When the run started (set when evaluation is triggered) */
|
|
1335
|
-
startedAt:
|
|
1643
|
+
startedAt: import_zod29.z.string().optional(),
|
|
1336
1644
|
/** When the run completed */
|
|
1337
|
-
completedAt:
|
|
1645
|
+
completedAt: import_zod29.z.string().optional(),
|
|
1338
1646
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1339
|
-
liveTraceEvents:
|
|
1647
|
+
liveTraceEvents: import_zod29.z.array(LiveTraceEventSchema).optional(),
|
|
1340
1648
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1341
|
-
jobId:
|
|
1649
|
+
jobId: import_zod29.z.string().optional(),
|
|
1342
1650
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1343
|
-
jobStatus:
|
|
1651
|
+
jobStatus: import_zod29.z.string().optional(),
|
|
1344
1652
|
/** Remote job error message if the job failed */
|
|
1345
|
-
jobError:
|
|
1653
|
+
jobError: import_zod29.z.string().optional(),
|
|
1346
1654
|
/** Timestamp of the last job status check */
|
|
1347
|
-
jobStatusCheckedAt:
|
|
1655
|
+
jobStatusCheckedAt: import_zod29.z.string().optional(),
|
|
1348
1656
|
/** MCP server IDs to enable for this run (optional) */
|
|
1349
|
-
mcpIds:
|
|
1657
|
+
mcpIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1350
1658
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
1351
|
-
subAgentIds:
|
|
1659
|
+
subAgentIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1352
1660
|
/** Rule IDs to enable for this run (optional) */
|
|
1353
|
-
ruleIds:
|
|
1661
|
+
ruleIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1354
1662
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
1355
|
-
tags:
|
|
1663
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1356
1664
|
/** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
|
|
1357
|
-
runsPerScenario:
|
|
1665
|
+
runsPerScenario: import_zod29.z.number().int().min(1).max(20).optional(),
|
|
1358
1666
|
/** Snapshot of agent configuration captured at run creation time */
|
|
1359
|
-
agentSnapshot:
|
|
1360
|
-
name:
|
|
1667
|
+
agentSnapshot: import_zod29.z.object({
|
|
1668
|
+
name: import_zod29.z.string().optional(),
|
|
1361
1669
|
agentType: AgentTypeSchema.optional(),
|
|
1362
1670
|
runCommand: AgentRunCommandSchema.optional(),
|
|
1363
|
-
systemPrompt:
|
|
1671
|
+
systemPrompt: import_zod29.z.string().nullable().optional(),
|
|
1364
1672
|
modelConfig: ModelConfigSchema.optional()
|
|
1365
1673
|
}).optional(),
|
|
1366
1674
|
/** UUID linking all runs in a comparison group */
|
|
1367
|
-
comparisonGroupId:
|
|
1675
|
+
comparisonGroupId: import_zod29.z.string().optional(),
|
|
1368
1676
|
/** Human-readable label for this variant (e.g., "MCP: Wix Stores") */
|
|
1369
|
-
comparisonLabel:
|
|
1677
|
+
comparisonLabel: import_zod29.z.string().optional(),
|
|
1370
1678
|
/** LLM-generated analysis of the completed run */
|
|
1371
1679
|
runAnalysis: RunAnalysisSchema.optional()
|
|
1372
1680
|
});
|
|
@@ -1384,60 +1692,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1384
1692
|
agentSnapshot: true
|
|
1385
1693
|
}).extend({
|
|
1386
1694
|
/** Optional on input — backend resolves from tags when not provided */
|
|
1387
|
-
scenarioIds:
|
|
1695
|
+
scenarioIds: import_zod29.z.array(import_zod29.z.string()).optional()
|
|
1388
1696
|
}).refine(
|
|
1389
1697
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
1390
1698
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
1391
1699
|
);
|
|
1392
|
-
var EvaluationProgressSchema =
|
|
1393
|
-
runId:
|
|
1394
|
-
targetId:
|
|
1395
|
-
totalScenarios:
|
|
1396
|
-
completedScenarios:
|
|
1397
|
-
scenarioProgress:
|
|
1398
|
-
|
|
1399
|
-
scenarioId:
|
|
1400
|
-
currentStep:
|
|
1401
|
-
error:
|
|
1700
|
+
var EvaluationProgressSchema = import_zod29.z.object({
|
|
1701
|
+
runId: import_zod29.z.string(),
|
|
1702
|
+
targetId: import_zod29.z.string(),
|
|
1703
|
+
totalScenarios: import_zod29.z.number(),
|
|
1704
|
+
completedScenarios: import_zod29.z.number(),
|
|
1705
|
+
scenarioProgress: import_zod29.z.array(
|
|
1706
|
+
import_zod29.z.object({
|
|
1707
|
+
scenarioId: import_zod29.z.string(),
|
|
1708
|
+
currentStep: import_zod29.z.string(),
|
|
1709
|
+
error: import_zod29.z.string().optional()
|
|
1402
1710
|
})
|
|
1403
1711
|
),
|
|
1404
|
-
createdAt:
|
|
1405
|
-
});
|
|
1406
|
-
var EvaluationLogSchema =
|
|
1407
|
-
runId:
|
|
1408
|
-
scenarioId:
|
|
1409
|
-
log:
|
|
1410
|
-
level:
|
|
1411
|
-
message:
|
|
1412
|
-
args:
|
|
1413
|
-
error:
|
|
1712
|
+
createdAt: import_zod29.z.number()
|
|
1713
|
+
});
|
|
1714
|
+
var EvaluationLogSchema = import_zod29.z.object({
|
|
1715
|
+
runId: import_zod29.z.string(),
|
|
1716
|
+
scenarioId: import_zod29.z.string(),
|
|
1717
|
+
log: import_zod29.z.object({
|
|
1718
|
+
level: import_zod29.z.enum(["info", "error", "debug"]),
|
|
1719
|
+
message: import_zod29.z.string().optional(),
|
|
1720
|
+
args: import_zod29.z.array(import_zod29.z.any()).optional(),
|
|
1721
|
+
error: import_zod29.z.string().optional()
|
|
1414
1722
|
})
|
|
1415
1723
|
});
|
|
1416
1724
|
var LLM_TIMEOUT = 12e4;
|
|
1417
1725
|
|
|
1418
1726
|
// src/evaluation/conversation.ts
|
|
1419
|
-
var
|
|
1420
|
-
var TextBlockSchema =
|
|
1421
|
-
type:
|
|
1422
|
-
text:
|
|
1423
|
-
});
|
|
1424
|
-
var ThinkingBlockSchema =
|
|
1425
|
-
type:
|
|
1426
|
-
thinking:
|
|
1427
|
-
});
|
|
1428
|
-
var ToolUseBlockSchema =
|
|
1429
|
-
type:
|
|
1430
|
-
toolName:
|
|
1431
|
-
toolId:
|
|
1432
|
-
input:
|
|
1433
|
-
});
|
|
1434
|
-
var ToolResultBlockSchema =
|
|
1435
|
-
type:
|
|
1436
|
-
toolUseId:
|
|
1437
|
-
content:
|
|
1438
|
-
isError:
|
|
1439
|
-
});
|
|
1440
|
-
var ConversationBlockSchema =
|
|
1727
|
+
var import_zod30 = require("zod");
|
|
1728
|
+
var TextBlockSchema = import_zod30.z.object({
|
|
1729
|
+
type: import_zod30.z.literal("text"),
|
|
1730
|
+
text: import_zod30.z.string()
|
|
1731
|
+
});
|
|
1732
|
+
var ThinkingBlockSchema = import_zod30.z.object({
|
|
1733
|
+
type: import_zod30.z.literal("thinking"),
|
|
1734
|
+
thinking: import_zod30.z.string()
|
|
1735
|
+
});
|
|
1736
|
+
var ToolUseBlockSchema = import_zod30.z.object({
|
|
1737
|
+
type: import_zod30.z.literal("tool_use"),
|
|
1738
|
+
toolName: import_zod30.z.string(),
|
|
1739
|
+
toolId: import_zod30.z.string(),
|
|
1740
|
+
input: import_zod30.z.unknown()
|
|
1741
|
+
});
|
|
1742
|
+
var ToolResultBlockSchema = import_zod30.z.object({
|
|
1743
|
+
type: import_zod30.z.literal("tool_result"),
|
|
1744
|
+
toolUseId: import_zod30.z.string(),
|
|
1745
|
+
content: import_zod30.z.string(),
|
|
1746
|
+
isError: import_zod30.z.boolean().optional()
|
|
1747
|
+
});
|
|
1748
|
+
var ConversationBlockSchema = import_zod30.z.discriminatedUnion("type", [
|
|
1441
1749
|
TextBlockSchema,
|
|
1442
1750
|
ThinkingBlockSchema,
|
|
1443
1751
|
ToolUseBlockSchema,
|
|
@@ -1448,18 +1756,18 @@ var ConversationMessageRoles = [
|
|
|
1448
1756
|
"user",
|
|
1449
1757
|
"system"
|
|
1450
1758
|
];
|
|
1451
|
-
var ConversationMessageSchema =
|
|
1452
|
-
role:
|
|
1453
|
-
content:
|
|
1454
|
-
timestamp:
|
|
1759
|
+
var ConversationMessageSchema = import_zod30.z.object({
|
|
1760
|
+
role: import_zod30.z.enum(ConversationMessageRoles),
|
|
1761
|
+
content: import_zod30.z.array(ConversationBlockSchema),
|
|
1762
|
+
timestamp: import_zod30.z.string()
|
|
1455
1763
|
});
|
|
1456
|
-
var ScenarioConversationSchema =
|
|
1457
|
-
id:
|
|
1458
|
-
projectId:
|
|
1459
|
-
evalRunId:
|
|
1460
|
-
resultId:
|
|
1461
|
-
messages:
|
|
1462
|
-
createdAt:
|
|
1764
|
+
var ScenarioConversationSchema = import_zod30.z.object({
|
|
1765
|
+
id: import_zod30.z.string(),
|
|
1766
|
+
projectId: import_zod30.z.string(),
|
|
1767
|
+
evalRunId: import_zod30.z.string(),
|
|
1768
|
+
resultId: import_zod30.z.string(),
|
|
1769
|
+
messages: import_zod30.z.array(ConversationMessageSchema),
|
|
1770
|
+
createdAt: import_zod30.z.string()
|
|
1463
1771
|
});
|
|
1464
1772
|
|
|
1465
1773
|
// src/evaluation/eval-result.ts
|
|
@@ -1470,98 +1778,98 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1470
1778
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1471
1779
|
return AssertionResultStatus2;
|
|
1472
1780
|
})(AssertionResultStatus || {});
|
|
1473
|
-
var AssertionResultSchema =
|
|
1474
|
-
id:
|
|
1475
|
-
assertionId:
|
|
1476
|
-
assertionType:
|
|
1477
|
-
assertionName:
|
|
1478
|
-
status:
|
|
1479
|
-
message:
|
|
1480
|
-
expected:
|
|
1481
|
-
actual:
|
|
1482
|
-
duration:
|
|
1483
|
-
details:
|
|
1484
|
-
llmTraceSteps:
|
|
1485
|
-
});
|
|
1486
|
-
var EvalRunResultSchema =
|
|
1487
|
-
id:
|
|
1488
|
-
targetId:
|
|
1489
|
-
targetName:
|
|
1781
|
+
var AssertionResultSchema = import_zod31.z.object({
|
|
1782
|
+
id: import_zod31.z.string(),
|
|
1783
|
+
assertionId: import_zod31.z.string(),
|
|
1784
|
+
assertionType: import_zod31.z.string(),
|
|
1785
|
+
assertionName: import_zod31.z.string(),
|
|
1786
|
+
status: import_zod31.z.enum(AssertionResultStatus),
|
|
1787
|
+
message: import_zod31.z.string().optional(),
|
|
1788
|
+
expected: import_zod31.z.string().optional(),
|
|
1789
|
+
actual: import_zod31.z.string().optional(),
|
|
1790
|
+
duration: import_zod31.z.number().optional(),
|
|
1791
|
+
details: import_zod31.z.record(import_zod31.z.string(), import_zod31.z.unknown()).optional(),
|
|
1792
|
+
llmTraceSteps: import_zod31.z.array(LLMTraceStepSchema).optional()
|
|
1793
|
+
});
|
|
1794
|
+
var EvalRunResultSchema = import_zod31.z.object({
|
|
1795
|
+
id: import_zod31.z.string(),
|
|
1796
|
+
targetId: import_zod31.z.string(),
|
|
1797
|
+
targetName: import_zod31.z.string().optional(),
|
|
1490
1798
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
1491
|
-
skillVersionId:
|
|
1799
|
+
skillVersionId: import_zod31.z.string().optional(),
|
|
1492
1800
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
1493
|
-
skillVersion:
|
|
1494
|
-
scenarioId:
|
|
1495
|
-
scenarioName:
|
|
1801
|
+
skillVersion: import_zod31.z.string().optional(),
|
|
1802
|
+
scenarioId: import_zod31.z.string(),
|
|
1803
|
+
scenarioName: import_zod31.z.string(),
|
|
1496
1804
|
/** Snapshot of the trigger prompt used during the run (prevents stale display after edits) */
|
|
1497
|
-
triggerPrompt:
|
|
1805
|
+
triggerPrompt: import_zod31.z.string().optional(),
|
|
1498
1806
|
modelConfig: ModelConfigSchema.optional(),
|
|
1499
|
-
assertionResults:
|
|
1807
|
+
assertionResults: import_zod31.z.array(AssertionResultSchema),
|
|
1500
1808
|
metrics: EvalMetricsSchema.optional(),
|
|
1501
|
-
passed:
|
|
1502
|
-
failed:
|
|
1503
|
-
passRate:
|
|
1504
|
-
duration:
|
|
1505
|
-
outputText:
|
|
1506
|
-
files:
|
|
1507
|
-
fileDiffs:
|
|
1809
|
+
passed: import_zod31.z.number(),
|
|
1810
|
+
failed: import_zod31.z.number(),
|
|
1811
|
+
passRate: import_zod31.z.number(),
|
|
1812
|
+
duration: import_zod31.z.number(),
|
|
1813
|
+
outputText: import_zod31.z.string().optional(),
|
|
1814
|
+
files: import_zod31.z.array(ExpectedFileSchema).optional(),
|
|
1815
|
+
fileDiffs: import_zod31.z.array(DiffContentSchema).optional(),
|
|
1508
1816
|
/** Full template files after execution with status indicators */
|
|
1509
|
-
templateFiles:
|
|
1510
|
-
startedAt:
|
|
1511
|
-
completedAt:
|
|
1817
|
+
templateFiles: import_zod31.z.array(TemplateFileSchema).optional(),
|
|
1818
|
+
startedAt: import_zod31.z.string().optional(),
|
|
1819
|
+
completedAt: import_zod31.z.string().optional(),
|
|
1512
1820
|
llmTrace: LLMTraceSchema.optional(),
|
|
1513
1821
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
1514
|
-
conversation:
|
|
1822
|
+
conversation: import_zod31.z.array(ConversationMessageSchema).optional(),
|
|
1515
1823
|
/** 0-based iteration index when a scenario is run multiple times within a single eval run */
|
|
1516
|
-
iterationIndex:
|
|
1517
|
-
});
|
|
1518
|
-
var PromptResultSchema =
|
|
1519
|
-
text:
|
|
1520
|
-
files:
|
|
1521
|
-
finishReason:
|
|
1522
|
-
reasoning:
|
|
1523
|
-
reasoningDetails:
|
|
1524
|
-
toolCalls:
|
|
1525
|
-
toolResults:
|
|
1526
|
-
warnings:
|
|
1527
|
-
sources:
|
|
1528
|
-
steps:
|
|
1529
|
-
generationTimeMs:
|
|
1530
|
-
prompt:
|
|
1531
|
-
systemPrompt:
|
|
1532
|
-
usage:
|
|
1533
|
-
totalTokens:
|
|
1534
|
-
totalMicrocentsSpent:
|
|
1824
|
+
iterationIndex: import_zod31.z.number().int().min(0).optional()
|
|
1825
|
+
});
|
|
1826
|
+
var PromptResultSchema = import_zod31.z.object({
|
|
1827
|
+
text: import_zod31.z.string(),
|
|
1828
|
+
files: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1829
|
+
finishReason: import_zod31.z.string().optional(),
|
|
1830
|
+
reasoning: import_zod31.z.string().optional(),
|
|
1831
|
+
reasoningDetails: import_zod31.z.unknown().optional(),
|
|
1832
|
+
toolCalls: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1833
|
+
toolResults: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1834
|
+
warnings: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1835
|
+
sources: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1836
|
+
steps: import_zod31.z.array(import_zod31.z.unknown()),
|
|
1837
|
+
generationTimeMs: import_zod31.z.number(),
|
|
1838
|
+
prompt: import_zod31.z.string(),
|
|
1839
|
+
systemPrompt: import_zod31.z.string(),
|
|
1840
|
+
usage: import_zod31.z.object({
|
|
1841
|
+
totalTokens: import_zod31.z.number().optional(),
|
|
1842
|
+
totalMicrocentsSpent: import_zod31.z.number().optional()
|
|
1535
1843
|
})
|
|
1536
1844
|
});
|
|
1537
|
-
var EvaluationResultSchema =
|
|
1538
|
-
id:
|
|
1539
|
-
runId:
|
|
1540
|
-
timestamp:
|
|
1845
|
+
var EvaluationResultSchema = import_zod31.z.object({
|
|
1846
|
+
id: import_zod31.z.string(),
|
|
1847
|
+
runId: import_zod31.z.string(),
|
|
1848
|
+
timestamp: import_zod31.z.number(),
|
|
1541
1849
|
promptResult: PromptResultSchema,
|
|
1542
|
-
testResults:
|
|
1543
|
-
tags:
|
|
1544
|
-
feedback:
|
|
1545
|
-
score:
|
|
1546
|
-
suiteId:
|
|
1547
|
-
});
|
|
1548
|
-
var LeanEvaluationResultSchema =
|
|
1549
|
-
id:
|
|
1550
|
-
runId:
|
|
1551
|
-
timestamp:
|
|
1552
|
-
tags:
|
|
1553
|
-
scenarioId:
|
|
1554
|
-
scenarioVersion:
|
|
1555
|
-
targetId:
|
|
1556
|
-
targetVersion:
|
|
1557
|
-
suiteId:
|
|
1558
|
-
score:
|
|
1559
|
-
time:
|
|
1560
|
-
microcentsSpent:
|
|
1850
|
+
testResults: import_zod31.z.array(import_zod31.z.unknown()),
|
|
1851
|
+
tags: import_zod31.z.array(import_zod31.z.string()).optional(),
|
|
1852
|
+
feedback: import_zod31.z.string().optional(),
|
|
1853
|
+
score: import_zod31.z.number(),
|
|
1854
|
+
suiteId: import_zod31.z.string().optional()
|
|
1855
|
+
});
|
|
1856
|
+
var LeanEvaluationResultSchema = import_zod31.z.object({
|
|
1857
|
+
id: import_zod31.z.string(),
|
|
1858
|
+
runId: import_zod31.z.string(),
|
|
1859
|
+
timestamp: import_zod31.z.number(),
|
|
1860
|
+
tags: import_zod31.z.array(import_zod31.z.string()).optional(),
|
|
1861
|
+
scenarioId: import_zod31.z.string(),
|
|
1862
|
+
scenarioVersion: import_zod31.z.number().optional(),
|
|
1863
|
+
targetId: import_zod31.z.string(),
|
|
1864
|
+
targetVersion: import_zod31.z.number().optional(),
|
|
1865
|
+
suiteId: import_zod31.z.string().optional(),
|
|
1866
|
+
score: import_zod31.z.number(),
|
|
1867
|
+
time: import_zod31.z.number().optional(),
|
|
1868
|
+
microcentsSpent: import_zod31.z.number().optional()
|
|
1561
1869
|
});
|
|
1562
1870
|
|
|
1563
1871
|
// src/evaluation/eval-run-folder.ts
|
|
1564
|
-
var
|
|
1872
|
+
var import_zod32 = require("zod");
|
|
1565
1873
|
var EvalRunFolderSchema = TenantEntitySchema.extend({});
|
|
1566
1874
|
var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
1567
1875
|
id: true,
|
|
@@ -1575,26 +1883,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
|
1575
1883
|
updatedAt: true,
|
|
1576
1884
|
deleted: true
|
|
1577
1885
|
}).partial();
|
|
1578
|
-
var EvalRunFolderMembershipSchema =
|
|
1579
|
-
folderId:
|
|
1580
|
-
evalRunId:
|
|
1581
|
-
projectId:
|
|
1582
|
-
createdAt:
|
|
1886
|
+
var EvalRunFolderMembershipSchema = import_zod32.z.object({
|
|
1887
|
+
folderId: import_zod32.z.string(),
|
|
1888
|
+
evalRunId: import_zod32.z.string(),
|
|
1889
|
+
projectId: import_zod32.z.string(),
|
|
1890
|
+
createdAt: import_zod32.z.string()
|
|
1583
1891
|
});
|
|
1584
1892
|
|
|
1585
1893
|
// src/project/project.ts
|
|
1586
|
-
var
|
|
1894
|
+
var import_zod33 = require("zod");
|
|
1587
1895
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1588
|
-
appId:
|
|
1589
|
-
scenarioTags:
|
|
1896
|
+
appId: import_zod33.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
1897
|
+
scenarioTags: import_zod33.z.array(import_zod33.z.string()).optional().describe("Project-level tag vocabulary for scenarios"),
|
|
1590
1898
|
/** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
|
|
1591
|
-
wixAuthToken:
|
|
1899
|
+
wixAuthToken: import_zod33.z.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
|
|
1592
1900
|
/** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
|
|
1593
|
-
base44AuthFile:
|
|
1901
|
+
base44AuthFile: import_zod33.z.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
|
|
1594
1902
|
/** Resolved at runtime from the encrypted Wix auth token */
|
|
1595
|
-
wixAuthEmail:
|
|
1903
|
+
wixAuthEmail: import_zod33.z.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
|
|
1596
1904
|
/** Resolved at runtime from the encrypted Base44 auth file */
|
|
1597
|
-
base44AuthEmail:
|
|
1905
|
+
base44AuthEmail: import_zod33.z.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
|
|
1598
1906
|
});
|
|
1599
1907
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1600
1908
|
id: true,
|
|
@@ -1620,7 +1928,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
|
1620
1928
|
var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
1621
1929
|
|
|
1622
1930
|
// src/schedule/eval-schedule.ts
|
|
1623
|
-
var
|
|
1931
|
+
var import_zod34 = require("zod");
|
|
1624
1932
|
var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
1625
1933
|
FrequencyType2["DAILY"] = "daily";
|
|
1626
1934
|
FrequencyType2["WEEKDAY"] = "weekday";
|
|
@@ -1630,29 +1938,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
|
1630
1938
|
})(FrequencyType || {});
|
|
1631
1939
|
var EvalScheduleSchema = TenantEntitySchema.extend({
|
|
1632
1940
|
/** Whether the schedule is active */
|
|
1633
|
-
enabled:
|
|
1941
|
+
enabled: import_zod34.z.boolean(),
|
|
1634
1942
|
/** Test suite to run */
|
|
1635
|
-
suiteId:
|
|
1943
|
+
suiteId: import_zod34.z.string(),
|
|
1636
1944
|
/** Preset that provides agent + entities for this schedule */
|
|
1637
|
-
presetId:
|
|
1945
|
+
presetId: import_zod34.z.string(),
|
|
1638
1946
|
/** How often to run */
|
|
1639
|
-
frequencyType:
|
|
1947
|
+
frequencyType: import_zod34.z.nativeEnum(FrequencyType),
|
|
1640
1948
|
/** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
|
|
1641
|
-
timeOfDay:
|
|
1949
|
+
timeOfDay: import_zod34.z.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
|
|
1642
1950
|
/** Day of week (0=Sun, 6=Sat) for weekly schedules */
|
|
1643
|
-
dayOfWeek:
|
|
1951
|
+
dayOfWeek: import_zod34.z.number().min(0).max(6).optional(),
|
|
1644
1952
|
/** Day of month (1-31) for monthly schedules */
|
|
1645
|
-
dayOfMonth:
|
|
1953
|
+
dayOfMonth: import_zod34.z.number().min(1).max(31).optional(),
|
|
1646
1954
|
/** IANA timezone (e.g., 'America/New_York') */
|
|
1647
|
-
timezone:
|
|
1955
|
+
timezone: import_zod34.z.string(),
|
|
1648
1956
|
/** ID of the last eval run created by this schedule */
|
|
1649
|
-
lastRunId:
|
|
1957
|
+
lastRunId: import_zod34.z.string().optional(),
|
|
1650
1958
|
/** Denormalized status of the last run */
|
|
1651
|
-
lastRunStatus:
|
|
1959
|
+
lastRunStatus: import_zod34.z.string().optional(),
|
|
1652
1960
|
/** ISO timestamp of the last run */
|
|
1653
|
-
lastRunAt:
|
|
1961
|
+
lastRunAt: import_zod34.z.string().optional(),
|
|
1654
1962
|
/** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
|
|
1655
|
-
nextRunAt:
|
|
1963
|
+
nextRunAt: import_zod34.z.string().optional()
|
|
1656
1964
|
});
|
|
1657
1965
|
function isValidTimezone(tz) {
|
|
1658
1966
|
try {
|
|
@@ -1665,14 +1973,14 @@ function isValidTimezone(tz) {
|
|
|
1665
1973
|
function validateScheduleFields(data, ctx, options) {
|
|
1666
1974
|
if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
|
|
1667
1975
|
ctx.addIssue({
|
|
1668
|
-
code:
|
|
1976
|
+
code: import_zod34.z.ZodIssueCode.custom,
|
|
1669
1977
|
message: "dayOfWeek is required for weekly schedules",
|
|
1670
1978
|
path: ["dayOfWeek"]
|
|
1671
1979
|
});
|
|
1672
1980
|
}
|
|
1673
1981
|
if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
|
|
1674
1982
|
ctx.addIssue({
|
|
1675
|
-
code:
|
|
1983
|
+
code: import_zod34.z.ZodIssueCode.custom,
|
|
1676
1984
|
message: "dayOfMonth is required for monthly schedules",
|
|
1677
1985
|
path: ["dayOfMonth"]
|
|
1678
1986
|
});
|
|
@@ -1680,7 +1988,7 @@ function validateScheduleFields(data, ctx, options) {
|
|
|
1680
1988
|
const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
|
|
1681
1989
|
if (shouldValidateTz && !isValidTimezone(data.timezone)) {
|
|
1682
1990
|
ctx.addIssue({
|
|
1683
|
-
code:
|
|
1991
|
+
code: import_zod34.z.ZodIssueCode.custom,
|
|
1684
1992
|
message: "Invalid IANA timezone",
|
|
1685
1993
|
path: ["timezone"]
|
|
1686
1994
|
});
|
|
@@ -1703,229 +2011,10 @@ var CreateEvalScheduleInputSchema = BaseCreateScheduleSchema.superRefine((data,
|
|
|
1703
2011
|
var UpdateEvalScheduleInputSchema = BaseCreateScheduleSchema.partial().superRefine((data, ctx) => {
|
|
1704
2012
|
validateScheduleFields(data, ctx, { partial: true });
|
|
1705
2013
|
});
|
|
1706
|
-
|
|
1707
|
-
// src/assertion/system-assertions.ts
|
|
1708
|
-
var SYSTEM_ASSERTION_IDS = {
|
|
1709
|
-
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1710
|
-
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
1711
|
-
BUILD_PASSED: "system:build_passed",
|
|
1712
|
-
TIME_LIMIT: "system:time_limit",
|
|
1713
|
-
COST: "system:cost",
|
|
1714
|
-
LLM_JUDGE: "system:llm_judge",
|
|
1715
|
-
API_CALL: "system:api_call"
|
|
1716
|
-
};
|
|
1717
|
-
function isSystemAssertionId(id) {
|
|
1718
|
-
return id.startsWith("system:");
|
|
1719
|
-
}
|
|
1720
|
-
var SYSTEM_ASSERTIONS = {
|
|
1721
|
-
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1722
|
-
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1723
|
-
name: "Skill Was Called",
|
|
1724
|
-
description: "Check that one or more skills were invoked during the agent run",
|
|
1725
|
-
type: "skill_was_called",
|
|
1726
|
-
parameters: [
|
|
1727
|
-
{
|
|
1728
|
-
name: "skillNames",
|
|
1729
|
-
label: "Skills",
|
|
1730
|
-
type: "string",
|
|
1731
|
-
required: true
|
|
1732
|
-
},
|
|
1733
|
-
{
|
|
1734
|
-
name: "negate",
|
|
1735
|
-
label: "Negate (NOT operator)",
|
|
1736
|
-
type: "boolean",
|
|
1737
|
-
required: false,
|
|
1738
|
-
defaultValue: false
|
|
1739
|
-
}
|
|
1740
|
-
]
|
|
1741
|
-
},
|
|
1742
|
-
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
1743
|
-
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
1744
|
-
name: "Tool Called With Param",
|
|
1745
|
-
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
1746
|
-
type: "tool_called_with_param",
|
|
1747
|
-
parameters: [
|
|
1748
|
-
{
|
|
1749
|
-
name: "toolName",
|
|
1750
|
-
label: "Tool Name",
|
|
1751
|
-
type: "string",
|
|
1752
|
-
required: true
|
|
1753
|
-
},
|
|
1754
|
-
{
|
|
1755
|
-
name: "expectedParams",
|
|
1756
|
-
label: "Expected Parameters (JSON, substring match)",
|
|
1757
|
-
type: "string",
|
|
1758
|
-
required: false
|
|
1759
|
-
},
|
|
1760
|
-
{
|
|
1761
|
-
name: "requireSuccess",
|
|
1762
|
-
label: "Require Successful Call",
|
|
1763
|
-
type: "boolean",
|
|
1764
|
-
required: false,
|
|
1765
|
-
defaultValue: false,
|
|
1766
|
-
advanced: true
|
|
1767
|
-
},
|
|
1768
|
-
{
|
|
1769
|
-
name: "negate",
|
|
1770
|
-
label: "Negate (NOT operator)",
|
|
1771
|
-
type: "boolean",
|
|
1772
|
-
required: false,
|
|
1773
|
-
defaultValue: false
|
|
1774
|
-
}
|
|
1775
|
-
]
|
|
1776
|
-
},
|
|
1777
|
-
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1778
|
-
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1779
|
-
name: "Build Passed",
|
|
1780
|
-
description: "Run a build command and verify it exits with expected code",
|
|
1781
|
-
type: "build_passed",
|
|
1782
|
-
parameters: [
|
|
1783
|
-
{
|
|
1784
|
-
name: "command",
|
|
1785
|
-
label: "Build Command",
|
|
1786
|
-
type: "string",
|
|
1787
|
-
required: false,
|
|
1788
|
-
defaultValue: "yarn build"
|
|
1789
|
-
},
|
|
1790
|
-
{
|
|
1791
|
-
name: "expectedExitCode",
|
|
1792
|
-
label: "Expected Exit Code",
|
|
1793
|
-
type: "number",
|
|
1794
|
-
required: false,
|
|
1795
|
-
defaultValue: 0
|
|
1796
|
-
},
|
|
1797
|
-
{
|
|
1798
|
-
name: "maxBuildTime",
|
|
1799
|
-
label: "Max Build Time (ms)",
|
|
1800
|
-
type: "number",
|
|
1801
|
-
required: false,
|
|
1802
|
-
advanced: true
|
|
1803
|
-
},
|
|
1804
|
-
{
|
|
1805
|
-
name: "maxMemory",
|
|
1806
|
-
label: "Max Memory (MB)",
|
|
1807
|
-
type: "number",
|
|
1808
|
-
required: false,
|
|
1809
|
-
advanced: true
|
|
1810
|
-
}
|
|
1811
|
-
]
|
|
1812
|
-
},
|
|
1813
|
-
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
1814
|
-
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
1815
|
-
name: "Time Limit",
|
|
1816
|
-
description: "Check that the scenario completed within a maximum duration",
|
|
1817
|
-
type: "time_limit",
|
|
1818
|
-
parameters: [
|
|
1819
|
-
{
|
|
1820
|
-
name: "maxDurationMs",
|
|
1821
|
-
label: "Max Duration (ms)",
|
|
1822
|
-
type: "number",
|
|
1823
|
-
required: true,
|
|
1824
|
-
defaultValue: 3e5
|
|
1825
|
-
}
|
|
1826
|
-
]
|
|
1827
|
-
},
|
|
1828
|
-
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
1829
|
-
id: SYSTEM_ASSERTION_IDS.COST,
|
|
1830
|
-
name: "Cost",
|
|
1831
|
-
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
1832
|
-
type: "cost",
|
|
1833
|
-
parameters: [
|
|
1834
|
-
{
|
|
1835
|
-
name: "maxCostUsd",
|
|
1836
|
-
label: "Max Cost (USD)",
|
|
1837
|
-
type: "number",
|
|
1838
|
-
required: true,
|
|
1839
|
-
defaultValue: 1
|
|
1840
|
-
}
|
|
1841
|
-
]
|
|
1842
|
-
},
|
|
1843
|
-
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
1844
|
-
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
1845
|
-
name: "LLM Judge",
|
|
1846
|
-
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
1847
|
-
type: "llm_judge",
|
|
1848
|
-
parameters: [
|
|
1849
|
-
{
|
|
1850
|
-
name: "prompt",
|
|
1851
|
-
label: "Judge Prompt",
|
|
1852
|
-
type: "string",
|
|
1853
|
-
required: true,
|
|
1854
|
-
defaultValue: "Verify the output meets the acceptance criteria."
|
|
1855
|
-
},
|
|
1856
|
-
{
|
|
1857
|
-
name: "minScore",
|
|
1858
|
-
label: "Minimum Score (0-10)",
|
|
1859
|
-
type: "number",
|
|
1860
|
-
required: false,
|
|
1861
|
-
defaultValue: 7
|
|
1862
|
-
},
|
|
1863
|
-
{
|
|
1864
|
-
name: "model",
|
|
1865
|
-
label: "Model",
|
|
1866
|
-
type: "string",
|
|
1867
|
-
required: false
|
|
1868
|
-
}
|
|
1869
|
-
]
|
|
1870
|
-
},
|
|
1871
|
-
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
1872
|
-
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
1873
|
-
name: "API Call",
|
|
1874
|
-
description: "Call an API endpoint and verify the response contains expected data",
|
|
1875
|
-
type: "api_call",
|
|
1876
|
-
parameters: [
|
|
1877
|
-
{
|
|
1878
|
-
name: "url",
|
|
1879
|
-
label: "URL",
|
|
1880
|
-
type: "string",
|
|
1881
|
-
required: true
|
|
1882
|
-
},
|
|
1883
|
-
{
|
|
1884
|
-
name: "method",
|
|
1885
|
-
label: "HTTP Method",
|
|
1886
|
-
type: "string",
|
|
1887
|
-
required: false,
|
|
1888
|
-
defaultValue: "GET"
|
|
1889
|
-
},
|
|
1890
|
-
{
|
|
1891
|
-
name: "requestBody",
|
|
1892
|
-
label: "Request Body (JSON)",
|
|
1893
|
-
type: "string",
|
|
1894
|
-
required: false
|
|
1895
|
-
},
|
|
1896
|
-
{
|
|
1897
|
-
name: "expectedResponse",
|
|
1898
|
-
label: "Expected Response (JSON)",
|
|
1899
|
-
type: "string",
|
|
1900
|
-
required: true
|
|
1901
|
-
},
|
|
1902
|
-
{
|
|
1903
|
-
name: "requestHeaders",
|
|
1904
|
-
label: "Headers (JSON)",
|
|
1905
|
-
type: "string",
|
|
1906
|
-
required: false,
|
|
1907
|
-
advanced: true
|
|
1908
|
-
},
|
|
1909
|
-
{
|
|
1910
|
-
name: "timeoutMs",
|
|
1911
|
-
label: "Timeout (ms)",
|
|
1912
|
-
type: "number",
|
|
1913
|
-
required: false,
|
|
1914
|
-
defaultValue: 3e4,
|
|
1915
|
-
advanced: true
|
|
1916
|
-
}
|
|
1917
|
-
]
|
|
1918
|
-
}
|
|
1919
|
-
};
|
|
1920
|
-
function getSystemAssertions() {
|
|
1921
|
-
return Object.values(SYSTEM_ASSERTIONS);
|
|
1922
|
-
}
|
|
1923
|
-
function getSystemAssertion(id) {
|
|
1924
|
-
return SYSTEM_ASSERTIONS[id];
|
|
1925
|
-
}
|
|
1926
2014
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1927
2015
|
0 && (module.exports = {
|
|
1928
2016
|
AGENT_TYPE_LABELS,
|
|
2017
|
+
ALLOWED_BUILD_COMMANDS,
|
|
1929
2018
|
ALL_AVAILABLE_MODEL_IDS,
|
|
1930
2019
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
1931
2020
|
AVAILABLE_OPENAI_MODEL_IDS,
|
|
@@ -1959,6 +2048,7 @@ function getSystemAssertion(id) {
|
|
|
1959
2048
|
BatchSummarySchema,
|
|
1960
2049
|
BuildCheckTestSchema,
|
|
1961
2050
|
BuildPassedAssertionSchema,
|
|
2051
|
+
BuildPassedCommandStringSchema,
|
|
1962
2052
|
BuildPassedConfigSchema,
|
|
1963
2053
|
BulkImportResultItemSchema,
|
|
1964
2054
|
BulkImportResultSchema,
|
|
@@ -1986,6 +2076,7 @@ function getSystemAssertion(id) {
|
|
|
1986
2076
|
CreateTemplateInputSchema,
|
|
1987
2077
|
CreateTestScenarioInputSchema,
|
|
1988
2078
|
CreateTestSuiteInputSchema,
|
|
2079
|
+
DEFAULT_BUILD_PASSED_COMMAND,
|
|
1989
2080
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
1990
2081
|
DEFAULT_JUDGE_MODEL,
|
|
1991
2082
|
DiffContentSchema,
|
|
@@ -2083,6 +2174,7 @@ function getSystemAssertion(id) {
|
|
|
2083
2174
|
ToolTestSchema,
|
|
2084
2175
|
ToolUseBlockSchema,
|
|
2085
2176
|
TriggerMetadataSchema,
|
|
2177
|
+
TriggerPromptImageSchema,
|
|
2086
2178
|
TriggerSchema,
|
|
2087
2179
|
TriggerType,
|
|
2088
2180
|
UpdateAgentInputSchema,
|
|
@@ -2102,11 +2194,14 @@ function getSystemAssertion(id) {
|
|
|
2102
2194
|
formatTraceEventLine,
|
|
2103
2195
|
getSystemAssertion,
|
|
2104
2196
|
getSystemAssertions,
|
|
2197
|
+
isAllowedBuildCommandString,
|
|
2105
2198
|
isSystemAssertionId,
|
|
2106
2199
|
isValidSkillFolderName,
|
|
2107
2200
|
normalizeBatchAssertionLink,
|
|
2108
2201
|
normalizeModelId,
|
|
2202
|
+
parseBuildCommandToArgv,
|
|
2109
2203
|
parseTraceEventLine,
|
|
2110
|
-
validateAssertionConfig
|
|
2204
|
+
validateAssertionConfig,
|
|
2205
|
+
validateBuildPassedParamsInAssertionLinks
|
|
2111
2206
|
});
|
|
2112
2207
|
//# sourceMappingURL=index.js.map
|