@wix/evalforge-types 0.72.0 → 0.73.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.js +707 -625
- package/build/index.js.map +4 -4
- package/build/index.mjs +700 -624
- package/build/index.mjs.map +4 -4
- package/build/types/assertion/assertion.d.ts +26 -6
- package/build/types/assertion/build-passed-command.d.ts +25 -0
- package/build/types/assertion/index.d.ts +1 -0
- package/build/types/scenario/test-scenario.d.ts +22 -3
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -21,6 +21,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
21
21
|
var index_exports = {};
|
|
22
22
|
__export(index_exports, {
|
|
23
23
|
AGENT_TYPE_LABELS: () => AGENT_TYPE_LABELS,
|
|
24
|
+
ALLOWED_BUILD_COMMANDS: () => ALLOWED_BUILD_COMMANDS,
|
|
24
25
|
ALL_AVAILABLE_MODEL_IDS: () => ALL_AVAILABLE_MODEL_IDS,
|
|
25
26
|
AVAILABLE_CLAUDE_MODEL_IDS: () => AVAILABLE_CLAUDE_MODEL_IDS,
|
|
26
27
|
AVAILABLE_OPENAI_MODEL_IDS: () => AVAILABLE_OPENAI_MODEL_IDS,
|
|
@@ -54,6 +55,7 @@ __export(index_exports, {
|
|
|
54
55
|
BatchSummarySchema: () => BatchSummarySchema,
|
|
55
56
|
BuildCheckTestSchema: () => BuildCheckTestSchema,
|
|
56
57
|
BuildPassedAssertionSchema: () => BuildPassedAssertionSchema,
|
|
58
|
+
BuildPassedCommandStringSchema: () => BuildPassedCommandStringSchema,
|
|
57
59
|
BuildPassedConfigSchema: () => BuildPassedConfigSchema,
|
|
58
60
|
BulkImportResultItemSchema: () => BulkImportResultItemSchema,
|
|
59
61
|
BulkImportResultSchema: () => BulkImportResultSchema,
|
|
@@ -81,6 +83,7 @@ __export(index_exports, {
|
|
|
81
83
|
CreateTemplateInputSchema: () => CreateTemplateInputSchema,
|
|
82
84
|
CreateTestScenarioInputSchema: () => CreateTestScenarioInputSchema,
|
|
83
85
|
CreateTestSuiteInputSchema: () => CreateTestSuiteInputSchema,
|
|
86
|
+
DEFAULT_BUILD_PASSED_COMMAND: () => DEFAULT_BUILD_PASSED_COMMAND,
|
|
84
87
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT: () => DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
85
88
|
DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
|
|
86
89
|
DiffContentSchema: () => DiffContentSchema,
|
|
@@ -198,12 +201,15 @@ __export(index_exports, {
|
|
|
198
201
|
formatTraceEventLine: () => formatTraceEventLine,
|
|
199
202
|
getSystemAssertion: () => getSystemAssertion,
|
|
200
203
|
getSystemAssertions: () => getSystemAssertions,
|
|
204
|
+
isAllowedBuildCommandString: () => isAllowedBuildCommandString,
|
|
201
205
|
isSystemAssertionId: () => isSystemAssertionId,
|
|
202
206
|
isValidSkillFolderName: () => isValidSkillFolderName,
|
|
203
207
|
normalizeBatchAssertionLink: () => normalizeBatchAssertionLink,
|
|
204
208
|
normalizeModelId: () => normalizeModelId,
|
|
209
|
+
parseBuildCommandToArgv: () => parseBuildCommandToArgv,
|
|
205
210
|
parseTraceEventLine: () => parseTraceEventLine,
|
|
206
|
-
validateAssertionConfig: () => validateAssertionConfig
|
|
211
|
+
validateAssertionConfig: () => validateAssertionConfig,
|
|
212
|
+
validateBuildPassedParamsInAssertionLinks: () => validateBuildPassedParamsInAssertionLinks
|
|
207
213
|
});
|
|
208
214
|
module.exports = __toCommonJS(index_exports);
|
|
209
215
|
|
|
@@ -793,11 +799,42 @@ var EnvironmentSchema = import_zod21.z.object({
|
|
|
793
799
|
});
|
|
794
800
|
|
|
795
801
|
// src/scenario/test-scenario.ts
|
|
796
|
-
var
|
|
802
|
+
var import_zod24 = require("zod");
|
|
797
803
|
|
|
798
804
|
// src/assertion/assertion.ts
|
|
805
|
+
var import_zod23 = require("zod");
|
|
806
|
+
|
|
807
|
+
// src/assertion/build-passed-command.ts
|
|
799
808
|
var import_zod22 = require("zod");
|
|
800
|
-
var
|
|
809
|
+
/**
 * Exact command strings accepted for the `build_passed` assertion.
 * Anything not listed here is rejected by the helpers below.
 */
var ALLOWED_BUILD_COMMANDS = [
  "yarn build",
  "npm run build",
  "pnpm run build",
  "pnpm build"
];

/** Default `build_passed` command; it is one of ALLOWED_BUILD_COMMANDS. */
var DEFAULT_BUILD_PASSED_COMMAND = "yarn build";

/**
 * Pre-split argv for each allowlisted command, keyed by the exact command
 * string. Keys mirror ALLOWED_BUILD_COMMANDS one-to-one.
 */
var BUILD_COMMAND_ARGV = {
  "yarn build": ["yarn", "build"],
  "npm run build": ["npm", "run", "build"],
  "pnpm run build": ["pnpm", "run", "build"],
  "pnpm build": ["pnpm", "build"]
};

/**
 * Report whether a command string is on the build-command allowlist.
 * Leading and trailing whitespace is ignored; the match is otherwise exact.
 *
 * @param {string} command - Candidate command string.
 * @returns {boolean} True when the trimmed command is allowlisted.
 */
function isAllowedBuildCommandString(command) {
  const trimmed = command.trim();
  return ALLOWED_BUILD_COMMANDS.includes(trimmed);
}

/**
 * Convert an allowlisted command string into its argv array.
 *
 * @param {string} command - Candidate command string (surrounding whitespace is ignored).
 * @returns {string[] | null} The argv for the command, or null when it is not allowlisted.
 */
function parseBuildCommandToArgv(command) {
  const trimmed = command.trim();
  // Own-property check on purpose: the `in` operator also matches keys
  // inherited from Object.prototype ("constructor", "toString", "__proto__"),
  // which would hand back a non-argv value instead of null.
  if (!Object.prototype.hasOwnProperty.call(BUILD_COMMAND_ARGV, trimmed)) {
    return null;
  }
  return BUILD_COMMAND_ARGV[trimmed];
}
|
|
833
|
+
var enumTuple = ALLOWED_BUILD_COMMANDS;
|
|
834
|
+
var BuildPassedCommandStringSchema = import_zod22.z.enum(enumTuple);
|
|
835
|
+
|
|
836
|
+
// src/assertion/assertion.ts
|
|
837
|
+
var AssertionTypeSchema = import_zod23.z.enum([
|
|
801
838
|
"skill_was_called",
|
|
802
839
|
"tool_called_with_param",
|
|
803
840
|
"build_passed",
|
|
@@ -806,61 +843,61 @@ var AssertionTypeSchema = import_zod22.z.enum([
|
|
|
806
843
|
"llm_judge",
|
|
807
844
|
"api_call"
|
|
808
845
|
]);
|
|
809
|
-
var AssertionParameterTypeSchema =
|
|
846
|
+
var AssertionParameterTypeSchema = import_zod23.z.enum([
|
|
810
847
|
"string",
|
|
811
848
|
"number",
|
|
812
849
|
"boolean"
|
|
813
850
|
]);
|
|
814
|
-
var AssertionParameterSchema =
|
|
851
|
+
var AssertionParameterSchema = import_zod23.z.object({
|
|
815
852
|
/** Parameter name (used as key in params object) */
|
|
816
|
-
name:
|
|
853
|
+
name: import_zod23.z.string().min(1),
|
|
817
854
|
/** Display label for the parameter */
|
|
818
|
-
label:
|
|
855
|
+
label: import_zod23.z.string().min(1),
|
|
819
856
|
/** Parameter type */
|
|
820
857
|
type: AssertionParameterTypeSchema,
|
|
821
858
|
/** Whether this parameter is required */
|
|
822
|
-
required:
|
|
859
|
+
required: import_zod23.z.boolean(),
|
|
823
860
|
/** Default value (optional, used when not provided) */
|
|
824
|
-
defaultValue:
|
|
861
|
+
defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
|
|
825
862
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
826
|
-
advanced:
|
|
863
|
+
advanced: import_zod23.z.boolean().optional()
|
|
827
864
|
});
|
|
828
|
-
var ScenarioAssertionLinkSchema =
|
|
865
|
+
var ScenarioAssertionLinkSchema = import_zod23.z.object({
|
|
829
866
|
/** ID of the system assertion (e.g., 'system:skill_was_called') */
|
|
830
|
-
assertionId:
|
|
867
|
+
assertionId: import_zod23.z.string(),
|
|
831
868
|
/** Parameter values for this assertion in this scenario */
|
|
832
|
-
params:
|
|
833
|
-
|
|
834
|
-
|
|
869
|
+
params: import_zod23.z.record(
|
|
870
|
+
import_zod23.z.string(),
|
|
871
|
+
import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
|
|
835
872
|
).optional()
|
|
836
873
|
});
|
|
837
|
-
var SkillWasCalledConfigSchema =
|
|
874
|
+
var SkillWasCalledConfigSchema = import_zod23.z.object({
|
|
838
875
|
/** Names of the skills that must have been called */
|
|
839
|
-
skillNames:
|
|
876
|
+
skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
|
|
840
877
|
});
|
|
841
|
-
var CostConfigSchema =
|
|
878
|
+
var CostConfigSchema = import_zod23.z.strictObject({
|
|
842
879
|
/** Maximum allowed cost in USD */
|
|
843
|
-
maxCostUsd:
|
|
880
|
+
maxCostUsd: import_zod23.z.number().positive()
|
|
844
881
|
});
|
|
845
|
-
var ToolCalledWithParamConfigSchema =
|
|
882
|
+
var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
|
|
846
883
|
/** Name of the tool that must have been called */
|
|
847
|
-
toolName:
|
|
884
|
+
toolName: import_zod23.z.string().min(1),
|
|
848
885
|
/** JSON string of key-value pairs for expected parameters (substring match). Optional — when omitted, only checks tool presence. */
|
|
849
|
-
expectedParams:
|
|
886
|
+
expectedParams: import_zod23.z.string().min(1).optional(),
|
|
850
887
|
/** If true, the matching tool call must also have succeeded (step.success === true) */
|
|
851
|
-
requireSuccess:
|
|
888
|
+
requireSuccess: import_zod23.z.boolean().optional()
|
|
852
889
|
});
|
|
853
|
-
var BuildPassedConfigSchema =
|
|
854
|
-
/**
|
|
855
|
-
command:
|
|
890
|
+
var BuildPassedConfigSchema = import_zod23.z.strictObject({
|
|
891
|
+
/** Allowlisted command only (default at runtime: "yarn build") */
|
|
892
|
+
command: BuildPassedCommandStringSchema.optional(),
|
|
856
893
|
/** Expected exit code (default: 0) */
|
|
857
|
-
expectedExitCode:
|
|
894
|
+
expectedExitCode: import_zod23.z.number().int().optional()
|
|
858
895
|
});
|
|
859
|
-
var TimeConfigSchema =
|
|
896
|
+
var TimeConfigSchema = import_zod23.z.strictObject({
|
|
860
897
|
/** Maximum allowed duration in milliseconds */
|
|
861
|
-
maxDurationMs:
|
|
898
|
+
maxDurationMs: import_zod23.z.number().int().positive()
|
|
862
899
|
});
|
|
863
|
-
var LlmJudgeConfigSchema =
|
|
900
|
+
var LlmJudgeConfigSchema = import_zod23.z.object({
|
|
864
901
|
/**
|
|
865
902
|
* Prompt template with placeholders:
|
|
866
903
|
* - {{output}}: agent's final output
|
|
@@ -871,65 +908,65 @@ var LlmJudgeConfigSchema = import_zod22.z.object({
|
|
|
871
908
|
* - {{trace}}: step-by-step trace of tool calls
|
|
872
909
|
* - Custom parameters defined in the parameters array
|
|
873
910
|
*/
|
|
874
|
-
prompt:
|
|
911
|
+
prompt: import_zod23.z.string().min(1),
|
|
875
912
|
/** Minimum score to pass (0-10, default 7) */
|
|
876
|
-
minScore:
|
|
913
|
+
minScore: import_zod23.z.number().int().min(0).max(10).optional(),
|
|
877
914
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
878
|
-
model:
|
|
915
|
+
model: import_zod23.z.string().optional(),
|
|
879
916
|
/** Max output tokens */
|
|
880
|
-
maxTokens:
|
|
917
|
+
maxTokens: import_zod23.z.number().int().optional(),
|
|
881
918
|
/** Temperature (0-1) */
|
|
882
|
-
temperature:
|
|
919
|
+
temperature: import_zod23.z.number().min(0).max(1).optional(),
|
|
883
920
|
/** User-defined parameters for this assertion */
|
|
884
|
-
parameters:
|
|
921
|
+
parameters: import_zod23.z.array(AssertionParameterSchema).optional()
|
|
885
922
|
});
|
|
886
|
-
var ApiCallConfigSchema =
|
|
923
|
+
var ApiCallConfigSchema = import_zod23.z.strictObject({
|
|
887
924
|
/** URL to call */
|
|
888
|
-
url:
|
|
925
|
+
url: import_zod23.z.string().min(1),
|
|
889
926
|
/** HTTP method (default GET) */
|
|
890
|
-
method:
|
|
927
|
+
method: import_zod23.z.enum(["GET", "POST"]).optional(),
|
|
891
928
|
/** Request body (JSON string, for POST requests) */
|
|
892
|
-
requestBody:
|
|
929
|
+
requestBody: import_zod23.z.string().optional(),
|
|
893
930
|
/** Expected JSON response to validate against (subset match — extra fields in actual are OK) */
|
|
894
|
-
expectedResponse:
|
|
931
|
+
expectedResponse: import_zod23.z.string().min(1),
|
|
895
932
|
/** Request headers as JSON string of key-value pairs */
|
|
896
|
-
requestHeaders:
|
|
933
|
+
requestHeaders: import_zod23.z.string().optional(),
|
|
897
934
|
/** Request timeout in milliseconds (default 30000) */
|
|
898
|
-
timeoutMs:
|
|
935
|
+
timeoutMs: import_zod23.z.number().int().positive().optional()
|
|
899
936
|
});
|
|
900
937
|
var AssertionBaseFields = {
|
|
901
938
|
/** When true, the assertion's pass/fail logic is inverted (NOT operator). */
|
|
902
|
-
negate:
|
|
939
|
+
negate: import_zod23.z.boolean().optional()
|
|
903
940
|
};
|
|
904
941
|
var SkillWasCalledAssertionSchema = SkillWasCalledConfigSchema.extend({
|
|
905
|
-
type:
|
|
942
|
+
type: import_zod23.z.literal("skill_was_called"),
|
|
906
943
|
...AssertionBaseFields
|
|
907
944
|
});
|
|
908
945
|
var ToolCalledWithParamAssertionSchema = ToolCalledWithParamConfigSchema.extend({
|
|
909
|
-
type:
|
|
946
|
+
type: import_zod23.z.literal("tool_called_with_param"),
|
|
910
947
|
...AssertionBaseFields
|
|
911
948
|
});
|
|
912
949
|
var BuildPassedAssertionSchema = BuildPassedConfigSchema.extend({
|
|
913
|
-
type:
|
|
950
|
+
type: import_zod23.z.literal("build_passed"),
|
|
914
951
|
...AssertionBaseFields
|
|
915
952
|
});
|
|
916
953
|
var CostAssertionSchema = CostConfigSchema.extend({
|
|
917
|
-
type:
|
|
954
|
+
type: import_zod23.z.literal("cost"),
|
|
918
955
|
...AssertionBaseFields
|
|
919
956
|
});
|
|
920
957
|
var LlmJudgeAssertionSchema = LlmJudgeConfigSchema.extend({
|
|
921
|
-
type:
|
|
958
|
+
type: import_zod23.z.literal("llm_judge"),
|
|
922
959
|
...AssertionBaseFields
|
|
923
960
|
});
|
|
924
961
|
var ApiCallAssertionSchema = ApiCallConfigSchema.extend({
|
|
925
|
-
type:
|
|
962
|
+
type: import_zod23.z.literal("api_call"),
|
|
926
963
|
...AssertionBaseFields
|
|
927
964
|
});
|
|
928
965
|
var TimeAssertionSchema = TimeConfigSchema.extend({
|
|
929
|
-
type:
|
|
966
|
+
type: import_zod23.z.literal("time_limit"),
|
|
930
967
|
...AssertionBaseFields
|
|
931
968
|
});
|
|
932
|
-
var AssertionSchema =
|
|
969
|
+
var AssertionSchema = import_zod23.z.union([
|
|
933
970
|
SkillWasCalledAssertionSchema,
|
|
934
971
|
ToolCalledWithParamAssertionSchema,
|
|
935
972
|
BuildPassedAssertionSchema,
|
|
@@ -938,7 +975,7 @@ var AssertionSchema = import_zod22.z.union([
|
|
|
938
975
|
LlmJudgeAssertionSchema,
|
|
939
976
|
ApiCallAssertionSchema
|
|
940
977
|
]);
|
|
941
|
-
var AssertionConfigSchema =
|
|
978
|
+
var AssertionConfigSchema = import_zod23.z.union([
|
|
942
979
|
LlmJudgeConfigSchema,
|
|
943
980
|
// requires prompt - check first
|
|
944
981
|
SkillWasCalledConfigSchema,
|
|
@@ -953,7 +990,7 @@ var AssertionConfigSchema = import_zod22.z.union([
|
|
|
953
990
|
// requires maxCostUsd, uses strictObject
|
|
954
991
|
BuildPassedConfigSchema,
|
|
955
992
|
// all optional, uses strictObject to reject unknown keys
|
|
956
|
-
|
|
993
|
+
import_zod23.z.object({})
|
|
957
994
|
// fallback empty config
|
|
958
995
|
]);
|
|
959
996
|
function validateAssertionConfig(type, config) {
|
|
@@ -977,63 +1014,322 @@ function validateAssertionConfig(type, config) {
|
|
|
977
1014
|
}
|
|
978
1015
|
}
|
|
979
1016
|
|
|
1017
|
+
// src/assertion/system-assertions.ts
|
|
1018
|
+
/** Identifiers of the built-in ("system") assertions; each is `system:<type>`. */
var SYSTEM_ASSERTION_IDS = {
  SKILL_WAS_CALLED: "system:skill_was_called",
  TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
  BUILD_PASSED: "system:build_passed",
  TIME_LIMIT: "system:time_limit",
  COST: "system:cost",
  LLM_JUDGE: "system:llm_judge",
  API_CALL: "system:api_call"
};

/**
 * Report whether an assertion id uses the system-assertion prefix.
 * This is a prefix test only; it does not check that the id is registered
 * in SYSTEM_ASSERTIONS.
 *
 * @param {string} id - Assertion id to inspect.
 * @returns {boolean} True when the id starts with "system:".
 */
function isSystemAssertionId(id) {
  return id.startsWith("system:");
}

/**
 * Registry of system assertion definitions keyed by assertion id.
 * Each entry carries display metadata (name, description), the assertion
 * type, and the list of parameter descriptors for that assertion.
 */
var SYSTEM_ASSERTIONS = {
  [SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
    id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
    name: "Skill Was Called",
    description: "Check that one or more skills were invoked during the agent run",
    type: "skill_was_called",
    parameters: [
      {
        name: "skillNames",
        label: "Skills",
        type: "string",
        required: true
      },
      {
        name: "negate",
        label: "Negate (NOT operator)",
        type: "boolean",
        required: false,
        defaultValue: false
      }
    ]
  },
  [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
    id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
    name: "Tool Called With Param",
    description: "Check that a tool was called with expected parameters (tool name is substring matched)",
    type: "tool_called_with_param",
    parameters: [
      {
        name: "toolName",
        label: "Tool Name",
        type: "string",
        required: true
      },
      {
        name: "expectedParams",
        label: "Expected Parameters (JSON, substring match)",
        type: "string",
        required: false
      },
      {
        name: "requireSuccess",
        label: "Require Successful Call",
        type: "boolean",
        required: false,
        defaultValue: false,
        advanced: true
      },
      {
        name: "negate",
        label: "Negate (NOT operator)",
        type: "boolean",
        required: false,
        defaultValue: false
      }
    ]
  },
  [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
    id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
    name: "Build Passed",
    description: "Run a build command and verify it exits with expected code",
    type: "build_passed",
    parameters: [
      {
        name: "command",
        label: "Build Command",
        type: "string",
        required: false,
        defaultValue: "yarn build"
      },
      {
        name: "expectedExitCode",
        label: "Expected Exit Code",
        type: "number",
        required: false,
        defaultValue: 0
      },
      // NOTE(review): maxBuildTime and maxMemory are not fields of the strict
      // BuildPassedConfigSchema (command, expectedExitCode) - confirm how
      // these two parameters are consumed.
      {
        name: "maxBuildTime",
        label: "Max Build Time (ms)",
        type: "number",
        required: false,
        advanced: true
      },
      {
        name: "maxMemory",
        label: "Max Memory (MB)",
        type: "number",
        required: false,
        advanced: true
      }
    ]
  },
  [SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
    id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
    name: "Time Limit",
    description: "Check that the scenario completed within a maximum duration",
    type: "time_limit",
    parameters: [
      {
        name: "maxDurationMs",
        label: "Max Duration (ms)",
        type: "number",
        required: true,
        defaultValue: 300000 // 5 minutes
      }
    ]
  },
  [SYSTEM_ASSERTION_IDS.COST]: {
    id: SYSTEM_ASSERTION_IDS.COST,
    name: "Cost",
    description: "Check that the scenario LLM execution cost stays within a USD threshold",
    type: "cost",
    parameters: [
      {
        name: "maxCostUsd",
        label: "Max Cost (USD)",
        type: "number",
        required: true,
        defaultValue: 1
      }
    ]
  },
  [SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
    id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
    name: "LLM Judge",
    description: "LLM evaluates the output and assigns a score (0-10)",
    type: "llm_judge",
    parameters: [
      {
        name: "prompt",
        label: "Judge Prompt",
        type: "string",
        required: true,
        defaultValue: "Verify the output meets the acceptance criteria."
      },
      {
        name: "minScore",
        label: "Minimum Score (0-10)",
        type: "number",
        required: false,
        defaultValue: 7
      },
      {
        name: "model",
        label: "Model",
        type: "string",
        required: false
      }
    ]
  },
  [SYSTEM_ASSERTION_IDS.API_CALL]: {
    id: SYSTEM_ASSERTION_IDS.API_CALL,
    name: "API Call",
    description: "Call an API endpoint and verify the response contains expected data",
    type: "api_call",
    parameters: [
      {
        name: "url",
        label: "URL",
        type: "string",
        required: true
      },
      {
        name: "method",
        label: "HTTP Method",
        type: "string",
        required: false,
        defaultValue: "GET"
      },
      {
        name: "requestBody",
        label: "Request Body (JSON)",
        type: "string",
        required: false
      },
      {
        name: "expectedResponse",
        label: "Expected Response (JSON)",
        type: "string",
        required: true
      },
      {
        name: "requestHeaders",
        label: "Headers (JSON)",
        type: "string",
        required: false,
        advanced: true
      },
      {
        name: "timeoutMs",
        label: "Timeout (ms)",
        type: "number",
        required: false,
        defaultValue: 30000, // 30 seconds
        advanced: true
      }
    ]
  }
};

/**
 * List every registered system assertion definition.
 *
 * @returns {object[]} The definitions stored in SYSTEM_ASSERTIONS.
 */
function getSystemAssertions() {
  return Object.values(SYSTEM_ASSERTIONS);
}

/**
 * Look up a system assertion definition by id.
 *
 * @param {string} id - Assertion id, e.g. "system:build_passed".
 * @returns {object | undefined} The definition, or undefined when the id is not registered.
 */
function getSystemAssertion(id) {
  // Own-property check: a bare SYSTEM_ASSERTIONS[id] would resolve inherited
  // keys such as "constructor" or "toString" to Object.prototype members.
  if (!Object.prototype.hasOwnProperty.call(SYSTEM_ASSERTIONS, id)) {
    return undefined;
  }
  return SYSTEM_ASSERTIONS[id];
}
|
|
1236
|
+
|
|
980
1237
|
// src/scenario/test-scenario.ts
|
|
981
1238
|
var MAX_IMAGE_BASE64_LENGTH = 4 * Math.ceil(2 * 1024 * 1024 / 3);
|
|
982
|
-
var TriggerPromptImageSchema =
|
|
1239
|
+
var TriggerPromptImageSchema = import_zod24.z.object({
|
|
983
1240
|
/** Base64-encoded image data (no data URL prefix) */
|
|
984
|
-
base64:
|
|
1241
|
+
base64: import_zod24.z.string().max(MAX_IMAGE_BASE64_LENGTH, "Image exceeds 2 MB size limit"),
|
|
985
1242
|
/** MIME type of the image */
|
|
986
|
-
mediaType:
|
|
1243
|
+
mediaType: import_zod24.z.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
|
|
987
1244
|
/** Original filename of the image */
|
|
988
|
-
name:
|
|
1245
|
+
name: import_zod24.z.string()
|
|
989
1246
|
});
|
|
990
|
-
var ExpectedFileSchema =
|
|
1247
|
+
var ExpectedFileSchema = import_zod24.z.object({
|
|
991
1248
|
/** Relative path where the file should be created */
|
|
992
|
-
path:
|
|
1249
|
+
path: import_zod24.z.string(),
|
|
993
1250
|
/** Optional expected content */
|
|
994
|
-
content:
|
|
1251
|
+
content: import_zod24.z.string().optional()
|
|
995
1252
|
});
|
|
996
1253
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
997
1254
|
/** The prompt sent to the agent to trigger the task */
|
|
998
|
-
triggerPrompt:
|
|
1255
|
+
triggerPrompt: import_zod24.z.string().min(10),
|
|
999
1256
|
/** ID of the template to use for this scenario (null = no template) */
|
|
1000
|
-
templateId:
|
|
1257
|
+
templateId: import_zod24.z.string().nullish(),
|
|
1001
1258
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
1002
|
-
assertions:
|
|
1259
|
+
assertions: import_zod24.z.array(AssertionSchema).optional(),
|
|
1003
1260
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
1004
|
-
assertionIds:
|
|
1261
|
+
assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
|
|
1005
1262
|
/** Linked assertions with per-scenario parameter values */
|
|
1006
|
-
assertionLinks:
|
|
1263
|
+
assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional(),
|
|
1007
1264
|
/** Tags for categorisation and filtering */
|
|
1008
|
-
tags:
|
|
1265
|
+
tags: import_zod24.z.array(import_zod24.z.string()).optional(),
|
|
1009
1266
|
/** Base64-encoded images attached to the trigger prompt (max 3) */
|
|
1010
|
-
triggerPromptImages:
|
|
1011
|
-
});
|
|
1012
|
-
|
|
1267
|
+
triggerPromptImages: import_zod24.z.array(TriggerPromptImageSchema).max(3).optional()
|
|
1268
|
+
});
|
|
1269
|
+
/**
 * Validate the `command` param of every `system:build_passed` assertion link
 * and report problems on the given refinement context.
 *
 * Links for other assertions, and build_passed links whose command is
 * undefined or null, are skipped. A non-string command and a string that is
 * not allowlisted each produce one custom issue at
 * `assertionLinks[i].params.command`.
 *
 * @param {Array<{assertionId: string, params?: Record<string, unknown>}> | undefined | null} links
 *   Assertion links to check; a falsy value is treated as "nothing to validate".
 * @param {{ addIssue: Function }} ctx - Zod refinement context that collects issues.
 * @returns {void}
 */
function validateBuildPassedParamsInAssertionLinks(links, ctx) {
  if (!links) return;
  for (const [index, link] of links.entries()) {
    if (link.assertionId !== SYSTEM_ASSERTION_IDS.BUILD_PASSED) continue;
    const command = link.params?.command;
    if (command === undefined || command === null) continue;
    const path = ["assertionLinks", index, "params", "command"];
    if (typeof command !== "string") {
      ctx.addIssue({
        code: import_zod24.z.ZodIssueCode.custom,
        message: "build_passed command must be a string",
        path
      });
      continue;
    }
    if (!isAllowedBuildCommandString(command)) {
      ctx.addIssue({
        code: import_zod24.z.ZodIssueCode.custom,
        // Built from the allowlist itself so the message cannot drift from
        // the commands that are actually accepted.
        message: `Invalid build_passed command. Allowed: ${ALLOWED_BUILD_COMMANDS.join(", ")}`,
        path
      });
    }
  }
}
|
|
1293
|
+
var TestScenarioCreateBaseSchema = TestScenarioSchema.omit({
|
|
1013
1294
|
id: true,
|
|
1014
1295
|
createdAt: true,
|
|
1015
1296
|
updatedAt: true,
|
|
1016
1297
|
deleted: true
|
|
1017
1298
|
});
|
|
1018
|
-
var
|
|
1299
|
+
var CreateTestScenarioInputSchema = TestScenarioCreateBaseSchema.superRefine((data, ctx) => {
|
|
1300
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1301
|
+
});
|
|
1302
|
+
var UpdateTestScenarioInputSchema = TestScenarioCreateBaseSchema.partial().superRefine((data, ctx) => {
|
|
1303
|
+
if (data.assertionLinks !== void 0) {
|
|
1304
|
+
validateBuildPassedParamsInAssertionLinks(data.assertionLinks, ctx);
|
|
1305
|
+
}
|
|
1306
|
+
});
|
|
1019
1307
|
|
|
1020
1308
|
// src/scenario/batch-import.ts
|
|
1021
|
-
var
|
|
1309
|
+
var import_zod25 = require("zod");
|
|
1022
1310
|
var UUID_REGEX = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
|
1023
|
-
var BatchAssertionLinkSchema =
|
|
1024
|
-
|
|
1311
|
+
var BatchAssertionLinkSchema = import_zod25.z.union([
|
|
1312
|
+
import_zod25.z.string().min(1),
|
|
1025
1313
|
ScenarioAssertionLinkSchema
|
|
1026
1314
|
]);
|
|
1027
|
-
var BatchScenarioEntrySchema =
|
|
1028
|
-
name:
|
|
1029
|
-
description:
|
|
1030
|
-
triggerPrompt:
|
|
1031
|
-
templateId:
|
|
1032
|
-
tags:
|
|
1033
|
-
assertionLinks:
|
|
1315
|
+
/**
 * Schema for a single scenario entry in a batch-import payload.
 *
 * `assertionLinks` entries may be plain id strings or link objects
 * (see BatchAssertionLinkSchema). After field validation, the build_passed
 * command of every object link is checked against the build-command allowlist.
 */
var BatchScenarioEntrySchema = import_zod25.z.object({
  name: import_zod25.z.string().min(1, "name: Required"),
  description: import_zod25.z.string().optional().default(""),
  triggerPrompt: import_zod25.z.string().min(10, "triggerPrompt: Must be at least 10 characters"),
  templateId: import_zod25.z.string().nullish(),
  tags: import_zod25.z.array(import_zod25.z.string()).optional(),
  assertionLinks: import_zod25.z.array(BatchAssertionLinkSchema).optional()
}).superRefine((data, ctx) => {
  if (!data.assertionLinks) return;
  // Keep the array index-aligned with the input: filtering string links out
  // first would shift indices, so a reported `assertionLinks[i]` path could
  // point at the wrong entry. String links become param-less placeholders,
  // which the validator skips because they carry no command.
  const indexAlignedLinks = data.assertionLinks.map(
    (link) => typeof link === "string" ? { assertionId: link } : link
  );
  validateBuildPassedParamsInAssertionLinks(indexAlignedLinks, ctx);
});
|
|
1035
|
-
var BatchImportPayloadSchema =
|
|
1036
|
-
scenarios:
|
|
1331
|
+
var BatchImportPayloadSchema = import_zod25.z.object({
|
|
1332
|
+
scenarios: import_zod25.z.array(BatchScenarioEntrySchema).min(1, "scenarios array must contain at least one entry").max(100, "Maximum 100 scenarios per upload")
|
|
1037
1333
|
});
|
|
1038
1334
|
var BATCH_IMPORT_LIMITS = {
|
|
1039
1335
|
MAX_SCENARIOS: 100,
|
|
@@ -1055,29 +1351,29 @@ function normalizeBatchAssertionLink(link) {
|
|
|
1055
1351
|
}
|
|
1056
1352
|
return link;
|
|
1057
1353
|
}
|
|
1058
|
-
var BatchResultItemSchema =
|
|
1059
|
-
index:
|
|
1060
|
-
name:
|
|
1061
|
-
status:
|
|
1062
|
-
id:
|
|
1063
|
-
errors:
|
|
1064
|
-
});
|
|
1065
|
-
var BatchSummarySchema =
|
|
1066
|
-
total:
|
|
1067
|
-
valid:
|
|
1068
|
-
invalid:
|
|
1069
|
-
created:
|
|
1070
|
-
});
|
|
1071
|
-
var BatchImportResponseSchema =
|
|
1354
|
+
var BatchResultItemSchema = import_zod25.z.object({
|
|
1355
|
+
index: import_zod25.z.number(),
|
|
1356
|
+
name: import_zod25.z.string(),
|
|
1357
|
+
status: import_zod25.z.enum(["valid", "invalid"]),
|
|
1358
|
+
id: import_zod25.z.string().nullable().optional(),
|
|
1359
|
+
errors: import_zod25.z.array(import_zod25.z.string()).optional()
|
|
1360
|
+
});
|
|
1361
|
+
var BatchSummarySchema = import_zod25.z.object({
|
|
1362
|
+
total: import_zod25.z.number(),
|
|
1363
|
+
valid: import_zod25.z.number(),
|
|
1364
|
+
invalid: import_zod25.z.number(),
|
|
1365
|
+
created: import_zod25.z.number()
|
|
1366
|
+
});
|
|
1367
|
+
var BatchImportResponseSchema = import_zod25.z.object({
|
|
1072
1368
|
summary: BatchSummarySchema,
|
|
1073
|
-
results:
|
|
1369
|
+
results: import_zod25.z.array(BatchResultItemSchema)
|
|
1074
1370
|
});
|
|
1075
1371
|
|
|
1076
1372
|
// src/suite/test-suite.ts
|
|
1077
|
-
var
|
|
1373
|
+
var import_zod26 = require("zod");
|
|
1078
1374
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1079
1375
|
/** IDs of test scenarios in this suite */
|
|
1080
|
-
scenarioIds:
|
|
1376
|
+
scenarioIds: import_zod26.z.array(import_zod26.z.string())
|
|
1081
1377
|
});
|
|
1082
1378
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1083
1379
|
id: true,
|
|
@@ -1088,21 +1384,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1088
1384
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1089
1385
|
|
|
1090
1386
|
// src/evaluation/metrics.ts
|
|
1091
|
-
var
|
|
1092
|
-
var TokenUsageSchema =
|
|
1093
|
-
prompt:
|
|
1094
|
-
completion:
|
|
1095
|
-
total:
|
|
1096
|
-
});
|
|
1097
|
-
var EvalMetricsSchema =
|
|
1098
|
-
totalAssertions:
|
|
1099
|
-
passed:
|
|
1100
|
-
failed:
|
|
1101
|
-
skipped:
|
|
1102
|
-
errors:
|
|
1103
|
-
passRate:
|
|
1104
|
-
avgDuration:
|
|
1105
|
-
totalDuration:
|
|
1387
|
+
var import_zod27 = require("zod");
|
|
1388
|
+
var TokenUsageSchema = import_zod27.z.object({
|
|
1389
|
+
prompt: import_zod27.z.number(),
|
|
1390
|
+
completion: import_zod27.z.number(),
|
|
1391
|
+
total: import_zod27.z.number()
|
|
1392
|
+
});
|
|
1393
|
+
var EvalMetricsSchema = import_zod27.z.object({
|
|
1394
|
+
totalAssertions: import_zod27.z.number(),
|
|
1395
|
+
passed: import_zod27.z.number(),
|
|
1396
|
+
failed: import_zod27.z.number(),
|
|
1397
|
+
skipped: import_zod27.z.number(),
|
|
1398
|
+
errors: import_zod27.z.number(),
|
|
1399
|
+
passRate: import_zod27.z.number(),
|
|
1400
|
+
avgDuration: import_zod27.z.number(),
|
|
1401
|
+
totalDuration: import_zod27.z.number()
|
|
1106
1402
|
});
|
|
1107
1403
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1108
1404
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1112,7 +1408,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1112
1408
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1113
1409
|
return EvalStatus2;
|
|
1114
1410
|
})(EvalStatus || {});
|
|
1115
|
-
var EvalStatusSchema =
|
|
1411
|
+
var EvalStatusSchema = import_zod27.z.enum(EvalStatus);
|
|
1116
1412
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1117
1413
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1118
1414
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1120,54 +1416,54 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1120
1416
|
LLMStepType2["THINKING"] = "thinking";
|
|
1121
1417
|
return LLMStepType2;
|
|
1122
1418
|
})(LLMStepType || {});
|
|
1123
|
-
var LLMTraceStepSchema =
|
|
1124
|
-
id:
|
|
1125
|
-
stepNumber:
|
|
1126
|
-
type:
|
|
1127
|
-
model:
|
|
1128
|
-
provider:
|
|
1129
|
-
startedAt:
|
|
1130
|
-
durationMs:
|
|
1419
|
+
var LLMTraceStepSchema = import_zod27.z.object({
|
|
1420
|
+
id: import_zod27.z.string(),
|
|
1421
|
+
stepNumber: import_zod27.z.number(),
|
|
1422
|
+
type: import_zod27.z.enum(LLMStepType),
|
|
1423
|
+
model: import_zod27.z.string(),
|
|
1424
|
+
provider: import_zod27.z.string(),
|
|
1425
|
+
startedAt: import_zod27.z.string(),
|
|
1426
|
+
durationMs: import_zod27.z.number(),
|
|
1131
1427
|
tokenUsage: TokenUsageSchema,
|
|
1132
|
-
costUsd:
|
|
1133
|
-
toolName:
|
|
1134
|
-
toolArguments:
|
|
1135
|
-
inputPreview:
|
|
1136
|
-
outputPreview:
|
|
1137
|
-
success:
|
|
1138
|
-
error:
|
|
1139
|
-
turnIndex:
|
|
1140
|
-
});
|
|
1141
|
-
var LLMBreakdownStatsSchema =
|
|
1142
|
-
count:
|
|
1143
|
-
durationMs:
|
|
1144
|
-
tokens:
|
|
1145
|
-
costUsd:
|
|
1146
|
-
});
|
|
1147
|
-
var LLMTraceSummarySchema =
|
|
1148
|
-
totalSteps:
|
|
1149
|
-
totalTurns:
|
|
1150
|
-
totalDurationMs:
|
|
1428
|
+
costUsd: import_zod27.z.number(),
|
|
1429
|
+
toolName: import_zod27.z.string().optional(),
|
|
1430
|
+
toolArguments: import_zod27.z.string().optional(),
|
|
1431
|
+
inputPreview: import_zod27.z.string().optional(),
|
|
1432
|
+
outputPreview: import_zod27.z.string().optional(),
|
|
1433
|
+
success: import_zod27.z.boolean(),
|
|
1434
|
+
error: import_zod27.z.string().optional(),
|
|
1435
|
+
turnIndex: import_zod27.z.number().optional()
|
|
1436
|
+
});
|
|
1437
|
+
var LLMBreakdownStatsSchema = import_zod27.z.object({
|
|
1438
|
+
count: import_zod27.z.number(),
|
|
1439
|
+
durationMs: import_zod27.z.number(),
|
|
1440
|
+
tokens: import_zod27.z.number(),
|
|
1441
|
+
costUsd: import_zod27.z.number()
|
|
1442
|
+
});
|
|
1443
|
+
var LLMTraceSummarySchema = import_zod27.z.object({
|
|
1444
|
+
totalSteps: import_zod27.z.number(),
|
|
1445
|
+
totalTurns: import_zod27.z.number().optional(),
|
|
1446
|
+
totalDurationMs: import_zod27.z.number(),
|
|
1151
1447
|
totalTokens: TokenUsageSchema,
|
|
1152
|
-
totalCostUsd:
|
|
1153
|
-
stepTypeBreakdown:
|
|
1154
|
-
modelBreakdown:
|
|
1155
|
-
modelsUsed:
|
|
1156
|
-
});
|
|
1157
|
-
var LLMTraceSchema =
|
|
1158
|
-
id:
|
|
1159
|
-
steps:
|
|
1448
|
+
totalCostUsd: import_zod27.z.number(),
|
|
1449
|
+
stepTypeBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
1450
|
+
modelBreakdown: import_zod27.z.record(import_zod27.z.string(), LLMBreakdownStatsSchema),
|
|
1451
|
+
modelsUsed: import_zod27.z.array(import_zod27.z.string())
|
|
1452
|
+
});
|
|
1453
|
+
var LLMTraceSchema = import_zod27.z.object({
|
|
1454
|
+
id: import_zod27.z.string(),
|
|
1455
|
+
steps: import_zod27.z.array(LLMTraceStepSchema),
|
|
1160
1456
|
summary: LLMTraceSummarySchema
|
|
1161
1457
|
});
|
|
1162
1458
|
|
|
1163
1459
|
// src/evaluation/eval-result.ts
|
|
1164
|
-
var
|
|
1460
|
+
var import_zod31 = require("zod");
|
|
1165
1461
|
|
|
1166
1462
|
// src/evaluation/eval-run.ts
|
|
1167
|
-
var
|
|
1463
|
+
var import_zod29 = require("zod");
|
|
1168
1464
|
|
|
1169
1465
|
// src/evaluation/live-trace.ts
|
|
1170
|
-
var
|
|
1466
|
+
var import_zod28 = require("zod");
|
|
1171
1467
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1172
1468
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1173
1469
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1181,37 +1477,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1181
1477
|
LiveTraceEventType2["USER"] = "user";
|
|
1182
1478
|
return LiveTraceEventType2;
|
|
1183
1479
|
})(LiveTraceEventType || {});
|
|
1184
|
-
var LiveTraceEventSchema =
|
|
1480
|
+
var LiveTraceEventSchema = import_zod28.z.object({
|
|
1185
1481
|
/** The evaluation run ID */
|
|
1186
|
-
evalRunId:
|
|
1482
|
+
evalRunId: import_zod28.z.string(),
|
|
1187
1483
|
/** The scenario ID being executed */
|
|
1188
|
-
scenarioId:
|
|
1484
|
+
scenarioId: import_zod28.z.string(),
|
|
1189
1485
|
/** The scenario name for display */
|
|
1190
|
-
scenarioName:
|
|
1486
|
+
scenarioName: import_zod28.z.string(),
|
|
1191
1487
|
/** The target ID (skill, agent, etc.) */
|
|
1192
|
-
targetId:
|
|
1488
|
+
targetId: import_zod28.z.string(),
|
|
1193
1489
|
/** The target name for display */
|
|
1194
|
-
targetName:
|
|
1490
|
+
targetName: import_zod28.z.string(),
|
|
1195
1491
|
/** Step number in the current scenario execution */
|
|
1196
|
-
stepNumber:
|
|
1492
|
+
stepNumber: import_zod28.z.number(),
|
|
1197
1493
|
/** Type of trace event */
|
|
1198
|
-
type:
|
|
1494
|
+
type: import_zod28.z.enum(LiveTraceEventType),
|
|
1199
1495
|
/** Tool name if this is a tool_use event */
|
|
1200
|
-
toolName:
|
|
1496
|
+
toolName: import_zod28.z.string().optional(),
|
|
1201
1497
|
/** Tool arguments preview (truncated JSON) */
|
|
1202
|
-
toolArgs:
|
|
1498
|
+
toolArgs: import_zod28.z.string().optional(),
|
|
1203
1499
|
/** Output preview (truncated text) */
|
|
1204
|
-
outputPreview:
|
|
1500
|
+
outputPreview: import_zod28.z.string().optional(),
|
|
1205
1501
|
/** File path for file operations */
|
|
1206
|
-
filePath:
|
|
1502
|
+
filePath: import_zod28.z.string().optional(),
|
|
1207
1503
|
/** Elapsed time in milliseconds for progress events */
|
|
1208
|
-
elapsedMs:
|
|
1504
|
+
elapsedMs: import_zod28.z.number().optional(),
|
|
1209
1505
|
/** Thinking/reasoning text from Claude */
|
|
1210
|
-
thinking:
|
|
1506
|
+
thinking: import_zod28.z.string().optional(),
|
|
1211
1507
|
/** Timestamp when this event occurred */
|
|
1212
|
-
timestamp:
|
|
1508
|
+
timestamp: import_zod28.z.string(),
|
|
1213
1509
|
/** Whether this is the final event for this scenario */
|
|
1214
|
-
isComplete:
|
|
1510
|
+
isComplete: import_zod28.z.boolean()
|
|
1215
1511
|
});
|
|
1216
1512
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1217
1513
|
function parseTraceEventLine(line) {
|
|
@@ -1240,40 +1536,40 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1240
1536
|
TriggerType2["SCHEDULED"] = "SCHEDULED";
|
|
1241
1537
|
return TriggerType2;
|
|
1242
1538
|
})(TriggerType || {});
|
|
1243
|
-
var TriggerMetadataSchema =
|
|
1244
|
-
version:
|
|
1245
|
-
resourceUpdated:
|
|
1246
|
-
scheduleId:
|
|
1539
|
+
var TriggerMetadataSchema = import_zod29.z.object({
|
|
1540
|
+
version: import_zod29.z.string().optional(),
|
|
1541
|
+
resourceUpdated: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1542
|
+
scheduleId: import_zod29.z.string().optional()
|
|
1247
1543
|
});
|
|
1248
|
-
var TriggerSchema =
|
|
1249
|
-
id:
|
|
1544
|
+
var TriggerSchema = import_zod29.z.object({
|
|
1545
|
+
id: import_zod29.z.string(),
|
|
1250
1546
|
metadata: TriggerMetadataSchema.optional(),
|
|
1251
|
-
type:
|
|
1547
|
+
type: import_zod29.z.nativeEnum(TriggerType)
|
|
1252
1548
|
});
|
|
1253
|
-
var DiffLineTypeSchema =
|
|
1254
|
-
var DiffLineSchema =
|
|
1549
|
+
var DiffLineTypeSchema = import_zod29.z.enum(["added", "removed", "unchanged"]);
|
|
1550
|
+
var DiffLineSchema = import_zod29.z.object({
|
|
1255
1551
|
type: DiffLineTypeSchema,
|
|
1256
|
-
content:
|
|
1257
|
-
lineNumber:
|
|
1258
|
-
});
|
|
1259
|
-
var DiffContentSchema =
|
|
1260
|
-
path:
|
|
1261
|
-
expected:
|
|
1262
|
-
actual:
|
|
1263
|
-
diffLines:
|
|
1264
|
-
renamedFrom:
|
|
1552
|
+
content: import_zod29.z.string(),
|
|
1553
|
+
lineNumber: import_zod29.z.number()
|
|
1554
|
+
});
|
|
1555
|
+
var DiffContentSchema = import_zod29.z.object({
|
|
1556
|
+
path: import_zod29.z.string(),
|
|
1557
|
+
expected: import_zod29.z.string(),
|
|
1558
|
+
actual: import_zod29.z.string(),
|
|
1559
|
+
diffLines: import_zod29.z.array(DiffLineSchema),
|
|
1560
|
+
renamedFrom: import_zod29.z.string().optional(),
|
|
1265
1561
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1266
|
-
isInfrastructure:
|
|
1562
|
+
isInfrastructure: import_zod29.z.boolean().optional()
|
|
1267
1563
|
});
|
|
1268
|
-
var CommandExecutionSchema =
|
|
1269
|
-
command:
|
|
1270
|
-
exitCode:
|
|
1271
|
-
output:
|
|
1272
|
-
duration:
|
|
1564
|
+
var CommandExecutionSchema = import_zod29.z.object({
|
|
1565
|
+
command: import_zod29.z.string(),
|
|
1566
|
+
exitCode: import_zod29.z.number(),
|
|
1567
|
+
output: import_zod29.z.string().optional(),
|
|
1568
|
+
duration: import_zod29.z.number()
|
|
1273
1569
|
});
|
|
1274
|
-
var FileModificationSchema =
|
|
1275
|
-
path:
|
|
1276
|
-
action:
|
|
1570
|
+
var FileModificationSchema = import_zod29.z.object({
|
|
1571
|
+
path: import_zod29.z.string(),
|
|
1572
|
+
action: import_zod29.z.enum(["created", "modified", "deleted"])
|
|
1277
1573
|
});
|
|
1278
1574
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1279
1575
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1281,62 +1577,62 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1281
1577
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1282
1578
|
return TemplateFileStatus2;
|
|
1283
1579
|
})(TemplateFileStatus || {});
|
|
1284
|
-
var TemplateFileSchema =
|
|
1580
|
+
var TemplateFileSchema = import_zod29.z.object({
|
|
1285
1581
|
/** Relative path within the template */
|
|
1286
|
-
path:
|
|
1582
|
+
path: import_zod29.z.string(),
|
|
1287
1583
|
/** Full file content after execution */
|
|
1288
|
-
content:
|
|
1584
|
+
content: import_zod29.z.string(),
|
|
1289
1585
|
/** File status (new, modified, unchanged) */
|
|
1290
|
-
status:
|
|
1586
|
+
status: import_zod29.z.enum(["new", "modified", "unchanged"]),
|
|
1291
1587
|
/** Whether this file is an infrastructure/config file (e.g. .claude/settings.json, .mcp.json) */
|
|
1292
|
-
isInfrastructure:
|
|
1588
|
+
isInfrastructure: import_zod29.z.boolean().optional()
|
|
1293
1589
|
});
|
|
1294
|
-
var ApiCallSchema =
|
|
1295
|
-
endpoint:
|
|
1296
|
-
tokensUsed:
|
|
1297
|
-
duration:
|
|
1590
|
+
var ApiCallSchema = import_zod29.z.object({
|
|
1591
|
+
endpoint: import_zod29.z.string(),
|
|
1592
|
+
tokensUsed: import_zod29.z.number(),
|
|
1593
|
+
duration: import_zod29.z.number()
|
|
1298
1594
|
});
|
|
1299
|
-
var ExecutionTraceSchema =
|
|
1300
|
-
commands:
|
|
1301
|
-
filesModified:
|
|
1302
|
-
apiCalls:
|
|
1303
|
-
totalDuration:
|
|
1595
|
+
var ExecutionTraceSchema = import_zod29.z.object({
|
|
1596
|
+
commands: import_zod29.z.array(CommandExecutionSchema),
|
|
1597
|
+
filesModified: import_zod29.z.array(FileModificationSchema),
|
|
1598
|
+
apiCalls: import_zod29.z.array(ApiCallSchema),
|
|
1599
|
+
totalDuration: import_zod29.z.number()
|
|
1304
1600
|
});
|
|
1305
|
-
var RunAnalysisFindingSchema =
|
|
1306
|
-
category:
|
|
1601
|
+
var RunAnalysisFindingSchema = import_zod29.z.object({
|
|
1602
|
+
category: import_zod29.z.enum([
|
|
1307
1603
|
"failure_pattern",
|
|
1308
1604
|
"cost_waste",
|
|
1309
1605
|
"flakiness",
|
|
1310
1606
|
"inefficiency",
|
|
1311
1607
|
"positive"
|
|
1312
1608
|
]),
|
|
1313
|
-
severity:
|
|
1314
|
-
description:
|
|
1315
|
-
affectedScenarios:
|
|
1316
|
-
recommendation:
|
|
1609
|
+
severity: import_zod29.z.enum(["high", "medium", "low"]),
|
|
1610
|
+
description: import_zod29.z.string(),
|
|
1611
|
+
affectedScenarios: import_zod29.z.array(import_zod29.z.string()),
|
|
1612
|
+
recommendation: import_zod29.z.string().optional()
|
|
1317
1613
|
});
|
|
1318
|
-
var RunAnalysisSchema =
|
|
1319
|
-
generatedAt:
|
|
1320
|
-
summary:
|
|
1321
|
-
findings:
|
|
1614
|
+
var RunAnalysisSchema = import_zod29.z.object({
|
|
1615
|
+
generatedAt: import_zod29.z.string(),
|
|
1616
|
+
summary: import_zod29.z.string(),
|
|
1617
|
+
findings: import_zod29.z.array(RunAnalysisFindingSchema)
|
|
1322
1618
|
});
|
|
1323
1619
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1324
1620
|
/** Agent ID for this run */
|
|
1325
|
-
agentId:
|
|
1621
|
+
agentId: import_zod29.z.string().optional(),
|
|
1326
1622
|
/** Preset ID that originated this run (optional) */
|
|
1327
|
-
presetId:
|
|
1623
|
+
presetId: import_zod29.z.string().optional(),
|
|
1328
1624
|
/** Skill IDs for this run */
|
|
1329
|
-
skillIds:
|
|
1625
|
+
skillIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1330
1626
|
/** Map of skillId to skillVersionId for this run */
|
|
1331
|
-
skillVersions:
|
|
1627
|
+
skillVersions: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.string()).optional(),
|
|
1332
1628
|
/** Scenario IDs to run (always present — resolved server-side from tags when needed) */
|
|
1333
|
-
scenarioIds:
|
|
1629
|
+
scenarioIds: import_zod29.z.array(import_zod29.z.string()),
|
|
1334
1630
|
/** Current status */
|
|
1335
1631
|
status: EvalStatusSchema,
|
|
1336
1632
|
/** Progress percentage (0-100) */
|
|
1337
|
-
progress:
|
|
1633
|
+
progress: import_zod29.z.number(),
|
|
1338
1634
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1339
|
-
results:
|
|
1635
|
+
results: import_zod29.z.array(import_zod29.z.lazy(() => EvalRunResultSchema)),
|
|
1340
1636
|
/** Aggregated metrics across all results */
|
|
1341
1637
|
aggregateMetrics: EvalMetricsSchema,
|
|
1342
1638
|
/** Aggregated LLM trace summary */
|
|
@@ -1344,41 +1640,41 @@ var EvalRunSchema = TenantEntitySchema.extend({
|
|
|
1344
1640
|
/** What triggered this run */
|
|
1345
1641
|
trigger: TriggerSchema.optional(),
|
|
1346
1642
|
/** When the run started (set when evaluation is triggered) */
|
|
1347
|
-
startedAt:
|
|
1643
|
+
startedAt: import_zod29.z.string().optional(),
|
|
1348
1644
|
/** When the run completed */
|
|
1349
|
-
completedAt:
|
|
1645
|
+
completedAt: import_zod29.z.string().optional(),
|
|
1350
1646
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1351
|
-
liveTraceEvents:
|
|
1647
|
+
liveTraceEvents: import_zod29.z.array(LiveTraceEventSchema).optional(),
|
|
1352
1648
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1353
|
-
jobId:
|
|
1649
|
+
jobId: import_zod29.z.string().optional(),
|
|
1354
1650
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1355
|
-
jobStatus:
|
|
1651
|
+
jobStatus: import_zod29.z.string().optional(),
|
|
1356
1652
|
/** Remote job error message if the job failed */
|
|
1357
|
-
jobError:
|
|
1653
|
+
jobError: import_zod29.z.string().optional(),
|
|
1358
1654
|
/** Timestamp of the last job status check */
|
|
1359
|
-
jobStatusCheckedAt:
|
|
1655
|
+
jobStatusCheckedAt: import_zod29.z.string().optional(),
|
|
1360
1656
|
/** MCP server IDs to enable for this run (optional) */
|
|
1361
|
-
mcpIds:
|
|
1657
|
+
mcpIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1362
1658
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
1363
|
-
subAgentIds:
|
|
1659
|
+
subAgentIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1364
1660
|
/** Rule IDs to enable for this run (optional) */
|
|
1365
|
-
ruleIds:
|
|
1661
|
+
ruleIds: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1366
1662
|
/** Tags used to select scenarios for this run (for traceability) */
|
|
1367
|
-
tags:
|
|
1663
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
1368
1664
|
/** How many times each scenario is executed within this eval run. Default: 1. Max: 20. */
|
|
1369
|
-
runsPerScenario:
|
|
1665
|
+
runsPerScenario: import_zod29.z.number().int().min(1).max(20).optional(),
|
|
1370
1666
|
/** Snapshot of agent configuration captured at run creation time */
|
|
1371
|
-
agentSnapshot:
|
|
1372
|
-
name:
|
|
1667
|
+
agentSnapshot: import_zod29.z.object({
|
|
1668
|
+
name: import_zod29.z.string().optional(),
|
|
1373
1669
|
agentType: AgentTypeSchema.optional(),
|
|
1374
1670
|
runCommand: AgentRunCommandSchema.optional(),
|
|
1375
|
-
systemPrompt:
|
|
1671
|
+
systemPrompt: import_zod29.z.string().nullable().optional(),
|
|
1376
1672
|
modelConfig: ModelConfigSchema.optional()
|
|
1377
1673
|
}).optional(),
|
|
1378
1674
|
/** UUID linking all runs in a comparison group */
|
|
1379
|
-
comparisonGroupId:
|
|
1675
|
+
comparisonGroupId: import_zod29.z.string().optional(),
|
|
1380
1676
|
/** Human-readable label for this variant (e.g., "MCP: Wix Stores") */
|
|
1381
|
-
comparisonLabel:
|
|
1677
|
+
comparisonLabel: import_zod29.z.string().optional(),
|
|
1382
1678
|
/** LLM-generated analysis of the completed run */
|
|
1383
1679
|
runAnalysis: RunAnalysisSchema.optional()
|
|
1384
1680
|
});
|
|
@@ -1396,60 +1692,60 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1396
1692
|
agentSnapshot: true
|
|
1397
1693
|
}).extend({
|
|
1398
1694
|
/** Optional on input — backend resolves from tags when not provided */
|
|
1399
|
-
scenarioIds:
|
|
1695
|
+
scenarioIds: import_zod29.z.array(import_zod29.z.string()).optional()
|
|
1400
1696
|
}).refine(
|
|
1401
1697
|
(data) => data.scenarioIds && data.scenarioIds.length > 0 || data.tags && data.tags.length > 0,
|
|
1402
1698
|
{ message: "Either scenarioIds or tags must be provided" }
|
|
1403
1699
|
);
|
|
1404
|
-
var EvaluationProgressSchema =
|
|
1405
|
-
runId:
|
|
1406
|
-
targetId:
|
|
1407
|
-
totalScenarios:
|
|
1408
|
-
completedScenarios:
|
|
1409
|
-
scenarioProgress:
|
|
1410
|
-
|
|
1411
|
-
scenarioId:
|
|
1412
|
-
currentStep:
|
|
1413
|
-
error:
|
|
1700
|
+
var EvaluationProgressSchema = import_zod29.z.object({
|
|
1701
|
+
runId: import_zod29.z.string(),
|
|
1702
|
+
targetId: import_zod29.z.string(),
|
|
1703
|
+
totalScenarios: import_zod29.z.number(),
|
|
1704
|
+
completedScenarios: import_zod29.z.number(),
|
|
1705
|
+
scenarioProgress: import_zod29.z.array(
|
|
1706
|
+
import_zod29.z.object({
|
|
1707
|
+
scenarioId: import_zod29.z.string(),
|
|
1708
|
+
currentStep: import_zod29.z.string(),
|
|
1709
|
+
error: import_zod29.z.string().optional()
|
|
1414
1710
|
})
|
|
1415
1711
|
),
|
|
1416
|
-
createdAt:
|
|
1417
|
-
});
|
|
1418
|
-
var EvaluationLogSchema =
|
|
1419
|
-
runId:
|
|
1420
|
-
scenarioId:
|
|
1421
|
-
log:
|
|
1422
|
-
level:
|
|
1423
|
-
message:
|
|
1424
|
-
args:
|
|
1425
|
-
error:
|
|
1712
|
+
createdAt: import_zod29.z.number()
|
|
1713
|
+
});
|
|
1714
|
+
var EvaluationLogSchema = import_zod29.z.object({
|
|
1715
|
+
runId: import_zod29.z.string(),
|
|
1716
|
+
scenarioId: import_zod29.z.string(),
|
|
1717
|
+
log: import_zod29.z.object({
|
|
1718
|
+
level: import_zod29.z.enum(["info", "error", "debug"]),
|
|
1719
|
+
message: import_zod29.z.string().optional(),
|
|
1720
|
+
args: import_zod29.z.array(import_zod29.z.any()).optional(),
|
|
1721
|
+
error: import_zod29.z.string().optional()
|
|
1426
1722
|
})
|
|
1427
1723
|
});
|
|
1428
1724
|
var LLM_TIMEOUT = 12e4;
|
|
1429
1725
|
|
|
1430
1726
|
// src/evaluation/conversation.ts
|
|
1431
|
-
var
|
|
1432
|
-
var TextBlockSchema =
|
|
1433
|
-
type:
|
|
1434
|
-
text:
|
|
1435
|
-
});
|
|
1436
|
-
var ThinkingBlockSchema =
|
|
1437
|
-
type:
|
|
1438
|
-
thinking:
|
|
1439
|
-
});
|
|
1440
|
-
var ToolUseBlockSchema =
|
|
1441
|
-
type:
|
|
1442
|
-
toolName:
|
|
1443
|
-
toolId:
|
|
1444
|
-
input:
|
|
1445
|
-
});
|
|
1446
|
-
var ToolResultBlockSchema =
|
|
1447
|
-
type:
|
|
1448
|
-
toolUseId:
|
|
1449
|
-
content:
|
|
1450
|
-
isError:
|
|
1451
|
-
});
|
|
1452
|
-
var ConversationBlockSchema =
|
|
1727
|
+
var import_zod30 = require("zod");
|
|
1728
|
+
var TextBlockSchema = import_zod30.z.object({
|
|
1729
|
+
type: import_zod30.z.literal("text"),
|
|
1730
|
+
text: import_zod30.z.string()
|
|
1731
|
+
});
|
|
1732
|
+
var ThinkingBlockSchema = import_zod30.z.object({
|
|
1733
|
+
type: import_zod30.z.literal("thinking"),
|
|
1734
|
+
thinking: import_zod30.z.string()
|
|
1735
|
+
});
|
|
1736
|
+
var ToolUseBlockSchema = import_zod30.z.object({
|
|
1737
|
+
type: import_zod30.z.literal("tool_use"),
|
|
1738
|
+
toolName: import_zod30.z.string(),
|
|
1739
|
+
toolId: import_zod30.z.string(),
|
|
1740
|
+
input: import_zod30.z.unknown()
|
|
1741
|
+
});
|
|
1742
|
+
var ToolResultBlockSchema = import_zod30.z.object({
|
|
1743
|
+
type: import_zod30.z.literal("tool_result"),
|
|
1744
|
+
toolUseId: import_zod30.z.string(),
|
|
1745
|
+
content: import_zod30.z.string(),
|
|
1746
|
+
isError: import_zod30.z.boolean().optional()
|
|
1747
|
+
});
|
|
1748
|
+
var ConversationBlockSchema = import_zod30.z.discriminatedUnion("type", [
|
|
1453
1749
|
TextBlockSchema,
|
|
1454
1750
|
ThinkingBlockSchema,
|
|
1455
1751
|
ToolUseBlockSchema,
|
|
@@ -1460,18 +1756,18 @@ var ConversationMessageRoles = [
|
|
|
1460
1756
|
"user",
|
|
1461
1757
|
"system"
|
|
1462
1758
|
];
|
|
1463
|
-
var ConversationMessageSchema =
|
|
1464
|
-
role:
|
|
1465
|
-
content:
|
|
1466
|
-
timestamp:
|
|
1759
|
+
var ConversationMessageSchema = import_zod30.z.object({
|
|
1760
|
+
role: import_zod30.z.enum(ConversationMessageRoles),
|
|
1761
|
+
content: import_zod30.z.array(ConversationBlockSchema),
|
|
1762
|
+
timestamp: import_zod30.z.string()
|
|
1467
1763
|
});
|
|
1468
|
-
var ScenarioConversationSchema =
|
|
1469
|
-
id:
|
|
1470
|
-
projectId:
|
|
1471
|
-
evalRunId:
|
|
1472
|
-
resultId:
|
|
1473
|
-
messages:
|
|
1474
|
-
createdAt:
|
|
1764
|
+
var ScenarioConversationSchema = import_zod30.z.object({
|
|
1765
|
+
id: import_zod30.z.string(),
|
|
1766
|
+
projectId: import_zod30.z.string(),
|
|
1767
|
+
evalRunId: import_zod30.z.string(),
|
|
1768
|
+
resultId: import_zod30.z.string(),
|
|
1769
|
+
messages: import_zod30.z.array(ConversationMessageSchema),
|
|
1770
|
+
createdAt: import_zod30.z.string()
|
|
1475
1771
|
});
|
|
1476
1772
|
|
|
1477
1773
|
// src/evaluation/eval-result.ts
|
|
@@ -1482,98 +1778,98 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1482
1778
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1483
1779
|
return AssertionResultStatus2;
|
|
1484
1780
|
})(AssertionResultStatus || {});
|
|
1485
|
-
var AssertionResultSchema =
|
|
1486
|
-
id:
|
|
1487
|
-
assertionId:
|
|
1488
|
-
assertionType:
|
|
1489
|
-
assertionName:
|
|
1490
|
-
status:
|
|
1491
|
-
message:
|
|
1492
|
-
expected:
|
|
1493
|
-
actual:
|
|
1494
|
-
duration:
|
|
1495
|
-
details:
|
|
1496
|
-
llmTraceSteps:
|
|
1497
|
-
});
|
|
1498
|
-
var EvalRunResultSchema =
|
|
1499
|
-
id:
|
|
1500
|
-
targetId:
|
|
1501
|
-
targetName:
|
|
1781
|
+
var AssertionResultSchema = import_zod31.z.object({
|
|
1782
|
+
id: import_zod31.z.string(),
|
|
1783
|
+
assertionId: import_zod31.z.string(),
|
|
1784
|
+
assertionType: import_zod31.z.string(),
|
|
1785
|
+
assertionName: import_zod31.z.string(),
|
|
1786
|
+
status: import_zod31.z.enum(AssertionResultStatus),
|
|
1787
|
+
message: import_zod31.z.string().optional(),
|
|
1788
|
+
expected: import_zod31.z.string().optional(),
|
|
1789
|
+
actual: import_zod31.z.string().optional(),
|
|
1790
|
+
duration: import_zod31.z.number().optional(),
|
|
1791
|
+
details: import_zod31.z.record(import_zod31.z.string(), import_zod31.z.unknown()).optional(),
|
|
1792
|
+
llmTraceSteps: import_zod31.z.array(LLMTraceStepSchema).optional()
|
|
1793
|
+
});
|
|
1794
|
+
var EvalRunResultSchema = import_zod31.z.object({
|
|
1795
|
+
id: import_zod31.z.string(),
|
|
1796
|
+
targetId: import_zod31.z.string(),
|
|
1797
|
+
targetName: import_zod31.z.string().optional(),
|
|
1502
1798
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
1503
|
-
skillVersionId:
|
|
1799
|
+
skillVersionId: import_zod31.z.string().optional(),
|
|
1504
1800
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
1505
|
-
skillVersion:
|
|
1506
|
-
scenarioId:
|
|
1507
|
-
scenarioName:
|
|
1801
|
+
skillVersion: import_zod31.z.string().optional(),
|
|
1802
|
+
scenarioId: import_zod31.z.string(),
|
|
1803
|
+
scenarioName: import_zod31.z.string(),
|
|
1508
1804
|
/** Snapshot of the trigger prompt used during the run (prevents stale display after edits) */
|
|
1509
|
-
triggerPrompt:
|
|
1805
|
+
triggerPrompt: import_zod31.z.string().optional(),
|
|
1510
1806
|
modelConfig: ModelConfigSchema.optional(),
|
|
1511
|
-
assertionResults:
|
|
1807
|
+
assertionResults: import_zod31.z.array(AssertionResultSchema),
|
|
1512
1808
|
metrics: EvalMetricsSchema.optional(),
|
|
1513
|
-
passed:
|
|
1514
|
-
failed:
|
|
1515
|
-
passRate:
|
|
1516
|
-
duration:
|
|
1517
|
-
outputText:
|
|
1518
|
-
files:
|
|
1519
|
-
fileDiffs:
|
|
1809
|
+
passed: import_zod31.z.number(),
|
|
1810
|
+
failed: import_zod31.z.number(),
|
|
1811
|
+
passRate: import_zod31.z.number(),
|
|
1812
|
+
duration: import_zod31.z.number(),
|
|
1813
|
+
outputText: import_zod31.z.string().optional(),
|
|
1814
|
+
files: import_zod31.z.array(ExpectedFileSchema).optional(),
|
|
1815
|
+
fileDiffs: import_zod31.z.array(DiffContentSchema).optional(),
|
|
1520
1816
|
/** Full template files after execution with status indicators */
|
|
1521
|
-
templateFiles:
|
|
1522
|
-
startedAt:
|
|
1523
|
-
completedAt:
|
|
1817
|
+
templateFiles: import_zod31.z.array(TemplateFileSchema).optional(),
|
|
1818
|
+
startedAt: import_zod31.z.string().optional(),
|
|
1819
|
+
completedAt: import_zod31.z.string().optional(),
|
|
1524
1820
|
llmTrace: LLMTraceSchema.optional(),
|
|
1525
1821
|
/** Full conversation messages (only present in transit; stripped before DB storage) */
|
|
1526
|
-
conversation:
|
|
1822
|
+
conversation: import_zod31.z.array(ConversationMessageSchema).optional(),
|
|
1527
1823
|
/** 0-based iteration index when a scenario is run multiple times within a single eval run */
|
|
1528
|
-
iterationIndex:
|
|
1529
|
-
});
|
|
1530
|
-
var PromptResultSchema =
|
|
1531
|
-
text:
|
|
1532
|
-
files:
|
|
1533
|
-
finishReason:
|
|
1534
|
-
reasoning:
|
|
1535
|
-
reasoningDetails:
|
|
1536
|
-
toolCalls:
|
|
1537
|
-
toolResults:
|
|
1538
|
-
warnings:
|
|
1539
|
-
sources:
|
|
1540
|
-
steps:
|
|
1541
|
-
generationTimeMs:
|
|
1542
|
-
prompt:
|
|
1543
|
-
systemPrompt:
|
|
1544
|
-
usage:
|
|
1545
|
-
totalTokens:
|
|
1546
|
-
totalMicrocentsSpent:
|
|
1824
|
+
iterationIndex: import_zod31.z.number().int().min(0).optional()
|
|
1825
|
+
});
|
|
1826
|
+
var PromptResultSchema = import_zod31.z.object({
|
|
1827
|
+
text: import_zod31.z.string(),
|
|
1828
|
+
files: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1829
|
+
finishReason: import_zod31.z.string().optional(),
|
|
1830
|
+
reasoning: import_zod31.z.string().optional(),
|
|
1831
|
+
reasoningDetails: import_zod31.z.unknown().optional(),
|
|
1832
|
+
toolCalls: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1833
|
+
toolResults: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1834
|
+
warnings: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1835
|
+
sources: import_zod31.z.array(import_zod31.z.unknown()).optional(),
|
|
1836
|
+
steps: import_zod31.z.array(import_zod31.z.unknown()),
|
|
1837
|
+
generationTimeMs: import_zod31.z.number(),
|
|
1838
|
+
prompt: import_zod31.z.string(),
|
|
1839
|
+
systemPrompt: import_zod31.z.string(),
|
|
1840
|
+
usage: import_zod31.z.object({
|
|
1841
|
+
totalTokens: import_zod31.z.number().optional(),
|
|
1842
|
+
totalMicrocentsSpent: import_zod31.z.number().optional()
|
|
1547
1843
|
})
|
|
1548
1844
|
});
|
|
1549
|
-
var EvaluationResultSchema =
|
|
1550
|
-
id:
|
|
1551
|
-
runId:
|
|
1552
|
-
timestamp:
|
|
1845
|
+
var EvaluationResultSchema = import_zod31.z.object({
|
|
1846
|
+
id: import_zod31.z.string(),
|
|
1847
|
+
runId: import_zod31.z.string(),
|
|
1848
|
+
timestamp: import_zod31.z.number(),
|
|
1553
1849
|
promptResult: PromptResultSchema,
|
|
1554
|
-
testResults:
|
|
1555
|
-
tags:
|
|
1556
|
-
feedback:
|
|
1557
|
-
score:
|
|
1558
|
-
suiteId:
|
|
1559
|
-
});
|
|
1560
|
-
var LeanEvaluationResultSchema =
|
|
1561
|
-
id:
|
|
1562
|
-
runId:
|
|
1563
|
-
timestamp:
|
|
1564
|
-
tags:
|
|
1565
|
-
scenarioId:
|
|
1566
|
-
scenarioVersion:
|
|
1567
|
-
targetId:
|
|
1568
|
-
targetVersion:
|
|
1569
|
-
suiteId:
|
|
1570
|
-
score:
|
|
1571
|
-
time:
|
|
1572
|
-
microcentsSpent:
|
|
1850
|
+
testResults: import_zod31.z.array(import_zod31.z.unknown()),
|
|
1851
|
+
tags: import_zod31.z.array(import_zod31.z.string()).optional(),
|
|
1852
|
+
feedback: import_zod31.z.string().optional(),
|
|
1853
|
+
score: import_zod31.z.number(),
|
|
1854
|
+
suiteId: import_zod31.z.string().optional()
|
|
1855
|
+
});
|
|
1856
|
+
var LeanEvaluationResultSchema = import_zod31.z.object({
|
|
1857
|
+
id: import_zod31.z.string(),
|
|
1858
|
+
runId: import_zod31.z.string(),
|
|
1859
|
+
timestamp: import_zod31.z.number(),
|
|
1860
|
+
tags: import_zod31.z.array(import_zod31.z.string()).optional(),
|
|
1861
|
+
scenarioId: import_zod31.z.string(),
|
|
1862
|
+
scenarioVersion: import_zod31.z.number().optional(),
|
|
1863
|
+
targetId: import_zod31.z.string(),
|
|
1864
|
+
targetVersion: import_zod31.z.number().optional(),
|
|
1865
|
+
suiteId: import_zod31.z.string().optional(),
|
|
1866
|
+
score: import_zod31.z.number(),
|
|
1867
|
+
time: import_zod31.z.number().optional(),
|
|
1868
|
+
microcentsSpent: import_zod31.z.number().optional()
|
|
1573
1869
|
});
|
|
1574
1870
|
|
|
1575
1871
|
// src/evaluation/eval-run-folder.ts
|
|
1576
|
-
var
|
|
1872
|
+
var import_zod32 = require("zod");
|
|
1577
1873
|
var EvalRunFolderSchema = TenantEntitySchema.extend({});
|
|
1578
1874
|
var CreateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
1579
1875
|
id: true,
|
|
@@ -1587,26 +1883,26 @@ var UpdateEvalRunFolderInputSchema = EvalRunFolderSchema.omit({
|
|
|
1587
1883
|
updatedAt: true,
|
|
1588
1884
|
deleted: true
|
|
1589
1885
|
}).partial();
|
|
1590
|
-
var EvalRunFolderMembershipSchema =
|
|
1591
|
-
folderId:
|
|
1592
|
-
evalRunId:
|
|
1593
|
-
projectId:
|
|
1594
|
-
createdAt:
|
|
1886
|
+
var EvalRunFolderMembershipSchema = import_zod32.z.object({
|
|
1887
|
+
folderId: import_zod32.z.string(),
|
|
1888
|
+
evalRunId: import_zod32.z.string(),
|
|
1889
|
+
projectId: import_zod32.z.string(),
|
|
1890
|
+
createdAt: import_zod32.z.string()
|
|
1595
1891
|
});
|
|
1596
1892
|
|
|
1597
1893
|
// src/project/project.ts
|
|
1598
|
-
var
|
|
1894
|
+
var import_zod33 = require("zod");
|
|
1599
1895
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1600
|
-
appId:
|
|
1601
|
-
scenarioTags:
|
|
1896
|
+
appId: import_zod33.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
1897
|
+
scenarioTags: import_zod33.z.array(import_zod33.z.string()).optional().describe("Project-level tag vocabulary for scenarios"),
|
|
1602
1898
|
/** Per-project Wix auth token (write-only — never returned in GET responses). null = clear. */
|
|
1603
|
-
wixAuthToken:
|
|
1899
|
+
wixAuthToken: import_zod33.z.string().nullable().optional().describe("Wix auth token for CLI/MCP authentication (encrypted at rest)"),
|
|
1604
1900
|
/** Per-project Base44 auth file content (write-only — never returned in GET responses). null = clear. */
|
|
1605
|
-
base44AuthFile:
|
|
1901
|
+
base44AuthFile: import_zod33.z.string().nullable().optional().describe("Base64-encoded Base44 auth file content (encrypted at rest)"),
|
|
1606
1902
|
/** Resolved at runtime from the encrypted Wix auth token */
|
|
1607
|
-
wixAuthEmail:
|
|
1903
|
+
wixAuthEmail: import_zod33.z.string().optional().describe("Email associated with the Wix auth token (resolved at runtime)"),
|
|
1608
1904
|
/** Resolved at runtime from the encrypted Base44 auth file */
|
|
1609
|
-
base44AuthEmail:
|
|
1905
|
+
base44AuthEmail: import_zod33.z.string().optional().describe("Email from the Base44 auth file (resolved at runtime)")
|
|
1610
1906
|
});
|
|
1611
1907
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1612
1908
|
id: true,
|
|
@@ -1632,7 +1928,7 @@ var CreateTemplateInputSchema = TemplateSchema.omit({
|
|
|
1632
1928
|
var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
1633
1929
|
|
|
1634
1930
|
// src/schedule/eval-schedule.ts
|
|
1635
|
-
var
|
|
1931
|
+
var import_zod34 = require("zod");
|
|
1636
1932
|
var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
1637
1933
|
FrequencyType2["DAILY"] = "daily";
|
|
1638
1934
|
FrequencyType2["WEEKDAY"] = "weekday";
|
|
@@ -1642,29 +1938,29 @@ var FrequencyType = /* @__PURE__ */ ((FrequencyType2) => {
|
|
|
1642
1938
|
})(FrequencyType || {});
|
|
1643
1939
|
var EvalScheduleSchema = TenantEntitySchema.extend({
|
|
1644
1940
|
/** Whether the schedule is active */
|
|
1645
|
-
enabled:
|
|
1941
|
+
enabled: import_zod34.z.boolean(),
|
|
1646
1942
|
/** Test suite to run */
|
|
1647
|
-
suiteId:
|
|
1943
|
+
suiteId: import_zod34.z.string(),
|
|
1648
1944
|
/** Preset that provides agent + entities for this schedule */
|
|
1649
|
-
presetId:
|
|
1945
|
+
presetId: import_zod34.z.string(),
|
|
1650
1946
|
/** How often to run */
|
|
1651
|
-
frequencyType:
|
|
1947
|
+
frequencyType: import_zod34.z.nativeEnum(FrequencyType),
|
|
1652
1948
|
/** Time of day in 24h format (HH:MM), hours 00-23, minutes 00-59 */
|
|
1653
|
-
timeOfDay:
|
|
1949
|
+
timeOfDay: import_zod34.z.string().regex(/^([01]\d|2[0-3]):[0-5]\d$/),
|
|
1654
1950
|
/** Day of week (0=Sun, 6=Sat) for weekly schedules */
|
|
1655
|
-
dayOfWeek:
|
|
1951
|
+
dayOfWeek: import_zod34.z.number().min(0).max(6).optional(),
|
|
1656
1952
|
/** Day of month (1-31) for monthly schedules */
|
|
1657
|
-
dayOfMonth:
|
|
1953
|
+
dayOfMonth: import_zod34.z.number().min(1).max(31).optional(),
|
|
1658
1954
|
/** IANA timezone (e.g., 'America/New_York') */
|
|
1659
|
-
timezone:
|
|
1955
|
+
timezone: import_zod34.z.string(),
|
|
1660
1956
|
/** ID of the last eval run created by this schedule */
|
|
1661
|
-
lastRunId:
|
|
1957
|
+
lastRunId: import_zod34.z.string().optional(),
|
|
1662
1958
|
/** Denormalized status of the last run */
|
|
1663
|
-
lastRunStatus:
|
|
1959
|
+
lastRunStatus: import_zod34.z.string().optional(),
|
|
1664
1960
|
/** ISO timestamp of the last run */
|
|
1665
|
-
lastRunAt:
|
|
1961
|
+
lastRunAt: import_zod34.z.string().optional(),
|
|
1666
1962
|
/** Next scheduled run time in UTC (pre-computed for efficient querying, set by backend) */
|
|
1667
|
-
nextRunAt:
|
|
1963
|
+
nextRunAt: import_zod34.z.string().optional()
|
|
1668
1964
|
});
|
|
1669
1965
|
function isValidTimezone(tz) {
|
|
1670
1966
|
try {
|
|
@@ -1677,14 +1973,14 @@ function isValidTimezone(tz) {
|
|
|
1677
1973
|
function validateScheduleFields(data, ctx, options) {
|
|
1678
1974
|
if (data.frequencyType === "weekly" /* WEEKLY */ && data.dayOfWeek == null) {
|
|
1679
1975
|
ctx.addIssue({
|
|
1680
|
-
code:
|
|
1976
|
+
code: import_zod34.z.ZodIssueCode.custom,
|
|
1681
1977
|
message: "dayOfWeek is required for weekly schedules",
|
|
1682
1978
|
path: ["dayOfWeek"]
|
|
1683
1979
|
});
|
|
1684
1980
|
}
|
|
1685
1981
|
if (data.frequencyType === "monthly" /* MONTHLY */ && data.dayOfMonth == null) {
|
|
1686
1982
|
ctx.addIssue({
|
|
1687
|
-
code:
|
|
1983
|
+
code: import_zod34.z.ZodIssueCode.custom,
|
|
1688
1984
|
message: "dayOfMonth is required for monthly schedules",
|
|
1689
1985
|
path: ["dayOfMonth"]
|
|
1690
1986
|
});
|
|
@@ -1692,7 +1988,7 @@ function validateScheduleFields(data, ctx, options) {
|
|
|
1692
1988
|
const shouldValidateTz = options.partial ? data.timezone !== void 0 : true;
|
|
1693
1989
|
if (shouldValidateTz && !isValidTimezone(data.timezone)) {
|
|
1694
1990
|
ctx.addIssue({
|
|
1695
|
-
code:
|
|
1991
|
+
code: import_zod34.z.ZodIssueCode.custom,
|
|
1696
1992
|
message: "Invalid IANA timezone",
|
|
1697
1993
|
path: ["timezone"]
|
|
1698
1994
|
});
|
|
@@ -1715,229 +2011,10 @@ var CreateEvalScheduleInputSchema = BaseCreateScheduleSchema.superRefine((data,
|
|
|
1715
2011
|
var UpdateEvalScheduleInputSchema = BaseCreateScheduleSchema.partial().superRefine((data, ctx) => {
|
|
1716
2012
|
validateScheduleFields(data, ctx, { partial: true });
|
|
1717
2013
|
});
|
|
1718
|
-
|
|
1719
|
-
// src/assertion/system-assertions.ts
|
|
1720
|
-
var SYSTEM_ASSERTION_IDS = {
|
|
1721
|
-
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
1722
|
-
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
1723
|
-
BUILD_PASSED: "system:build_passed",
|
|
1724
|
-
TIME_LIMIT: "system:time_limit",
|
|
1725
|
-
COST: "system:cost",
|
|
1726
|
-
LLM_JUDGE: "system:llm_judge",
|
|
1727
|
-
API_CALL: "system:api_call"
|
|
1728
|
-
};
|
|
1729
|
-
function isSystemAssertionId(id) {
|
|
1730
|
-
return id.startsWith("system:");
|
|
1731
|
-
}
|
|
1732
|
-
var SYSTEM_ASSERTIONS = {
|
|
1733
|
-
[SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED]: {
|
|
1734
|
-
id: SYSTEM_ASSERTION_IDS.SKILL_WAS_CALLED,
|
|
1735
|
-
name: "Skill Was Called",
|
|
1736
|
-
description: "Check that one or more skills were invoked during the agent run",
|
|
1737
|
-
type: "skill_was_called",
|
|
1738
|
-
parameters: [
|
|
1739
|
-
{
|
|
1740
|
-
name: "skillNames",
|
|
1741
|
-
label: "Skills",
|
|
1742
|
-
type: "string",
|
|
1743
|
-
required: true
|
|
1744
|
-
},
|
|
1745
|
-
{
|
|
1746
|
-
name: "negate",
|
|
1747
|
-
label: "Negate (NOT operator)",
|
|
1748
|
-
type: "boolean",
|
|
1749
|
-
required: false,
|
|
1750
|
-
defaultValue: false
|
|
1751
|
-
}
|
|
1752
|
-
]
|
|
1753
|
-
},
|
|
1754
|
-
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
1755
|
-
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
1756
|
-
name: "Tool Called With Param",
|
|
1757
|
-
description: "Check that a tool was called with expected parameters (tool name is substring matched)",
|
|
1758
|
-
type: "tool_called_with_param",
|
|
1759
|
-
parameters: [
|
|
1760
|
-
{
|
|
1761
|
-
name: "toolName",
|
|
1762
|
-
label: "Tool Name",
|
|
1763
|
-
type: "string",
|
|
1764
|
-
required: true
|
|
1765
|
-
},
|
|
1766
|
-
{
|
|
1767
|
-
name: "expectedParams",
|
|
1768
|
-
label: "Expected Parameters (JSON, substring match)",
|
|
1769
|
-
type: "string",
|
|
1770
|
-
required: false
|
|
1771
|
-
},
|
|
1772
|
-
{
|
|
1773
|
-
name: "requireSuccess",
|
|
1774
|
-
label: "Require Successful Call",
|
|
1775
|
-
type: "boolean",
|
|
1776
|
-
required: false,
|
|
1777
|
-
defaultValue: false,
|
|
1778
|
-
advanced: true
|
|
1779
|
-
},
|
|
1780
|
-
{
|
|
1781
|
-
name: "negate",
|
|
1782
|
-
label: "Negate (NOT operator)",
|
|
1783
|
-
type: "boolean",
|
|
1784
|
-
required: false,
|
|
1785
|
-
defaultValue: false
|
|
1786
|
-
}
|
|
1787
|
-
]
|
|
1788
|
-
},
|
|
1789
|
-
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
1790
|
-
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
1791
|
-
name: "Build Passed",
|
|
1792
|
-
description: "Run a build command and verify it exits with expected code",
|
|
1793
|
-
type: "build_passed",
|
|
1794
|
-
parameters: [
|
|
1795
|
-
{
|
|
1796
|
-
name: "command",
|
|
1797
|
-
label: "Build Command",
|
|
1798
|
-
type: "string",
|
|
1799
|
-
required: false,
|
|
1800
|
-
defaultValue: "yarn build"
|
|
1801
|
-
},
|
|
1802
|
-
{
|
|
1803
|
-
name: "expectedExitCode",
|
|
1804
|
-
label: "Expected Exit Code",
|
|
1805
|
-
type: "number",
|
|
1806
|
-
required: false,
|
|
1807
|
-
defaultValue: 0
|
|
1808
|
-
},
|
|
1809
|
-
{
|
|
1810
|
-
name: "maxBuildTime",
|
|
1811
|
-
label: "Max Build Time (ms)",
|
|
1812
|
-
type: "number",
|
|
1813
|
-
required: false,
|
|
1814
|
-
advanced: true
|
|
1815
|
-
},
|
|
1816
|
-
{
|
|
1817
|
-
name: "maxMemory",
|
|
1818
|
-
label: "Max Memory (MB)",
|
|
1819
|
-
type: "number",
|
|
1820
|
-
required: false,
|
|
1821
|
-
advanced: true
|
|
1822
|
-
}
|
|
1823
|
-
]
|
|
1824
|
-
},
|
|
1825
|
-
[SYSTEM_ASSERTION_IDS.TIME_LIMIT]: {
|
|
1826
|
-
id: SYSTEM_ASSERTION_IDS.TIME_LIMIT,
|
|
1827
|
-
name: "Time Limit",
|
|
1828
|
-
description: "Check that the scenario completed within a maximum duration",
|
|
1829
|
-
type: "time_limit",
|
|
1830
|
-
parameters: [
|
|
1831
|
-
{
|
|
1832
|
-
name: "maxDurationMs",
|
|
1833
|
-
label: "Max Duration (ms)",
|
|
1834
|
-
type: "number",
|
|
1835
|
-
required: true,
|
|
1836
|
-
defaultValue: 3e5
|
|
1837
|
-
}
|
|
1838
|
-
]
|
|
1839
|
-
},
|
|
1840
|
-
[SYSTEM_ASSERTION_IDS.COST]: {
|
|
1841
|
-
id: SYSTEM_ASSERTION_IDS.COST,
|
|
1842
|
-
name: "Cost",
|
|
1843
|
-
description: "Check that the scenario LLM execution cost stays within a USD threshold",
|
|
1844
|
-
type: "cost",
|
|
1845
|
-
parameters: [
|
|
1846
|
-
{
|
|
1847
|
-
name: "maxCostUsd",
|
|
1848
|
-
label: "Max Cost (USD)",
|
|
1849
|
-
type: "number",
|
|
1850
|
-
required: true,
|
|
1851
|
-
defaultValue: 1
|
|
1852
|
-
}
|
|
1853
|
-
]
|
|
1854
|
-
},
|
|
1855
|
-
[SYSTEM_ASSERTION_IDS.LLM_JUDGE]: {
|
|
1856
|
-
id: SYSTEM_ASSERTION_IDS.LLM_JUDGE,
|
|
1857
|
-
name: "LLM Judge",
|
|
1858
|
-
description: "LLM evaluates the output and assigns a score (0-10)",
|
|
1859
|
-
type: "llm_judge",
|
|
1860
|
-
parameters: [
|
|
1861
|
-
{
|
|
1862
|
-
name: "prompt",
|
|
1863
|
-
label: "Judge Prompt",
|
|
1864
|
-
type: "string",
|
|
1865
|
-
required: true,
|
|
1866
|
-
defaultValue: "Verify the output meets the acceptance criteria."
|
|
1867
|
-
},
|
|
1868
|
-
{
|
|
1869
|
-
name: "minScore",
|
|
1870
|
-
label: "Minimum Score (0-10)",
|
|
1871
|
-
type: "number",
|
|
1872
|
-
required: false,
|
|
1873
|
-
defaultValue: 7
|
|
1874
|
-
},
|
|
1875
|
-
{
|
|
1876
|
-
name: "model",
|
|
1877
|
-
label: "Model",
|
|
1878
|
-
type: "string",
|
|
1879
|
-
required: false
|
|
1880
|
-
}
|
|
1881
|
-
]
|
|
1882
|
-
},
|
|
1883
|
-
[SYSTEM_ASSERTION_IDS.API_CALL]: {
|
|
1884
|
-
id: SYSTEM_ASSERTION_IDS.API_CALL,
|
|
1885
|
-
name: "API Call",
|
|
1886
|
-
description: "Call an API endpoint and verify the response contains expected data",
|
|
1887
|
-
type: "api_call",
|
|
1888
|
-
parameters: [
|
|
1889
|
-
{
|
|
1890
|
-
name: "url",
|
|
1891
|
-
label: "URL",
|
|
1892
|
-
type: "string",
|
|
1893
|
-
required: true
|
|
1894
|
-
},
|
|
1895
|
-
{
|
|
1896
|
-
name: "method",
|
|
1897
|
-
label: "HTTP Method",
|
|
1898
|
-
type: "string",
|
|
1899
|
-
required: false,
|
|
1900
|
-
defaultValue: "GET"
|
|
1901
|
-
},
|
|
1902
|
-
{
|
|
1903
|
-
name: "requestBody",
|
|
1904
|
-
label: "Request Body (JSON)",
|
|
1905
|
-
type: "string",
|
|
1906
|
-
required: false
|
|
1907
|
-
},
|
|
1908
|
-
{
|
|
1909
|
-
name: "expectedResponse",
|
|
1910
|
-
label: "Expected Response (JSON)",
|
|
1911
|
-
type: "string",
|
|
1912
|
-
required: true
|
|
1913
|
-
},
|
|
1914
|
-
{
|
|
1915
|
-
name: "requestHeaders",
|
|
1916
|
-
label: "Headers (JSON)",
|
|
1917
|
-
type: "string",
|
|
1918
|
-
required: false,
|
|
1919
|
-
advanced: true
|
|
1920
|
-
},
|
|
1921
|
-
{
|
|
1922
|
-
name: "timeoutMs",
|
|
1923
|
-
label: "Timeout (ms)",
|
|
1924
|
-
type: "number",
|
|
1925
|
-
required: false,
|
|
1926
|
-
defaultValue: 3e4,
|
|
1927
|
-
advanced: true
|
|
1928
|
-
}
|
|
1929
|
-
]
|
|
1930
|
-
}
|
|
1931
|
-
};
|
|
1932
|
-
function getSystemAssertions() {
|
|
1933
|
-
return Object.values(SYSTEM_ASSERTIONS);
|
|
1934
|
-
}
|
|
1935
|
-
function getSystemAssertion(id) {
|
|
1936
|
-
return SYSTEM_ASSERTIONS[id];
|
|
1937
|
-
}
|
|
1938
2014
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1939
2015
|
0 && (module.exports = {
|
|
1940
2016
|
AGENT_TYPE_LABELS,
|
|
2017
|
+
ALLOWED_BUILD_COMMANDS,
|
|
1941
2018
|
ALL_AVAILABLE_MODEL_IDS,
|
|
1942
2019
|
AVAILABLE_CLAUDE_MODEL_IDS,
|
|
1943
2020
|
AVAILABLE_OPENAI_MODEL_IDS,
|
|
@@ -1971,6 +2048,7 @@ function getSystemAssertion(id) {
|
|
|
1971
2048
|
BatchSummarySchema,
|
|
1972
2049
|
BuildCheckTestSchema,
|
|
1973
2050
|
BuildPassedAssertionSchema,
|
|
2051
|
+
BuildPassedCommandStringSchema,
|
|
1974
2052
|
BuildPassedConfigSchema,
|
|
1975
2053
|
BulkImportResultItemSchema,
|
|
1976
2054
|
BulkImportResultSchema,
|
|
@@ -1998,6 +2076,7 @@ function getSystemAssertion(id) {
|
|
|
1998
2076
|
CreateTemplateInputSchema,
|
|
1999
2077
|
CreateTestScenarioInputSchema,
|
|
2000
2078
|
CreateTestSuiteInputSchema,
|
|
2079
|
+
DEFAULT_BUILD_PASSED_COMMAND,
|
|
2001
2080
|
DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
2002
2081
|
DEFAULT_JUDGE_MODEL,
|
|
2003
2082
|
DiffContentSchema,
|
|
@@ -2115,11 +2194,14 @@ function getSystemAssertion(id) {
|
|
|
2115
2194
|
formatTraceEventLine,
|
|
2116
2195
|
getSystemAssertion,
|
|
2117
2196
|
getSystemAssertions,
|
|
2197
|
+
isAllowedBuildCommandString,
|
|
2118
2198
|
isSystemAssertionId,
|
|
2119
2199
|
isValidSkillFolderName,
|
|
2120
2200
|
normalizeBatchAssertionLink,
|
|
2121
2201
|
normalizeModelId,
|
|
2202
|
+
parseBuildCommandToArgv,
|
|
2122
2203
|
parseTraceEventLine,
|
|
2123
|
-
validateAssertionConfig
|
|
2204
|
+
validateAssertionConfig,
|
|
2205
|
+
validateBuildPassedParamsInAssertionLinks
|
|
2124
2206
|
});
|
|
2125
2207
|
//# sourceMappingURL=index.js.map
|