@wix/evalforge-types 0.37.0 → 0.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.js +496 -413
- package/build/index.js.map +4 -4
- package/build/index.mjs +489 -413
- package/build/index.mjs.map +4 -4
- package/build/types/agent/adapter.d.ts +3 -0
- package/build/types/assertion/assertion.d.ts +34 -0
- package/build/types/assertion/system-assertions.d.ts +1 -0
- package/build/types/common/index.d.ts +2 -0
- package/build/types/common/rule.d.ts +47 -0
- package/build/types/common/tool-names.d.ts +1 -0
- package/build/types/evaluation/eval-run.d.ts +2 -0
- package/build/types/scenario/assertions.d.ts +16 -0
- package/build/types/scenario/test-scenario.d.ts +12 -0
- package/package.json +2 -2
package/build/index.mjs
CHANGED
|
@@ -975,6 +975,33 @@ var ModelConfigSchema = z4.object({
|
|
|
975
975
|
maxTokens: z4.preprocess(nullToUndefined, z4.number().min(1).optional())
|
|
976
976
|
});
|
|
977
977
|
|
|
978
|
+
// src/common/rule.ts
|
|
979
|
+
import { z as z5 } from "zod";
|
|
980
|
+
var RuleTypeSchema = z5.enum(["claude-md", "agents-md", "cursor-rule"]);
|
|
981
|
+
var RuleSchema = TenantEntitySchema.extend({
|
|
982
|
+
ruleType: RuleTypeSchema,
|
|
983
|
+
content: z5.string()
|
|
984
|
+
});
|
|
985
|
+
var RuleInputBaseSchema = RuleSchema.omit({
|
|
986
|
+
id: true,
|
|
987
|
+
createdAt: true,
|
|
988
|
+
updatedAt: true,
|
|
989
|
+
deleted: true
|
|
990
|
+
});
|
|
991
|
+
var CreateRuleInputSchema = RuleInputBaseSchema;
|
|
992
|
+
var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
|
|
993
|
+
|
|
994
|
+
// src/common/tool-names.ts
|
|
995
|
+
var AVAILABLE_TOOL_NAMES = [
|
|
996
|
+
"Bash",
|
|
997
|
+
"Edit",
|
|
998
|
+
"Glob",
|
|
999
|
+
"Grep",
|
|
1000
|
+
"Read",
|
|
1001
|
+
"Skill",
|
|
1002
|
+
"Write"
|
|
1003
|
+
];
|
|
1004
|
+
|
|
978
1005
|
// src/target/target.ts
|
|
979
1006
|
var TargetSchema = TenantEntitySchema.extend({
|
|
980
1007
|
// Base for all testable entities
|
|
@@ -982,7 +1009,7 @@ var TargetSchema = TenantEntitySchema.extend({
|
|
|
982
1009
|
});
|
|
983
1010
|
|
|
984
1011
|
// src/target/agent.ts
|
|
985
|
-
import { z as
|
|
1012
|
+
import { z as z6 } from "zod";
|
|
986
1013
|
var AgentRunCommand = /* @__PURE__ */ ((AgentRunCommand2) => {
|
|
987
1014
|
AgentRunCommand2["CLAUDE"] = "claude";
|
|
988
1015
|
return AgentRunCommand2;
|
|
@@ -991,7 +1018,7 @@ var AVAILABLE_RUN_COMMANDS = Object.values(AgentRunCommand);
|
|
|
991
1018
|
var RUN_COMMAND_LABELS = {
|
|
992
1019
|
["claude" /* CLAUDE */]: "Claude Code"
|
|
993
1020
|
};
|
|
994
|
-
var AgentRunCommandSchema =
|
|
1021
|
+
var AgentRunCommandSchema = z6.nativeEnum(AgentRunCommand);
|
|
995
1022
|
var AgentSchema = TargetSchema.extend({
|
|
996
1023
|
/** Command to run the agent */
|
|
997
1024
|
runCommand: AgentRunCommandSchema,
|
|
@@ -1009,51 +1036,51 @@ var UpdateAgentInputSchema = CreateAgentInputSchema.partial().extend({
|
|
|
1009
1036
|
});
|
|
1010
1037
|
|
|
1011
1038
|
// src/target/skill.ts
|
|
1012
|
-
import { z as
|
|
1039
|
+
import { z as z7 } from "zod";
|
|
1013
1040
|
var SKILL_FOLDER_NAME_REGEX = /^[a-z0-9]+(-[a-z0-9]+)*$/;
|
|
1014
1041
|
var SEMVER_REGEX = /^\d+\.\d+\.\d+$/;
|
|
1015
|
-
var SkillVersionOriginSchema =
|
|
1042
|
+
var SkillVersionOriginSchema = z7.enum(["manual", "pr", "master"]);
|
|
1016
1043
|
function isValidSkillFolderName(name) {
|
|
1017
1044
|
return typeof name === "string" && name.length > 0 && SKILL_FOLDER_NAME_REGEX.test(name.trim());
|
|
1018
1045
|
}
|
|
1019
|
-
var SkillMetadataSchema =
|
|
1020
|
-
name:
|
|
1021
|
-
description:
|
|
1022
|
-
allowedTools:
|
|
1023
|
-
skills:
|
|
1046
|
+
var SkillMetadataSchema = z7.object({
|
|
1047
|
+
name: z7.string(),
|
|
1048
|
+
description: z7.string(),
|
|
1049
|
+
allowedTools: z7.array(z7.string()).optional(),
|
|
1050
|
+
skills: z7.array(z7.string()).optional()
|
|
1024
1051
|
});
|
|
1025
|
-
var SkillFileSchema =
|
|
1052
|
+
var SkillFileSchema = z7.object({
|
|
1026
1053
|
/** Relative path within the skill directory, e.g. "SKILL.md" or "references/API_SPEC.md" */
|
|
1027
|
-
path:
|
|
1054
|
+
path: z7.string().min(1),
|
|
1028
1055
|
/** File content (UTF-8 text) */
|
|
1029
|
-
content:
|
|
1056
|
+
content: z7.string()
|
|
1030
1057
|
});
|
|
1031
|
-
var SkillVersionSchema =
|
|
1032
|
-
id:
|
|
1033
|
-
projectId:
|
|
1034
|
-
skillId:
|
|
1058
|
+
var SkillVersionSchema = z7.object({
|
|
1059
|
+
id: z7.string(),
|
|
1060
|
+
projectId: z7.string(),
|
|
1061
|
+
skillId: z7.string(),
|
|
1035
1062
|
/** Semver string (e.g. "1.2.0") or Falcon fingerprint */
|
|
1036
|
-
version:
|
|
1063
|
+
version: z7.string(),
|
|
1037
1064
|
/** How this version was created */
|
|
1038
1065
|
origin: SkillVersionOriginSchema,
|
|
1039
1066
|
/** Where this snapshot was taken from */
|
|
1040
1067
|
source: GitHubSourceSchema.optional(),
|
|
1041
1068
|
/** Frozen snapshot of all files in the skill directory */
|
|
1042
|
-
files:
|
|
1069
|
+
files: z7.array(SkillFileSchema).optional(),
|
|
1043
1070
|
/** Optional notes about this version (changelog, reason for change) */
|
|
1044
|
-
notes:
|
|
1045
|
-
createdAt:
|
|
1071
|
+
notes: z7.string().optional(),
|
|
1072
|
+
createdAt: z7.string()
|
|
1046
1073
|
});
|
|
1047
|
-
var CreateSkillVersionInputSchema =
|
|
1074
|
+
var CreateSkillVersionInputSchema = z7.object({
|
|
1048
1075
|
/** GitHub source to snapshot from. If not provided, uses the Skill's source. */
|
|
1049
1076
|
source: GitHubSourceSchema.optional(),
|
|
1050
1077
|
/** Version string for this snapshot (e.g. "1.0.0", "1.0.3"). */
|
|
1051
|
-
version:
|
|
1052
|
-
notes:
|
|
1078
|
+
version: z7.string().min(1),
|
|
1079
|
+
notes: z7.string().optional(),
|
|
1053
1080
|
/** Origin of this version. Defaults to 'manual' in backend. */
|
|
1054
1081
|
origin: SkillVersionOriginSchema.optional(),
|
|
1055
1082
|
/** Pre-edited files to store directly (bypasses GitHub fetch when provided) */
|
|
1056
|
-
files:
|
|
1083
|
+
files: z7.array(SkillFileSchema).optional()
|
|
1057
1084
|
});
|
|
1058
1085
|
var SkillSchema = TargetSchema.extend({
|
|
1059
1086
|
/** GitHub source reference for live content fetching */
|
|
@@ -1069,15 +1096,15 @@ var SkillInputBaseSchema = SkillSchema.omit({
|
|
|
1069
1096
|
source: true
|
|
1070
1097
|
}).extend({
|
|
1071
1098
|
/** Optional - not stored on Skill; content description lives in SkillVersion */
|
|
1072
|
-
description:
|
|
1099
|
+
description: z7.string().optional(),
|
|
1073
1100
|
/** GitHub source reference for live content fetching */
|
|
1074
1101
|
source: GitHubSourceSchema.optional()
|
|
1075
1102
|
});
|
|
1076
|
-
var InitialVersionInputSchema =
|
|
1077
|
-
files:
|
|
1078
|
-
notes:
|
|
1103
|
+
var InitialVersionInputSchema = z7.object({
|
|
1104
|
+
files: z7.array(SkillFileSchema).optional(),
|
|
1105
|
+
notes: z7.string().optional(),
|
|
1079
1106
|
source: GitHubSourceSchema.optional(),
|
|
1080
|
-
version:
|
|
1107
|
+
version: z7.string().optional(),
|
|
1081
1108
|
origin: SkillVersionOriginSchema.optional()
|
|
1082
1109
|
});
|
|
1083
1110
|
var CreateSkillInputSchema = SkillInputBaseSchema.extend({
|
|
@@ -1095,10 +1122,10 @@ var SkillWithLatestVersionSchema = SkillSchema.extend({
|
|
|
1095
1122
|
});
|
|
1096
1123
|
|
|
1097
1124
|
// src/target/skills-group.ts
|
|
1098
|
-
import { z as
|
|
1125
|
+
import { z as z8 } from "zod";
|
|
1099
1126
|
var SkillsGroupSchema = TenantEntitySchema.extend({
|
|
1100
1127
|
/** IDs of skills in this group */
|
|
1101
|
-
skillIds:
|
|
1128
|
+
skillIds: z8.array(z8.string())
|
|
1102
1129
|
});
|
|
1103
1130
|
var CreateSkillsGroupInputSchema = SkillsGroupSchema.omit({
|
|
1104
1131
|
id: true,
|
|
@@ -1109,10 +1136,10 @@ var CreateSkillsGroupInputSchema = SkillsGroupSchema.omit({
|
|
|
1109
1136
|
var UpdateSkillsGroupInputSchema = CreateSkillsGroupInputSchema.partial();
|
|
1110
1137
|
|
|
1111
1138
|
// src/target/sub-agent.ts
|
|
1112
|
-
import { z as
|
|
1139
|
+
import { z as z9 } from "zod";
|
|
1113
1140
|
var SubAgentSchema = TargetSchema.extend({
|
|
1114
1141
|
/** The full sub-agent markdown content (YAML frontmatter + body) */
|
|
1115
|
-
subAgentMd:
|
|
1142
|
+
subAgentMd: z9.string()
|
|
1116
1143
|
});
|
|
1117
1144
|
var SubAgentInputBaseSchema = SubAgentSchema.omit({
|
|
1118
1145
|
id: true,
|
|
@@ -1124,10 +1151,10 @@ var CreateSubAgentInputSchema = SubAgentInputBaseSchema;
|
|
|
1124
1151
|
var UpdateSubAgentInputSchema = SubAgentInputBaseSchema.partial();
|
|
1125
1152
|
|
|
1126
1153
|
// src/test/index.ts
|
|
1127
|
-
import { z as
|
|
1154
|
+
import { z as z20 } from "zod";
|
|
1128
1155
|
|
|
1129
1156
|
// src/test/base.ts
|
|
1130
|
-
import { z as
|
|
1157
|
+
import { z as z10 } from "zod";
|
|
1131
1158
|
var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
1132
1159
|
TestType2["LLM"] = "LLM";
|
|
1133
1160
|
TestType2["TOOL"] = "TOOL";
|
|
@@ -1140,7 +1167,7 @@ var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
|
1140
1167
|
TestType2["PLAYWRIGHT_NL"] = "PLAYWRIGHT_NL";
|
|
1141
1168
|
return TestType2;
|
|
1142
1169
|
})(TestType || {});
|
|
1143
|
-
var TestTypeSchema =
|
|
1170
|
+
var TestTypeSchema = z10.enum(TestType);
|
|
1144
1171
|
var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
1145
1172
|
TestImportance2["LOW"] = "low";
|
|
1146
1173
|
TestImportance2["MEDIUM"] = "medium";
|
|
@@ -1148,153 +1175,153 @@ var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
|
1148
1175
|
TestImportance2["CRITICAL"] = "critical";
|
|
1149
1176
|
return TestImportance2;
|
|
1150
1177
|
})(TestImportance || {});
|
|
1151
|
-
var TestImportanceSchema =
|
|
1152
|
-
var BaseTestSchema =
|
|
1153
|
-
id:
|
|
1178
|
+
var TestImportanceSchema = z10.enum(TestImportance);
|
|
1179
|
+
var BaseTestSchema = z10.object({
|
|
1180
|
+
id: z10.string(),
|
|
1154
1181
|
type: TestTypeSchema,
|
|
1155
|
-
name:
|
|
1156
|
-
description:
|
|
1182
|
+
name: z10.string().min(3),
|
|
1183
|
+
description: z10.string().optional(),
|
|
1157
1184
|
importance: TestImportanceSchema.optional()
|
|
1158
1185
|
});
|
|
1159
1186
|
|
|
1160
1187
|
// src/test/llm.ts
|
|
1161
|
-
import { z as
|
|
1188
|
+
import { z as z11 } from "zod";
|
|
1162
1189
|
var LLMTestSchema = BaseTestSchema.extend({
|
|
1163
|
-
type:
|
|
1190
|
+
type: z11.literal("LLM" /* LLM */),
|
|
1164
1191
|
/** Maximum steps for the LLM to take */
|
|
1165
|
-
maxSteps:
|
|
1192
|
+
maxSteps: z11.number().min(1).max(100),
|
|
1166
1193
|
/** Prompt to send to the evaluator */
|
|
1167
|
-
prompt:
|
|
1194
|
+
prompt: z11.string().min(1),
|
|
1168
1195
|
/** ID of the evaluator agent to use */
|
|
1169
|
-
evaluatorId:
|
|
1196
|
+
evaluatorId: z11.string()
|
|
1170
1197
|
});
|
|
1171
1198
|
|
|
1172
1199
|
// src/test/tool.ts
|
|
1173
|
-
import { z as
|
|
1200
|
+
import { z as z12 } from "zod";
|
|
1174
1201
|
var ToolTestSchema = BaseTestSchema.extend({
|
|
1175
|
-
type:
|
|
1202
|
+
type: z12.literal("TOOL" /* TOOL */),
|
|
1176
1203
|
/** Name of the tool that should be called */
|
|
1177
|
-
toolName:
|
|
1204
|
+
toolName: z12.string().min(3),
|
|
1178
1205
|
/** Expected arguments for the tool call */
|
|
1179
|
-
args:
|
|
1206
|
+
args: z12.record(z12.string(), z12.any()),
|
|
1180
1207
|
/** Expected content in the tool results */
|
|
1181
|
-
resultsContent:
|
|
1208
|
+
resultsContent: z12.string()
|
|
1182
1209
|
});
|
|
1183
1210
|
|
|
1184
1211
|
// src/test/site-config.ts
|
|
1185
|
-
import { z as
|
|
1212
|
+
import { z as z13 } from "zod";
|
|
1186
1213
|
var SiteConfigTestSchema = BaseTestSchema.extend({
|
|
1187
|
-
type:
|
|
1214
|
+
type: z13.literal("SITE_CONFIG" /* SITE_CONFIG */),
|
|
1188
1215
|
/** URL to call */
|
|
1189
|
-
url:
|
|
1216
|
+
url: z13.string().url(),
|
|
1190
1217
|
/** HTTP method */
|
|
1191
|
-
method:
|
|
1218
|
+
method: z13.enum(["GET", "POST"]),
|
|
1192
1219
|
/** Request body (for POST) */
|
|
1193
|
-
body:
|
|
1220
|
+
body: z13.string().optional(),
|
|
1194
1221
|
/** Expected HTTP status code */
|
|
1195
|
-
expectedStatusCode:
|
|
1222
|
+
expectedStatusCode: z13.number().int().min(100).max(599),
|
|
1196
1223
|
/** Expected response content */
|
|
1197
|
-
expectedResponse:
|
|
1224
|
+
expectedResponse: z13.string().optional(),
|
|
1198
1225
|
/** JMESPath expression to extract from response */
|
|
1199
|
-
expectedResponseJMESPath:
|
|
1226
|
+
expectedResponseJMESPath: z13.string().optional()
|
|
1200
1227
|
});
|
|
1201
1228
|
|
|
1202
1229
|
// src/test/command-execution.ts
|
|
1203
|
-
import { z as
|
|
1230
|
+
import { z as z14 } from "zod";
|
|
1204
1231
|
var AllowedCommands = [
|
|
1205
1232
|
"yarn install --no-immutable && yarn build",
|
|
1206
1233
|
"npm run build",
|
|
1207
1234
|
"yarn typecheck"
|
|
1208
1235
|
];
|
|
1209
1236
|
var CommandExecutionTestSchema = BaseTestSchema.extend({
|
|
1210
|
-
type:
|
|
1237
|
+
type: z14.literal("COMMAND_EXECUTION" /* COMMAND_EXECUTION */),
|
|
1211
1238
|
/** Command to execute (must be in AllowedCommands) */
|
|
1212
|
-
command:
|
|
1239
|
+
command: z14.string().refine((value) => AllowedCommands.includes(value), {
|
|
1213
1240
|
message: `Command must be one of: ${AllowedCommands.join(", ")}`
|
|
1214
1241
|
}),
|
|
1215
1242
|
/** Expected exit code (default: 0) */
|
|
1216
|
-
expectedExitCode:
|
|
1243
|
+
expectedExitCode: z14.number().default(0).optional()
|
|
1217
1244
|
});
|
|
1218
1245
|
|
|
1219
1246
|
// src/test/file-presence.ts
|
|
1220
|
-
import { z as
|
|
1247
|
+
import { z as z15 } from "zod";
|
|
1221
1248
|
var FilePresenceTestSchema = BaseTestSchema.extend({
|
|
1222
|
-
type:
|
|
1249
|
+
type: z15.literal("FILE_PRESENCE" /* FILE_PRESENCE */),
|
|
1223
1250
|
/** Paths to check */
|
|
1224
|
-
paths:
|
|
1251
|
+
paths: z15.array(z15.string()),
|
|
1225
1252
|
/** Whether files should exist (true) or not exist (false) */
|
|
1226
|
-
shouldExist:
|
|
1253
|
+
shouldExist: z15.boolean()
|
|
1227
1254
|
});
|
|
1228
1255
|
|
|
1229
1256
|
// src/test/file-content.ts
|
|
1230
|
-
import { z as
|
|
1231
|
-
var FileContentCheckSchema =
|
|
1257
|
+
import { z as z16 } from "zod";
|
|
1258
|
+
var FileContentCheckSchema = z16.object({
|
|
1232
1259
|
/** Strings that must be present in the file */
|
|
1233
|
-
contains:
|
|
1260
|
+
contains: z16.array(z16.string()).optional(),
|
|
1234
1261
|
/** Strings that must NOT be present in the file */
|
|
1235
|
-
notContains:
|
|
1262
|
+
notContains: z16.array(z16.string()).optional(),
|
|
1236
1263
|
/** Regex pattern the content must match */
|
|
1237
|
-
matches:
|
|
1264
|
+
matches: z16.string().optional(),
|
|
1238
1265
|
/** JSON path checks for structured content */
|
|
1239
|
-
jsonPath:
|
|
1240
|
-
|
|
1241
|
-
path:
|
|
1242
|
-
value:
|
|
1266
|
+
jsonPath: z16.array(
|
|
1267
|
+
z16.object({
|
|
1268
|
+
path: z16.string(),
|
|
1269
|
+
value: z16.unknown()
|
|
1243
1270
|
})
|
|
1244
1271
|
).optional(),
|
|
1245
1272
|
/** Lines that should be added (for diff checking) */
|
|
1246
|
-
added:
|
|
1273
|
+
added: z16.array(z16.string()).optional(),
|
|
1247
1274
|
/** Lines that should be removed (for diff checking) */
|
|
1248
|
-
removed:
|
|
1275
|
+
removed: z16.array(z16.string()).optional()
|
|
1249
1276
|
});
|
|
1250
1277
|
var FileContentTestSchema = BaseTestSchema.extend({
|
|
1251
|
-
type:
|
|
1278
|
+
type: z16.literal("FILE_CONTENT" /* FILE_CONTENT */),
|
|
1252
1279
|
/** Path to the file to check */
|
|
1253
|
-
path:
|
|
1280
|
+
path: z16.string(),
|
|
1254
1281
|
/** Content checks to perform */
|
|
1255
1282
|
checks: FileContentCheckSchema
|
|
1256
1283
|
});
|
|
1257
1284
|
|
|
1258
1285
|
// src/test/build-check.ts
|
|
1259
|
-
import { z as
|
|
1286
|
+
import { z as z17 } from "zod";
|
|
1260
1287
|
var BuildCheckTestSchema = BaseTestSchema.extend({
|
|
1261
|
-
type:
|
|
1288
|
+
type: z17.literal("BUILD_CHECK" /* BUILD_CHECK */),
|
|
1262
1289
|
/** Build command to execute */
|
|
1263
|
-
command:
|
|
1290
|
+
command: z17.string(),
|
|
1264
1291
|
/** Whether the build should succeed */
|
|
1265
|
-
expectSuccess:
|
|
1292
|
+
expectSuccess: z17.boolean(),
|
|
1266
1293
|
/** Maximum allowed warnings (optional) */
|
|
1267
|
-
allowedWarnings:
|
|
1294
|
+
allowedWarnings: z17.number().optional(),
|
|
1268
1295
|
/** Timeout in milliseconds */
|
|
1269
|
-
timeout:
|
|
1296
|
+
timeout: z17.number().optional()
|
|
1270
1297
|
});
|
|
1271
1298
|
|
|
1272
1299
|
// src/test/vitest.ts
|
|
1273
|
-
import { z as
|
|
1300
|
+
import { z as z18 } from "zod";
|
|
1274
1301
|
var VitestTestSchema = BaseTestSchema.extend({
|
|
1275
|
-
type:
|
|
1302
|
+
type: z18.literal("VITEST" /* VITEST */),
|
|
1276
1303
|
/** Test file content */
|
|
1277
|
-
testFile:
|
|
1304
|
+
testFile: z18.string(),
|
|
1278
1305
|
/** Name of the test file */
|
|
1279
|
-
testFileName:
|
|
1306
|
+
testFileName: z18.string(),
|
|
1280
1307
|
/** Minimum pass rate required (0-100) */
|
|
1281
|
-
minPassRate:
|
|
1308
|
+
minPassRate: z18.number().min(0).max(100)
|
|
1282
1309
|
});
|
|
1283
1310
|
|
|
1284
1311
|
// src/test/playwright-nl.ts
|
|
1285
|
-
import { z as
|
|
1312
|
+
import { z as z19 } from "zod";
|
|
1286
1313
|
var PlaywrightNLTestSchema = BaseTestSchema.extend({
|
|
1287
|
-
type:
|
|
1314
|
+
type: z19.literal("PLAYWRIGHT_NL" /* PLAYWRIGHT_NL */),
|
|
1288
1315
|
/** Natural language steps to execute */
|
|
1289
|
-
steps:
|
|
1316
|
+
steps: z19.array(z19.string()),
|
|
1290
1317
|
/** Expected outcome description */
|
|
1291
|
-
expectedOutcome:
|
|
1318
|
+
expectedOutcome: z19.string(),
|
|
1292
1319
|
/** Timeout in milliseconds */
|
|
1293
|
-
timeout:
|
|
1320
|
+
timeout: z19.number().optional()
|
|
1294
1321
|
});
|
|
1295
1322
|
|
|
1296
1323
|
// src/test/index.ts
|
|
1297
|
-
var TestSchema =
|
|
1324
|
+
var TestSchema = z20.discriminatedUnion("type", [
|
|
1298
1325
|
LLMTestSchema,
|
|
1299
1326
|
ToolTestSchema,
|
|
1300
1327
|
SiteConfigTestSchema,
|
|
@@ -1307,44 +1334,52 @@ var TestSchema = z19.discriminatedUnion("type", [
|
|
|
1307
1334
|
]);
|
|
1308
1335
|
|
|
1309
1336
|
// src/scenario/assertions.ts
|
|
1310
|
-
import { z as
|
|
1311
|
-
var SkillWasCalledAssertionSchema =
|
|
1312
|
-
type:
|
|
1337
|
+
import { z as z21 } from "zod";
|
|
1338
|
+
var SkillWasCalledAssertionSchema = z21.object({
|
|
1339
|
+
type: z21.literal("skill_was_called"),
|
|
1313
1340
|
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
1314
|
-
skillNames:
|
|
1341
|
+
skillNames: z21.array(z21.string().min(1)).min(1)
|
|
1315
1342
|
});
|
|
1316
|
-
var
|
|
1317
|
-
type:
|
|
1343
|
+
var ToolCalledWithParamAssertionSchema = z21.object({
|
|
1344
|
+
type: z21.literal("tool_called_with_param"),
|
|
1345
|
+
/** Name of the tool that must have been called */
|
|
1346
|
+
toolName: z21.string().min(1),
|
|
1347
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1348
|
+
expectedParams: z21.string().min(1)
|
|
1349
|
+
});
|
|
1350
|
+
var BuildPassedAssertionSchema = z21.object({
|
|
1351
|
+
type: z21.literal("build_passed"),
|
|
1318
1352
|
/** Command to run (default: "yarn build") */
|
|
1319
|
-
command:
|
|
1353
|
+
command: z21.string().optional(),
|
|
1320
1354
|
/** Expected exit code (default: 0) */
|
|
1321
|
-
expectedExitCode:
|
|
1355
|
+
expectedExitCode: z21.number().int().optional()
|
|
1322
1356
|
});
|
|
1323
|
-
var CostAssertionSchema =
|
|
1324
|
-
type:
|
|
1357
|
+
var CostAssertionSchema = z21.object({
|
|
1358
|
+
type: z21.literal("cost"),
|
|
1325
1359
|
/** Maximum allowed cost in USD */
|
|
1326
|
-
maxCostUsd:
|
|
1360
|
+
maxCostUsd: z21.number().positive()
|
|
1327
1361
|
});
|
|
1328
|
-
var LlmJudgeAssertionSchema =
|
|
1329
|
-
type:
|
|
1362
|
+
var LlmJudgeAssertionSchema = z21.object({
|
|
1363
|
+
type: z21.literal("llm_judge"),
|
|
1330
1364
|
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1331
|
-
prompt:
|
|
1365
|
+
prompt: z21.string(),
|
|
1332
1366
|
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
1333
|
-
systemPrompt:
|
|
1367
|
+
systemPrompt: z21.string().optional(),
|
|
1334
1368
|
/** Minimum score to pass (0-100, default 70) */
|
|
1335
|
-
minScore:
|
|
1369
|
+
minScore: z21.number().int().min(0).max(100).optional(),
|
|
1336
1370
|
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
1337
|
-
model:
|
|
1338
|
-
maxTokens:
|
|
1339
|
-
temperature:
|
|
1371
|
+
model: z21.string().optional(),
|
|
1372
|
+
maxTokens: z21.number().int().optional(),
|
|
1373
|
+
temperature: z21.number().min(0).max(1).optional()
|
|
1340
1374
|
});
|
|
1341
|
-
var TimeAssertionSchema =
|
|
1342
|
-
type:
|
|
1375
|
+
var TimeAssertionSchema = z21.object({
|
|
1376
|
+
type: z21.literal("time_limit"),
|
|
1343
1377
|
/** Maximum allowed duration in milliseconds */
|
|
1344
|
-
maxDurationMs:
|
|
1378
|
+
maxDurationMs: z21.number().int().positive()
|
|
1345
1379
|
});
|
|
1346
|
-
var AssertionSchema =
|
|
1380
|
+
var AssertionSchema = z21.union([
|
|
1347
1381
|
SkillWasCalledAssertionSchema,
|
|
1382
|
+
ToolCalledWithParamAssertionSchema,
|
|
1348
1383
|
BuildPassedAssertionSchema,
|
|
1349
1384
|
TimeAssertionSchema,
|
|
1350
1385
|
CostAssertionSchema,
|
|
@@ -1352,33 +1387,33 @@ var AssertionSchema = z20.union([
|
|
|
1352
1387
|
]);
|
|
1353
1388
|
|
|
1354
1389
|
// src/scenario/environment.ts
|
|
1355
|
-
import { z as
|
|
1356
|
-
var LocalProjectConfigSchema =
|
|
1390
|
+
import { z as z22 } from "zod";
|
|
1391
|
+
var LocalProjectConfigSchema = z22.object({
|
|
1357
1392
|
/** Template ID to use for the local project */
|
|
1358
|
-
templateId:
|
|
1393
|
+
templateId: z22.string().optional(),
|
|
1359
1394
|
/** Files to create in the project */
|
|
1360
|
-
files:
|
|
1361
|
-
|
|
1362
|
-
path:
|
|
1363
|
-
content:
|
|
1395
|
+
files: z22.array(
|
|
1396
|
+
z22.object({
|
|
1397
|
+
path: z22.string().min(1),
|
|
1398
|
+
content: z22.string().min(1)
|
|
1364
1399
|
})
|
|
1365
1400
|
).optional()
|
|
1366
1401
|
});
|
|
1367
|
-
var MetaSiteConfigSchema =
|
|
1368
|
-
configurations:
|
|
1369
|
-
|
|
1370
|
-
name:
|
|
1371
|
-
apiCalls:
|
|
1372
|
-
|
|
1373
|
-
url:
|
|
1374
|
-
method:
|
|
1375
|
-
body:
|
|
1402
|
+
var MetaSiteConfigSchema = z22.object({
|
|
1403
|
+
configurations: z22.array(
|
|
1404
|
+
z22.object({
|
|
1405
|
+
name: z22.string().min(1),
|
|
1406
|
+
apiCalls: z22.array(
|
|
1407
|
+
z22.object({
|
|
1408
|
+
url: z22.string().url(),
|
|
1409
|
+
method: z22.enum(["POST", "PUT"]),
|
|
1410
|
+
body: z22.string()
|
|
1376
1411
|
})
|
|
1377
1412
|
)
|
|
1378
1413
|
})
|
|
1379
1414
|
).optional()
|
|
1380
1415
|
});
|
|
1381
|
-
var EnvironmentSchema =
|
|
1416
|
+
var EnvironmentSchema = z22.object({
|
|
1382
1417
|
/** Local project configuration */
|
|
1383
1418
|
localProject: LocalProjectConfigSchema.optional(),
|
|
1384
1419
|
/** Meta site configuration */
|
|
@@ -1386,64 +1421,71 @@ var EnvironmentSchema = z21.object({
|
|
|
1386
1421
|
});
|
|
1387
1422
|
|
|
1388
1423
|
// src/scenario/test-scenario.ts
|
|
1389
|
-
import { z as
|
|
1424
|
+
import { z as z24 } from "zod";
|
|
1390
1425
|
|
|
1391
1426
|
// src/assertion/assertion.ts
|
|
1392
|
-
import { z as
|
|
1393
|
-
var AssertionTypeSchema =
|
|
1427
|
+
import { z as z23 } from "zod";
|
|
1428
|
+
var AssertionTypeSchema = z23.enum([
|
|
1394
1429
|
"skill_was_called",
|
|
1430
|
+
"tool_called_with_param",
|
|
1395
1431
|
"build_passed",
|
|
1396
1432
|
"time_limit",
|
|
1397
1433
|
"cost",
|
|
1398
1434
|
"llm_judge"
|
|
1399
1435
|
]);
|
|
1400
|
-
var AssertionParameterTypeSchema =
|
|
1436
|
+
var AssertionParameterTypeSchema = z23.enum([
|
|
1401
1437
|
"string",
|
|
1402
1438
|
"number",
|
|
1403
1439
|
"boolean"
|
|
1404
1440
|
]);
|
|
1405
|
-
var AssertionParameterSchema =
|
|
1441
|
+
var AssertionParameterSchema = z23.object({
|
|
1406
1442
|
/** Parameter name (used as key in params object) */
|
|
1407
|
-
name:
|
|
1443
|
+
name: z23.string().min(1),
|
|
1408
1444
|
/** Display label for the parameter */
|
|
1409
|
-
label:
|
|
1445
|
+
label: z23.string().min(1),
|
|
1410
1446
|
/** Parameter type */
|
|
1411
1447
|
type: AssertionParameterTypeSchema,
|
|
1412
1448
|
/** Whether this parameter is required */
|
|
1413
|
-
required:
|
|
1449
|
+
required: z23.boolean(),
|
|
1414
1450
|
/** Default value (optional, used when not provided) */
|
|
1415
|
-
defaultValue:
|
|
1451
|
+
defaultValue: z23.union([z23.string(), z23.number(), z23.boolean()]).optional(),
|
|
1416
1452
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
1417
|
-
advanced:
|
|
1453
|
+
advanced: z23.boolean().optional()
|
|
1418
1454
|
});
|
|
1419
|
-
var ScenarioAssertionLinkSchema =
|
|
1455
|
+
var ScenarioAssertionLinkSchema = z23.object({
|
|
1420
1456
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
1421
|
-
assertionId:
|
|
1457
|
+
assertionId: z23.string(),
|
|
1422
1458
|
/** Parameter values for this assertion in this scenario */
|
|
1423
|
-
params:
|
|
1424
|
-
|
|
1425
|
-
|
|
1459
|
+
params: z23.record(
|
|
1460
|
+
z23.string(),
|
|
1461
|
+
z23.union([z23.string(), z23.number(), z23.boolean(), z23.null()])
|
|
1426
1462
|
).optional()
|
|
1427
1463
|
});
|
|
1428
|
-
var SkillWasCalledConfigSchema =
|
|
1464
|
+
var SkillWasCalledConfigSchema = z23.object({
|
|
1429
1465
|
/** Names of the skills that must have been called */
|
|
1430
|
-
skillNames:
|
|
1466
|
+
skillNames: z23.array(z23.string().min(1)).min(1)
|
|
1431
1467
|
});
|
|
1432
|
-
var CostConfigSchema =
|
|
1468
|
+
var CostConfigSchema = z23.strictObject({
|
|
1433
1469
|
/** Maximum allowed cost in USD */
|
|
1434
|
-
maxCostUsd:
|
|
1470
|
+
maxCostUsd: z23.number().positive()
|
|
1471
|
+
});
|
|
1472
|
+
var ToolCalledWithParamConfigSchema = z23.strictObject({
|
|
1473
|
+
/** Name of the tool that must have been called */
|
|
1474
|
+
toolName: z23.string().min(1),
|
|
1475
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1476
|
+
expectedParams: z23.string().min(1)
|
|
1435
1477
|
});
|
|
1436
|
-
var BuildPassedConfigSchema =
|
|
1478
|
+
var BuildPassedConfigSchema = z23.strictObject({
|
|
1437
1479
|
/** Command to run (default: "yarn build") */
|
|
1438
|
-
command:
|
|
1480
|
+
command: z23.string().optional(),
|
|
1439
1481
|
/** Expected exit code (default: 0) */
|
|
1440
|
-
expectedExitCode:
|
|
1482
|
+
expectedExitCode: z23.number().int().optional()
|
|
1441
1483
|
});
|
|
1442
|
-
var TimeConfigSchema =
|
|
1484
|
+
var TimeConfigSchema = z23.strictObject({
|
|
1443
1485
|
/** Maximum allowed duration in milliseconds */
|
|
1444
|
-
maxDurationMs:
|
|
1486
|
+
maxDurationMs: z23.number().int().positive()
|
|
1445
1487
|
});
|
|
1446
|
-
var LlmJudgeConfigSchema =
|
|
1488
|
+
var LlmJudgeConfigSchema = z23.object({
|
|
1447
1489
|
/**
|
|
1448
1490
|
* Prompt template with placeholders:
|
|
1449
1491
|
* - {{output}}: agent's final output
|
|
@@ -1454,32 +1496,34 @@ var LlmJudgeConfigSchema = z22.object({
|
|
|
1454
1496
|
* - {{trace}}: step-by-step trace of tool calls
|
|
1455
1497
|
* - Custom parameters defined in the parameters array
|
|
1456
1498
|
*/
|
|
1457
|
-
prompt:
|
|
1499
|
+
prompt: z23.string().min(1),
|
|
1458
1500
|
/** Optional system prompt for the judge */
|
|
1459
|
-
systemPrompt:
|
|
1501
|
+
systemPrompt: z23.string().optional(),
|
|
1460
1502
|
/** Minimum score to pass (0-100, default 70) */
|
|
1461
|
-
minScore:
|
|
1503
|
+
minScore: z23.number().int().min(0).max(100).optional(),
|
|
1462
1504
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1463
|
-
model:
|
|
1505
|
+
model: z23.string().optional(),
|
|
1464
1506
|
/** Max output tokens */
|
|
1465
|
-
maxTokens:
|
|
1507
|
+
maxTokens: z23.number().int().optional(),
|
|
1466
1508
|
/** Temperature (0-1) */
|
|
1467
|
-
temperature:
|
|
1509
|
+
temperature: z23.number().min(0).max(1).optional(),
|
|
1468
1510
|
/** User-defined parameters for this assertion */
|
|
1469
|
-
parameters:
|
|
1511
|
+
parameters: z23.array(AssertionParameterSchema).optional()
|
|
1470
1512
|
});
|
|
1471
|
-
var AssertionConfigSchema =
|
|
1513
|
+
var AssertionConfigSchema = z23.union([
|
|
1472
1514
|
LlmJudgeConfigSchema,
|
|
1473
1515
|
// requires prompt - check first
|
|
1474
1516
|
SkillWasCalledConfigSchema,
|
|
1475
1517
|
// requires skillNames
|
|
1518
|
+
ToolCalledWithParamConfigSchema,
|
|
1519
|
+
// requires toolName + expectedParams, uses strictObject
|
|
1476
1520
|
TimeConfigSchema,
|
|
1477
1521
|
// requires maxDurationMs, uses strictObject
|
|
1478
1522
|
CostConfigSchema,
|
|
1479
1523
|
// requires maxCostUsd, uses strictObject
|
|
1480
1524
|
BuildPassedConfigSchema,
|
|
1481
1525
|
// all optional, uses strictObject to reject unknown keys
|
|
1482
|
-
|
|
1526
|
+
z23.object({})
|
|
1483
1527
|
// fallback empty config
|
|
1484
1528
|
]);
|
|
1485
1529
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -1501,6 +1545,8 @@ function validateAssertionConfig(type, config) {
|
|
|
1501
1545
|
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
1502
1546
|
case "cost":
|
|
1503
1547
|
return CostConfigSchema.safeParse(config).success;
|
|
1548
|
+
case "tool_called_with_param":
|
|
1549
|
+
return ToolCalledWithParamConfigSchema.safeParse(config).success;
|
|
1504
1550
|
case "build_passed":
|
|
1505
1551
|
return BuildPassedConfigSchema.safeParse(config).success;
|
|
1506
1552
|
case "time_limit":
|
|
@@ -1528,23 +1574,23 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1528
1574
|
}
|
|
1529
1575
|
|
|
1530
1576
|
// src/scenario/test-scenario.ts
|
|
1531
|
-
var ExpectedFileSchema =
|
|
1577
|
+
var ExpectedFileSchema = z24.object({
|
|
1532
1578
|
/** Relative path where the file should be created */
|
|
1533
|
-
path:
|
|
1579
|
+
path: z24.string(),
|
|
1534
1580
|
/** Optional expected content */
|
|
1535
|
-
content:
|
|
1581
|
+
content: z24.string().optional()
|
|
1536
1582
|
});
|
|
1537
1583
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
1538
1584
|
/** The prompt sent to the agent to trigger the task */
|
|
1539
|
-
triggerPrompt:
|
|
1585
|
+
triggerPrompt: z24.string().min(10),
|
|
1540
1586
|
/** ID of the template to use for this scenario (null = no template) */
|
|
1541
|
-
templateId:
|
|
1587
|
+
templateId: z24.string().nullish(),
|
|
1542
1588
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
1543
|
-
assertions:
|
|
1589
|
+
assertions: z24.array(AssertionSchema).optional(),
|
|
1544
1590
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
1545
|
-
assertionIds:
|
|
1591
|
+
assertionIds: z24.array(z24.string()).optional(),
|
|
1546
1592
|
/** Linked assertions with per-scenario parameter values */
|
|
1547
|
-
assertionLinks:
|
|
1593
|
+
assertionLinks: z24.array(ScenarioAssertionLinkSchema).optional()
|
|
1548
1594
|
});
|
|
1549
1595
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
1550
1596
|
id: true,
|
|
@@ -1555,10 +1601,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
1555
1601
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
1556
1602
|
|
|
1557
1603
|
// src/suite/test-suite.ts
|
|
1558
|
-
import { z as
|
|
1604
|
+
import { z as z25 } from "zod";
|
|
1559
1605
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1560
1606
|
/** IDs of test scenarios in this suite */
|
|
1561
|
-
scenarioIds:
|
|
1607
|
+
scenarioIds: z25.array(z25.string())
|
|
1562
1608
|
});
|
|
1563
1609
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1564
1610
|
id: true,
|
|
@@ -1569,21 +1615,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1569
1615
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1570
1616
|
|
|
1571
1617
|
// src/evaluation/metrics.ts
|
|
1572
|
-
import { z as
|
|
1573
|
-
var TokenUsageSchema =
|
|
1574
|
-
prompt:
|
|
1575
|
-
completion:
|
|
1576
|
-
total:
|
|
1577
|
-
});
|
|
1578
|
-
var EvalMetricsSchema =
|
|
1579
|
-
totalAssertions:
|
|
1580
|
-
passed:
|
|
1581
|
-
failed:
|
|
1582
|
-
skipped:
|
|
1583
|
-
errors:
|
|
1584
|
-
passRate:
|
|
1585
|
-
avgDuration:
|
|
1586
|
-
totalDuration:
|
|
1618
|
+
import { z as z26 } from "zod";
|
|
1619
|
+
var TokenUsageSchema = z26.object({
|
|
1620
|
+
prompt: z26.number(),
|
|
1621
|
+
completion: z26.number(),
|
|
1622
|
+
total: z26.number()
|
|
1623
|
+
});
|
|
1624
|
+
var EvalMetricsSchema = z26.object({
|
|
1625
|
+
totalAssertions: z26.number(),
|
|
1626
|
+
passed: z26.number(),
|
|
1627
|
+
failed: z26.number(),
|
|
1628
|
+
skipped: z26.number(),
|
|
1629
|
+
errors: z26.number(),
|
|
1630
|
+
passRate: z26.number(),
|
|
1631
|
+
avgDuration: z26.number(),
|
|
1632
|
+
totalDuration: z26.number()
|
|
1587
1633
|
});
|
|
1588
1634
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1589
1635
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1593,7 +1639,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1593
1639
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1594
1640
|
return EvalStatus2;
|
|
1595
1641
|
})(EvalStatus || {});
|
|
1596
|
-
var EvalStatusSchema =
|
|
1642
|
+
var EvalStatusSchema = z26.enum(EvalStatus);
|
|
1597
1643
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1598
1644
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1599
1645
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1601,52 +1647,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1601
1647
|
LLMStepType2["THINKING"] = "thinking";
|
|
1602
1648
|
return LLMStepType2;
|
|
1603
1649
|
})(LLMStepType || {});
|
|
1604
|
-
var LLMTraceStepSchema =
|
|
1605
|
-
id:
|
|
1606
|
-
stepNumber:
|
|
1607
|
-
type:
|
|
1608
|
-
model:
|
|
1609
|
-
provider:
|
|
1610
|
-
startedAt:
|
|
1611
|
-
durationMs:
|
|
1650
|
+
var LLMTraceStepSchema = z26.object({
|
|
1651
|
+
id: z26.string(),
|
|
1652
|
+
stepNumber: z26.number(),
|
|
1653
|
+
type: z26.enum(LLMStepType),
|
|
1654
|
+
model: z26.string(),
|
|
1655
|
+
provider: z26.string(),
|
|
1656
|
+
startedAt: z26.string(),
|
|
1657
|
+
durationMs: z26.number(),
|
|
1612
1658
|
tokenUsage: TokenUsageSchema,
|
|
1613
|
-
costUsd:
|
|
1614
|
-
toolName:
|
|
1615
|
-
toolArguments:
|
|
1616
|
-
inputPreview:
|
|
1617
|
-
outputPreview:
|
|
1618
|
-
success:
|
|
1619
|
-
error:
|
|
1620
|
-
});
|
|
1621
|
-
var LLMBreakdownStatsSchema =
|
|
1622
|
-
count:
|
|
1623
|
-
durationMs:
|
|
1624
|
-
tokens:
|
|
1625
|
-
costUsd:
|
|
1626
|
-
});
|
|
1627
|
-
var LLMTraceSummarySchema =
|
|
1628
|
-
totalSteps:
|
|
1629
|
-
totalDurationMs:
|
|
1659
|
+
costUsd: z26.number(),
|
|
1660
|
+
toolName: z26.string().optional(),
|
|
1661
|
+
toolArguments: z26.string().optional(),
|
|
1662
|
+
inputPreview: z26.string().optional(),
|
|
1663
|
+
outputPreview: z26.string().optional(),
|
|
1664
|
+
success: z26.boolean(),
|
|
1665
|
+
error: z26.string().optional()
|
|
1666
|
+
});
|
|
1667
|
+
var LLMBreakdownStatsSchema = z26.object({
|
|
1668
|
+
count: z26.number(),
|
|
1669
|
+
durationMs: z26.number(),
|
|
1670
|
+
tokens: z26.number(),
|
|
1671
|
+
costUsd: z26.number()
|
|
1672
|
+
});
|
|
1673
|
+
var LLMTraceSummarySchema = z26.object({
|
|
1674
|
+
totalSteps: z26.number(),
|
|
1675
|
+
totalDurationMs: z26.number(),
|
|
1630
1676
|
totalTokens: TokenUsageSchema,
|
|
1631
|
-
totalCostUsd:
|
|
1632
|
-
stepTypeBreakdown:
|
|
1633
|
-
modelBreakdown:
|
|
1634
|
-
modelsUsed:
|
|
1635
|
-
});
|
|
1636
|
-
var LLMTraceSchema =
|
|
1637
|
-
id:
|
|
1638
|
-
steps:
|
|
1677
|
+
totalCostUsd: z26.number(),
|
|
1678
|
+
stepTypeBreakdown: z26.record(z26.string(), LLMBreakdownStatsSchema).optional(),
|
|
1679
|
+
modelBreakdown: z26.record(z26.string(), LLMBreakdownStatsSchema),
|
|
1680
|
+
modelsUsed: z26.array(z26.string())
|
|
1681
|
+
});
|
|
1682
|
+
var LLMTraceSchema = z26.object({
|
|
1683
|
+
id: z26.string(),
|
|
1684
|
+
steps: z26.array(LLMTraceStepSchema),
|
|
1639
1685
|
summary: LLMTraceSummarySchema
|
|
1640
1686
|
});
|
|
1641
1687
|
|
|
1642
1688
|
// src/evaluation/eval-result.ts
|
|
1643
|
-
import { z as
|
|
1689
|
+
import { z as z29 } from "zod";
|
|
1644
1690
|
|
|
1645
1691
|
// src/evaluation/eval-run.ts
|
|
1646
|
-
import { z as
|
|
1692
|
+
import { z as z28 } from "zod";
|
|
1647
1693
|
|
|
1648
1694
|
// src/evaluation/live-trace.ts
|
|
1649
|
-
import { z as
|
|
1695
|
+
import { z as z27 } from "zod";
|
|
1650
1696
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1651
1697
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1652
1698
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1660,37 +1706,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1660
1706
|
LiveTraceEventType2["USER"] = "user";
|
|
1661
1707
|
return LiveTraceEventType2;
|
|
1662
1708
|
})(LiveTraceEventType || {});
|
|
1663
|
-
var LiveTraceEventSchema =
|
|
1709
|
+
var LiveTraceEventSchema = z27.object({
|
|
1664
1710
|
/** The evaluation run ID */
|
|
1665
|
-
evalRunId:
|
|
1711
|
+
evalRunId: z27.string(),
|
|
1666
1712
|
/** The scenario ID being executed */
|
|
1667
|
-
scenarioId:
|
|
1713
|
+
scenarioId: z27.string(),
|
|
1668
1714
|
/** The scenario name for display */
|
|
1669
|
-
scenarioName:
|
|
1715
|
+
scenarioName: z27.string(),
|
|
1670
1716
|
/** The target ID (skill, agent, etc.) */
|
|
1671
|
-
targetId:
|
|
1717
|
+
targetId: z27.string(),
|
|
1672
1718
|
/** The target name for display */
|
|
1673
|
-
targetName:
|
|
1719
|
+
targetName: z27.string(),
|
|
1674
1720
|
/** Step number in the current scenario execution */
|
|
1675
|
-
stepNumber:
|
|
1721
|
+
stepNumber: z27.number(),
|
|
1676
1722
|
/** Type of trace event */
|
|
1677
|
-
type:
|
|
1723
|
+
type: z27.enum(LiveTraceEventType),
|
|
1678
1724
|
/** Tool name if this is a tool_use event */
|
|
1679
|
-
toolName:
|
|
1725
|
+
toolName: z27.string().optional(),
|
|
1680
1726
|
/** Tool arguments preview (truncated JSON) */
|
|
1681
|
-
toolArgs:
|
|
1727
|
+
toolArgs: z27.string().optional(),
|
|
1682
1728
|
/** Output preview (truncated text) */
|
|
1683
|
-
outputPreview:
|
|
1729
|
+
outputPreview: z27.string().optional(),
|
|
1684
1730
|
/** File path for file operations */
|
|
1685
|
-
filePath:
|
|
1731
|
+
filePath: z27.string().optional(),
|
|
1686
1732
|
/** Elapsed time in milliseconds for progress events */
|
|
1687
|
-
elapsedMs:
|
|
1733
|
+
elapsedMs: z27.number().optional(),
|
|
1688
1734
|
/** Thinking/reasoning text from Claude */
|
|
1689
|
-
thinking:
|
|
1735
|
+
thinking: z27.string().optional(),
|
|
1690
1736
|
/** Timestamp when this event occurred */
|
|
1691
|
-
timestamp:
|
|
1737
|
+
timestamp: z27.string(),
|
|
1692
1738
|
/** Whether this is the final event for this scenario */
|
|
1693
|
-
isComplete:
|
|
1739
|
+
isComplete: z27.boolean()
|
|
1694
1740
|
});
|
|
1695
1741
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1696
1742
|
function parseTraceEventLine(line) {
|
|
@@ -1718,14 +1764,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1718
1764
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
1719
1765
|
return TriggerType2;
|
|
1720
1766
|
})(TriggerType || {});
|
|
1721
|
-
var TriggerMetadataSchema =
|
|
1722
|
-
version:
|
|
1723
|
-
resourceUpdated:
|
|
1767
|
+
var TriggerMetadataSchema = z28.object({
|
|
1768
|
+
version: z28.string().optional(),
|
|
1769
|
+
resourceUpdated: z28.array(z28.string()).optional()
|
|
1724
1770
|
});
|
|
1725
|
-
var TriggerSchema =
|
|
1726
|
-
id:
|
|
1771
|
+
var TriggerSchema = z28.object({
|
|
1772
|
+
id: z28.string(),
|
|
1727
1773
|
metadata: TriggerMetadataSchema.optional(),
|
|
1728
|
-
type:
|
|
1774
|
+
type: z28.enum(TriggerType)
|
|
1729
1775
|
});
|
|
1730
1776
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
1731
1777
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -1743,28 +1789,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
1743
1789
|
FailureSeverity2["LOW"] = "low";
|
|
1744
1790
|
return FailureSeverity2;
|
|
1745
1791
|
})(FailureSeverity || {});
|
|
1746
|
-
var DiffLineTypeSchema =
|
|
1747
|
-
var DiffLineSchema =
|
|
1792
|
+
var DiffLineTypeSchema = z28.enum(["added", "removed", "unchanged"]);
|
|
1793
|
+
var DiffLineSchema = z28.object({
|
|
1748
1794
|
type: DiffLineTypeSchema,
|
|
1749
|
-
content:
|
|
1750
|
-
lineNumber:
|
|
1751
|
-
});
|
|
1752
|
-
var DiffContentSchema =
|
|
1753
|
-
path:
|
|
1754
|
-
expected:
|
|
1755
|
-
actual:
|
|
1756
|
-
diffLines:
|
|
1757
|
-
renamedFrom:
|
|
1758
|
-
});
|
|
1759
|
-
var CommandExecutionSchema =
|
|
1760
|
-
command:
|
|
1761
|
-
exitCode:
|
|
1762
|
-
output:
|
|
1763
|
-
duration:
|
|
1764
|
-
});
|
|
1765
|
-
var FileModificationSchema =
|
|
1766
|
-
path:
|
|
1767
|
-
action:
|
|
1795
|
+
content: z28.string(),
|
|
1796
|
+
lineNumber: z28.number()
|
|
1797
|
+
});
|
|
1798
|
+
var DiffContentSchema = z28.object({
|
|
1799
|
+
path: z28.string(),
|
|
1800
|
+
expected: z28.string(),
|
|
1801
|
+
actual: z28.string(),
|
|
1802
|
+
diffLines: z28.array(DiffLineSchema),
|
|
1803
|
+
renamedFrom: z28.string().optional()
|
|
1804
|
+
});
|
|
1805
|
+
var CommandExecutionSchema = z28.object({
|
|
1806
|
+
command: z28.string(),
|
|
1807
|
+
exitCode: z28.number(),
|
|
1808
|
+
output: z28.string().optional(),
|
|
1809
|
+
duration: z28.number()
|
|
1810
|
+
});
|
|
1811
|
+
var FileModificationSchema = z28.object({
|
|
1812
|
+
path: z28.string(),
|
|
1813
|
+
action: z28.enum(["created", "modified", "deleted"])
|
|
1768
1814
|
});
|
|
1769
1815
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1770
1816
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1772,81 +1818,83 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1772
1818
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1773
1819
|
return TemplateFileStatus2;
|
|
1774
1820
|
})(TemplateFileStatus || {});
|
|
1775
|
-
var TemplateFileSchema =
|
|
1821
|
+
var TemplateFileSchema = z28.object({
|
|
1776
1822
|
/** Relative path within the template */
|
|
1777
|
-
path:
|
|
1823
|
+
path: z28.string(),
|
|
1778
1824
|
/** Full file content after execution */
|
|
1779
|
-
content:
|
|
1825
|
+
content: z28.string(),
|
|
1780
1826
|
/** File status (new, modified, unchanged) */
|
|
1781
|
-
status:
|
|
1782
|
-
});
|
|
1783
|
-
var ApiCallSchema =
|
|
1784
|
-
endpoint:
|
|
1785
|
-
tokensUsed:
|
|
1786
|
-
duration:
|
|
1787
|
-
});
|
|
1788
|
-
var ExecutionTraceSchema =
|
|
1789
|
-
commands:
|
|
1790
|
-
filesModified:
|
|
1791
|
-
apiCalls:
|
|
1792
|
-
totalDuration:
|
|
1793
|
-
});
|
|
1794
|
-
var FailureAnalysisSchema =
|
|
1795
|
-
category:
|
|
1796
|
-
severity:
|
|
1797
|
-
summary:
|
|
1798
|
-
details:
|
|
1799
|
-
rootCause:
|
|
1800
|
-
suggestedFix:
|
|
1801
|
-
relatedAssertions:
|
|
1802
|
-
codeSnippet:
|
|
1803
|
-
similarIssues:
|
|
1804
|
-
patternId:
|
|
1827
|
+
status: z28.enum(["new", "modified", "unchanged"])
|
|
1828
|
+
});
|
|
1829
|
+
var ApiCallSchema = z28.object({
|
|
1830
|
+
endpoint: z28.string(),
|
|
1831
|
+
tokensUsed: z28.number(),
|
|
1832
|
+
duration: z28.number()
|
|
1833
|
+
});
|
|
1834
|
+
var ExecutionTraceSchema = z28.object({
|
|
1835
|
+
commands: z28.array(CommandExecutionSchema),
|
|
1836
|
+
filesModified: z28.array(FileModificationSchema),
|
|
1837
|
+
apiCalls: z28.array(ApiCallSchema),
|
|
1838
|
+
totalDuration: z28.number()
|
|
1839
|
+
});
|
|
1840
|
+
var FailureAnalysisSchema = z28.object({
|
|
1841
|
+
category: z28.enum(FailureCategory),
|
|
1842
|
+
severity: z28.enum(FailureSeverity),
|
|
1843
|
+
summary: z28.string(),
|
|
1844
|
+
details: z28.string(),
|
|
1845
|
+
rootCause: z28.string(),
|
|
1846
|
+
suggestedFix: z28.string(),
|
|
1847
|
+
relatedAssertions: z28.array(z28.string()),
|
|
1848
|
+
codeSnippet: z28.string().optional(),
|
|
1849
|
+
similarIssues: z28.array(z28.string()).optional(),
|
|
1850
|
+
patternId: z28.string().optional(),
|
|
1805
1851
|
// Extended fields for detailed debugging
|
|
1806
1852
|
diff: DiffContentSchema.optional(),
|
|
1807
1853
|
executionTrace: ExecutionTraceSchema.optional()
|
|
1808
1854
|
});
|
|
1809
1855
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1810
1856
|
/** Agent ID for this run */
|
|
1811
|
-
agentId:
|
|
1857
|
+
agentId: z28.string().optional(),
|
|
1812
1858
|
/** Skills group ID for this run */
|
|
1813
|
-
skillsGroupId:
|
|
1859
|
+
skillsGroupId: z28.string().optional(),
|
|
1814
1860
|
/** Map of skillId to skillVersionId for this run */
|
|
1815
|
-
skillVersions:
|
|
1861
|
+
skillVersions: z28.record(z28.string(), z28.string()).optional(),
|
|
1816
1862
|
/** Scenario IDs to run */
|
|
1817
|
-
scenarioIds:
|
|
1863
|
+
scenarioIds: z28.array(z28.string()),
|
|
1818
1864
|
/** Current status */
|
|
1819
1865
|
status: EvalStatusSchema,
|
|
1820
1866
|
/** Progress percentage (0-100) */
|
|
1821
|
-
progress:
|
|
1867
|
+
progress: z28.number(),
|
|
1822
1868
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1823
|
-
results:
|
|
1869
|
+
results: z28.array(z28.lazy(() => EvalRunResultSchema)),
|
|
1824
1870
|
/** Aggregated metrics across all results */
|
|
1825
1871
|
aggregateMetrics: EvalMetricsSchema,
|
|
1826
1872
|
/** Failure analyses */
|
|
1827
|
-
failureAnalyses:
|
|
1873
|
+
failureAnalyses: z28.array(FailureAnalysisSchema).optional(),
|
|
1828
1874
|
/** Aggregated LLM trace summary */
|
|
1829
1875
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
1830
1876
|
/** What triggered this run */
|
|
1831
1877
|
trigger: TriggerSchema.optional(),
|
|
1832
1878
|
/** When the run started (set when evaluation is triggered) */
|
|
1833
|
-
startedAt:
|
|
1879
|
+
startedAt: z28.string().optional(),
|
|
1834
1880
|
/** When the run completed */
|
|
1835
|
-
completedAt:
|
|
1881
|
+
completedAt: z28.string().optional(),
|
|
1836
1882
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1837
|
-
liveTraceEvents:
|
|
1883
|
+
liveTraceEvents: z28.array(LiveTraceEventSchema).optional(),
|
|
1838
1884
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1839
|
-
jobId:
|
|
1885
|
+
jobId: z28.string().optional(),
|
|
1840
1886
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1841
|
-
jobStatus:
|
|
1887
|
+
jobStatus: z28.string().optional(),
|
|
1842
1888
|
/** Remote job error message if the job failed */
|
|
1843
|
-
jobError:
|
|
1889
|
+
jobError: z28.string().optional(),
|
|
1844
1890
|
/** Timestamp of the last job status check */
|
|
1845
|
-
jobStatusCheckedAt:
|
|
1891
|
+
jobStatusCheckedAt: z28.string().optional(),
|
|
1846
1892
|
/** MCP server IDs to enable for this run (optional) */
|
|
1847
|
-
mcpIds:
|
|
1893
|
+
mcpIds: z28.array(z28.string()).optional(),
|
|
1848
1894
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
1849
|
-
subAgentIds:
|
|
1895
|
+
subAgentIds: z28.array(z28.string()).optional(),
|
|
1896
|
+
/** Rule IDs to enable for this run (optional) */
|
|
1897
|
+
ruleIds: z28.array(z28.string()).optional()
|
|
1850
1898
|
});
|
|
1851
1899
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
1852
1900
|
id: true,
|
|
@@ -1859,28 +1907,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
1859
1907
|
startedAt: true,
|
|
1860
1908
|
completedAt: true
|
|
1861
1909
|
});
|
|
1862
|
-
var EvaluationProgressSchema =
|
|
1863
|
-
runId:
|
|
1864
|
-
targetId:
|
|
1865
|
-
totalScenarios:
|
|
1866
|
-
completedScenarios:
|
|
1867
|
-
scenarioProgress:
|
|
1868
|
-
|
|
1869
|
-
scenarioId:
|
|
1870
|
-
currentStep:
|
|
1871
|
-
error:
|
|
1910
|
+
var EvaluationProgressSchema = z28.object({
|
|
1911
|
+
runId: z28.string(),
|
|
1912
|
+
targetId: z28.string(),
|
|
1913
|
+
totalScenarios: z28.number(),
|
|
1914
|
+
completedScenarios: z28.number(),
|
|
1915
|
+
scenarioProgress: z28.array(
|
|
1916
|
+
z28.object({
|
|
1917
|
+
scenarioId: z28.string(),
|
|
1918
|
+
currentStep: z28.string(),
|
|
1919
|
+
error: z28.string().optional()
|
|
1872
1920
|
})
|
|
1873
1921
|
),
|
|
1874
|
-
createdAt:
|
|
1922
|
+
createdAt: z28.number()
|
|
1875
1923
|
});
|
|
1876
|
-
var EvaluationLogSchema =
|
|
1877
|
-
runId:
|
|
1878
|
-
scenarioId:
|
|
1879
|
-
log:
|
|
1880
|
-
level:
|
|
1881
|
-
message:
|
|
1882
|
-
args:
|
|
1883
|
-
error:
|
|
1924
|
+
var EvaluationLogSchema = z28.object({
|
|
1925
|
+
runId: z28.string(),
|
|
1926
|
+
scenarioId: z28.string(),
|
|
1927
|
+
log: z28.object({
|
|
1928
|
+
level: z28.enum(["info", "error", "debug"]),
|
|
1929
|
+
message: z28.string().optional(),
|
|
1930
|
+
args: z28.array(z28.any()).optional(),
|
|
1931
|
+
error: z28.string().optional()
|
|
1884
1932
|
})
|
|
1885
1933
|
});
|
|
1886
1934
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -1893,95 +1941,95 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
1893
1941
|
AssertionResultStatus2["ERROR"] = "error";
|
|
1894
1942
|
return AssertionResultStatus2;
|
|
1895
1943
|
})(AssertionResultStatus || {});
|
|
1896
|
-
var AssertionResultSchema =
|
|
1897
|
-
id:
|
|
1898
|
-
assertionId:
|
|
1899
|
-
assertionType:
|
|
1900
|
-
assertionName:
|
|
1901
|
-
status:
|
|
1902
|
-
message:
|
|
1903
|
-
expected:
|
|
1904
|
-
actual:
|
|
1905
|
-
duration:
|
|
1906
|
-
details:
|
|
1907
|
-
llmTraceSteps:
|
|
1908
|
-
});
|
|
1909
|
-
var EvalRunResultSchema =
|
|
1910
|
-
id:
|
|
1911
|
-
targetId:
|
|
1912
|
-
targetName:
|
|
1944
|
+
var AssertionResultSchema = z29.object({
|
|
1945
|
+
id: z29.string(),
|
|
1946
|
+
assertionId: z29.string(),
|
|
1947
|
+
assertionType: z29.string(),
|
|
1948
|
+
assertionName: z29.string(),
|
|
1949
|
+
status: z29.enum(AssertionResultStatus),
|
|
1950
|
+
message: z29.string().optional(),
|
|
1951
|
+
expected: z29.string().optional(),
|
|
1952
|
+
actual: z29.string().optional(),
|
|
1953
|
+
duration: z29.number().optional(),
|
|
1954
|
+
details: z29.record(z29.string(), z29.unknown()).optional(),
|
|
1955
|
+
llmTraceSteps: z29.array(LLMTraceStepSchema).optional()
|
|
1956
|
+
});
|
|
1957
|
+
var EvalRunResultSchema = z29.object({
|
|
1958
|
+
id: z29.string(),
|
|
1959
|
+
targetId: z29.string(),
|
|
1960
|
+
targetName: z29.string().optional(),
|
|
1913
1961
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
1914
|
-
skillVersionId:
|
|
1962
|
+
skillVersionId: z29.string().optional(),
|
|
1915
1963
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
1916
|
-
skillVersion:
|
|
1917
|
-
scenarioId:
|
|
1918
|
-
scenarioName:
|
|
1964
|
+
skillVersion: z29.string().optional(),
|
|
1965
|
+
scenarioId: z29.string(),
|
|
1966
|
+
scenarioName: z29.string(),
|
|
1919
1967
|
modelConfig: ModelConfigSchema.optional(),
|
|
1920
|
-
assertionResults:
|
|
1968
|
+
assertionResults: z29.array(AssertionResultSchema),
|
|
1921
1969
|
metrics: EvalMetricsSchema.optional(),
|
|
1922
|
-
passed:
|
|
1923
|
-
failed:
|
|
1924
|
-
passRate:
|
|
1925
|
-
duration:
|
|
1926
|
-
outputText:
|
|
1927
|
-
files:
|
|
1928
|
-
fileDiffs:
|
|
1970
|
+
passed: z29.number(),
|
|
1971
|
+
failed: z29.number(),
|
|
1972
|
+
passRate: z29.number(),
|
|
1973
|
+
duration: z29.number(),
|
|
1974
|
+
outputText: z29.string().optional(),
|
|
1975
|
+
files: z29.array(ExpectedFileSchema).optional(),
|
|
1976
|
+
fileDiffs: z29.array(DiffContentSchema).optional(),
|
|
1929
1977
|
/** Full template files after execution with status indicators */
|
|
1930
|
-
templateFiles:
|
|
1931
|
-
startedAt:
|
|
1932
|
-
completedAt:
|
|
1978
|
+
templateFiles: z29.array(TemplateFileSchema).optional(),
|
|
1979
|
+
startedAt: z29.string().optional(),
|
|
1980
|
+
completedAt: z29.string().optional(),
|
|
1933
1981
|
llmTrace: LLMTraceSchema.optional()
|
|
1934
1982
|
});
|
|
1935
|
-
var PromptResultSchema =
|
|
1936
|
-
text:
|
|
1937
|
-
files:
|
|
1938
|
-
finishReason:
|
|
1939
|
-
reasoning:
|
|
1940
|
-
reasoningDetails:
|
|
1941
|
-
toolCalls:
|
|
1942
|
-
toolResults:
|
|
1943
|
-
warnings:
|
|
1944
|
-
sources:
|
|
1945
|
-
steps:
|
|
1946
|
-
generationTimeMs:
|
|
1947
|
-
prompt:
|
|
1948
|
-
systemPrompt:
|
|
1949
|
-
usage:
|
|
1950
|
-
totalTokens:
|
|
1951
|
-
totalMicrocentsSpent:
|
|
1983
|
+
var PromptResultSchema = z29.object({
|
|
1984
|
+
text: z29.string(),
|
|
1985
|
+
files: z29.array(z29.unknown()).optional(),
|
|
1986
|
+
finishReason: z29.string().optional(),
|
|
1987
|
+
reasoning: z29.string().optional(),
|
|
1988
|
+
reasoningDetails: z29.unknown().optional(),
|
|
1989
|
+
toolCalls: z29.array(z29.unknown()).optional(),
|
|
1990
|
+
toolResults: z29.array(z29.unknown()).optional(),
|
|
1991
|
+
warnings: z29.array(z29.unknown()).optional(),
|
|
1992
|
+
sources: z29.array(z29.unknown()).optional(),
|
|
1993
|
+
steps: z29.array(z29.unknown()),
|
|
1994
|
+
generationTimeMs: z29.number(),
|
|
1995
|
+
prompt: z29.string(),
|
|
1996
|
+
systemPrompt: z29.string(),
|
|
1997
|
+
usage: z29.object({
|
|
1998
|
+
totalTokens: z29.number().optional(),
|
|
1999
|
+
totalMicrocentsSpent: z29.number().optional()
|
|
1952
2000
|
})
|
|
1953
2001
|
});
|
|
1954
|
-
var EvaluationResultSchema =
|
|
1955
|
-
id:
|
|
1956
|
-
runId:
|
|
1957
|
-
timestamp:
|
|
2002
|
+
var EvaluationResultSchema = z29.object({
|
|
2003
|
+
id: z29.string(),
|
|
2004
|
+
runId: z29.string(),
|
|
2005
|
+
timestamp: z29.number(),
|
|
1958
2006
|
promptResult: PromptResultSchema,
|
|
1959
|
-
testResults:
|
|
1960
|
-
tags:
|
|
1961
|
-
feedback:
|
|
1962
|
-
score:
|
|
1963
|
-
suiteId:
|
|
1964
|
-
});
|
|
1965
|
-
var LeanEvaluationResultSchema =
|
|
1966
|
-
id:
|
|
1967
|
-
runId:
|
|
1968
|
-
timestamp:
|
|
1969
|
-
tags:
|
|
1970
|
-
scenarioId:
|
|
1971
|
-
scenarioVersion:
|
|
1972
|
-
targetId:
|
|
1973
|
-
targetVersion:
|
|
1974
|
-
suiteId:
|
|
1975
|
-
score:
|
|
1976
|
-
time:
|
|
1977
|
-
microcentsSpent:
|
|
2007
|
+
testResults: z29.array(z29.unknown()),
|
|
2008
|
+
tags: z29.array(z29.string()).optional(),
|
|
2009
|
+
feedback: z29.string().optional(),
|
|
2010
|
+
score: z29.number(),
|
|
2011
|
+
suiteId: z29.string().optional()
|
|
2012
|
+
});
|
|
2013
|
+
var LeanEvaluationResultSchema = z29.object({
|
|
2014
|
+
id: z29.string(),
|
|
2015
|
+
runId: z29.string(),
|
|
2016
|
+
timestamp: z29.number(),
|
|
2017
|
+
tags: z29.array(z29.string()).optional(),
|
|
2018
|
+
scenarioId: z29.string(),
|
|
2019
|
+
scenarioVersion: z29.number().optional(),
|
|
2020
|
+
targetId: z29.string(),
|
|
2021
|
+
targetVersion: z29.number().optional(),
|
|
2022
|
+
suiteId: z29.string().optional(),
|
|
2023
|
+
score: z29.number(),
|
|
2024
|
+
time: z29.number().optional(),
|
|
2025
|
+
microcentsSpent: z29.number().optional()
|
|
1978
2026
|
});
|
|
1979
2027
|
|
|
1980
2028
|
// src/project/project.ts
|
|
1981
|
-
import { z as
|
|
2029
|
+
import { z as z30 } from "zod";
|
|
1982
2030
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
1983
|
-
appId:
|
|
1984
|
-
appSecret:
|
|
2031
|
+
appId: z30.string().optional().describe("The ID of the app in Dev Center"),
|
|
2032
|
+
appSecret: z30.string().optional().describe("The secret of the app in Dev Center")
|
|
1985
2033
|
});
|
|
1986
2034
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
1987
2035
|
id: true,
|
|
@@ -2007,6 +2055,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
|
2007
2055
|
// src/assertion/system-assertions.ts
|
|
2008
2056
|
var SYSTEM_ASSERTION_IDS = {
|
|
2009
2057
|
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
2058
|
+
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
2010
2059
|
BUILD_PASSED: "system:build_passed",
|
|
2011
2060
|
TIME_LIMIT: "system:time_limit",
|
|
2012
2061
|
COST: "system:cost",
|
|
@@ -2030,6 +2079,26 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2030
2079
|
}
|
|
2031
2080
|
]
|
|
2032
2081
|
},
|
|
2082
|
+
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
2083
|
+
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
2084
|
+
name: "Tool Called With Param",
|
|
2085
|
+
description: "Check that a tool was called with expected parameters",
|
|
2086
|
+
type: "tool_called_with_param",
|
|
2087
|
+
parameters: [
|
|
2088
|
+
{
|
|
2089
|
+
name: "toolName",
|
|
2090
|
+
label: "Tool Name",
|
|
2091
|
+
type: "string",
|
|
2092
|
+
required: true
|
|
2093
|
+
},
|
|
2094
|
+
{
|
|
2095
|
+
name: "expectedParams",
|
|
2096
|
+
label: "Expected Parameters (JSON, substring match)",
|
|
2097
|
+
type: "string",
|
|
2098
|
+
required: true
|
|
2099
|
+
}
|
|
2100
|
+
]
|
|
2101
|
+
},
|
|
2033
2102
|
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
2034
2103
|
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
2035
2104
|
name: "Build Passed",
|
|
@@ -2148,6 +2217,7 @@ var export_ClaudeModel = import_types.ClaudeModel;
|
|
|
2148
2217
|
export {
|
|
2149
2218
|
AVAILABLE_MODEL_IDS,
|
|
2150
2219
|
AVAILABLE_RUN_COMMANDS,
|
|
2220
|
+
AVAILABLE_TOOL_NAMES,
|
|
2151
2221
|
AgentRunCommand,
|
|
2152
2222
|
AgentRunCommandSchema,
|
|
2153
2223
|
AgentSchema,
|
|
@@ -2176,6 +2246,7 @@ export {
|
|
|
2176
2246
|
CreateEvalRunInputSchema,
|
|
2177
2247
|
CreateMcpInputSchema,
|
|
2178
2248
|
CreateProjectInputSchema,
|
|
2249
|
+
CreateRuleInputSchema,
|
|
2179
2250
|
CreateSkillInputSchema,
|
|
2180
2251
|
CreateSkillVersionInputSchema,
|
|
2181
2252
|
CreateSkillsGroupInputSchema,
|
|
@@ -2230,6 +2301,8 @@ export {
|
|
|
2230
2301
|
ProjectSchema,
|
|
2231
2302
|
PromptResultSchema,
|
|
2232
2303
|
RUN_COMMAND_LABELS,
|
|
2304
|
+
RuleSchema,
|
|
2305
|
+
RuleTypeSchema,
|
|
2233
2306
|
SEMVER_REGEX,
|
|
2234
2307
|
SKILL_FOLDER_NAME_REGEX,
|
|
2235
2308
|
SYSTEM_ASSERTIONS,
|
|
@@ -2262,6 +2335,8 @@ export {
|
|
|
2262
2335
|
TimeAssertionSchema,
|
|
2263
2336
|
TimeConfigSchema,
|
|
2264
2337
|
TokenUsageSchema,
|
|
2338
|
+
ToolCalledWithParamAssertionSchema,
|
|
2339
|
+
ToolCalledWithParamConfigSchema,
|
|
2265
2340
|
ToolTestSchema,
|
|
2266
2341
|
TriggerMetadataSchema,
|
|
2267
2342
|
TriggerSchema,
|
|
@@ -2270,6 +2345,7 @@ export {
|
|
|
2270
2345
|
UpdateCustomAssertionInputSchema,
|
|
2271
2346
|
UpdateMcpInputSchema,
|
|
2272
2347
|
UpdateProjectInputSchema,
|
|
2348
|
+
UpdateRuleInputSchema,
|
|
2273
2349
|
UpdateSkillInputSchema,
|
|
2274
2350
|
UpdateSkillsGroupInputSchema,
|
|
2275
2351
|
UpdateSubAgentInputSchema,
|