@wix/evalforge-types 0.37.0 → 0.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/index.js +496 -413
- package/build/index.js.map +4 -4
- package/build/index.mjs +489 -413
- package/build/index.mjs.map +4 -4
- package/build/types/agent/adapter.d.ts +3 -0
- package/build/types/assertion/assertion.d.ts +34 -0
- package/build/types/assertion/system-assertions.d.ts +1 -0
- package/build/types/common/index.d.ts +2 -0
- package/build/types/common/rule.d.ts +47 -0
- package/build/types/common/tool-names.d.ts +1 -0
- package/build/types/evaluation/eval-run.d.ts +2 -0
- package/build/types/scenario/assertions.d.ts +16 -0
- package/build/types/scenario/test-scenario.d.ts +12 -0
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -916,6 +916,7 @@ var index_exports = {};
|
|
|
916
916
|
__export(index_exports, {
|
|
917
917
|
AVAILABLE_MODEL_IDS: () => AVAILABLE_MODEL_IDS,
|
|
918
918
|
AVAILABLE_RUN_COMMANDS: () => AVAILABLE_RUN_COMMANDS,
|
|
919
|
+
AVAILABLE_TOOL_NAMES: () => AVAILABLE_TOOL_NAMES,
|
|
919
920
|
AgentRunCommand: () => AgentRunCommand,
|
|
920
921
|
AgentRunCommandSchema: () => AgentRunCommandSchema,
|
|
921
922
|
AgentSchema: () => AgentSchema,
|
|
@@ -944,6 +945,7 @@ __export(index_exports, {
|
|
|
944
945
|
CreateEvalRunInputSchema: () => CreateEvalRunInputSchema,
|
|
945
946
|
CreateMcpInputSchema: () => CreateMcpInputSchema,
|
|
946
947
|
CreateProjectInputSchema: () => CreateProjectInputSchema,
|
|
948
|
+
CreateRuleInputSchema: () => CreateRuleInputSchema,
|
|
947
949
|
CreateSkillInputSchema: () => CreateSkillInputSchema,
|
|
948
950
|
CreateSkillVersionInputSchema: () => CreateSkillVersionInputSchema,
|
|
949
951
|
CreateSkillsGroupInputSchema: () => CreateSkillsGroupInputSchema,
|
|
@@ -998,6 +1000,8 @@ __export(index_exports, {
|
|
|
998
1000
|
ProjectSchema: () => ProjectSchema,
|
|
999
1001
|
PromptResultSchema: () => PromptResultSchema,
|
|
1000
1002
|
RUN_COMMAND_LABELS: () => RUN_COMMAND_LABELS,
|
|
1003
|
+
RuleSchema: () => RuleSchema,
|
|
1004
|
+
RuleTypeSchema: () => RuleTypeSchema,
|
|
1001
1005
|
SEMVER_REGEX: () => SEMVER_REGEX,
|
|
1002
1006
|
SKILL_FOLDER_NAME_REGEX: () => SKILL_FOLDER_NAME_REGEX,
|
|
1003
1007
|
SYSTEM_ASSERTIONS: () => SYSTEM_ASSERTIONS,
|
|
@@ -1030,6 +1034,8 @@ __export(index_exports, {
|
|
|
1030
1034
|
TimeAssertionSchema: () => TimeAssertionSchema,
|
|
1031
1035
|
TimeConfigSchema: () => TimeConfigSchema,
|
|
1032
1036
|
TokenUsageSchema: () => TokenUsageSchema,
|
|
1037
|
+
ToolCalledWithParamAssertionSchema: () => ToolCalledWithParamAssertionSchema,
|
|
1038
|
+
ToolCalledWithParamConfigSchema: () => ToolCalledWithParamConfigSchema,
|
|
1033
1039
|
ToolTestSchema: () => ToolTestSchema,
|
|
1034
1040
|
TriggerMetadataSchema: () => TriggerMetadataSchema,
|
|
1035
1041
|
TriggerSchema: () => TriggerSchema,
|
|
@@ -1038,6 +1044,7 @@ __export(index_exports, {
|
|
|
1038
1044
|
UpdateCustomAssertionInputSchema: () => UpdateCustomAssertionInputSchema,
|
|
1039
1045
|
UpdateMcpInputSchema: () => UpdateMcpInputSchema,
|
|
1040
1046
|
UpdateProjectInputSchema: () => UpdateProjectInputSchema,
|
|
1047
|
+
UpdateRuleInputSchema: () => UpdateRuleInputSchema,
|
|
1041
1048
|
UpdateSkillInputSchema: () => UpdateSkillInputSchema,
|
|
1042
1049
|
UpdateSkillsGroupInputSchema: () => UpdateSkillsGroupInputSchema,
|
|
1043
1050
|
UpdateSubAgentInputSchema: () => UpdateSubAgentInputSchema,
|
|
@@ -1128,6 +1135,33 @@ var ModelConfigSchema = import_zod4.z.object({
|
|
|
1128
1135
|
maxTokens: import_zod4.z.preprocess(nullToUndefined, import_zod4.z.number().min(1).optional())
|
|
1129
1136
|
});
|
|
1130
1137
|
|
|
1138
|
+
// src/common/rule.ts
|
|
1139
|
+
var import_zod5 = require("zod");
|
|
1140
|
+
var RuleTypeSchema = import_zod5.z.enum(["claude-md", "agents-md", "cursor-rule"]);
|
|
1141
|
+
var RuleSchema = TenantEntitySchema.extend({
|
|
1142
|
+
ruleType: RuleTypeSchema,
|
|
1143
|
+
content: import_zod5.z.string()
|
|
1144
|
+
});
|
|
1145
|
+
var RuleInputBaseSchema = RuleSchema.omit({
|
|
1146
|
+
id: true,
|
|
1147
|
+
createdAt: true,
|
|
1148
|
+
updatedAt: true,
|
|
1149
|
+
deleted: true
|
|
1150
|
+
});
|
|
1151
|
+
var CreateRuleInputSchema = RuleInputBaseSchema;
|
|
1152
|
+
var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
|
|
1153
|
+
|
|
1154
|
+
// src/common/tool-names.ts
|
|
1155
|
+
var AVAILABLE_TOOL_NAMES = [
|
|
1156
|
+
"Bash",
|
|
1157
|
+
"Edit",
|
|
1158
|
+
"Glob",
|
|
1159
|
+
"Grep",
|
|
1160
|
+
"Read",
|
|
1161
|
+
"Skill",
|
|
1162
|
+
"Write"
|
|
1163
|
+
];
|
|
1164
|
+
|
|
1131
1165
|
// src/target/target.ts
|
|
1132
1166
|
var TargetSchema = TenantEntitySchema.extend({
|
|
1133
1167
|
// Base for all testable entities
|
|
@@ -1135,7 +1169,7 @@ var TargetSchema = TenantEntitySchema.extend({
|
|
|
1135
1169
|
});
|
|
1136
1170
|
|
|
1137
1171
|
// src/target/agent.ts
|
|
1138
|
-
var
|
|
1172
|
+
var import_zod6 = require("zod");
|
|
1139
1173
|
var AgentRunCommand = /* @__PURE__ */ ((AgentRunCommand2) => {
|
|
1140
1174
|
AgentRunCommand2["CLAUDE"] = "claude";
|
|
1141
1175
|
return AgentRunCommand2;
|
|
@@ -1144,7 +1178,7 @@ var AVAILABLE_RUN_COMMANDS = Object.values(AgentRunCommand);
|
|
|
1144
1178
|
var RUN_COMMAND_LABELS = {
|
|
1145
1179
|
["claude" /* CLAUDE */]: "Claude Code"
|
|
1146
1180
|
};
|
|
1147
|
-
var AgentRunCommandSchema =
|
|
1181
|
+
var AgentRunCommandSchema = import_zod6.z.nativeEnum(AgentRunCommand);
|
|
1148
1182
|
var AgentSchema = TargetSchema.extend({
|
|
1149
1183
|
/** Command to run the agent */
|
|
1150
1184
|
runCommand: AgentRunCommandSchema,
|
|
@@ -1162,51 +1196,51 @@ var UpdateAgentInputSchema = CreateAgentInputSchema.partial().extend({
|
|
|
1162
1196
|
});
|
|
1163
1197
|
|
|
1164
1198
|
// src/target/skill.ts
|
|
1165
|
-
var
|
|
1199
|
+
var import_zod7 = require("zod");
|
|
1166
1200
|
var SKILL_FOLDER_NAME_REGEX = /^[a-z0-9]+(-[a-z0-9]+)*$/;
|
|
1167
1201
|
var SEMVER_REGEX = /^\d+\.\d+\.\d+$/;
|
|
1168
|
-
var SkillVersionOriginSchema =
|
|
1202
|
+
var SkillVersionOriginSchema = import_zod7.z.enum(["manual", "pr", "master"]);
|
|
1169
1203
|
function isValidSkillFolderName(name) {
|
|
1170
1204
|
return typeof name === "string" && name.length > 0 && SKILL_FOLDER_NAME_REGEX.test(name.trim());
|
|
1171
1205
|
}
|
|
1172
|
-
var SkillMetadataSchema =
|
|
1173
|
-
name:
|
|
1174
|
-
description:
|
|
1175
|
-
allowedTools:
|
|
1176
|
-
skills:
|
|
1206
|
+
var SkillMetadataSchema = import_zod7.z.object({
|
|
1207
|
+
name: import_zod7.z.string(),
|
|
1208
|
+
description: import_zod7.z.string(),
|
|
1209
|
+
allowedTools: import_zod7.z.array(import_zod7.z.string()).optional(),
|
|
1210
|
+
skills: import_zod7.z.array(import_zod7.z.string()).optional()
|
|
1177
1211
|
});
|
|
1178
|
-
var SkillFileSchema =
|
|
1212
|
+
var SkillFileSchema = import_zod7.z.object({
|
|
1179
1213
|
/** Relative path within the skill directory, e.g. "SKILL.md" or "references/API_SPEC.md" */
|
|
1180
|
-
path:
|
|
1214
|
+
path: import_zod7.z.string().min(1),
|
|
1181
1215
|
/** File content (UTF-8 text) */
|
|
1182
|
-
content:
|
|
1216
|
+
content: import_zod7.z.string()
|
|
1183
1217
|
});
|
|
1184
|
-
var SkillVersionSchema =
|
|
1185
|
-
id:
|
|
1186
|
-
projectId:
|
|
1187
|
-
skillId:
|
|
1218
|
+
var SkillVersionSchema = import_zod7.z.object({
|
|
1219
|
+
id: import_zod7.z.string(),
|
|
1220
|
+
projectId: import_zod7.z.string(),
|
|
1221
|
+
skillId: import_zod7.z.string(),
|
|
1188
1222
|
/** Semver string (e.g. "1.2.0") or Falcon fingerprint */
|
|
1189
|
-
version:
|
|
1223
|
+
version: import_zod7.z.string(),
|
|
1190
1224
|
/** How this version was created */
|
|
1191
1225
|
origin: SkillVersionOriginSchema,
|
|
1192
1226
|
/** Where this snapshot was taken from */
|
|
1193
1227
|
source: GitHubSourceSchema.optional(),
|
|
1194
1228
|
/** Frozen snapshot of all files in the skill directory */
|
|
1195
|
-
files:
|
|
1229
|
+
files: import_zod7.z.array(SkillFileSchema).optional(),
|
|
1196
1230
|
/** Optional notes about this version (changelog, reason for change) */
|
|
1197
|
-
notes:
|
|
1198
|
-
createdAt:
|
|
1231
|
+
notes: import_zod7.z.string().optional(),
|
|
1232
|
+
createdAt: import_zod7.z.string()
|
|
1199
1233
|
});
|
|
1200
|
-
var CreateSkillVersionInputSchema =
|
|
1234
|
+
var CreateSkillVersionInputSchema = import_zod7.z.object({
|
|
1201
1235
|
/** GitHub source to snapshot from. If not provided, uses the Skill's source. */
|
|
1202
1236
|
source: GitHubSourceSchema.optional(),
|
|
1203
1237
|
/** Version string for this snapshot (e.g. "1.0.0", "1.0.3"). */
|
|
1204
|
-
version:
|
|
1205
|
-
notes:
|
|
1238
|
+
version: import_zod7.z.string().min(1),
|
|
1239
|
+
notes: import_zod7.z.string().optional(),
|
|
1206
1240
|
/** Origin of this version. Defaults to 'manual' in backend. */
|
|
1207
1241
|
origin: SkillVersionOriginSchema.optional(),
|
|
1208
1242
|
/** Pre-edited files to store directly (bypasses GitHub fetch when provided) */
|
|
1209
|
-
files:
|
|
1243
|
+
files: import_zod7.z.array(SkillFileSchema).optional()
|
|
1210
1244
|
});
|
|
1211
1245
|
var SkillSchema = TargetSchema.extend({
|
|
1212
1246
|
/** GitHub source reference for live content fetching */
|
|
@@ -1222,15 +1256,15 @@ var SkillInputBaseSchema = SkillSchema.omit({
|
|
|
1222
1256
|
source: true
|
|
1223
1257
|
}).extend({
|
|
1224
1258
|
/** Optional - not stored on Skill; content description lives in SkillVersion */
|
|
1225
|
-
description:
|
|
1259
|
+
description: import_zod7.z.string().optional(),
|
|
1226
1260
|
/** GitHub source reference for live content fetching */
|
|
1227
1261
|
source: GitHubSourceSchema.optional()
|
|
1228
1262
|
});
|
|
1229
|
-
var InitialVersionInputSchema =
|
|
1230
|
-
files:
|
|
1231
|
-
notes:
|
|
1263
|
+
var InitialVersionInputSchema = import_zod7.z.object({
|
|
1264
|
+
files: import_zod7.z.array(SkillFileSchema).optional(),
|
|
1265
|
+
notes: import_zod7.z.string().optional(),
|
|
1232
1266
|
source: GitHubSourceSchema.optional(),
|
|
1233
|
-
version:
|
|
1267
|
+
version: import_zod7.z.string().optional(),
|
|
1234
1268
|
origin: SkillVersionOriginSchema.optional()
|
|
1235
1269
|
});
|
|
1236
1270
|
var CreateSkillInputSchema = SkillInputBaseSchema.extend({
|
|
@@ -1248,10 +1282,10 @@ var SkillWithLatestVersionSchema = SkillSchema.extend({
|
|
|
1248
1282
|
});
|
|
1249
1283
|
|
|
1250
1284
|
// src/target/skills-group.ts
|
|
1251
|
-
var
|
|
1285
|
+
var import_zod8 = require("zod");
|
|
1252
1286
|
var SkillsGroupSchema = TenantEntitySchema.extend({
|
|
1253
1287
|
/** IDs of skills in this group */
|
|
1254
|
-
skillIds:
|
|
1288
|
+
skillIds: import_zod8.z.array(import_zod8.z.string())
|
|
1255
1289
|
});
|
|
1256
1290
|
var CreateSkillsGroupInputSchema = SkillsGroupSchema.omit({
|
|
1257
1291
|
id: true,
|
|
@@ -1262,10 +1296,10 @@ var CreateSkillsGroupInputSchema = SkillsGroupSchema.omit({
|
|
|
1262
1296
|
var UpdateSkillsGroupInputSchema = CreateSkillsGroupInputSchema.partial();
|
|
1263
1297
|
|
|
1264
1298
|
// src/target/sub-agent.ts
|
|
1265
|
-
var
|
|
1299
|
+
var import_zod9 = require("zod");
|
|
1266
1300
|
var SubAgentSchema = TargetSchema.extend({
|
|
1267
1301
|
/** The full sub-agent markdown content (YAML frontmatter + body) */
|
|
1268
|
-
subAgentMd:
|
|
1302
|
+
subAgentMd: import_zod9.z.string()
|
|
1269
1303
|
});
|
|
1270
1304
|
var SubAgentInputBaseSchema = SubAgentSchema.omit({
|
|
1271
1305
|
id: true,
|
|
@@ -1277,10 +1311,10 @@ var CreateSubAgentInputSchema = SubAgentInputBaseSchema;
|
|
|
1277
1311
|
var UpdateSubAgentInputSchema = SubAgentInputBaseSchema.partial();
|
|
1278
1312
|
|
|
1279
1313
|
// src/test/index.ts
|
|
1280
|
-
var
|
|
1314
|
+
var import_zod20 = require("zod");
|
|
1281
1315
|
|
|
1282
1316
|
// src/test/base.ts
|
|
1283
|
-
var
|
|
1317
|
+
var import_zod10 = require("zod");
|
|
1284
1318
|
var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
1285
1319
|
TestType2["LLM"] = "LLM";
|
|
1286
1320
|
TestType2["TOOL"] = "TOOL";
|
|
@@ -1293,7 +1327,7 @@ var TestType = /* @__PURE__ */ ((TestType2) => {
|
|
|
1293
1327
|
TestType2["PLAYWRIGHT_NL"] = "PLAYWRIGHT_NL";
|
|
1294
1328
|
return TestType2;
|
|
1295
1329
|
})(TestType || {});
|
|
1296
|
-
var TestTypeSchema =
|
|
1330
|
+
var TestTypeSchema = import_zod10.z.enum(TestType);
|
|
1297
1331
|
var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
1298
1332
|
TestImportance2["LOW"] = "low";
|
|
1299
1333
|
TestImportance2["MEDIUM"] = "medium";
|
|
@@ -1301,153 +1335,153 @@ var TestImportance = /* @__PURE__ */ ((TestImportance2) => {
|
|
|
1301
1335
|
TestImportance2["CRITICAL"] = "critical";
|
|
1302
1336
|
return TestImportance2;
|
|
1303
1337
|
})(TestImportance || {});
|
|
1304
|
-
var TestImportanceSchema =
|
|
1305
|
-
var BaseTestSchema =
|
|
1306
|
-
id:
|
|
1338
|
+
var TestImportanceSchema = import_zod10.z.enum(TestImportance);
|
|
1339
|
+
var BaseTestSchema = import_zod10.z.object({
|
|
1340
|
+
id: import_zod10.z.string(),
|
|
1307
1341
|
type: TestTypeSchema,
|
|
1308
|
-
name:
|
|
1309
|
-
description:
|
|
1342
|
+
name: import_zod10.z.string().min(3),
|
|
1343
|
+
description: import_zod10.z.string().optional(),
|
|
1310
1344
|
importance: TestImportanceSchema.optional()
|
|
1311
1345
|
});
|
|
1312
1346
|
|
|
1313
1347
|
// src/test/llm.ts
|
|
1314
|
-
var
|
|
1348
|
+
var import_zod11 = require("zod");
|
|
1315
1349
|
var LLMTestSchema = BaseTestSchema.extend({
|
|
1316
|
-
type:
|
|
1350
|
+
type: import_zod11.z.literal("LLM" /* LLM */),
|
|
1317
1351
|
/** Maximum steps for the LLM to take */
|
|
1318
|
-
maxSteps:
|
|
1352
|
+
maxSteps: import_zod11.z.number().min(1).max(100),
|
|
1319
1353
|
/** Prompt to send to the evaluator */
|
|
1320
|
-
prompt:
|
|
1354
|
+
prompt: import_zod11.z.string().min(1),
|
|
1321
1355
|
/** ID of the evaluator agent to use */
|
|
1322
|
-
evaluatorId:
|
|
1356
|
+
evaluatorId: import_zod11.z.string()
|
|
1323
1357
|
});
|
|
1324
1358
|
|
|
1325
1359
|
// src/test/tool.ts
|
|
1326
|
-
var
|
|
1360
|
+
var import_zod12 = require("zod");
|
|
1327
1361
|
var ToolTestSchema = BaseTestSchema.extend({
|
|
1328
|
-
type:
|
|
1362
|
+
type: import_zod12.z.literal("TOOL" /* TOOL */),
|
|
1329
1363
|
/** Name of the tool that should be called */
|
|
1330
|
-
toolName:
|
|
1364
|
+
toolName: import_zod12.z.string().min(3),
|
|
1331
1365
|
/** Expected arguments for the tool call */
|
|
1332
|
-
args:
|
|
1366
|
+
args: import_zod12.z.record(import_zod12.z.string(), import_zod12.z.any()),
|
|
1333
1367
|
/** Expected content in the tool results */
|
|
1334
|
-
resultsContent:
|
|
1368
|
+
resultsContent: import_zod12.z.string()
|
|
1335
1369
|
});
|
|
1336
1370
|
|
|
1337
1371
|
// src/test/site-config.ts
|
|
1338
|
-
var
|
|
1372
|
+
var import_zod13 = require("zod");
|
|
1339
1373
|
var SiteConfigTestSchema = BaseTestSchema.extend({
|
|
1340
|
-
type:
|
|
1374
|
+
type: import_zod13.z.literal("SITE_CONFIG" /* SITE_CONFIG */),
|
|
1341
1375
|
/** URL to call */
|
|
1342
|
-
url:
|
|
1376
|
+
url: import_zod13.z.string().url(),
|
|
1343
1377
|
/** HTTP method */
|
|
1344
|
-
method:
|
|
1378
|
+
method: import_zod13.z.enum(["GET", "POST"]),
|
|
1345
1379
|
/** Request body (for POST) */
|
|
1346
|
-
body:
|
|
1380
|
+
body: import_zod13.z.string().optional(),
|
|
1347
1381
|
/** Expected HTTP status code */
|
|
1348
|
-
expectedStatusCode:
|
|
1382
|
+
expectedStatusCode: import_zod13.z.number().int().min(100).max(599),
|
|
1349
1383
|
/** Expected response content */
|
|
1350
|
-
expectedResponse:
|
|
1384
|
+
expectedResponse: import_zod13.z.string().optional(),
|
|
1351
1385
|
/** JMESPath expression to extract from response */
|
|
1352
|
-
expectedResponseJMESPath:
|
|
1386
|
+
expectedResponseJMESPath: import_zod13.z.string().optional()
|
|
1353
1387
|
});
|
|
1354
1388
|
|
|
1355
1389
|
// src/test/command-execution.ts
|
|
1356
|
-
var
|
|
1390
|
+
var import_zod14 = require("zod");
|
|
1357
1391
|
var AllowedCommands = [
|
|
1358
1392
|
"yarn install --no-immutable && yarn build",
|
|
1359
1393
|
"npm run build",
|
|
1360
1394
|
"yarn typecheck"
|
|
1361
1395
|
];
|
|
1362
1396
|
var CommandExecutionTestSchema = BaseTestSchema.extend({
|
|
1363
|
-
type:
|
|
1397
|
+
type: import_zod14.z.literal("COMMAND_EXECUTION" /* COMMAND_EXECUTION */),
|
|
1364
1398
|
/** Command to execute (must be in AllowedCommands) */
|
|
1365
|
-
command:
|
|
1399
|
+
command: import_zod14.z.string().refine((value) => AllowedCommands.includes(value), {
|
|
1366
1400
|
message: `Command must be one of: ${AllowedCommands.join(", ")}`
|
|
1367
1401
|
}),
|
|
1368
1402
|
/** Expected exit code (default: 0) */
|
|
1369
|
-
expectedExitCode:
|
|
1403
|
+
expectedExitCode: import_zod14.z.number().default(0).optional()
|
|
1370
1404
|
});
|
|
1371
1405
|
|
|
1372
1406
|
// src/test/file-presence.ts
|
|
1373
|
-
var
|
|
1407
|
+
var import_zod15 = require("zod");
|
|
1374
1408
|
var FilePresenceTestSchema = BaseTestSchema.extend({
|
|
1375
|
-
type:
|
|
1409
|
+
type: import_zod15.z.literal("FILE_PRESENCE" /* FILE_PRESENCE */),
|
|
1376
1410
|
/** Paths to check */
|
|
1377
|
-
paths:
|
|
1411
|
+
paths: import_zod15.z.array(import_zod15.z.string()),
|
|
1378
1412
|
/** Whether files should exist (true) or not exist (false) */
|
|
1379
|
-
shouldExist:
|
|
1413
|
+
shouldExist: import_zod15.z.boolean()
|
|
1380
1414
|
});
|
|
1381
1415
|
|
|
1382
1416
|
// src/test/file-content.ts
|
|
1383
|
-
var
|
|
1384
|
-
var FileContentCheckSchema =
|
|
1417
|
+
var import_zod16 = require("zod");
|
|
1418
|
+
var FileContentCheckSchema = import_zod16.z.object({
|
|
1385
1419
|
/** Strings that must be present in the file */
|
|
1386
|
-
contains:
|
|
1420
|
+
contains: import_zod16.z.array(import_zod16.z.string()).optional(),
|
|
1387
1421
|
/** Strings that must NOT be present in the file */
|
|
1388
|
-
notContains:
|
|
1422
|
+
notContains: import_zod16.z.array(import_zod16.z.string()).optional(),
|
|
1389
1423
|
/** Regex pattern the content must match */
|
|
1390
|
-
matches:
|
|
1424
|
+
matches: import_zod16.z.string().optional(),
|
|
1391
1425
|
/** JSON path checks for structured content */
|
|
1392
|
-
jsonPath:
|
|
1393
|
-
|
|
1394
|
-
path:
|
|
1395
|
-
value:
|
|
1426
|
+
jsonPath: import_zod16.z.array(
|
|
1427
|
+
import_zod16.z.object({
|
|
1428
|
+
path: import_zod16.z.string(),
|
|
1429
|
+
value: import_zod16.z.unknown()
|
|
1396
1430
|
})
|
|
1397
1431
|
).optional(),
|
|
1398
1432
|
/** Lines that should be added (for diff checking) */
|
|
1399
|
-
added:
|
|
1433
|
+
added: import_zod16.z.array(import_zod16.z.string()).optional(),
|
|
1400
1434
|
/** Lines that should be removed (for diff checking) */
|
|
1401
|
-
removed:
|
|
1435
|
+
removed: import_zod16.z.array(import_zod16.z.string()).optional()
|
|
1402
1436
|
});
|
|
1403
1437
|
var FileContentTestSchema = BaseTestSchema.extend({
|
|
1404
|
-
type:
|
|
1438
|
+
type: import_zod16.z.literal("FILE_CONTENT" /* FILE_CONTENT */),
|
|
1405
1439
|
/** Path to the file to check */
|
|
1406
|
-
path:
|
|
1440
|
+
path: import_zod16.z.string(),
|
|
1407
1441
|
/** Content checks to perform */
|
|
1408
1442
|
checks: FileContentCheckSchema
|
|
1409
1443
|
});
|
|
1410
1444
|
|
|
1411
1445
|
// src/test/build-check.ts
|
|
1412
|
-
var
|
|
1446
|
+
var import_zod17 = require("zod");
|
|
1413
1447
|
var BuildCheckTestSchema = BaseTestSchema.extend({
|
|
1414
|
-
type:
|
|
1448
|
+
type: import_zod17.z.literal("BUILD_CHECK" /* BUILD_CHECK */),
|
|
1415
1449
|
/** Build command to execute */
|
|
1416
|
-
command:
|
|
1450
|
+
command: import_zod17.z.string(),
|
|
1417
1451
|
/** Whether the build should succeed */
|
|
1418
|
-
expectSuccess:
|
|
1452
|
+
expectSuccess: import_zod17.z.boolean(),
|
|
1419
1453
|
/** Maximum allowed warnings (optional) */
|
|
1420
|
-
allowedWarnings:
|
|
1454
|
+
allowedWarnings: import_zod17.z.number().optional(),
|
|
1421
1455
|
/** Timeout in milliseconds */
|
|
1422
|
-
timeout:
|
|
1456
|
+
timeout: import_zod17.z.number().optional()
|
|
1423
1457
|
});
|
|
1424
1458
|
|
|
1425
1459
|
// src/test/vitest.ts
|
|
1426
|
-
var
|
|
1460
|
+
var import_zod18 = require("zod");
|
|
1427
1461
|
var VitestTestSchema = BaseTestSchema.extend({
|
|
1428
|
-
type:
|
|
1462
|
+
type: import_zod18.z.literal("VITEST" /* VITEST */),
|
|
1429
1463
|
/** Test file content */
|
|
1430
|
-
testFile:
|
|
1464
|
+
testFile: import_zod18.z.string(),
|
|
1431
1465
|
/** Name of the test file */
|
|
1432
|
-
testFileName:
|
|
1466
|
+
testFileName: import_zod18.z.string(),
|
|
1433
1467
|
/** Minimum pass rate required (0-100) */
|
|
1434
|
-
minPassRate:
|
|
1468
|
+
minPassRate: import_zod18.z.number().min(0).max(100)
|
|
1435
1469
|
});
|
|
1436
1470
|
|
|
1437
1471
|
// src/test/playwright-nl.ts
|
|
1438
|
-
var
|
|
1472
|
+
var import_zod19 = require("zod");
|
|
1439
1473
|
var PlaywrightNLTestSchema = BaseTestSchema.extend({
|
|
1440
|
-
type:
|
|
1474
|
+
type: import_zod19.z.literal("PLAYWRIGHT_NL" /* PLAYWRIGHT_NL */),
|
|
1441
1475
|
/** Natural language steps to execute */
|
|
1442
|
-
steps:
|
|
1476
|
+
steps: import_zod19.z.array(import_zod19.z.string()),
|
|
1443
1477
|
/** Expected outcome description */
|
|
1444
|
-
expectedOutcome:
|
|
1478
|
+
expectedOutcome: import_zod19.z.string(),
|
|
1445
1479
|
/** Timeout in milliseconds */
|
|
1446
|
-
timeout:
|
|
1480
|
+
timeout: import_zod19.z.number().optional()
|
|
1447
1481
|
});
|
|
1448
1482
|
|
|
1449
1483
|
// src/test/index.ts
|
|
1450
|
-
var TestSchema =
|
|
1484
|
+
var TestSchema = import_zod20.z.discriminatedUnion("type", [
|
|
1451
1485
|
LLMTestSchema,
|
|
1452
1486
|
ToolTestSchema,
|
|
1453
1487
|
SiteConfigTestSchema,
|
|
@@ -1460,44 +1494,52 @@ var TestSchema = import_zod19.z.discriminatedUnion("type", [
|
|
|
1460
1494
|
]);
|
|
1461
1495
|
|
|
1462
1496
|
// src/scenario/assertions.ts
|
|
1463
|
-
var
|
|
1464
|
-
var SkillWasCalledAssertionSchema =
|
|
1465
|
-
type:
|
|
1497
|
+
var import_zod21 = require("zod");
|
|
1498
|
+
var SkillWasCalledAssertionSchema = import_zod21.z.object({
|
|
1499
|
+
type: import_zod21.z.literal("skill_was_called"),
|
|
1466
1500
|
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
1467
|
-
skillNames:
|
|
1501
|
+
skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
|
|
1468
1502
|
});
|
|
1469
|
-
var
|
|
1470
|
-
type:
|
|
1503
|
+
var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
|
|
1504
|
+
type: import_zod21.z.literal("tool_called_with_param"),
|
|
1505
|
+
/** Name of the tool that must have been called */
|
|
1506
|
+
toolName: import_zod21.z.string().min(1),
|
|
1507
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1508
|
+
expectedParams: import_zod21.z.string().min(1)
|
|
1509
|
+
});
|
|
1510
|
+
var BuildPassedAssertionSchema = import_zod21.z.object({
|
|
1511
|
+
type: import_zod21.z.literal("build_passed"),
|
|
1471
1512
|
/** Command to run (default: "yarn build") */
|
|
1472
|
-
command:
|
|
1513
|
+
command: import_zod21.z.string().optional(),
|
|
1473
1514
|
/** Expected exit code (default: 0) */
|
|
1474
|
-
expectedExitCode:
|
|
1515
|
+
expectedExitCode: import_zod21.z.number().int().optional()
|
|
1475
1516
|
});
|
|
1476
|
-
var CostAssertionSchema =
|
|
1477
|
-
type:
|
|
1517
|
+
var CostAssertionSchema = import_zod21.z.object({
|
|
1518
|
+
type: import_zod21.z.literal("cost"),
|
|
1478
1519
|
/** Maximum allowed cost in USD */
|
|
1479
|
-
maxCostUsd:
|
|
1520
|
+
maxCostUsd: import_zod21.z.number().positive()
|
|
1480
1521
|
});
|
|
1481
|
-
var LlmJudgeAssertionSchema =
|
|
1482
|
-
type:
|
|
1522
|
+
var LlmJudgeAssertionSchema = import_zod21.z.object({
|
|
1523
|
+
type: import_zod21.z.literal("llm_judge"),
|
|
1483
1524
|
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1484
|
-
prompt:
|
|
1525
|
+
prompt: import_zod21.z.string(),
|
|
1485
1526
|
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
1486
|
-
systemPrompt:
|
|
1527
|
+
systemPrompt: import_zod21.z.string().optional(),
|
|
1487
1528
|
/** Minimum score to pass (0-100, default 70) */
|
|
1488
|
-
minScore:
|
|
1529
|
+
minScore: import_zod21.z.number().int().min(0).max(100).optional(),
|
|
1489
1530
|
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
1490
|
-
model:
|
|
1491
|
-
maxTokens:
|
|
1492
|
-
temperature:
|
|
1531
|
+
model: import_zod21.z.string().optional(),
|
|
1532
|
+
maxTokens: import_zod21.z.number().int().optional(),
|
|
1533
|
+
temperature: import_zod21.z.number().min(0).max(1).optional()
|
|
1493
1534
|
});
|
|
1494
|
-
var TimeAssertionSchema =
|
|
1495
|
-
type:
|
|
1535
|
+
var TimeAssertionSchema = import_zod21.z.object({
|
|
1536
|
+
type: import_zod21.z.literal("time_limit"),
|
|
1496
1537
|
/** Maximum allowed duration in milliseconds */
|
|
1497
|
-
maxDurationMs:
|
|
1538
|
+
maxDurationMs: import_zod21.z.number().int().positive()
|
|
1498
1539
|
});
|
|
1499
|
-
var AssertionSchema =
|
|
1540
|
+
var AssertionSchema = import_zod21.z.union([
|
|
1500
1541
|
SkillWasCalledAssertionSchema,
|
|
1542
|
+
ToolCalledWithParamAssertionSchema,
|
|
1501
1543
|
BuildPassedAssertionSchema,
|
|
1502
1544
|
TimeAssertionSchema,
|
|
1503
1545
|
CostAssertionSchema,
|
|
@@ -1505,33 +1547,33 @@ var AssertionSchema = import_zod20.z.union([
|
|
|
1505
1547
|
]);
|
|
1506
1548
|
|
|
1507
1549
|
// src/scenario/environment.ts
|
|
1508
|
-
var
|
|
1509
|
-
var LocalProjectConfigSchema =
|
|
1550
|
+
var import_zod22 = require("zod");
|
|
1551
|
+
var LocalProjectConfigSchema = import_zod22.z.object({
|
|
1510
1552
|
/** Template ID to use for the local project */
|
|
1511
|
-
templateId:
|
|
1553
|
+
templateId: import_zod22.z.string().optional(),
|
|
1512
1554
|
/** Files to create in the project */
|
|
1513
|
-
files:
|
|
1514
|
-
|
|
1515
|
-
path:
|
|
1516
|
-
content:
|
|
1555
|
+
files: import_zod22.z.array(
|
|
1556
|
+
import_zod22.z.object({
|
|
1557
|
+
path: import_zod22.z.string().min(1),
|
|
1558
|
+
content: import_zod22.z.string().min(1)
|
|
1517
1559
|
})
|
|
1518
1560
|
).optional()
|
|
1519
1561
|
});
|
|
1520
|
-
var MetaSiteConfigSchema =
|
|
1521
|
-
configurations:
|
|
1522
|
-
|
|
1523
|
-
name:
|
|
1524
|
-
apiCalls:
|
|
1525
|
-
|
|
1526
|
-
url:
|
|
1527
|
-
method:
|
|
1528
|
-
body:
|
|
1562
|
+
var MetaSiteConfigSchema = import_zod22.z.object({
|
|
1563
|
+
configurations: import_zod22.z.array(
|
|
1564
|
+
import_zod22.z.object({
|
|
1565
|
+
name: import_zod22.z.string().min(1),
|
|
1566
|
+
apiCalls: import_zod22.z.array(
|
|
1567
|
+
import_zod22.z.object({
|
|
1568
|
+
url: import_zod22.z.string().url(),
|
|
1569
|
+
method: import_zod22.z.enum(["POST", "PUT"]),
|
|
1570
|
+
body: import_zod22.z.string()
|
|
1529
1571
|
})
|
|
1530
1572
|
)
|
|
1531
1573
|
})
|
|
1532
1574
|
).optional()
|
|
1533
1575
|
});
|
|
1534
|
-
var EnvironmentSchema =
|
|
1576
|
+
var EnvironmentSchema = import_zod22.z.object({
|
|
1535
1577
|
/** Local project configuration */
|
|
1536
1578
|
localProject: LocalProjectConfigSchema.optional(),
|
|
1537
1579
|
/** Meta site configuration */
|
|
@@ -1539,64 +1581,71 @@ var EnvironmentSchema = import_zod21.z.object({
|
|
|
1539
1581
|
});
|
|
1540
1582
|
|
|
1541
1583
|
// src/scenario/test-scenario.ts
|
|
1542
|
-
var
|
|
1584
|
+
var import_zod24 = require("zod");
|
|
1543
1585
|
|
|
1544
1586
|
// src/assertion/assertion.ts
|
|
1545
|
-
var
|
|
1546
|
-
var AssertionTypeSchema =
|
|
1587
|
+
var import_zod23 = require("zod");
|
|
1588
|
+
var AssertionTypeSchema = import_zod23.z.enum([
|
|
1547
1589
|
"skill_was_called",
|
|
1590
|
+
"tool_called_with_param",
|
|
1548
1591
|
"build_passed",
|
|
1549
1592
|
"time_limit",
|
|
1550
1593
|
"cost",
|
|
1551
1594
|
"llm_judge"
|
|
1552
1595
|
]);
|
|
1553
|
-
var AssertionParameterTypeSchema =
|
|
1596
|
+
var AssertionParameterTypeSchema = import_zod23.z.enum([
|
|
1554
1597
|
"string",
|
|
1555
1598
|
"number",
|
|
1556
1599
|
"boolean"
|
|
1557
1600
|
]);
|
|
1558
|
-
var AssertionParameterSchema =
|
|
1601
|
+
var AssertionParameterSchema = import_zod23.z.object({
|
|
1559
1602
|
/** Parameter name (used as key in params object) */
|
|
1560
|
-
name:
|
|
1603
|
+
name: import_zod23.z.string().min(1),
|
|
1561
1604
|
/** Display label for the parameter */
|
|
1562
|
-
label:
|
|
1605
|
+
label: import_zod23.z.string().min(1),
|
|
1563
1606
|
/** Parameter type */
|
|
1564
1607
|
type: AssertionParameterTypeSchema,
|
|
1565
1608
|
/** Whether this parameter is required */
|
|
1566
|
-
required:
|
|
1609
|
+
required: import_zod23.z.boolean(),
|
|
1567
1610
|
/** Default value (optional, used when not provided) */
|
|
1568
|
-
defaultValue:
|
|
1611
|
+
defaultValue: import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean()]).optional(),
|
|
1569
1612
|
/** If true, parameter is hidden by default behind "Show advanced options" */
|
|
1570
|
-
advanced:
|
|
1613
|
+
advanced: import_zod23.z.boolean().optional()
|
|
1571
1614
|
});
|
|
1572
|
-
var ScenarioAssertionLinkSchema =
|
|
1615
|
+
var ScenarioAssertionLinkSchema = import_zod23.z.object({
|
|
1573
1616
|
/** ID of the assertion (can be system assertion like 'system:skill_was_called' or custom assertion UUID) */
|
|
1574
|
-
assertionId:
|
|
1617
|
+
assertionId: import_zod23.z.string(),
|
|
1575
1618
|
/** Parameter values for this assertion in this scenario */
|
|
1576
|
-
params:
|
|
1577
|
-
|
|
1578
|
-
|
|
1619
|
+
params: import_zod23.z.record(
|
|
1620
|
+
import_zod23.z.string(),
|
|
1621
|
+
import_zod23.z.union([import_zod23.z.string(), import_zod23.z.number(), import_zod23.z.boolean(), import_zod23.z.null()])
|
|
1579
1622
|
).optional()
|
|
1580
1623
|
});
|
|
1581
|
-
var SkillWasCalledConfigSchema =
|
|
1624
|
+
var SkillWasCalledConfigSchema = import_zod23.z.object({
|
|
1582
1625
|
/** Names of the skills that must have been called */
|
|
1583
|
-
skillNames:
|
|
1626
|
+
skillNames: import_zod23.z.array(import_zod23.z.string().min(1)).min(1)
|
|
1584
1627
|
});
|
|
1585
|
-
var CostConfigSchema =
|
|
1628
|
+
var CostConfigSchema = import_zod23.z.strictObject({
|
|
1586
1629
|
/** Maximum allowed cost in USD */
|
|
1587
|
-
maxCostUsd:
|
|
1630
|
+
maxCostUsd: import_zod23.z.number().positive()
|
|
1631
|
+
});
|
|
1632
|
+
var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
|
|
1633
|
+
/** Name of the tool that must have been called */
|
|
1634
|
+
toolName: import_zod23.z.string().min(1),
|
|
1635
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1636
|
+
expectedParams: import_zod23.z.string().min(1)
|
|
1588
1637
|
});
|
|
1589
|
-
var BuildPassedConfigSchema =
|
|
1638
|
+
var BuildPassedConfigSchema = import_zod23.z.strictObject({
|
|
1590
1639
|
/** Command to run (default: "yarn build") */
|
|
1591
|
-
command:
|
|
1640
|
+
command: import_zod23.z.string().optional(),
|
|
1592
1641
|
/** Expected exit code (default: 0) */
|
|
1593
|
-
expectedExitCode:
|
|
1642
|
+
expectedExitCode: import_zod23.z.number().int().optional()
|
|
1594
1643
|
});
|
|
1595
|
-
var TimeConfigSchema =
|
|
1644
|
+
var TimeConfigSchema = import_zod23.z.strictObject({
|
|
1596
1645
|
/** Maximum allowed duration in milliseconds */
|
|
1597
|
-
maxDurationMs:
|
|
1646
|
+
maxDurationMs: import_zod23.z.number().int().positive()
|
|
1598
1647
|
});
|
|
1599
|
-
var LlmJudgeConfigSchema =
|
|
1648
|
+
var LlmJudgeConfigSchema = import_zod23.z.object({
|
|
1600
1649
|
/**
|
|
1601
1650
|
* Prompt template with placeholders:
|
|
1602
1651
|
* - {{output}}: agent's final output
|
|
@@ -1607,32 +1656,34 @@ var LlmJudgeConfigSchema = import_zod22.z.object({
|
|
|
1607
1656
|
* - {{trace}}: step-by-step trace of tool calls
|
|
1608
1657
|
* - Custom parameters defined in the parameters array
|
|
1609
1658
|
*/
|
|
1610
|
-
prompt:
|
|
1659
|
+
prompt: import_zod23.z.string().min(1),
|
|
1611
1660
|
/** Optional system prompt for the judge */
|
|
1612
|
-
systemPrompt:
|
|
1661
|
+
systemPrompt: import_zod23.z.string().optional(),
|
|
1613
1662
|
/** Minimum score to pass (0-100, default 70) */
|
|
1614
|
-
minScore:
|
|
1663
|
+
minScore: import_zod23.z.number().int().min(0).max(100).optional(),
|
|
1615
1664
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
1616
|
-
model:
|
|
1665
|
+
model: import_zod23.z.string().optional(),
|
|
1617
1666
|
/** Max output tokens */
|
|
1618
|
-
maxTokens:
|
|
1667
|
+
maxTokens: import_zod23.z.number().int().optional(),
|
|
1619
1668
|
/** Temperature (0-1) */
|
|
1620
|
-
temperature:
|
|
1669
|
+
temperature: import_zod23.z.number().min(0).max(1).optional(),
|
|
1621
1670
|
/** User-defined parameters for this assertion */
|
|
1622
|
-
parameters:
|
|
1671
|
+
parameters: import_zod23.z.array(AssertionParameterSchema).optional()
|
|
1623
1672
|
});
|
|
1624
|
-
var AssertionConfigSchema =
|
|
1673
|
+
var AssertionConfigSchema = import_zod23.z.union([
|
|
1625
1674
|
LlmJudgeConfigSchema,
|
|
1626
1675
|
// requires prompt - check first
|
|
1627
1676
|
SkillWasCalledConfigSchema,
|
|
1628
1677
|
// requires skillNames
|
|
1678
|
+
ToolCalledWithParamConfigSchema,
|
|
1679
|
+
// requires toolName + expectedParams, uses strictObject
|
|
1629
1680
|
TimeConfigSchema,
|
|
1630
1681
|
// requires maxDurationMs, uses strictObject
|
|
1631
1682
|
CostConfigSchema,
|
|
1632
1683
|
// requires maxCostUsd, uses strictObject
|
|
1633
1684
|
BuildPassedConfigSchema,
|
|
1634
1685
|
// all optional, uses strictObject to reject unknown keys
|
|
1635
|
-
|
|
1686
|
+
import_zod23.z.object({})
|
|
1636
1687
|
// fallback empty config
|
|
1637
1688
|
]);
|
|
1638
1689
|
var CustomAssertionSchema = TenantEntitySchema.extend({
|
|
@@ -1654,6 +1705,8 @@ function validateAssertionConfig(type, config) {
|
|
|
1654
1705
|
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
1655
1706
|
case "cost":
|
|
1656
1707
|
return CostConfigSchema.safeParse(config).success;
|
|
1708
|
+
case "tool_called_with_param":
|
|
1709
|
+
return ToolCalledWithParamConfigSchema.safeParse(config).success;
|
|
1657
1710
|
case "build_passed":
|
|
1658
1711
|
return BuildPassedConfigSchema.safeParse(config).success;
|
|
1659
1712
|
case "time_limit":
|
|
@@ -1681,23 +1734,23 @@ function getLlmJudgeConfig(assertion) {
|
|
|
1681
1734
|
}
|
|
1682
1735
|
|
|
1683
1736
|
// src/scenario/test-scenario.ts
|
|
1684
|
-
var ExpectedFileSchema =
|
|
1737
|
+
var ExpectedFileSchema = import_zod24.z.object({
|
|
1685
1738
|
/** Relative path where the file should be created */
|
|
1686
|
-
path:
|
|
1739
|
+
path: import_zod24.z.string(),
|
|
1687
1740
|
/** Optional expected content */
|
|
1688
|
-
content:
|
|
1741
|
+
content: import_zod24.z.string().optional()
|
|
1689
1742
|
});
|
|
1690
1743
|
var TestScenarioSchema = TenantEntitySchema.extend({
|
|
1691
1744
|
/** The prompt sent to the agent to trigger the task */
|
|
1692
|
-
triggerPrompt:
|
|
1745
|
+
triggerPrompt: import_zod24.z.string().min(10),
|
|
1693
1746
|
/** ID of the template to use for this scenario (null = no template) */
|
|
1694
|
-
templateId:
|
|
1747
|
+
templateId: import_zod24.z.string().nullish(),
|
|
1695
1748
|
/** Inline assertions to evaluate for this scenario (legacy) */
|
|
1696
|
-
assertions:
|
|
1749
|
+
assertions: import_zod24.z.array(AssertionSchema).optional(),
|
|
1697
1750
|
/** IDs of saved assertions to evaluate (from assertions table) - legacy, use assertionLinks */
|
|
1698
|
-
assertionIds:
|
|
1751
|
+
assertionIds: import_zod24.z.array(import_zod24.z.string()).optional(),
|
|
1699
1752
|
/** Linked assertions with per-scenario parameter values */
|
|
1700
|
-
assertionLinks:
|
|
1753
|
+
assertionLinks: import_zod24.z.array(ScenarioAssertionLinkSchema).optional()
|
|
1701
1754
|
});
|
|
1702
1755
|
var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
1703
1756
|
id: true,
|
|
@@ -1708,10 +1761,10 @@ var CreateTestScenarioInputSchema = TestScenarioSchema.omit({
|
|
|
1708
1761
|
var UpdateTestScenarioInputSchema = CreateTestScenarioInputSchema.partial();
|
|
1709
1762
|
|
|
1710
1763
|
// src/suite/test-suite.ts
|
|
1711
|
-
var
|
|
1764
|
+
var import_zod25 = require("zod");
|
|
1712
1765
|
var TestSuiteSchema = TenantEntitySchema.extend({
|
|
1713
1766
|
/** IDs of test scenarios in this suite */
|
|
1714
|
-
scenarioIds:
|
|
1767
|
+
scenarioIds: import_zod25.z.array(import_zod25.z.string())
|
|
1715
1768
|
});
|
|
1716
1769
|
var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
1717
1770
|
id: true,
|
|
@@ -1722,21 +1775,21 @@ var CreateTestSuiteInputSchema = TestSuiteSchema.omit({
|
|
|
1722
1775
|
var UpdateTestSuiteInputSchema = CreateTestSuiteInputSchema.partial();
|
|
1723
1776
|
|
|
1724
1777
|
// src/evaluation/metrics.ts
|
|
1725
|
-
var
|
|
1726
|
-
var TokenUsageSchema =
|
|
1727
|
-
prompt:
|
|
1728
|
-
completion:
|
|
1729
|
-
total:
|
|
1730
|
-
});
|
|
1731
|
-
var EvalMetricsSchema =
|
|
1732
|
-
totalAssertions:
|
|
1733
|
-
passed:
|
|
1734
|
-
failed:
|
|
1735
|
-
skipped:
|
|
1736
|
-
errors:
|
|
1737
|
-
passRate:
|
|
1738
|
-
avgDuration:
|
|
1739
|
-
totalDuration:
|
|
1778
|
+
var import_zod26 = require("zod");
|
|
1779
|
+
var TokenUsageSchema = import_zod26.z.object({
|
|
1780
|
+
prompt: import_zod26.z.number(),
|
|
1781
|
+
completion: import_zod26.z.number(),
|
|
1782
|
+
total: import_zod26.z.number()
|
|
1783
|
+
});
|
|
1784
|
+
var EvalMetricsSchema = import_zod26.z.object({
|
|
1785
|
+
totalAssertions: import_zod26.z.number(),
|
|
1786
|
+
passed: import_zod26.z.number(),
|
|
1787
|
+
failed: import_zod26.z.number(),
|
|
1788
|
+
skipped: import_zod26.z.number(),
|
|
1789
|
+
errors: import_zod26.z.number(),
|
|
1790
|
+
passRate: import_zod26.z.number(),
|
|
1791
|
+
avgDuration: import_zod26.z.number(),
|
|
1792
|
+
totalDuration: import_zod26.z.number()
|
|
1740
1793
|
});
|
|
1741
1794
|
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
1742
1795
|
EvalStatus2["PENDING"] = "pending";
|
|
@@ -1746,7 +1799,7 @@ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
|
1746
1799
|
EvalStatus2["CANCELLED"] = "cancelled";
|
|
1747
1800
|
return EvalStatus2;
|
|
1748
1801
|
})(EvalStatus || {});
|
|
1749
|
-
var EvalStatusSchema =
|
|
1802
|
+
var EvalStatusSchema = import_zod26.z.enum(EvalStatus);
|
|
1750
1803
|
var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
1751
1804
|
LLMStepType2["COMPLETION"] = "completion";
|
|
1752
1805
|
LLMStepType2["TOOL_USE"] = "tool_use";
|
|
@@ -1754,52 +1807,52 @@ var LLMStepType = /* @__PURE__ */ ((LLMStepType2) => {
|
|
|
1754
1807
|
LLMStepType2["THINKING"] = "thinking";
|
|
1755
1808
|
return LLMStepType2;
|
|
1756
1809
|
})(LLMStepType || {});
|
|
1757
|
-
var LLMTraceStepSchema =
|
|
1758
|
-
id:
|
|
1759
|
-
stepNumber:
|
|
1760
|
-
type:
|
|
1761
|
-
model:
|
|
1762
|
-
provider:
|
|
1763
|
-
startedAt:
|
|
1764
|
-
durationMs:
|
|
1810
|
+
var LLMTraceStepSchema = import_zod26.z.object({
|
|
1811
|
+
id: import_zod26.z.string(),
|
|
1812
|
+
stepNumber: import_zod26.z.number(),
|
|
1813
|
+
type: import_zod26.z.enum(LLMStepType),
|
|
1814
|
+
model: import_zod26.z.string(),
|
|
1815
|
+
provider: import_zod26.z.string(),
|
|
1816
|
+
startedAt: import_zod26.z.string(),
|
|
1817
|
+
durationMs: import_zod26.z.number(),
|
|
1765
1818
|
tokenUsage: TokenUsageSchema,
|
|
1766
|
-
costUsd:
|
|
1767
|
-
toolName:
|
|
1768
|
-
toolArguments:
|
|
1769
|
-
inputPreview:
|
|
1770
|
-
outputPreview:
|
|
1771
|
-
success:
|
|
1772
|
-
error:
|
|
1773
|
-
});
|
|
1774
|
-
var LLMBreakdownStatsSchema =
|
|
1775
|
-
count:
|
|
1776
|
-
durationMs:
|
|
1777
|
-
tokens:
|
|
1778
|
-
costUsd:
|
|
1779
|
-
});
|
|
1780
|
-
var LLMTraceSummarySchema =
|
|
1781
|
-
totalSteps:
|
|
1782
|
-
totalDurationMs:
|
|
1819
|
+
costUsd: import_zod26.z.number(),
|
|
1820
|
+
toolName: import_zod26.z.string().optional(),
|
|
1821
|
+
toolArguments: import_zod26.z.string().optional(),
|
|
1822
|
+
inputPreview: import_zod26.z.string().optional(),
|
|
1823
|
+
outputPreview: import_zod26.z.string().optional(),
|
|
1824
|
+
success: import_zod26.z.boolean(),
|
|
1825
|
+
error: import_zod26.z.string().optional()
|
|
1826
|
+
});
|
|
1827
|
+
var LLMBreakdownStatsSchema = import_zod26.z.object({
|
|
1828
|
+
count: import_zod26.z.number(),
|
|
1829
|
+
durationMs: import_zod26.z.number(),
|
|
1830
|
+
tokens: import_zod26.z.number(),
|
|
1831
|
+
costUsd: import_zod26.z.number()
|
|
1832
|
+
});
|
|
1833
|
+
var LLMTraceSummarySchema = import_zod26.z.object({
|
|
1834
|
+
totalSteps: import_zod26.z.number(),
|
|
1835
|
+
totalDurationMs: import_zod26.z.number(),
|
|
1783
1836
|
totalTokens: TokenUsageSchema,
|
|
1784
|
-
totalCostUsd:
|
|
1785
|
-
stepTypeBreakdown:
|
|
1786
|
-
modelBreakdown:
|
|
1787
|
-
modelsUsed:
|
|
1788
|
-
});
|
|
1789
|
-
var LLMTraceSchema =
|
|
1790
|
-
id:
|
|
1791
|
-
steps:
|
|
1837
|
+
totalCostUsd: import_zod26.z.number(),
|
|
1838
|
+
stepTypeBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema).optional(),
|
|
1839
|
+
modelBreakdown: import_zod26.z.record(import_zod26.z.string(), LLMBreakdownStatsSchema),
|
|
1840
|
+
modelsUsed: import_zod26.z.array(import_zod26.z.string())
|
|
1841
|
+
});
|
|
1842
|
+
var LLMTraceSchema = import_zod26.z.object({
|
|
1843
|
+
id: import_zod26.z.string(),
|
|
1844
|
+
steps: import_zod26.z.array(LLMTraceStepSchema),
|
|
1792
1845
|
summary: LLMTraceSummarySchema
|
|
1793
1846
|
});
|
|
1794
1847
|
|
|
1795
1848
|
// src/evaluation/eval-result.ts
|
|
1796
|
-
var
|
|
1849
|
+
var import_zod29 = require("zod");
|
|
1797
1850
|
|
|
1798
1851
|
// src/evaluation/eval-run.ts
|
|
1799
|
-
var
|
|
1852
|
+
var import_zod28 = require("zod");
|
|
1800
1853
|
|
|
1801
1854
|
// src/evaluation/live-trace.ts
|
|
1802
|
-
var
|
|
1855
|
+
var import_zod27 = require("zod");
|
|
1803
1856
|
var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
1804
1857
|
LiveTraceEventType2["THINKING"] = "thinking";
|
|
1805
1858
|
LiveTraceEventType2["TOOL_USE"] = "tool_use";
|
|
@@ -1813,37 +1866,37 @@ var LiveTraceEventType = /* @__PURE__ */ ((LiveTraceEventType2) => {
|
|
|
1813
1866
|
LiveTraceEventType2["USER"] = "user";
|
|
1814
1867
|
return LiveTraceEventType2;
|
|
1815
1868
|
})(LiveTraceEventType || {});
|
|
1816
|
-
var LiveTraceEventSchema =
|
|
1869
|
+
var LiveTraceEventSchema = import_zod27.z.object({
|
|
1817
1870
|
/** The evaluation run ID */
|
|
1818
|
-
evalRunId:
|
|
1871
|
+
evalRunId: import_zod27.z.string(),
|
|
1819
1872
|
/** The scenario ID being executed */
|
|
1820
|
-
scenarioId:
|
|
1873
|
+
scenarioId: import_zod27.z.string(),
|
|
1821
1874
|
/** The scenario name for display */
|
|
1822
|
-
scenarioName:
|
|
1875
|
+
scenarioName: import_zod27.z.string(),
|
|
1823
1876
|
/** The target ID (skill, agent, etc.) */
|
|
1824
|
-
targetId:
|
|
1877
|
+
targetId: import_zod27.z.string(),
|
|
1825
1878
|
/** The target name for display */
|
|
1826
|
-
targetName:
|
|
1879
|
+
targetName: import_zod27.z.string(),
|
|
1827
1880
|
/** Step number in the current scenario execution */
|
|
1828
|
-
stepNumber:
|
|
1881
|
+
stepNumber: import_zod27.z.number(),
|
|
1829
1882
|
/** Type of trace event */
|
|
1830
|
-
type:
|
|
1883
|
+
type: import_zod27.z.enum(LiveTraceEventType),
|
|
1831
1884
|
/** Tool name if this is a tool_use event */
|
|
1832
|
-
toolName:
|
|
1885
|
+
toolName: import_zod27.z.string().optional(),
|
|
1833
1886
|
/** Tool arguments preview (truncated JSON) */
|
|
1834
|
-
toolArgs:
|
|
1887
|
+
toolArgs: import_zod27.z.string().optional(),
|
|
1835
1888
|
/** Output preview (truncated text) */
|
|
1836
|
-
outputPreview:
|
|
1889
|
+
outputPreview: import_zod27.z.string().optional(),
|
|
1837
1890
|
/** File path for file operations */
|
|
1838
|
-
filePath:
|
|
1891
|
+
filePath: import_zod27.z.string().optional(),
|
|
1839
1892
|
/** Elapsed time in milliseconds for progress events */
|
|
1840
|
-
elapsedMs:
|
|
1893
|
+
elapsedMs: import_zod27.z.number().optional(),
|
|
1841
1894
|
/** Thinking/reasoning text from Claude */
|
|
1842
|
-
thinking:
|
|
1895
|
+
thinking: import_zod27.z.string().optional(),
|
|
1843
1896
|
/** Timestamp when this event occurred */
|
|
1844
|
-
timestamp:
|
|
1897
|
+
timestamp: import_zod27.z.string(),
|
|
1845
1898
|
/** Whether this is the final event for this scenario */
|
|
1846
|
-
isComplete:
|
|
1899
|
+
isComplete: import_zod27.z.boolean()
|
|
1847
1900
|
});
|
|
1848
1901
|
var TRACE_EVENT_PREFIX = "TRACE_EVENT:";
|
|
1849
1902
|
function parseTraceEventLine(line) {
|
|
@@ -1871,14 +1924,14 @@ var TriggerType = /* @__PURE__ */ ((TriggerType2) => {
|
|
|
1871
1924
|
TriggerType2["MANUAL"] = "MANUAL";
|
|
1872
1925
|
return TriggerType2;
|
|
1873
1926
|
})(TriggerType || {});
|
|
1874
|
-
var TriggerMetadataSchema =
|
|
1875
|
-
version:
|
|
1876
|
-
resourceUpdated:
|
|
1927
|
+
var TriggerMetadataSchema = import_zod28.z.object({
|
|
1928
|
+
version: import_zod28.z.string().optional(),
|
|
1929
|
+
resourceUpdated: import_zod28.z.array(import_zod28.z.string()).optional()
|
|
1877
1930
|
});
|
|
1878
|
-
var TriggerSchema =
|
|
1879
|
-
id:
|
|
1931
|
+
var TriggerSchema = import_zod28.z.object({
|
|
1932
|
+
id: import_zod28.z.string(),
|
|
1880
1933
|
metadata: TriggerMetadataSchema.optional(),
|
|
1881
|
-
type:
|
|
1934
|
+
type: import_zod28.z.enum(TriggerType)
|
|
1882
1935
|
});
|
|
1883
1936
|
var FailureCategory = /* @__PURE__ */ ((FailureCategory2) => {
|
|
1884
1937
|
FailureCategory2["MISSING_FILE"] = "missing_file";
|
|
@@ -1896,28 +1949,28 @@ var FailureSeverity = /* @__PURE__ */ ((FailureSeverity2) => {
|
|
|
1896
1949
|
FailureSeverity2["LOW"] = "low";
|
|
1897
1950
|
return FailureSeverity2;
|
|
1898
1951
|
})(FailureSeverity || {});
|
|
1899
|
-
var DiffLineTypeSchema =
|
|
1900
|
-
var DiffLineSchema =
|
|
1952
|
+
var DiffLineTypeSchema = import_zod28.z.enum(["added", "removed", "unchanged"]);
|
|
1953
|
+
var DiffLineSchema = import_zod28.z.object({
|
|
1901
1954
|
type: DiffLineTypeSchema,
|
|
1902
|
-
content:
|
|
1903
|
-
lineNumber:
|
|
1904
|
-
});
|
|
1905
|
-
var DiffContentSchema =
|
|
1906
|
-
path:
|
|
1907
|
-
expected:
|
|
1908
|
-
actual:
|
|
1909
|
-
diffLines:
|
|
1910
|
-
renamedFrom:
|
|
1911
|
-
});
|
|
1912
|
-
var CommandExecutionSchema =
|
|
1913
|
-
command:
|
|
1914
|
-
exitCode:
|
|
1915
|
-
output:
|
|
1916
|
-
duration:
|
|
1917
|
-
});
|
|
1918
|
-
var FileModificationSchema =
|
|
1919
|
-
path:
|
|
1920
|
-
action:
|
|
1955
|
+
content: import_zod28.z.string(),
|
|
1956
|
+
lineNumber: import_zod28.z.number()
|
|
1957
|
+
});
|
|
1958
|
+
var DiffContentSchema = import_zod28.z.object({
|
|
1959
|
+
path: import_zod28.z.string(),
|
|
1960
|
+
expected: import_zod28.z.string(),
|
|
1961
|
+
actual: import_zod28.z.string(),
|
|
1962
|
+
diffLines: import_zod28.z.array(DiffLineSchema),
|
|
1963
|
+
renamedFrom: import_zod28.z.string().optional()
|
|
1964
|
+
});
|
|
1965
|
+
var CommandExecutionSchema = import_zod28.z.object({
|
|
1966
|
+
command: import_zod28.z.string(),
|
|
1967
|
+
exitCode: import_zod28.z.number(),
|
|
1968
|
+
output: import_zod28.z.string().optional(),
|
|
1969
|
+
duration: import_zod28.z.number()
|
|
1970
|
+
});
|
|
1971
|
+
var FileModificationSchema = import_zod28.z.object({
|
|
1972
|
+
path: import_zod28.z.string(),
|
|
1973
|
+
action: import_zod28.z.enum(["created", "modified", "deleted"])
|
|
1921
1974
|
});
|
|
1922
1975
|
var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
1923
1976
|
TemplateFileStatus2["NEW"] = "new";
|
|
@@ -1925,81 +1978,83 @@ var TemplateFileStatus = /* @__PURE__ */ ((TemplateFileStatus2) => {
|
|
|
1925
1978
|
TemplateFileStatus2["UNCHANGED"] = "unchanged";
|
|
1926
1979
|
return TemplateFileStatus2;
|
|
1927
1980
|
})(TemplateFileStatus || {});
|
|
1928
|
-
var TemplateFileSchema =
|
|
1981
|
+
var TemplateFileSchema = import_zod28.z.object({
|
|
1929
1982
|
/** Relative path within the template */
|
|
1930
|
-
path:
|
|
1983
|
+
path: import_zod28.z.string(),
|
|
1931
1984
|
/** Full file content after execution */
|
|
1932
|
-
content:
|
|
1985
|
+
content: import_zod28.z.string(),
|
|
1933
1986
|
/** File status (new, modified, unchanged) */
|
|
1934
|
-
status:
|
|
1935
|
-
});
|
|
1936
|
-
var ApiCallSchema =
|
|
1937
|
-
endpoint:
|
|
1938
|
-
tokensUsed:
|
|
1939
|
-
duration:
|
|
1940
|
-
});
|
|
1941
|
-
var ExecutionTraceSchema =
|
|
1942
|
-
commands:
|
|
1943
|
-
filesModified:
|
|
1944
|
-
apiCalls:
|
|
1945
|
-
totalDuration:
|
|
1946
|
-
});
|
|
1947
|
-
var FailureAnalysisSchema =
|
|
1948
|
-
category:
|
|
1949
|
-
severity:
|
|
1950
|
-
summary:
|
|
1951
|
-
details:
|
|
1952
|
-
rootCause:
|
|
1953
|
-
suggestedFix:
|
|
1954
|
-
relatedAssertions:
|
|
1955
|
-
codeSnippet:
|
|
1956
|
-
similarIssues:
|
|
1957
|
-
patternId:
|
|
1987
|
+
status: import_zod28.z.enum(["new", "modified", "unchanged"])
|
|
1988
|
+
});
|
|
1989
|
+
var ApiCallSchema = import_zod28.z.object({
|
|
1990
|
+
endpoint: import_zod28.z.string(),
|
|
1991
|
+
tokensUsed: import_zod28.z.number(),
|
|
1992
|
+
duration: import_zod28.z.number()
|
|
1993
|
+
});
|
|
1994
|
+
var ExecutionTraceSchema = import_zod28.z.object({
|
|
1995
|
+
commands: import_zod28.z.array(CommandExecutionSchema),
|
|
1996
|
+
filesModified: import_zod28.z.array(FileModificationSchema),
|
|
1997
|
+
apiCalls: import_zod28.z.array(ApiCallSchema),
|
|
1998
|
+
totalDuration: import_zod28.z.number()
|
|
1999
|
+
});
|
|
2000
|
+
var FailureAnalysisSchema = import_zod28.z.object({
|
|
2001
|
+
category: import_zod28.z.enum(FailureCategory),
|
|
2002
|
+
severity: import_zod28.z.enum(FailureSeverity),
|
|
2003
|
+
summary: import_zod28.z.string(),
|
|
2004
|
+
details: import_zod28.z.string(),
|
|
2005
|
+
rootCause: import_zod28.z.string(),
|
|
2006
|
+
suggestedFix: import_zod28.z.string(),
|
|
2007
|
+
relatedAssertions: import_zod28.z.array(import_zod28.z.string()),
|
|
2008
|
+
codeSnippet: import_zod28.z.string().optional(),
|
|
2009
|
+
similarIssues: import_zod28.z.array(import_zod28.z.string()).optional(),
|
|
2010
|
+
patternId: import_zod28.z.string().optional(),
|
|
1958
2011
|
// Extended fields for detailed debugging
|
|
1959
2012
|
diff: DiffContentSchema.optional(),
|
|
1960
2013
|
executionTrace: ExecutionTraceSchema.optional()
|
|
1961
2014
|
});
|
|
1962
2015
|
var EvalRunSchema = TenantEntitySchema.extend({
|
|
1963
2016
|
/** Agent ID for this run */
|
|
1964
|
-
agentId:
|
|
2017
|
+
agentId: import_zod28.z.string().optional(),
|
|
1965
2018
|
/** Skills group ID for this run */
|
|
1966
|
-
skillsGroupId:
|
|
2019
|
+
skillsGroupId: import_zod28.z.string().optional(),
|
|
1967
2020
|
/** Map of skillId to skillVersionId for this run */
|
|
1968
|
-
skillVersions:
|
|
2021
|
+
skillVersions: import_zod28.z.record(import_zod28.z.string(), import_zod28.z.string()).optional(),
|
|
1969
2022
|
/** Scenario IDs to run */
|
|
1970
|
-
scenarioIds:
|
|
2023
|
+
scenarioIds: import_zod28.z.array(import_zod28.z.string()),
|
|
1971
2024
|
/** Current status */
|
|
1972
2025
|
status: EvalStatusSchema,
|
|
1973
2026
|
/** Progress percentage (0-100) */
|
|
1974
|
-
progress:
|
|
2027
|
+
progress: import_zod28.z.number(),
|
|
1975
2028
|
/** Results for each scenario/target combination (lazy to break eval-result ↔ eval-run cycle) */
|
|
1976
|
-
results:
|
|
2029
|
+
results: import_zod28.z.array(import_zod28.z.lazy(() => EvalRunResultSchema)),
|
|
1977
2030
|
/** Aggregated metrics across all results */
|
|
1978
2031
|
aggregateMetrics: EvalMetricsSchema,
|
|
1979
2032
|
/** Failure analyses */
|
|
1980
|
-
failureAnalyses:
|
|
2033
|
+
failureAnalyses: import_zod28.z.array(FailureAnalysisSchema).optional(),
|
|
1981
2034
|
/** Aggregated LLM trace summary */
|
|
1982
2035
|
llmTraceSummary: LLMTraceSummarySchema.optional(),
|
|
1983
2036
|
/** What triggered this run */
|
|
1984
2037
|
trigger: TriggerSchema.optional(),
|
|
1985
2038
|
/** When the run started (set when evaluation is triggered) */
|
|
1986
|
-
startedAt:
|
|
2039
|
+
startedAt: import_zod28.z.string().optional(),
|
|
1987
2040
|
/** When the run completed */
|
|
1988
|
-
completedAt:
|
|
2041
|
+
completedAt: import_zod28.z.string().optional(),
|
|
1989
2042
|
/** Live trace events captured during execution (for playback on results page) */
|
|
1990
|
-
liveTraceEvents:
|
|
2043
|
+
liveTraceEvents: import_zod28.z.array(LiveTraceEventSchema).optional(),
|
|
1991
2044
|
/** Remote job ID for tracking execution in Dev Machines */
|
|
1992
|
-
jobId:
|
|
2045
|
+
jobId: import_zod28.z.string().optional(),
|
|
1993
2046
|
/** Remote job status from the Dev Machine API (PENDING, RUNNING, COMPLETED, FAILED, CANCELLED) */
|
|
1994
|
-
jobStatus:
|
|
2047
|
+
jobStatus: import_zod28.z.string().optional(),
|
|
1995
2048
|
/** Remote job error message if the job failed */
|
|
1996
|
-
jobError:
|
|
2049
|
+
jobError: import_zod28.z.string().optional(),
|
|
1997
2050
|
/** Timestamp of the last job status check */
|
|
1998
|
-
jobStatusCheckedAt:
|
|
2051
|
+
jobStatusCheckedAt: import_zod28.z.string().optional(),
|
|
1999
2052
|
/** MCP server IDs to enable for this run (optional) */
|
|
2000
|
-
mcpIds:
|
|
2053
|
+
mcpIds: import_zod28.z.array(import_zod28.z.string()).optional(),
|
|
2001
2054
|
/** Sub-agent IDs to enable for this run (optional) */
|
|
2002
|
-
subAgentIds:
|
|
2055
|
+
subAgentIds: import_zod28.z.array(import_zod28.z.string()).optional(),
|
|
2056
|
+
/** Rule IDs to enable for this run (optional) */
|
|
2057
|
+
ruleIds: import_zod28.z.array(import_zod28.z.string()).optional()
|
|
2003
2058
|
});
|
|
2004
2059
|
var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
2005
2060
|
id: true,
|
|
@@ -2012,28 +2067,28 @@ var CreateEvalRunInputSchema = EvalRunSchema.omit({
|
|
|
2012
2067
|
startedAt: true,
|
|
2013
2068
|
completedAt: true
|
|
2014
2069
|
});
|
|
2015
|
-
var EvaluationProgressSchema =
|
|
2016
|
-
runId:
|
|
2017
|
-
targetId:
|
|
2018
|
-
totalScenarios:
|
|
2019
|
-
completedScenarios:
|
|
2020
|
-
scenarioProgress:
|
|
2021
|
-
|
|
2022
|
-
scenarioId:
|
|
2023
|
-
currentStep:
|
|
2024
|
-
error:
|
|
2070
|
+
var EvaluationProgressSchema = import_zod28.z.object({
|
|
2071
|
+
runId: import_zod28.z.string(),
|
|
2072
|
+
targetId: import_zod28.z.string(),
|
|
2073
|
+
totalScenarios: import_zod28.z.number(),
|
|
2074
|
+
completedScenarios: import_zod28.z.number(),
|
|
2075
|
+
scenarioProgress: import_zod28.z.array(
|
|
2076
|
+
import_zod28.z.object({
|
|
2077
|
+
scenarioId: import_zod28.z.string(),
|
|
2078
|
+
currentStep: import_zod28.z.string(),
|
|
2079
|
+
error: import_zod28.z.string().optional()
|
|
2025
2080
|
})
|
|
2026
2081
|
),
|
|
2027
|
-
createdAt:
|
|
2082
|
+
createdAt: import_zod28.z.number()
|
|
2028
2083
|
});
|
|
2029
|
-
var EvaluationLogSchema =
|
|
2030
|
-
runId:
|
|
2031
|
-
scenarioId:
|
|
2032
|
-
log:
|
|
2033
|
-
level:
|
|
2034
|
-
message:
|
|
2035
|
-
args:
|
|
2036
|
-
error:
|
|
2084
|
+
var EvaluationLogSchema = import_zod28.z.object({
|
|
2085
|
+
runId: import_zod28.z.string(),
|
|
2086
|
+
scenarioId: import_zod28.z.string(),
|
|
2087
|
+
log: import_zod28.z.object({
|
|
2088
|
+
level: import_zod28.z.enum(["info", "error", "debug"]),
|
|
2089
|
+
message: import_zod28.z.string().optional(),
|
|
2090
|
+
args: import_zod28.z.array(import_zod28.z.any()).optional(),
|
|
2091
|
+
error: import_zod28.z.string().optional()
|
|
2037
2092
|
})
|
|
2038
2093
|
});
|
|
2039
2094
|
var LLM_TIMEOUT = 12e4;
|
|
@@ -2046,95 +2101,95 @@ var AssertionResultStatus = /* @__PURE__ */ ((AssertionResultStatus2) => {
|
|
|
2046
2101
|
AssertionResultStatus2["ERROR"] = "error";
|
|
2047
2102
|
return AssertionResultStatus2;
|
|
2048
2103
|
})(AssertionResultStatus || {});
|
|
2049
|
-
var AssertionResultSchema =
|
|
2050
|
-
id:
|
|
2051
|
-
assertionId:
|
|
2052
|
-
assertionType:
|
|
2053
|
-
assertionName:
|
|
2054
|
-
status:
|
|
2055
|
-
message:
|
|
2056
|
-
expected:
|
|
2057
|
-
actual:
|
|
2058
|
-
duration:
|
|
2059
|
-
details:
|
|
2060
|
-
llmTraceSteps:
|
|
2061
|
-
});
|
|
2062
|
-
var EvalRunResultSchema =
|
|
2063
|
-
id:
|
|
2064
|
-
targetId:
|
|
2065
|
-
targetName:
|
|
2104
|
+
var AssertionResultSchema = import_zod29.z.object({
|
|
2105
|
+
id: import_zod29.z.string(),
|
|
2106
|
+
assertionId: import_zod29.z.string(),
|
|
2107
|
+
assertionType: import_zod29.z.string(),
|
|
2108
|
+
assertionName: import_zod29.z.string(),
|
|
2109
|
+
status: import_zod29.z.enum(AssertionResultStatus),
|
|
2110
|
+
message: import_zod29.z.string().optional(),
|
|
2111
|
+
expected: import_zod29.z.string().optional(),
|
|
2112
|
+
actual: import_zod29.z.string().optional(),
|
|
2113
|
+
duration: import_zod29.z.number().optional(),
|
|
2114
|
+
details: import_zod29.z.record(import_zod29.z.string(), import_zod29.z.unknown()).optional(),
|
|
2115
|
+
llmTraceSteps: import_zod29.z.array(LLMTraceStepSchema).optional()
|
|
2116
|
+
});
|
|
2117
|
+
var EvalRunResultSchema = import_zod29.z.object({
|
|
2118
|
+
id: import_zod29.z.string(),
|
|
2119
|
+
targetId: import_zod29.z.string(),
|
|
2120
|
+
targetName: import_zod29.z.string().optional(),
|
|
2066
2121
|
/** SkillVersion ID used for this evaluation (for version tracking) */
|
|
2067
|
-
skillVersionId:
|
|
2122
|
+
skillVersionId: import_zod29.z.string().optional(),
|
|
2068
2123
|
/** SkillVersion semver string (e.g., "1.0.0", "1.2.3") for display */
|
|
2069
|
-
skillVersion:
|
|
2070
|
-
scenarioId:
|
|
2071
|
-
scenarioName:
|
|
2124
|
+
skillVersion: import_zod29.z.string().optional(),
|
|
2125
|
+
scenarioId: import_zod29.z.string(),
|
|
2126
|
+
scenarioName: import_zod29.z.string(),
|
|
2072
2127
|
modelConfig: ModelConfigSchema.optional(),
|
|
2073
|
-
assertionResults:
|
|
2128
|
+
assertionResults: import_zod29.z.array(AssertionResultSchema),
|
|
2074
2129
|
metrics: EvalMetricsSchema.optional(),
|
|
2075
|
-
passed:
|
|
2076
|
-
failed:
|
|
2077
|
-
passRate:
|
|
2078
|
-
duration:
|
|
2079
|
-
outputText:
|
|
2080
|
-
files:
|
|
2081
|
-
fileDiffs:
|
|
2130
|
+
passed: import_zod29.z.number(),
|
|
2131
|
+
failed: import_zod29.z.number(),
|
|
2132
|
+
passRate: import_zod29.z.number(),
|
|
2133
|
+
duration: import_zod29.z.number(),
|
|
2134
|
+
outputText: import_zod29.z.string().optional(),
|
|
2135
|
+
files: import_zod29.z.array(ExpectedFileSchema).optional(),
|
|
2136
|
+
fileDiffs: import_zod29.z.array(DiffContentSchema).optional(),
|
|
2082
2137
|
/** Full template files after execution with status indicators */
|
|
2083
|
-
templateFiles:
|
|
2084
|
-
startedAt:
|
|
2085
|
-
completedAt:
|
|
2138
|
+
templateFiles: import_zod29.z.array(TemplateFileSchema).optional(),
|
|
2139
|
+
startedAt: import_zod29.z.string().optional(),
|
|
2140
|
+
completedAt: import_zod29.z.string().optional(),
|
|
2086
2141
|
llmTrace: LLMTraceSchema.optional()
|
|
2087
2142
|
});
|
|
2088
|
-
var PromptResultSchema =
|
|
2089
|
-
text:
|
|
2090
|
-
files:
|
|
2091
|
-
finishReason:
|
|
2092
|
-
reasoning:
|
|
2093
|
-
reasoningDetails:
|
|
2094
|
-
toolCalls:
|
|
2095
|
-
toolResults:
|
|
2096
|
-
warnings:
|
|
2097
|
-
sources:
|
|
2098
|
-
steps:
|
|
2099
|
-
generationTimeMs:
|
|
2100
|
-
prompt:
|
|
2101
|
-
systemPrompt:
|
|
2102
|
-
usage:
|
|
2103
|
-
totalTokens:
|
|
2104
|
-
totalMicrocentsSpent:
|
|
2143
|
+
var PromptResultSchema = import_zod29.z.object({
|
|
2144
|
+
text: import_zod29.z.string(),
|
|
2145
|
+
files: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2146
|
+
finishReason: import_zod29.z.string().optional(),
|
|
2147
|
+
reasoning: import_zod29.z.string().optional(),
|
|
2148
|
+
reasoningDetails: import_zod29.z.unknown().optional(),
|
|
2149
|
+
toolCalls: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2150
|
+
toolResults: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2151
|
+
warnings: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2152
|
+
sources: import_zod29.z.array(import_zod29.z.unknown()).optional(),
|
|
2153
|
+
steps: import_zod29.z.array(import_zod29.z.unknown()),
|
|
2154
|
+
generationTimeMs: import_zod29.z.number(),
|
|
2155
|
+
prompt: import_zod29.z.string(),
|
|
2156
|
+
systemPrompt: import_zod29.z.string(),
|
|
2157
|
+
usage: import_zod29.z.object({
|
|
2158
|
+
totalTokens: import_zod29.z.number().optional(),
|
|
2159
|
+
totalMicrocentsSpent: import_zod29.z.number().optional()
|
|
2105
2160
|
})
|
|
2106
2161
|
});
|
|
2107
|
-
var EvaluationResultSchema =
|
|
2108
|
-
id:
|
|
2109
|
-
runId:
|
|
2110
|
-
timestamp:
|
|
2162
|
+
var EvaluationResultSchema = import_zod29.z.object({
|
|
2163
|
+
id: import_zod29.z.string(),
|
|
2164
|
+
runId: import_zod29.z.string(),
|
|
2165
|
+
timestamp: import_zod29.z.number(),
|
|
2111
2166
|
promptResult: PromptResultSchema,
|
|
2112
|
-
testResults:
|
|
2113
|
-
tags:
|
|
2114
|
-
feedback:
|
|
2115
|
-
score:
|
|
2116
|
-
suiteId:
|
|
2117
|
-
});
|
|
2118
|
-
var LeanEvaluationResultSchema =
|
|
2119
|
-
id:
|
|
2120
|
-
runId:
|
|
2121
|
-
timestamp:
|
|
2122
|
-
tags:
|
|
2123
|
-
scenarioId:
|
|
2124
|
-
scenarioVersion:
|
|
2125
|
-
targetId:
|
|
2126
|
-
targetVersion:
|
|
2127
|
-
suiteId:
|
|
2128
|
-
score:
|
|
2129
|
-
time:
|
|
2130
|
-
microcentsSpent:
|
|
2167
|
+
testResults: import_zod29.z.array(import_zod29.z.unknown()),
|
|
2168
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
2169
|
+
feedback: import_zod29.z.string().optional(),
|
|
2170
|
+
score: import_zod29.z.number(),
|
|
2171
|
+
suiteId: import_zod29.z.string().optional()
|
|
2172
|
+
});
|
|
2173
|
+
var LeanEvaluationResultSchema = import_zod29.z.object({
|
|
2174
|
+
id: import_zod29.z.string(),
|
|
2175
|
+
runId: import_zod29.z.string(),
|
|
2176
|
+
timestamp: import_zod29.z.number(),
|
|
2177
|
+
tags: import_zod29.z.array(import_zod29.z.string()).optional(),
|
|
2178
|
+
scenarioId: import_zod29.z.string(),
|
|
2179
|
+
scenarioVersion: import_zod29.z.number().optional(),
|
|
2180
|
+
targetId: import_zod29.z.string(),
|
|
2181
|
+
targetVersion: import_zod29.z.number().optional(),
|
|
2182
|
+
suiteId: import_zod29.z.string().optional(),
|
|
2183
|
+
score: import_zod29.z.number(),
|
|
2184
|
+
time: import_zod29.z.number().optional(),
|
|
2185
|
+
microcentsSpent: import_zod29.z.number().optional()
|
|
2131
2186
|
});
|
|
2132
2187
|
|
|
2133
2188
|
// src/project/project.ts
|
|
2134
|
-
var
|
|
2189
|
+
var import_zod30 = require("zod");
|
|
2135
2190
|
var ProjectSchema = BaseEntitySchema.extend({
|
|
2136
|
-
appId:
|
|
2137
|
-
appSecret:
|
|
2191
|
+
appId: import_zod30.z.string().optional().describe("The ID of the app in Dev Center"),
|
|
2192
|
+
appSecret: import_zod30.z.string().optional().describe("The secret of the app in Dev Center")
|
|
2138
2193
|
});
|
|
2139
2194
|
var CreateProjectInputSchema = ProjectSchema.omit({
|
|
2140
2195
|
id: true,
|
|
@@ -2160,6 +2215,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
|
2160
2215
|
// src/assertion/system-assertions.ts
|
|
2161
2216
|
var SYSTEM_ASSERTION_IDS = {
|
|
2162
2217
|
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
2218
|
+
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
2163
2219
|
BUILD_PASSED: "system:build_passed",
|
|
2164
2220
|
TIME_LIMIT: "system:time_limit",
|
|
2165
2221
|
COST: "system:cost",
|
|
@@ -2183,6 +2239,26 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2183
2239
|
}
|
|
2184
2240
|
]
|
|
2185
2241
|
},
|
|
2242
|
+
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
2243
|
+
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
2244
|
+
name: "Tool Called With Param",
|
|
2245
|
+
description: "Check that a tool was called with expected parameters",
|
|
2246
|
+
type: "tool_called_with_param",
|
|
2247
|
+
parameters: [
|
|
2248
|
+
{
|
|
2249
|
+
name: "toolName",
|
|
2250
|
+
label: "Tool Name",
|
|
2251
|
+
type: "string",
|
|
2252
|
+
required: true
|
|
2253
|
+
},
|
|
2254
|
+
{
|
|
2255
|
+
name: "expectedParams",
|
|
2256
|
+
label: "Expected Parameters (JSON, substring match)",
|
|
2257
|
+
type: "string",
|
|
2258
|
+
required: true
|
|
2259
|
+
}
|
|
2260
|
+
]
|
|
2261
|
+
},
|
|
2186
2262
|
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
2187
2263
|
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
2188
2264
|
name: "Build Passed",
|
|
@@ -2301,6 +2377,7 @@ function getSystemAssertion(id) {
|
|
|
2301
2377
|
0 && (module.exports = {
|
|
2302
2378
|
AVAILABLE_MODEL_IDS,
|
|
2303
2379
|
AVAILABLE_RUN_COMMANDS,
|
|
2380
|
+
AVAILABLE_TOOL_NAMES,
|
|
2304
2381
|
AgentRunCommand,
|
|
2305
2382
|
AgentRunCommandSchema,
|
|
2306
2383
|
AgentSchema,
|
|
@@ -2329,6 +2406,7 @@ function getSystemAssertion(id) {
|
|
|
2329
2406
|
CreateEvalRunInputSchema,
|
|
2330
2407
|
CreateMcpInputSchema,
|
|
2331
2408
|
CreateProjectInputSchema,
|
|
2409
|
+
CreateRuleInputSchema,
|
|
2332
2410
|
CreateSkillInputSchema,
|
|
2333
2411
|
CreateSkillVersionInputSchema,
|
|
2334
2412
|
CreateSkillsGroupInputSchema,
|
|
@@ -2383,6 +2461,8 @@ function getSystemAssertion(id) {
|
|
|
2383
2461
|
ProjectSchema,
|
|
2384
2462
|
PromptResultSchema,
|
|
2385
2463
|
RUN_COMMAND_LABELS,
|
|
2464
|
+
RuleSchema,
|
|
2465
|
+
RuleTypeSchema,
|
|
2386
2466
|
SEMVER_REGEX,
|
|
2387
2467
|
SKILL_FOLDER_NAME_REGEX,
|
|
2388
2468
|
SYSTEM_ASSERTIONS,
|
|
@@ -2415,6 +2495,8 @@ function getSystemAssertion(id) {
|
|
|
2415
2495
|
TimeAssertionSchema,
|
|
2416
2496
|
TimeConfigSchema,
|
|
2417
2497
|
TokenUsageSchema,
|
|
2498
|
+
ToolCalledWithParamAssertionSchema,
|
|
2499
|
+
ToolCalledWithParamConfigSchema,
|
|
2418
2500
|
ToolTestSchema,
|
|
2419
2501
|
TriggerMetadataSchema,
|
|
2420
2502
|
TriggerSchema,
|
|
@@ -2423,6 +2505,7 @@ function getSystemAssertion(id) {
|
|
|
2423
2505
|
UpdateCustomAssertionInputSchema,
|
|
2424
2506
|
UpdateMcpInputSchema,
|
|
2425
2507
|
UpdateProjectInputSchema,
|
|
2508
|
+
UpdateRuleInputSchema,
|
|
2426
2509
|
UpdateSkillInputSchema,
|
|
2427
2510
|
UpdateSkillsGroupInputSchema,
|
|
2428
2511
|
UpdateSubAgentInputSchema,
|