@wix/evalforge-types 0.38.0 → 0.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +57 -17
- package/build/index.js.map +3 -3
- package/build/index.mjs +54 -17
- package/build/index.mjs.map +3 -3
- package/build/types/assertion/assertion.d.ts +34 -5
- package/build/types/assertion/system-assertions.d.ts +1 -0
- package/build/types/common/index.d.ts +1 -0
- package/build/types/common/tool-names.d.ts +1 -0
- package/build/types/scenario/assertions.d.ts +16 -2
- package/build/types/scenario/test-scenario.d.ts +12 -3
- package/package.json +2 -2
package/build/index.mjs
CHANGED
@@ -991,6 +991,17 @@ var RuleInputBaseSchema = RuleSchema.omit({
 var CreateRuleInputSchema = RuleInputBaseSchema;
 var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
 
+// src/common/tool-names.ts
+var AVAILABLE_TOOL_NAMES = [
+  "Bash",
+  "Edit",
+  "Glob",
+  "Grep",
+  "Read",
+  "Skill",
+  "Write"
+];
+
 // src/target/target.ts
 var TargetSchema = TenantEntitySchema.extend({
   // Base for all testable entities
@@ -1329,6 +1340,13 @@ var SkillWasCalledAssertionSchema = z21.object({
   /** Names of the skills that must have been called (matched against trace Skill tool args) */
   skillNames: z21.array(z21.string().min(1)).min(1)
 });
+var ToolCalledWithParamAssertionSchema = z21.object({
+  type: z21.literal("tool_called_with_param"),
+  /** Name of the tool that must have been called */
+  toolName: z21.string().min(1),
+  /** JSON string of key-value pairs for expected parameters (substring match) */
+  expectedParams: z21.string().min(1)
+});
 var BuildPassedAssertionSchema = z21.object({
   type: z21.literal("build_passed"),
   /** Command to run (default: "yarn build") */
@@ -1345,8 +1363,6 @@ var LlmJudgeAssertionSchema = z21.object({
   type: z21.literal("llm_judge"),
   /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
   prompt: z21.string(),
-  /** Optional system prompt for the judge (default asks for JSON with score) */
-  systemPrompt: z21.string().optional(),
   /** Minimum score to pass (0-100, default 70) */
   minScore: z21.number().int().min(0).max(100).optional(),
   /** Model for the judge (e.g. claude-3-5-haiku) */
@@ -1361,6 +1377,7 @@ var TimeAssertionSchema = z21.object({
 });
 var AssertionSchema = z21.union([
   SkillWasCalledAssertionSchema,
+  ToolCalledWithParamAssertionSchema,
   BuildPassedAssertionSchema,
   TimeAssertionSchema,
   CostAssertionSchema,
@@ -1408,6 +1425,7 @@ import { z as z24 } from "zod";
 import { z as z23 } from "zod";
 var AssertionTypeSchema = z23.enum([
   "skill_was_called",
+  "tool_called_with_param",
   "build_passed",
   "time_limit",
   "cost",
@@ -1449,6 +1467,12 @@ var CostConfigSchema = z23.strictObject({
   /** Maximum allowed cost in USD */
   maxCostUsd: z23.number().positive()
 });
+var ToolCalledWithParamConfigSchema = z23.strictObject({
+  /** Name of the tool that must have been called */
+  toolName: z23.string().min(1),
+  /** JSON string of key-value pairs for expected parameters (substring match) */
+  expectedParams: z23.string().min(1)
+});
 var BuildPassedConfigSchema = z23.strictObject({
   /** Command to run (default: "yarn build") */
   command: z23.string().optional(),
@@ -1471,8 +1495,6 @@ var LlmJudgeConfigSchema = z23.object({
    * - Custom parameters defined in the parameters array
    */
   prompt: z23.string().min(1),
-  /** Optional system prompt for the judge */
-  systemPrompt: z23.string().optional(),
   /** Minimum score to pass (0-100, default 70) */
   minScore: z23.number().int().min(0).max(100).optional(),
   /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
@@ -1489,6 +1511,8 @@ var AssertionConfigSchema = z23.union([
   // requires prompt - check first
   SkillWasCalledConfigSchema,
   // requires skillNames
+  ToolCalledWithParamConfigSchema,
+  // requires toolName + expectedParams, uses strictObject
   TimeConfigSchema,
   // requires maxDurationMs, uses strictObject
   CostConfigSchema,
@@ -1517,6 +1541,8 @@ function validateAssertionConfig(type, config) {
       return SkillWasCalledConfigSchema.safeParse(config).success;
     case "cost":
       return CostConfigSchema.safeParse(config).success;
+    case "tool_called_with_param":
+      return ToolCalledWithParamConfigSchema.safeParse(config).success;
     case "build_passed":
       return BuildPassedConfigSchema.safeParse(config).success;
     case "time_limit":
@@ -2025,6 +2051,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
 // src/assertion/system-assertions.ts
 var SYSTEM_ASSERTION_IDS = {
   SKILL_WAS_CALLED: "system:skill_was_called",
+  TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
   BUILD_PASSED: "system:build_passed",
   TIME_LIMIT: "system:time_limit",
   COST: "system:cost",
@@ -2048,6 +2075,26 @@ var SYSTEM_ASSERTIONS = {
       }
     ]
   },
+  [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
+    id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
+    name: "Tool Called With Param",
+    description: "Check that a tool was called with expected parameters",
+    type: "tool_called_with_param",
+    parameters: [
+      {
+        name: "toolName",
+        label: "Tool Name",
+        type: "string",
+        required: true
+      },
+      {
+        name: "expectedParams",
+        label: "Expected Parameters (JSON, substring match)",
+        type: "string",
+        required: true
+      }
+    ]
+  },
   [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
     id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
     name: "Build Passed",
@@ -2127,19 +2174,6 @@ var SYSTEM_ASSERTIONS = {
         required: true,
         defaultValue: "Verify the output meets the acceptance criteria."
       },
-      {
-        name: "systemPrompt",
-        label: "System Prompt (optional)",
-        type: "string",
-        required: false,
-        defaultValue: `You are judging a scenario run. Use these values:
-- {{output}}: the agent's final output
-- {{cwd}}: working directory
-- {{changedFiles}}: list of files changed (or "No files were changed")
-- {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
-
-Judge how well the output meets the acceptance criteria stated in the user prompt.`
-      },
       {
         name: "minScore",
         label: "Minimum Score (0-100)",
@@ -2166,6 +2200,7 @@ var export_ClaudeModel = import_types.ClaudeModel;
 export {
   AVAILABLE_MODEL_IDS,
   AVAILABLE_RUN_COMMANDS,
+  AVAILABLE_TOOL_NAMES,
   AgentRunCommand,
   AgentRunCommandSchema,
   AgentSchema,
@@ -2283,6 +2318,8 @@ export {
   TimeAssertionSchema,
   TimeConfigSchema,
   TokenUsageSchema,
+  ToolCalledWithParamAssertionSchema,
+  ToolCalledWithParamConfigSchema,
   ToolTestSchema,
   TriggerMetadataSchema,
   TriggerSchema,