@wix/evalforge-types 0.38.0 → 0.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +57 -17
- package/build/index.js.map +3 -3
- package/build/index.mjs +54 -17
- package/build/index.mjs.map +3 -3
- package/build/types/assertion/assertion.d.ts +34 -5
- package/build/types/assertion/system-assertions.d.ts +1 -0
- package/build/types/common/index.d.ts +1 -0
- package/build/types/common/tool-names.d.ts +1 -0
- package/build/types/scenario/assertions.d.ts +16 -2
- package/build/types/scenario/test-scenario.d.ts +12 -3
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -916,6 +916,7 @@ var index_exports = {};
|
|
|
916
916
|
__export(index_exports, {
|
|
917
917
|
AVAILABLE_MODEL_IDS: () => AVAILABLE_MODEL_IDS,
|
|
918
918
|
AVAILABLE_RUN_COMMANDS: () => AVAILABLE_RUN_COMMANDS,
|
|
919
|
+
AVAILABLE_TOOL_NAMES: () => AVAILABLE_TOOL_NAMES,
|
|
919
920
|
AgentRunCommand: () => AgentRunCommand,
|
|
920
921
|
AgentRunCommandSchema: () => AgentRunCommandSchema,
|
|
921
922
|
AgentSchema: () => AgentSchema,
|
|
@@ -1033,6 +1034,8 @@ __export(index_exports, {
|
|
|
1033
1034
|
TimeAssertionSchema: () => TimeAssertionSchema,
|
|
1034
1035
|
TimeConfigSchema: () => TimeConfigSchema,
|
|
1035
1036
|
TokenUsageSchema: () => TokenUsageSchema,
|
|
1037
|
+
ToolCalledWithParamAssertionSchema: () => ToolCalledWithParamAssertionSchema,
|
|
1038
|
+
ToolCalledWithParamConfigSchema: () => ToolCalledWithParamConfigSchema,
|
|
1036
1039
|
ToolTestSchema: () => ToolTestSchema,
|
|
1037
1040
|
TriggerMetadataSchema: () => TriggerMetadataSchema,
|
|
1038
1041
|
TriggerSchema: () => TriggerSchema,
|
|
@@ -1148,6 +1151,17 @@ var RuleInputBaseSchema = RuleSchema.omit({
|
|
|
1148
1151
|
var CreateRuleInputSchema = RuleInputBaseSchema;
|
|
1149
1152
|
var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
|
|
1150
1153
|
|
|
1154
|
+
// src/common/tool-names.ts
|
|
1155
|
+
var AVAILABLE_TOOL_NAMES = [
|
|
1156
|
+
"Bash",
|
|
1157
|
+
"Edit",
|
|
1158
|
+
"Glob",
|
|
1159
|
+
"Grep",
|
|
1160
|
+
"Read",
|
|
1161
|
+
"Skill",
|
|
1162
|
+
"Write"
|
|
1163
|
+
];
|
|
1164
|
+
|
|
1151
1165
|
// src/target/target.ts
|
|
1152
1166
|
var TargetSchema = TenantEntitySchema.extend({
|
|
1153
1167
|
// Base for all testable entities
|
|
@@ -1486,6 +1500,13 @@ var SkillWasCalledAssertionSchema = import_zod21.z.object({
|
|
|
1486
1500
|
/** Names of the skills that must have been called (matched against trace Skill tool args) */
|
|
1487
1501
|
skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
|
|
1488
1502
|
});
|
|
1503
|
+
var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
|
|
1504
|
+
type: import_zod21.z.literal("tool_called_with_param"),
|
|
1505
|
+
/** Name of the tool that must have been called */
|
|
1506
|
+
toolName: import_zod21.z.string().min(1),
|
|
1507
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1508
|
+
expectedParams: import_zod21.z.string().min(1)
|
|
1509
|
+
});
|
|
1489
1510
|
var BuildPassedAssertionSchema = import_zod21.z.object({
|
|
1490
1511
|
type: import_zod21.z.literal("build_passed"),
|
|
1491
1512
|
/** Command to run (default: "yarn build") */
|
|
@@ -1502,8 +1523,6 @@ var LlmJudgeAssertionSchema = import_zod21.z.object({
|
|
|
1502
1523
|
type: import_zod21.z.literal("llm_judge"),
|
|
1503
1524
|
/** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
|
|
1504
1525
|
prompt: import_zod21.z.string(),
|
|
1505
|
-
/** Optional system prompt for the judge (default asks for JSON with score) */
|
|
1506
|
-
systemPrompt: import_zod21.z.string().optional(),
|
|
1507
1526
|
/** Minimum score to pass (0-100, default 70) */
|
|
1508
1527
|
minScore: import_zod21.z.number().int().min(0).max(100).optional(),
|
|
1509
1528
|
/** Model for the judge (e.g. claude-3-5-haiku) */
|
|
@@ -1518,6 +1537,7 @@ var TimeAssertionSchema = import_zod21.z.object({
|
|
|
1518
1537
|
});
|
|
1519
1538
|
var AssertionSchema = import_zod21.z.union([
|
|
1520
1539
|
SkillWasCalledAssertionSchema,
|
|
1540
|
+
ToolCalledWithParamAssertionSchema,
|
|
1521
1541
|
BuildPassedAssertionSchema,
|
|
1522
1542
|
TimeAssertionSchema,
|
|
1523
1543
|
CostAssertionSchema,
|
|
@@ -1565,6 +1585,7 @@ var import_zod24 = require("zod");
|
|
|
1565
1585
|
var import_zod23 = require("zod");
|
|
1566
1586
|
var AssertionTypeSchema = import_zod23.z.enum([
|
|
1567
1587
|
"skill_was_called",
|
|
1588
|
+
"tool_called_with_param",
|
|
1568
1589
|
"build_passed",
|
|
1569
1590
|
"time_limit",
|
|
1570
1591
|
"cost",
|
|
@@ -1606,6 +1627,12 @@ var CostConfigSchema = import_zod23.z.strictObject({
|
|
|
1606
1627
|
/** Maximum allowed cost in USD */
|
|
1607
1628
|
maxCostUsd: import_zod23.z.number().positive()
|
|
1608
1629
|
});
|
|
1630
|
+
var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
|
|
1631
|
+
/** Name of the tool that must have been called */
|
|
1632
|
+
toolName: import_zod23.z.string().min(1),
|
|
1633
|
+
/** JSON string of key-value pairs for expected parameters (substring match) */
|
|
1634
|
+
expectedParams: import_zod23.z.string().min(1)
|
|
1635
|
+
});
|
|
1609
1636
|
var BuildPassedConfigSchema = import_zod23.z.strictObject({
|
|
1610
1637
|
/** Command to run (default: "yarn build") */
|
|
1611
1638
|
command: import_zod23.z.string().optional(),
|
|
@@ -1628,8 +1655,6 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
|
|
|
1628
1655
|
* - Custom parameters defined in the parameters array
|
|
1629
1656
|
*/
|
|
1630
1657
|
prompt: import_zod23.z.string().min(1),
|
|
1631
|
-
/** Optional system prompt for the judge */
|
|
1632
|
-
systemPrompt: import_zod23.z.string().optional(),
|
|
1633
1658
|
/** Minimum score to pass (0-100, default 70) */
|
|
1634
1659
|
minScore: import_zod23.z.number().int().min(0).max(100).optional(),
|
|
1635
1660
|
/** Model for the judge (e.g. claude-3-5-haiku-20241022) */
|
|
@@ -1646,6 +1671,8 @@ var AssertionConfigSchema = import_zod23.z.union([
|
|
|
1646
1671
|
// requires prompt - check first
|
|
1647
1672
|
SkillWasCalledConfigSchema,
|
|
1648
1673
|
// requires skillNames
|
|
1674
|
+
ToolCalledWithParamConfigSchema,
|
|
1675
|
+
// requires toolName + expectedParams, uses strictObject
|
|
1649
1676
|
TimeConfigSchema,
|
|
1650
1677
|
// requires maxDurationMs, uses strictObject
|
|
1651
1678
|
CostConfigSchema,
|
|
@@ -1674,6 +1701,8 @@ function validateAssertionConfig(type, config) {
|
|
|
1674
1701
|
return SkillWasCalledConfigSchema.safeParse(config).success;
|
|
1675
1702
|
case "cost":
|
|
1676
1703
|
return CostConfigSchema.safeParse(config).success;
|
|
1704
|
+
case "tool_called_with_param":
|
|
1705
|
+
return ToolCalledWithParamConfigSchema.safeParse(config).success;
|
|
1677
1706
|
case "build_passed":
|
|
1678
1707
|
return BuildPassedConfigSchema.safeParse(config).success;
|
|
1679
1708
|
case "time_limit":
|
|
@@ -2182,6 +2211,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
|
|
|
2182
2211
|
// src/assertion/system-assertions.ts
|
|
2183
2212
|
var SYSTEM_ASSERTION_IDS = {
|
|
2184
2213
|
SKILL_WAS_CALLED: "system:skill_was_called",
|
|
2214
|
+
TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
|
|
2185
2215
|
BUILD_PASSED: "system:build_passed",
|
|
2186
2216
|
TIME_LIMIT: "system:time_limit",
|
|
2187
2217
|
COST: "system:cost",
|
|
@@ -2205,6 +2235,26 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2205
2235
|
}
|
|
2206
2236
|
]
|
|
2207
2237
|
},
|
|
2238
|
+
[SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
|
|
2239
|
+
id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
|
|
2240
|
+
name: "Tool Called With Param",
|
|
2241
|
+
description: "Check that a tool was called with expected parameters",
|
|
2242
|
+
type: "tool_called_with_param",
|
|
2243
|
+
parameters: [
|
|
2244
|
+
{
|
|
2245
|
+
name: "toolName",
|
|
2246
|
+
label: "Tool Name",
|
|
2247
|
+
type: "string",
|
|
2248
|
+
required: true
|
|
2249
|
+
},
|
|
2250
|
+
{
|
|
2251
|
+
name: "expectedParams",
|
|
2252
|
+
label: "Expected Parameters (JSON, substring match)",
|
|
2253
|
+
type: "string",
|
|
2254
|
+
required: true
|
|
2255
|
+
}
|
|
2256
|
+
]
|
|
2257
|
+
},
|
|
2208
2258
|
[SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
|
|
2209
2259
|
id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
|
|
2210
2260
|
name: "Build Passed",
|
|
@@ -2284,19 +2334,6 @@ var SYSTEM_ASSERTIONS = {
|
|
|
2284
2334
|
required: true,
|
|
2285
2335
|
defaultValue: "Verify the output meets the acceptance criteria."
|
|
2286
2336
|
},
|
|
2287
|
-
{
|
|
2288
|
-
name: "systemPrompt",
|
|
2289
|
-
label: "System Prompt (optional)",
|
|
2290
|
-
type: "string",
|
|
2291
|
-
required: false,
|
|
2292
|
-
defaultValue: `You are judging a scenario run. Use these values:
|
|
2293
|
-
- {{output}}: the agent's final output
|
|
2294
|
-
- {{cwd}}: working directory
|
|
2295
|
-
- {{changedFiles}}: list of files changed (or "No files were changed")
|
|
2296
|
-
- {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
|
|
2297
|
-
|
|
2298
|
-
Judge how well the output meets the acceptance criteria stated in the user prompt.`
|
|
2299
|
-
},
|
|
2300
2337
|
{
|
|
2301
2338
|
name: "minScore",
|
|
2302
2339
|
label: "Minimum Score (0-100)",
|
|
@@ -2323,6 +2360,7 @@ function getSystemAssertion(id) {
|
|
|
2323
2360
|
0 && (module.exports = {
|
|
2324
2361
|
AVAILABLE_MODEL_IDS,
|
|
2325
2362
|
AVAILABLE_RUN_COMMANDS,
|
|
2363
|
+
AVAILABLE_TOOL_NAMES,
|
|
2326
2364
|
AgentRunCommand,
|
|
2327
2365
|
AgentRunCommandSchema,
|
|
2328
2366
|
AgentSchema,
|
|
@@ -2440,6 +2478,8 @@ function getSystemAssertion(id) {
|
|
|
2440
2478
|
TimeAssertionSchema,
|
|
2441
2479
|
TimeConfigSchema,
|
|
2442
2480
|
TokenUsageSchema,
|
|
2481
|
+
ToolCalledWithParamAssertionSchema,
|
|
2482
|
+
ToolCalledWithParamConfigSchema,
|
|
2443
2483
|
ToolTestSchema,
|
|
2444
2484
|
TriggerMetadataSchema,
|
|
2445
2485
|
TriggerSchema,
|