@wix/evalforge-types 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -916,6 +916,7 @@ var index_exports = {};
916
916
  __export(index_exports, {
917
917
  AVAILABLE_MODEL_IDS: () => AVAILABLE_MODEL_IDS,
918
918
  AVAILABLE_RUN_COMMANDS: () => AVAILABLE_RUN_COMMANDS,
919
+ AVAILABLE_TOOL_NAMES: () => AVAILABLE_TOOL_NAMES,
919
920
  AgentRunCommand: () => AgentRunCommand,
920
921
  AgentRunCommandSchema: () => AgentRunCommandSchema,
921
922
  AgentSchema: () => AgentSchema,
@@ -1033,6 +1034,8 @@ __export(index_exports, {
1033
1034
  TimeAssertionSchema: () => TimeAssertionSchema,
1034
1035
  TimeConfigSchema: () => TimeConfigSchema,
1035
1036
  TokenUsageSchema: () => TokenUsageSchema,
1037
+ ToolCalledWithParamAssertionSchema: () => ToolCalledWithParamAssertionSchema,
1038
+ ToolCalledWithParamConfigSchema: () => ToolCalledWithParamConfigSchema,
1036
1039
  ToolTestSchema: () => ToolTestSchema,
1037
1040
  TriggerMetadataSchema: () => TriggerMetadataSchema,
1038
1041
  TriggerSchema: () => TriggerSchema,
@@ -1148,6 +1151,17 @@ var RuleInputBaseSchema = RuleSchema.omit({
1148
1151
  var CreateRuleInputSchema = RuleInputBaseSchema;
1149
1152
  var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
1150
1153
 
1154
+ // src/common/tool-names.ts
1155
+ var AVAILABLE_TOOL_NAMES = [
1156
+ "Bash",
1157
+ "Edit",
1158
+ "Glob",
1159
+ "Grep",
1160
+ "Read",
1161
+ "Skill",
1162
+ "Write"
1163
+ ];
1164
+
1151
1165
  // src/target/target.ts
1152
1166
  var TargetSchema = TenantEntitySchema.extend({
1153
1167
  // Base for all testable entities
@@ -1486,6 +1500,13 @@ var SkillWasCalledAssertionSchema = import_zod21.z.object({
1486
1500
  /** Names of the skills that must have been called (matched against trace Skill tool args) */
1487
1501
  skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
1488
1502
  });
1503
+ var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
1504
+ type: import_zod21.z.literal("tool_called_with_param"),
1505
+ /** Name of the tool that must have been called */
1506
+ toolName: import_zod21.z.string().min(1),
1507
+ /** JSON string of key-value pairs for expected parameters (substring match) */
1508
+ expectedParams: import_zod21.z.string().min(1)
1509
+ });
1489
1510
  var BuildPassedAssertionSchema = import_zod21.z.object({
1490
1511
  type: import_zod21.z.literal("build_passed"),
1491
1512
  /** Command to run (default: "yarn build") */
@@ -1502,8 +1523,6 @@ var LlmJudgeAssertionSchema = import_zod21.z.object({
1502
1523
  type: import_zod21.z.literal("llm_judge"),
1503
1524
  /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
1504
1525
  prompt: import_zod21.z.string(),
1505
- /** Optional system prompt for the judge (default asks for JSON with score) */
1506
- systemPrompt: import_zod21.z.string().optional(),
1507
1526
  /** Minimum score to pass (0-100, default 70) */
1508
1527
  minScore: import_zod21.z.number().int().min(0).max(100).optional(),
1509
1528
  /** Model for the judge (e.g. claude-3-5-haiku) */
@@ -1518,6 +1537,7 @@ var TimeAssertionSchema = import_zod21.z.object({
1518
1537
  });
1519
1538
  var AssertionSchema = import_zod21.z.union([
1520
1539
  SkillWasCalledAssertionSchema,
1540
+ ToolCalledWithParamAssertionSchema,
1521
1541
  BuildPassedAssertionSchema,
1522
1542
  TimeAssertionSchema,
1523
1543
  CostAssertionSchema,
@@ -1565,6 +1585,7 @@ var import_zod24 = require("zod");
1565
1585
  var import_zod23 = require("zod");
1566
1586
  var AssertionTypeSchema = import_zod23.z.enum([
1567
1587
  "skill_was_called",
1588
+ "tool_called_with_param",
1568
1589
  "build_passed",
1569
1590
  "time_limit",
1570
1591
  "cost",
@@ -1606,6 +1627,12 @@ var CostConfigSchema = import_zod23.z.strictObject({
1606
1627
  /** Maximum allowed cost in USD */
1607
1628
  maxCostUsd: import_zod23.z.number().positive()
1608
1629
  });
1630
+ var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
1631
+ /** Name of the tool that must have been called */
1632
+ toolName: import_zod23.z.string().min(1),
1633
+ /** JSON string of key-value pairs for expected parameters (substring match) */
1634
+ expectedParams: import_zod23.z.string().min(1)
1635
+ });
1609
1636
  var BuildPassedConfigSchema = import_zod23.z.strictObject({
1610
1637
  /** Command to run (default: "yarn build") */
1611
1638
  command: import_zod23.z.string().optional(),
@@ -1628,8 +1655,6 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
1628
1655
  * - Custom parameters defined in the parameters array
1629
1656
  */
1630
1657
  prompt: import_zod23.z.string().min(1),
1631
- /** Optional system prompt for the judge */
1632
- systemPrompt: import_zod23.z.string().optional(),
1633
1658
  /** Minimum score to pass (0-100, default 70) */
1634
1659
  minScore: import_zod23.z.number().int().min(0).max(100).optional(),
1635
1660
  /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
@@ -1646,6 +1671,8 @@ var AssertionConfigSchema = import_zod23.z.union([
1646
1671
  // requires prompt - check first
1647
1672
  SkillWasCalledConfigSchema,
1648
1673
  // requires skillNames
1674
+ ToolCalledWithParamConfigSchema,
1675
+ // requires toolName + expectedParams, uses strictObject
1649
1676
  TimeConfigSchema,
1650
1677
  // requires maxDurationMs, uses strictObject
1651
1678
  CostConfigSchema,
@@ -1674,6 +1701,8 @@ function validateAssertionConfig(type, config) {
1674
1701
  return SkillWasCalledConfigSchema.safeParse(config).success;
1675
1702
  case "cost":
1676
1703
  return CostConfigSchema.safeParse(config).success;
1704
+ case "tool_called_with_param":
1705
+ return ToolCalledWithParamConfigSchema.safeParse(config).success;
1677
1706
  case "build_passed":
1678
1707
  return BuildPassedConfigSchema.safeParse(config).success;
1679
1708
  case "time_limit":
@@ -2182,6 +2211,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
2182
2211
  // src/assertion/system-assertions.ts
2183
2212
  var SYSTEM_ASSERTION_IDS = {
2184
2213
  SKILL_WAS_CALLED: "system:skill_was_called",
2214
+ TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
2185
2215
  BUILD_PASSED: "system:build_passed",
2186
2216
  TIME_LIMIT: "system:time_limit",
2187
2217
  COST: "system:cost",
@@ -2205,6 +2235,26 @@ var SYSTEM_ASSERTIONS = {
2205
2235
  }
2206
2236
  ]
2207
2237
  },
2238
+ [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
2239
+ id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
2240
+ name: "Tool Called With Param",
2241
+ description: "Check that a tool was called with expected parameters",
2242
+ type: "tool_called_with_param",
2243
+ parameters: [
2244
+ {
2245
+ name: "toolName",
2246
+ label: "Tool Name",
2247
+ type: "string",
2248
+ required: true
2249
+ },
2250
+ {
2251
+ name: "expectedParams",
2252
+ label: "Expected Parameters (JSON, substring match)",
2253
+ type: "string",
2254
+ required: true
2255
+ }
2256
+ ]
2257
+ },
2208
2258
  [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
2209
2259
  id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
2210
2260
  name: "Build Passed",
@@ -2284,19 +2334,6 @@ var SYSTEM_ASSERTIONS = {
2284
2334
  required: true,
2285
2335
  defaultValue: "Verify the output meets the acceptance criteria."
2286
2336
  },
2287
- {
2288
- name: "systemPrompt",
2289
- label: "System Prompt (optional)",
2290
- type: "string",
2291
- required: false,
2292
- defaultValue: `You are judging a scenario run. Use these values:
2293
- - {{output}}: the agent's final output
2294
- - {{cwd}}: working directory
2295
- - {{changedFiles}}: list of files changed (or "No files were changed")
2296
- - {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
2297
-
2298
- Judge how well the output meets the acceptance criteria stated in the user prompt.`
2299
- },
2300
2337
  {
2301
2338
  name: "minScore",
2302
2339
  label: "Minimum Score (0-100)",
@@ -2323,6 +2360,7 @@ function getSystemAssertion(id) {
2323
2360
  0 && (module.exports = {
2324
2361
  AVAILABLE_MODEL_IDS,
2325
2362
  AVAILABLE_RUN_COMMANDS,
2363
+ AVAILABLE_TOOL_NAMES,
2326
2364
  AgentRunCommand,
2327
2365
  AgentRunCommandSchema,
2328
2366
  AgentSchema,
@@ -2440,6 +2478,8 @@ function getSystemAssertion(id) {
2440
2478
  TimeAssertionSchema,
2441
2479
  TimeConfigSchema,
2442
2480
  TokenUsageSchema,
2481
+ ToolCalledWithParamAssertionSchema,
2482
+ ToolCalledWithParamConfigSchema,
2443
2483
  ToolTestSchema,
2444
2484
  TriggerMetadataSchema,
2445
2485
  TriggerSchema,