@wix/evalforge-types 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.mjs CHANGED
@@ -991,6 +991,17 @@ var RuleInputBaseSchema = RuleSchema.omit({
991
991
  var CreateRuleInputSchema = RuleInputBaseSchema;
992
992
  var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
993
993
 
994
+ // src/common/tool-names.ts
995
+ var AVAILABLE_TOOL_NAMES = [
996
+ "Bash",
997
+ "Edit",
998
+ "Glob",
999
+ "Grep",
1000
+ "Read",
1001
+ "Skill",
1002
+ "Write"
1003
+ ];
1004
+
994
1005
  // src/target/target.ts
995
1006
  var TargetSchema = TenantEntitySchema.extend({
996
1007
  // Base for all testable entities
@@ -1329,6 +1340,13 @@ var SkillWasCalledAssertionSchema = z21.object({
1329
1340
  /** Names of the skills that must have been called (matched against trace Skill tool args) */
1330
1341
  skillNames: z21.array(z21.string().min(1)).min(1)
1331
1342
  });
1343
+ var ToolCalledWithParamAssertionSchema = z21.object({
1344
+ type: z21.literal("tool_called_with_param"),
1345
+ /** Name of the tool that must have been called */
1346
+ toolName: z21.string().min(1),
1347
+ /** JSON string of key-value pairs for expected parameters (substring match) */
1348
+ expectedParams: z21.string().min(1)
1349
+ });
1332
1350
  var BuildPassedAssertionSchema = z21.object({
1333
1351
  type: z21.literal("build_passed"),
1334
1352
  /** Command to run (default: "yarn build") */
@@ -1345,8 +1363,6 @@ var LlmJudgeAssertionSchema = z21.object({
1345
1363
  type: z21.literal("llm_judge"),
1346
1364
  /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
1347
1365
  prompt: z21.string(),
1348
- /** Optional system prompt for the judge (default asks for JSON with score) */
1349
- systemPrompt: z21.string().optional(),
1350
1366
  /** Minimum score to pass (0-100, default 70) */
1351
1367
  minScore: z21.number().int().min(0).max(100).optional(),
1352
1368
  /** Model for the judge (e.g. claude-3-5-haiku) */
@@ -1361,6 +1377,7 @@ var TimeAssertionSchema = z21.object({
1361
1377
  });
1362
1378
  var AssertionSchema = z21.union([
1363
1379
  SkillWasCalledAssertionSchema,
1380
+ ToolCalledWithParamAssertionSchema,
1364
1381
  BuildPassedAssertionSchema,
1365
1382
  TimeAssertionSchema,
1366
1383
  CostAssertionSchema,
@@ -1408,6 +1425,7 @@ import { z as z24 } from "zod";
1408
1425
  import { z as z23 } from "zod";
1409
1426
  var AssertionTypeSchema = z23.enum([
1410
1427
  "skill_was_called",
1428
+ "tool_called_with_param",
1411
1429
  "build_passed",
1412
1430
  "time_limit",
1413
1431
  "cost",
@@ -1449,6 +1467,12 @@ var CostConfigSchema = z23.strictObject({
1449
1467
  /** Maximum allowed cost in USD */
1450
1468
  maxCostUsd: z23.number().positive()
1451
1469
  });
1470
+ var ToolCalledWithParamConfigSchema = z23.strictObject({
1471
+ /** Name of the tool that must have been called */
1472
+ toolName: z23.string().min(1),
1473
+ /** JSON string of key-value pairs for expected parameters (substring match) */
1474
+ expectedParams: z23.string().min(1)
1475
+ });
1452
1476
  var BuildPassedConfigSchema = z23.strictObject({
1453
1477
  /** Command to run (default: "yarn build") */
1454
1478
  command: z23.string().optional(),
@@ -1471,8 +1495,6 @@ var LlmJudgeConfigSchema = z23.object({
1471
1495
  * - Custom parameters defined in the parameters array
1472
1496
  */
1473
1497
  prompt: z23.string().min(1),
1474
- /** Optional system prompt for the judge */
1475
- systemPrompt: z23.string().optional(),
1476
1498
  /** Minimum score to pass (0-100, default 70) */
1477
1499
  minScore: z23.number().int().min(0).max(100).optional(),
1478
1500
  /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
@@ -1489,6 +1511,8 @@ var AssertionConfigSchema = z23.union([
1489
1511
  // requires prompt - check first
1490
1512
  SkillWasCalledConfigSchema,
1491
1513
  // requires skillNames
1514
+ ToolCalledWithParamConfigSchema,
1515
+ // requires toolName + expectedParams, uses strictObject
1492
1516
  TimeConfigSchema,
1493
1517
  // requires maxDurationMs, uses strictObject
1494
1518
  CostConfigSchema,
@@ -1517,6 +1541,8 @@ function validateAssertionConfig(type, config) {
1517
1541
  return SkillWasCalledConfigSchema.safeParse(config).success;
1518
1542
  case "cost":
1519
1543
  return CostConfigSchema.safeParse(config).success;
1544
+ case "tool_called_with_param":
1545
+ return ToolCalledWithParamConfigSchema.safeParse(config).success;
1520
1546
  case "build_passed":
1521
1547
  return BuildPassedConfigSchema.safeParse(config).success;
1522
1548
  case "time_limit":
@@ -2025,6 +2051,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
2025
2051
  // src/assertion/system-assertions.ts
2026
2052
  var SYSTEM_ASSERTION_IDS = {
2027
2053
  SKILL_WAS_CALLED: "system:skill_was_called",
2054
+ TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
2028
2055
  BUILD_PASSED: "system:build_passed",
2029
2056
  TIME_LIMIT: "system:time_limit",
2030
2057
  COST: "system:cost",
@@ -2048,6 +2075,26 @@ var SYSTEM_ASSERTIONS = {
2048
2075
  }
2049
2076
  ]
2050
2077
  },
2078
+ [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
2079
+ id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
2080
+ name: "Tool Called With Param",
2081
+ description: "Check that a tool was called with expected parameters",
2082
+ type: "tool_called_with_param",
2083
+ parameters: [
2084
+ {
2085
+ name: "toolName",
2086
+ label: "Tool Name",
2087
+ type: "string",
2088
+ required: true
2089
+ },
2090
+ {
2091
+ name: "expectedParams",
2092
+ label: "Expected Parameters (JSON, substring match)",
2093
+ type: "string",
2094
+ required: true
2095
+ }
2096
+ ]
2097
+ },
2051
2098
  [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
2052
2099
  id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
2053
2100
  name: "Build Passed",
@@ -2127,19 +2174,6 @@ var SYSTEM_ASSERTIONS = {
2127
2174
  required: true,
2128
2175
  defaultValue: "Verify the output meets the acceptance criteria."
2129
2176
  },
2130
- {
2131
- name: "systemPrompt",
2132
- label: "System Prompt (optional)",
2133
- type: "string",
2134
- required: false,
2135
- defaultValue: `You are judging a scenario run. Use these values:
2136
- - {{output}}: the agent's final output
2137
- - {{cwd}}: working directory
2138
- - {{changedFiles}}: list of files changed (or "No files were changed")
2139
- - {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
2140
-
2141
- Judge how well the output meets the acceptance criteria stated in the user prompt.`
2142
- },
2143
2177
  {
2144
2178
  name: "minScore",
2145
2179
  label: "Minimum Score (0-100)",
@@ -2166,6 +2200,7 @@ var export_ClaudeModel = import_types.ClaudeModel;
2166
2200
  export {
2167
2201
  AVAILABLE_MODEL_IDS,
2168
2202
  AVAILABLE_RUN_COMMANDS,
2203
+ AVAILABLE_TOOL_NAMES,
2169
2204
  AgentRunCommand,
2170
2205
  AgentRunCommandSchema,
2171
2206
  AgentSchema,
@@ -2283,6 +2318,8 @@ export {
2283
2318
  TimeAssertionSchema,
2284
2319
  TimeConfigSchema,
2285
2320
  TokenUsageSchema,
2321
+ ToolCalledWithParamAssertionSchema,
2322
+ ToolCalledWithParamConfigSchema,
2286
2323
  ToolTestSchema,
2287
2324
  TriggerMetadataSchema,
2288
2325
  TriggerSchema,