npm - @wix/evalforge-types - Versions diffs - 0.38.0 → 0.40.0 - Mend

@wix/evalforge-types 0.38.0 → 0.40.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11)

package/build/index.js +57 -17
package/build/index.js.map +3 -3
package/build/index.mjs +54 -17
package/build/index.mjs.map +3 -3
package/build/types/assertion/assertion.d.ts +34 -5
package/build/types/assertion/system-assertions.d.ts +1 -0
package/build/types/common/index.d.ts +1 -0
package/build/types/common/tool-names.d.ts +1 -0
package/build/types/scenario/assertions.d.ts +16 -2
package/build/types/scenario/test-scenario.d.ts +12 -3
package/package.json +2 -2

package/build/index.js (changed)

@@ -916,6 +916,7 @@ var index_exports = {};
 __export(index_exports, {
   AVAILABLE_MODEL_IDS: () => AVAILABLE_MODEL_IDS,
   AVAILABLE_RUN_COMMANDS: () => AVAILABLE_RUN_COMMANDS,
+  AVAILABLE_TOOL_NAMES: () => AVAILABLE_TOOL_NAMES,
   AgentRunCommand: () => AgentRunCommand,
   AgentRunCommandSchema: () => AgentRunCommandSchema,
   AgentSchema: () => AgentSchema,
@@ -1033,6 +1034,8 @@ __export(index_exports, {
   TimeAssertionSchema: () => TimeAssertionSchema,
   TimeConfigSchema: () => TimeConfigSchema,
   TokenUsageSchema: () => TokenUsageSchema,
+  ToolCalledWithParamAssertionSchema: () => ToolCalledWithParamAssertionSchema,
+  ToolCalledWithParamConfigSchema: () => ToolCalledWithParamConfigSchema,
   ToolTestSchema: () => ToolTestSchema,
   TriggerMetadataSchema: () => TriggerMetadataSchema,
   TriggerSchema: () => TriggerSchema,
@@ -1148,6 +1151,17 @@ var RuleInputBaseSchema = RuleSchema.omit({
 var CreateRuleInputSchema = RuleInputBaseSchema;
 var UpdateRuleInputSchema = RuleInputBaseSchema.partial();
+// src/common/tool-names.ts
+var AVAILABLE_TOOL_NAMES = [
+  "Bash",
+  "Edit",
+  "Glob",
+  "Grep",
+  "Read",
+  "Skill",
+  "Write"
+];
 // src/target/target.ts
 var TargetSchema = TenantEntitySchema.extend({
   // Base for all testable entities
@@ -1486,6 +1500,13 @@ var SkillWasCalledAssertionSchema = import_zod21.z.object({
   /** Names of the skills that must have been called (matched against trace Skill tool args) */
   skillNames: import_zod21.z.array(import_zod21.z.string().min(1)).min(1)
 });
+var ToolCalledWithParamAssertionSchema = import_zod21.z.object({
+  type: import_zod21.z.literal("tool_called_with_param"),
+  /** Name of the tool that must have been called */
+  toolName: import_zod21.z.string().min(1),
+  /** JSON string of key-value pairs for expected parameters (substring match) */
+  expectedParams: import_zod21.z.string().min(1)
+});
 var BuildPassedAssertionSchema = import_zod21.z.object({
   type: import_zod21.z.literal("build_passed"),
   /** Command to run (default: "yarn build") */
@@ -1502,8 +1523,6 @@ var LlmJudgeAssertionSchema = import_zod21.z.object({
   type: import_zod21.z.literal("llm_judge"),
   /** Prompt template; placeholders: {{output}}, {{cwd}}, {{changedFiles}}, {{trace}} */
   prompt: import_zod21.z.string(),
-  /** Optional system prompt for the judge (default asks for JSON with score) */
-  systemPrompt: import_zod21.z.string().optional(),
   /** Minimum score to pass (0-100, default 70) */
   minScore: import_zod21.z.number().int().min(0).max(100).optional(),
   /** Model for the judge (e.g. claude-3-5-haiku) */
@@ -1518,6 +1537,7 @@ var TimeAssertionSchema = import_zod21.z.object({
 });
 var AssertionSchema = import_zod21.z.union([
   SkillWasCalledAssertionSchema,
+  ToolCalledWithParamAssertionSchema,
   BuildPassedAssertionSchema,
   TimeAssertionSchema,
   CostAssertionSchema,
@@ -1565,6 +1585,7 @@ var import_zod24 = require("zod");
 var import_zod23 = require("zod");
 var AssertionTypeSchema = import_zod23.z.enum([
   "skill_was_called",
+  "tool_called_with_param",
   "build_passed",
   "time_limit",
   "cost",
@@ -1606,6 +1627,12 @@ var CostConfigSchema = import_zod23.z.strictObject({
   /** Maximum allowed cost in USD */
   maxCostUsd: import_zod23.z.number().positive()
 });
+var ToolCalledWithParamConfigSchema = import_zod23.z.strictObject({
+  /** Name of the tool that must have been called */
+  toolName: import_zod23.z.string().min(1),
+  /** JSON string of key-value pairs for expected parameters (substring match) */
+  expectedParams: import_zod23.z.string().min(1)
+});
 var BuildPassedConfigSchema = import_zod23.z.strictObject({
   /** Command to run (default: "yarn build") */
   command: import_zod23.z.string().optional(),
@@ -1628,8 +1655,6 @@ var LlmJudgeConfigSchema = import_zod23.z.object({
    * - Custom parameters defined in the parameters array
    */
   prompt: import_zod23.z.string().min(1),
-  /** Optional system prompt for the judge */
-  systemPrompt: import_zod23.z.string().optional(),
   /** Minimum score to pass (0-100, default 70) */
   minScore: import_zod23.z.number().int().min(0).max(100).optional(),
   /** Model for the judge (e.g. claude-3-5-haiku-20241022) */
@@ -1646,6 +1671,8 @@ var AssertionConfigSchema = import_zod23.z.union([
   // requires prompt - check first
   SkillWasCalledConfigSchema,
   // requires skillNames
+  ToolCalledWithParamConfigSchema,
+  // requires toolName + expectedParams, uses strictObject
   TimeConfigSchema,
   // requires maxDurationMs, uses strictObject
   CostConfigSchema,
@@ -1674,6 +1701,8 @@ function validateAssertionConfig(type, config) {
       return SkillWasCalledConfigSchema.safeParse(config).success;
     case "cost":
       return CostConfigSchema.safeParse(config).success;
+    case "tool_called_with_param":
+      return ToolCalledWithParamConfigSchema.safeParse(config).success;
     case "build_passed":
       return BuildPassedConfigSchema.safeParse(config).success;
     case "time_limit":
@@ -2182,6 +2211,7 @@ var UpdateTemplateInputSchema = CreateTemplateInputSchema.partial();
 // src/assertion/system-assertions.ts
 var SYSTEM_ASSERTION_IDS = {
   SKILL_WAS_CALLED: "system:skill_was_called",
+  TOOL_CALLED_WITH_PARAM: "system:tool_called_with_param",
   BUILD_PASSED: "system:build_passed",
   TIME_LIMIT: "system:time_limit",
   COST: "system:cost",
@@ -2205,6 +2235,26 @@ var SYSTEM_ASSERTIONS = {
       }
     ]
   },
+  [SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM]: {
+    id: SYSTEM_ASSERTION_IDS.TOOL_CALLED_WITH_PARAM,
+    name: "Tool Called With Param",
+    description: "Check that a tool was called with expected parameters",
+    type: "tool_called_with_param",
+    parameters: [
+      {
+        name: "toolName",
+        label: "Tool Name",
+        type: "string",
+        required: true
+      },
+      {
+        name: "expectedParams",
+        label: "Expected Parameters (JSON, substring match)",
+        type: "string",
+        required: true
+      }
+    ]
+  },
   [SYSTEM_ASSERTION_IDS.BUILD_PASSED]: {
     id: SYSTEM_ASSERTION_IDS.BUILD_PASSED,
     name: "Build Passed",
@@ -2284,19 +2334,6 @@ var SYSTEM_ASSERTIONS = {
         required: true,
         defaultValue: "Verify the output meets the acceptance criteria."
       },
-      {
-        name: "systemPrompt",
-        label: "System Prompt (optional)",
-        type: "string",
-        required: false,
-        defaultValue: `You are judging a scenario run. Use these values:
-- {{output}}: the agent's final output
-- {{cwd}}: working directory
-- {{changedFiles}}: list of files changed (or "No files were changed")
-- {{trace}}: step-by-step trace (tool calls, completions) to check e.g. which tools were called and how many times
-Judge how well the output meets the acceptance criteria stated in the user prompt.`
-      },
       {
         name: "minScore",
         label: "Minimum Score (0-100)",
@@ -2323,6 +2360,7 @@ function getSystemAssertion(id) {
 0 && (module.exports = {
   AVAILABLE_MODEL_IDS,
   AVAILABLE_RUN_COMMANDS,
+  AVAILABLE_TOOL_NAMES,
   AgentRunCommand,
   AgentRunCommandSchema,
   AgentSchema,
@@ -2440,6 +2478,8 @@ function getSystemAssertion(id) {
   TimeAssertionSchema,
   TimeConfigSchema,
   TokenUsageSchema,
+  ToolCalledWithParamAssertionSchema,
+  ToolCalledWithParamConfigSchema,
   ToolTestSchema,
   TriggerMetadataSchema,
   TriggerSchema,