npm - agentv - Versions diffs - 0.26.0 → 1.0.0 - Mend

agentv 0.26.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/{chunk-6ZM7WVSC.js → chunk-RIJO5WBF.js} +13 -13
package/dist/chunk-RIJO5WBF.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/cli.js.map +1 -1
package/dist/index.js +1 -1
package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +36 -19
package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +217 -217
package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +94 -2
package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +8 -8
package/package.json +1 -1
package/dist/chunk-6ZM7WVSC.js.map +0 -1
package/dist/templates/agentv/.env.template +0 -23

package/dist/cli.js CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import {
   runCli
-} from "./chunk-6ZM7WVSC.js";
+} from "./chunk-RIJO5WBF.js";
 import "./chunk-UE4GLFVL.js";
 // src/cli.ts

package/dist/cli.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\r\nimport { runCli } from './index.js';\r\n\r\nrunCli().catch((error) => {\r\n console.error(error);\r\n process.exit(1);\r\n});\r\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}
1	+ {"version":3,"sources":["../src/cli.ts"],"sourcesContent":["#!/usr/bin/env node\nimport { runCli } from './index.js';\n\nrunCli().catch((error) => {\n console.error(error);\n process.exit(1);\n});\n"],"mappings":";;;;;;;AAGA,OAAO,EAAE,MAAM,CAAC,UAAU;AACxB,UAAQ,MAAM,KAAK;AACnB,UAAQ,KAAK,CAAC;AAChB,CAAC;","names":[]}

package/dist/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   app,
   runCli
-} from "./chunk-6ZM7WVSC.js";
+} from "./chunk-RIJO5WBF.js";
 import "./chunk-UE4GLFVL.js";
 export {
   app,

package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md CHANGED Viewed

@@ -61,8 +61,42 @@ execution:
       model: gpt-5-chat
 ```
-### Evaluator Chaining
-Evaluators run sequentially:
+### Tool Trajectory Evaluators
+Validate agent tool usage patterns (requires trace data from provider):
+```yaml
+execution:
+  evaluators:
+    - name: research_check
+      type: tool_trajectory
+      mode: any_order       # Options: any_order, in_order, exact
+      minimums:             # For any_order mode
+        knowledgeSearch: 2
+      expected:             # For in_order/exact modes
+        - tool: knowledgeSearch
+        - tool: documentRetrieve
+```
+See `references/tool-trajectory-evaluator.md` for modes and configuration.
+### Expected Tool Calls Evaluators
+Validate tool calls and inputs inline with conversation flow:
+```yaml
+expected_messages:
+  - role: assistant
+    tool_calls:
+      - tool: getMetrics
+        input: { server: "prod-1" }
+execution:
+  evaluators:
+    - name: input_check
+      type: expected_tool_calls
+```
+### Multiple Evaluators
+Define multiple evaluators to run sequentially. The final score is a weighted average of all results.
 ```yaml
 execution:
@@ -119,23 +153,6 @@ execution:
 See `references/composite-evaluator.md` for aggregation types and patterns.
-### Tool Trajectory Evaluator
-Validate agent tool usage from trace data:
-```yaml
-execution:
-  evaluators:
-    - name: workflow-check
-      type: tool_trajectory
-      mode: in_order  # or: any_order, exact
-      expected:
-        - tool: fetchData
-        - tool: processData
-        - tool: saveResults
-```
-See `references/tool-trajectory-evaluator.md` for modes and configuration.
 ## Example
 ```yaml
 $schema: agentv-eval-v2

package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json CHANGED Viewed

@@ -1,217 +1,217 @@
-{
-  "$schema": "http://json-schema.org/draft-07/schema#",
-  "title": "AgentV Eval Schema",
-  "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
-  "type": "object",
-  "properties": {
-    "$schema": {
-      "type": "string",
-      "description": "Schema identifier",
-      "enum": ["agentv-eval-v2"]
-    },
-    "description": {
-      "type": "string",
-      "description": "Description of what this eval suite covers"
-    },
-    "target": {
-      "type": "string",
-      "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
-    },
-    "execution": {
-      "type": "object",
-      "description": "Default execution configuration for all eval cases (can be overridden per case)",
-      "properties": {
-        "target": {
-          "type": "string",
-          "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
-        },
-        "evaluators": {
-          "type": "array",
-          "description": "Default evaluators for all eval cases (code-based and LLM judges)",
-          "items": {
-            "type": "object",
-            "properties": {
-              "name": {
-                "type": "string",
-                "description": "Evaluator name/identifier"
-              },
-              "type": {
-                "type": "string",
-                "enum": ["code", "llm_judge"],
-                "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
-              },
-              "script": {
-                "type": "string",
-                "description": "Path to evaluator script (for type: code)"
-              },
-              "prompt": {
-                "type": "string",
-                "description": "Path to judge prompt file (for type: llm_judge)"
-              }
-            },
-            "required": ["name", "type"],
-            "additionalProperties": true
-          }
-        }
-      },
-      "additionalProperties": true
-    },
-    "evalcases": {
-      "type": "array",
-      "description": "Array of evaluation cases",
-      "minItems": 1,
-      "items": {
-        "type": "object",
-        "properties": {
-          "id": {
-            "type": "string",
-            "description": "Unique identifier for the eval case"
-          },
-          "conversation_id": {
-            "type": "string",
-            "description": "Optional conversation identifier for threading multiple eval cases together"
-          },
-          "expected_outcome": {
-            "type": "string",
-            "description": "Description of what the AI should accomplish in this eval"
-          },
-          "note": {
-            "type": "string",
-            "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
-          },
-          "input_messages": {
-            "type": "array",
-            "description": "Input messages for the conversation",
-            "minItems": 1,
-            "items": {
-              "type": "object",
-              "properties": {
-                "role": {
-                  "type": "string",
-                  "enum": ["system", "user", "assistant", "tool"],
-                  "description": "Message role"
-                },
-                "content": {
-                  "oneOf": [
-                    {
-                      "type": "string",
-                      "description": "Simple text content"
-                    },
-                    {
-                      "type": "array",
-                      "description": "Mixed content items (text and file references)",
-                      "items": {
-                        "type": "object",
-                        "properties": {
-                          "type": {
-                            "type": "string",
-                            "enum": ["text", "file"],
-                            "description": "Content type: 'text' for inline content, 'file' for file references"
-                          },
-                          "value": {
-                            "type": "string",
-                            "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
-                          }
-                        },
-                        "required": ["type", "value"],
-                        "additionalProperties": false
-                      }
-                    }
-                  ]
-                }
-              },
-              "required": ["role", "content"],
-              "additionalProperties": false
-            }
-          },
-          "expected_messages": {
-            "type": "array",
-            "description": "Expected response messages",
-            "minItems": 1,
-            "items": {
-              "type": "object",
-              "properties": {
-                "role": {
-                  "type": "string",
-                  "enum": ["system", "user", "assistant", "tool"],
-                  "description": "Message role"
-                },
-                "content": {
-                  "oneOf": [
-                    {
-                      "type": "string",
-                      "description": "Simple text content"
-                    },
-                    {
-                      "type": "array",
-                      "description": "Mixed content items",
-                      "items": {
-                        "type": "object",
-                        "properties": {
-                          "type": {
-                            "type": "string",
-                            "enum": ["text", "file"]
-                          },
-                          "value": {
-                            "type": "string"
-                          }
-                        },
-                        "required": ["type", "value"],
-                        "additionalProperties": false
-                      }
-                    }
-                  ]
-                }
-              },
-              "required": ["role", "content"],
-              "additionalProperties": false
-            }
-          },
-          "execution": {
-            "type": "object",
-            "description": "Per-case execution configuration",
-            "properties": {
-              "target": {
-                "type": "string",
-                "description": "Override target for this specific eval case"
-              },
-              "evaluators": {
-                "type": "array",
-                "description": "Multiple evaluators (code-based and LLM judges)",
-                "items": {
-                  "type": "object",
-                  "properties": {
-                    "name": {
-                      "type": "string",
-                      "description": "Evaluator name/identifier"
-                    },
-                    "type": {
-                      "type": "string",
-                      "enum": ["code", "llm_judge"],
-                      "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
-                    },
-                    "script": {
-                      "type": "string",
-                      "description": "Path to evaluator script (for type: code)"
-                    },
-                    "prompt": {
-                      "type": "string",
-                      "description": "Path to judge prompt file (for type: llm_judge)"
-                    }
-                  },
-                  "required": ["name", "type"],
-                  "additionalProperties": true
-                }
-              }
-            },
-            "additionalProperties": true
-          }
-        },
-        "required": ["id", "expected_outcome", "input_messages", "expected_messages"],
-        "additionalProperties": false
-      }
-    }
-  },
-  "required": ["evalcases"],
-  "additionalProperties": false
-}
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AgentV Eval Schema",
+  "description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
+  "type": "object",
+  "properties": {
+    "$schema": {
+      "type": "string",
+      "description": "Schema identifier",
+      "enum": ["agentv-eval-v2"]
+    },
+    "description": {
+      "type": "string",
+      "description": "Description of what this eval suite covers"
+    },
+    "target": {
+      "type": "string",
+      "description": "(Deprecated: use execution.target instead) Default target configuration name. Can be overridden per eval case."
+    },
+    "execution": {
+      "type": "object",
+      "description": "Default execution configuration for all eval cases (can be overridden per case)",
+      "properties": {
+        "target": {
+          "type": "string",
+          "description": "Default target configuration name (e.g., default, azure_base, vscode_projectx). Can be overridden per eval case."
+        },
+        "evaluators": {
+          "type": "array",
+          "description": "Default evaluators for all eval cases (code-based and LLM judges)",
+          "items": {
+            "type": "object",
+            "properties": {
+              "name": {
+                "type": "string",
+                "description": "Evaluator name/identifier"
+              },
+              "type": {
+                "type": "string",
+                "enum": ["code", "llm_judge"],
+                "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
+              },
+              "script": {
+                "type": "string",
+                "description": "Path to evaluator script (for type: code)"
+              },
+              "prompt": {
+                "type": "string",
+                "description": "Path to judge prompt file (for type: llm_judge)"
+              }
+            },
+            "required": ["name", "type"],
+            "additionalProperties": true
+          }
+        }
+      },
+      "additionalProperties": true
+    },
+    "evalcases": {
+      "type": "array",
+      "description": "Array of evaluation cases",
+      "minItems": 1,
+      "items": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string",
+            "description": "Unique identifier for the eval case"
+          },
+          "conversation_id": {
+            "type": "string",
+            "description": "Optional conversation identifier for threading multiple eval cases together"
+          },
+          "expected_outcome": {
+            "type": "string",
+            "description": "Description of what the AI should accomplish in this eval"
+          },
+          "note": {
+            "type": "string",
+            "description": "Optional note or additional context for the eval case. Use this to document test-specific considerations, known limitations, or rationale for expected behavior."
+          },
+          "input_messages": {
+            "type": "array",
+            "description": "Input messages for the conversation",
+            "minItems": 1,
+            "items": {
+              "type": "object",
+              "properties": {
+                "role": {
+                  "type": "string",
+                  "enum": ["system", "user", "assistant", "tool"],
+                  "description": "Message role"
+                },
+                "content": {
+                  "oneOf": [
+                    {
+                      "type": "string",
+                      "description": "Simple text content"
+                    },
+                    {
+                      "type": "array",
+                      "description": "Mixed content items (text and file references)",
+                      "items": {
+                        "type": "object",
+                        "properties": {
+                          "type": {
+                            "type": "string",
+                            "enum": ["text", "file"],
+                            "description": "Content type: 'text' for inline content, 'file' for file references"
+                          },
+                          "value": {
+                            "type": "string",
+                            "description": "Text content or file path. Relative paths (e.g., ../prompts/file.md) are resolved from eval file directory. Absolute paths (e.g., /docs/examples/prompts/file.md) are resolved from repo root."
+                          }
+                        },
+                        "required": ["type", "value"],
+                        "additionalProperties": false
+                      }
+                    }
+                  ]
+                }
+              },
+              "required": ["role", "content"],
+              "additionalProperties": false
+            }
+          },
+          "expected_messages": {
+            "type": "array",
+            "description": "Expected response messages",
+            "minItems": 1,
+            "items": {
+              "type": "object",
+              "properties": {
+                "role": {
+                  "type": "string",
+                  "enum": ["system", "user", "assistant", "tool"],
+                  "description": "Message role"
+                },
+                "content": {
+                  "oneOf": [
+                    {
+                      "type": "string",
+                      "description": "Simple text content"
+                    },
+                    {
+                      "type": "array",
+                      "description": "Mixed content items",
+                      "items": {
+                        "type": "object",
+                        "properties": {
+                          "type": {
+                            "type": "string",
+                            "enum": ["text", "file"]
+                          },
+                          "value": {
+                            "type": "string"
+                          }
+                        },
+                        "required": ["type", "value"],
+                        "additionalProperties": false
+                      }
+                    }
+                  ]
+                }
+              },
+              "required": ["role", "content"],
+              "additionalProperties": false
+            }
+          },
+          "execution": {
+            "type": "object",
+            "description": "Per-case execution configuration",
+            "properties": {
+              "target": {
+                "type": "string",
+                "description": "Override target for this specific eval case"
+              },
+              "evaluators": {
+                "type": "array",
+                "description": "Multiple evaluators (code-based and LLM judges)",
+                "items": {
+                  "type": "object",
+                  "properties": {
+                    "name": {
+                      "type": "string",
+                      "description": "Evaluator name/identifier"
+                    },
+                    "type": {
+                      "type": "string",
+                      "enum": ["code", "llm_judge"],
+                      "description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
+                    },
+                    "script": {
+                      "type": "string",
+                      "description": "Path to evaluator script (for type: code)"
+                    },
+                    "prompt": {
+                      "type": "string",
+                      "description": "Path to judge prompt file (for type: llm_judge)"
+                    }
+                  },
+                  "required": ["name", "type"],
+                  "additionalProperties": true
+                }
+              }
+            },
+            "additionalProperties": true
+          }
+        },
+        "required": ["id", "expected_outcome", "input_messages", "expected_messages"],
+        "additionalProperties": false
+      }
+    }
+  },
+  "required": ["evalcases"],
+  "additionalProperties": false
+}

package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md CHANGED Viewed

@@ -78,13 +78,12 @@ evalcases:
     execution:
       evaluators:
         - name: json_format_validator
-          type: code
+          type: code_judge
           script: uv run validate_json.py
           cwd: ./evaluators
         - name: content_evaluator
           type: llm_judge
           prompt: ./judges/semantic_correctness.md
-          model: gpt-5-chat
     input_messages:
       - role: user
@@ -102,6 +101,99 @@ evalcases:
           }
 ```
+## Tool Trajectory Evaluation
+Validate that an agent uses specific tools during execution.
+```yaml
+$schema: agentv-eval-v2
+description: Tool usage validation
+target: mock_agent
+evalcases:
+  # Validate minimum tool usage (order doesn't matter)
+  - id: research-depth
+    expected_outcome: Agent researches thoroughly
+    input_messages:
+      - role: user
+        content: Research REST vs GraphQL
+    execution:
+      evaluators:
+        - name: research-check
+          type: tool_trajectory
+          mode: any_order
+          minimums:
+            knowledgeSearch: 2
+            documentRetrieve: 1
+  # Validate exact tool sequence
+  - id: auth-flow
+    expected_outcome: Agent follows auth sequence
+    input_messages:
+      - role: user
+        content: Authenticate user
+    execution:
+      evaluators:
+        - name: auth-sequence
+          type: tool_trajectory
+          mode: exact
+          expected:
+            - tool: checkCredentials
+            - tool: generateToken
+```
+## Expected Messages with Tool Calls
+Validate precise tool inputs inline with expected messages.
+```yaml
+$schema: agentv-eval-v2
+description: Tool input validation
+target: mock_agent
+evalcases:
+  - id: precise-inputs
+    expected_outcome: Agent calls tools with correct parameters
+    input_messages:
+      - role: user
+        content: Check CPU metrics for prod-1
+    expected_messages:
+      - role: assistant
+        content: Checking metrics...
+        tool_calls:
+          - tool: getCpuMetrics
+            input: { server: "prod-1" }
+    execution:
+      evaluators:
+        - name: input-validator
+          type: expected_tool_calls
+```
+## Static Trace Evaluation
+Evaluate pre-existing trace files without running an agent.
+```yaml
+$schema: agentv-eval-v2
+description: Static trace evaluation
+target: static_trace
+evalcases:
+  - id: validate-trace-file
+    expected_outcome: Trace contains required steps
+    input_messages:
+      - role: user
+        content: Analyze trace
+    execution:
+      evaluators:
+        - name: trace-check
+          type: tool_trajectory
+          mode: in_order
+          expected:
+            - tool: webSearch
+            - tool: readFile
+```
 ## Multi-Turn Conversation (Single Eval Case)
 ```yaml