npm - agentv - Versions diffs - 1.2.0 → 1.5.0 - Mend

agentv 1.2.0 → 1.5.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (20)

package/dist/cli.js CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import {
   runCli
-} from "./chunk-IVIT4U6S.js";
+} from "./chunk-3RYQPI4H.js";
 import "./chunk-UE4GLFVL.js";
 // src/cli.ts

package/dist/index.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   app,
   runCli
-} from "./chunk-IVIT4U6S.js";
+} from "./chunk-3RYQPI4H.js";
 import "./chunk-UE4GLFVL.js";
 export {
   app,

package/dist/templates/.agentv/.env.template CHANGED Viewed

@@ -1,23 +1,23 @@
-# Copy this file to .env and fill in your credentials
-# Azure OpenAI Configuration
-AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
-AZURE_OPENAI_API_KEY=your-openai-api-key-here
-AZURE_DEPLOYMENT_NAME=gpt-5-chat
-AZURE_OPENAI_API_VERSION=2024-12-01-preview
-# Google Gemini
-GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
-GEMINI_MODEL_NAME=gemini-2.5-flash
-# Anthropic
-ANTHROPIC_API_KEY=your-anthropic-api-key-here
-# VS Code Workspace Paths for Execution Targets
-# Note: Using forward slashes is recommended for paths in .env files
-# to avoid issues with escape characters.
-PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
-# CLI provider sample (used by the local_cli target)
-CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
-LOCAL_AGENT_TOKEN=dummytoken
+# Copy this file to .env and fill in your credentials
+# Azure OpenAI Configuration
+AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
+AZURE_OPENAI_API_KEY=your-openai-api-key-here
+AZURE_DEPLOYMENT_NAME=gpt-5-chat
+AZURE_OPENAI_API_VERSION=2024-12-01-preview
+# Google Gemini
+GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
+GEMINI_MODEL_NAME=gemini-2.5-flash
+# Anthropic
+ANTHROPIC_API_KEY=your-anthropic-api-key-here
+# VS Code Workspace Paths for Execution Targets
+# Note: Using forward slashes is recommended for paths in .env files
+# to avoid issues with escape characters.
+PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
+# CLI provider sample (used by the local_cli target)
+CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
+LOCAL_AGENT_TOKEN=dummytoken

package/dist/templates/.agentv/config.yaml CHANGED Viewed

@@ -1,15 +1,15 @@
-$schema: agentv-config-v2
-# Customize which files are treated as guidelines vs regular file content
-# Custom guideline patterns:
-guideline_patterns:
-  - "**/*.instructions.md"
-  - "**/*.prompt.md"
-  - "**/SKILL.md"
-# Notes:
-# - Patterns use standard glob syntax (via micromatch library)
-# - Paths are normalized to forward slashes for cross-platform compatibility
-# - Only files matching these patterns are loaded as guidelines
-# - All other files referenced in eval cases are treated as regular file content
+$schema: agentv-config-v2
+# Customize which files are treated as guidelines vs regular file content
+# Custom guideline patterns:
+guideline_patterns:
+  - "**/*.instructions.md"
+  - "**/*.prompt.md"
+  - "**/SKILL.md"
+# Notes:
+# - Patterns use standard glob syntax (via micromatch library)
+# - Paths are normalized to forward slashes for cross-platform compatibility
+# - Only files matching these patterns are loaded as guidelines
+# - All other files referenced in eval cases are treated as regular file content

package/dist/templates/.agentv/targets.yaml CHANGED Viewed

@@ -1,73 +1,71 @@
-$schema: agentv-targets-v2.2
-# A list of all supported evaluation targets for the project.
-# Each target defines a provider and its specific configuration.
-# Actual values for paths/keys are stored in the local .env file.
-targets:
-  - name: default
-    provider: azure
-    endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
-    api_key: ${{ AZURE_OPENAI_API_KEY }}
-    model: ${{ AZURE_DEPLOYMENT_NAME }}
-    # version: ${{ AZURE_OPENAI_API_VERSION }}  # Optional: uncomment to override default (2024-12-01-preview)
-  - name: vscode
-    provider: vscode
-    judge_target: azure_base
-  - name: codex
-    provider: codex
-    judge_target: azure_base
-    # Uses the Codex CLI (defaults to `codex` on PATH)
-    # executable: ${{ CODEX_CLI_PATH }}        # Optional: override executable path
-    # args:                             # Optional additional CLI arguments
-    #   - --profile
-    #   - ${{ CODEX_PROFILE }}
-    #   - --model
-    #   - ${{ CODEX_MODEL }}
-    #   - --ask-for-approval
-    #   - ${{ CODEX_APPROVAL_PRESET }}
-    timeout_seconds: 180
-    cwd: ${{ CODEX_WORKSPACE_DIR }}            # Where scratch workspaces are created
-    log_dir: ${{ CODEX_LOG_DIR }}              # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex)
-    log_format: json                    # Optional: 'summary' (default) or 'json' for raw event logs
-  - name: vscode_projectx
-    provider: vscode
-    workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
-    provider_batching: false
-    judge_target: azure_base
-  - name: vscode_insiders_projectx
-    provider: vscode-insiders
-    workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
-    provider_batching: false
-    judge_target: azure_base
-  - name: azure_base
-    provider: azure
-    endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
-    api_key: ${{ AZURE_OPENAI_API_KEY }}
-    model: ${{ AZURE_DEPLOYMENT_NAME }}
-    version: ${{ AZURE_OPENAI_API_VERSION }}
-  - name: gemini_base
-    provider: gemini
-    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
-    model: ${{ GEMINI_MODEL_NAME }}
-  - name: local_cli
-    provider: cli
-    judge_target: azure_base
-    # Passes the fully rendered prompt and any attached files to a local Python script
-    # NOTE: Do not add quotes around {PROMPT} or {FILES} - they are already shell-escaped
-    command_template: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE}
-    # Format for each file in {FILES}. {path} and {basename} are automatically shell-escaped, so no quotes needed
-    files_format: --file {path}
-    # Optional working directory resolved from .env
-    cwd: ${{ CLI_EVALS_DIR }}
-    timeout_seconds: 30
-    healthcheck:
-      type: command
-      command_template: uv run ./mock_cli.py --healthcheck
+# A list of all supported evaluation targets for the project.
+# Each target defines a provider and its specific configuration.
+# Actual values for paths/keys are stored in the local .env file.
+targets:
+  - name: default
+    provider: azure
+    endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
+    api_key: ${{ AZURE_OPENAI_API_KEY }}
+    model: ${{ AZURE_DEPLOYMENT_NAME }}
+    # version: ${{ AZURE_OPENAI_API_VERSION }}  # Optional: uncomment to override default (2024-12-01-preview)
+  - name: vscode
+    provider: vscode
+    judge_target: azure_base
+  - name: codex
+    provider: codex
+    judge_target: azure_base
+    # Uses the Codex CLI (defaults to `codex` on PATH)
+    # executable: ${{ CODEX_CLI_PATH }}        # Optional: override executable path
+    # args:                             # Optional additional CLI arguments
+    #   - --profile
+    #   - ${{ CODEX_PROFILE }}
+    #   - --model
+    #   - ${{ CODEX_MODEL }}
+    #   - --ask-for-approval
+    #   - ${{ CODEX_APPROVAL_PRESET }}
+    timeout_seconds: 180
+    cwd: ${{ CODEX_WORKSPACE_DIR }}            # Where scratch workspaces are created
+    log_dir: ${{ CODEX_LOG_DIR }}              # Optional: where Codex CLI stream logs are stored (defaults to ./.agentv/logs/codex)
+    log_format: json                    # Optional: 'summary' (default) or 'json' for raw event logs
+  - name: vscode_projectx
+    provider: vscode
+    workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
+    provider_batching: false
+    judge_target: azure_base
+  - name: vscode_insiders_projectx
+    provider: vscode-insiders
+    workspace_template: ${{ PROJECTX_WORKSPACE_PATH }}
+    provider_batching: false
+    judge_target: azure_base
+  - name: azure_base
+    provider: azure
+    endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
+    api_key: ${{ AZURE_OPENAI_API_KEY }}
+    model: ${{ AZURE_DEPLOYMENT_NAME }}
+    version: ${{ AZURE_OPENAI_API_VERSION }}
+  - name: gemini_base
+    provider: gemini
+    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
+    model: ${{ GEMINI_MODEL_NAME }}
+  - name: local_cli
+    provider: cli
+    judge_target: azure_base
+    # Passes the fully rendered prompt and any attached files to a local Python script
+    # NOTE: Do not add quotes around {PROMPT} or {FILES} - they are already shell-escaped
+    command_template: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE}
+    # Format for each file in {FILES}. {path} and {basename} are automatically shell-escaped, so no quotes needed
+    files_format: --file {path}
+    # Optional working directory resolved from .env
+    cwd: ${{ CLI_EVALS_DIR }}
+    timeout_seconds: 30
+    healthcheck:
+      type: command
+      command_template: uv run ./mock_cli.py --healthcheck

package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md CHANGED Viewed

@@ -1,174 +1,212 @@
----
-name: agentv-eval-builder
-description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding eval cases, or configuring custom evaluators (code validators or LLM judges) for agent testing workflows.
----
-# AgentV Eval Builder
-## Schema Reference
-- Schema: `references/eval-schema.json` (JSON Schema for validation and tooling)
-- Format: YAML with structured content arrays
-- Examples: `references/example-evals.md`
-## Feature Reference
-- Rubrics: `references/rubric-evaluator.md` - Structured criteria-based evaluation
-- Composite Evaluators: `references/composite-evaluator.md` - Combine multiple evaluators
-- Tool Trajectory: `references/tool-trajectory-evaluator.md` - Validate agent tool usage
-- Custom Evaluators: `references/custom-evaluators.md` - Code and LLM judge templates
-## Structure Requirements
-- Root level: `description` (optional), `target` (optional), `execution` (optional), `evalcases` (required)
-- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required)
-- Optional fields: `expected_messages`, `conversation_id`, `rubrics`, `execution`
-- `expected_messages` is optional - omit for outcome-only evaluation where the LLM judge evaluates based on `expected_outcome` criteria alone
-- Message fields: `role` (required), `content` (required)
-- Message roles: `system`, `user`, `assistant`, `tool`
-- Content types: `text` (inline), `file` (relative or absolute path)
-- Attachments (type: `file`) should default to the `user` role
-- File paths: Relative (from eval file dir) or absolute with "/" prefix (from repo root)
-## Custom Evaluators
-Configure multiple evaluators per eval case via `execution.evaluators` array.
-### Code Evaluators
-Scripts that validate output programmatically:
-```yaml
-execution:
-  evaluators:
-    - name: json_format_validator
-      type: code_judge
-      script: uv run validate_output.py
-      cwd: ../../evaluators/scripts
-```
-**Contract:**
-- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_files` (file paths), `input_files` (file paths, excludes guidelines), `input_messages`
-- Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
-**Template:** See `references/custom-evaluators.md` for Python code evaluator template
-### LLM Judges
-Language models evaluate response quality:
-```yaml
-execution:
-  evaluators:
-    - name: content_evaluator
-      type: llm_judge
-      prompt: /evaluators/prompts/correctness.md
-      model: gpt-5-chat
-```
-### Tool Trajectory Evaluators
-Validate agent tool usage patterns (requires trace data from provider):
-```yaml
-execution:
-  evaluators:
-    - name: research_check
-      type: tool_trajectory
-      mode: any_order       # Options: any_order, in_order, exact
-      minimums:             # For any_order mode
-        knowledgeSearch: 2
-      expected:             # For in_order/exact modes
-        - tool: knowledgeSearch
-        - tool: documentRetrieve
-```
-See `references/tool-trajectory-evaluator.md` for modes and configuration.
-### Multiple Evaluators
-Define multiple evaluators to run sequentially. The final score is a weighted average of all results.
-```yaml
-execution:
-  evaluators:
-    - name: format_check      # Runs first
-      type: code_judge
-      script: uv run validate_json.py
-    - name: content_check     # Runs second
-      type: llm_judge
-```
-### Rubric Evaluator
-Inline rubrics for structured criteria-based evaluation:
-```yaml
-evalcases:
-  - id: explanation-task
-    expected_outcome: Clear explanation of quicksort
-    input_messages:
-      - role: user
-        content: Explain quicksort
-    rubrics:
-      - Mentions divide-and-conquer approach
-      - Explains the partition step
-      - id: complexity
-        description: States time complexity correctly
-        weight: 2.0
-        required: true
-```
-See `references/rubric-evaluator.md` for detailed rubric configuration.
-### Composite Evaluator
-Combine multiple evaluators with aggregation:
-```yaml
-execution:
-  evaluators:
-    - name: release_gate
-      type: composite
-      evaluators:
-        - name: safety
-          type: llm_judge
-          prompt: ./prompts/safety.md
-        - name: quality
-          type: llm_judge
-          prompt: ./prompts/quality.md
-      aggregator:
-        type: weighted_average
-        weights:
-          safety: 0.3
-          quality: 0.7
-```
-See `references/composite-evaluator.md` for aggregation types and patterns.
-## Example
-```yaml
-$schema: agentv-eval-v2
-description: Example showing basic features and conversation threading
-execution:
-  target: default
-evalcases:
-  - id: code-review-basic
-    expected_outcome: Assistant provides helpful code analysis
-    input_messages:
-      - role: system
-        content: You are an expert code reviewer.
-      - role: user
-        content:
-          - type: text
-            value: |-
-              Review this function:
-              ```python
-              def add(a, b):
-                  return a + b
-              ```
-          - type: file
-            value: /prompts/python.instructions.md
-    expected_messages:
-      - role: assistant
-        content: |-
-          The function is simple and correct. Suggestions:
-          - Add type hints: `def add(a: int, b: int) -> int:`
-          - Add docstring
-          - Consider validation for edge cases
-```
+---
+name: agentv-eval-builder
+description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding eval cases, or configuring custom evaluators (code validators or LLM judges) for agent testing workflows.
+---
+# AgentV Eval Builder
+## Schema Reference
+- Schema: `references/eval-schema.json` (JSON Schema for validation and tooling)
+- Format: YAML with structured content arrays
+- Examples: `references/example-evals.md`
+## Feature Reference
+- Rubrics: `references/rubric-evaluator.md` - Structured criteria-based evaluation
+- Composite Evaluators: `references/composite-evaluator.md` - Combine multiple evaluators
+- Tool Trajectory: `references/tool-trajectory-evaluator.md` - Validate agent tool usage
+- Custom Evaluators: `references/custom-evaluators.md` - Code and LLM judge templates
+- Batch CLI: `references/batch-cli-evaluator.md` - Evaluate batch runner output (JSONL)
+## Structure Requirements
+- Root level: `description` (optional), `execution` (optional with `target` inside), `evalcases` (required)
+- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required)
+- Optional fields: `expected_messages`, `conversation_id`, `rubrics`, `execution`
+- `expected_messages` is optional - omit for outcome-only evaluation where the LLM judge evaluates based on `expected_outcome` criteria alone
+- Message fields: `role` (required), `content` (required)
+- Message roles: `system`, `user`, `assistant`, `tool`
+- Content types: `text` (inline), `file` (relative or absolute path)
+- Attachments (type: `file`) should default to the `user` role
+- File paths: Relative (from eval file dir) or absolute with "/" prefix (from repo root)
+## Custom Evaluators
+Configure multiple evaluators per eval case via `execution.evaluators` array.
+### Code Evaluators
+Scripts that validate output programmatically:
+```yaml
+execution:
+  evaluators:
+    - name: json_format_validator
+      type: code_judge
+      script: uv run validate_output.py
+      cwd: ../../evaluators/scripts
+```
+**Contract:**
+- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_files` (file paths), `input_files` (file paths, excludes guidelines), `input_messages`
+- Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
+**Template:** See `references/custom-evaluators.md` for Python code evaluator template
+### LLM Judges
+Language models evaluate response quality:
+```yaml
+execution:
+  evaluators:
+    - name: content_evaluator
+      type: llm_judge
+      prompt: /evaluators/prompts/correctness.md
+      model: gpt-5-chat
+```
+### Tool Trajectory Evaluators
+Validate agent tool usage patterns (requires `output_messages` with `tool_calls` from provider):
+```yaml
+execution:
+  evaluators:
+    - name: research_check
+      type: tool_trajectory
+      mode: any_order       # Options: any_order, in_order, exact
+      minimums:             # For any_order mode
+        knowledgeSearch: 2
+      expected:             # For in_order/exact modes
+        - tool: knowledgeSearch
+        - tool: documentRetrieve
+```
+See `references/tool-trajectory-evaluator.md` for modes and configuration.
+### Multiple Evaluators
+Define multiple evaluators to run sequentially. The final score is a weighted average of all results.
+```yaml
+execution:
+  evaluators:
+    - name: format_check      # Runs first
+      type: code_judge
+      script: uv run validate_json.py
+    - name: content_check     # Runs second
+      type: llm_judge
+```
+### Rubric Evaluator
+Inline rubrics for structured criteria-based evaluation:
+```yaml
+evalcases:
+  - id: explanation-task
+    expected_outcome: Clear explanation of quicksort
+    input_messages:
+      - role: user
+        content: Explain quicksort
+    rubrics:
+      - Mentions divide-and-conquer approach
+      - Explains the partition step
+      - id: complexity
+        description: States time complexity correctly
+        weight: 2.0
+        required: true
+```
+See `references/rubric-evaluator.md` for detailed rubric configuration.
+### Composite Evaluator
+Combine multiple evaluators with aggregation:
+```yaml
+execution:
+  evaluators:
+    - name: release_gate
+      type: composite
+      evaluators:
+        - name: safety
+          type: llm_judge
+          prompt: ./prompts/safety.md
+        - name: quality
+          type: llm_judge
+          prompt: ./prompts/quality.md
+      aggregator:
+        type: weighted_average
+        weights:
+          safety: 0.3
+          quality: 0.7
+```
+See `references/composite-evaluator.md` for aggregation types and patterns.
+### Batch CLI Evaluation
+Evaluate external batch runners that process all evalcases in one invocation:
+```yaml
+$schema: agentv-eval-v2
+description: Batch CLI evaluation
+execution:
+  target: batch_cli
+evalcases:
+  - id: case-001
+    expected_outcome: Returns decision=CLEAR
+    expected_messages:
+      - role: assistant
+        content:
+          decision: CLEAR
+    input_messages:
+      - role: user
+        content:
+          row:
+            id: case-001
+            amount: 5000
+    execution:
+      evaluators:
+        - name: decision-check
+          type: code_judge
+          script: bun run ./scripts/check-output.ts
+          cwd: .
+```
+**Key pattern:**
+- Batch runner reads eval YAML via `--eval` flag, outputs JSONL keyed by `id`
+- Each evalcase has its own evaluator to validate its corresponding output
+- Use structured `expected_messages.content` for expected output fields
+See `references/batch-cli-evaluator.md` for full implementation guide.
+## Example
+```yaml
+$schema: agentv-eval-v2
+description: Example showing basic features and conversation threading
+execution:
+  target: default
+evalcases:
+  - id: code-review-basic
+    expected_outcome: Assistant provides helpful code analysis
+    input_messages:
+      - role: system
+        content: You are an expert code reviewer.
+      - role: user
+        content:
+          - type: text
+            value: |-
+              Review this function:
+              ```python
+              def add(a, b):
+                  return a + b
+              ```
+          - type: file
+            value: /prompts/python.instructions.md
+    expected_messages:
+      - role: assistant
+        content: |-
+          The function is simple and correct. Suggestions:
+          - Add type hints: `def add(a: int, b: int) -> int:`
+          - Add docstring
+          - Consider validation for edge cases
+```