agentv 0.23.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import {
   runCli
-} from "./chunk-4T62HFF4.js";
+} from "./chunk-6ZM7WVSC.js";
 import "./chunk-UE4GLFVL.js";
 
 // src/cli.ts
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
 import {
   app,
   runCli
-} from "./chunk-4T62HFF4.js";
+} from "./chunk-6ZM7WVSC.js";
 import "./chunk-UE4GLFVL.js";
 export {
   app,
@@ -1,16 +1,16 @@
-# Example environment configuration for AgentV
 # Copy this file to .env and fill in your credentials
 
-# Model Provider Selection (Optional - can be configured via targets.yaml)
-PROVIDER=azure
-
 # Azure OpenAI Configuration
-# These are the default environment variable names used in the provided targets.yaml
 AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
-AZURE_OPENAI_API_KEY=your-api-key-here
-AZURE_DEPLOYMENT_NAME=gpt-4o
+AZURE_OPENAI_API_KEY=your-openai-api-key-here
+AZURE_DEPLOYMENT_NAME=gpt-5-chat
+AZURE_OPENAI_API_VERSION=2024-12-01-preview
+
+# Google Gemini
+GOOGLE_GENERATIVE_AI_API_KEY=your-gemini-api-key-here
+GEMINI_MODEL_NAME=gemini-2.5-flash
 
-# Anthropic Configuration (if using Anthropic provider)
+# Anthropic
 ANTHROPIC_API_KEY=your-anthropic-api-key-here
 
 # VS Code Workspace Paths for Execution Targets
@@ -19,5 +19,5 @@ ANTHROPIC_API_KEY=your-anthropic-api-key-here
 PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
 
 # CLI provider sample (used by the local_cli target)
-PROJECT_ROOT=D:/GitHub/your-username/agentv/docs/examples/simple
-LOCAL_AGENT_TOKEN=your-cli-token
+CLI_EVALS_DIR=./docs/examples/simple/evals/local-cli
+LOCAL_AGENT_TOKEN=dummytoken
@@ -10,6 +10,7 @@ targets:
     endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
     api_key: ${{ AZURE_OPENAI_API_KEY }}
     model: ${{ AZURE_DEPLOYMENT_NAME }}
+    # version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: uncomment to override default (2024-12-01-preview)
 
   - name: vscode
     provider: vscode
@@ -49,13 +50,19 @@ targets:
     endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
     api_key: ${{ AZURE_OPENAI_API_KEY }}
     model: ${{ AZURE_DEPLOYMENT_NAME }}
+    version: ${{ AZURE_OPENAI_API_VERSION }}
+
+  - name: gemini_base
+    provider: gemini
+    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
+    model: ${{ GEMINI_MODEL_NAME }}
 
   - name: local_cli
     provider: cli
     judge_target: azure_base
     # Passes the fully rendered prompt and any attached files to a local Python script
     # NOTE: Do not add quotes around {PROMPT} or {FILES} - they are already shell-escaped
-    command_template: uv run ./mock_cli.py --prompt {PROMPT} {FILES}
+    command_template: uv run ./mock_cli.py --prompt {PROMPT} {FILES} --output {OUTPUT_FILE}
     # Format for each file in {FILES}. {path} and {basename} are automatically shell-escaped, so no quotes needed
     files_format: --file {path}
     # Optional working directory resolved from .env
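
The new `{OUTPUT_FILE}` placeholder means the CLI target now hands the script a path to write its answer to, alongside the rendered prompt and any `--file` attachments. As a rough sketch of a script compatible with that `command_template` (hypothetical; the `mock_cli.py` shipped in the package may parse its arguments differently):

```python
# Hypothetical stand-in for a local CLI agent wired to the command_template above.
# It accepts --prompt, repeatable --file flags, and the new --output path, then
# writes its answer to that file.
import argparse
from pathlib import Path


def main() -> None:
    parser = argparse.ArgumentParser(description="Minimal local agent CLI")
    parser.add_argument("--prompt", required=True, help="Fully rendered prompt text")
    parser.add_argument("--file", action="append", default=[], help="Attached file path (repeatable)")
    parser.add_argument("--output", required=True, help="Path the answer is written to")
    args = parser.parse_args()

    # Produce a trivial "answer" that acknowledges the prompt and attachments.
    attachments = ", ".join(Path(p).name for p in args.file) or "none"
    answer = f"Received prompt ({len(args.prompt)} chars); attachments: {attachments}"
    Path(args.output).write_text(answer, encoding="utf-8")


if __name__ == "__main__":
    main()
```

Presumably the runner reads the candidate answer back from that path once the command exits.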
@@ -10,10 +10,17 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age
 - Format: YAML with structured content arrays
 - Examples: `references/example-evals.md`
 
+## Feature Reference
+- Rubrics: `references/rubric-evaluator.md` - Structured criteria-based evaluation
+- Composite Evaluators: `references/composite-evaluator.md` - Combine multiple evaluators
+- Tool Trajectory: `references/tool-trajectory-evaluator.md` - Validate agent tool usage
+- Custom Evaluators: `references/custom-evaluators.md` - Code and LLM judge templates
+
 ## Structure Requirements
-- Root level: `description` (optional), `execution` (optional), `evalcases` (required)
-- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required), `expected_messages` (required)
-- Optional fields: `conversation_id`, `note`, `execution`
+- Root level: `description` (optional), `target` (optional), `execution` (optional), `evalcases` (required)
+- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required)
+- Optional fields: `expected_messages`, `conversation_id`, `rubrics`, `execution`
+- `expected_messages` is optional - omit for outcome-only evaluation where the LLM judge evaluates based on `expected_outcome` criteria alone
 - Message fields: `role` (required), `content` (required)
 - Message roles: `system`, `user`, `assistant`, `tool`
 - Content types: `text` (inline), `file` (relative or absolute path)
@@ -31,13 +38,13 @@ Scripts that validate output programmatically:
 execution:
   evaluators:
     - name: json_format_validator
-      type: code
+      type: code_judge
       script: uv run validate_output.py
       cwd: ../../evaluators/scripts
 ```
 
 **Contract:**
-- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_paths`, `input_files`, `input_messages`
+- Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_files` (file paths), `input_files` (file paths, excludes guidelines), `input_messages`
 - Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
 
 **Template:** See `references/custom-evaluators.md` for Python code evaluator template
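
For orientation, a minimal `code_judge` script satisfying the stdin/stdout contract above could look like the sketch below. The JSON-validity check is only an example of validating output programmatically; a real evaluator would apply case-specific rules.

```python
# Minimal sketch of a code_judge script honoring the contract above: read the
# evaluation payload from stdin, emit score/hits/misses/reasoning on stdout.
import json
import sys


def main() -> None:
    payload = json.load(sys.stdin)
    candidate = payload.get("candidate_answer", "")

    hits, misses = [], []
    try:
        json.loads(candidate)
        hits.append("candidate_answer is valid JSON")
    except (TypeError, ValueError):
        misses.append("candidate_answer is not valid JSON")

    result = {
        "score": 1.0 if not misses else 0.0,
        "hits": hits,
        "misses": misses,
        "reasoning": "; ".join(hits + misses) or "no checks ran",
    }
    json.dump(result, sys.stdout)


if __name__ == "__main__":
    main()
```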
@@ -61,12 +68,74 @@ Evaluators run sequentially:
 execution:
   evaluators:
     - name: format_check # Runs first
-      type: code
+      type: code_judge
       script: uv run validate_json.py
     - name: content_check # Runs second
       type: llm_judge
 ```
 
+### Rubric Evaluator
+Inline rubrics for structured criteria-based evaluation:
+
+```yaml
+evalcases:
+  - id: explanation-task
+    expected_outcome: Clear explanation of quicksort
+    input_messages:
+      - role: user
+        content: Explain quicksort
+    rubrics:
+      - Mentions divide-and-conquer approach
+      - Explains the partition step
+      - id: complexity
+        description: States time complexity correctly
+        weight: 2.0
+        required: true
+```
+
+See `references/rubric-evaluator.md` for detailed rubric configuration.
+
+### Composite Evaluator
+Combine multiple evaluators with aggregation:
+
+```yaml
+execution:
+  evaluators:
+    - name: release_gate
+      type: composite
+      evaluators:
+        - name: safety
+          type: llm_judge
+          prompt: ./prompts/safety.md
+        - name: quality
+          type: llm_judge
+          prompt: ./prompts/quality.md
+      aggregator:
+        type: weighted_average
+        weights:
+          safety: 0.3
+          quality: 0.7
+```
+
+See `references/composite-evaluator.md` for aggregation types and patterns.
+
+### Tool Trajectory Evaluator
+Validate agent tool usage from trace data:
+
+```yaml
+execution:
+  evaluators:
+    - name: workflow-check
+      type: tool_trajectory
+      mode: in_order # or: any_order, exact
+      expected:
+        - tool: fetchData
+        - tool: processData
+        - tool: saveResults
+```
+
+See `references/tool-trajectory-evaluator.md` for modes and configuration.
+
 ## Example
 ```yaml
 $schema: agentv-eval-v2
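
The `mode` values shown for the tool trajectory evaluator (`in_order`, `any_order`, `exact`) read naturally as subsequence, membership, and exact-sequence matching. Below is a sketch of that interpretation, assuming a trace reduces to an ordered list of tool names; agentv's actual matching (argument checks, repeated calls) may be stricter.

```python
# Illustration of one plausible reading of the three trajectory modes above;
# not agentv's implementation.
from typing import List


def matches(expected: List[str], trace: List[str], mode: str) -> bool:
    if mode == "exact":  # trace must equal the expected sequence
        return trace == expected
    if mode == "any_order":  # every expected tool appears somewhere in the trace
        return all(tool in trace for tool in expected)
    if mode == "in_order":  # expected tools appear in order, extra calls allowed
        remaining = iter(trace)
        return all(tool in remaining for tool in expected)
    raise ValueError(f"unknown mode: {mode}")


# The workflow-check example would pass in_order even with an extra call in between.
assert matches(["fetchData", "processData", "saveResults"],
               ["fetchData", "logStep", "processData", "saveResults"], "in_order")
```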
@@ -0,0 +1,215 @@
+# Composite Evaluator Guide
+
+Composite evaluators combine multiple evaluators and aggregate their results. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
+
+## Basic Structure
+
+```yaml
+execution:
+  evaluators:
+    - name: my_composite
+      type: composite
+      evaluators:
+        - name: evaluator_1
+          type: llm_judge
+          prompt: ./prompts/check1.md
+        - name: evaluator_2
+          type: code_judge
+          script: uv run check2.py
+      aggregator:
+        type: weighted_average
+        weights:
+          evaluator_1: 0.6
+          evaluator_2: 0.4
+```
+
+## Aggregator Types
+
+### 1. Weighted Average (Default)
+
+Combines scores using weighted arithmetic mean:
+
+```yaml
+aggregator:
+  type: weighted_average
+  weights:
+    safety: 0.3 # 30% weight
+    quality: 0.7 # 70% weight
+```
+
+If weights are omitted, all evaluators have equal weight (1.0).
+
+**Score calculation:**
+```
+final_score = Σ(score_i × weight_i) / Σ(weight_i)
+```
+
+### 2. Code Judge Aggregator
+
+Run custom code to decide final score based on all evaluator results:
+
+```yaml
+aggregator:
+  type: code_judge
+  path: node ./scripts/safety-gate.js
+  cwd: ./evaluators # optional working directory
+```
+
+**Input (stdin):**
+```json
+{
+  "results": {
+    "safety": { "score": 0.9, "hits": [...], "misses": [...] },
+    "quality": { "score": 0.85, "hits": [...], "misses": [...] }
+  }
+}
+```
+
+**Output (stdout):**
+```json
+{
+  "score": 0.87,
+  "verdict": "pass",
+  "hits": ["Combined check passed"],
+  "misses": [],
+  "reasoning": "Safety gate passed, quality acceptable"
+}
+```
+
+### 3. LLM Judge Aggregator
+
+Use an LLM to resolve conflicts or make nuanced decisions:
+
+```yaml
+aggregator:
+  type: llm_judge
+  prompt: ./prompts/conflict-resolution.md
+```
+
+The `{{EVALUATOR_RESULTS_JSON}}` variable is replaced with the JSON results from all child evaluators.
+
+## Example Patterns
+
+### Safety Gate Pattern
+
+Block outputs that fail safety even if quality is high:
+
+```yaml
+evalcases:
+  - id: safety-gated-response
+    expected_outcome: Safe and accurate response
+
+    input_messages:
+      - role: user
+        content: Explain quantum computing
+
+    execution:
+      evaluators:
+        - name: safety_gate
+          type: composite
+          evaluators:
+            - name: safety
+              type: llm_judge
+              prompt: ./prompts/safety-check.md
+            - name: quality
+              type: llm_judge
+              prompt: ./prompts/quality-check.md
+          aggregator:
+            type: code_judge
+            path: ./scripts/safety-gate.js
+```
+
+### Multi-Criteria Weighted Evaluation
+
+```yaml
+- name: release_readiness
+  type: composite
+  evaluators:
+    - name: correctness
+      type: llm_judge
+      prompt: ./prompts/correctness.md
+    - name: style
+      type: code_judge
+      script: uv run style_checker.py
+    - name: security
+      type: llm_judge
+      prompt: ./prompts/security.md
+  aggregator:
+    type: weighted_average
+    weights:
+      correctness: 0.5
+      style: 0.2
+      security: 0.3
+```
+
+### Nested Composites
+
+Composites can contain other composites for complex hierarchies:
+
+```yaml
+- name: comprehensive_eval
+  type: composite
+  evaluators:
+    - name: content_quality
+      type: composite
+      evaluators:
+        - name: accuracy
+          type: llm_judge
+          prompt: ./prompts/accuracy.md
+        - name: clarity
+          type: llm_judge
+          prompt: ./prompts/clarity.md
+      aggregator:
+        type: weighted_average
+        weights:
+          accuracy: 0.6
+          clarity: 0.4
+    - name: safety
+      type: llm_judge
+      prompt: ./prompts/safety.md
+  aggregator:
+    type: weighted_average
+    weights:
+      content_quality: 0.7
+      safety: 0.3
+```
+
+## Result Structure
+
+Composite evaluators return nested `evaluator_results`:
+
+```json
+{
+  "score": 0.85,
+  "verdict": "pass",
+  "hits": ["[safety] No harmful content", "[quality] Clear explanation"],
+  "misses": ["[quality] Could use more examples"],
+  "reasoning": "safety: Passed all checks; quality: Good but could improve",
+  "evaluator_results": [
+    {
+      "name": "safety",
+      "type": "llm_judge",
+      "score": 0.95,
+      "verdict": "pass",
+      "hits": ["No harmful content"],
+      "misses": []
+    },
+    {
+      "name": "quality",
+      "type": "llm_judge",
+      "score": 0.8,
+      "verdict": "pass",
+      "hits": ["Clear explanation"],
+      "misses": ["Could use more examples"]
+    }
+  ]
+}
+```
+
+## Best Practices
+
+1. **Name evaluators clearly** - Names appear in results and debugging output
+2. **Use safety gates for critical checks** - Don't let high quality override safety failures
+3. **Balance weights thoughtfully** - Consider which aspects matter most for your use case
+4. **Keep nesting shallow** - Deep nesting makes debugging harder
+5. **Test aggregators independently** - Verify your custom aggregation logic with unit tests
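
To make the code_judge aggregator contract and the safety-gate pattern in the guide above concrete, here is a Python stand-in for the referenced `safety-gate.js` (hypothetical: the threshold and the 0.3/0.7 weights are illustrative choices, not values shipped with agentv).

```python
# Sketch of a code_judge aggregator implementing a safety gate: read the child
# results from stdin, fail outright when safety scores below a threshold, and
# otherwise return a weighted average of safety and quality.
import json
import sys

SAFETY_THRESHOLD = 0.8  # assumed cutoff, for illustration only
WEIGHTS = {"safety": 0.3, "quality": 0.7}  # sum to 1.0, so no normalization needed


def main() -> None:
    results = json.load(sys.stdin)["results"]
    safety = results["safety"]["score"]
    quality = results["quality"]["score"]

    if safety < SAFETY_THRESHOLD:
        out = {
            "score": 0.0,
            "verdict": "fail",
            "hits": [],
            "misses": [f"Safety score {safety} below threshold {SAFETY_THRESHOLD}"],
            "reasoning": "Safety gate blocked the output regardless of quality",
        }
    else:
        # Weighted average, e.g. 0.9 * 0.3 + 0.85 * 0.7 = 0.865
        score = safety * WEIGHTS["safety"] + quality * WEIGHTS["quality"]
        out = {
            "score": round(score, 3),
            "verdict": "pass" if score >= 0.8 else "borderline",
            "hits": ["Safety gate passed"],
            "misses": [],
            "reasoning": f"Safety {safety}, quality {quality}, weighted score {score:.3f}",
        }
    json.dump(out, sys.stdout)


if __name__ == "__main__":
    main()
```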
@@ -0,0 +1,139 @@
+# Rubric Evaluator Guide
+
+Rubrics provide structured evaluation through lists of criteria that define what makes a good response. Rubrics are checked by an LLM judge and scored based on weights and requirements.
+
+## Basic Usage
+
+### Simple String Rubrics
+
+Define rubrics as simple strings - each becomes a required criterion with weight 1.0:
+
+```yaml
+$schema: agentv-eval-v2
+
+evalcases:
+  - id: quicksort-explanation
+    expected_outcome: Explain how quicksort works
+
+    input_messages:
+      - role: user
+        content: Explain how the quicksort algorithm works
+
+    rubrics:
+      - Mentions divide-and-conquer approach
+      - Explains the partition step
+      - States time complexity correctly
+```
+
+### Detailed Rubric Objects
+
+Use objects for fine-grained control over weights and requirements:
+
+```yaml
+evalcases:
+  - id: technical-guide
+    expected_outcome: Write a comprehensive HTTP status codes guide
+
+    input_messages:
+      - role: user
+        content: Write a guide explaining HTTP status codes
+
+    rubrics:
+      - id: structure
+        description: Has clear headings and organization
+        weight: 1.0
+        required: true
+
+      - id: success-codes
+        description: Covers 2xx success codes with examples
+        weight: 2.0
+        required: true
+
+      - id: client-errors
+        description: Explains 4xx client error codes
+        weight: 2.0
+        required: true
+
+      - id: server-errors
+        description: Explains 5xx server error codes
+        weight: 1.5
+        required: false
+
+      - id: practical-examples
+        description: Includes practical use case examples
+        weight: 1.0
+        required: false
+```
+
+## Rubric Object Fields
+
+| Field | Type | Default | Description |
+|-------|------|---------|-------------|
+| `id` | string | auto-generated | Unique identifier for the rubric |
+| `description` | string | required | The criterion being evaluated |
+| `weight` | number | 1.0 | Relative importance (higher = more impact on score) |
+| `required` | boolean | true | If true, failing this rubric forces verdict to 'fail' |
+
+## Scoring and Verdicts
+
+**Score Calculation:**
+```
+score = (sum of satisfied weights) / (total weights)
+```
+
+**Verdict Rules:**
+- `pass`: Score ≥ 0.8 AND all required rubrics satisfied
+- `borderline`: Score ≥ 0.6 AND all required rubrics satisfied
+- `fail`: Score < 0.6 OR any required rubric failed
+
+## Combining Rubrics with Other Evaluators
+
+Rubrics can be combined with code evaluators for comprehensive validation:
+
+```yaml
+evalcases:
+  - id: email-validator
+    expected_outcome: Python function to validate email addresses
+
+    input_messages:
+      - role: user
+        content: Write a Python function to validate email addresses
+
+    # Semantic evaluation via rubrics
+    rubrics:
+      - Uses regular expressions for validation
+      - Includes type hints
+      - Has docstring documentation
+      - Handles edge cases (None, empty string)
+
+    execution:
+      evaluators:
+        # Rubric evaluator is auto-added from inline rubrics field
+
+        # Additional code evaluator for syntax checking
+        - name: python_syntax
+          type: code_judge
+          script: uv run python -m py_compile
+```
+
+## Generate Rubrics from Expected Outcome
+
+Use the CLI to auto-generate rubrics from `expected_outcome`:
+
+```bash
+# Generate rubrics for eval cases that don't have them
+agentv generate rubrics evals/my-eval.yaml
+
+# Use a specific LLM target for generation
+agentv generate rubrics evals/my-eval.yaml --target azure_base
+```
+
+This analyzes each `expected_outcome` and creates appropriate rubric items.
+
+## Best Practices
+
+1. **Use required sparingly** - Only mark rubrics as `required: true` for critical criteria
+2. **Balance weights** - Use higher weights (2.0+) for core requirements, lower (0.5) for nice-to-haves
+3. **Be specific** - "Includes error handling" is better than "Good code quality"
+4. **Keep rubrics atomic** - Each rubric should test one thing
+5. **Consider partial credit** - Non-required rubrics allow partial scores
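
The scoring and verdict rules in the guide above fit in a few lines. A sketch, assuming the LLM judge has already marked each rubric as satisfied or not:

```python
# Sketch of the documented rubric scoring: score is the weight-share of
# satisfied rubrics, and any failed required rubric forces a fail.
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class RubricResult:
    id: str
    weight: float = 1.0
    required: bool = True
    satisfied: bool = False


def score_rubrics(results: List[RubricResult]) -> Tuple[float, str]:
    total = sum(r.weight for r in results)
    satisfied = sum(r.weight for r in results if r.satisfied)
    score = satisfied / total if total else 0.0

    required_ok = all(r.satisfied for r in results if r.required)
    if not required_ok or score < 0.6:
        verdict = "fail"
    elif score >= 0.8:
        verdict = "pass"
    else:
        verdict = "borderline"
    return score, verdict


# Example based on the HTTP guide case: only server-errors (weight 1.5, optional) missed.
results = [
    RubricResult("structure", 1.0, True, True),
    RubricResult("success-codes", 2.0, True, True),
    RubricResult("client-errors", 2.0, True, True),
    RubricResult("server-errors", 1.5, False, False),
    RubricResult("practical-examples", 1.0, False, True),
]
print(score_rubrics(results))  # 6.0 / 7.5 = 0.8 -> (0.8, 'pass')
```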