agentv 0.10.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env node
  import {
  runCli
- } from "./chunk-72BHGHIT.js";
+ } from "./chunk-WMO5PVPX.js";

  // src/cli.ts
  void runCli();
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
  import {
  createProgram,
  runCli
- } from "./chunk-72BHGHIT.js";
+ } from "./chunk-WMO5PVPX.js";
  export {
  createProgram,
  runCli
@@ -0,0 +1,23 @@
+ # Example environment configuration for AgentV
+ # Copy this file to .env and fill in your credentials
+
+ # Model Provider Selection (Optional - can be configured via targets.yaml)
+ PROVIDER=azure
+
+ # Azure OpenAI Configuration
+ # These are the default environment variable names used in the provided targets.yaml
+ AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
+ AZURE_OPENAI_API_KEY=your-api-key-here
+ AZURE_DEPLOYMENT_NAME=gpt-4o
+
+ # Anthropic Configuration (if using Anthropic provider)
+ ANTHROPIC_API_KEY=your-anthropic-api-key-here
+
+ # VS Code Workspace Paths for Execution Targets
+ # Note: Using forward slashes is recommended for paths in .env files
+ # to avoid issues with escape characters.
+ PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
+
+ # CLI provider sample (used by the local_cli target)
+ PROJECT_ROOT=D:/GitHub/your-username/agentv/docs/examples/simple
+ LOCAL_AGENT_TOKEN=your-cli-token
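
The keys above are only the defaults referenced by the sample `targets.yaml`. As a rough, hypothetical pre-flight check (not part of AgentV; `REQUIRED_BY_PROVIDER` and `missing_env` are made-up names), the selected provider's variables could be verified like this before running evals:

```python
# Hypothetical pre-flight check for the sample .env above; not AgentV code.
# Variable names mirror the sample file; adjust them to match your targets.yaml.
import os

REQUIRED_BY_PROVIDER = {
    "azure": ["AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY", "AZURE_DEPLOYMENT_NAME"],
    "anthropic": ["ANTHROPIC_API_KEY"],
}


def missing_env(provider: str | None = None) -> list[str]:
    """Return required variables that are unset or empty for the chosen provider."""
    provider = provider or os.environ.get("PROVIDER", "azure")
    return [name for name in REQUIRED_BY_PROVIDER.get(provider, []) if not os.environ.get(name)]


if __name__ == "__main__":
    missing = missing_env()
    if missing:
        raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
    print("Environment looks complete for provider:", os.environ.get("PROVIDER", "azure"))
```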
@@ -1,10 +1,14 @@
  ---
- description: 'Apply when writing evals in YAML format'
+ name: eval-builder
+ description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding eval cases, or configuring custom evaluators (code validators or LLM judges) for agent testing workflows.
  ---

+ # Eval Builder
+
  ## Schema Reference
- - Schema: `@../contexts/eval-schema.json` (JSON Schema for validation and tooling)
+ - Schema: `references/eval-schema.json` (JSON Schema for validation and tooling)
  - Format: YAML with structured content arrays
+ - Examples: `references/example-evals.md`

  ## Structure Requirements
  - Root level: `$schema` (required: "agentv-eval-v2"), `description` (optional), `target` (optional), `evalcases` (required)
@@ -14,7 +18,54 @@ description: 'Apply when writing evals in YAML format'
  - Message roles: `system`, `user`, `assistant`, `tool`
  - Content types: `text` (inline), `file` (relative or absolute path)
  - Attachments (type: `file`) should default to the `user` role
- - File paths must start with "/" for absolute paths (e.g., "/prompts/file.md")
+ - File paths: Relative (from eval file dir) or absolute with "/" prefix (from repo root)
+
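
The file-path rule in the bullet above (relative paths resolve from the eval file's directory, "/"-prefixed paths resolve from the repository root) can be pictured in a few lines. This is an illustration of the stated rule, not AgentV's actual resolver; `resolve_content_path` is a hypothetical name:

```python
# Illustration of the path rule described above; not AgentV's implementation.
from pathlib import Path


def resolve_content_path(value: str, eval_file_dir: Path, repo_root: Path) -> Path:
    """Resolve a `file` content value the way the bullet above describes."""
    if value.startswith("/"):
        # "/prompts/file.md" -> <repo_root>/prompts/file.md
        return repo_root / value.lstrip("/")
    # "data/input.txt" -> <eval file directory>/data/input.txt
    return eval_file_dir / value


# Example:
# resolve_content_path("/prompts/python.instructions.md", Path("evals/python"), Path("."))
# -> Path("prompts/python.instructions.md")
```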
+ ## Custom Evaluators
+
+ Configure multiple evaluators per eval case via `execution.evaluators` array.
+
+ ### Code Evaluators
+ Scripts that validate output programmatically:
+
+ ```yaml
+ execution:
+   evaluators:
+     - name: json_format_validator
+       type: code
+       script: uv run validate_output.py
+       cwd: ../../evaluators/scripts
+ ```
+
+ **Contract:**
+ - Input (stdin): JSON with `task`, `outcome`, `expected`, `output`, `system_message`, etc.
+ - Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
+
+ **Template:** See `references/custom-evaluators.md` for a Python code evaluator template
+
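
The contract above only requires reading one JSON object from stdin and printing one to stdout. A bare-bones sketch (placeholder scoring, purely illustrative; the full template lives in `references/custom-evaluators.md`):

```python
# Minimal code-evaluator skeleton for the stdin/stdout contract above.
# The scoring here is a placeholder, not a recommended check.
import json
import sys

payload = json.loads(sys.stdin.read())
output = payload.get("output", "")

result = {
    "score": 1.0 if output.strip() else 0.0,  # placeholder check: non-empty output
    "hits": ["output is non-empty"] if output.strip() else [],
    "misses": [] if output.strip() else ["output is empty"],
    "reasoning": "Placeholder evaluator: scores 1.0 for any non-empty output.",
}
print(json.dumps(result))
```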
+ ### LLM Judges
+ Language models evaluate response quality:
+
+ ```yaml
+ execution:
+   evaluators:
+     - name: content_evaluator
+       type: llm_judge
+       prompt: /evaluators/prompts/correctness.md
+       model: gpt-5-chat
+ ```
+
+ ### Evaluator Chaining
+ Evaluators run sequentially:
+
+ ```yaml
+ execution:
+   evaluators:
+     - name: format_check # Runs first
+       type: code
+       script: uv run validate_json.py
+     - name: content_check # Runs second
+       type: llm_judge
+ ```

  ## Example
  ```yaml
@@ -40,7 +91,6 @@ evalcases:
  def add(a, b):
  return a + b
  ```
- # File paths can be relative or absolute
  - type: file
  value: /prompts/python.instructions.md

@@ -62,7 +112,8 @@ evalcases:
  evaluators:
  - name: keyword_check
  type: code
- script: /evaluators/scripts/check_keywords.py
+ script: uv run check_keywords.py
+ cwd: /evaluators/scripts
  - name: semantic_judge
  type: llm_judge
  prompt: /evaluators/prompts/correctness.md
@@ -98,4 +149,4 @@ evalcases:
  unique.sort(reverse=True)
  return unique[1]
  ```
- ```
+ ```
@@ -0,0 +1,399 @@
+ # Custom Evaluators Guide
+
+ Guide for writing custom code evaluators and LLM judges for AgentV eval files.
+
+ ## Code Evaluator Contract
+
+ Code evaluators receive input via stdin and write output to stdout, both as JSON.
+
+ ### Input Format (via stdin)
+
+ ```json
+ {
+   "task": "string describing the task",
+   "outcome": "expected outcome description",
+   "expected": "expected output string",
+   "output": "generated code/text from the agent",
+   "system_message": "system message if any",
+   "guideline_paths": ["path1", "path2"],
+   "attachments": ["file1", "file2"],
+   "user_segments": [{"type": "text", "value": "..."}]
+ }
+ ```
+
+ ### Output Format (to stdout)
+
+ ```json
+ {
+   "score": 0.85,
+   "hits": ["successful check 1", "successful check 2"],
+   "misses": ["failed check 1"],
+   "reasoning": "Brief explanation of the score"
+ }
+ ```
+
+ **Field Requirements:**
+ - `score`: Float between 0.0 and 1.0 (required)
+ - `hits`: Array of strings describing what passed (optional but recommended)
+ - `misses`: Array of strings describing what failed (optional but recommended)
+ - `reasoning`: String explaining the score (optional but recommended)
+
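
These requirements are easy to check mechanically while developing an evaluator. As a hypothetical convenience helper (not shipped with AgentV), a shape check against the rules above might look like this:

```python
# Hypothetical checker for the result fields listed above; handy while developing evaluators.
from typing import Any


def check_result_shape(result: dict[str, Any]) -> list[str]:
    """Return a list of problems; an empty list means the result satisfies the contract."""
    problems = []
    score = result.get("score")
    if not isinstance(score, (int, float)) or not 0.0 <= float(score) <= 1.0:
        problems.append("score must be a number between 0.0 and 1.0")
    for key in ("hits", "misses"):
        value = result.get(key, [])
        if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
            problems.append(f"{key} should be an array of strings")
    if "reasoning" in result and not isinstance(result["reasoning"], str):
        problems.append("reasoning should be a string")
    return problems
```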
+ ## Python Code Evaluator Template
+
+ ```python
+ #!/usr/bin/env python3
+ """
+ Example code evaluator for AgentV
+
+ This evaluator checks for specific keywords in the output.
+ Replace validation logic as needed.
+ """
+
+ import json
+ import sys
+ from typing import Any
+
+
+ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
+     """
+     Evaluate the agent output.
+
+     Args:
+         input_data: Full input context from AgentV
+
+     Returns:
+         Evaluation result with score, hits, misses, reasoning
+     """
+     # Extract only the fields you need
+     # Most evaluators only need 'output' - avoid using unnecessary fields
+     output = input_data.get("output", "")
+
+     # Your validation logic here
+     hits = []
+     misses = []
+
+     # Example: Check for keywords
+     required_keywords = ["async", "await"]
+     for keyword in required_keywords:
+         if keyword in output:
+             hits.append(f"Contains required keyword: {keyword}")
+         else:
+             misses.append(f"Missing required keyword: {keyword}")
+
+     # Calculate score
+     if not required_keywords:
+         score = 1.0
+     else:
+         score = len(hits) / len(required_keywords)
+
+     # Build result
+     return {
+         "score": score,
+         "hits": hits,
+         "misses": misses,
+         "reasoning": f"Found {len(hits)}/{len(required_keywords)} required keywords"
+     }
+
+
+ def main():
+     """Main entry point for AgentV code evaluator."""
+     try:
+         # Read input from stdin
+         input_data = json.loads(sys.stdin.read())
+
+         # Run evaluation
+         result = evaluate(input_data)
+
+         # Write result to stdout
+         print(json.dumps(result, indent=2))
+
+     except Exception as e:
+         # Error handling: return zero score with error message
+         error_result = {
+             "score": 0.0,
+             "hits": [],
+             "misses": [f"Evaluator error: {str(e)}"],
+             "reasoning": f"Evaluator error: {str(e)}"
+         }
+         print(json.dumps(error_result, indent=2))
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
+ ```
+
+ ## JSON Format Validator Example
+
+ A common pattern is validating JSON output structure:
+
+ ```python
+ #!/usr/bin/env python3
+ """
+ JSON Format Validator for AgentV
+ Validates that output is valid JSON with required keys.
+ """
+
+ import json
+ import sys
+ from typing import Any
+
+
+ def validate_json_format(output: str, required_keys: list[str]) -> dict[str, Any]:
+     """
+     Validate that output is valid JSON with required keys.
+
+     Args:
+         output: The candidate output to validate
+         required_keys: List of required top-level keys
+
+     Returns:
+         Evaluation result dict
+     """
+     # Try to parse as JSON
+     try:
+         parsed = json.loads(output.strip())
+     except json.JSONDecodeError as e:
+         return {
+             "score": 0.0,
+             "hits": [],
+             "misses": ["Not valid JSON"],
+             "reasoning": f"Output is not valid JSON. Parse error: {str(e)}"
+         }
+
+     # Check if it's a dict
+     if not isinstance(parsed, dict):
+         return {
+             "score": 0.0,
+             "hits": [],
+             "misses": ["JSON is not an object/dict"],
+             "reasoning": f"Output is valid JSON but not an object. Got: {type(parsed).__name__}"
+         }
+
+     # Check for required keys
+     missing_keys = [key for key in required_keys if key not in parsed]
+     present_keys = [key for key in required_keys if key in parsed]
+
+     if missing_keys:
+         return {
+             "score": 0.0,
+             "hits": [f"Has key: {key}" for key in present_keys],
+             "misses": [f"Missing key: {key}" for key in missing_keys],
+             "reasoning": f"Valid JSON but missing required keys: {', '.join(missing_keys)}"
+         }
+
+     # All checks passed
+     return {
+         "score": 1.0,
+         "hits": [f"Valid JSON with all required keys: {', '.join(required_keys)}"],
+         "misses": [],
+         "reasoning": f"Valid JSON with all required keys: {', '.join(required_keys)}"
+     }
+
+
+ def main():
+     """Main entry point."""
+     try:
+         input_data = json.loads(sys.stdin.read())
+         output = input_data.get("output", "")
+
+         # Define required keys (customize as needed)
+         required_keys = ["criticalityRating", "reasoning"]
+
+         result = validate_json_format(output, required_keys)
+         print(json.dumps(result, indent=2))
+
+     except Exception as e:
+         error_result = {
+             "score": 0.0,
+             "hits": [],
+             "misses": [f"Evaluator error: {str(e)}"],
+             "reasoning": f"Evaluator error: {str(e)}"
+         }
+         print(json.dumps(error_result, indent=2))
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
+ ```
+
+ ## LLM Judge Prompt Template
+
+ LLM judges use markdown prompts to guide evaluation:
+
+ ```markdown
+ # Code Quality Judge
+
+ Evaluate the candidate code for quality, correctness, and best practices.
+
+ ## Evaluation Criteria
+
+ Rate the code on:
+ 1. **Correctness** - Does it solve the problem?
+ 2. **Style** - Does it follow best practices?
+ 3. **Completeness** - Are edge cases handled?
+ 4. **Documentation** - Are there helpful comments/docstrings?
+
+ ## Scoring Guidelines
+
+ - **0.9-1.0:** Excellent - Correct, clean, well-documented
+ - **0.7-0.8:** Good - Correct with minor style issues
+ - **0.5-0.6:** Adequate - Works but has quality issues
+ - **0.3-0.4:** Poor - Has bugs or major style problems
+ - **0.0-0.2:** Unacceptable - Does not work or completely wrong
+
+ ## Output Format
+
+ Respond with valid JSON:
+
+ ```json
+ {
+   "score": 0.85,
+   "hits": [
+     "Correctly implements the algorithm",
+     "Good error handling"
+   ],
+   "misses": [
+     "Missing type hints",
+     "No docstring"
+   ],
+   "reasoning": "Code is correct and handles errors well, but lacks documentation."
+ }
+ ```
+ ```
+
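
AgentV consumes the judge's reply internally, so the sketch below is purely an illustration of why the strict JSON output format above matters; `parse_judge_reply` is a hypothetical helper, not part of the package:

```python
# Hypothetical sketch: extract and sanity-check a judge reply shaped like the JSON above.
# Not AgentV's internal parsing logic.
import json
from typing import Any


def parse_judge_reply(reply: str) -> dict[str, Any]:
    """Pull the first {...} block out of the reply and validate its score range."""
    start, end = reply.find("{"), reply.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("Judge reply contains no JSON object")
    result = json.loads(reply[start:end + 1])
    score = float(result.get("score", -1))
    if not 0.0 <= score <= 1.0:
        raise ValueError(f"Judge score out of range: {score}")
    return result


# Example:
# parse_judge_reply('Evaluation:\n{"score": 0.85, "hits": [], "misses": [], "reasoning": "ok"}')
```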
+ ## Best Practices
+
+ ### For Code Evaluators
+
+ 1. **Focus on relevant fields** - Most evaluators only need the `output` field
+ 2. **Avoid false positives** - Don't check fields like `task` or `expected` unless you specifically need context
+ 3. **Be deterministic** - Same input should always produce same output
+ 4. **Handle errors gracefully** - Return a valid result even when evaluation fails
+ 5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score
+
+ ### For LLM Judges
+
+ 1. **Clear criteria** - Define what you're evaluating
+ 2. **Specific guidelines** - Provide scoring rubrics
+ 3. **JSON output** - Enforce structured output format
+ 4. **Examples** - Show what good/bad looks like
+ 5. **Concise prompts** - Keep instructions focused
+
+ ### Common Pitfalls to Avoid
+
+ **❌ Checking unnecessary fields:**
+ ```python
+ # BAD: Checking 'task' or 'expected' when you only need to validate format
+ if "async" in input_data.get("task", ""):
+     # This creates false positives
+ ```
+
+ **✅ Focus on output:**
+ ```python
+ # GOOD: Only check the actual output
+ output = input_data.get("output", "")
+ if "async" in output:
+     # This is what you actually want to validate
+ ```
+
+ **❌ Brittle string matching:**
+ ```python
+ # BAD: Exact match is too strict
+ if output == "The answer is 42":
+     score = 1.0
+ ```
+
+ **✅ Flexible validation:**
+ ```python
+ # GOOD: Check for semantic correctness
+ if "42" in output and "answer" in output.lower():
+     score = 1.0
+ ```
+
+ ## Running Code Evaluators
+
+ ### In Eval Files
+
+ ```yaml
+ execution:
+   evaluators:
+     - name: my_validator
+       type: code
+       script: uv run my_validator.py
+       cwd: ./evaluators
+ ```
+
+ ### Command Line Testing
+
+ Test your evaluator locally:
+
+ ```bash
+ # Create test input
+ echo '{
+   "output": "test output here",
+   "task": "test task"
+ }' | uv run my_validator.py
+
+ # Should output:
+ # {
+ #   "score": 0.8,
+ #   "hits": ["check 1 passed"],
+ #   "misses": ["check 2 failed"],
+ #   "reasoning": "..."
+ # }
+ ```
+
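
The same smoke test can be scripted. Assuming the `my_validator.py` name used above and that `uv` is on PATH (both assumptions carried over from the example, not requirements), a throwaway harness might pipe a payload through the script and check the result shape:

```python
# Hypothetical local test harness for the command-line check above; a sketch, not an AgentV feature.
import json
import subprocess

payload = {"output": "test output here", "task": "test task"}

# Run the evaluator the same way the eval file would (script + cwd assumed from the example above).
proc = subprocess.run(
    ["uv", "run", "my_validator.py"],
    input=json.dumps(payload),
    capture_output=True,
    text=True,
    check=True,
)

result = json.loads(proc.stdout)
assert 0.0 <= result["score"] <= 1.0, result
print("Evaluator returned:", json.dumps(result, indent=2))
```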
+ ## Advanced Patterns
+
+ ### Combining Multiple Checks
+
+ ```python
+ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
+     output = input_data.get("output", "")
+
+     checks = [
+         ("has_async", "async" in output, "Contains async keyword"),
+         ("has_await", "await" in output, "Contains await keyword"),
+         ("has_try", "try:" in output, "Has error handling"),
+     ]
+
+     hits = [msg for _, passed, msg in checks if passed]
+     misses = [msg for _, passed, msg in checks if not passed]
+     score = len(hits) / len(checks)
+
+     return {
+         "score": score,
+         "hits": hits,
+         "misses": misses,
+         "reasoning": f"Passed {len(hits)}/{len(checks)} checks"
+     }
+ ```
+
+ ### Weighted Scoring
+
+ ```python
+ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
+     output = input_data.get("output", "")
+
+     # Define checks with weights
+     checks = [
+         ("correctness", is_correct(output), 0.5),
+         ("style", has_good_style(output), 0.3),
+         ("docs", has_docs(output), 0.2),
+     ]
+
+     hits = [name for name, passed, _ in checks if passed]
+     misses = [name for name, passed, _ in checks if not passed]
+
+     # Weighted score
+     score = sum(weight for _, passed, weight in checks if passed)
+
+     return {
+         "score": score,
+         "hits": hits,
+         "misses": misses,
+         "reasoning": f"Weighted score: {score:.2f}"
+     }
+ ```