agentv 0.21.3 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,7 +1,7 @@
  #!/usr/bin/env node
  import {
  runCli
- } from "./chunk-A5T7W63L.js";
+ } from "./chunk-4T62HFF4.js";
  import "./chunk-UE4GLFVL.js";

  // src/cli.ts
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
  import {
  app,
  runCli
- } from "./chunk-A5T7W63L.js";
+ } from "./chunk-4T62HFF4.js";
  import "./chunk-UE4GLFVL.js";
  export {
  app,
@@ -37,7 +37,7 @@ execution:
  ```

  **Contract:**
- - Input (stdin): JSON with `task`, `expected_outcome`, `expected`, `output`, `system_message`, etc.
+ - Input (stdin): JSON with `question`, `expected_outcome`, `reference_answer`, `candidate_answer`, `guideline_paths`, `input_files`, `input_messages`
  - Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`

  **Template:** See `references/custom-evaluators.md` for Python code evaluator template
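
The new contract can be exercised end to end with a few lines of Python. The sketch below is illustrative only: it assumes nothing beyond the field names listed above, and the "non-empty answer" check stands in for real validation logic rather than anything AgentV does internally.

```python
#!/usr/bin/env python3
# Minimal sketch of the stdin/stdout contract described above (new field names).
# The scoring here is an illustrative placeholder, not AgentV's built-in logic.
import json
import sys


def main() -> None:
    data = json.loads(sys.stdin.read())
    answer = data.get("candidate_answer", "")

    hits = ["non-empty candidate_answer"] if answer.strip() else []
    misses = [] if hits else ["empty candidate_answer"]

    print(json.dumps({
        "score": 1.0 if hits else 0.0,   # contract: 0.0-1.0
        "hits": hits,
        "misses": misses,
        "reasoning": "Sketch evaluator: only checks that a candidate_answer was produced.",
    }, indent=2))


if __name__ == "__main__":
    main()
```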
@@ -10,14 +10,13 @@ Code evaluators receive input via stdin and write output to stdout, both as JSON

  ```json
  {
- "task": "string describing the task",
- "outcome": "expected outcome description",
- "expected": "expected output string",
- "output": "generated code/text from the agent",
- "system_message": "system message if any",
+ "question": "string describing the task/question",
+ "expected_outcome": "expected outcome description",
+ "reference_answer": "gold standard answer (optional)",
+ "candidate_answer": "generated code/text from the agent",
  "guideline_paths": ["path1", "path2"],
- "attachments": ["file1", "file2"],
- "user_segments": [{"type": "text", "value": "..."}]
+ "input_files": ["file1", "file2"],
+ "input_messages": [{"role": "user", "content": "..."}]
  }
  ```
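
Because this release renames the input fields (the descriptions above suggest `task` → `question`, `expected` → `reference_answer`, `output` → `candidate_answer`, `attachments` → `input_files`, `user_segments` → `input_messages`), an existing evaluator can tolerate both shapes while migrating. A minimal sketch, assuming only that inferred mapping:

```python
# Sketch: read the agent output under either the old (pre-0.23) or new field name.
# The old->new mapping is inferred from the field descriptions in this diff;
# verify it against the AgentV version you actually run.
def get_candidate_answer(input_data: dict) -> str:
    for key in ("candidate_answer", "output"):  # new name first, old name as fallback
        value = input_data.get(key)
        if isinstance(value, str) and value:
            return value
    return ""
```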
@@ -65,8 +64,8 @@ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
  Evaluation result with score, hits, misses, reasoning
  """
  # Extract only the fields you need
- # Most evaluators only need 'output' - avoid using unnecessary fields
- output = input_data.get("output", "")
+ # Most evaluators only need 'candidate_answer' - avoid using unnecessary fields
+ candidate_answer = input_data.get("candidate_answer", "")

  # Your validation logic here
  hits = []
@@ -75,7 +74,7 @@ def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
  # Example: Check for keywords
  required_keywords = ["async", "await"]
  for keyword in required_keywords:
- if keyword in output:
+ if keyword in candidate_answer:
  hits.append(f"Contains required keyword: {keyword}")
  else:
  misses.append(f"Missing required keyword: {keyword}")
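
This hunk shows only the keyword loop; the template's later lines (not part of this hunk) fold `hits` and `misses` into the result object the contract expects. For reference, one way to finish such a function is sketched below. `finish_result` is a hypothetical helper and the proportional score is an assumption, not the template's exact formula.

```python
from typing import Any


def finish_result(hits: list[str], misses: list[str]) -> dict[str, Any]:
    """Turn collected hits/misses into the result object the contract expects.

    Proportional scoring is an assumption for this sketch, not AgentV's formula.
    """
    total = len(hits) + len(misses)
    score = len(hits) / total if total else 0.0
    return {
        "score": round(score, 2),  # contract: 0.0-1.0
        "hits": hits,              # what the candidate_answer got right
        "misses": misses,          # what it is missing
        "reasoning": f"{len(hits)} of {total} keyword checks passed",
    }
```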
@@ -123,157 +122,55 @@ if __name__ == "__main__":
  main()
  ```

- ## JSON Format Validator Example
-
- A common pattern is validating JSON output structure:
-
- ```python
- #!/usr/bin/env python3
- """
- JSON Format Validator for AgentV
- Validates that output is valid JSON with required keys.
- """
-
- import json
- import sys
- from typing import Any
-
+ ## LLM Judge Prompt Template

- def validate_json_format(output: str, required_keys: list[str]) -> dict[str, Any]:
- """
- Validate that output is valid JSON with required keys.
-
- Args:
- output: The candidate output to validate
- required_keys: List of required top-level keys
-
- Returns:
- Evaluation result dict
- """
- # Try to parse as JSON
- try:
- parsed = json.loads(output.strip())
- except json.JSONDecodeError as e:
- return {
- "score": 0.0,
- "hits": [],
- "misses": ["Not valid JSON"],
- "reasoning": f"Output is not valid JSON. Parse error: {str(e)}"
- }
-
- # Check if it's a dict
- if not isinstance(parsed, dict):
- return {
- "score": 0.0,
- "hits": [],
- "misses": ["JSON is not an object/dict"],
- "reasoning": f"Output is valid JSON but not an object. Got: {type(parsed).__name__}"
- }
-
- # Check for required keys
- missing_keys = [key for key in required_keys if key not in parsed]
- present_keys = [key for key in required_keys if key in parsed]
-
- if missing_keys:
- return {
- "score": 0.0,
- "hits": [f"Has key: {key}" for key in present_keys],
- "misses": [f"Missing key: {key}" for key in missing_keys],
- "reasoning": f"Valid JSON but missing required keys: {', '.join(missing_keys)}"
- }
-
- # All checks passed
- return {
- "score": 1.0,
- "hits": [f"Valid JSON with all required keys: {', '.join(required_keys)}"],
- "misses": [],
- "reasoning": f"Valid JSON with all required keys: {', '.join(required_keys)}"
- }
+ LLM judges use markdown prompts to guide evaluation. AgentV automatically handles the output format, so focus your prompt on evaluation criteria and guidelines.

+ **Available Template Variables:**
+ - `{{question}}` - The original question/task
+ - `{{expected_outcome}}` - What the answer should accomplish
+ - `{{candidate_answer}}` - The actual output to evaluate
+ - `{{reference_answer}}` - Gold standard answer (optional, may be empty)
+ - `{{input_messages}}` - JSON stringified input message segments
+ - `{{output_messages}}` - JSON stringified expected output segments

- def main():
- """Main entry point."""
- try:
- input_data = json.loads(sys.stdin.read())
- output = input_data.get("output", "")
-
- # Define required keys (customize as needed)
- required_keys = ["criticalityRating", "reasoning"]
-
- result = validate_json_format(output, required_keys)
- print(json.dumps(result, indent=2))
-
- except Exception as e:
- error_result = {
- "score": 0.0,
- "hits": [],
- "misses": [f"Evaluator error: {str(e)}"],
- "reasoning": f"Evaluator error: {str(e)}"
- }
- print(json.dumps(error_result, indent=2))
- sys.exit(1)
+ **Default Evaluator Template:**

+ If you don't specify a custom evaluator template, AgentV uses this default:

- if __name__ == "__main__":
- main()
  ```
+ You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.

- ## LLM Judge Prompt Template
-
- LLM judges use markdown prompts to guide evaluation:
-
- ```markdown
- # Code Quality Judge
-
- Evaluate the candidate code for quality, correctness, and best practices.
-
- ## Evaluation Criteria
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.

- Rate the code on:
- 1. **Correctness** - Does it solve the problem?
- 2. **Style** - Does it follow best practices?
- 3. **Completeness** - Are edge cases handled?
- 4. **Documentation** - Are there helpful comments/docstrings?
+ Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.

- ## Scoring Guidelines
+ [[ ## expected_outcome ## ]]
+ {{expected_outcome}}

- - **0.9-1.0:** Excellent - Correct, clean, well-documented
- - **0.7-0.8:** Good - Correct with minor style issues
- - **0.5-0.6:** Adequate - Works but has quality issues
- - **0.3-0.4:** Poor - Has bugs or major style problems
- - **0.0-0.2:** Unacceptable - Does not work or completely wrong
+ [[ ## question ## ]]
+ {{question}}

- ## Output Format
+ [[ ## reference_answer ## ]]
+ {{reference_answer}}

- Respond with valid JSON:
-
- ```json
- {
- "score": 0.85,
- "hits": [
- "Correctly implements the algorithm",
- "Good error handling"
- ],
- "misses": [
- "Missing type hints",
- "No docstring"
- ],
- "reasoning": "Code is correct and handles errors well, but lacks documentation."
- }
- ```
+ [[ ## candidate_answer ## ]]
+ {{candidate_answer}}
  ```

+ You can customize this template in your eval file using the `evaluatorTemplate` field to add domain-specific criteria or scoring guidelines.
+
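
As one illustration of that customization (a sketch only; the eval-file wiring around `evaluatorTemplate` is not shown in this diff, and the SQL scenario is hypothetical), a domain-specific template might keep the documented `[[ ## field ## ]]` sections and add its own rubric:

```
You are an expert evaluator for SQL answers. Grade the candidate_answer on how well it achieves the expected_outcome for the original question.

Scoring guidance (illustrative):
- 1.0: query is correct, uses the schema named in the question, and states its assumptions
- 0.5: query is mostly correct but misses an edge case named in the expected_outcome
- 0.0: query is wrong or would not run

[[ ## expected_outcome ## ]]
{{expected_outcome}}

[[ ## question ## ]]
{{question}}

[[ ## candidate_answer ## ]]
{{candidate_answer}}
```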
  ## Best Practices

- ### For Code Evaluators
+ ### For Code-based Evaluators

- 1. **Focus on relevant fields** - Most evaluators only need the `output` field
- 2. **Avoid false positives** - Don't check fields like `task` or `expected` unless you specifically need context
+ 1. **Focus on relevant fields** - Most evaluators only need the `candidate_answer` field
+ 2. **Avoid false positives** - Don't check fields like `question` or `reference_answer` unless you specifically need context
  3. **Be deterministic** - Same input should always produce same output
  4. **Handle errors gracefully** - Return a valid result even when evaluation fails
  5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score

- ### For LLM Judges
+ ### For Prompt-based Evaluators (LLM Judges)

  1. **Clear criteria** - Define what you're evaluating
  2. **Specific guidelines** - Provide scoring rubrics
@@ -281,37 +178,6 @@ Respond with valid JSON:
  4. **Examples** - Show what good/bad looks like
  5. **Concise prompts** - Keep instructions focused

- ### Common Pitfalls to Avoid
-
- **❌ Checking unnecessary fields:**
- ```python
- # BAD: Checking 'task' or 'expected' when you only need to validate format
- if "async" in input_data.get("task", ""):
- # This creates false positives
- ```
-
- **✅ Focus on output:**
- ```python
- # GOOD: Only check the actual output
- output = input_data.get("output", "")
- if "async" in output:
- # This is what you actually want to validate
- ```
-
- **❌ Brittle string matching:**
- ```python
- # BAD: Exact match is too strict
- if output == "The answer is 42":
- score = 1.0
- ```
-
- **✅ Flexible validation:**
- ```python
- # GOOD: Check for semantic correctness
- if "42" in output and "answer" in output.lower():
- score = 1.0
- ```
-
  ## Running Code Evaluators

  ### In Eval Files
@@ -332,8 +198,9 @@ Test your evaluator locally:
  ```bash
  # Create test input
  echo '{
- "output": "test output here",
- "task": "test task"
+ "candidate_answer": "test output here",
+ "question": "test task",
+ "expected_outcome": "expected result"
  }' | uv run my_validator.py

  # Should output:
@@ -344,56 +211,3 @@ echo '{
  # "reasoning": "..."
  # }
  ```
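
The same round-trip can be scripted instead of pasted into a shell. A small harness along these lines drives the documented `uv run` invocation and checks the output against the contract; `my_validator.py` is the hypothetical evaluator path from the example above.

```python
#!/usr/bin/env python3
# Sketch of a local test harness for a code evaluator, mirroring the shell example:
# feed contract JSON on stdin, read the result JSON from stdout, check its shape.
# "my_validator.py" is the hypothetical evaluator from the documentation above.
import json
import subprocess

test_input = {
    "candidate_answer": "test output here",
    "question": "test task",
    "expected_outcome": "expected result",
}

proc = subprocess.run(
    ["uv", "run", "my_validator.py"],
    input=json.dumps(test_input),
    capture_output=True,
    text=True,
    check=True,
)

result = json.loads(proc.stdout)
assert 0.0 <= result["score"] <= 1.0                     # contract: score in [0.0, 1.0]
assert {"hits", "misses", "reasoning"} <= result.keys()  # contract: feedback fields present
print("evaluator output:", json.dumps(result, indent=2))
```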
-
- ## Advanced Patterns
-
- ### Combining Multiple Checks
-
- ```python
- def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
- output = input_data.get("output", "")
-
- checks = [
- ("has_async", "async" in output, "Contains async keyword"),
- ("has_await", "await" in output, "Contains await keyword"),
- ("has_try", "try:" in output, "Has error handling"),
- ]
-
- hits = [msg for _, passed, msg in checks if passed]
- misses = [msg for _, passed, msg in checks if not passed]
- score = len(hits) / len(checks)
-
- return {
- "score": score,
- "hits": hits,
- "misses": misses,
- "reasoning": f"Passed {len(hits)}/{len(checks)} checks"
- }
- ```
-
- ### Weighted Scoring
-
- ```python
- def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
- output = input_data.get("output", "")
-
- # Define checks with weights
- checks = [
- ("correctness", is_correct(output), 0.5),
- ("style", has_good_style(output), 0.3),
- ("docs", has_docs(output), 0.2),
- ]
-
- hits = [name for name, passed, _ in checks if passed]
- misses = [name for name, passed, _ in checks if not passed]
-
- # Weighted score
- score = sum(weight for _, passed, weight in checks if passed)
-
- return {
- "score": score,
- "hits": hits,
- "misses": misses,
- "reasoning": f"Weighted score: {score:.2f}"
- }
- ```