agentv 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7CJK3EYC.js → chunk-WMO5PVPX.js} +795 -668
- package/dist/chunk-WMO5PVPX.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +23 -0
- package/dist/templates/{github/prompts/eval-build.prompt.md → .claude/skills/agentv-eval-builder/SKILL.md} +57 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +399 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +317 -0
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +70 -0
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +5 -0
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +4 -0
- package/package.json +2 -2
- package/dist/chunk-7CJK3EYC.js.map +0 -1
- /package/dist/templates/{agentv → .agentv}/config.yaml +0 -0
- /package/dist/templates/{agentv → .agentv}/targets.yaml +0 -0
- /package/dist/templates/{github/contexts → .claude/skills/agentv-eval-builder/references}/config-schema.json +0 -0
- /package/dist/templates/{github/contexts → .claude/skills/agentv-eval-builder/references}/eval-schema.json +0 -0
package/dist/cli.js
CHANGED
package/dist/index.js
CHANGED
package/dist/templates/.agentv/.env.template
ADDED
@@ -0,0 +1,23 @@
+# Example environment configuration for AgentV
+# Copy this file to .env and fill in your credentials
+
+# Model Provider Selection (Optional - can be configured via targets.yaml)
+PROVIDER=azure
+
+# Azure OpenAI Configuration
+# These are the default environment variable names used in the provided targets.yaml
+AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
+AZURE_OPENAI_API_KEY=your-api-key-here
+AZURE_DEPLOYMENT_NAME=gpt-4o
+
+# Anthropic Configuration (if using Anthropic provider)
+ANTHROPIC_API_KEY=your-anthropic-api-key-here
+
+# VS Code Workspace Paths for Execution Targets
+# Note: Using forward slashes is recommended for paths in .env files
+# to avoid issues with escape characters.
+PROJECTX_WORKSPACE_PATH=C:/Users/your-username/OneDrive - Company Pty Ltd/sample.code-workspace
+
+# CLI provider sample (used by the local_cli target)
+PROJECT_ROOT=D:/GitHub/your-username/agentv/docs/examples/simple
+LOCAL_AGENT_TOKEN=your-cli-token
package/dist/templates/{github/prompts/eval-build.prompt.md → .claude/skills/agentv-eval-builder/SKILL.md}
RENAMED
@@ -1,10 +1,14 @@
 ---
-description: 'Apply when writing evals in YAML format'
+name: eval-builder
+description: Create and maintain AgentV YAML evaluation files for testing AI agent performance. Use this skill when creating new eval files, adding eval cases, or configuring custom evaluators (code validators or LLM judges) for agent testing workflows.
 ---
 
+# Eval Builder
+
 ## Schema Reference
-- Schema:
+- Schema: `references/eval-schema.json` (JSON Schema for validation and tooling)
 - Format: YAML with structured content arrays
+- Examples: `references/example-evals.md`
 
 ## Structure Requirements
 - Root level: `$schema` (required: "agentv-eval-v2"), `description` (optional), `target` (optional), `evalcases` (required)
@@ -14,7 +18,54 @@ description: 'Apply when writing evals in YAML format'
 - Message roles: `system`, `user`, `assistant`, `tool`
 - Content types: `text` (inline), `file` (relative or absolute path)
 - Attachments (type: `file`) should default to the `user` role
-- File paths
+- File paths: Relative (from eval file dir) or absolute with "/" prefix (from repo root)
+
+## Custom Evaluators
+
+Configure multiple evaluators per eval case via `execution.evaluators` array.
+
+### Code Evaluators
+Scripts that validate output programmatically:
+
+```yaml
+execution:
+  evaluators:
+    - name: json_format_validator
+      type: code
+      script: uv run validate_output.py
+      cwd: ../../evaluators/scripts
+```
+
+**Contract:**
+- Input (stdin): JSON with `task`, `outcome`, `expected`, `output`, `system_message`, etc.
+- Output (stdout): JSON with `score` (0.0-1.0), `hits`, `misses`, `reasoning`
+
+**Template:** See `references/custom-evaluators.md` for Python code evaluator template
+
+### LLM Judges
+Language models evaluate response quality:
+
+```yaml
+execution:
+  evaluators:
+    - name: content_evaluator
+      type: llm_judge
+      prompt: /evaluators/prompts/correctness.md
+      model: gpt-5-chat
+```
+
+### Evaluator Chaining
+Evaluators run sequentially:
+
+```yaml
+execution:
+  evaluators:
+    - name: format_check      # Runs first
+      type: code
+      script: uv run validate_json.py
+    - name: content_check     # Runs second
+      type: llm_judge
+```
 
 ## Example
 ```yaml
@@ -40,7 +91,6 @@ evalcases:
             def add(a, b):
                 return a + b
             ```
-      # File paths can be relative or absolute
       - type: file
         value: /prompts/python.instructions.md
 
@@ -62,7 +112,8 @@ evalcases:
       evaluators:
         - name: keyword_check
          type: code
-         script:
+         script: uv run check_keywords.py
+         cwd: /evaluators/scripts
        - name: semantic_judge
          type: llm_judge
          prompt: /evaluators/prompts/correctness.md
@@ -98,4 +149,4 @@ evalcases:
             unique.sort(reverse=True)
             return unique[1]
             ```
-```
+```
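Taken together, the `script`/`cwd` fields and the stdin/stdout contract above mean a code evaluator can be exercised outside AgentV. Below is a minimal local harness sketch, assuming the `check_keywords.py` script and repo-root `evaluators/scripts` layout from the example, with a reduced input payload; the real runner sends more fields and its exact invocation is not shown in this diff.

```python
#!/usr/bin/env python3
"""Rough local approximation of the code-evaluator contract described above."""

import json
import subprocess
from pathlib import Path

REPO_ROOT = Path(".").resolve()  # assumption: run from the repository root

# Reduced payload; a real run also includes outcome, expected, attachments, ...
payload = {
    "task": "Write an async fetch helper",
    "output": "async def fetch(url):\n    await client.get(url)",
}

proc = subprocess.run(
    ["uv", "run", "check_keywords.py"],        # mirrors `script: uv run check_keywords.py`
    cwd=REPO_ROOT / "evaluators" / "scripts",  # mirrors `cwd: /evaluators/scripts`
    input=json.dumps(payload),                 # evaluator reads this JSON from stdin
    capture_output=True,
    text=True,
)

# The evaluator prints its result JSON to stdout
result = json.loads(proc.stdout)
print(result["score"], result.get("misses", []))
```

This mirrors what the "Command Line Testing" section of the reference file below does with a shell pipeline.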
package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md
ADDED
@@ -0,0 +1,399 @@
+# Custom Evaluators Guide
+
+Guide for writing custom code evaluators and LLM judges for AgentV eval files.
+
+## Code Evaluator Contract
+
+Code evaluators receive input via stdin and write output to stdout, both as JSON.
+
+### Input Format (via stdin)
+
+```json
+{
+  "task": "string describing the task",
+  "outcome": "expected outcome description",
+  "expected": "expected output string",
+  "output": "generated code/text from the agent",
+  "system_message": "system message if any",
+  "guideline_paths": ["path1", "path2"],
+  "attachments": ["file1", "file2"],
+  "user_segments": [{"type": "text", "value": "..."}]
+}
+```
+
+### Output Format (to stdout)
+
+```json
+{
+  "score": 0.85,
+  "hits": ["successful check 1", "successful check 2"],
+  "misses": ["failed check 1"],
+  "reasoning": "Brief explanation of the score"
+}
+```
+
+**Field Requirements:**
+- `score`: Float between 0.0 and 1.0 (required)
+- `hits`: Array of strings describing what passed (optional but recommended)
+- `misses`: Array of strings describing what failed (optional but recommended)
+- `reasoning`: String explaining the score (optional but recommended)
+
+## Python Code Evaluator Template
+
+```python
+#!/usr/bin/env python3
+"""
+Example code evaluator for AgentV
+
+This evaluator checks for specific keywords in the output.
+Replace validation logic as needed.
+"""
+
+import json
+import sys
+from typing import Any
+
+
+def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
+    """
+    Evaluate the agent output.
+
+    Args:
+        input_data: Full input context from AgentV
+
+    Returns:
+        Evaluation result with score, hits, misses, reasoning
+    """
+    # Extract only the fields you need
+    # Most evaluators only need 'output' - avoid using unnecessary fields
+    output = input_data.get("output", "")
+
+    # Your validation logic here
+    hits = []
+    misses = []
+
+    # Example: Check for keywords
+    required_keywords = ["async", "await"]
+    for keyword in required_keywords:
+        if keyword in output:
+            hits.append(f"Contains required keyword: {keyword}")
+        else:
+            misses.append(f"Missing required keyword: {keyword}")
+
+    # Calculate score
+    if not required_keywords:
+        score = 1.0
+    else:
+        score = len(hits) / len(required_keywords)
+
+    # Build result
+    return {
+        "score": score,
+        "hits": hits,
+        "misses": misses,
+        "reasoning": f"Found {len(hits)}/{len(required_keywords)} required keywords"
+    }
+
+
+def main():
+    """Main entry point for AgentV code evaluator."""
+    try:
+        # Read input from stdin
+        input_data = json.loads(sys.stdin.read())
+
+        # Run evaluation
+        result = evaluate(input_data)
+
+        # Write result to stdout
+        print(json.dumps(result, indent=2))
+
+    except Exception as e:
+        # Error handling: return zero score with error message
+        error_result = {
+            "score": 0.0,
+            "hits": [],
+            "misses": [f"Evaluator error: {str(e)}"],
+            "reasoning": f"Evaluator error: {str(e)}"
+        }
+        print(json.dumps(error_result, indent=2))
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## JSON Format Validator Example
+
+A common pattern is validating JSON output structure:
+
+```python
+#!/usr/bin/env python3
+"""
+JSON Format Validator for AgentV
+Validates that output is valid JSON with required keys.
+"""
+
+import json
+import sys
+from typing import Any
+
+
+def validate_json_format(output: str, required_keys: list[str]) -> dict[str, Any]:
+    """
+    Validate that output is valid JSON with required keys.
+
+    Args:
+        output: The candidate output to validate
+        required_keys: List of required top-level keys
+
+    Returns:
+        Evaluation result dict
+    """
+    # Try to parse as JSON
+    try:
+        parsed = json.loads(output.strip())
+    except json.JSONDecodeError as e:
+        return {
+            "score": 0.0,
+            "hits": [],
+            "misses": ["Not valid JSON"],
+            "reasoning": f"Output is not valid JSON. Parse error: {str(e)}"
+        }
+
+    # Check if it's a dict
+    if not isinstance(parsed, dict):
+        return {
+            "score": 0.0,
+            "hits": [],
+            "misses": ["JSON is not an object/dict"],
+            "reasoning": f"Output is valid JSON but not an object. Got: {type(parsed).__name__}"
+        }
+
+    # Check for required keys
+    missing_keys = [key for key in required_keys if key not in parsed]
+    present_keys = [key for key in required_keys if key in parsed]
+
+    if missing_keys:
+        return {
+            "score": 0.0,
+            "hits": [f"Has key: {key}" for key in present_keys],
+            "misses": [f"Missing key: {key}" for key in missing_keys],
+            "reasoning": f"Valid JSON but missing required keys: {', '.join(missing_keys)}"
+        }
+
+    # All checks passed
+    return {
+        "score": 1.0,
+        "hits": [f"Valid JSON with all required keys: {', '.join(required_keys)}"],
+        "misses": [],
+        "reasoning": f"Valid JSON with all required keys: {', '.join(required_keys)}"
+    }
+
+
+def main():
+    """Main entry point."""
+    try:
+        input_data = json.loads(sys.stdin.read())
+        output = input_data.get("output", "")
+
+        # Define required keys (customize as needed)
+        required_keys = ["criticalityRating", "reasoning"]
+
+        result = validate_json_format(output, required_keys)
+        print(json.dumps(result, indent=2))
+
+    except Exception as e:
+        error_result = {
+            "score": 0.0,
+            "hits": [],
+            "misses": [f"Evaluator error: {str(e)}"],
+            "reasoning": f"Evaluator error: {str(e)}"
+        }
+        print(json.dumps(error_result, indent=2))
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
+```
+
+## LLM Judge Prompt Template
+
+LLM judges use markdown prompts to guide evaluation:
+
+```markdown
+# Code Quality Judge
+
+Evaluate the candidate code for quality, correctness, and best practices.
+
+## Evaluation Criteria
+
+Rate the code on:
+1. **Correctness** - Does it solve the problem?
+2. **Style** - Does it follow best practices?
+3. **Completeness** - Are edge cases handled?
+4. **Documentation** - Are there helpful comments/docstrings?
+
+## Scoring Guidelines
+
+- **0.9-1.0:** Excellent - Correct, clean, well-documented
+- **0.7-0.8:** Good - Correct with minor style issues
+- **0.5-0.6:** Adequate - Works but has quality issues
+- **0.3-0.4:** Poor - Has bugs or major style problems
+- **0.0-0.2:** Unacceptable - Does not work or completely wrong
+
+## Output Format
+
+Respond with valid JSON:
+
+```json
+{
+  "score": 0.85,
+  "hits": [
+    "Correctly implements the algorithm",
+    "Good error handling"
+  ],
+  "misses": [
+    "Missing type hints",
+    "No docstring"
+  ],
+  "reasoning": "Code is correct and handles errors well, but lacks documentation."
+}
+```
+```
+
+## Best Practices
+
+### For Code Evaluators
+
+1. **Focus on relevant fields** - Most evaluators only need the `output` field
+2. **Avoid false positives** - Don't check fields like `task` or `expected` unless you specifically need context
+3. **Be deterministic** - Same input should always produce same output
+4. **Handle errors gracefully** - Return a valid result even when evaluation fails
+5. **Provide helpful feedback** - Use `hits` and `misses` to explain the score
+
+### For LLM Judges
+
+1. **Clear criteria** - Define what you're evaluating
+2. **Specific guidelines** - Provide scoring rubrics
+3. **JSON output** - Enforce structured output format
+4. **Examples** - Show what good/bad looks like
+5. **Concise prompts** - Keep instructions focused
+
+### Common Pitfalls to Avoid
+
+**❌ Checking unnecessary fields:**
+```python
+# BAD: Checking 'task' or 'expected' when you only need to validate format
+if "async" in input_data.get("task", ""):
+    # This creates false positives
+```
+
+**✅ Focus on output:**
+```python
+# GOOD: Only check the actual output
+output = input_data.get("output", "")
+if "async" in output:
+    # This is what you actually want to validate
+```
+
+**❌ Brittle string matching:**
+```python
+# BAD: Exact match is too strict
+if output == "The answer is 42":
+    score = 1.0
+```
+
+**✅ Flexible validation:**
+```python
+# GOOD: Check for semantic correctness
+if "42" in output and "answer" in output.lower():
+    score = 1.0
+```
+
+## Running Code Evaluators
+
+### In Eval Files
+
+```yaml
+execution:
+  evaluators:
+    - name: my_validator
+      type: code
+      script: uv run my_validator.py
+      cwd: ./evaluators
+```
+
+### Command Line Testing
+
+Test your evaluator locally:
+
+```bash
+# Create test input
+echo '{
+  "output": "test output here",
+  "task": "test task"
+}' | uv run my_validator.py
+
+# Should output:
+# {
+#   "score": 0.8,
+#   "hits": ["check 1 passed"],
+#   "misses": ["check 2 failed"],
+#   "reasoning": "..."
+# }
+```
+
+## Advanced Patterns
+
+### Combining Multiple Checks
+
+```python
+def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
+    output = input_data.get("output", "")
+
+    checks = [
+        ("has_async", "async" in output, "Contains async keyword"),
+        ("has_await", "await" in output, "Contains await keyword"),
+        ("has_try", "try:" in output, "Has error handling"),
+    ]
+
+    hits = [msg for _, passed, msg in checks if passed]
+    misses = [msg for _, passed, msg in checks if not passed]
+    score = len(hits) / len(checks)
+
+    return {
+        "score": score,
+        "hits": hits,
+        "misses": misses,
+        "reasoning": f"Passed {len(hits)}/{len(checks)} checks"
+    }
+```
+
+### Weighted Scoring
+
+```python
+def evaluate(input_data: dict[str, Any]) -> dict[str, Any]:
+    output = input_data.get("output", "")
+
+    # Define checks with weights
+    checks = [
+        ("correctness", is_correct(output), 0.5),
+        ("style", has_good_style(output), 0.3),
+        ("docs", has_docs(output), 0.2),
+    ]
+
+    hits = [name for name, passed, _ in checks if passed]
+    misses = [name for name, passed, _ in checks if not passed]
+
+    # Weighted score
+    score = sum(weight for _, passed, weight in checks if passed)
+
+    return {
+        "score": score,
+        "hits": hits,
+        "misses": misses,
+        "reasoning": f"Weighted score: {score:.2f}"
+    }
+```
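The Weighted Scoring template above calls `is_correct`, `has_good_style`, and `has_docs` without defining them; they stand in for project-specific checks. A minimal illustrative sketch of such predicates follows (not part of the packaged guide), kept pure and deterministic in line with the "Be deterministic" best practice:

```python
import ast


def is_correct(output: str) -> bool:
    """Stand-in correctness check: does the snippet at least parse as Python?"""
    try:
        ast.parse(output)
        return True
    except SyntaxError:
        return False


def has_good_style(output: str) -> bool:
    """Stand-in style check: no line longer than 100 characters."""
    return all(len(line) <= 100 for line in output.splitlines())


def has_docs(output: str) -> bool:
    """Stand-in documentation check: at least one docstring or comment."""
    return '"""' in output or "#" in output
```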