agentv 0.23.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -11
- package/dist/{chunk-4T62HFF4.js → chunk-6ZM7WVSC.js} +900 -250
- package/dist/chunk-6ZM7WVSC.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +10 -10
- package/dist/templates/.agentv/targets.yaml +8 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +75 -6
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +139 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +237 -0
- package/package.json +1 -1
- package/dist/chunk-4T62HFF4.js.map +0 -1
package/README.md
CHANGED
|
@@ -130,6 +130,8 @@ agentv eval --target vscode_projectx --targets "path/to/targets.yaml" --eval-id
|
|
|
130
130
|
- `--max-retries COUNT`: Maximum number of retries for timeout cases (default: 2)
|
|
131
131
|
- `--cache`: Enable caching of LLM responses (default: disabled)
|
|
132
132
|
- `--dump-prompts`: Save all prompts to `.agentv/prompts/` directory
|
|
133
|
+
- `--dump-traces`: Write trace files to `.agentv/traces/` directory
|
|
134
|
+
- `--include-trace`: Include full trace in result output (verbose)
|
|
133
135
|
- `--workers COUNT`: Parallel workers for eval cases (default: 3; target `workers` setting used when provided)
|
|
134
136
|
- `--verbose`: Verbose output
|
|
135
137
|
|
|
@@ -174,6 +176,7 @@ Each target specifies:
|
|
|
174
176
|
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
|
|
175
177
|
api_key: ${{ AZURE_OPENAI_API_KEY }}
|
|
176
178
|
model: ${{ AZURE_DEPLOYMENT_NAME }}
|
|
179
|
+
version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: API version (defaults to 2024-12-01-preview)
|
|
177
180
|
```
|
|
178
181
|
|
|
179
182
|
Note: Environment variables are referenced using `${{ VARIABLE_NAME }}` syntax. The actual values are resolved from your `.env` file at runtime.
|
|
@@ -246,14 +249,13 @@ Code evaluators receive input via stdin and write output to stdout as JSON.
|
|
|
246
249
|
**Input Format (via stdin):**
|
|
247
250
|
```json
|
|
248
251
|
{
|
|
249
|
-
"
|
|
250
|
-
"
|
|
251
|
-
"
|
|
252
|
-
"
|
|
253
|
-
"
|
|
254
|
-
"
|
|
255
|
-
"
|
|
256
|
-
"user_segments": [{"type": "text", "value": "..."}]
|
|
252
|
+
"question": "string describing the task/question",
|
|
253
|
+
"expected_outcome": "expected outcome description",
|
|
254
|
+
"reference_answer": "gold standard answer (optional)",
|
|
255
|
+
"candidate_answer": "generated code/text from the agent",
|
|
256
|
+
"guideline_files": ["path/to/guideline1.md", "path/to/guideline2.md"],
|
|
257
|
+
"input_files": ["path/to/data.json", "path/to/config.yaml"],
|
|
258
|
+
"input_messages": [{"role": "user", "content": "..."}]
|
|
257
259
|
}
|
|
258
260
|
```
|
|
259
261
|
|
|
@@ -269,8 +271,8 @@ Code evaluators receive input via stdin and write output to stdout as JSON.
|
|
|
269
271
|
|
|
270
272
|
**Key Points:**
|
|
271
273
|
- Evaluators receive **full context** but should select only relevant fields
|
|
272
|
-
- Most evaluators only need `
|
|
273
|
-
- Complex evaluators can use `
|
|
274
|
+
- Most evaluators only need the `candidate_answer` field; ignore the rest to avoid false positives
|
|
275
|
+
- Complex evaluators can use `question`, `reference_answer`, or `guideline_files` for context-aware validation
|
|
274
276
|
- Score range: `0.0` to `1.0` (float)
|
|
275
277
|
- `hits` and `misses` are optional but recommended for debugging
|
|
276
278
|
|
|
@@ -283,7 +285,7 @@ import sys
|
|
|
283
285
|
|
|
284
286
|
def evaluate(input_data):
|
|
285
287
|
# Extract only the fields you need
|
|
286
|
-
|
|
288
|
+
candidate_answer = input_data.get("candidate_answer", "")
|
|
287
289
|
|
|
288
290
|
# Your validation logic here
|
|
289
291
|
score = 0.0 # to 1.0
|
|
@@ -414,6 +416,7 @@ targets:
|
|
|
414
416
|
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
|
|
415
417
|
api_key: ${{ AZURE_OPENAI_API_KEY }}
|
|
416
418
|
model: gpt-4
|
|
419
|
+
version: ${{ AZURE_OPENAI_API_VERSION }} # Optional: API version (defaults to 2024-12-01-preview)
|
|
417
420
|
max_retries: 5 # Maximum retry attempts
|
|
418
421
|
retry_initial_delay_ms: 2000 # Initial delay before first retry
|
|
419
422
|
retry_max_delay_ms: 120000 # Maximum delay cap
|