agentv 4.40.1 → 4.41.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-GIAIMGPQ.js → artifact-writer-AMV64TWV.js} +4 -4
- package/dist/{chunk-B7CT3J2W.js → chunk-6FXICR66.js} +899 -300
- package/dist/chunk-6FXICR66.js.map +1 -0
- package/dist/{chunk-TWQP7JYQ.js → chunk-A4J456KS.js} +2 -2
- package/dist/{chunk-A36XLUI5.js → chunk-CF5RCUWH.js} +12 -10
- package/dist/chunk-CF5RCUWH.js.map +1 -0
- package/dist/{chunk-BLXYBUU4.js → chunk-ENHX2CCS.js} +1485 -943
- package/dist/chunk-ENHX2CCS.js.map +1 -0
- package/dist/{chunk-I3SC4FOT.js → chunk-Z45FKRMJ.js} +212 -58
- package/dist/chunk-Z45FKRMJ.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/{dist-6Z4OSITR.js → dist-X5P5IR65.js} +7 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-Q575M3A7.js → interactive-4JKJTY3G.js} +5 -5
- package/dist/skills/agentv-bench/references/eval-yaml-spec.md +4 -4
- package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +14 -14
- package/dist/skills/agentv-eval-writer/references/python-helpers.md +47 -0
- package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-A36XLUI5.js.map +0 -1
- package/dist/chunk-B7CT3J2W.js.map +0 -1
- package/dist/chunk-BLXYBUU4.js.map +0 -1
- package/dist/chunk-I3SC4FOT.js.map +0 -1
- /package/dist/{artifact-writer-GIAIMGPQ.js.map → artifact-writer-AMV64TWV.js.map} +0 -0
- /package/dist/{chunk-TWQP7JYQ.js.map → chunk-A4J456KS.js.map} +0 -0
- /package/dist/{dist-6Z4OSITR.js.map → dist-X5P5IR65.js.map} +0 -0
- /package/dist/{interactive-Q575M3A7.js.map → interactive-4JKJTY3G.js.map} +0 -0
- /package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map} +0 -0
package/dist/cli.js
CHANGED
|
@@ -2,15 +2,15 @@
|
|
|
2
2
|
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
3
3
|
import {
|
|
4
4
|
runCli
|
|
5
|
-
} from "./chunk-
|
|
6
|
-
import "./chunk-
|
|
7
|
-
import "./chunk-
|
|
8
|
-
import "./chunk-
|
|
5
|
+
} from "./chunk-6FXICR66.js";
|
|
6
|
+
import "./chunk-CF5RCUWH.js";
|
|
7
|
+
import "./chunk-A4J456KS.js";
|
|
8
|
+
import "./chunk-Z45FKRMJ.js";
|
|
9
9
|
import "./chunk-76FOHROU.js";
|
|
10
10
|
import "./chunk-BPGJ4HBU.js";
|
|
11
11
|
import {
|
|
12
12
|
killAllTrackedChildren
|
|
13
|
-
} from "./chunk-
|
|
13
|
+
} from "./chunk-ENHX2CCS.js";
|
|
14
14
|
import "./chunk-NPVGBFF6.js";
|
|
15
15
|
import "./chunk-M7BUKBAF.js";
|
|
16
16
|
import "./chunk-5H446C7X.js";
|
|
@@ -30,6 +30,7 @@ import {
|
|
|
30
30
|
parseClaudeSession,
|
|
31
31
|
parseCodexSession,
|
|
32
32
|
parseEnvOutput,
|
|
33
|
+
prepareEvalWorkspace,
|
|
33
34
|
prepareResultsRepoBranch,
|
|
34
35
|
pushResultsRepoBranch,
|
|
35
36
|
pushWipCheckpoint,
|
|
@@ -46,7 +47,7 @@ import {
|
|
|
46
47
|
transpileEvalYaml,
|
|
47
48
|
transpileEvalYamlFile,
|
|
48
49
|
trimBaselineResult
|
|
49
|
-
} from "./chunk-
|
|
50
|
+
} from "./chunk-Z45FKRMJ.js";
|
|
50
51
|
import {
|
|
51
52
|
OtlpJsonFileExporter
|
|
52
53
|
} from "./chunk-76FOHROU.js";
|
|
@@ -221,6 +222,7 @@ import {
|
|
|
221
222
|
getWorkspacePath,
|
|
222
223
|
getWorkspacePoolRoot,
|
|
223
224
|
getWorkspacesRoot,
|
|
225
|
+
gradePreparedEvalCase,
|
|
224
226
|
groupTranscriptJsonLines,
|
|
225
227
|
initializeBaseline,
|
|
226
228
|
isAgentSkillsFormat,
|
|
@@ -322,7 +324,7 @@ import {
|
|
|
322
324
|
writeArtifactsFromResults,
|
|
323
325
|
writeInitialBenchmarkArtifact,
|
|
324
326
|
writePerTestArtifacts
|
|
325
|
-
} from "./chunk-
|
|
327
|
+
} from "./chunk-ENHX2CCS.js";
|
|
326
328
|
import "./chunk-NPVGBFF6.js";
|
|
327
329
|
import "./chunk-M7BUKBAF.js";
|
|
328
330
|
import "./chunk-5H446C7X.js";
|
|
@@ -519,6 +521,7 @@ export {
|
|
|
519
521
|
getWorkspacePath,
|
|
520
522
|
getWorkspacePoolRoot,
|
|
521
523
|
getWorkspacesRoot,
|
|
524
|
+
gradePreparedEvalCase,
|
|
522
525
|
groupTranscriptJsonLines,
|
|
523
526
|
initializeBaseline,
|
|
524
527
|
isAgentSkillsFormat,
|
|
@@ -559,6 +562,7 @@ export {
|
|
|
559
562
|
parseJsonSafe,
|
|
560
563
|
parseJsonlResults,
|
|
561
564
|
parseYamlValue,
|
|
565
|
+
prepareEvalWorkspace,
|
|
562
566
|
prepareResultsRepoBranch,
|
|
563
567
|
pushResultsRepoBranch,
|
|
564
568
|
pushWipCheckpoint,
|
|
@@ -644,4 +648,4 @@ export {
|
|
|
644
648
|
writeInitialBenchmarkArtifact,
|
|
645
649
|
writePerTestArtifacts
|
|
646
650
|
};
|
|
647
|
-
//# sourceMappingURL=dist-
|
|
651
|
+
//# sourceMappingURL=dist-X5P5IR65.js.map
|
package/dist/index.js
CHANGED
|
@@ -4,13 +4,13 @@ import {
|
|
|
4
4
|
preprocessArgv,
|
|
5
5
|
runCli,
|
|
6
6
|
usesDeprecatedStudioAlias
|
|
7
|
-
} from "./chunk-
|
|
8
|
-
import "./chunk-
|
|
9
|
-
import "./chunk-
|
|
10
|
-
import "./chunk-
|
|
7
|
+
} from "./chunk-6FXICR66.js";
|
|
8
|
+
import "./chunk-CF5RCUWH.js";
|
|
9
|
+
import "./chunk-A4J456KS.js";
|
|
10
|
+
import "./chunk-Z45FKRMJ.js";
|
|
11
11
|
import "./chunk-76FOHROU.js";
|
|
12
12
|
import "./chunk-BPGJ4HBU.js";
|
|
13
|
-
import "./chunk-
|
|
13
|
+
import "./chunk-ENHX2CCS.js";
|
|
14
14
|
import "./chunk-NPVGBFF6.js";
|
|
15
15
|
import "./chunk-M7BUKBAF.js";
|
|
16
16
|
import "./chunk-5H446C7X.js";
|
|
@@ -7,16 +7,16 @@ import {
|
|
|
7
7
|
findRepoRoot,
|
|
8
8
|
getCategories,
|
|
9
9
|
runEvalCommand
|
|
10
|
-
} from "./chunk-
|
|
11
|
-
import "./chunk-
|
|
12
|
-
import "./chunk-
|
|
10
|
+
} from "./chunk-CF5RCUWH.js";
|
|
11
|
+
import "./chunk-A4J456KS.js";
|
|
12
|
+
import "./chunk-Z45FKRMJ.js";
|
|
13
13
|
import "./chunk-76FOHROU.js";
|
|
14
14
|
import "./chunk-BPGJ4HBU.js";
|
|
15
15
|
import {
|
|
16
16
|
getAgentvConfigDir,
|
|
17
17
|
listTargetNames,
|
|
18
18
|
readTargetDefinitions
|
|
19
|
-
} from "./chunk-
|
|
19
|
+
} from "./chunk-ENHX2CCS.js";
|
|
20
20
|
import "./chunk-NPVGBFF6.js";
|
|
21
21
|
import "./chunk-M7BUKBAF.js";
|
|
22
22
|
import "./chunk-5H446C7X.js";
|
|
@@ -360,4 +360,4 @@ ${ANSI_DIM}Retrying execution errors...${ANSI_RESET}
|
|
|
360
360
|
export {
|
|
361
361
|
launchInteractiveWizard
|
|
362
362
|
};
|
|
363
|
-
//# sourceMappingURL=interactive-
|
|
363
|
+
//# sourceMappingURL=interactive-4JKJTY3G.js.map
|
|
@@ -175,12 +175,12 @@ Same as contains variants but explicitly case-insensitive.
|
|
|
175
175
|
- **Script SDK:** Use `defineCodeGrader` from `@agentv/eval`:
|
|
176
176
|
```typescript
|
|
177
177
|
import { defineCodeGrader } from '@agentv/eval';
|
|
178
|
-
export default defineCodeGrader(({
|
|
179
|
-
score:
|
|
180
|
-
assertions: [{ text: 'Contains expected', passed:
|
|
178
|
+
export default defineCodeGrader(({ output, trace }) => ({
|
|
179
|
+
score: (output ?? '').includes('expected') ? 1 : 0,
|
|
180
|
+
assertions: [{ text: 'Contains expected', passed: (output ?? '').includes('expected') }],
|
|
181
181
|
}));
|
|
182
182
|
```
|
|
183
|
-
- **Recipe:** The CLI runs the script, passing
|
|
183
|
+
- **Recipe:** The CLI runs the script, passing canonical JSON on stdin (`{output, input, expected_output, ...}`). Script returns `{"score": N, "assertions": [...]}`
|
|
184
184
|
- **PASS:** score >= 0.5 (or as configured).
|
|
185
185
|
|
|
186
186
|
### Composite assertion
|
|
@@ -63,29 +63,29 @@ import { defineCodeGrader, createTargetClient, definePromptTemplate } from '@age
|
|
|
63
63
|
- Raw stdin uses `snake_case`; SDK handlers receive `camelCase`
|
|
64
64
|
- Context fields: `input`, `expectedOutput`, `output`, `messages`, `criteria`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
|
|
65
65
|
|
|
66
|
+
For Python, the repo-local helper example in `examples/features/sdk-python/` keeps canonical `snake_case` fields and rejects deprecated wire aliases like `output_text`, `input_text`, and `reference_answer`. It is not a separate Python runner; generated evals still run through the AgentV CLI.
|
|
67
|
+
|
|
66
68
|
## Python Example
|
|
67
69
|
|
|
68
70
|
```python
|
|
69
71
|
#!/usr/bin/env python3
|
|
70
|
-
import
|
|
72
|
+
from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
|
|
73
|
+
|
|
71
74
|
|
|
72
|
-
def evaluate(
|
|
73
|
-
candidate =
|
|
75
|
+
def evaluate(context):
|
|
76
|
+
candidate = context.output or ""
|
|
74
77
|
assertions = []
|
|
75
78
|
for kw in ["async", "await"]:
|
|
76
|
-
assertions.append(
|
|
77
|
-
passed = sum(1 for
|
|
78
|
-
return
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
79
|
+
assertions.append(Assertion(text=f"Keyword '{kw}'", passed=kw in candidate))
|
|
80
|
+
passed = sum(1 for item in assertions if item.passed)
|
|
81
|
+
return CodeGraderResult(
|
|
82
|
+
score=passed / max(len(assertions), 1),
|
|
83
|
+
assertions=assertions,
|
|
84
|
+
)
|
|
85
|
+
|
|
82
86
|
|
|
83
87
|
if __name__ == "__main__":
|
|
84
|
-
|
|
85
|
-
print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
|
|
86
|
-
except Exception as e:
|
|
87
|
-
print(json.dumps({"score": 0, "assertions": [{"text": str(e), "passed": False}]}))
|
|
88
|
-
sys.exit(1)
|
|
88
|
+
define_code_grader(evaluate)
|
|
89
89
|
```
|
|
90
90
|
|
|
91
91
|
## TypeScript Example
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Repo-Local Python Helpers
|
|
2
|
+
|
|
3
|
+
AgentV's Python authoring surface is currently a repo-local helper example under `examples/features/sdk-python/`.
|
|
4
|
+
|
|
5
|
+
Use it when the user wants Python-based custom graders or wants to emit AgentV YAML/JSONL from Python without introducing a Python-native runner.
|
|
6
|
+
|
|
7
|
+
## Rules
|
|
8
|
+
|
|
9
|
+
- Prefer canonical AgentV wire and YAML fields.
|
|
10
|
+
- Do not accept deprecated wire aliases like `output_text`, `input_text`, or `reference_answer`.
|
|
11
|
+
- Keep Python eval authoring YAML-shaped. Mirror `execution`, `tests`, `assertions`, `expected_output`, and related AgentV keys directly.
|
|
12
|
+
- Run evals through the AgentV CLI, not through a separate Python runtime.
|
|
13
|
+
|
|
14
|
+
## Available helpers
|
|
15
|
+
|
|
16
|
+
- `agentv_py.grader`
|
|
17
|
+
- `load_grader_input()`
|
|
18
|
+
- `run_code_grader(handler)`
|
|
19
|
+
- `define_code_grader(handler)`
|
|
20
|
+
- `TargetClient.from_env()`
|
|
21
|
+
- `agentv_py.evals`
|
|
22
|
+
- `EvalDefinition`
|
|
23
|
+
- `EvalTest`
|
|
24
|
+
- `JsonlCase`
|
|
25
|
+
- `write_eval_yaml()`
|
|
26
|
+
- `write_jsonl()`
|
|
27
|
+
- `run_agentv_eval()`
|
|
28
|
+
|
|
29
|
+
## Example
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def evaluate(context):
|
|
36
|
+
actual = context.output or ""
|
|
37
|
+
expected = context.expected_output[0]["content"]
|
|
38
|
+
passed = actual.strip() == expected.strip()
|
|
39
|
+
return CodeGraderResult(
|
|
40
|
+
score=1.0 if passed else 0.0,
|
|
41
|
+
assertions=[Assertion(text="Exact match", passed=passed)],
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
if __name__ == "__main__":
|
|
46
|
+
define_code_grader(evaluate)
|
|
47
|
+
```
|
|
@@ -2,7 +2,7 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
|
|
|
2
2
|
import {
|
|
3
3
|
loadTsEvalFile,
|
|
4
4
|
loadTsEvalSuite
|
|
5
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-ENHX2CCS.js";
|
|
6
6
|
import "./chunk-NPVGBFF6.js";
|
|
7
7
|
import "./chunk-M7BUKBAF.js";
|
|
8
8
|
import "./chunk-5H446C7X.js";
|
|
@@ -10,4 +10,4 @@ export {
|
|
|
10
10
|
loadTsEvalFile,
|
|
11
11
|
loadTsEvalSuite
|
|
12
12
|
};
|
|
13
|
-
//# sourceMappingURL=ts-eval-loader-
|
|
13
|
+
//# sourceMappingURL=ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map
|