agentv 4.40.1 → 4.41.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/dist/{artifact-writer-GIAIMGPQ.js → artifact-writer-AMV64TWV.js} +4 -4
  2. package/dist/{chunk-B7CT3J2W.js → chunk-6FXICR66.js} +899 -300
  3. package/dist/chunk-6FXICR66.js.map +1 -0
  4. package/dist/{chunk-TWQP7JYQ.js → chunk-A4J456KS.js} +2 -2
  5. package/dist/{chunk-A36XLUI5.js → chunk-CF5RCUWH.js} +12 -10
  6. package/dist/chunk-CF5RCUWH.js.map +1 -0
  7. package/dist/{chunk-BLXYBUU4.js → chunk-ENHX2CCS.js} +1485 -943
  8. package/dist/chunk-ENHX2CCS.js.map +1 -0
  9. package/dist/{chunk-I3SC4FOT.js → chunk-Z45FKRMJ.js} +212 -58
  10. package/dist/chunk-Z45FKRMJ.js.map +1 -0
  11. package/dist/cli.js +5 -5
  12. package/dist/{dist-6Z4OSITR.js → dist-X5P5IR65.js} +7 -3
  13. package/dist/index.js +5 -5
  14. package/dist/{interactive-Q575M3A7.js → interactive-4JKJTY3G.js} +5 -5
  15. package/dist/skills/agentv-bench/references/eval-yaml-spec.md +4 -4
  16. package/dist/skills/agentv-eval-writer/references/custom-evaluators.md +14 -14
  17. package/dist/skills/agentv-eval-writer/references/python-helpers.md +47 -0
  18. package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js} +2 -2
  19. package/package.json +1 -1
  20. package/dist/chunk-A36XLUI5.js.map +0 -1
  21. package/dist/chunk-B7CT3J2W.js.map +0 -1
  22. package/dist/chunk-BLXYBUU4.js.map +0 -1
  23. package/dist/chunk-I3SC4FOT.js.map +0 -1
  24. /package/dist/{artifact-writer-GIAIMGPQ.js.map → artifact-writer-AMV64TWV.js.map} +0 -0
  25. /package/dist/{chunk-TWQP7JYQ.js.map → chunk-A4J456KS.js.map} +0 -0
  26. /package/dist/{dist-6Z4OSITR.js.map → dist-X5P5IR65.js.map} +0 -0
  27. /package/dist/{interactive-Q575M3A7.js.map → interactive-4JKJTY3G.js.map} +0 -0
  28. /package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map} +0 -0
package/dist/cli.js CHANGED
@@ -2,15 +2,15 @@
2
2
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
3
3
  import {
4
4
  runCli
5
- } from "./chunk-B7CT3J2W.js";
6
- import "./chunk-A36XLUI5.js";
7
- import "./chunk-TWQP7JYQ.js";
8
- import "./chunk-I3SC4FOT.js";
5
+ } from "./chunk-6FXICR66.js";
6
+ import "./chunk-CF5RCUWH.js";
7
+ import "./chunk-A4J456KS.js";
8
+ import "./chunk-Z45FKRMJ.js";
9
9
  import "./chunk-76FOHROU.js";
10
10
  import "./chunk-BPGJ4HBU.js";
11
11
  import {
12
12
  killAllTrackedChildren
13
- } from "./chunk-BLXYBUU4.js";
13
+ } from "./chunk-ENHX2CCS.js";
14
14
  import "./chunk-NPVGBFF6.js";
15
15
  import "./chunk-M7BUKBAF.js";
16
16
  import "./chunk-5H446C7X.js";
package/dist/{dist-6Z4OSITR.js → dist-X5P5IR65.js} CHANGED
@@ -30,6 +30,7 @@ import {
30
30
  parseClaudeSession,
31
31
  parseCodexSession,
32
32
  parseEnvOutput,
33
+ prepareEvalWorkspace,
33
34
  prepareResultsRepoBranch,
34
35
  pushResultsRepoBranch,
35
36
  pushWipCheckpoint,
@@ -46,7 +47,7 @@ import {
46
47
  transpileEvalYaml,
47
48
  transpileEvalYamlFile,
48
49
  trimBaselineResult
49
- } from "./chunk-I3SC4FOT.js";
50
+ } from "./chunk-Z45FKRMJ.js";
50
51
  import {
51
52
  OtlpJsonFileExporter
52
53
  } from "./chunk-76FOHROU.js";
@@ -221,6 +222,7 @@ import {
221
222
  getWorkspacePath,
222
223
  getWorkspacePoolRoot,
223
224
  getWorkspacesRoot,
225
+ gradePreparedEvalCase,
224
226
  groupTranscriptJsonLines,
225
227
  initializeBaseline,
226
228
  isAgentSkillsFormat,
@@ -322,7 +324,7 @@ import {
322
324
  writeArtifactsFromResults,
323
325
  writeInitialBenchmarkArtifact,
324
326
  writePerTestArtifacts
325
- } from "./chunk-BLXYBUU4.js";
327
+ } from "./chunk-ENHX2CCS.js";
326
328
  import "./chunk-NPVGBFF6.js";
327
329
  import "./chunk-M7BUKBAF.js";
328
330
  import "./chunk-5H446C7X.js";
@@ -519,6 +521,7 @@ export {
519
521
  getWorkspacePath,
520
522
  getWorkspacePoolRoot,
521
523
  getWorkspacesRoot,
524
+ gradePreparedEvalCase,
522
525
  groupTranscriptJsonLines,
523
526
  initializeBaseline,
524
527
  isAgentSkillsFormat,
@@ -559,6 +562,7 @@ export {
559
562
  parseJsonSafe,
560
563
  parseJsonlResults,
561
564
  parseYamlValue,
565
+ prepareEvalWorkspace,
562
566
  prepareResultsRepoBranch,
563
567
  pushResultsRepoBranch,
564
568
  pushWipCheckpoint,
@@ -644,4 +648,4 @@ export {
644
648
  writeInitialBenchmarkArtifact,
645
649
  writePerTestArtifacts
646
650
  };
647
- //# sourceMappingURL=dist-6Z4OSITR.js.map
651
+ //# sourceMappingURL=dist-X5P5IR65.js.map
package/dist/index.js CHANGED
@@ -4,13 +4,13 @@ import {
4
4
  preprocessArgv,
5
5
  runCli,
6
6
  usesDeprecatedStudioAlias
7
- } from "./chunk-B7CT3J2W.js";
8
- import "./chunk-A36XLUI5.js";
9
- import "./chunk-TWQP7JYQ.js";
10
- import "./chunk-I3SC4FOT.js";
7
+ } from "./chunk-6FXICR66.js";
8
+ import "./chunk-CF5RCUWH.js";
9
+ import "./chunk-A4J456KS.js";
10
+ import "./chunk-Z45FKRMJ.js";
11
11
  import "./chunk-76FOHROU.js";
12
12
  import "./chunk-BPGJ4HBU.js";
13
- import "./chunk-BLXYBUU4.js";
13
+ import "./chunk-ENHX2CCS.js";
14
14
  import "./chunk-NPVGBFF6.js";
15
15
  import "./chunk-M7BUKBAF.js";
16
16
  import "./chunk-5H446C7X.js";
package/dist/{interactive-Q575M3A7.js → interactive-4JKJTY3G.js} CHANGED
@@ -7,16 +7,16 @@ import {
7
7
  findRepoRoot,
8
8
  getCategories,
9
9
  runEvalCommand
10
- } from "./chunk-A36XLUI5.js";
11
- import "./chunk-TWQP7JYQ.js";
12
- import "./chunk-I3SC4FOT.js";
10
+ } from "./chunk-CF5RCUWH.js";
11
+ import "./chunk-A4J456KS.js";
12
+ import "./chunk-Z45FKRMJ.js";
13
13
  import "./chunk-76FOHROU.js";
14
14
  import "./chunk-BPGJ4HBU.js";
15
15
  import {
16
16
  getAgentvConfigDir,
17
17
  listTargetNames,
18
18
  readTargetDefinitions
19
- } from "./chunk-BLXYBUU4.js";
19
+ } from "./chunk-ENHX2CCS.js";
20
20
  import "./chunk-NPVGBFF6.js";
21
21
  import "./chunk-M7BUKBAF.js";
22
22
  import "./chunk-5H446C7X.js";
@@ -360,4 +360,4 @@ ${ANSI_DIM}Retrying execution errors...${ANSI_RESET}
360
360
  export {
361
361
  launchInteractiveWizard
362
362
  };
363
- //# sourceMappingURL=interactive-Q575M3A7.js.map
363
+ //# sourceMappingURL=interactive-4JKJTY3G.js.map
package/dist/skills/agentv-bench/references/eval-yaml-spec.md CHANGED
@@ -175,12 +175,12 @@ Same as contains variants but explicitly case-insensitive.
175
175
  - **Script SDK:** Use `defineCodeGrader` from `@agentv/eval`:
176
176
  ```typescript
177
177
  import { defineCodeGrader } from '@agentv/eval';
178
- export default defineCodeGrader(({ outputText, trace }) => ({
179
- score: outputText.includes('expected') ? 1 : 0,
180
- assertions: [{ text: 'Contains expected', passed: outputText.includes('expected') }],
178
+ export default defineCodeGrader(({ output, trace }) => ({
179
+ score: (output ?? '').includes('expected') ? 1 : 0,
180
+ assertions: [{ text: 'Contains expected', passed: (output ?? '').includes('expected') }],
181
181
  }));
182
182
  ```
183
- - **Recipe:** The CLI runs the script, passing context as JSON on stdin (`{output, outputText, input, inputText, ...}`). Script returns `{"score": N, "assertions": [...]}`
183
+ - **Recipe:** The CLI runs the script, passing canonical JSON on stdin (`{output, input, expected_output, ...}`). Script returns `{"score": N, "assertions": [...]}`
184
184
  - **PASS:** score >= 0.5 (or as configured).
185
185
 
186
186
  ### Composite assertion
package/dist/skills/agentv-eval-writer/references/custom-evaluators.md CHANGED
@@ -63,29 +63,29 @@ import { defineCodeGrader, createTargetClient, definePromptTemplate } from '@age
63
63
  - Raw stdin uses `snake_case`; SDK handlers receive `camelCase`
64
64
  - Context fields: `input`, `expectedOutput`, `output`, `messages`, `criteria`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
65
65
 
66
+ For Python, the repo-local helper example in `examples/features/sdk-python/` keeps canonical `snake_case` fields and rejects deprecated wire aliases like `output_text`, `input_text`, and `reference_answer`. It is not a separate Python runner; generated evals still run through the AgentV CLI.
67
+
66
68
  ## Python Example
67
69
 
68
70
  ```python
69
71
  #!/usr/bin/env python3
70
- import json, sys
72
+ from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
73
+
71
74
 
72
- def evaluate(data: dict) -> dict:
73
- candidate = data.get("output", "")
75
+ def evaluate(context):
76
+ candidate = context.output or ""
74
77
  assertions = []
75
78
  for kw in ["async", "await"]:
76
- assertions.append({"text": f"Keyword '{kw}'", "passed": kw in candidate})
77
- passed = sum(1 for a in assertions if a["passed"])
78
- return {
79
- "score": passed / max(len(assertions), 1),
80
- "assertions": assertions,
81
- }
79
+ assertions.append(Assertion(text=f"Keyword '{kw}'", passed=kw in candidate))
80
+ passed = sum(1 for item in assertions if item.passed)
81
+ return CodeGraderResult(
82
+ score=passed / max(len(assertions), 1),
83
+ assertions=assertions,
84
+ )
85
+
82
86
 
83
87
  if __name__ == "__main__":
84
- try:
85
- print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
86
- except Exception as e:
87
- print(json.dumps({"score": 0, "assertions": [{"text": str(e), "passed": False}]}))
88
- sys.exit(1)
88
+ define_code_grader(evaluate)
89
89
  ```
90
90
 
91
91
  ## TypeScript Example
package/dist/skills/agentv-eval-writer/references/python-helpers.md ADDED
@@ -0,0 +1,47 @@
1
+ # Repo-Local Python Helpers
2
+
3
+ AgentV's Python authoring surface is currently a repo-local helper example under `examples/features/sdk-python/`.
4
+
5
+ Use it when the user wants Python-based custom graders or wants to emit AgentV YAML/JSONL from Python without introducing a Python-native runner.
6
+
7
+ ## Rules
8
+
9
+ - Prefer canonical AgentV wire and YAML fields.
10
+ - Do not accept deprecated wire aliases like `output_text`, `input_text`, or `reference_answer`.
11
+ - Keep Python eval authoring YAML-shaped. Mirror `execution`, `tests`, `assertions`, `expected_output`, and related AgentV keys directly.
12
+ - Run evals through the AgentV CLI, not through a separate Python runtime.
13
+
14
+ ## Available helpers
15
+
16
+ - `agentv_py.grader`
17
+ - `load_grader_input()`
18
+ - `run_code_grader(handler)`
19
+ - `define_code_grader(handler)`
20
+ - `TargetClient.from_env()`
21
+ - `agentv_py.evals`
22
+ - `EvalDefinition`
23
+ - `EvalTest`
24
+ - `JsonlCase`
25
+ - `write_eval_yaml()`
26
+ - `write_jsonl()`
27
+ - `run_agentv_eval()`
28
+
29
+ ## Example
30
+
31
+ ```python
32
+ from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
33
+
34
+
35
+ def evaluate(context):
36
+ actual = context.output or ""
37
+ expected = context.expected_output[0]["content"]
38
+ passed = actual.strip() == expected.strip()
39
+ return CodeGraderResult(
40
+ score=1.0 if passed else 0.0,
41
+ assertions=[Assertion(text="Exact match", passed=passed)],
42
+ )
43
+
44
+
45
+ if __name__ == "__main__":
46
+ define_code_grader(evaluate)
47
+ ```
package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js} CHANGED
@@ -2,7 +2,7 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
2
2
  import {
3
3
  loadTsEvalFile,
4
4
  loadTsEvalSuite
5
- } from "./chunk-BLXYBUU4.js";
5
+ } from "./chunk-ENHX2CCS.js";
6
6
  import "./chunk-NPVGBFF6.js";
7
7
  import "./chunk-M7BUKBAF.js";
8
8
  import "./chunk-5H446C7X.js";
@@ -10,4 +10,4 @@ export {
10
10
  loadTsEvalFile,
11
11
  loadTsEvalSuite
12
12
  };
13
- //# sourceMappingURL=ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map
13
+ //# sourceMappingURL=ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentv",
3
- "version": "4.40.1",
3
+ "version": "4.41.0-next.1",
4
4
  "description": "CLI entry point for AgentV",
5
5
  "type": "module",
6
6
  "repository": {