npm - agentv - Versions diffs - 4.40.1 → 4.41.0-next.1 - Mend

agentv 4.40.1 → 4.41.0-next.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (28)

package/dist/cli.js CHANGED Viewed

@@ -2,15 +2,15 @@
 import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
 import {
   runCli
-} from "./chunk-B7CT3J2W.js";
-import "./chunk-A36XLUI5.js";
-import "./chunk-TWQP7JYQ.js";
-import "./chunk-I3SC4FOT.js";
+} from "./chunk-6FXICR66.js";
+import "./chunk-CF5RCUWH.js";
+import "./chunk-A4J456KS.js";
+import "./chunk-Z45FKRMJ.js";
 import "./chunk-76FOHROU.js";
 import "./chunk-BPGJ4HBU.js";
 import {
   killAllTrackedChildren
-} from "./chunk-BLXYBUU4.js";
+} from "./chunk-ENHX2CCS.js";
 import "./chunk-NPVGBFF6.js";
 import "./chunk-M7BUKBAF.js";
 import "./chunk-5H446C7X.js";

package/dist/{dist-6Z4OSITR.js → dist-X5P5IR65.js} RENAMED Viewed

@@ -30,6 +30,7 @@ import {
   parseClaudeSession,
   parseCodexSession,
   parseEnvOutput,
+  prepareEvalWorkspace,
   prepareResultsRepoBranch,
   pushResultsRepoBranch,
   pushWipCheckpoint,
@@ -46,7 +47,7 @@ import {
   transpileEvalYaml,
   transpileEvalYamlFile,
   trimBaselineResult
-} from "./chunk-I3SC4FOT.js";
+} from "./chunk-Z45FKRMJ.js";
 import {
   OtlpJsonFileExporter
 } from "./chunk-76FOHROU.js";
@@ -221,6 +222,7 @@ import {
   getWorkspacePath,
   getWorkspacePoolRoot,
   getWorkspacesRoot,
+  gradePreparedEvalCase,
   groupTranscriptJsonLines,
   initializeBaseline,
   isAgentSkillsFormat,
@@ -322,7 +324,7 @@ import {
   writeArtifactsFromResults,
   writeInitialBenchmarkArtifact,
   writePerTestArtifacts
-} from "./chunk-BLXYBUU4.js";
+} from "./chunk-ENHX2CCS.js";
 import "./chunk-NPVGBFF6.js";
 import "./chunk-M7BUKBAF.js";
 import "./chunk-5H446C7X.js";
@@ -519,6 +521,7 @@ export {
   getWorkspacePath,
   getWorkspacePoolRoot,
   getWorkspacesRoot,
+  gradePreparedEvalCase,
   groupTranscriptJsonLines,
   initializeBaseline,
   isAgentSkillsFormat,
@@ -559,6 +562,7 @@ export {
   parseJsonSafe,
   parseJsonlResults,
   parseYamlValue,
+  prepareEvalWorkspace,
   prepareResultsRepoBranch,
   pushResultsRepoBranch,
   pushWipCheckpoint,
@@ -644,4 +648,4 @@ export {
   writeInitialBenchmarkArtifact,
   writePerTestArtifacts
 };
-//# sourceMappingURL=dist-6Z4OSITR.js.map
+//# sourceMappingURL=dist-X5P5IR65.js.map

package/dist/index.js CHANGED Viewed

@@ -4,13 +4,13 @@ import {
   preprocessArgv,
   runCli,
   usesDeprecatedStudioAlias
-} from "./chunk-B7CT3J2W.js";
-import "./chunk-A36XLUI5.js";
-import "./chunk-TWQP7JYQ.js";
-import "./chunk-I3SC4FOT.js";
+} from "./chunk-6FXICR66.js";
+import "./chunk-CF5RCUWH.js";
+import "./chunk-A4J456KS.js";
+import "./chunk-Z45FKRMJ.js";
 import "./chunk-76FOHROU.js";
 import "./chunk-BPGJ4HBU.js";
-import "./chunk-BLXYBUU4.js";
+import "./chunk-ENHX2CCS.js";
 import "./chunk-NPVGBFF6.js";
 import "./chunk-M7BUKBAF.js";
 import "./chunk-5H446C7X.js";

package/dist/{interactive-Q575M3A7.js → interactive-4JKJTY3G.js} RENAMED Viewed

@@ -7,16 +7,16 @@ import {
   findRepoRoot,
   getCategories,
   runEvalCommand
-} from "./chunk-A36XLUI5.js";
-import "./chunk-TWQP7JYQ.js";
-import "./chunk-I3SC4FOT.js";
+} from "./chunk-CF5RCUWH.js";
+import "./chunk-A4J456KS.js";
+import "./chunk-Z45FKRMJ.js";
 import "./chunk-76FOHROU.js";
 import "./chunk-BPGJ4HBU.js";
 import {
   getAgentvConfigDir,
   listTargetNames,
   readTargetDefinitions
-} from "./chunk-BLXYBUU4.js";
+} from "./chunk-ENHX2CCS.js";
 import "./chunk-NPVGBFF6.js";
 import "./chunk-M7BUKBAF.js";
 import "./chunk-5H446C7X.js";
@@ -360,4 +360,4 @@ ${ANSI_DIM}Retrying execution errors...${ANSI_RESET}
 export {
   launchInteractiveWizard
 };
-//# sourceMappingURL=interactive-Q575M3A7.js.map
+//# sourceMappingURL=interactive-4JKJTY3G.js.map

package/dist/skills/agentv-bench/references/eval-yaml-spec.md CHANGED Viewed

@@ -175,12 +175,12 @@ Same as contains variants but explicitly case-insensitive.
 - **Script SDK:** Use `defineCodeGrader` from `@agentv/eval`:
   ```typescript
   import { defineCodeGrader } from '@agentv/eval';
-  export default defineCodeGrader(({ outputText, trace }) => ({
-    score: outputText.includes('expected') ? 1 : 0,
-    assertions: [{ text: 'Contains expected', passed: outputText.includes('expected') }],
+  export default defineCodeGrader(({ output, trace }) => ({
+    score: (output ?? '').includes('expected') ? 1 : 0,
+    assertions: [{ text: 'Contains expected', passed: (output ?? '').includes('expected') }],
   }));
   ```
-- **Recipe:** The CLI runs the script, passing context as JSON on stdin (`{output, outputText, input, inputText, ...}`). Script returns `{"score": N, "assertions": [...]}`
+- **Recipe:** The CLI runs the script, passing canonical JSON on stdin (`{output, input, expected_output, ...}`). Script returns `{"score": N, "assertions": [...]}`
 - **PASS:** score >= 0.5 (or as configured).
 ### Composite assertion

package/dist/skills/agentv-eval-writer/references/custom-evaluators.md CHANGED Viewed

@@ -63,29 +63,29 @@ import { defineCodeGrader, createTargetClient, definePromptTemplate } from '@age
   - Raw stdin uses `snake_case`; SDK handlers receive `camelCase`
   - Context fields: `input`, `expectedOutput`, `output`, `messages`, `criteria`, `config`, `trace`, `traceSummary`, `tokenUsage`, `costUsd`, `durationMs`, `startTime`, `endTime`
+For Python, the repo-local helper example in `examples/features/sdk-python/` keeps canonical `snake_case` fields and rejects deprecated wire aliases like `output_text`, `input_text`, and `reference_answer`. It is not a separate Python runner; generated evals still run through the AgentV CLI.
 ## Python Example
 ```python
 #!/usr/bin/env python3
-import json, sys
+from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
-def evaluate(data: dict) -> dict:
-    candidate = data.get("output", "")
+def evaluate(context):
+    candidate = context.output or ""
     assertions = []
     for kw in ["async", "await"]:
-        assertions.append({"text": f"Keyword '{kw}'", "passed": kw in candidate})
-    passed = sum(1 for a in assertions if a["passed"])
-    return {
-        "score": passed / max(len(assertions), 1),
-        "assertions": assertions,
-    }
+        assertions.append(Assertion(text=f"Keyword '{kw}'", passed=kw in candidate))
+    passed = sum(1 for item in assertions if item.passed)
+    return CodeGraderResult(
+        score=passed / max(len(assertions), 1),
+        assertions=assertions,
+    )
 if __name__ == "__main__":
-    try:
-        print(json.dumps(evaluate(json.loads(sys.stdin.read()))))
-    except Exception as e:
-        print(json.dumps({"score": 0, "assertions": [{"text": str(e), "passed": False}]}))
-        sys.exit(1)
+    define_code_grader(evaluate)
 ```
 ## TypeScript Example

package/dist/skills/agentv-eval-writer/references/python-helpers.md ADDED Viewed

@@ -0,0 +1,47 @@
+# Repo-Local Python Helpers
+AgentV's Python authoring surface is currently a repo-local helper example under `examples/features/sdk-python/`.
+Use it when the user wants Python-based custom graders or wants to emit AgentV YAML/JSONL from Python without introducing a Python-native runner.
+## Rules
+- Prefer canonical AgentV wire and YAML fields.
+- Do not accept deprecated wire aliases like `output_text`, `input_text`, or `reference_answer`.
+- Keep Python eval authoring YAML-shaped. Mirror `execution`, `tests`, `assertions`, `expected_output`, and related AgentV keys directly.
+- Run evals through the AgentV CLI, not through a separate Python runtime.
+## Available helpers
+- `agentv_py.grader`
+  - `load_grader_input()`
+  - `run_code_grader(handler)`
+  - `define_code_grader(handler)`
+  - `TargetClient.from_env()`
+- `agentv_py.evals`
+  - `EvalDefinition`
+  - `EvalTest`
+  - `JsonlCase`
+  - `write_eval_yaml()`
+  - `write_jsonl()`
+  - `run_agentv_eval()`
+## Example
+```python
+from agentv_py.grader import Assertion, CodeGraderResult, define_code_grader
+def evaluate(context):
+    actual = context.output or ""
+    expected = context.expected_output[0]["content"]
+    passed = actual.strip() == expected.strip()
+    return CodeGraderResult(
+        score=1.0 if passed else 0.0,
+        assertions=[Assertion(text="Exact match", passed=passed)],
+    )
+if __name__ == "__main__":
+    define_code_grader(evaluate)
+```

package/dist/{ts-eval-loader-NWH3B4HG-UXXCZKLP.js → ts-eval-loader-ZVL6CGTE-TZYZX3QS.js} RENAMED Viewed

@@ -2,7 +2,7 @@ import { createRequire } from 'node:module'; const require = createRequire(impor
 import {
   loadTsEvalFile,
   loadTsEvalSuite
-} from "./chunk-BLXYBUU4.js";
+} from "./chunk-ENHX2CCS.js";
 import "./chunk-NPVGBFF6.js";
 import "./chunk-M7BUKBAF.js";
 import "./chunk-5H446C7X.js";
@@ -10,4 +10,4 @@ export {
   loadTsEvalFile,
   loadTsEvalSuite
 };
-//# sourceMappingURL=ts-eval-loader-NWH3B4HG-UXXCZKLP.js.map
+//# sourceMappingURL=ts-eval-loader-ZVL6CGTE-TZYZX3QS.js.map

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agentv",
-  "version": "4.40.1",
+  "version": "4.41.0-next.1",
   "description": "CLI entry point for AgentV",
   "type": "module",
   "repository": {