agentv 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -8
- package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js → agentv-provider-NFFLXG5M-TJAWCWCX.js} +2 -2
- package/dist/{chunk-A7ZDUB46.js → chunk-5GG6DDP5.js} +27 -17
- package/dist/chunk-5GG6DDP5.js.map +1 -0
- package/dist/{chunk-AR3QEKXH.js → chunk-BJV6MDBE.js} +3 -3
- package/dist/{chunk-AR3QEKXH.js.map → chunk-BJV6MDBE.js.map} +1 -1
- package/dist/{chunk-GOZV2HN2.js → chunk-D6G4N2H2.js} +386 -439
- package/dist/chunk-D6G4N2H2.js.map +1 -0
- package/dist/{chunk-RE5I3U2S.js → chunk-RLL4QGNL.js} +26 -45
- package/dist/chunk-RLL4QGNL.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-AFDYFH6Y.js → dist-MZFXE6B5.js} +3 -5
- package/dist/index.js +4 -4
- package/dist/{interactive-WXXTZ7PD.js → interactive-J7SUWZH2.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-A7ZDUB46.js.map +0 -1
- package/dist/chunk-GOZV2HN2.js.map +0 -1
- package/dist/chunk-RE5I3U2S.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js.map → agentv-provider-NFFLXG5M-TJAWCWCX.js.map} +0 -0
- /package/dist/{dist-AFDYFH6Y.js.map → dist-MZFXE6B5.js.map} +0 -0
- /package/dist/{interactive-WXXTZ7PD.js.map → interactive-J7SUWZH2.js.map} +0 -0
package/README.md
CHANGED
|
@@ -238,21 +238,19 @@ import json, sys
|
|
|
238
238
|
data = json.load(sys.stdin)
|
|
239
239
|
answer = data.get("answer", "")
|
|
240
240
|
|
|
241
|
-
|
|
242
|
-
misses = []
|
|
241
|
+
assertions = []
|
|
243
242
|
|
|
244
243
|
if "42" in answer:
|
|
245
|
-
|
|
244
|
+
assertions.append({"text": "Answer contains correct value (42)", "passed": True})
|
|
246
245
|
else:
|
|
247
|
-
|
|
246
|
+
assertions.append({"text": "Answer does not contain expected value (42)", "passed": False})
|
|
248
247
|
|
|
249
|
-
|
|
248
|
+
passed = sum(1 for a in assertions if a["passed"])
|
|
249
|
+
score = 1.0 if passed == len(assertions) else 0.0
|
|
250
250
|
|
|
251
251
|
print(json.dumps({
|
|
252
252
|
"score": score,
|
|
253
|
-
"hits": hits,
|
|
254
|
-
"misses": misses,
|
|
255
|
-
"reasoning": f"Passed {len(hits)} check(s)"
|
|
253
|
+
"assertions": assertions,
|
|
256
254
|
}))
|
|
257
255
|
```
|
|
258
256
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
|
|
2
2
|
import {
|
|
3
3
|
AgentvProvider
|
|
4
|
-
} from "./chunk-AR3QEKXH.js";
|
|
4
|
+
} from "./chunk-BJV6MDBE.js";
|
|
5
5
|
import "./chunk-5H446C7X.js";
|
|
6
6
|
export {
|
|
7
7
|
AgentvProvider
|
|
8
8
|
};
|
|
9
|
-
//# sourceMappingURL=agentv-provider-HDSAUUEF-LUBMM7TH.js.map
|
|
9
|
+
//# sourceMappingURL=agentv-provider-NFFLXG5M-TJAWCWCX.js.map
|
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
validateEvalFile,
|
|
17
17
|
validateFileReferences,
|
|
18
18
|
validateTargetsFile
|
|
19
|
-
} from "./chunk-RE5I3U2S.js";
|
|
19
|
+
} from "./chunk-RLL4QGNL.js";
|
|
20
20
|
import {
|
|
21
21
|
createBuiltinRegistry,
|
|
22
22
|
createProvider,
|
|
@@ -34,7 +34,7 @@ import {
|
|
|
34
34
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
35
35
|
transpileEvalYamlFile,
|
|
36
36
|
trimBaselineResult
|
|
37
|
-
} from "./chunk-GOZV2HN2.js";
|
|
37
|
+
} from "./chunk-D6G4N2H2.js";
|
|
38
38
|
import {
|
|
39
39
|
__commonJS,
|
|
40
40
|
__esm,
|
|
@@ -4185,7 +4185,7 @@ var evalRunCommand = command({
|
|
|
4185
4185
|
},
|
|
4186
4186
|
handler: async (args) => {
|
|
4187
4187
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4188
|
-
const { launchInteractiveWizard } = await import("./interactive-WXXTZ7PD.js");
|
|
4188
|
+
const { launchInteractiveWizard } = await import("./interactive-J7SUWZH2.js");
|
|
4189
4189
|
await launchInteractiveWizard();
|
|
4190
4190
|
return;
|
|
4191
4191
|
}
|
|
@@ -5089,9 +5089,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
|
|
|
5089
5089
|
originalScore: raw.score,
|
|
5090
5090
|
newScore: score.score,
|
|
5091
5091
|
verdict: score.verdict,
|
|
5092
|
-
|
|
5093
|
-
misses: score.misses,
|
|
5094
|
-
reasoning: score.reasoning
|
|
5092
|
+
assertions: score.assertions
|
|
5095
5093
|
});
|
|
5096
5094
|
}
|
|
5097
5095
|
return scored;
|
|
@@ -5110,7 +5108,9 @@ function renderTable(scored, assertSpec) {
|
|
|
5110
5108
|
lines.push(cols.map((col) => "\u2500".repeat(col.width)).join("\u2500\u2500"));
|
|
5111
5109
|
for (const r of scored) {
|
|
5112
5110
|
const verdictColor = r.verdict === "pass" ? c2.green : c2.red;
|
|
5113
|
-
const
|
|
5111
|
+
const failed = r.assertions.filter((a) => !a.passed);
|
|
5112
|
+
const passed = r.assertions.filter((a) => a.passed);
|
|
5113
|
+
const detail = failed.length > 0 ? failed[0].text.slice(0, 48) : passed.length > 0 ? passed[0].text.slice(0, 48) : "";
|
|
5114
5114
|
const row = [
|
|
5115
5115
|
padRight2(r.testId.slice(0, 24), cols[0].width),
|
|
5116
5116
|
padLeft2(formatScore(r.originalScore), cols[1].width),
|
|
@@ -5332,11 +5332,17 @@ function formatResultDetail(result, index, tree) {
|
|
|
5332
5332
|
if (result.error) {
|
|
5333
5333
|
lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
|
|
5334
5334
|
}
|
|
5335
|
-
if (result.
|
|
5336
|
-
|
|
5337
|
-
|
|
5338
|
-
|
|
5339
|
-
|
|
5335
|
+
if (result.assertions && result.assertions.length > 0) {
|
|
5336
|
+
const passed = result.assertions.filter((a) => a.passed);
|
|
5337
|
+
const failed = result.assertions.filter((a) => !a.passed);
|
|
5338
|
+
if (passed.length > 0)
|
|
5339
|
+
lines.push(
|
|
5340
|
+
` ${c2.green}\u2713 Passed:${c2.reset} ${passed.map((a) => a.text).join(", ")}`
|
|
5341
|
+
);
|
|
5342
|
+
if (failed.length > 0)
|
|
5343
|
+
lines.push(
|
|
5344
|
+
` ${c2.red}\u2717 Failed:${c2.reset} ${failed.map((a) => a.text).join(", ")}`
|
|
5345
|
+
);
|
|
5340
5346
|
}
|
|
5341
5347
|
if (result.scores && result.scores.length > 0) {
|
|
5342
5348
|
lines.push(` ${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
|
|
@@ -5344,10 +5350,14 @@ function formatResultDetail(result, index, tree) {
|
|
|
5344
5350
|
if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
|
|
5345
5351
|
lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result)}`);
|
|
5346
5352
|
}
|
|
5347
|
-
if (result.
|
|
5348
|
-
const
|
|
5349
|
-
|
|
5350
|
-
|
|
5353
|
+
if (result.assertions && result.assertions.length > 0) {
|
|
5354
|
+
const withEvidence = result.assertions.filter((a) => a.evidence);
|
|
5355
|
+
if (withEvidence.length > 0) {
|
|
5356
|
+
const maxLen = 200;
|
|
5357
|
+
const evidence = withEvidence[0].evidence;
|
|
5358
|
+
const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence;
|
|
5359
|
+
lines.push(` ${c2.dim}Evidence: ${truncated}${c2.reset}`);
|
|
5360
|
+
}
|
|
5351
5361
|
}
|
|
5352
5362
|
return lines.join("\n");
|
|
5353
5363
|
}
|
|
@@ -6268,4 +6278,4 @@ export {
|
|
|
6268
6278
|
preprocessArgv,
|
|
6269
6279
|
runCli
|
|
6270
6280
|
};
|
|
6271
|
-
//# sourceMappingURL=chunk-A7ZDUB46.js.map
|
|
6281
|
+
//# sourceMappingURL=chunk-5GG6DDP5.js.map
|