agentv 3.4.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -11
- package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js → agentv-provider-NFFLXG5M-TJAWCWCX.js} +2 -2
- package/dist/{chunk-AR3QEKXH.js → chunk-BJV6MDBE.js} +3 -3
- package/dist/{chunk-AR3QEKXH.js.map → chunk-BJV6MDBE.js.map} +1 -1
- package/dist/{chunk-A7ZDUB46.js → chunk-IP5BO54H.js} +35 -26
- package/dist/chunk-IP5BO54H.js.map +1 -0
- package/dist/{chunk-GOZV2HN2.js → chunk-K4RXLQWV.js} +453 -494
- package/dist/chunk-K4RXLQWV.js.map +1 -0
- package/dist/{chunk-RE5I3U2S.js → chunk-UU5N43YS.js} +27 -46
- package/dist/chunk-UU5N43YS.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-AFDYFH6Y.js → dist-VWEFBDZ5.js} +3 -5
- package/dist/index.js +4 -4
- package/dist/{interactive-WXXTZ7PD.js → interactive-5S4ILY2Y.js} +4 -4
- package/dist/templates/.agentv/.env.example +9 -11
- package/dist/templates/.agentv/config.yaml +0 -5
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-A7ZDUB46.js.map +0 -1
- package/dist/chunk-GOZV2HN2.js.map +0 -1
- package/dist/chunk-RE5I3U2S.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js.map → agentv-provider-NFFLXG5M-TJAWCWCX.js.map} +0 -0
- /package/dist/{dist-AFDYFH6Y.js.map → dist-VWEFBDZ5.js.map} +0 -0
- /package/dist/{interactive-WXXTZ7PD.js.map → interactive-5S4ILY2Y.js.map} +0 -0
|
@@ -16,7 +16,7 @@ import {
|
|
|
16
16
|
validateEvalFile,
|
|
17
17
|
validateFileReferences,
|
|
18
18
|
validateTargetsFile
|
|
19
|
-
} from "./chunk-RE5I3U2S.js";
|
|
19
|
+
} from "./chunk-UU5N43YS.js";
|
|
20
20
|
import {
|
|
21
21
|
createBuiltinRegistry,
|
|
22
22
|
createProvider,
|
|
@@ -34,7 +34,7 @@ import {
|
|
|
34
34
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
35
35
|
transpileEvalYamlFile,
|
|
36
36
|
trimBaselineResult
|
|
37
|
-
} from "./chunk-GOZV2HN2.js";
|
|
37
|
+
} from "./chunk-K4RXLQWV.js";
|
|
38
38
|
import {
|
|
39
39
|
__commonJS,
|
|
40
40
|
__esm,
|
|
@@ -3493,9 +3493,9 @@ var ASSERTION_TEMPLATES = {
|
|
|
3493
3493
|
default: `#!/usr/bin/env bun
|
|
3494
3494
|
import { defineAssertion } from '@agentv/eval';
|
|
3495
3495
|
|
|
3496
|
-
export default defineAssertion(({ answer }) => {
|
|
3496
|
+
export default defineAssertion(({ outputText }) => {
|
|
3497
3497
|
// TODO: Implement your assertion logic
|
|
3498
|
-
const pass =
|
|
3498
|
+
const pass = outputText.length > 0;
|
|
3499
3499
|
return {
|
|
3500
3500
|
pass,
|
|
3501
3501
|
reasoning: pass ? 'Output has content' : 'Output is empty',
|
|
@@ -3505,9 +3505,9 @@ export default defineAssertion(({ answer }) => {
|
|
|
3505
3505
|
score: `#!/usr/bin/env bun
|
|
3506
3506
|
import { defineAssertion } from '@agentv/eval';
|
|
3507
3507
|
|
|
3508
|
-
export default defineAssertion(({ answer }) => {
|
|
3508
|
+
export default defineAssertion(({ outputText }) => {
|
|
3509
3509
|
// TODO: Implement your scoring logic (0.0 to 1.0)
|
|
3510
|
-
const score =
|
|
3510
|
+
const score = outputText.length > 0 ? 1.0 : 0.0;
|
|
3511
3511
|
return {
|
|
3512
3512
|
pass: score >= 0.5,
|
|
3513
3513
|
score,
|
|
@@ -3967,7 +3967,6 @@ var evalAssertCommand = command({
|
|
|
3967
3967
|
}
|
|
3968
3968
|
const payload = JSON.stringify(
|
|
3969
3969
|
{
|
|
3970
|
-
answer: resolvedOutput,
|
|
3971
3970
|
output: [{ role: "assistant", content: resolvedOutput }],
|
|
3972
3971
|
input: [{ role: "user", content: resolvedInput }],
|
|
3973
3972
|
question: resolvedInput,
|
|
@@ -4185,7 +4184,7 @@ var evalRunCommand = command({
|
|
|
4185
4184
|
},
|
|
4186
4185
|
handler: async (args) => {
|
|
4187
4186
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4188
|
-
const { launchInteractiveWizard } = await import("./interactive-WXXTZ7PD.js");
|
|
4187
|
+
const { launchInteractiveWizard } = await import("./interactive-5S4ILY2Y.js");
|
|
4189
4188
|
await launchInteractiveWizard();
|
|
4190
4189
|
return;
|
|
4191
4190
|
}
|
|
@@ -4707,10 +4706,10 @@ function exportResults(sourceFile, content, outputDir) {
|
|
|
4707
4706
|
const outputsDir = path8.join(outputDir, "outputs");
|
|
4708
4707
|
mkdirSync2(outputsDir, { recursive: true });
|
|
4709
4708
|
for (const result of patched) {
|
|
4710
|
-
const
|
|
4711
|
-
if (
|
|
4709
|
+
const outputText = result.outputText;
|
|
4710
|
+
if (outputText) {
|
|
4712
4711
|
const id = safeTestId(result);
|
|
4713
|
-
writeFileSync3(path8.join(outputsDir, `${id}.txt`),
|
|
4712
|
+
writeFileSync3(path8.join(outputsDir, `${id}.txt`), outputText);
|
|
4714
4713
|
}
|
|
4715
4714
|
}
|
|
4716
4715
|
}
|
|
@@ -5022,7 +5021,7 @@ function toTraceSummary(raw) {
|
|
|
5022
5021
|
return toCamelCaseDeep(raw.trace);
|
|
5023
5022
|
}
|
|
5024
5023
|
function extractCandidate(raw) {
|
|
5025
|
-
if (raw.
|
|
5024
|
+
if (raw.output_text !== void 0) return raw.output_text;
|
|
5026
5025
|
if (raw.output !== void 0)
|
|
5027
5026
|
return typeof raw.output === "string" ? raw.output : JSON.stringify(raw.output);
|
|
5028
5027
|
return "";
|
|
@@ -5089,9 +5088,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
|
|
|
5089
5088
|
originalScore: raw.score,
|
|
5090
5089
|
newScore: score.score,
|
|
5091
5090
|
verdict: score.verdict,
|
|
5092
|
-
|
|
5093
|
-
misses: score.misses,
|
|
5094
|
-
reasoning: score.reasoning
|
|
5091
|
+
assertions: score.assertions
|
|
5095
5092
|
});
|
|
5096
5093
|
}
|
|
5097
5094
|
return scored;
|
|
@@ -5110,7 +5107,9 @@ function renderTable(scored, assertSpec) {
|
|
|
5110
5107
|
lines.push(cols.map((col) => "\u2500".repeat(col.width)).join("\u2500\u2500"));
|
|
5111
5108
|
for (const r of scored) {
|
|
5112
5109
|
const verdictColor = r.verdict === "pass" ? c2.green : c2.red;
|
|
5113
|
-
const
|
|
5110
|
+
const failed = r.assertions.filter((a) => !a.passed);
|
|
5111
|
+
const passed = r.assertions.filter((a) => a.passed);
|
|
5112
|
+
const detail = failed.length > 0 ? failed[0].text.slice(0, 48) : passed.length > 0 ? passed[0].text.slice(0, 48) : "";
|
|
5114
5113
|
const row = [
|
|
5115
5114
|
padRight2(r.testId.slice(0, 24), cols[0].width),
|
|
5116
5115
|
padLeft2(formatScore(r.originalScore), cols[1].width),
|
|
@@ -5332,11 +5331,17 @@ function formatResultDetail(result, index, tree) {
|
|
|
5332
5331
|
if (result.error) {
|
|
5333
5332
|
lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
|
|
5334
5333
|
}
|
|
5335
|
-
if (result.
|
|
5336
|
-
|
|
5337
|
-
|
|
5338
|
-
|
|
5339
|
-
|
|
5334
|
+
if (result.assertions && result.assertions.length > 0) {
|
|
5335
|
+
const passed = result.assertions.filter((a) => a.passed);
|
|
5336
|
+
const failed = result.assertions.filter((a) => !a.passed);
|
|
5337
|
+
if (passed.length > 0)
|
|
5338
|
+
lines.push(
|
|
5339
|
+
` ${c2.green}\u2713 Passed:${c2.reset} ${passed.map((a) => a.text).join(", ")}`
|
|
5340
|
+
);
|
|
5341
|
+
if (failed.length > 0)
|
|
5342
|
+
lines.push(
|
|
5343
|
+
` ${c2.red}\u2717 Failed:${c2.reset} ${failed.map((a) => a.text).join(", ")}`
|
|
5344
|
+
);
|
|
5340
5345
|
}
|
|
5341
5346
|
if (result.scores && result.scores.length > 0) {
|
|
5342
5347
|
lines.push(` ${c2.dim}Scores:${c2.reset} ${renderScores(result.scores)}`);
|
|
@@ -5344,10 +5349,14 @@ function formatResultDetail(result, index, tree) {
|
|
|
5344
5349
|
if (result.trace || result.duration_ms !== void 0 || result.cost_usd !== void 0) {
|
|
5345
5350
|
lines.push(` ${c2.dim}Trace:${c2.reset} ${renderFlatTrace(result)}`);
|
|
5346
5351
|
}
|
|
5347
|
-
if (result.
|
|
5348
|
-
const
|
|
5349
|
-
|
|
5350
|
-
|
|
5352
|
+
if (result.assertions && result.assertions.length > 0) {
|
|
5353
|
+
const withEvidence = result.assertions.filter((a) => a.evidence);
|
|
5354
|
+
if (withEvidence.length > 0) {
|
|
5355
|
+
const maxLen = 200;
|
|
5356
|
+
const evidence = withEvidence[0].evidence;
|
|
5357
|
+
const truncated = evidence.length > maxLen ? `${evidence.slice(0, maxLen)}...` : evidence;
|
|
5358
|
+
lines.push(` ${c2.dim}Evidence: ${truncated}${c2.reset}`);
|
|
5359
|
+
}
|
|
5351
5360
|
}
|
|
5352
5361
|
return lines.join("\n");
|
|
5353
5362
|
}
|
|
@@ -6268,4 +6277,4 @@ export {
|
|
|
6268
6277
|
preprocessArgv,
|
|
6269
6278
|
runCli
|
|
6270
6279
|
};
|
|
6271
|
-
//# sourceMappingURL=chunk-A7ZDUB46.js.map
|
|
6280
|
+
//# sourceMappingURL=chunk-IP5BO54H.js.map
|