agentv 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-QCKPJPYC.js → chunk-HAZJO7OY.js} +4 -4
- package/dist/{chunk-QCKPJPYC.js.map → chunk-HAZJO7OY.js.map} +1 -1
- package/dist/{chunk-TDY2FQN5.js → chunk-UXSQQHCI.js} +124 -153
- package/dist/chunk-UXSQQHCI.js.map +1 -0
- package/dist/{chunk-XEAW7OQT.js → chunk-XLM3RNN7.js} +19 -29
- package/dist/chunk-XLM3RNN7.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-2JUUJ6PT.js → dist-VVXR6TYM.js} +4 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-ASB4FU3J.js → interactive-NVNOLL2H.js} +3 -3
- package/dist/studio/assets/{index-DofvSOmX.js → index-Cir5Hc8S.js} +1 -1
- package/dist/studio/assets/{index-CDGReinH.js → index-D8LVkz9x.js} +1 -1
- package/dist/studio/index.html +1 -1
- package/package.json +1 -1
- package/dist/chunk-TDY2FQN5.js.map +0 -1
- package/dist/chunk-XEAW7OQT.js.map +0 -1
- package/dist/{dist-2JUUJ6PT.js.map → dist-VVXR6TYM.js.map} +0 -0
- package/dist/{interactive-ASB4FU3J.js.map → interactive-NVNOLL2H.js.map} +0 -0
|
@@ -24,7 +24,7 @@ import {
|
|
|
24
24
|
validateFileReferences,
|
|
25
25
|
validateTargetsFile,
|
|
26
26
|
writeArtifactsFromResults
|
|
27
|
-
} from "./chunk-QCKPJPYC.js";
|
|
27
|
+
} from "./chunk-HAZJO7OY.js";
|
|
28
28
|
import {
|
|
29
29
|
DEFAULT_CATEGORY,
|
|
30
30
|
createBuiltinRegistry,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
44
44
|
transpileEvalYamlFile,
|
|
45
45
|
trimBaselineResult
|
|
46
|
-
} from "./chunk-XEAW7OQT.js";
|
|
46
|
+
} from "./chunk-XLM3RNN7.js";
|
|
47
47
|
import {
|
|
48
48
|
__commonJS,
|
|
49
49
|
__esm,
|
|
@@ -4217,7 +4217,7 @@ var evalRunCommand = command({
|
|
|
4217
4217
|
},
|
|
4218
4218
|
handler: async (args) => {
|
|
4219
4219
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4220
|
-
const { launchInteractiveWizard } = await import("./interactive-ASB4FU3J.js");
|
|
4220
|
+
const { launchInteractiveWizard } = await import("./interactive-NVNOLL2H.js");
|
|
4221
4221
|
await launchInteractiveWizard();
|
|
4222
4222
|
return;
|
|
4223
4223
|
}
|
|
@@ -4628,12 +4628,115 @@ function computeStats(values) {
|
|
|
4628
4628
|
// src/commands/pipeline/grade.ts
|
|
4629
4629
|
import { mkdir as mkdir2, readFile as readFile2, readdir as readdir2, writeFile as writeFile3 } from "node:fs/promises";
|
|
4630
4630
|
import { join as join2 } from "node:path";
|
|
4631
|
+
var DEFAULT_CONCURRENCY = 10;
|
|
4631
4632
|
function extractInputText(input) {
|
|
4632
4633
|
if (!input || input.length === 0) return "";
|
|
4633
4634
|
if (input.length === 1) return input[0].content;
|
|
4634
4635
|
return input.map((m) => `@[${m.role}]:
|
|
4635
4636
|
${m.content}`).join("\n\n");
|
|
4636
4637
|
}
|
|
4638
|
+
async function runCodeGraders(tasks, concurrency) {
|
|
4639
|
+
let totalGraders = 0;
|
|
4640
|
+
let totalPassed = 0;
|
|
4641
|
+
let completed = 0;
|
|
4642
|
+
const total = tasks.length;
|
|
4643
|
+
if (total === 0) return { totalGraders: 0, totalPassed: 0 };
|
|
4644
|
+
const writeProgress = () => {
|
|
4645
|
+
process.stderr.write(`\rGrading: ${completed}/${total} done`);
|
|
4646
|
+
};
|
|
4647
|
+
writeProgress();
|
|
4648
|
+
const executeGrader = async (task) => {
|
|
4649
|
+
const { testId, testDir, resultsDir, graderFile, responseText, inputData } = task;
|
|
4650
|
+
const graderConfig = JSON.parse(
|
|
4651
|
+
await readFile2(join2(testDir, "code_graders", graderFile), "utf8")
|
|
4652
|
+
);
|
|
4653
|
+
const graderName = graderConfig.name;
|
|
4654
|
+
const inputText = extractInputText(inputData.input);
|
|
4655
|
+
const payload = JSON.stringify({
|
|
4656
|
+
output: [{ role: "assistant", content: responseText }],
|
|
4657
|
+
input: inputData.input,
|
|
4658
|
+
criteria: "",
|
|
4659
|
+
expected_output: [],
|
|
4660
|
+
input_files: inputData.input_files ?? [],
|
|
4661
|
+
trace: null,
|
|
4662
|
+
token_usage: null,
|
|
4663
|
+
cost_usd: null,
|
|
4664
|
+
duration_ms: null,
|
|
4665
|
+
start_time: null,
|
|
4666
|
+
end_time: null,
|
|
4667
|
+
file_changes: null,
|
|
4668
|
+
workspace_path: null,
|
|
4669
|
+
config: graderConfig.config ?? null,
|
|
4670
|
+
metadata: inputData.metadata ?? {},
|
|
4671
|
+
input_text: inputText,
|
|
4672
|
+
output_text: responseText,
|
|
4673
|
+
expected_output_text: ""
|
|
4674
|
+
});
|
|
4675
|
+
try {
|
|
4676
|
+
const stdout = await executeScript(
|
|
4677
|
+
graderConfig.command,
|
|
4678
|
+
payload,
|
|
4679
|
+
void 0,
|
|
4680
|
+
graderConfig.cwd
|
|
4681
|
+
);
|
|
4682
|
+
const parsed = JSON.parse(stdout);
|
|
4683
|
+
const score = typeof parsed.score === "number" ? parsed.score : 0;
|
|
4684
|
+
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
|
|
4685
|
+
const result = {
|
|
4686
|
+
name: graderName,
|
|
4687
|
+
type: "code-grader",
|
|
4688
|
+
score,
|
|
4689
|
+
weight: graderConfig.weight ?? 1,
|
|
4690
|
+
assertions,
|
|
4691
|
+
details: parsed.details ?? {}
|
|
4692
|
+
};
|
|
4693
|
+
await writeFile3(
|
|
4694
|
+
join2(resultsDir, `${graderName}.json`),
|
|
4695
|
+
`${JSON.stringify(result, null, 2)}
|
|
4696
|
+
`,
|
|
4697
|
+
"utf8"
|
|
4698
|
+
);
|
|
4699
|
+
totalGraders++;
|
|
4700
|
+
if (score >= 0.5) totalPassed++;
|
|
4701
|
+
} catch (error) {
|
|
4702
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
4703
|
+
process.stderr.write(`
|
|
4704
|
+
${testId}/${graderName}: ERROR \u2014 ${message}
|
|
4705
|
+
`);
|
|
4706
|
+
const errorResult = {
|
|
4707
|
+
name: graderName,
|
|
4708
|
+
type: "code-grader",
|
|
4709
|
+
score: 0,
|
|
4710
|
+
weight: graderConfig.weight ?? 1,
|
|
4711
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
4712
|
+
details: { error: message }
|
|
4713
|
+
};
|
|
4714
|
+
await writeFile3(
|
|
4715
|
+
join2(resultsDir, `${graderName}.json`),
|
|
4716
|
+
`${JSON.stringify(errorResult, null, 2)}
|
|
4717
|
+
`,
|
|
4718
|
+
"utf8"
|
|
4719
|
+
);
|
|
4720
|
+
totalGraders++;
|
|
4721
|
+
} finally {
|
|
4722
|
+
completed++;
|
|
4723
|
+
writeProgress();
|
|
4724
|
+
}
|
|
4725
|
+
};
|
|
4726
|
+
const pending = /* @__PURE__ */ new Set();
|
|
4727
|
+
for (const task of tasks) {
|
|
4728
|
+
const p = executeGrader(task).then(() => {
|
|
4729
|
+
pending.delete(p);
|
|
4730
|
+
});
|
|
4731
|
+
pending.add(p);
|
|
4732
|
+
if (pending.size >= concurrency) {
|
|
4733
|
+
await Promise.race(pending);
|
|
4734
|
+
}
|
|
4735
|
+
}
|
|
4736
|
+
await Promise.all(pending);
|
|
4737
|
+
process.stderr.write("\n");
|
|
4738
|
+
return { totalGraders, totalPassed };
|
|
4739
|
+
}
|
|
4637
4740
|
var evalGradeCommand = command({
|
|
4638
4741
|
name: "grade",
|
|
4639
4742
|
description: "Run code-grader assertions on responses in an export directory",
|
|
@@ -4642,16 +4745,22 @@ var evalGradeCommand = command({
|
|
|
4642
4745
|
type: string,
|
|
4643
4746
|
displayName: "export-dir",
|
|
4644
4747
|
description: "Export directory from pipeline input"
|
|
4748
|
+
}),
|
|
4749
|
+
concurrency: option({
|
|
4750
|
+
type: optional(number),
|
|
4751
|
+
long: "concurrency",
|
|
4752
|
+
short: "j",
|
|
4753
|
+
description: `Number of graders to run in parallel (default: ${DEFAULT_CONCURRENCY})`
|
|
4645
4754
|
})
|
|
4646
4755
|
},
|
|
4647
|
-
handler: async ({ exportDir }) => {
|
|
4756
|
+
handler: async ({ exportDir, concurrency }) => {
|
|
4757
|
+
const maxWorkers = concurrency ?? DEFAULT_CONCURRENCY;
|
|
4648
4758
|
const manifestPath = join2(exportDir, "manifest.json");
|
|
4649
4759
|
const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
|
|
4650
4760
|
const testIds = manifest.test_ids;
|
|
4651
4761
|
const evalSet = manifest.dataset ?? "";
|
|
4652
4762
|
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4653
|
-
let totalGraders = 0;
|
4654
|
-
let totalPassed = 0;
|
|
4763
|
+
const tasks = [];
|
|
4655
4764
|
for (const testId of testIds) {
|
|
4656
4765
|
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
4657
4766
|
const testDir = join2(exportDir, ...subpath);
|
|
@@ -4668,76 +4777,10 @@ var evalGradeCommand = command({
|
|
|
4668
4777
|
const responseText = await readFile2(join2(testDir, "response.md"), "utf8");
|
|
4669
4778
|
const inputData = JSON.parse(await readFile2(join2(testDir, "input.json"), "utf8"));
|
|
4670
4779
|
for (const graderFile of graderFiles) {
|
|
4671
|
-
|
|
4672
|
-
const graderName = graderConfig.name;
|
|
4673
|
-
const inputText = extractInputText(inputData.input);
|
|
4674
|
-
const payload = JSON.stringify({
|
|
4675
|
-
output: [{ role: "assistant", content: responseText }],
|
|
4676
|
-
input: inputData.input,
|
|
4677
|
-
criteria: "",
|
|
4678
|
-
expected_output: [],
|
|
4679
|
-
input_files: inputData.input_files ?? [],
|
|
4680
|
-
trace: null,
|
|
4681
|
-
token_usage: null,
|
|
4682
|
-
cost_usd: null,
|
|
4683
|
-
duration_ms: null,
|
|
4684
|
-
start_time: null,
|
|
4685
|
-
end_time: null,
|
|
4686
|
-
file_changes: null,
|
|
4687
|
-
workspace_path: null,
|
|
4688
|
-
config: graderConfig.config ?? null,
|
|
4689
|
-
metadata: inputData.metadata ?? {},
|
|
4690
|
-
input_text: inputText,
|
|
4691
|
-
output_text: responseText,
|
|
4692
|
-
expected_output_text: ""
|
|
4693
|
-
});
|
|
4694
|
-
try {
|
|
4695
|
-
const stdout = await executeScript(
|
|
4696
|
-
graderConfig.command,
|
|
4697
|
-
payload,
|
|
4698
|
-
void 0,
|
|
4699
|
-
graderConfig.cwd
|
|
4700
|
-
);
|
|
4701
|
-
const parsed = JSON.parse(stdout);
|
|
4702
|
-
const score = typeof parsed.score === "number" ? parsed.score : 0;
|
|
4703
|
-
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
|
|
4704
|
-
const result = {
|
|
4705
|
-
name: graderName,
|
|
4706
|
-
type: "code-grader",
|
|
4707
|
-
score,
|
|
4708
|
-
weight: graderConfig.weight ?? 1,
|
|
4709
|
-
assertions,
|
|
4710
|
-
details: parsed.details ?? {}
|
|
4711
|
-
};
|
|
4712
|
-
await writeFile3(
|
|
4713
|
-
join2(resultsDir, `${graderName}.json`),
|
|
4714
|
-
`${JSON.stringify(result, null, 2)}
|
|
4715
|
-
`,
|
|
4716
|
-
"utf8"
|
|
4717
|
-
);
|
|
4718
|
-
totalGraders++;
|
|
4719
|
-
if (score >= 0.5) totalPassed++;
|
|
4720
|
-
} catch (error) {
|
|
4721
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
4722
|
-
console.error(` ${testId}/${graderName}: ERROR \u2014 ${message}`);
|
|
4723
|
-
const errorResult = {
|
|
4724
|
-
name: graderName,
|
|
4725
|
-
type: "code-grader",
|
|
4726
|
-
score: 0,
|
|
4727
|
-
weight: graderConfig.weight ?? 1,
|
|
4728
|
-
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
4729
|
-
details: { error: message }
|
|
4730
|
-
};
|
|
4731
|
-
await writeFile3(
|
|
4732
|
-
join2(resultsDir, `${graderName}.json`),
|
|
4733
|
-
`${JSON.stringify(errorResult, null, 2)}
|
|
4734
|
-
`,
|
|
4735
|
-
"utf8"
|
|
4736
|
-
);
|
|
4737
|
-
totalGraders++;
|
|
4738
|
-
}
|
|
4780
|
+
tasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
|
|
4739
4781
|
}
|
|
4740
4782
|
}
|
|
4783
|
+
const { totalGraders, totalPassed } = await runCodeGraders(tasks, maxWorkers);
|
|
4741
4784
|
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
|
|
4742
4785
|
}
|
|
4743
4786
|
});
|
|
@@ -5156,8 +5199,7 @@ Done. Results in ${outDir}`);
|
|
|
5156
5199
|
);
|
|
5157
5200
|
return;
|
|
5158
5201
|
}
|
|
5159
|
-
let totalGraders = 0;
|
5160
|
-
let totalPassed = 0;
|
|
5202
|
+
const graderTasks = [];
|
|
5161
5203
|
for (const testId of testIds) {
|
|
5162
5204
|
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
5163
5205
|
const testDir = join4(outDir, ...subpath);
|
|
@@ -5174,82 +5216,11 @@ Done. Results in ${outDir}`);
|
|
|
5174
5216
|
const responseText = await readFile4(join4(testDir, "response.md"), "utf8");
|
|
5175
5217
|
const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
|
|
5176
5218
|
for (const graderFile of graderFiles) {
|
|
5177
|
-
|
|
5178
|
-
const graderName = graderConfig.name;
|
|
5179
|
-
const inputText = extractInputText2(inputData.input);
|
|
5180
|
-
const payload = JSON.stringify({
|
|
5181
|
-
output: [{ role: "assistant", content: responseText }],
|
|
5182
|
-
input: inputData.input,
|
|
5183
|
-
criteria: "",
|
|
5184
|
-
expected_output: [],
|
|
5185
|
-
input_files: inputData.input_files ?? [],
|
|
5186
|
-
trace: null,
|
|
5187
|
-
token_usage: null,
|
|
5188
|
-
cost_usd: null,
|
|
5189
|
-
duration_ms: null,
|
|
5190
|
-
start_time: null,
|
|
5191
|
-
end_time: null,
|
|
5192
|
-
file_changes: null,
|
|
5193
|
-
workspace_path: null,
|
|
5194
|
-
config: graderConfig.config ?? null,
|
|
5195
|
-
metadata: inputData.metadata ?? {},
|
|
5196
|
-
input_text: inputText,
|
|
5197
|
-
output_text: responseText,
|
|
5198
|
-
expected_output_text: ""
|
|
5199
|
-
});
|
|
5200
|
-
try {
|
|
5201
|
-
const stdout = await executeScript(
|
|
5202
|
-
graderConfig.command,
|
|
5203
|
-
payload,
|
|
5204
|
-
void 0,
|
|
5205
|
-
graderConfig.cwd
|
|
5206
|
-
);
|
|
5207
|
-
const parsed = JSON.parse(stdout);
|
|
5208
|
-
const score = typeof parsed.score === "number" ? parsed.score : 0;
|
|
5209
|
-
const assertions = Array.isArray(parsed.assertions) ? parsed.assertions : [];
|
|
5210
|
-
await writeFile5(
|
|
5211
|
-
join4(resultsDir, `${graderName}.json`),
|
|
5212
|
-
`${JSON.stringify(
|
|
5213
|
-
{
|
|
5214
|
-
name: graderName,
|
|
5215
|
-
type: "code-grader",
|
|
5216
|
-
score,
|
|
5217
|
-
weight: graderConfig.weight ?? 1,
|
|
5218
|
-
assertions,
|
|
5219
|
-
details: parsed.details ?? {}
|
|
5220
|
-
},
|
|
5221
|
-
null,
|
|
5222
|
-
2
|
|
5223
|
-
)}
|
|
5224
|
-
`,
|
|
5225
|
-
"utf8"
|
|
5226
|
-
);
|
|
5227
|
-
totalGraders++;
|
|
5228
|
-
if (score >= 0.5) totalPassed++;
|
|
5229
|
-
} catch (error) {
|
|
5230
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
5231
|
-
console.error(` ${testId}/${graderName}: ERROR \u2014 ${message}`);
|
|
5232
|
-
await writeFile5(
|
|
5233
|
-
join4(resultsDir, `${graderName}.json`),
|
|
5234
|
-
`${JSON.stringify(
|
|
5235
|
-
{
|
|
5236
|
-
name: graderName,
|
|
5237
|
-
type: "code-grader",
|
|
5238
|
-
score: 0,
|
|
5239
|
-
weight: graderConfig.weight ?? 1,
|
|
5240
|
-
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
5241
|
-
details: { error: message }
|
|
5242
|
-
},
|
|
5243
|
-
null,
|
|
5244
|
-
2
|
|
5245
|
-
)}
|
|
5246
|
-
`,
|
|
5247
|
-
"utf8"
|
|
5248
|
-
);
|
|
5249
|
-
totalGraders++;
|
|
5250
|
-
}
|
|
5219
|
+
graderTasks.push({ testId, testDir, resultsDir, graderFile, responseText, inputData });
|
|
5251
5220
|
}
|
|
5252
5221
|
}
|
|
5222
|
+
const graderConcurrency = workers ?? 10;
|
|
5223
|
+
const { totalGraders, totalPassed } = await runCodeGraders(graderTasks, graderConcurrency);
|
|
5253
5224
|
console.log(`Graded ${totalGraders} code-grader(s): ${totalPassed} passed`);
|
|
5254
5225
|
console.log(`
|
|
5255
5226
|
Done. Agent can now perform LLM grading on responses in ${outDir}`);
|
|
@@ -6743,8 +6714,8 @@ function resolveStudioDistDir() {
|
|
|
6743
6714
|
path9.resolve(currentDir, "../../../../studio/dist"),
|
|
6744
6715
|
// From dist/ → sibling apps/studio/dist (monorepo dev)
|
|
6745
6716
|
path9.resolve(currentDir, "../../studio/dist"),
|
|
6746
|
-
// Bundled inside CLI dist (published package)
|
|
6747
|
-
path9.resolve(currentDir, "
|
|
6717
|
+
// Bundled inside CLI dist (published package: dist/studio/)
|
|
6718
|
+
path9.resolve(currentDir, "studio"),
|
|
6748
6719
|
// From dist/ in monorepo root context
|
|
6749
6720
|
path9.resolve(currentDir, "../../../apps/studio/dist")
|
|
6750
6721
|
];
|
|
@@ -8359,4 +8330,4 @@ export {
|
|
|
8359
8330
|
preprocessArgv,
|
|
8360
8331
|
runCli
|
|
8361
8332
|
};
|
|
8362
|
-
//# sourceMappingURL=chunk-TDY2FQN5.js.map
|
|
8333
|
+
//# sourceMappingURL=chunk-UXSQQHCI.js.map
|