agentv 1.2.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +439 -441
- package/dist/{chunk-IVIT4U6S.js → chunk-3RYQPI4H.js} +709 -465
- package/dist/chunk-3RYQPI4H.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.agentv/.env.template +23 -23
- package/dist/templates/.agentv/config.yaml +15 -15
- package/dist/templates/.agentv/targets.yaml +71 -73
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +212 -174
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +318 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +215 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +216 -213
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +340 -247
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +139 -139
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +198 -179
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +77 -77
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +4 -4
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +3 -3
- package/package.json +3 -6
- package/dist/chunk-IVIT4U6S.js.map +0 -1
|
@@ -141,30 +141,14 @@ var require_dist = __commonJS({
|
|
|
141
141
|
});
|
|
142
142
|
|
|
143
143
|
// src/index.ts
|
|
144
|
-
import { readFileSync as
|
|
144
|
+
import { readFileSync as readFileSync3 } from "node:fs";
|
|
145
145
|
import { binary, run, subcommands as subcommands2 } from "cmd-ts";
|
|
146
146
|
|
|
147
|
-
// src/commands/
|
|
148
|
-
import {
|
|
149
|
-
import
|
|
150
|
-
import {
|
|
151
|
-
command,
|
|
152
|
-
flag,
|
|
153
|
-
number as number4,
|
|
154
|
-
option,
|
|
155
|
-
optional as optional2,
|
|
156
|
-
restPositionals,
|
|
157
|
-
string as string4
|
|
158
|
-
} from "cmd-ts";
|
|
159
|
-
import fg from "fast-glob";
|
|
160
|
-
|
|
161
|
-
// src/commands/eval/run-eval.ts
|
|
162
|
-
import { constants as constants6 } from "node:fs";
|
|
163
|
-
import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
164
|
-
import path19 from "node:path";
|
|
165
|
-
import { pathToFileURL } from "node:url";
|
|
147
|
+
// src/commands/convert/index.ts
|
|
148
|
+
import { readFileSync, writeFileSync } from "node:fs";
|
|
149
|
+
import path14 from "node:path";
|
|
166
150
|
|
|
167
|
-
// ../../packages/core/dist/chunk-
|
|
151
|
+
// ../../packages/core/dist/chunk-KPHTMTZ3.js
|
|
168
152
|
import { constants } from "node:fs";
|
|
169
153
|
import { access, readFile } from "node:fs/promises";
|
|
170
154
|
import path from "node:path";
|
|
@@ -648,8 +632,8 @@ function getErrorMap() {
|
|
|
648
632
|
|
|
649
633
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/helpers/parseUtil.js
|
|
650
634
|
var makeIssue = (params) => {
|
|
651
|
-
const { data, path:
|
|
652
|
-
const fullPath = [...
|
|
635
|
+
const { data, path: path28, errorMaps, issueData } = params;
|
|
636
|
+
const fullPath = [...path28, ...issueData.path || []];
|
|
653
637
|
const fullIssue = {
|
|
654
638
|
...issueData,
|
|
655
639
|
path: fullPath
|
|
@@ -765,11 +749,11 @@ var errorUtil;
|
|
|
765
749
|
|
|
766
750
|
// ../../node_modules/.bun/zod@3.25.76/node_modules/zod/v3/types.js
|
|
767
751
|
var ParseInputLazyPath = class {
|
|
768
|
-
constructor(parent, value,
|
|
752
|
+
constructor(parent, value, path28, key2) {
|
|
769
753
|
this._cachedPath = [];
|
|
770
754
|
this.parent = parent;
|
|
771
755
|
this.data = value;
|
|
772
|
-
this._path =
|
|
756
|
+
this._path = path28;
|
|
773
757
|
this._key = key2;
|
|
774
758
|
}
|
|
775
759
|
get path() {
|
|
@@ -1049,8 +1033,8 @@ var ZodType = class {
|
|
|
1049
1033
|
promise() {
|
|
1050
1034
|
return ZodPromise.create(this, this._def);
|
|
1051
1035
|
}
|
|
1052
|
-
or(
|
|
1053
|
-
return ZodUnion.create([this,
|
|
1036
|
+
or(option5) {
|
|
1037
|
+
return ZodUnion.create([this, option5], this._def);
|
|
1054
1038
|
}
|
|
1055
1039
|
and(incoming) {
|
|
1056
1040
|
return ZodIntersection.create(this, incoming, this._def);
|
|
@@ -2900,7 +2884,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2900
2884
|
return INVALID;
|
|
2901
2885
|
}
|
|
2902
2886
|
if (ctx.common.async) {
|
|
2903
|
-
return Promise.all(options.map(async (
|
|
2887
|
+
return Promise.all(options.map(async (option5) => {
|
|
2904
2888
|
const childCtx = {
|
|
2905
2889
|
...ctx,
|
|
2906
2890
|
common: {
|
|
@@ -2910,7 +2894,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2910
2894
|
parent: null
|
|
2911
2895
|
};
|
|
2912
2896
|
return {
|
|
2913
|
-
result: await
|
|
2897
|
+
result: await option5._parseAsync({
|
|
2914
2898
|
data: ctx.data,
|
|
2915
2899
|
path: ctx.path,
|
|
2916
2900
|
parent: childCtx
|
|
@@ -2921,7 +2905,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2921
2905
|
} else {
|
|
2922
2906
|
let dirty = void 0;
|
|
2923
2907
|
const issues = [];
|
|
2924
|
-
for (const
|
|
2908
|
+
for (const option5 of options) {
|
|
2925
2909
|
const childCtx = {
|
|
2926
2910
|
...ctx,
|
|
2927
2911
|
common: {
|
|
@@ -2930,7 +2914,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2930
2914
|
},
|
|
2931
2915
|
parent: null
|
|
2932
2916
|
};
|
|
2933
|
-
const result =
|
|
2917
|
+
const result = option5._parseSync({
|
|
2934
2918
|
data: ctx.data,
|
|
2935
2919
|
path: ctx.path,
|
|
2936
2920
|
parent: childCtx
|
|
@@ -3011,8 +2995,8 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
|
|
|
3011
2995
|
}
|
|
3012
2996
|
const discriminator = this.discriminator;
|
|
3013
2997
|
const discriminatorValue = ctx.data[discriminator];
|
|
3014
|
-
const
|
|
3015
|
-
if (!
|
|
2998
|
+
const option5 = this.optionsMap.get(discriminatorValue);
|
|
2999
|
+
if (!option5) {
|
|
3016
3000
|
addIssueToContext(ctx, {
|
|
3017
3001
|
code: ZodIssueCode.invalid_union_discriminator,
|
|
3018
3002
|
options: Array.from(this.optionsMap.keys()),
|
|
@@ -3021,13 +3005,13 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
|
|
|
3021
3005
|
return INVALID;
|
|
3022
3006
|
}
|
|
3023
3007
|
if (ctx.common.async) {
|
|
3024
|
-
return
|
|
3008
|
+
return option5._parseAsync({
|
|
3025
3009
|
data: ctx.data,
|
|
3026
3010
|
path: ctx.path,
|
|
3027
3011
|
parent: ctx
|
|
3028
3012
|
});
|
|
3029
3013
|
} else {
|
|
3030
|
-
return
|
|
3014
|
+
return option5._parseSync({
|
|
3031
3015
|
data: ctx.data,
|
|
3032
3016
|
path: ctx.path,
|
|
3033
3017
|
parent: ctx
|
|
@@ -4211,7 +4195,7 @@ var coerce = {
|
|
|
4211
4195
|
};
|
|
4212
4196
|
var NEVER = INVALID;
|
|
4213
4197
|
|
|
4214
|
-
// ../../packages/core/dist/chunk-
|
|
4198
|
+
// ../../packages/core/dist/chunk-KPHTMTZ3.js
|
|
4215
4199
|
async function fileExists(filePath) {
|
|
4216
4200
|
try {
|
|
4217
4201
|
await access(filePath, constants.F_OK);
|
|
@@ -4227,10 +4211,6 @@ async function readTextFile(filePath) {
|
|
|
4227
4211
|
const content = await readFile(filePath, "utf8");
|
|
4228
4212
|
return normalizeLineEndings(content);
|
|
4229
4213
|
}
|
|
4230
|
-
async function readJsonFile(filePath) {
|
|
4231
|
-
const content = await readFile(filePath, "utf8");
|
|
4232
|
-
return JSON.parse(content);
|
|
4233
|
-
}
|
|
4234
4214
|
async function findGitRoot(startPath) {
|
|
4235
4215
|
let currentDir = path.dirname(path.resolve(startPath));
|
|
4236
4216
|
const root2 = path.parse(currentDir).root;
|
|
@@ -4574,8 +4554,7 @@ function normalizeCodexLogFormat(value) {
|
|
|
4574
4554
|
}
|
|
4575
4555
|
function resolveMockConfig(target) {
|
|
4576
4556
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
4577
|
-
|
|
4578
|
-
return { response, trace: trace2 };
|
|
4557
|
+
return { response };
|
|
4579
4558
|
}
|
|
4580
4559
|
function resolveVSCodeConfig(target, env, insiders) {
|
|
4581
4560
|
const workspaceTemplateEnvVar = resolveOptionalLiteralString(
|
|
@@ -4595,9 +4574,9 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4595
4574
|
const dryRunSource = target.dry_run ?? target.dryRun;
|
|
4596
4575
|
const subagentRootSource = target.subagent_root ?? target.subagentRoot;
|
|
4597
4576
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
4598
|
-
const
|
|
4577
|
+
const command6 = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
|
|
4599
4578
|
return {
|
|
4600
|
-
command:
|
|
4579
|
+
command: command6,
|
|
4601
4580
|
waitForResponse: resolveOptionalBoolean(waitSource) ?? true,
|
|
4602
4581
|
dryRun: resolveOptionalBoolean(dryRunSource) ?? false,
|
|
4603
4582
|
subagentRoot: resolveOptionalString(subagentRootSource, env, `${target.name} subagent root`, {
|
|
@@ -4612,10 +4591,17 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4612
4591
|
const filesFormat = resolveOptionalLiteralString(
|
|
4613
4592
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
4614
4593
|
);
|
|
4594
|
+
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
4595
|
+
const keepTempFiles = resolveOptionalBoolean(
|
|
4596
|
+
target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
|
|
4597
|
+
);
|
|
4615
4598
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
4616
4599
|
allowLiteral: true,
|
|
4617
4600
|
optionalEnv: true
|
|
4618
4601
|
});
|
|
4602
|
+
if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
|
|
4603
|
+
cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
|
|
4604
|
+
}
|
|
4619
4605
|
if (!cwd && evalFilePath) {
|
|
4620
4606
|
cwd = path2.dirname(path2.resolve(evalFilePath));
|
|
4621
4607
|
}
|
|
@@ -4623,7 +4609,7 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4623
4609
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
4624
4610
|
`${target.name} timeout`
|
|
4625
4611
|
);
|
|
4626
|
-
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
4612
|
+
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
|
|
4627
4613
|
const commandTemplate = resolveString(
|
|
4628
4614
|
commandTemplateSource,
|
|
4629
4615
|
env,
|
|
@@ -4636,7 +4622,9 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4636
4622
|
filesFormat,
|
|
4637
4623
|
cwd,
|
|
4638
4624
|
timeoutMs,
|
|
4639
|
-
healthcheck
|
|
4625
|
+
healthcheck,
|
|
4626
|
+
verbose,
|
|
4627
|
+
keepTempFiles
|
|
4640
4628
|
};
|
|
4641
4629
|
}
|
|
4642
4630
|
function resolveTimeoutMs(source2, description) {
|
|
@@ -4649,7 +4637,7 @@ function resolveTimeoutMs(source2, description) {
|
|
|
4649
4637
|
}
|
|
4650
4638
|
return Math.floor(seconds * 1e3);
|
|
4651
4639
|
}
|
|
4652
|
-
function resolveCliHealthcheck(source2, env, targetName) {
|
|
4640
|
+
function resolveCliHealthcheck(source2, env, targetName, evalFilePath) {
|
|
4653
4641
|
if (source2 === void 0 || source2 === null) {
|
|
4654
4642
|
return void 0;
|
|
4655
4643
|
}
|
|
@@ -4682,11 +4670,12 @@ function resolveCliHealthcheck(source2, env, targetName) {
|
|
|
4682
4670
|
allowLiteral: true,
|
|
4683
4671
|
optionalEnv: true
|
|
4684
4672
|
});
|
|
4673
|
+
const resolvedCwd = cwd && evalFilePath && !path2.isAbsolute(cwd) ? path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd) : cwd;
|
|
4685
4674
|
return {
|
|
4686
4675
|
type: "command",
|
|
4687
4676
|
commandTemplate,
|
|
4688
4677
|
timeoutMs,
|
|
4689
|
-
cwd
|
|
4678
|
+
cwd: resolvedCwd
|
|
4690
4679
|
};
|
|
4691
4680
|
}
|
|
4692
4681
|
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
@@ -4885,6 +4874,21 @@ var PROVIDER_ALIASES = [
|
|
|
4885
4874
|
"vertex"
|
|
4886
4875
|
// legacy/future support
|
|
4887
4876
|
];
|
|
4877
|
+
function extractLastAssistantContent(messages) {
|
|
4878
|
+
if (!messages || messages.length === 0) {
|
|
4879
|
+
return "";
|
|
4880
|
+
}
|
|
4881
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
4882
|
+
const msg = messages[i];
|
|
4883
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
4884
|
+
if (typeof msg.content === "string") {
|
|
4885
|
+
return msg.content;
|
|
4886
|
+
}
|
|
4887
|
+
return JSON.stringify(msg.content);
|
|
4888
|
+
}
|
|
4889
|
+
}
|
|
4890
|
+
return "";
|
|
4891
|
+
}
|
|
4888
4892
|
function isAgentProvider(provider) {
|
|
4889
4893
|
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
4890
4894
|
}
|
|
@@ -5995,10 +5999,10 @@ function assignProp(target, prop, value) {
|
|
|
5995
5999
|
configurable: true
|
|
5996
6000
|
});
|
|
5997
6001
|
}
|
|
5998
|
-
function getElementAtPath(obj,
|
|
5999
|
-
if (!
|
|
6002
|
+
function getElementAtPath(obj, path28) {
|
|
6003
|
+
if (!path28)
|
|
6000
6004
|
return obj;
|
|
6001
|
-
return
|
|
6005
|
+
return path28.reduce((acc, key2) => acc?.[key2], obj);
|
|
6002
6006
|
}
|
|
6003
6007
|
function promiseAllObject(promisesObj) {
|
|
6004
6008
|
const keys = Object.keys(promisesObj);
|
|
@@ -6318,11 +6322,11 @@ function aborted(x, startIndex = 0) {
|
|
|
6318
6322
|
}
|
|
6319
6323
|
return false;
|
|
6320
6324
|
}
|
|
6321
|
-
function prefixIssues(
|
|
6325
|
+
function prefixIssues(path28, issues) {
|
|
6322
6326
|
return issues.map((iss) => {
|
|
6323
6327
|
var _a17;
|
|
6324
6328
|
(_a17 = iss).path ?? (_a17.path = []);
|
|
6325
|
-
iss.path.unshift(
|
|
6329
|
+
iss.path.unshift(path28);
|
|
6326
6330
|
return iss;
|
|
6327
6331
|
});
|
|
6328
6332
|
}
|
|
@@ -6459,7 +6463,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6459
6463
|
return issue2.message;
|
|
6460
6464
|
};
|
|
6461
6465
|
const result = { errors: [] };
|
|
6462
|
-
const processError = (error41,
|
|
6466
|
+
const processError = (error41, path28 = []) => {
|
|
6463
6467
|
var _a17, _b8;
|
|
6464
6468
|
for (const issue2 of error41.issues) {
|
|
6465
6469
|
if (issue2.code === "invalid_union" && issue2.errors.length) {
|
|
@@ -6469,7 +6473,7 @@ function treeifyError(error40, _mapper) {
|
|
|
6469
6473
|
} else if (issue2.code === "invalid_element") {
|
|
6470
6474
|
processError({ issues: issue2.issues }, issue2.path);
|
|
6471
6475
|
} else {
|
|
6472
|
-
const fullpath = [...
|
|
6476
|
+
const fullpath = [...path28, ...issue2.path];
|
|
6473
6477
|
if (fullpath.length === 0) {
|
|
6474
6478
|
result.errors.push(mapper(issue2));
|
|
6475
6479
|
continue;
|
|
@@ -6499,9 +6503,9 @@ function treeifyError(error40, _mapper) {
|
|
|
6499
6503
|
processError(error40);
|
|
6500
6504
|
return result;
|
|
6501
6505
|
}
|
|
6502
|
-
function toDotPath(
|
|
6506
|
+
function toDotPath(path28) {
|
|
6503
6507
|
const segs = [];
|
|
6504
|
-
for (const seg of
|
|
6508
|
+
for (const seg of path28) {
|
|
6505
6509
|
if (typeof seg === "number")
|
|
6506
6510
|
segs.push(`[${seg}]`);
|
|
6507
6511
|
else if (typeof seg === "symbol")
|
|
@@ -8100,7 +8104,7 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
|
|
|
8100
8104
|
defineLazy(inst._zod, "optout", () => def.options.some((o) => o._zod.optout === "optional") ? "optional" : void 0);
|
|
8101
8105
|
defineLazy(inst._zod, "values", () => {
|
|
8102
8106
|
if (def.options.every((o) => o._zod.values)) {
|
|
8103
|
-
return new Set(def.options.flatMap((
|
|
8107
|
+
return new Set(def.options.flatMap((option5) => Array.from(option5._zod.values)));
|
|
8104
8108
|
}
|
|
8105
8109
|
return void 0;
|
|
8106
8110
|
});
|
|
@@ -8114,8 +8118,8 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
|
|
|
8114
8118
|
inst._zod.parse = (payload, ctx) => {
|
|
8115
8119
|
let async = false;
|
|
8116
8120
|
const results = [];
|
|
8117
|
-
for (const
|
|
8118
|
-
const result =
|
|
8121
|
+
for (const option5 of def.options) {
|
|
8122
|
+
const result = option5._zod.run({
|
|
8119
8123
|
value: payload.value,
|
|
8120
8124
|
issues: []
|
|
8121
8125
|
}, ctx);
|
|
@@ -8140,10 +8144,10 @@ var $ZodDiscriminatedUnion = /* @__PURE__ */ $constructor("$ZodDiscriminatedUnio
|
|
|
8140
8144
|
const _super = inst._zod.parse;
|
|
8141
8145
|
defineLazy(inst._zod, "propValues", () => {
|
|
8142
8146
|
const propValues = {};
|
|
8143
|
-
for (const
|
|
8144
|
-
const pv =
|
|
8147
|
+
for (const option5 of def.options) {
|
|
8148
|
+
const pv = option5._zod.propValues;
|
|
8145
8149
|
if (!pv || Object.keys(pv).length === 0)
|
|
8146
|
-
throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(
|
|
8150
|
+
throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(option5)}"`);
|
|
8147
8151
|
for (const [k, v] of Object.entries(pv)) {
|
|
8148
8152
|
if (!propValues[k])
|
|
8149
8153
|
propValues[k] = /* @__PURE__ */ new Set();
|
|
@@ -15347,8 +15351,8 @@ function isTransforming(_schema, _ctx) {
|
|
|
15347
15351
|
return false;
|
|
15348
15352
|
}
|
|
15349
15353
|
case "union": {
|
|
15350
|
-
for (const
|
|
15351
|
-
if (isTransforming(
|
|
15354
|
+
for (const option5 of def.options) {
|
|
15355
|
+
if (isTransforming(option5, ctx))
|
|
15352
15356
|
return true;
|
|
15353
15357
|
}
|
|
15354
15358
|
return false;
|
|
@@ -26054,14 +26058,14 @@ function createAzure(options = {}) {
|
|
|
26054
26058
|
description: "Azure OpenAI resource name"
|
|
26055
26059
|
});
|
|
26056
26060
|
const apiVersion = (_a17 = options.apiVersion) != null ? _a17 : "v1";
|
|
26057
|
-
const url2 = ({ path:
|
|
26061
|
+
const url2 = ({ path: path28, modelId }) => {
|
|
26058
26062
|
var _a24;
|
|
26059
26063
|
const baseUrlPrefix = (_a24 = options.baseURL) != null ? _a24 : `https://${getResourceName()}.openai.azure.com/openai`;
|
|
26060
26064
|
let fullUrl;
|
|
26061
26065
|
if (options.useDeploymentBasedUrls) {
|
|
26062
|
-
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${
|
|
26066
|
+
fullUrl = new URL(`${baseUrlPrefix}/deployments/${modelId}${path28}`);
|
|
26063
26067
|
} else {
|
|
26064
|
-
fullUrl = new URL(`${baseUrlPrefix}/v1${
|
|
26068
|
+
fullUrl = new URL(`${baseUrlPrefix}/v1${path28}`);
|
|
26065
26069
|
}
|
|
26066
26070
|
fullUrl.searchParams.set("api-version", apiVersion);
|
|
26067
26071
|
return fullUrl.toString();
|
|
@@ -34589,33 +34593,22 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
|
34589
34593
|
function isEvaluatorKind(value) {
|
|
34590
34594
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
34591
34595
|
}
|
|
34592
|
-
function
|
|
34593
|
-
return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
|
|
34594
|
-
}
|
|
34595
|
-
function isTraceEvent(value) {
|
|
34596
|
-
if (typeof value !== "object" || value === null) {
|
|
34597
|
-
return false;
|
|
34598
|
-
}
|
|
34599
|
-
const candidate = value;
|
|
34600
|
-
return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
|
|
34601
|
-
}
|
|
34602
|
-
function computeTraceSummary(trace2) {
|
|
34596
|
+
function computeTraceSummary(messages) {
|
|
34603
34597
|
const toolCallCounts = {};
|
|
34604
|
-
let
|
|
34605
|
-
for (const
|
|
34606
|
-
if (
|
|
34607
|
-
|
|
34608
|
-
|
|
34609
|
-
|
|
34610
|
-
errorCount++;
|
|
34598
|
+
let totalToolCalls = 0;
|
|
34599
|
+
for (const message of messages) {
|
|
34600
|
+
if (!message.toolCalls) continue;
|
|
34601
|
+
for (const toolCall of message.toolCalls) {
|
|
34602
|
+
toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
|
|
34603
|
+
totalToolCalls++;
|
|
34611
34604
|
}
|
|
34612
34605
|
}
|
|
34613
34606
|
const toolNames = Object.keys(toolCallCounts).sort();
|
|
34614
34607
|
return {
|
|
34615
|
-
eventCount:
|
|
34608
|
+
eventCount: totalToolCalls,
|
|
34616
34609
|
toolNames,
|
|
34617
34610
|
toolCallsByName: toolCallCounts,
|
|
34618
|
-
errorCount
|
|
34611
|
+
errorCount: 0
|
|
34619
34612
|
};
|
|
34620
34613
|
}
|
|
34621
34614
|
function extractCodeBlocks(segments) {
|
|
@@ -34863,7 +34856,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
34863
34856
|
QUESTION: "question",
|
|
34864
34857
|
EXPECTED_OUTCOME: "expected_outcome",
|
|
34865
34858
|
REFERENCE_ANSWER: "reference_answer",
|
|
34866
|
-
INPUT_MESSAGES: "input_messages"
|
|
34859
|
+
INPUT_MESSAGES: "input_messages",
|
|
34860
|
+
OUTPUT_MESSAGES: "output_messages"
|
|
34867
34861
|
};
|
|
34868
34862
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
34869
34863
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
@@ -35253,6 +35247,17 @@ async function processMessages(options) {
|
|
|
35253
35247
|
}
|
|
35254
35248
|
continue;
|
|
35255
35249
|
}
|
|
35250
|
+
if (isJsonObject(content)) {
|
|
35251
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
35252
|
+
segments.push({ type: "text", value: rendered });
|
|
35253
|
+
if (textParts) {
|
|
35254
|
+
textParts.push(rendered);
|
|
35255
|
+
}
|
|
35256
|
+
continue;
|
|
35257
|
+
}
|
|
35258
|
+
if (!Array.isArray(content)) {
|
|
35259
|
+
continue;
|
|
35260
|
+
}
|
|
35256
35261
|
for (const rawSegment of content) {
|
|
35257
35262
|
if (!isJsonObject(rawSegment)) {
|
|
35258
35263
|
continue;
|
|
@@ -35475,6 +35480,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
35475
35480
|
}
|
|
35476
35481
|
}
|
|
35477
35482
|
}
|
|
35483
|
+
} else if (isJsonObject(message.content)) {
|
|
35484
|
+
const rendered = JSON.stringify(message.content, null, 2);
|
|
35485
|
+
if (rendered.trim().length > 0) {
|
|
35486
|
+
messageSegments.push({ type: "text", value: rendered });
|
|
35487
|
+
}
|
|
35478
35488
|
}
|
|
35479
35489
|
segmentsByMessage.push(messageSegments);
|
|
35480
35490
|
}
|
|
@@ -35716,16 +35726,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35716
35726
|
}) : [];
|
|
35717
35727
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
35718
35728
|
let referenceAnswer = "";
|
|
35719
|
-
if (outputSegments.length >
|
|
35720
|
-
|
|
35721
|
-
|
|
35722
|
-
const
|
|
35723
|
-
if (typeof
|
|
35724
|
-
referenceAnswer =
|
|
35725
|
-
} else if (
|
|
35726
|
-
referenceAnswer = JSON.stringify(
|
|
35727
|
-
} else if (
|
|
35728
|
-
referenceAnswer = JSON.stringify(
|
|
35729
|
+
if (outputSegments.length > 0) {
|
|
35730
|
+
const lastMessage = outputSegments[outputSegments.length - 1];
|
|
35731
|
+
const content = lastMessage.content;
|
|
35732
|
+
const toolCalls = lastMessage.tool_calls;
|
|
35733
|
+
if (typeof content === "string") {
|
|
35734
|
+
referenceAnswer = content;
|
|
35735
|
+
} else if (content !== void 0 && content !== null) {
|
|
35736
|
+
referenceAnswer = JSON.stringify(content, null, 2);
|
|
35737
|
+
} else if (toolCalls !== void 0 && toolCalls !== null) {
|
|
35738
|
+
referenceAnswer = JSON.stringify(toolCalls, null, 2);
|
|
35729
35739
|
}
|
|
35730
35740
|
}
|
|
35731
35741
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
@@ -36047,11 +36057,11 @@ async function invokeModel(options) {
|
|
|
36047
36057
|
return mapResponse(result);
|
|
36048
36058
|
}
|
|
36049
36059
|
function mapResponse(result) {
|
|
36060
|
+
const content = result.text ?? "";
|
|
36050
36061
|
return {
|
|
36051
|
-
text: result.text ?? "",
|
|
36052
|
-
reasoning: result.reasoningText ?? void 0,
|
|
36053
36062
|
raw: result,
|
|
36054
|
-
usage: toJsonObject(result.totalUsage ?? result.usage)
|
|
36063
|
+
usage: toJsonObject(result.totalUsage ?? result.usage),
|
|
36064
|
+
outputMessages: [{ role: "assistant", content }]
|
|
36055
36065
|
};
|
|
36056
36066
|
}
|
|
36057
36067
|
function toJsonObject(value) {
|
|
@@ -36158,7 +36168,7 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
36158
36168
|
}
|
|
36159
36169
|
var execAsync2 = promisify2(execWithCallback);
|
|
36160
36170
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
36161
|
-
async function defaultCommandRunner(
|
|
36171
|
+
async function defaultCommandRunner(command6, options) {
|
|
36162
36172
|
const execOptions = {
|
|
36163
36173
|
cwd: options.cwd,
|
|
36164
36174
|
env: options.env,
|
|
@@ -36168,7 +36178,7 @@ async function defaultCommandRunner(command5, options) {
|
|
|
36168
36178
|
shell: process.platform === "win32" ? "powershell.exe" : void 0
|
|
36169
36179
|
};
|
|
36170
36180
|
try {
|
|
36171
|
-
const { stdout, stderr } = await execAsync2(
|
|
36181
|
+
const { stdout, stderr } = await execAsync2(command6, execOptions);
|
|
36172
36182
|
return {
|
|
36173
36183
|
stdout,
|
|
36174
36184
|
stderr,
|
|
@@ -36193,10 +36203,11 @@ var CliProvider = class {
|
|
|
36193
36203
|
id;
|
|
36194
36204
|
kind = "cli";
|
|
36195
36205
|
targetName;
|
|
36196
|
-
supportsBatch =
|
|
36206
|
+
supportsBatch = true;
|
|
36197
36207
|
config;
|
|
36198
36208
|
runCommand;
|
|
36199
36209
|
verbose;
|
|
36210
|
+
keepTempFiles;
|
|
36200
36211
|
healthcheckPromise;
|
|
36201
36212
|
constructor(targetName, config2, runner = defaultCommandRunner) {
|
|
36202
36213
|
this.targetName = targetName;
|
|
@@ -36204,6 +36215,7 @@ var CliProvider = class {
|
|
|
36204
36215
|
this.config = config2;
|
|
36205
36216
|
this.runCommand = runner;
|
|
36206
36217
|
this.verbose = config2.verbose ?? false;
|
|
36218
|
+
this.keepTempFiles = config2.keepTempFiles ?? false;
|
|
36207
36219
|
}
|
|
36208
36220
|
async invoke(request) {
|
|
36209
36221
|
if (request.signal?.aborted) {
|
|
@@ -36213,6 +36225,11 @@ var CliProvider = class {
|
|
|
36213
36225
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
36214
36226
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
36215
36227
|
const renderedCommand = renderTemplate2(this.config.commandTemplate, templateValues);
|
|
36228
|
+
if (this.verbose) {
|
|
36229
|
+
console.log(
|
|
36230
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36231
|
+
);
|
|
36232
|
+
}
|
|
36216
36233
|
const result = await this.runCommand(renderedCommand, {
|
|
36217
36234
|
cwd: this.config.cwd,
|
|
36218
36235
|
env: process.env,
|
|
@@ -36236,8 +36253,7 @@ var CliProvider = class {
|
|
|
36236
36253
|
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
36237
36254
|
const parsed = this.parseOutputContent(responseContent);
|
|
36238
36255
|
return {
|
|
36239
|
-
|
|
36240
|
-
trace: parsed.trace,
|
|
36256
|
+
outputMessages: parsed.outputMessages,
|
|
36241
36257
|
raw: {
|
|
36242
36258
|
command: renderedCommand,
|
|
36243
36259
|
stderr: result.stderr,
|
|
@@ -36247,30 +36263,225 @@ var CliProvider = class {
|
|
|
36247
36263
|
}
|
|
36248
36264
|
};
|
|
36249
36265
|
}
|
|
36266
|
+
async invokeBatch(requests) {
|
|
36267
|
+
if (requests.length === 0) {
|
|
36268
|
+
return [];
|
|
36269
|
+
}
|
|
36270
|
+
for (const request of requests) {
|
|
36271
|
+
if (request.signal?.aborted) {
|
|
36272
|
+
throw new Error("CLI provider batch request was aborted before execution");
|
|
36273
|
+
}
|
|
36274
|
+
}
|
|
36275
|
+
const controller = new AbortController();
|
|
36276
|
+
for (const request of requests) {
|
|
36277
|
+
request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
36278
|
+
}
|
|
36279
|
+
await this.ensureHealthy(controller.signal);
|
|
36280
|
+
const outputFilePath = generateOutputFilePath("batch", ".jsonl");
|
|
36281
|
+
const batchInputFiles = [];
|
|
36282
|
+
for (const request of requests) {
|
|
36283
|
+
if (request.inputFiles && request.inputFiles.length > 0) {
|
|
36284
|
+
batchInputFiles.push(...request.inputFiles);
|
|
36285
|
+
}
|
|
36286
|
+
}
|
|
36287
|
+
const templateValues = buildTemplateValues(
|
|
36288
|
+
{
|
|
36289
|
+
question: "",
|
|
36290
|
+
guidelines: "",
|
|
36291
|
+
inputFiles: batchInputFiles,
|
|
36292
|
+
evalCaseId: "batch",
|
|
36293
|
+
attempt: 0
|
|
36294
|
+
},
|
|
36295
|
+
this.config,
|
|
36296
|
+
outputFilePath
|
|
36297
|
+
);
|
|
36298
|
+
const renderedCommand = renderTemplate2(this.config.commandTemplate, templateValues);
|
|
36299
|
+
if (this.verbose) {
|
|
36300
|
+
console.log(
|
|
36301
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36302
|
+
);
|
|
36303
|
+
}
|
|
36304
|
+
const result = await this.runCommand(renderedCommand, {
|
|
36305
|
+
cwd: this.config.cwd,
|
|
36306
|
+
env: process.env,
|
|
36307
|
+
timeoutMs: this.config.timeoutMs,
|
|
36308
|
+
signal: controller.signal
|
|
36309
|
+
});
|
|
36310
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
36311
|
+
if (controller.signal.aborted) {
|
|
36312
|
+
throw new Error("CLI provider request was aborted");
|
|
36313
|
+
}
|
|
36314
|
+
if (result.timedOut) {
|
|
36315
|
+
throw new Error(
|
|
36316
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
36317
|
+
);
|
|
36318
|
+
}
|
|
36319
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
36320
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
36321
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
36322
|
+
throw new Error(message);
|
|
36323
|
+
}
|
|
36324
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
36325
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
36326
|
+
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
36327
|
+
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
36328
|
+
if (missingIds.length > 0) {
|
|
36329
|
+
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
36330
|
+
}
|
|
36331
|
+
const responses = requests.map((request) => {
|
|
36332
|
+
const evalCaseId = request.evalCaseId;
|
|
36333
|
+
if (!evalCaseId) {
|
|
36334
|
+
return {
|
|
36335
|
+
outputMessages: [],
|
|
36336
|
+
raw: {
|
|
36337
|
+
command: renderedCommand,
|
|
36338
|
+
stderr: result.stderr,
|
|
36339
|
+
exitCode: result.exitCode ?? 0,
|
|
36340
|
+
cwd: this.config.cwd,
|
|
36341
|
+
outputFile: outputFilePath
|
|
36342
|
+
}
|
|
36343
|
+
};
|
|
36344
|
+
}
|
|
36345
|
+
const parsed = recordsById.get(evalCaseId);
|
|
36346
|
+
if (!parsed) {
|
|
36347
|
+
return {
|
|
36348
|
+
outputMessages: [],
|
|
36349
|
+
raw: {
|
|
36350
|
+
command: renderedCommand,
|
|
36351
|
+
stderr: result.stderr,
|
|
36352
|
+
exitCode: result.exitCode ?? 0,
|
|
36353
|
+
cwd: this.config.cwd,
|
|
36354
|
+
outputFile: outputFilePath
|
|
36355
|
+
}
|
|
36356
|
+
};
|
|
36357
|
+
}
|
|
36358
|
+
return {
|
|
36359
|
+
outputMessages: parsed.outputMessages,
|
|
36360
|
+
raw: {
|
|
36361
|
+
command: renderedCommand,
|
|
36362
|
+
stderr: result.stderr,
|
|
36363
|
+
exitCode: result.exitCode ?? 0,
|
|
36364
|
+
cwd: this.config.cwd,
|
|
36365
|
+
outputFile: outputFilePath,
|
|
36366
|
+
recordId: evalCaseId
|
|
36367
|
+
}
|
|
36368
|
+
};
|
|
36369
|
+
});
|
|
36370
|
+
return responses;
|
|
36371
|
+
}
|
|
36250
36372
|
/**
|
|
36251
36373
|
* Parse output content from CLI.
|
|
36252
|
-
* If the content is valid JSON with
|
|
36253
|
-
*
|
|
36374
|
+
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
36375
|
+
* If only 'text' is provided, wrap it in outputMessages.
|
|
36376
|
+
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
36254
36377
|
*/
|
|
36255
36378
|
parseOutputContent(content) {
|
|
36256
36379
|
try {
|
|
36257
36380
|
const parsed = JSON.parse(content);
|
|
36258
|
-
if (typeof parsed === "object" && parsed !== null
|
|
36381
|
+
if (typeof parsed === "object" && parsed !== null) {
|
|
36259
36382
|
const obj = parsed;
|
|
36260
|
-
const
|
|
36261
|
-
|
|
36262
|
-
|
|
36383
|
+
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
36384
|
+
if (outputMessages && outputMessages.length > 0) {
|
|
36385
|
+
return { outputMessages };
|
|
36386
|
+
}
|
|
36387
|
+
if ("text" in obj) {
|
|
36388
|
+
const text2 = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
36389
|
+
return { outputMessages: [{ role: "assistant", content: text2 }] };
|
|
36390
|
+
}
|
|
36263
36391
|
}
|
|
36264
36392
|
} catch {
|
|
36265
36393
|
}
|
|
36266
|
-
return {
|
|
36394
|
+
return { outputMessages: [{ role: "assistant", content }] };
|
|
36267
36395
|
}
|
|
36268
|
-
|
|
36269
|
-
|
|
36396
|
+
/**
|
|
36397
|
+
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
36398
|
+
*/
|
|
36399
|
+
parseOutputMessages(outputMessages) {
|
|
36400
|
+
if (!Array.isArray(outputMessages)) {
|
|
36270
36401
|
return void 0;
|
|
36271
36402
|
}
|
|
36272
|
-
const
|
|
36273
|
-
|
|
36403
|
+
const messages = [];
|
|
36404
|
+
for (const msg of outputMessages) {
|
|
36405
|
+
if (typeof msg !== "object" || msg === null) {
|
|
36406
|
+
continue;
|
|
36407
|
+
}
|
|
36408
|
+
const rawMsg = msg;
|
|
36409
|
+
if (typeof rawMsg.role !== "string") {
|
|
36410
|
+
continue;
|
|
36411
|
+
}
|
|
36412
|
+
const message = {
|
|
36413
|
+
role: rawMsg.role,
|
|
36414
|
+
name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
|
|
36415
|
+
content: rawMsg.content,
|
|
36416
|
+
toolCalls: this.parseToolCalls(rawMsg.tool_calls),
|
|
36417
|
+
timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
|
|
36418
|
+
metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
|
|
36419
|
+
};
|
|
36420
|
+
messages.push(message);
|
|
36421
|
+
}
|
|
36422
|
+
return messages.length > 0 ? messages : void 0;
|
|
36423
|
+
}
|
|
36424
|
+
/**
|
|
36425
|
+
* Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
|
|
36426
|
+
*/
|
|
36427
|
+
parseToolCalls(toolCalls) {
|
|
36428
|
+
if (!Array.isArray(toolCalls)) {
|
|
36429
|
+
return void 0;
|
|
36430
|
+
}
|
|
36431
|
+
const calls = [];
|
|
36432
|
+
for (const call of toolCalls) {
|
|
36433
|
+
if (typeof call !== "object" || call === null) {
|
|
36434
|
+
continue;
|
|
36435
|
+
}
|
|
36436
|
+
const rawCall = call;
|
|
36437
|
+
if (typeof rawCall.tool !== "string") {
|
|
36438
|
+
continue;
|
|
36439
|
+
}
|
|
36440
|
+
calls.push({
|
|
36441
|
+
tool: rawCall.tool,
|
|
36442
|
+
input: rawCall.input,
|
|
36443
|
+
output: rawCall.output,
|
|
36444
|
+
id: typeof rawCall.id === "string" ? rawCall.id : void 0,
|
|
36445
|
+
timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
|
|
36446
|
+
});
|
|
36447
|
+
}
|
|
36448
|
+
return calls.length > 0 ? calls : void 0;
|
|
36449
|
+
}
|
|
36450
|
+
parseJsonlBatchOutput(content) {
|
|
36451
|
+
const records = /* @__PURE__ */ new Map();
|
|
36452
|
+
const lines = content.split(/\r?\n/).map((line2) => line2.trim()).filter((line2) => line2.length > 0);
|
|
36453
|
+
for (const line2 of lines) {
|
|
36454
|
+
let parsed;
|
|
36455
|
+
try {
|
|
36456
|
+
parsed = JSON.parse(line2);
|
|
36457
|
+
} catch (error40) {
|
|
36458
|
+
const reason = error40 instanceof Error ? error40.message : String(error40);
|
|
36459
|
+
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
36460
|
+
}
|
|
36461
|
+
if (typeof parsed !== "object" || parsed === null) {
|
|
36462
|
+
throw new Error("CLI batch output JSONL line must be an object");
|
|
36463
|
+
}
|
|
36464
|
+
const obj = parsed;
|
|
36465
|
+
const id = typeof obj.id === "string" ? obj.id : void 0;
|
|
36466
|
+
if (!id || id.trim().length === 0) {
|
|
36467
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
36468
|
+
}
|
|
36469
|
+
if (records.has(id)) {
|
|
36470
|
+
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
36471
|
+
}
|
|
36472
|
+
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
36473
|
+
let outputMessages;
|
|
36474
|
+
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
36475
|
+
outputMessages = parsedOutputMessages;
|
|
36476
|
+
} else {
|
|
36477
|
+
const text2 = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
36478
|
+
outputMessages = text2 ? [{ role: "assistant", content: text2 }] : [];
|
|
36479
|
+
}
|
|
36480
|
+
records.set(id, {
|
|
36481
|
+
outputMessages
|
|
36482
|
+
});
|
|
36483
|
+
}
|
|
36484
|
+
return records;
|
|
36274
36485
|
}
|
|
36275
36486
|
async readAndCleanupOutputFile(filePath) {
|
|
36276
36487
|
try {
|
|
@@ -36280,8 +36491,10 @@ var CliProvider = class {
|
|
|
36280
36491
|
const errorMsg = error40 instanceof Error ? error40.message : String(error40);
|
|
36281
36492
|
throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
|
|
36282
36493
|
} finally {
|
|
36283
|
-
|
|
36284
|
-
|
|
36494
|
+
if (!this.keepTempFiles) {
|
|
36495
|
+
await fs.unlink(filePath).catch(() => {
|
|
36496
|
+
});
|
|
36497
|
+
}
|
|
36285
36498
|
}
|
|
36286
36499
|
}
|
|
36287
36500
|
async ensureHealthy(signal) {
|
|
@@ -36333,7 +36546,7 @@ var CliProvider = class {
|
|
|
36333
36546
|
);
|
|
36334
36547
|
if (this.verbose) {
|
|
36335
36548
|
console.log(
|
|
36336
|
-
`[cli-provider:${this.targetName}] (healthcheck)
|
|
36549
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36337
36550
|
);
|
|
36338
36551
|
}
|
|
36339
36552
|
const result = await this.runCommand(renderedCommand, {
|
|
@@ -36401,11 +36614,11 @@ function shellEscape(value) {
|
|
|
36401
36614
|
}
|
|
36402
36615
|
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
36403
36616
|
}
|
|
36404
|
-
function generateOutputFilePath(evalCaseId) {
|
|
36617
|
+
function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
36405
36618
|
const safeEvalId = evalCaseId || "unknown";
|
|
36406
36619
|
const timestamp = Date.now();
|
|
36407
36620
|
const random = Math.random().toString(36).substring(2, 9);
|
|
36408
|
-
return path72.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}
|
|
36621
|
+
return path72.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
36409
36622
|
}
|
|
36410
36623
|
function formatTimeoutSuffix(timeoutMs) {
|
|
36411
36624
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -36601,7 +36814,6 @@ var CodexProvider = class {
|
|
|
36601
36814
|
const parsed = parseCodexJson(result.stdout);
|
|
36602
36815
|
const assistantText = extractAssistantText(parsed);
|
|
36603
36816
|
return {
|
|
36604
|
-
text: assistantText,
|
|
36605
36817
|
raw: {
|
|
36606
36818
|
response: parsed,
|
|
36607
36819
|
stdout: result.stdout,
|
|
@@ -36613,7 +36825,8 @@ var CodexProvider = class {
|
|
|
36613
36825
|
workspace: workspaceRoot,
|
|
36614
36826
|
inputFiles,
|
|
36615
36827
|
logFile: logger?.filePath
|
|
36616
|
-
}
|
|
36828
|
+
},
|
|
36829
|
+
outputMessages: [{ role: "assistant", content: assistantText }]
|
|
36617
36830
|
};
|
|
36618
36831
|
} finally {
|
|
36619
36832
|
await logger?.close();
|
|
@@ -37233,7 +37446,6 @@ var MockProvider = class {
|
|
|
37233
37446
|
delayMs;
|
|
37234
37447
|
delayMinMs;
|
|
37235
37448
|
delayMaxMs;
|
|
37236
|
-
trace;
|
|
37237
37449
|
constructor(targetName, config2) {
|
|
37238
37450
|
this.id = `mock:${targetName}`;
|
|
37239
37451
|
this.targetName = targetName;
|
|
@@ -37241,7 +37453,6 @@ var MockProvider = class {
|
|
|
37241
37453
|
this.delayMs = config2.delayMs ?? 0;
|
|
37242
37454
|
this.delayMinMs = config2.delayMinMs ?? 0;
|
|
37243
37455
|
this.delayMaxMs = config2.delayMaxMs ?? 0;
|
|
37244
|
-
this.trace = config2.trace;
|
|
37245
37456
|
}
|
|
37246
37457
|
async invoke(request) {
|
|
37247
37458
|
const delay2 = this.calculateDelay();
|
|
@@ -37249,12 +37460,11 @@ var MockProvider = class {
|
|
|
37249
37460
|
await new Promise((resolve2) => setTimeout(resolve2, delay2));
|
|
37250
37461
|
}
|
|
37251
37462
|
return {
|
|
37252
|
-
|
|
37463
|
+
outputMessages: [{ role: "assistant", content: this.cannedResponse }],
|
|
37253
37464
|
raw: {
|
|
37254
37465
|
question: request.question,
|
|
37255
37466
|
guidelines: request.guidelines
|
|
37256
|
-
}
|
|
37257
|
-
trace: this.trace
|
|
37467
|
+
}
|
|
37258
37468
|
};
|
|
37259
37469
|
}
|
|
37260
37470
|
calculateDelay() {
|
|
@@ -37334,7 +37544,7 @@ var VSCodeProvider = class {
|
|
|
37334
37544
|
}
|
|
37335
37545
|
if (this.config.dryRun) {
|
|
37336
37546
|
return {
|
|
37337
|
-
|
|
37547
|
+
outputMessages: [],
|
|
37338
37548
|
raw: {
|
|
37339
37549
|
session,
|
|
37340
37550
|
inputFiles
|
|
@@ -37343,7 +37553,7 @@ var VSCodeProvider = class {
|
|
|
37343
37553
|
}
|
|
37344
37554
|
const responseText = await readTextFile(session.responseFile);
|
|
37345
37555
|
return {
|
|
37346
|
-
|
|
37556
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
37347
37557
|
raw: {
|
|
37348
37558
|
session,
|
|
37349
37559
|
inputFiles
|
|
@@ -37381,7 +37591,7 @@ var VSCodeProvider = class {
|
|
|
37381
37591
|
}
|
|
37382
37592
|
if (this.config.dryRun) {
|
|
37383
37593
|
return normalizedRequests.map(({ inputFiles }) => ({
|
|
37384
|
-
|
|
37594
|
+
outputMessages: [],
|
|
37385
37595
|
raw: {
|
|
37386
37596
|
session,
|
|
37387
37597
|
inputFiles,
|
|
@@ -37398,7 +37608,7 @@ var VSCodeProvider = class {
|
|
|
37398
37608
|
for (const [index, responseFile] of session.responseFiles.entries()) {
|
|
37399
37609
|
const responseText = await readTextFile(responseFile);
|
|
37400
37610
|
responses.push({
|
|
37401
|
-
|
|
37611
|
+
outputMessages: [{ role: "assistant", content: responseText }],
|
|
37402
37612
|
raw: {
|
|
37403
37613
|
session,
|
|
37404
37614
|
inputFiles: normalizedRequests[index]?.inputFiles,
|
|
@@ -37686,6 +37896,7 @@ var LlmJudgeEvaluator = class {
|
|
|
37686
37896
|
null,
|
|
37687
37897
|
2
|
|
37688
37898
|
),
|
|
37899
|
+
[TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
|
|
37689
37900
|
[TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
|
|
37690
37901
|
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
37691
37902
|
[TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
|
|
@@ -37710,7 +37921,7 @@ var LlmJudgeEvaluator = class {
|
|
|
37710
37921
|
const score = clampScore(data.score);
|
|
37711
37922
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
37712
37923
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
37713
|
-
const reasoning = data.reasoning
|
|
37924
|
+
const reasoning = data.reasoning;
|
|
37714
37925
|
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
37715
37926
|
return {
|
|
37716
37927
|
score,
|
|
@@ -37812,7 +38023,9 @@ var LlmJudgeEvaluator = class {
|
|
|
37812
38023
|
maxOutputTokens: this.maxOutputTokens,
|
|
37813
38024
|
temperature: this.temperature
|
|
37814
38025
|
});
|
|
37815
|
-
const data = schema.parse(
|
|
38026
|
+
const data = schema.parse(
|
|
38027
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
38028
|
+
);
|
|
37816
38029
|
return { data, providerResponse: response };
|
|
37817
38030
|
} catch (e) {
|
|
37818
38031
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -37895,15 +38108,16 @@ var CodeEvaluator = class {
|
|
|
37895
38108
|
{
|
|
37896
38109
|
question: context.evalCase.question,
|
|
37897
38110
|
expected_outcome: context.evalCase.expected_outcome,
|
|
38111
|
+
expected_messages: context.evalCase.expected_messages,
|
|
37898
38112
|
reference_answer: context.evalCase.reference_answer,
|
|
37899
38113
|
candidate_answer: context.candidate,
|
|
38114
|
+
output_messages: context.outputMessages ?? null,
|
|
37900
38115
|
guideline_files: context.evalCase.guideline_paths,
|
|
37901
38116
|
input_files: context.evalCase.file_paths.filter(
|
|
37902
38117
|
(path132) => !context.evalCase.guideline_paths.includes(path132)
|
|
37903
38118
|
),
|
|
37904
38119
|
input_messages: context.evalCase.input_messages,
|
|
37905
|
-
|
|
37906
|
-
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
38120
|
+
candidate_trace_summary: context.traceSummary ?? null
|
|
37907
38121
|
},
|
|
37908
38122
|
null,
|
|
37909
38123
|
2
|
|
@@ -38030,8 +38244,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38030
38244
|
this.config = options.config;
|
|
38031
38245
|
}
|
|
38032
38246
|
evaluate(context) {
|
|
38033
|
-
const {
|
|
38034
|
-
|
|
38247
|
+
const { outputMessages, traceSummary } = context;
|
|
38248
|
+
const toolCalls = this.extractToolCallsFromMessages(outputMessages);
|
|
38249
|
+
if (toolCalls.length === 0 && !traceSummary) {
|
|
38250
|
+
return {
|
|
38251
|
+
score: 0,
|
|
38252
|
+
verdict: "fail",
|
|
38253
|
+
hits: [],
|
|
38254
|
+
misses: ["No trace available for evaluation"],
|
|
38255
|
+
expectedAspectCount: 1
|
|
38256
|
+
};
|
|
38257
|
+
}
|
|
38258
|
+
const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
|
|
38259
|
+
if (!summary) {
|
|
38035
38260
|
return {
|
|
38036
38261
|
score: 0,
|
|
38037
38262
|
verdict: "fail",
|
|
@@ -38042,11 +38267,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38042
38267
|
}
|
|
38043
38268
|
switch (this.config.mode) {
|
|
38044
38269
|
case "any_order":
|
|
38045
|
-
return this.evaluateAnyOrder(
|
|
38270
|
+
return this.evaluateAnyOrder(summary);
|
|
38046
38271
|
case "in_order":
|
|
38047
|
-
return this.evaluateInOrder(
|
|
38272
|
+
return this.evaluateInOrder(toolCalls);
|
|
38048
38273
|
case "exact":
|
|
38049
|
-
return this.evaluateExact(
|
|
38274
|
+
return this.evaluateExact(toolCalls);
|
|
38050
38275
|
default:
|
|
38051
38276
|
return {
|
|
38052
38277
|
score: 0,
|
|
@@ -38057,6 +38282,39 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38057
38282
|
};
|
|
38058
38283
|
}
|
|
38059
38284
|
}
|
|
38285
|
+
/**
|
|
38286
|
+
* Extract tool calls from output messages.
|
|
38287
|
+
*/
|
|
38288
|
+
extractToolCallsFromMessages(messages) {
|
|
38289
|
+
if (!messages) {
|
|
38290
|
+
return [];
|
|
38291
|
+
}
|
|
38292
|
+
const toolCalls = [];
|
|
38293
|
+
for (const message of messages) {
|
|
38294
|
+
if (message.toolCalls) {
|
|
38295
|
+
for (const call of message.toolCalls) {
|
|
38296
|
+
toolCalls.push({ name: call.tool });
|
|
38297
|
+
}
|
|
38298
|
+
}
|
|
38299
|
+
}
|
|
38300
|
+
return toolCalls;
|
|
38301
|
+
}
|
|
38302
|
+
/**
|
|
38303
|
+
* Build a summary from extracted tool calls.
|
|
38304
|
+
*/
|
|
38305
|
+
buildSummary(toolCalls) {
|
|
38306
|
+
const toolCallsByName = {};
|
|
38307
|
+
for (const call of toolCalls) {
|
|
38308
|
+
toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
|
|
38309
|
+
}
|
|
38310
|
+
const toolNames = Object.keys(toolCallsByName).sort();
|
|
38311
|
+
return {
|
|
38312
|
+
eventCount: toolCalls.length,
|
|
38313
|
+
toolNames,
|
|
38314
|
+
toolCallsByName,
|
|
38315
|
+
errorCount: 0
|
|
38316
|
+
};
|
|
38317
|
+
}
|
|
38060
38318
|
evaluateAnyOrder(summary) {
|
|
38061
38319
|
const minimums = this.config.minimums ?? {};
|
|
38062
38320
|
const toolNames = Object.keys(minimums);
|
|
@@ -38089,7 +38347,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38089
38347
|
expectedAspectCount: toolNames.length
|
|
38090
38348
|
};
|
|
38091
38349
|
}
|
|
38092
|
-
evaluateInOrder(
|
|
38350
|
+
evaluateInOrder(toolCalls) {
|
|
38093
38351
|
const expected = this.config.expected ?? [];
|
|
38094
38352
|
if (expected.length === 0) {
|
|
38095
38353
|
return {
|
|
@@ -38100,15 +38358,14 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38100
38358
|
expectedAspectCount: 0
|
|
38101
38359
|
};
|
|
38102
38360
|
}
|
|
38103
|
-
const actualToolCalls = trace2.filter((e) => e.type === "tool_call" && e.name);
|
|
38104
38361
|
const hits = [];
|
|
38105
38362
|
const misses = [];
|
|
38106
38363
|
let actualIndex = 0;
|
|
38107
38364
|
for (let i = 0; i < expected.length; i++) {
|
|
38108
38365
|
const expectedTool = expected[i].tool;
|
|
38109
38366
|
let found = false;
|
|
38110
|
-
while (actualIndex <
|
|
38111
|
-
if (
|
|
38367
|
+
while (actualIndex < toolCalls.length) {
|
|
38368
|
+
if (toolCalls[actualIndex].name === expectedTool) {
|
|
38112
38369
|
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
38113
38370
|
actualIndex++;
|
|
38114
38371
|
found = true;
|
|
@@ -38129,7 +38386,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38129
38386
|
expectedAspectCount: expected.length
|
|
38130
38387
|
};
|
|
38131
38388
|
}
|
|
38132
|
-
evaluateExact(
|
|
38389
|
+
evaluateExact(toolCalls) {
|
|
38133
38390
|
const expected = this.config.expected ?? [];
|
|
38134
38391
|
if (expected.length === 0) {
|
|
38135
38392
|
return {
|
|
@@ -38140,16 +38397,15 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38140
38397
|
expectedAspectCount: 0
|
|
38141
38398
|
};
|
|
38142
38399
|
}
|
|
38143
|
-
const actualToolCalls = trace2.filter((e) => e.type === "tool_call" && e.name);
|
|
38144
38400
|
const hits = [];
|
|
38145
38401
|
const misses = [];
|
|
38146
|
-
if (
|
|
38147
|
-
misses.push(`Expected ${expected.length} tool calls, got ${
|
|
38402
|
+
if (toolCalls.length !== expected.length) {
|
|
38403
|
+
misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
|
|
38148
38404
|
}
|
|
38149
|
-
const checkLength = Math.min(expected.length,
|
|
38405
|
+
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
38150
38406
|
for (let i = 0; i < checkLength; i++) {
|
|
38151
38407
|
const expectedTool = expected[i].tool;
|
|
38152
|
-
const actualTool =
|
|
38408
|
+
const actualTool = toolCalls[i].name;
|
|
38153
38409
|
if (actualTool === expectedTool) {
|
|
38154
38410
|
hits.push(`Position ${i}: ${expectedTool} \u2713`);
|
|
38155
38411
|
} else {
|
|
@@ -38363,11 +38619,13 @@ var CompositeEvaluator = class {
|
|
|
38363
38619
|
evalCaseId: context.evalCase.id,
|
|
38364
38620
|
attempt: context.attempt
|
|
38365
38621
|
});
|
|
38366
|
-
const data = freeformEvaluationSchema.parse(
|
|
38622
|
+
const data = freeformEvaluationSchema.parse(
|
|
38623
|
+
parseJsonFromText(extractLastAssistantContent(response.outputMessages))
|
|
38624
|
+
);
|
|
38367
38625
|
const score = clampScore(data.score);
|
|
38368
38626
|
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
38369
38627
|
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
38370
|
-
const reasoning = data.reasoning
|
|
38628
|
+
const reasoning = data.reasoning;
|
|
38371
38629
|
return {
|
|
38372
38630
|
score,
|
|
38373
38631
|
verdict: scoreToVerdict(score),
|
|
@@ -38779,11 +39037,14 @@ async function runBatchEvaluation(options) {
|
|
|
38779
39037
|
const evalCase = evalCases[i];
|
|
38780
39038
|
const promptInputs = promptInputsList[i];
|
|
38781
39039
|
const providerResponse = batchResponse[i];
|
|
39040
|
+
const outputMessages = providerResponse.outputMessages;
|
|
39041
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
39042
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
38782
39043
|
let result;
|
|
38783
39044
|
try {
|
|
38784
39045
|
result = await evaluateCandidate({
|
|
38785
39046
|
evalCase,
|
|
38786
|
-
candidate
|
|
39047
|
+
candidate,
|
|
38787
39048
|
target,
|
|
38788
39049
|
provider,
|
|
38789
39050
|
evaluators: evaluatorRegistry,
|
|
@@ -38791,7 +39052,9 @@ async function runBatchEvaluation(options) {
|
|
|
38791
39052
|
nowFn,
|
|
38792
39053
|
attempt: 0,
|
|
38793
39054
|
judgeProvider: await resolveJudgeProvider(target),
|
|
38794
|
-
agentTimeoutMs
|
|
39055
|
+
agentTimeoutMs,
|
|
39056
|
+
outputMessages,
|
|
39057
|
+
traceSummary
|
|
38795
39058
|
});
|
|
38796
39059
|
} catch (error40) {
|
|
38797
39060
|
const errorResult = buildErrorResult(
|
|
@@ -38895,21 +39158,13 @@ async function runEvalCase(options) {
|
|
|
38895
39158
|
if (cacheKey && cache && !cachedResponse) {
|
|
38896
39159
|
await cache.set(cacheKey, providerResponse);
|
|
38897
39160
|
}
|
|
38898
|
-
|
|
38899
|
-
|
|
38900
|
-
|
|
38901
|
-
const rawTrace = await readJsonFile(providerResponse.traceRef);
|
|
38902
|
-
if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
|
|
38903
|
-
candidateTrace = rawTrace;
|
|
38904
|
-
}
|
|
38905
|
-
} catch {
|
|
38906
|
-
}
|
|
38907
|
-
}
|
|
38908
|
-
const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
|
|
39161
|
+
const outputMessages = providerResponse.outputMessages;
|
|
39162
|
+
const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
39163
|
+
const candidate = extractLastAssistantContent(outputMessages);
|
|
38909
39164
|
try {
|
|
38910
39165
|
return await evaluateCandidate({
|
|
38911
39166
|
evalCase,
|
|
38912
|
-
candidate
|
|
39167
|
+
candidate,
|
|
38913
39168
|
target,
|
|
38914
39169
|
provider,
|
|
38915
39170
|
evaluators,
|
|
@@ -38918,9 +39173,8 @@ async function runEvalCase(options) {
|
|
|
38918
39173
|
attempt,
|
|
38919
39174
|
judgeProvider,
|
|
38920
39175
|
agentTimeoutMs,
|
|
38921
|
-
|
|
38922
|
-
|
|
38923
|
-
candidateTraceSummary
|
|
39176
|
+
outputMessages,
|
|
39177
|
+
traceSummary
|
|
38924
39178
|
});
|
|
38925
39179
|
} catch (error40) {
|
|
38926
39180
|
return buildErrorResult(evalCase, target.name, nowFn(), error40, promptInputs, provider);
|
|
@@ -38938,9 +39192,8 @@ async function evaluateCandidate(options) {
|
|
|
38938
39192
|
attempt,
|
|
38939
39193
|
judgeProvider,
|
|
38940
39194
|
agentTimeoutMs,
|
|
38941
|
-
|
|
38942
|
-
|
|
38943
|
-
candidateTraceSummary
|
|
39195
|
+
outputMessages,
|
|
39196
|
+
traceSummary
|
|
38944
39197
|
} = options;
|
|
38945
39198
|
const gradeTimestamp = nowFn();
|
|
38946
39199
|
const { score, evaluatorResults } = await runEvaluatorsForCase({
|
|
@@ -38954,9 +39207,8 @@ async function evaluateCandidate(options) {
|
|
|
38954
39207
|
now: gradeTimestamp,
|
|
38955
39208
|
judgeProvider,
|
|
38956
39209
|
agentTimeoutMs,
|
|
38957
|
-
|
|
38958
|
-
|
|
38959
|
-
candidateTraceSummary
|
|
39210
|
+
outputMessages,
|
|
39211
|
+
traceSummary
|
|
38960
39212
|
});
|
|
38961
39213
|
const completedAt = nowFn();
|
|
38962
39214
|
let agentProviderRequest;
|
|
@@ -38994,7 +39246,7 @@ async function evaluateCandidate(options) {
|
|
|
38994
39246
|
lm_provider_request: lmProviderRequest,
|
|
38995
39247
|
evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
38996
39248
|
evaluator_results: evaluatorResults,
|
|
38997
|
-
trace_summary:
|
|
39249
|
+
trace_summary: traceSummary
|
|
38998
39250
|
};
|
|
38999
39251
|
}
|
|
39000
39252
|
async function runEvaluatorsForCase(options) {
|
|
@@ -39009,9 +39261,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
39009
39261
|
now,
|
|
39010
39262
|
judgeProvider,
|
|
39011
39263
|
agentTimeoutMs,
|
|
39012
|
-
|
|
39013
|
-
|
|
39014
|
-
candidateTraceSummary
|
|
39264
|
+
outputMessages,
|
|
39265
|
+
traceSummary
|
|
39015
39266
|
} = options;
|
|
39016
39267
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
39017
39268
|
return runEvaluatorList({
|
|
@@ -39026,9 +39277,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
39026
39277
|
now,
|
|
39027
39278
|
judgeProvider,
|
|
39028
39279
|
agentTimeoutMs,
|
|
39029
|
-
|
|
39030
|
-
|
|
39031
|
-
candidateTraceSummary
|
|
39280
|
+
outputMessages,
|
|
39281
|
+
traceSummary
|
|
39032
39282
|
});
|
|
39033
39283
|
}
|
|
39034
39284
|
const evaluatorKind = evalCase.evaluator ?? "llm_judge";
|
|
@@ -39045,9 +39295,8 @@ async function runEvaluatorsForCase(options) {
|
|
|
39045
39295
|
promptInputs,
|
|
39046
39296
|
now,
|
|
39047
39297
|
judgeProvider,
|
|
39048
|
-
|
|
39049
|
-
|
|
39050
|
-
candidateTraceSummary
|
|
39298
|
+
outputMessages,
|
|
39299
|
+
traceSummary
|
|
39051
39300
|
});
|
|
39052
39301
|
return { score };
|
|
39053
39302
|
}
|
|
@@ -39064,9 +39313,8 @@ async function runEvaluatorList(options) {
|
|
|
39064
39313
|
now,
|
|
39065
39314
|
judgeProvider,
|
|
39066
39315
|
agentTimeoutMs,
|
|
39067
|
-
|
|
39068
|
-
|
|
39069
|
-
candidateTraceSummary
|
|
39316
|
+
outputMessages,
|
|
39317
|
+
traceSummary
|
|
39070
39318
|
} = options;
|
|
39071
39319
|
const scored = [];
|
|
39072
39320
|
const evaluatorResults = [];
|
|
@@ -39113,8 +39361,8 @@ async function runEvaluatorList(options) {
|
|
|
39113
39361
|
attempt,
|
|
39114
39362
|
promptInputs,
|
|
39115
39363
|
now,
|
|
39116
|
-
|
|
39117
|
-
|
|
39364
|
+
outputMessages,
|
|
39365
|
+
traceSummary
|
|
39118
39366
|
});
|
|
39119
39367
|
const weight = evaluator.weight ?? 1;
|
|
39120
39368
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -39200,9 +39448,8 @@ async function runEvaluatorList(options) {
|
|
|
39200
39448
|
attempt,
|
|
39201
39449
|
promptInputs,
|
|
39202
39450
|
now,
|
|
39203
|
-
|
|
39204
|
-
|
|
39205
|
-
candidateTraceSummary
|
|
39451
|
+
outputMessages,
|
|
39452
|
+
traceSummary
|
|
39206
39453
|
});
|
|
39207
39454
|
const weight = evaluator.weight ?? 1;
|
|
39208
39455
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
@@ -39562,16 +39809,90 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
39562
39809
|
return parts.join("\n");
|
|
39563
39810
|
}
|
|
39564
39811
|
|
|
39812
|
+
// src/commands/convert/index.ts
|
|
39813
|
+
import { command, option, optional as optional2, positional, string as string4 } from "cmd-ts";
|
|
39814
|
+
import { stringify as stringifyYaml } from "yaml";
|
|
39815
|
+
function convertJsonlToYaml(inputPath, outputPath) {
|
|
39816
|
+
const content = readFileSync(inputPath, "utf8");
|
|
39817
|
+
const lines = content.trim().split("\n").filter((line2) => line2.trim());
|
|
39818
|
+
let yamlOutput = "";
|
|
39819
|
+
let isFirst = true;
|
|
39820
|
+
for (const line2 of lines) {
|
|
39821
|
+
const record2 = JSON.parse(line2);
|
|
39822
|
+
const yamlDoc = stringifyYaml(record2, {
|
|
39823
|
+
indent: 2,
|
|
39824
|
+
lineWidth: 0
|
|
39825
|
+
});
|
|
39826
|
+
const normalizedYaml = normalizeLineEndings(yamlDoc);
|
|
39827
|
+
const separator = isFirst ? "---\n" : "\n---\n";
|
|
39828
|
+
isFirst = false;
|
|
39829
|
+
yamlOutput += separator + normalizedYaml;
|
|
39830
|
+
}
|
|
39831
|
+
writeFileSync(outputPath, yamlOutput);
|
|
39832
|
+
return lines.length;
|
|
39833
|
+
}
|
|
39834
|
+
var convertCommand = command({
|
|
39835
|
+
name: "convert",
|
|
39836
|
+
description: "Convert evaluation results from JSONL to YAML format",
|
|
39837
|
+
args: {
|
|
39838
|
+
input: positional({
|
|
39839
|
+
type: string4,
|
|
39840
|
+
displayName: "input",
|
|
39841
|
+
description: "Path to input JSONL file"
|
|
39842
|
+
}),
|
|
39843
|
+
out: option({
|
|
39844
|
+
type: optional2(string4),
|
|
39845
|
+
long: "out",
|
|
39846
|
+
short: "o",
|
|
39847
|
+
description: "Output file path (defaults to input path with .yaml extension)"
|
|
39848
|
+
})
|
|
39849
|
+
},
|
|
39850
|
+
handler: async ({ input, out }) => {
|
|
39851
|
+
if (!input.endsWith(".jsonl")) {
|
|
39852
|
+
console.error("Error: Input file must be a .jsonl file");
|
|
39853
|
+
process.exit(1);
|
|
39854
|
+
}
|
|
39855
|
+
const outputPath = out ?? input.replace(/\.jsonl$/, ".yaml");
|
|
39856
|
+
try {
|
|
39857
|
+
const count = convertJsonlToYaml(input, outputPath);
|
|
39858
|
+
console.log(`Converted ${count} records to ${path14.resolve(outputPath)}`);
|
|
39859
|
+
} catch (error40) {
|
|
39860
|
+
console.error(`Error: ${error40.message}`);
|
|
39861
|
+
process.exit(1);
|
|
39862
|
+
}
|
|
39863
|
+
}
|
|
39864
|
+
});
|
|
39865
|
+
|
|
39866
|
+
// src/commands/eval/index.ts
|
|
39867
|
+
import { stat as stat4 } from "node:fs/promises";
|
|
39868
|
+
import path21 from "node:path";
|
|
39869
|
+
import {
|
|
39870
|
+
command as command2,
|
|
39871
|
+
flag,
|
|
39872
|
+
number as number4,
|
|
39873
|
+
option as option2,
|
|
39874
|
+
optional as optional3,
|
|
39875
|
+
restPositionals,
|
|
39876
|
+
string as string5
|
|
39877
|
+
} from "cmd-ts";
|
|
39878
|
+
import fg from "fast-glob";
|
|
39879
|
+
|
|
39880
|
+
// src/commands/eval/run-eval.ts
|
|
39881
|
+
import { constants as constants6 } from "node:fs";
|
|
39882
|
+
import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
39883
|
+
import path20 from "node:path";
|
|
39884
|
+
import { pathToFileURL } from "node:url";
|
|
39885
|
+
|
|
39565
39886
|
// src/commands/eval/env.ts
|
|
39566
39887
|
import { constants as constants4 } from "node:fs";
|
|
39567
39888
|
import { access as access4 } from "node:fs/promises";
|
|
39568
|
-
import
|
|
39889
|
+
import path15 from "node:path";
|
|
39569
39890
|
import { config as loadDotenv } from "dotenv";
|
|
39570
39891
|
function uniqueDirs(directories) {
|
|
39571
39892
|
const seen = /* @__PURE__ */ new Set();
|
|
39572
39893
|
const result = [];
|
|
39573
39894
|
for (const dir of directories) {
|
|
39574
|
-
const absolute =
|
|
39895
|
+
const absolute = path15.resolve(dir);
|
|
39575
39896
|
if (seen.has(absolute)) {
|
|
39576
39897
|
continue;
|
|
39577
39898
|
}
|
|
@@ -39590,14 +39911,14 @@ async function fileExists4(filePath) {
|
|
|
39590
39911
|
}
|
|
39591
39912
|
function collectAncestorDirectories(start, boundary) {
|
|
39592
39913
|
const directories = [];
|
|
39593
|
-
const boundaryDir =
|
|
39594
|
-
let current =
|
|
39914
|
+
const boundaryDir = path15.resolve(boundary);
|
|
39915
|
+
let current = path15.resolve(start);
|
|
39595
39916
|
while (current !== void 0) {
|
|
39596
39917
|
directories.push(current);
|
|
39597
39918
|
if (current === boundaryDir) {
|
|
39598
39919
|
break;
|
|
39599
39920
|
}
|
|
39600
|
-
const parent =
|
|
39921
|
+
const parent = path15.dirname(current);
|
|
39601
39922
|
if (parent === current) {
|
|
39602
39923
|
break;
|
|
39603
39924
|
}
|
|
@@ -39607,12 +39928,12 @@ function collectAncestorDirectories(start, boundary) {
|
|
|
39607
39928
|
}
|
|
39608
39929
|
async function loadEnvFromHierarchy(options) {
|
|
39609
39930
|
const { testFilePath, repoRoot, verbose } = options;
|
|
39610
|
-
const testDir =
|
|
39931
|
+
const testDir = path15.dirname(path15.resolve(testFilePath));
|
|
39611
39932
|
const cwd = process.cwd();
|
|
39612
39933
|
const searchDirs = uniqueDirs([...collectAncestorDirectories(testDir, repoRoot), repoRoot, cwd]);
|
|
39613
39934
|
const envFiles = [];
|
|
39614
39935
|
for (const dir of searchDirs) {
|
|
39615
|
-
const candidate =
|
|
39936
|
+
const candidate = path15.join(dir, ".env");
|
|
39616
39937
|
if (await fileExists4(candidate)) {
|
|
39617
39938
|
envFiles.push(candidate);
|
|
39618
39939
|
}
|
|
@@ -39636,7 +39957,7 @@ async function loadEnvFromHierarchy(options) {
|
|
|
39636
39957
|
// src/commands/eval/jsonl-writer.ts
|
|
39637
39958
|
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
39638
39959
|
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
39639
|
-
import
|
|
39960
|
+
import path16 from "node:path";
|
|
39640
39961
|
import { finished } from "node:stream/promises";
|
|
39641
39962
|
|
|
39642
39963
|
// ../../node_modules/.bun/async-mutex@0.5.0/node_modules/async-mutex/index.mjs
|
|
@@ -39854,7 +40175,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
39854
40175
|
this.stream = stream;
|
|
39855
40176
|
}
|
|
39856
40177
|
static async open(filePath) {
|
|
39857
|
-
await mkdir5(
|
|
40178
|
+
await mkdir5(path16.dirname(filePath), { recursive: true });
|
|
39858
40179
|
const stream = createWriteStream2(filePath, { flags: "w", encoding: "utf8" });
|
|
39859
40180
|
return new _JsonlWriter(stream);
|
|
39860
40181
|
}
|
|
@@ -39886,9 +40207,9 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
39886
40207
|
// src/commands/eval/yaml-writer.ts
|
|
39887
40208
|
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
39888
40209
|
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
39889
|
-
import
|
|
40210
|
+
import path17 from "node:path";
|
|
39890
40211
|
import { finished as finished2 } from "node:stream/promises";
|
|
39891
|
-
import { stringify as
|
|
40212
|
+
import { stringify as stringifyYaml2 } from "yaml";
|
|
39892
40213
|
var YamlWriter = class _YamlWriter {
|
|
39893
40214
|
stream;
|
|
39894
40215
|
mutex = new Mutex();
|
|
@@ -39898,7 +40219,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
39898
40219
|
this.stream = stream;
|
|
39899
40220
|
}
|
|
39900
40221
|
static async open(filePath) {
|
|
39901
|
-
await mkdir6(
|
|
40222
|
+
await mkdir6(path17.dirname(filePath), { recursive: true });
|
|
39902
40223
|
const stream = createWriteStream3(filePath, { flags: "w", encoding: "utf8" });
|
|
39903
40224
|
return new _YamlWriter(stream);
|
|
39904
40225
|
}
|
|
@@ -39907,7 +40228,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
39907
40228
|
if (this.closed) {
|
|
39908
40229
|
throw new Error("Cannot write to closed YAML writer");
|
|
39909
40230
|
}
|
|
39910
|
-
const yamlDoc =
|
|
40231
|
+
const yamlDoc = stringifyYaml2(record2, {
|
|
39911
40232
|
indent: 2,
|
|
39912
40233
|
lineWidth: 0
|
|
39913
40234
|
// Disable line wrapping
|
|
@@ -39963,196 +40284,86 @@ function getDefaultExtension(format) {
|
|
|
39963
40284
|
}
|
|
39964
40285
|
|
|
39965
40286
|
// src/commands/eval/progress-display.ts
|
|
39966
|
-
import { stripVTControlCharacters } from "node:util";
|
|
39967
|
-
var ESC = "\x1B[";
|
|
39968
|
-
var CLEAR_LINE = `${ESC}K`;
|
|
39969
|
-
var MOVE_CURSOR_UP = `${ESC}1A`;
|
|
39970
40287
|
var ProgressDisplay = class {
|
|
39971
40288
|
workers = /* @__PURE__ */ new Map();
|
|
39972
|
-
maxWorkers;
|
|
39973
40289
|
totalTests = 0;
|
|
39974
40290
|
completedTests = 0;
|
|
39975
|
-
renderTimer;
|
|
39976
|
-
renderScheduled = false;
|
|
39977
|
-
isInteractive;
|
|
39978
40291
|
logPaths = [];
|
|
39979
40292
|
logPathSet = /* @__PURE__ */ new Set();
|
|
39980
40293
|
hasPrintedLogHeader = false;
|
|
39981
|
-
windowHeight = 0;
|
|
39982
40294
|
started = false;
|
|
39983
40295
|
finished = false;
|
|
39984
|
-
|
|
39985
|
-
|
|
39986
|
-
this.
|
|
40296
|
+
verbose;
|
|
40297
|
+
constructor(_maxWorkers, options) {
|
|
40298
|
+
this.verbose = options?.verbose ?? false;
|
|
39987
40299
|
}
|
|
39988
40300
|
isInteractiveMode() {
|
|
39989
|
-
return
|
|
40301
|
+
return false;
|
|
39990
40302
|
}
|
|
39991
40303
|
start() {
|
|
39992
40304
|
this.started = true;
|
|
39993
40305
|
this.finished = false;
|
|
39994
|
-
if (this.isInteractive) {
|
|
39995
|
-
this.write("\n");
|
|
39996
|
-
this.renderTimer = setInterval(() => {
|
|
39997
|
-
this.scheduleRender();
|
|
39998
|
-
}, 1e3);
|
|
39999
|
-
this.renderTimer.unref?.();
|
|
40000
|
-
}
|
|
40001
40306
|
}
|
|
40002
40307
|
setTotalTests(count) {
|
|
40003
40308
|
this.totalTests = count;
|
|
40004
40309
|
}
|
|
40005
40310
|
updateWorker(progress) {
|
|
40311
|
+
const previous = this.workers.get(progress.workerId);
|
|
40006
40312
|
this.workers.set(progress.workerId, progress);
|
|
40007
40313
|
if (progress.status === "completed" || progress.status === "failed") {
|
|
40008
40314
|
this.completedTests++;
|
|
40009
40315
|
}
|
|
40010
|
-
|
|
40011
|
-
|
|
40012
|
-
|
|
40013
|
-
|
|
40014
|
-
|
|
40015
|
-
|
|
40016
|
-
|
|
40316
|
+
const targetSuffix = progress.targetLabel ? ` | ${progress.targetLabel}` : "";
|
|
40317
|
+
const countPrefix = `${this.completedTests}/${this.totalTests}`;
|
|
40318
|
+
switch (progress.status) {
|
|
40319
|
+
case "pending":
|
|
40320
|
+
if (this.verbose && !previous) {
|
|
40321
|
+
console.log(`${countPrefix} \u23F3 ${progress.evalId}${targetSuffix}`);
|
|
40322
|
+
}
|
|
40323
|
+
break;
|
|
40324
|
+
case "running":
|
|
40325
|
+
if (!previous || previous.status === "pending") {
|
|
40326
|
+
console.log(`${countPrefix} \u{1F504} ${progress.evalId}${targetSuffix}`);
|
|
40327
|
+
}
|
|
40328
|
+
break;
|
|
40329
|
+
case "completed":
|
|
40330
|
+
console.log(`${countPrefix} \u2705 ${progress.evalId}${targetSuffix}`);
|
|
40331
|
+
break;
|
|
40332
|
+
case "failed":
|
|
40017
40333
|
console.log(
|
|
40018
|
-
|
|
40334
|
+
`${countPrefix} \u274C ${progress.evalId}${targetSuffix}${progress.error ? `: ${progress.error}` : ""}`
|
|
40019
40335
|
);
|
|
40020
|
-
|
|
40336
|
+
break;
|
|
40021
40337
|
}
|
|
40022
40338
|
}
|
|
40023
40339
|
addLogPaths(paths) {
|
|
40024
40340
|
const newPaths = [];
|
|
40025
|
-
for (const
|
|
40026
|
-
if (this.logPathSet.has(
|
|
40341
|
+
for (const path28 of paths) {
|
|
40342
|
+
if (this.logPathSet.has(path28)) {
|
|
40027
40343
|
continue;
|
|
40028
40344
|
}
|
|
40029
|
-
this.logPathSet.add(
|
|
40030
|
-
newPaths.push(
|
|
40345
|
+
this.logPathSet.add(path28);
|
|
40346
|
+
newPaths.push(path28);
|
|
40031
40347
|
}
|
|
40032
40348
|
if (newPaths.length === 0) {
|
|
40033
40349
|
return;
|
|
40034
40350
|
}
|
|
40035
40351
|
this.logPaths.push(...newPaths);
|
|
40036
|
-
if (this.isInteractive) {
|
|
40037
|
-
this.scheduleRender();
|
|
40038
|
-
return;
|
|
40039
|
-
}
|
|
40040
40352
|
if (!this.hasPrintedLogHeader) {
|
|
40041
40353
|
console.log("");
|
|
40042
40354
|
console.log("Codex CLI logs:");
|
|
40043
40355
|
this.hasPrintedLogHeader = true;
|
|
40044
40356
|
}
|
|
40045
40357
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
40046
|
-
newPaths.forEach((
|
|
40047
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
40358
|
+
newPaths.forEach((path28, offset) => {
|
|
40359
|
+
console.log(`${startIndex + offset + 1}. ${path28}`);
|
|
40048
40360
|
});
|
|
40049
40361
|
}
|
|
40050
|
-
scheduleRender() {
|
|
40051
|
-
if (this.renderScheduled || this.finished) {
|
|
40052
|
-
return;
|
|
40053
|
-
}
|
|
40054
|
-
this.renderScheduled = true;
|
|
40055
|
-
setTimeout(() => {
|
|
40056
|
-
this.renderScheduled = false;
|
|
40057
|
-
this.render();
|
|
40058
|
-
}, 100);
|
|
40059
|
-
}
|
|
40060
|
-
write(content) {
|
|
40061
|
-
process.stdout.write(content);
|
|
40062
|
-
}
|
|
40063
|
-
clearWindow() {
|
|
40064
|
-
if (this.windowHeight === 0) {
|
|
40065
|
-
return;
|
|
40066
|
-
}
|
|
40067
|
-
this.write(`\r${CLEAR_LINE}`);
|
|
40068
|
-
for (let i = 1; i < this.windowHeight; i++) {
|
|
40069
|
-
this.write(`${MOVE_CURSOR_UP}\r${CLEAR_LINE}`);
|
|
40070
|
-
}
|
|
40071
|
-
this.windowHeight = 0;
|
|
40072
|
-
}
|
|
40073
|
-
getRenderedRowCount(rows) {
|
|
40074
|
-
const columns = process.stdout.columns || 80;
|
|
40075
|
-
let count = 0;
|
|
40076
|
-
for (const row of rows) {
|
|
40077
|
-
const text2 = stripVTControlCharacters(row);
|
|
40078
|
-
count += Math.max(1, Math.ceil(text2.length / columns));
|
|
40079
|
-
}
|
|
40080
|
-
return count;
|
|
40081
|
-
}
|
|
40082
|
-
render() {
|
|
40083
|
-
if (!this.isInteractive || !this.started || this.finished) {
|
|
40084
|
-
return;
|
|
40085
|
-
}
|
|
40086
|
-
const lines = [];
|
|
40087
|
-
const sortedWorkers = Array.from(this.workers.values()).sort((a, b) => a.workerId - b.workerId);
|
|
40088
|
-
for (const worker of sortedWorkers) {
|
|
40089
|
-
const line2 = this.formatWorkerLine(worker);
|
|
40090
|
-
lines.push(line2);
|
|
40091
|
-
}
|
|
40092
|
-
if (this.logPaths.length > 0) {
|
|
40093
|
-
lines.push("");
|
|
40094
|
-
lines.push("Codex CLI logs:");
|
|
40095
|
-
this.logPaths.forEach((path27, index) => {
|
|
40096
|
-
lines.push(`${index + 1}. ${path27}`);
|
|
40097
|
-
});
|
|
40098
|
-
}
|
|
40099
|
-
const rowCount = this.getRenderedRowCount(lines);
|
|
40100
|
-
this.clearWindow();
|
|
40101
|
-
if (lines.length > 0) {
|
|
40102
|
-
this.write(lines.join("\n"));
|
|
40103
|
-
}
|
|
40104
|
-
this.windowHeight = rowCount;
|
|
40105
|
-
}
|
|
40106
|
-
formatWorkerLine(worker) {
|
|
40107
|
-
const workerLabel = `${worker.workerId}.`.padEnd(4);
|
|
40108
|
-
const statusIcon = this.getStatusIcon(worker.status);
|
|
40109
|
-
const targetLabel = worker.targetLabel ? ` | ${worker.targetLabel}` : "";
|
|
40110
|
-
const columns = process.stdout.columns || 80;
|
|
40111
|
-
const maxLineLength = Math.max(40, columns - 4);
|
|
40112
|
-
const reservedLength = workerLabel.length + statusIcon.length + targetLabel.length + 4;
|
|
40113
|
-
const availableLabelLength = Math.max(15, maxLineLength - reservedLength);
|
|
40114
|
-
let testLabel = worker.evalId;
|
|
40115
|
-
if (testLabel.length > availableLabelLength) {
|
|
40116
|
-
testLabel = `${testLabel.substring(0, Math.max(0, availableLabelLength - 3))}...`;
|
|
40117
|
-
}
|
|
40118
|
-
return `${workerLabel} ${statusIcon} ${testLabel}${targetLabel}`;
|
|
40119
|
-
}
|
|
40120
|
-
getStatusIcon(status) {
|
|
40121
|
-
switch (status) {
|
|
40122
|
-
case "pending":
|
|
40123
|
-
return "\u23F3";
|
|
40124
|
-
case "running":
|
|
40125
|
-
return "\u{1F504}";
|
|
40126
|
-
case "completed":
|
|
40127
|
-
return "\u2705";
|
|
40128
|
-
case "failed":
|
|
40129
|
-
return "\u274C";
|
|
40130
|
-
default:
|
|
40131
|
-
return " ";
|
|
40132
|
-
}
|
|
40133
|
-
}
|
|
40134
40362
|
finish() {
|
|
40135
|
-
if (this.renderTimer) {
|
|
40136
|
-
clearInterval(this.renderTimer);
|
|
40137
|
-
this.renderTimer = void 0;
|
|
40138
|
-
}
|
|
40139
40363
|
this.finished = true;
|
|
40140
|
-
|
|
40141
|
-
this.clearWindow();
|
|
40142
|
-
const sortedWorkers = Array.from(this.workers.values()).sort(
|
|
40143
|
-
(a, b) => a.workerId - b.workerId
|
|
40144
|
-
);
|
|
40145
|
-
for (const worker of sortedWorkers) {
|
|
40146
|
-
this.write(`${this.formatWorkerLine(worker)}
|
|
40147
|
-
`);
|
|
40148
|
-
}
|
|
40149
|
-
this.write("\n");
|
|
40150
|
-
}
|
|
40364
|
+
console.log("");
|
|
40151
40365
|
}
|
|
40152
40366
|
clear() {
|
|
40153
|
-
if (this.isInteractive) {
|
|
40154
|
-
this.clearWindow();
|
|
40155
|
-
}
|
|
40156
40367
|
}
|
|
40157
40368
|
};
|
|
40158
40369
|
|
|
@@ -40300,7 +40511,7 @@ function formatEvaluationSummary(summary) {
|
|
|
40300
40511
|
|
|
40301
40512
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
40302
40513
|
import { readFile as readFile7 } from "node:fs/promises";
|
|
40303
|
-
import
|
|
40514
|
+
import path18 from "node:path";
|
|
40304
40515
|
import { parse as parse6 } from "yaml";
|
|
40305
40516
|
import { readFile as readFile23 } from "node:fs/promises";
|
|
40306
40517
|
import path23 from "node:path";
|
|
@@ -40343,8 +40554,8 @@ async function detectFileType(filePath) {
|
|
|
40343
40554
|
}
|
|
40344
40555
|
}
|
|
40345
40556
|
function inferFileTypeFromPath(filePath) {
|
|
40346
|
-
const normalized =
|
|
40347
|
-
const basename =
|
|
40557
|
+
const normalized = path18.normalize(filePath).replace(/\\/g, "/");
|
|
40558
|
+
const basename = path18.basename(filePath);
|
|
40348
40559
|
if (normalized.includes("/.agentv/")) {
|
|
40349
40560
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
40350
40561
|
return "config";
|
|
@@ -40656,6 +40867,9 @@ var CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
40656
40867
|
...COMMON_SETTINGS,
|
|
40657
40868
|
"command_template",
|
|
40658
40869
|
"commandTemplate",
|
|
40870
|
+
"verbose",
|
|
40871
|
+
"cli_verbose",
|
|
40872
|
+
"cliVerbose",
|
|
40659
40873
|
"files_format",
|
|
40660
40874
|
"filesFormat",
|
|
40661
40875
|
"attachments_format",
|
|
@@ -40664,7 +40878,11 @@ var CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
40664
40878
|
"env",
|
|
40665
40879
|
"timeout_seconds",
|
|
40666
40880
|
"timeoutSeconds",
|
|
40667
|
-
"healthcheck"
|
|
40881
|
+
"healthcheck",
|
|
40882
|
+
"keep_temp_files",
|
|
40883
|
+
"keepTempFiles",
|
|
40884
|
+
"keep_output_files",
|
|
40885
|
+
"keepOutputFiles"
|
|
40668
40886
|
]);
|
|
40669
40887
|
function getKnownSettings(provider) {
|
|
40670
40888
|
const normalizedProvider = provider.toLowerCase();
|
|
@@ -40789,6 +41007,15 @@ async function validateTargetsFile(filePath) {
|
|
|
40789
41007
|
if (healthcheck !== void 0) {
|
|
40790
41008
|
validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
|
|
40791
41009
|
}
|
|
41010
|
+
const verbose = target.verbose ?? target.cli_verbose ?? target.cliVerbose;
|
|
41011
|
+
if (verbose !== void 0 && typeof verbose !== "boolean") {
|
|
41012
|
+
errors2.push({
|
|
41013
|
+
severity: "error",
|
|
41014
|
+
filePath: absolutePath2,
|
|
41015
|
+
location: `${location}.verbose`,
|
|
41016
|
+
message: "'verbose' must be a boolean when provided"
|
|
41017
|
+
});
|
|
41018
|
+
}
|
|
40792
41019
|
}
|
|
40793
41020
|
function validateCliHealthcheck(healthcheck, absolutePath2, location, errors2) {
|
|
40794
41021
|
if (!isObject22(healthcheck)) {
|
|
@@ -41173,12 +41400,12 @@ async function validateMessagesFileRefs(messages, location, searchRoots, filePat
|
|
|
41173
41400
|
// src/utils/targets.ts
|
|
41174
41401
|
import { constants as constants5 } from "node:fs";
|
|
41175
41402
|
import { access as access5 } from "node:fs/promises";
|
|
41176
|
-
import
|
|
41403
|
+
import path19 from "node:path";
|
|
41177
41404
|
var TARGET_FILE_CANDIDATES = [
|
|
41178
41405
|
"targets.yaml",
|
|
41179
41406
|
"targets.yml",
|
|
41180
|
-
|
|
41181
|
-
|
|
41407
|
+
path19.join(".agentv", "targets.yaml"),
|
|
41408
|
+
path19.join(".agentv", "targets.yml")
|
|
41182
41409
|
];
|
|
41183
41410
|
async function fileExists5(filePath) {
|
|
41184
41411
|
try {
|
|
@@ -41191,12 +41418,12 @@ async function fileExists5(filePath) {
|
|
|
41191
41418
|
async function discoverTargetsFile(options) {
|
|
41192
41419
|
const { explicitPath, testFilePath, repoRoot, cwd } = options;
|
|
41193
41420
|
if (explicitPath) {
|
|
41194
|
-
const resolvedExplicit =
|
|
41421
|
+
const resolvedExplicit = path19.resolve(explicitPath);
|
|
41195
41422
|
if (await fileExists5(resolvedExplicit)) {
|
|
41196
41423
|
return resolvedExplicit;
|
|
41197
41424
|
}
|
|
41198
41425
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
41199
|
-
const nested =
|
|
41426
|
+
const nested = path19.join(resolvedExplicit, candidate);
|
|
41200
41427
|
if (await fileExists5(nested)) {
|
|
41201
41428
|
return nested;
|
|
41202
41429
|
}
|
|
@@ -41204,13 +41431,13 @@ async function discoverTargetsFile(options) {
|
|
|
41204
41431
|
throw new Error(`targets.yaml not found at provided path: ${resolvedExplicit}`);
|
|
41205
41432
|
}
|
|
41206
41433
|
const directories = [...buildDirectoryChain(testFilePath, repoRoot)];
|
|
41207
|
-
const resolvedCwd =
|
|
41434
|
+
const resolvedCwd = path19.resolve(cwd);
|
|
41208
41435
|
if (!directories.includes(resolvedCwd)) {
|
|
41209
41436
|
directories.push(resolvedCwd);
|
|
41210
41437
|
}
|
|
41211
41438
|
for (const directory of directories) {
|
|
41212
41439
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
41213
|
-
const fullPath =
|
|
41440
|
+
const fullPath = path19.join(directory, candidate);
|
|
41214
41441
|
if (await fileExists5(fullPath)) {
|
|
41215
41442
|
return fullPath;
|
|
41216
41443
|
}
|
|
@@ -41389,15 +41616,15 @@ async function ensureFileExists(filePath, description) {
|
|
|
41389
41616
|
}
|
|
41390
41617
|
}
|
|
41391
41618
|
async function findRepoRoot(start) {
|
|
41392
|
-
const fallback =
|
|
41619
|
+
const fallback = path20.resolve(start);
|
|
41393
41620
|
let current = fallback;
|
|
41394
41621
|
while (current !== void 0) {
|
|
41395
|
-
const candidate =
|
|
41622
|
+
const candidate = path20.join(current, ".git");
|
|
41396
41623
|
try {
|
|
41397
41624
|
await access6(candidate, constants6.F_OK);
|
|
41398
41625
|
return current;
|
|
41399
41626
|
} catch {
|
|
41400
|
-
const parent =
|
|
41627
|
+
const parent = path20.dirname(current);
|
|
41401
41628
|
if (parent === current) {
|
|
41402
41629
|
break;
|
|
41403
41630
|
}
|
|
@@ -41410,16 +41637,16 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
41410
41637
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
41411
41638
|
const baseName = "eval";
|
|
41412
41639
|
const extension = getDefaultExtension(format);
|
|
41413
|
-
return
|
|
41640
|
+
return path20.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
41414
41641
|
}
|
|
41415
|
-
function resolvePromptDirectory(
|
|
41416
|
-
if (
|
|
41642
|
+
function resolvePromptDirectory(option5, cwd) {
|
|
41643
|
+
if (option5 === void 0) {
|
|
41417
41644
|
return void 0;
|
|
41418
41645
|
}
|
|
41419
|
-
if (typeof
|
|
41420
|
-
return
|
|
41646
|
+
if (typeof option5 === "string" && option5.trim().length > 0) {
|
|
41647
|
+
return path20.resolve(cwd, option5);
|
|
41421
41648
|
}
|
|
41422
|
-
return
|
|
41649
|
+
return path20.join(cwd, ".agentv", "prompts");
|
|
41423
41650
|
}
|
|
41424
41651
|
function createEvaluationCache() {
|
|
41425
41652
|
const store = /* @__PURE__ */ new Map();
|
|
@@ -41432,8 +41659,8 @@ function createEvaluationCache() {
|
|
|
41432
41659
|
}
|
|
41433
41660
|
};
|
|
41434
41661
|
}
|
|
41435
|
-
function createProgressReporter(maxWorkers) {
|
|
41436
|
-
const display = new ProgressDisplay(maxWorkers);
|
|
41662
|
+
function createProgressReporter(maxWorkers, options) {
|
|
41663
|
+
const display = new ProgressDisplay(maxWorkers, options);
|
|
41437
41664
|
return {
|
|
41438
41665
|
isInteractive: display.isInteractiveMode(),
|
|
41439
41666
|
start: () => display.start(),
|
|
@@ -41444,7 +41671,7 @@ function createProgressReporter(maxWorkers) {
|
|
|
41444
41671
|
};
|
|
41445
41672
|
}
|
|
41446
41673
|
function makeEvalKey(testFilePath, evalId) {
|
|
41447
|
-
return `${
|
|
41674
|
+
return `${path20.resolve(testFilePath)}::${evalId}`;
|
|
41448
41675
|
}
|
|
41449
41676
|
function createDisplayIdTracker() {
|
|
41450
41677
|
const map2 = /* @__PURE__ */ new Map();
|
|
@@ -41461,6 +41688,22 @@ function createDisplayIdTracker() {
|
|
|
41461
41688
|
}
|
|
41462
41689
|
};
|
|
41463
41690
|
}
|
|
41691
|
+
function applyVerboseOverride(selection, cliVerbose) {
|
|
41692
|
+
const { resolvedTarget } = selection;
|
|
41693
|
+
if (resolvedTarget.kind !== "cli") {
|
|
41694
|
+
return selection;
|
|
41695
|
+
}
|
|
41696
|
+
return {
|
|
41697
|
+
...selection,
|
|
41698
|
+
resolvedTarget: {
|
|
41699
|
+
...resolvedTarget,
|
|
41700
|
+
config: {
|
|
41701
|
+
...resolvedTarget.config,
|
|
41702
|
+
verbose: cliVerbose
|
|
41703
|
+
}
|
|
41704
|
+
}
|
|
41705
|
+
};
|
|
41706
|
+
}
|
|
41464
41707
|
async function prepareFileMetadata(params) {
|
|
41465
41708
|
const { testFilePath, repoRoot, cwd, options } = params;
|
|
41466
41709
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -41520,7 +41763,7 @@ async function runSingleEvalFile(params) {
|
|
|
41520
41763
|
evalCases
|
|
41521
41764
|
} = params;
|
|
41522
41765
|
await ensureFileExists(testFilePath, "Test file");
|
|
41523
|
-
const resolvedTargetSelection = selection;
|
|
41766
|
+
const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
|
|
41524
41767
|
const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
|
|
41525
41768
|
const targetMessage = options.verbose ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} [provider=${providerLabel}] via ${resolvedTargetSelection.targetsFilePath}` : `Using target: ${inlineTargetLabel}`;
|
|
41526
41769
|
if (!progressReporter.isInteractive || options.verbose) {
|
|
@@ -41600,7 +41843,7 @@ async function runEvalCommand(input) {
|
|
|
41600
41843
|
if (options.verbose) {
|
|
41601
41844
|
console.log(`Repository root: ${repoRoot}`);
|
|
41602
41845
|
}
|
|
41603
|
-
const outputPath = options.outPath ?
|
|
41846
|
+
const outputPath = options.outPath ? path20.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
|
|
41604
41847
|
console.log(`Output path: ${outputPath}`);
|
|
41605
41848
|
const outputWriter = await createOutputWriter(outputPath, options.format);
|
|
41606
41849
|
const cache = options.cache ? createEvaluationCache() : void 0;
|
|
@@ -41608,7 +41851,7 @@ async function runEvalCommand(input) {
|
|
|
41608
41851
|
const allResults = [];
|
|
41609
41852
|
let lastPromptDumpDir;
|
|
41610
41853
|
const seenEvalCases = /* @__PURE__ */ new Set();
|
|
41611
|
-
const resolvedTestFiles = input.testFiles.map((file2) =>
|
|
41854
|
+
const resolvedTestFiles = input.testFiles.map((file2) => path20.resolve(file2));
|
|
41612
41855
|
const displayIdTracker = createDisplayIdTracker();
|
|
41613
41856
|
const totalWorkers = options.workers ?? DEFAULT_WORKERS;
|
|
41614
41857
|
const fileConcurrency = Math.min(
|
|
@@ -41633,7 +41876,7 @@ async function runEvalCommand(input) {
|
|
|
41633
41876
|
if (totalEvalCount === 0) {
|
|
41634
41877
|
throw new Error("No eval cases matched the provided filters.");
|
|
41635
41878
|
}
|
|
41636
|
-
const progressReporter = createProgressReporter(totalWorkers);
|
|
41879
|
+
const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
|
|
41637
41880
|
progressReporter.start();
|
|
41638
41881
|
progressReporter.setTotal(totalEvalCount);
|
|
41639
41882
|
const seenCodexLogPaths = /* @__PURE__ */ new Set();
|
|
@@ -41704,7 +41947,7 @@ async function resolveEvaluationRunner() {
|
|
|
41704
41947
|
if (!overridePath) {
|
|
41705
41948
|
return runEvaluation;
|
|
41706
41949
|
}
|
|
41707
|
-
const resolved =
|
|
41950
|
+
const resolved = path20.isAbsolute(overridePath) ? overridePath : path20.resolve(process.cwd(), overridePath);
|
|
41708
41951
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
41709
41952
|
const mod = await import(moduleUrl);
|
|
41710
41953
|
const candidate = mod.runEvaluation;
|
|
@@ -41717,44 +41960,44 @@ async function resolveEvaluationRunner() {
|
|
|
41717
41960
|
}
|
|
41718
41961
|
|
|
41719
41962
|
// src/commands/eval/index.ts
|
|
41720
|
-
var evalCommand =
|
|
41963
|
+
var evalCommand = command2({
|
|
41721
41964
|
name: "eval",
|
|
41722
41965
|
description: "Run eval suites and report results",
|
|
41723
41966
|
args: {
|
|
41724
41967
|
evalPaths: restPositionals({
|
|
41725
|
-
type:
|
|
41968
|
+
type: string5,
|
|
41726
41969
|
displayName: "eval-paths",
|
|
41727
41970
|
description: "Path(s) or glob(s) to evaluation .yaml file(s)"
|
|
41728
41971
|
}),
|
|
41729
|
-
target:
|
|
41730
|
-
type:
|
|
41972
|
+
target: option2({
|
|
41973
|
+
type: string5,
|
|
41731
41974
|
long: "target",
|
|
41732
41975
|
description: "Override target name from targets.yaml",
|
|
41733
41976
|
defaultValue: () => "default"
|
|
41734
41977
|
}),
|
|
41735
|
-
targets:
|
|
41736
|
-
type:
|
|
41978
|
+
targets: option2({
|
|
41979
|
+
type: optional3(string5),
|
|
41737
41980
|
long: "targets",
|
|
41738
41981
|
description: "Path to targets.yaml (overrides discovery)"
|
|
41739
41982
|
}),
|
|
41740
|
-
evalId:
|
|
41741
|
-
type:
|
|
41983
|
+
evalId: option2({
|
|
41984
|
+
type: optional3(string5),
|
|
41742
41985
|
long: "eval-id",
|
|
41743
41986
|
description: "Run only the eval case with this identifier"
|
|
41744
41987
|
}),
|
|
41745
|
-
workers:
|
|
41988
|
+
workers: option2({
|
|
41746
41989
|
type: number4,
|
|
41747
41990
|
long: "workers",
|
|
41748
41991
|
description: "Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml",
|
|
41749
41992
|
defaultValue: () => 3
|
|
41750
41993
|
}),
|
|
41751
|
-
out:
|
|
41752
|
-
type:
|
|
41994
|
+
out: option2({
|
|
41995
|
+
type: optional3(string5),
|
|
41753
41996
|
long: "out",
|
|
41754
41997
|
description: "Write results to the specified path"
|
|
41755
41998
|
}),
|
|
41756
|
-
outputFormat:
|
|
41757
|
-
type:
|
|
41999
|
+
outputFormat: option2({
|
|
42000
|
+
type: string5,
|
|
41758
42001
|
long: "output-format",
|
|
41759
42002
|
description: "Output format: 'jsonl' or 'yaml' (default: jsonl)",
|
|
41760
42003
|
defaultValue: () => "jsonl"
|
|
@@ -41763,31 +42006,31 @@ var evalCommand = command({
|
|
|
41763
42006
|
long: "dry-run",
|
|
41764
42007
|
description: "Use mock provider responses instead of real LLM calls"
|
|
41765
42008
|
}),
|
|
41766
|
-
dryRunDelay:
|
|
42009
|
+
dryRunDelay: option2({
|
|
41767
42010
|
type: number4,
|
|
41768
42011
|
long: "dry-run-delay",
|
|
41769
42012
|
description: "Fixed delay in milliseconds for dry-run mode (overridden by delay range if specified)",
|
|
41770
42013
|
defaultValue: () => 0
|
|
41771
42014
|
}),
|
|
41772
|
-
dryRunDelayMin:
|
|
42015
|
+
dryRunDelayMin: option2({
|
|
41773
42016
|
type: number4,
|
|
41774
42017
|
long: "dry-run-delay-min",
|
|
41775
42018
|
description: "Minimum delay in milliseconds for dry-run mode (requires --dry-run-delay-max)",
|
|
41776
42019
|
defaultValue: () => 0
|
|
41777
42020
|
}),
|
|
41778
|
-
dryRunDelayMax:
|
|
42021
|
+
dryRunDelayMax: option2({
|
|
41779
42022
|
type: number4,
|
|
41780
42023
|
long: "dry-run-delay-max",
|
|
41781
42024
|
description: "Maximum delay in milliseconds for dry-run mode (requires --dry-run-delay-min)",
|
|
41782
42025
|
defaultValue: () => 0
|
|
41783
42026
|
}),
|
|
41784
|
-
agentTimeout:
|
|
42027
|
+
agentTimeout: option2({
|
|
41785
42028
|
type: number4,
|
|
41786
42029
|
long: "agent-timeout",
|
|
41787
42030
|
description: "Timeout in seconds for provider responses (default: 120)",
|
|
41788
42031
|
defaultValue: () => 120
|
|
41789
42032
|
}),
|
|
41790
|
-
maxRetries:
|
|
42033
|
+
maxRetries: option2({
|
|
41791
42034
|
type: number4,
|
|
41792
42035
|
long: "max-retries",
|
|
41793
42036
|
description: "Retry count for timeout recoveries (default: 2)",
|
|
@@ -41801,8 +42044,8 @@ var evalCommand = command({
|
|
|
41801
42044
|
long: "verbose",
|
|
41802
42045
|
description: "Enable verbose logging"
|
|
41803
42046
|
}),
|
|
41804
|
-
dumpPrompts:
|
|
41805
|
-
type:
|
|
42047
|
+
dumpPrompts: option2({
|
|
42048
|
+
type: optional3(string5),
|
|
41806
42049
|
long: "dump-prompts",
|
|
41807
42050
|
description: "Directory path for persisting prompt payloads for debugging"
|
|
41808
42051
|
}),
|
|
@@ -41848,7 +42091,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
41848
42091
|
const unmatched = [];
|
|
41849
42092
|
const results = /* @__PURE__ */ new Set();
|
|
41850
42093
|
for (const pattern of normalizedInputs) {
|
|
41851
|
-
const candidatePath =
|
|
42094
|
+
const candidatePath = path21.isAbsolute(pattern) ? path21.normalize(pattern) : path21.resolve(cwd, pattern);
|
|
41852
42095
|
try {
|
|
41853
42096
|
const stats = await stat4(candidatePath);
|
|
41854
42097
|
if (stats.isFile() && /\.ya?ml$/i.test(candidatePath)) {
|
|
@@ -41872,7 +42115,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
41872
42115
|
continue;
|
|
41873
42116
|
}
|
|
41874
42117
|
for (const filePath of yamlMatches) {
|
|
41875
|
-
results.add(
|
|
42118
|
+
results.add(path21.normalize(filePath));
|
|
41876
42119
|
}
|
|
41877
42120
|
}
|
|
41878
42121
|
if (unmatched.length > 0) {
|
|
@@ -41888,11 +42131,11 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
41888
42131
|
}
|
|
41889
42132
|
|
|
41890
42133
|
// src/commands/generate/index.ts
|
|
41891
|
-
import { command as
|
|
42134
|
+
import { command as command3, flag as flag2, option as option3, optional as optional4, positional as positional3, string as string6, subcommands } from "cmd-ts";
|
|
41892
42135
|
|
|
41893
42136
|
// src/commands/generate/rubrics.ts
|
|
41894
42137
|
import { readFile as readFile8, writeFile as writeFile6 } from "node:fs/promises";
|
|
41895
|
-
import
|
|
42138
|
+
import path24 from "node:path";
|
|
41896
42139
|
import { pathToFileURL as pathToFileURL2 } from "node:url";
|
|
41897
42140
|
import { isMap, isSeq, parseDocument } from "yaml";
|
|
41898
42141
|
function isJsonObject3(value) {
|
|
@@ -41904,7 +42147,7 @@ function asString6(value) {
|
|
|
41904
42147
|
async function loadRubricGenerator() {
|
|
41905
42148
|
const customGenerator = process.env.AGENTEVO_CLI_RUBRIC_GENERATOR;
|
|
41906
42149
|
if (customGenerator) {
|
|
41907
|
-
const generatorPath =
|
|
42150
|
+
const generatorPath = path24.resolve(customGenerator);
|
|
41908
42151
|
const generatorUrl = pathToFileURL2(generatorPath).href;
|
|
41909
42152
|
const module = await import(generatorUrl);
|
|
41910
42153
|
return module.generateRubrics;
|
|
@@ -41914,7 +42157,7 @@ async function loadRubricGenerator() {
|
|
|
41914
42157
|
async function generateRubricsCommand(options) {
|
|
41915
42158
|
const { file: file2, target: targetOverride, verbose } = options;
|
|
41916
42159
|
console.log(`Generating rubrics for: ${file2}`);
|
|
41917
|
-
const absolutePath =
|
|
42160
|
+
const absolutePath = path24.resolve(file2);
|
|
41918
42161
|
const content = await readFile8(absolutePath, "utf8");
|
|
41919
42162
|
const doc = parseDocument(content);
|
|
41920
42163
|
const parsed = doc.toJSON();
|
|
@@ -42031,17 +42274,17 @@ function extractQuestion(evalCase) {
|
|
|
42031
42274
|
}
|
|
42032
42275
|
|
|
42033
42276
|
// src/commands/generate/index.ts
|
|
42034
|
-
var rubricsCommand =
|
|
42277
|
+
var rubricsCommand = command3({
|
|
42035
42278
|
name: "rubrics",
|
|
42036
42279
|
description: "Generate rubrics from expected_outcome in YAML eval file",
|
|
42037
42280
|
args: {
|
|
42038
|
-
file:
|
|
42039
|
-
type:
|
|
42281
|
+
file: positional3({
|
|
42282
|
+
type: string6,
|
|
42040
42283
|
displayName: "file",
|
|
42041
42284
|
description: "Path to YAML eval file"
|
|
42042
42285
|
}),
|
|
42043
|
-
target:
|
|
42044
|
-
type:
|
|
42286
|
+
target: option3({
|
|
42287
|
+
type: optional4(string6),
|
|
42045
42288
|
long: "target",
|
|
42046
42289
|
short: "t",
|
|
42047
42290
|
description: "Override target for rubric generation (default: file target or openai:gpt-4o)"
|
|
@@ -42074,14 +42317,14 @@ var generateCommand = subcommands({
|
|
|
42074
42317
|
});
|
|
42075
42318
|
|
|
42076
42319
|
// src/commands/init/index.ts
|
|
42077
|
-
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
|
42078
|
-
import
|
|
42320
|
+
import { existsSync, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
|
|
42321
|
+
import path26 from "node:path";
|
|
42079
42322
|
import * as readline from "node:readline/promises";
|
|
42080
|
-
import { command as
|
|
42323
|
+
import { command as command4, option as option4, optional as optional5, string as string7 } from "cmd-ts";
|
|
42081
42324
|
|
|
42082
42325
|
// src/templates/index.ts
|
|
42083
|
-
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
42084
|
-
import
|
|
42326
|
+
import { readFileSync as readFileSync2, readdirSync, statSync } from "node:fs";
|
|
42327
|
+
import path25 from "node:path";
|
|
42085
42328
|
import { fileURLToPath } from "node:url";
|
|
42086
42329
|
function getGithubTemplates() {
|
|
42087
42330
|
return getTemplatesFromDir(".github");
|
|
@@ -42093,12 +42336,12 @@ function getClaudeTemplates() {
|
|
|
42093
42336
|
return getTemplatesFromDir(".claude");
|
|
42094
42337
|
}
|
|
42095
42338
|
function getTemplatesFromDir(subdir) {
|
|
42096
|
-
const currentDir =
|
|
42339
|
+
const currentDir = path25.dirname(fileURLToPath(import.meta.url));
|
|
42097
42340
|
let templatesDir;
|
|
42098
|
-
if (currentDir.includes(`${
|
|
42099
|
-
templatesDir =
|
|
42341
|
+
if (currentDir.includes(`${path25.sep}dist`)) {
|
|
42342
|
+
templatesDir = path25.join(currentDir, "templates", subdir);
|
|
42100
42343
|
} else {
|
|
42101
|
-
templatesDir =
|
|
42344
|
+
templatesDir = path25.join(currentDir, subdir);
|
|
42102
42345
|
}
|
|
42103
42346
|
return readTemplatesRecursively(templatesDir, "");
|
|
42104
42347
|
}
|
|
@@ -42106,15 +42349,15 @@ function readTemplatesRecursively(dir, relativePath) {
|
|
|
42106
42349
|
const templates = [];
|
|
42107
42350
|
const entries = readdirSync(dir);
|
|
42108
42351
|
for (const entry of entries) {
|
|
42109
|
-
const fullPath =
|
|
42352
|
+
const fullPath = path25.join(dir, entry);
|
|
42110
42353
|
const stat6 = statSync(fullPath);
|
|
42111
|
-
const entryRelativePath = relativePath ?
|
|
42354
|
+
const entryRelativePath = relativePath ? path25.join(relativePath, entry) : entry;
|
|
42112
42355
|
if (stat6.isDirectory()) {
|
|
42113
42356
|
templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
|
|
42114
42357
|
} else {
|
|
42115
|
-
const content =
|
|
42358
|
+
const content = readFileSync2(fullPath, "utf-8");
|
|
42116
42359
|
templates.push({
|
|
42117
|
-
path: entryRelativePath.split(
|
|
42360
|
+
path: entryRelativePath.split(path25.sep).join("/"),
|
|
42118
42361
|
// Normalize to forward slashes
|
|
42119
42362
|
content
|
|
42120
42363
|
});
|
|
@@ -42137,10 +42380,10 @@ async function promptYesNo(message) {
|
|
|
42137
42380
|
}
|
|
42138
42381
|
}
|
|
42139
42382
|
async function initCommand(options = {}) {
|
|
42140
|
-
const targetPath =
|
|
42141
|
-
const githubDir =
|
|
42142
|
-
const agentvDir =
|
|
42143
|
-
const claudeDir =
|
|
42383
|
+
const targetPath = path26.resolve(options.targetPath ?? ".");
|
|
42384
|
+
const githubDir = path26.join(targetPath, ".github");
|
|
42385
|
+
const agentvDir = path26.join(targetPath, ".agentv");
|
|
42386
|
+
const claudeDir = path26.join(targetPath, ".claude");
|
|
42144
42387
|
const githubTemplates = getGithubTemplates();
|
|
42145
42388
|
const agentvTemplates = getAgentvTemplates();
|
|
42146
42389
|
const claudeTemplates = getClaudeTemplates();
|
|
@@ -42148,32 +42391,32 @@ async function initCommand(options = {}) {
|
|
|
42148
42391
|
const otherAgentvTemplates = agentvTemplates.filter((t) => t.path !== ".env.template");
|
|
42149
42392
|
const existingFiles = [];
|
|
42150
42393
|
if (envTemplate) {
|
|
42151
|
-
const envFilePath =
|
|
42394
|
+
const envFilePath = path26.join(targetPath, ".env.template");
|
|
42152
42395
|
if (existsSync(envFilePath)) {
|
|
42153
42396
|
existingFiles.push(".env.template");
|
|
42154
42397
|
}
|
|
42155
42398
|
}
|
|
42156
42399
|
if (existsSync(githubDir)) {
|
|
42157
42400
|
for (const template of githubTemplates) {
|
|
42158
|
-
const targetFilePath =
|
|
42401
|
+
const targetFilePath = path26.join(githubDir, template.path);
|
|
42159
42402
|
if (existsSync(targetFilePath)) {
|
|
42160
|
-
existingFiles.push(
|
|
42403
|
+
existingFiles.push(path26.relative(targetPath, targetFilePath));
|
|
42161
42404
|
}
|
|
42162
42405
|
}
|
|
42163
42406
|
}
|
|
42164
42407
|
if (existsSync(agentvDir)) {
|
|
42165
42408
|
for (const template of otherAgentvTemplates) {
|
|
42166
|
-
const targetFilePath =
|
|
42409
|
+
const targetFilePath = path26.join(agentvDir, template.path);
|
|
42167
42410
|
if (existsSync(targetFilePath)) {
|
|
42168
|
-
existingFiles.push(
|
|
42411
|
+
existingFiles.push(path26.relative(targetPath, targetFilePath));
|
|
42169
42412
|
}
|
|
42170
42413
|
}
|
|
42171
42414
|
}
|
|
42172
42415
|
if (existsSync(claudeDir)) {
|
|
42173
42416
|
for (const template of claudeTemplates) {
|
|
42174
|
-
const targetFilePath =
|
|
42417
|
+
const targetFilePath = path26.join(claudeDir, template.path);
|
|
42175
42418
|
if (existsSync(targetFilePath)) {
|
|
42176
|
-
existingFiles.push(
|
|
42419
|
+
existingFiles.push(path26.relative(targetPath, targetFilePath));
|
|
42177
42420
|
}
|
|
42178
42421
|
}
|
|
42179
42422
|
}
|
|
@@ -42200,36 +42443,36 @@ async function initCommand(options = {}) {
|
|
|
42200
42443
|
mkdirSync(claudeDir, { recursive: true });
|
|
42201
42444
|
}
|
|
42202
42445
|
if (envTemplate) {
|
|
42203
|
-
const envFilePath =
|
|
42204
|
-
|
|
42446
|
+
const envFilePath = path26.join(targetPath, ".env.template");
|
|
42447
|
+
writeFileSync2(envFilePath, envTemplate.content, "utf-8");
|
|
42205
42448
|
console.log("Created .env.template");
|
|
42206
42449
|
}
|
|
42207
42450
|
for (const template of githubTemplates) {
|
|
42208
|
-
const targetFilePath =
|
|
42209
|
-
const targetDirPath =
|
|
42451
|
+
const targetFilePath = path26.join(githubDir, template.path);
|
|
42452
|
+
const targetDirPath = path26.dirname(targetFilePath);
|
|
42210
42453
|
if (!existsSync(targetDirPath)) {
|
|
42211
42454
|
mkdirSync(targetDirPath, { recursive: true });
|
|
42212
42455
|
}
|
|
42213
|
-
|
|
42214
|
-
console.log(`Created ${
|
|
42456
|
+
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
42457
|
+
console.log(`Created ${path26.relative(targetPath, targetFilePath)}`);
|
|
42215
42458
|
}
|
|
42216
42459
|
for (const template of otherAgentvTemplates) {
|
|
42217
|
-
const targetFilePath =
|
|
42218
|
-
const targetDirPath =
|
|
42460
|
+
const targetFilePath = path26.join(agentvDir, template.path);
|
|
42461
|
+
const targetDirPath = path26.dirname(targetFilePath);
|
|
42219
42462
|
if (!existsSync(targetDirPath)) {
|
|
42220
42463
|
mkdirSync(targetDirPath, { recursive: true });
|
|
42221
42464
|
}
|
|
42222
|
-
|
|
42223
|
-
console.log(`Created ${
|
|
42465
|
+
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
42466
|
+
console.log(`Created ${path26.relative(targetPath, targetFilePath)}`);
|
|
42224
42467
|
}
|
|
42225
42468
|
for (const template of claudeTemplates) {
|
|
42226
|
-
const targetFilePath =
|
|
42227
|
-
const targetDirPath =
|
|
42469
|
+
const targetFilePath = path26.join(claudeDir, template.path);
|
|
42470
|
+
const targetDirPath = path26.dirname(targetFilePath);
|
|
42228
42471
|
if (!existsSync(targetDirPath)) {
|
|
42229
42472
|
mkdirSync(targetDirPath, { recursive: true });
|
|
42230
42473
|
}
|
|
42231
|
-
|
|
42232
|
-
console.log(`Created ${
|
|
42474
|
+
writeFileSync2(targetFilePath, template.content, "utf-8");
|
|
42475
|
+
console.log(`Created ${path26.relative(targetPath, targetFilePath)}`);
|
|
42233
42476
|
}
|
|
42234
42477
|
console.log("\nAgentV initialized successfully!");
|
|
42235
42478
|
console.log("\nFiles installed to root:");
|
|
@@ -42237,17 +42480,17 @@ async function initCommand(options = {}) {
|
|
|
42237
42480
|
console.log(" - .env.template");
|
|
42238
42481
|
}
|
|
42239
42482
|
console.log(`
|
|
42240
|
-
Files installed to ${
|
|
42483
|
+
Files installed to ${path26.relative(targetPath, githubDir)}:`);
|
|
42241
42484
|
for (const t of githubTemplates) {
|
|
42242
42485
|
console.log(` - ${t.path}`);
|
|
42243
42486
|
}
|
|
42244
42487
|
console.log(`
|
|
42245
|
-
Files installed to ${
|
|
42488
|
+
Files installed to ${path26.relative(targetPath, agentvDir)}:`);
|
|
42246
42489
|
for (const t of otherAgentvTemplates) {
|
|
42247
42490
|
console.log(` - ${t.path}`);
|
|
42248
42491
|
}
|
|
42249
42492
|
console.log(`
|
|
42250
|
-
Files installed to ${
|
|
42493
|
+
Files installed to ${path26.relative(targetPath, claudeDir)}:`);
|
|
42251
42494
|
for (const t of claudeTemplates) {
|
|
42252
42495
|
console.log(` - ${t.path}`);
|
|
42253
42496
|
}
|
|
@@ -42256,12 +42499,12 @@ Files installed to ${path25.relative(targetPath, claudeDir)}:`);
|
|
|
42256
42499
|
console.log(" 2. Configure targets in .agentv/targets.yaml");
|
|
42257
42500
|
console.log(" 3. Create eval files using the schema and prompt templates");
|
|
42258
42501
|
}
|
|
42259
|
-
var initCmdTsCommand =
|
|
42502
|
+
var initCmdTsCommand = command4({
|
|
42260
42503
|
name: "init",
|
|
42261
42504
|
description: "Initialize AgentV in your project (installs prompt templates and schema to .github)",
|
|
42262
42505
|
args: {
|
|
42263
|
-
path:
|
|
42264
|
-
type:
|
|
42506
|
+
path: option4({
|
|
42507
|
+
type: optional5(string7),
|
|
42265
42508
|
long: "path",
|
|
42266
42509
|
description: "Target directory for initialization (default: current directory)"
|
|
42267
42510
|
})
|
|
@@ -42277,7 +42520,7 @@ var initCmdTsCommand = command3({
|
|
|
42277
42520
|
});
|
|
42278
42521
|
|
|
42279
42522
|
// src/commands/validate/index.ts
|
|
42280
|
-
import { command as
|
|
42523
|
+
import { command as command5, restPositionals as restPositionals2, string as string8 } from "cmd-ts";
|
|
42281
42524
|
|
|
42282
42525
|
// src/commands/validate/format-output.ts
|
|
42283
42526
|
var ANSI_RED3 = "\x1B[31m";
|
|
@@ -42362,7 +42605,7 @@ function isTTY2() {
|
|
|
42362
42605
|
// src/commands/validate/validate-files.ts
|
|
42363
42606
|
import { constants as constants7 } from "node:fs";
|
|
42364
42607
|
import { access as access7, readdir as readdir3, stat as stat5 } from "node:fs/promises";
|
|
42365
|
-
import
|
|
42608
|
+
import path27 from "node:path";
|
|
42366
42609
|
async function validateFiles(paths) {
|
|
42367
42610
|
const filePaths = await expandPaths(paths);
|
|
42368
42611
|
const results = [];
|
|
@@ -42380,7 +42623,7 @@ async function validateFiles(paths) {
|
|
|
42380
42623
|
};
|
|
42381
42624
|
}
|
|
42382
42625
|
async function validateSingleFile(filePath) {
|
|
42383
|
-
const absolutePath =
|
|
42626
|
+
const absolutePath = path27.resolve(filePath);
|
|
42384
42627
|
const fileType = await detectFileType(absolutePath);
|
|
42385
42628
|
let result;
|
|
42386
42629
|
if (fileType === "eval") {
|
|
@@ -42405,7 +42648,7 @@ async function validateSingleFile(filePath) {
|
|
|
42405
42648
|
async function expandPaths(paths) {
|
|
42406
42649
|
const expanded = [];
|
|
42407
42650
|
for (const inputPath of paths) {
|
|
42408
|
-
const absolutePath =
|
|
42651
|
+
const absolutePath = path27.resolve(inputPath);
|
|
42409
42652
|
try {
|
|
42410
42653
|
await access7(absolutePath, constants7.F_OK);
|
|
42411
42654
|
} catch {
|
|
@@ -42429,7 +42672,7 @@ async function findYamlFiles(dirPath) {
|
|
|
42429
42672
|
try {
|
|
42430
42673
|
const entries = await readdir3(dirPath, { withFileTypes: true });
|
|
42431
42674
|
for (const entry of entries) {
|
|
42432
|
-
const fullPath =
|
|
42675
|
+
const fullPath = path27.join(dirPath, entry.name);
|
|
42433
42676
|
if (entry.isDirectory()) {
|
|
42434
42677
|
if (entry.name === "node_modules" || entry.name.startsWith(".")) {
|
|
42435
42678
|
continue;
|
|
@@ -42446,7 +42689,7 @@ async function findYamlFiles(dirPath) {
|
|
|
42446
42689
|
return results;
|
|
42447
42690
|
}
|
|
42448
42691
|
function isYamlFile(filePath) {
|
|
42449
|
-
const ext =
|
|
42692
|
+
const ext = path27.extname(filePath).toLowerCase();
|
|
42450
42693
|
return ext === ".yaml" || ext === ".yml";
|
|
42451
42694
|
}
|
|
42452
42695
|
|
|
@@ -42463,12 +42706,12 @@ async function runValidateCommand(paths) {
|
|
|
42463
42706
|
process.exit(1);
|
|
42464
42707
|
}
|
|
42465
42708
|
}
|
|
42466
|
-
var validateCommand =
|
|
42709
|
+
var validateCommand = command5({
|
|
42467
42710
|
name: "validate",
|
|
42468
42711
|
description: "Validate AgentV eval and targets YAML files",
|
|
42469
42712
|
args: {
|
|
42470
42713
|
paths: restPositionals2({
|
|
42471
|
-
type:
|
|
42714
|
+
type: string8,
|
|
42472
42715
|
displayName: "paths",
|
|
42473
42716
|
description: "Files or directories to validate"
|
|
42474
42717
|
})
|
|
@@ -42484,16 +42727,17 @@ var validateCommand = command4({
|
|
|
42484
42727
|
});
|
|
42485
42728
|
|
|
42486
42729
|
// src/index.ts
|
|
42487
|
-
var packageJson = JSON.parse(
|
|
42730
|
+
var packageJson = JSON.parse(readFileSync3(new URL("../package.json", import.meta.url), "utf8"));
|
|
42488
42731
|
var app = subcommands2({
|
|
42489
42732
|
name: "agentv",
|
|
42490
42733
|
description: "AgentV CLI",
|
|
42491
42734
|
version: packageJson.version,
|
|
42492
42735
|
cmds: {
|
|
42736
|
+
convert: convertCommand,
|
|
42493
42737
|
eval: evalCommand,
|
|
42494
|
-
validate: validateCommand,
|
|
42495
42738
|
generate: generateCommand,
|
|
42496
|
-
init: initCmdTsCommand
|
|
42739
|
+
init: initCmdTsCommand,
|
|
42740
|
+
validate: validateCommand
|
|
42497
42741
|
}
|
|
42498
42742
|
});
|
|
42499
42743
|
async function runCli(argv = process.argv) {
|
|
@@ -42504,4 +42748,4 @@ export {
|
|
|
42504
42748
|
app,
|
|
42505
42749
|
runCli
|
|
42506
42750
|
};
|
|
42507
|
-
//# sourceMappingURL=chunk-
|
|
42751
|
+
//# sourceMappingURL=chunk-3RYQPI4H.js.map
|