agentv 1.5.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -2
- package/dist/{chunk-3RYQPI4H.js → chunk-HU4B6ODF.js} +1429 -369
- package/dist/chunk-HU4B6ODF.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +2 -3
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +115 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +34 -9
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -7
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -2
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +28 -2
- package/package.json +5 -2
- package/dist/chunk-3RYQPI4H.js.map +0 -1
|
@@ -141,14 +141,114 @@ var require_dist = __commonJS({
|
|
|
141
141
|
});
|
|
142
142
|
|
|
143
143
|
// src/index.ts
|
|
144
|
-
import { readFileSync as
|
|
144
|
+
import { readFileSync as readFileSync4 } from "node:fs";
|
|
145
145
|
import { binary, run, subcommands as subcommands2 } from "cmd-ts";
|
|
146
146
|
|
|
147
|
+
// src/commands/compare/index.ts
|
|
148
|
+
import { readFileSync } from "node:fs";
|
|
149
|
+
import { command, number, option, optional, positional, string } from "cmd-ts";
|
|
150
|
+
/**
 * Load evaluation results from a JSONL file.
 *
 * Every non-blank line must be a JSON object with a string `evalId` and a
 * numeric `score`; all other fields on the record are dropped.
 *
 * @param {string} filePath - Path of the JSONL file, read synchronously as UTF-8.
 * @returns {{ evalId: string, score: number }[]} One entry per non-blank line, in file order.
 * @throws {Error} If the file cannot be read, a line is not valid JSON, or a
 *   record lacks a string `evalId` or a numeric `score`.
 */
function loadJsonlResults(filePath) {
  const content = readFileSync(filePath, "utf8");
  const lines = content.trim().split("\n").filter((line2) => line2.trim());
  return lines.map((line2) => {
    const record2 = JSON.parse(line2);
    // A line such as `null` is valid JSON but not an object; optional chaining
    // routes it to the validation error below instead of a bare TypeError.
    if (typeof record2?.evalId !== "string") {
      throw new Error(`Missing evalId in result: ${line2}`);
    }
    if (typeof record2.score !== "number") {
      throw new Error(`Missing or invalid score in result: ${line2}`);
    }
    return { evalId: record2.evalId, score: record2.score };
  });
}
|
|
164
|
+
/**
 * Classify a score delta as a win, loss, or tie relative to a threshold.
 *
 * A strictly positive delta that reaches `threshold` is a "win", a strictly
 * negative delta that reaches `-threshold` is a "loss", and everything else
 * (including a delta of exactly zero) is a "tie".
 *
 * @param {number} delta - Score difference being classified.
 * @param {number} threshold - Minimum absolute delta that counts as a win or loss.
 * @returns {"win" | "loss" | "tie"} The classification.
 */
function classifyOutcome(delta, threshold) {
  // Deltas are produced by floating-point subtraction, so a nominal 0.1 can
  // arrive as 0.09999999999999998 (e.g. 0.9 - 0.8). The tolerance keeps such
  // values on the intended side of the threshold.
  const tolerance = 1e-9;
  if (delta > 0 && delta >= threshold - tolerance) return "win";
  if (delta < 0 && delta <= -threshold + tolerance) return "loss";
  return "tie";
}
|
|
169
|
+
/**
 * Pair up two result sets by `evalId` and summarise the score differences.
 *
 * Deltas are computed as `score2 - score1`. When an `evalId` occurs more than
 * once within one list, the last score for that id is the one compared.
 *
 * @param {{ evalId: string, score: number }[]} results1 - First result list.
 * @param {{ evalId: string, score: number }[]} results2 - Second result list.
 * @param {number} threshold - Threshold forwarded to `classifyOutcome`.
 * @returns {{ matched: object[], unmatched: { file1: number, file2: number }, summary: object }}
 *   Per-eval comparisons, counts of entries without a counterpart, and
 *   aggregate counts plus the mean delta rounded to three decimals.
 */
function compareResults(results1, results2, threshold) {
  const firstScores = new Map();
  for (const entry of results1) {
    firstScores.set(entry.evalId, entry.score);
  }
  const secondScores = new Map();
  for (const entry of results2) {
    secondScores.set(entry.evalId, entry.score);
  }

  const matched = [];
  const matchedIds = new Set();
  const tally = { win: 0, loss: 0, tie: 0 };
  let deltaSum = 0;
  for (const [evalId, score1] of firstScores) {
    const score2 = secondScores.get(evalId);
    if (score2 === undefined) {
      continue;
    }
    const delta = score2 - score1;
    const outcome = classifyOutcome(delta, threshold);
    matched.push({ evalId, score1, score2, delta, outcome });
    matchedIds.add(evalId);
    tally[outcome] = (tally[outcome] ?? 0) + 1;
    deltaSum += delta;
  }

  let unmatchedFile1 = 0;
  for (const entry of results1) {
    if (!matchedIds.has(entry.evalId)) unmatchedFile1 += 1;
  }
  let unmatchedFile2 = 0;
  for (const entry of results2) {
    if (!firstScores.has(entry.evalId)) unmatchedFile2 += 1;
  }

  // An empty match set reports a mean of 0 rather than dividing by zero.
  const meanDelta = matched.length > 0 ? deltaSum / matched.length : 0;

  return {
    matched,
    unmatched: { file1: unmatchedFile1, file2: unmatchedFile2 },
    summary: {
      total: results1.length + results2.length,
      matched: matched.length,
      wins: tally.win,
      losses: tally.loss,
      ties: tally.tie,
      meanDelta: Math.round(meanDelta * 1e3) / 1e3
    }
  };
}
|
|
207
|
+
/**
 * Map a mean score delta to a process exit code.
 *
 * @param {number} meanDelta - Mean delta from a comparison summary.
 * @returns {0 | 1} `0` when the mean delta is zero or positive, otherwise `1`
 *   (which includes `NaN`, since the comparison is then false).
 */
function determineExitCode(meanDelta) {
  if (meanDelta >= 0) {
    return 0;
  }
  return 1;
}
|
|
210
|
+
/**
 * `compare` subcommand: loads two JSONL result files, prints their comparison
 * as pretty-printed JSON on stdout, and exits with the code chosen by
 * `determineExitCode`. Any error is reported on stderr and exits with 1.
 */
var compareCommand = command({
  name: "compare",
  description: "Compare two evaluation result files and compute score differences",
  args: {
    result1: positional({
      type: string,
      displayName: "result1",
      description: "Path to first JSONL result file (baseline)"
    }),
    result2: positional({
      type: string,
      displayName: "result2",
      description: "Path to second JSONL result file (candidate)"
    }),
    threshold: option({
      type: optional(number),
      long: "threshold",
      short: "t",
      description: "Score delta threshold for win/loss classification (default: 0.1)"
    })
  },
  handler: async (parsedArgs) => {
    const { result1: baselinePath, result2: candidatePath } = parsedArgs;
    // Fall back to 0.1 only when the option was omitted; an explicit 0 is kept.
    const effectiveThreshold = parsedArgs.threshold ?? 0.1;
    try {
      const baseline = loadJsonlResults(baselinePath);
      const candidate = loadJsonlResults(candidatePath);
      const comparison = compareResults(baseline, candidate, effectiveThreshold);
      console.log(JSON.stringify(comparison, null, 2));
      process.exit(determineExitCode(comparison.summary.meanDelta));
    } catch (error40) {
      console.error(`Error: ${error40.message}`);
      process.exit(1);
    }
  }
});
|
|
246
|
+
|
|
147
247
|
// src/commands/convert/index.ts
|
|
148
|
-
import { readFileSync, writeFileSync } from "node:fs";
|
|
248
|
+
import { readFileSync as readFileSync2, writeFileSync } from "node:fs";
|
|
149
249
|
import path14 from "node:path";
|
|
150
250
|
|
|
151
|
-
// ../../packages/core/dist/chunk-
|
|
251
|
+
// ../../packages/core/dist/chunk-E2VSU4WZ.js
|
|
152
252
|
import { constants } from "node:fs";
|
|
153
253
|
import { access, readFile } from "node:fs/promises";
|
|
154
254
|
import path from "node:path";
|
|
@@ -1033,8 +1133,8 @@ var ZodType = class {
|
|
|
1033
1133
|
promise() {
|
|
1034
1134
|
return ZodPromise.create(this, this._def);
|
|
1035
1135
|
}
|
|
1036
|
-
or(
|
|
1037
|
-
return ZodUnion.create([this,
|
|
1136
|
+
or(option6) {
|
|
1137
|
+
return ZodUnion.create([this, option6], this._def);
|
|
1038
1138
|
}
|
|
1039
1139
|
and(incoming) {
|
|
1040
1140
|
return ZodIntersection.create(this, incoming, this._def);
|
|
@@ -2884,7 +2984,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2884
2984
|
return INVALID;
|
|
2885
2985
|
}
|
|
2886
2986
|
if (ctx.common.async) {
|
|
2887
|
-
return Promise.all(options.map(async (
|
|
2987
|
+
return Promise.all(options.map(async (option6) => {
|
|
2888
2988
|
const childCtx = {
|
|
2889
2989
|
...ctx,
|
|
2890
2990
|
common: {
|
|
@@ -2894,7 +2994,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2894
2994
|
parent: null
|
|
2895
2995
|
};
|
|
2896
2996
|
return {
|
|
2897
|
-
result: await
|
|
2997
|
+
result: await option6._parseAsync({
|
|
2898
2998
|
data: ctx.data,
|
|
2899
2999
|
path: ctx.path,
|
|
2900
3000
|
parent: childCtx
|
|
@@ -2905,7 +3005,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2905
3005
|
} else {
|
|
2906
3006
|
let dirty = void 0;
|
|
2907
3007
|
const issues = [];
|
|
2908
|
-
for (const
|
|
3008
|
+
for (const option6 of options) {
|
|
2909
3009
|
const childCtx = {
|
|
2910
3010
|
...ctx,
|
|
2911
3011
|
common: {
|
|
@@ -2914,7 +3014,7 @@ var ZodUnion = class extends ZodType {
|
|
|
2914
3014
|
},
|
|
2915
3015
|
parent: null
|
|
2916
3016
|
};
|
|
2917
|
-
const result =
|
|
3017
|
+
const result = option6._parseSync({
|
|
2918
3018
|
data: ctx.data,
|
|
2919
3019
|
path: ctx.path,
|
|
2920
3020
|
parent: childCtx
|
|
@@ -2995,8 +3095,8 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
|
|
|
2995
3095
|
}
|
|
2996
3096
|
const discriminator = this.discriminator;
|
|
2997
3097
|
const discriminatorValue = ctx.data[discriminator];
|
|
2998
|
-
const
|
|
2999
|
-
if (!
|
|
3098
|
+
const option6 = this.optionsMap.get(discriminatorValue);
|
|
3099
|
+
if (!option6) {
|
|
3000
3100
|
addIssueToContext(ctx, {
|
|
3001
3101
|
code: ZodIssueCode.invalid_union_discriminator,
|
|
3002
3102
|
options: Array.from(this.optionsMap.keys()),
|
|
@@ -3005,13 +3105,13 @@ var ZodDiscriminatedUnion = class _ZodDiscriminatedUnion extends ZodType {
|
|
|
3005
3105
|
return INVALID;
|
|
3006
3106
|
}
|
|
3007
3107
|
if (ctx.common.async) {
|
|
3008
|
-
return
|
|
3108
|
+
return option6._parseAsync({
|
|
3009
3109
|
data: ctx.data,
|
|
3010
3110
|
path: ctx.path,
|
|
3011
3111
|
parent: ctx
|
|
3012
3112
|
});
|
|
3013
3113
|
} else {
|
|
3014
|
-
return
|
|
3114
|
+
return option6._parseSync({
|
|
3015
3115
|
data: ctx.data,
|
|
3016
3116
|
path: ctx.path,
|
|
3017
3117
|
parent: ctx
|
|
@@ -4195,7 +4295,7 @@ var coerce = {
|
|
|
4195
4295
|
};
|
|
4196
4296
|
var NEVER = INVALID;
|
|
4197
4297
|
|
|
4198
|
-
// ../../packages/core/dist/chunk-
|
|
4298
|
+
// ../../packages/core/dist/chunk-E2VSU4WZ.js
|
|
4199
4299
|
async function fileExists(filePath) {
|
|
4200
4300
|
try {
|
|
4201
4301
|
await access(filePath, constants.F_OK);
|
|
@@ -4302,6 +4402,161 @@ async function resolveFileReference(rawValue, searchRoots) {
|
|
|
4302
4402
|
}
|
|
4303
4403
|
return { displayPath, attempted };
|
|
4304
4404
|
}
|
|
4405
|
+
// Raw healthcheck settings for an HTTP probe. The timeout is accepted under
// both snake_case and camelCase keys.
var CliHealthcheckHttpInputSchema = (() => {
  const z = external_exports;
  const optionalPositiveNumber = () => z.number().positive().optional();
  return z.object({
    type: z.literal("http"),
    url: z.string().min(1, "healthcheck URL is required"),
    timeout_seconds: optionalPositiveNumber(),
    timeoutSeconds: optionalPositiveNumber()
  });
})();

// Raw healthcheck settings for a command probe. Both spellings of the command
// template are optional at the schema level.
var CliHealthcheckCommandInputSchema = (() => {
  const z = external_exports;
  const optionalString = () => z.string().optional();
  const optionalPositiveNumber = () => z.number().positive().optional();
  return z.object({
    type: z.literal("command"),
    command_template: optionalString(),
    commandTemplate: optionalString(),
    cwd: optionalString(),
    timeout_seconds: optionalPositiveNumber(),
    timeoutSeconds: optionalPositiveNumber()
  });
})();

// Either healthcheck variant, selected by its `type` field.
var CliHealthcheckInputSchema = external_exports.discriminatedUnion("type", [
  CliHealthcheckHttpInputSchema,
  CliHealthcheckCommandInputSchema
]);
|
|
4423
|
+
// Raw CLI target definition as written by the user. Most settings are accepted
// under both snake_case and camelCase keys; the final refinement requires at
// least one spelling of the command template to be present.
var CliTargetInputSchema = (() => {
  const z = external_exports;
  const optionalString = () => z.string().optional();
  const optionalBoolean = () => z.boolean().optional();
  const optionalPositiveNumber = () => z.number().positive().optional();

  const fields = z.object({
    name: z.string().min(1, "target name is required"),
    provider: z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
    // Command template - required (accept both naming conventions)
    command_template: optionalString(),
    commandTemplate: optionalString(),
    // Files format - optional
    files_format: optionalString(),
    filesFormat: optionalString(),
    attachments_format: optionalString(),
    attachmentsFormat: optionalString(),
    // Working directory - optional
    cwd: optionalString(),
    // Timeout in seconds - optional
    timeout_seconds: optionalPositiveNumber(),
    timeoutSeconds: optionalPositiveNumber(),
    // Healthcheck configuration - optional
    healthcheck: CliHealthcheckInputSchema.optional(),
    // Verbose mode - optional
    verbose: optionalBoolean(),
    cli_verbose: optionalBoolean(),
    cliVerbose: optionalBoolean(),
    // Keep temp files - optional
    keep_temp_files: optionalBoolean(),
    keepTempFiles: optionalBoolean(),
    keep_output_files: optionalBoolean(),
    keepOutputFiles: optionalBoolean(),
    // Common target fields
    judge_target: optionalString(),
    workers: z.number().int().min(1).optional(),
    provider_batching: optionalBoolean(),
    providerBatching: optionalBoolean()
  });

  const hasCommandTemplate = (data) =>
    data.command_template !== undefined || data.commandTemplate !== undefined;

  return fields.refine(hasCommandTemplate, {
    message: "Either command_template or commandTemplate is required"
  });
})();
|
|
4458
|
+
// Normalized HTTP healthcheck: camelCase keys only, timeout in milliseconds,
// unknown keys rejected by `.strict()`.
var CliHealthcheckHttpSchema = (() => {
  const z = external_exports;
  return z.object({
    type: z.literal("http"),
    url: z.string().min(1),
    timeoutMs: z.number().positive().optional()
  }).strict();
})();

// Normalized command healthcheck; the command template is mandatory here.
var CliHealthcheckCommandSchema = (() => {
  const z = external_exports;
  return z.object({
    type: z.literal("command"),
    commandTemplate: z.string().min(1),
    cwd: z.string().optional(),
    timeoutMs: z.number().positive().optional()
  }).strict();
})();

// Either normalized healthcheck variant, selected by its `type` field.
var CliHealthcheckSchema = external_exports.discriminatedUnion("type", [
  CliHealthcheckHttpSchema,
  CliHealthcheckCommandSchema
]);

// Normalized CLI target configuration; unknown keys are rejected.
var CliTargetConfigSchema = (() => {
  const z = external_exports;
  const optionalString = () => z.string().optional();
  const optionalBoolean = () => z.boolean().optional();
  return z.object({
    commandTemplate: z.string().min(1),
    filesFormat: optionalString(),
    cwd: optionalString(),
    timeoutMs: z.number().positive().optional(),
    healthcheck: CliHealthcheckSchema.optional(),
    verbose: optionalBoolean(),
    keepTempFiles: optionalBoolean()
  }).strict();
})();
|
|
4482
|
+
/**
 * Convert a raw healthcheck definition into its normalized form.
 *
 * The timeout is read from `timeout_seconds` or `timeoutSeconds` and converted
 * to whole milliseconds. For a command healthcheck, a relative `cwd` is
 * resolved against the directory of `evalFilePath` when that path is given.
 *
 * @param {object} input - Raw healthcheck with `type` "http" or "command".
 * @param {object} env - Environment passed through to the string resolvers.
 * @param {string} targetName - Target name used as a prefix in error descriptions.
 * @param {string} [evalFilePath] - Eval file path used to anchor a relative `cwd`.
 * @returns {object} `{ type: "http", url, timeoutMs }` or
 *   `{ type: "command", commandTemplate, cwd, timeoutMs }`.
 * @throws {Error} If a command healthcheck has neither command template key.
 */
function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
  const seconds = input.timeout_seconds ?? input.timeoutSeconds;
  let timeoutMs;
  if (seconds !== undefined) {
    timeoutMs = Math.floor(seconds * 1e3);
  }

  if (input.type === "http") {
    return {
      type: "http",
      url: resolveString(input.url, env, `${targetName} healthcheck URL`),
      timeoutMs
    };
  }

  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === undefined) {
    throw new Error(
      `${targetName} healthcheck: Either command_template or commandTemplate is required for command healthcheck`
    );
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} healthcheck command template`,
    true
  );

  const configuredCwd = resolveOptionalString(input.cwd, env, `${targetName} healthcheck cwd`, {
    allowLiteral: true,
    optionalEnv: true
  });
  // Only a relative cwd is re-anchored, and only when the eval file location is known.
  const cwd = configuredCwd && evalFilePath && !path2.isAbsolute(configuredCwd)
    ? path2.resolve(path2.dirname(path2.resolve(evalFilePath)), configuredCwd)
    : configuredCwd;

  return {
    type: "command",
    commandTemplate,
    cwd,
    timeoutMs
  };
}
|
|
4519
|
+
/**
 * Convert a raw CLI target definition into its normalized configuration.
 *
 * For each setting the snake_case key takes precedence over the camelCase key
 * (and `files_format`/`filesFormat` over the `attachments*` keys,
 * `keep_temp_files`/`keepTempFiles` over the `keep_output*` keys). The working
 * directory defaults to the eval file's directory, and a relative `cwd` is
 * resolved against that directory, when `evalFilePath` is given.
 *
 * @param {object} input - Raw CLI target definition.
 * @param {object} env - Environment passed through to the string resolvers.
 * @param {string} [evalFilePath] - Eval file path used to anchor the working directory.
 * @returns {object} `{ commandTemplate, filesFormat, cwd, timeoutMs, healthcheck, verbose, keepTempFiles }`.
 * @throws {Error} If neither command template key is present.
 */
function normalizeCliTargetInput(input, env, evalFilePath) {
  const targetName = input.name;

  const templateSource = input.command_template ?? input.commandTemplate;
  if (templateSource === undefined) {
    throw new Error(`${targetName}: Either command_template or commandTemplate is required`);
  }
  const commandTemplate = resolveString(
    templateSource,
    env,
    `${targetName} CLI command template`,
    true
  );

  const filesFormat = resolveOptionalLiteralString(
    input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat
  );

  let cwd = resolveOptionalString(input.cwd, env, `${targetName} working directory`, {
    allowLiteral: true,
    optionalEnv: true
  });
  if (evalFilePath) {
    const evalDir = () => path2.dirname(path2.resolve(evalFilePath));
    if (!cwd) {
      cwd = evalDir();
    } else if (!path2.isAbsolute(cwd)) {
      cwd = path2.resolve(evalDir(), cwd);
    }
  }

  const seconds = input.timeout_seconds ?? input.timeoutSeconds;
  let timeoutMs;
  if (seconds !== undefined) {
    timeoutMs = Math.floor(seconds * 1e3);
  }

  const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
  const keepTempFiles = resolveOptionalBoolean(
    input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
  );

  let healthcheck;
  if (input.healthcheck) {
    healthcheck = normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath);
  }

  return {
    commandTemplate,
    filesFormat,
    cwd,
    timeoutMs,
    healthcheck,
    verbose,
    keepTempFiles
  };
}
|
|
4305
4560
|
var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
4306
4561
|
"PROMPT",
|
|
4307
4562
|
"GUIDELINES",
|
|
@@ -4407,6 +4662,16 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
4407
4662
|
providerBatching,
|
|
4408
4663
|
config: resolveCodexConfig(parsed, env)
|
|
4409
4664
|
};
|
|
4665
|
+
case "pi":
|
|
4666
|
+
case "pi-coding-agent":
|
|
4667
|
+
return {
|
|
4668
|
+
kind: "pi-coding-agent",
|
|
4669
|
+
name: parsed.name,
|
|
4670
|
+
judgeTarget: parsed.judge_target,
|
|
4671
|
+
workers: parsed.workers,
|
|
4672
|
+
providerBatching,
|
|
4673
|
+
config: resolvePiCodingAgentConfig(parsed, env)
|
|
4674
|
+
};
|
|
4410
4675
|
case "mock":
|
|
4411
4676
|
return {
|
|
4412
4677
|
kind: "mock",
|
|
@@ -4515,6 +4780,7 @@ function resolveCodexConfig(target, env) {
|
|
|
4515
4780
|
const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
|
|
4516
4781
|
const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
|
|
4517
4782
|
const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
|
|
4783
|
+
const systemPromptSource = target.system_prompt ?? target.systemPrompt;
|
|
4518
4784
|
const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
|
|
4519
4785
|
allowLiteral: true,
|
|
4520
4786
|
optionalEnv: true
|
|
@@ -4530,13 +4796,15 @@ function resolveCodexConfig(target, env) {
|
|
|
4530
4796
|
optionalEnv: true
|
|
4531
4797
|
});
|
|
4532
4798
|
const logFormat = normalizeCodexLogFormat(logFormatSource);
|
|
4799
|
+
const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
|
|
4533
4800
|
return {
|
|
4534
4801
|
executable,
|
|
4535
4802
|
args,
|
|
4536
4803
|
cwd,
|
|
4537
4804
|
timeoutMs,
|
|
4538
4805
|
logDir,
|
|
4539
|
-
logFormat
|
|
4806
|
+
logFormat,
|
|
4807
|
+
systemPrompt
|
|
4540
4808
|
};
|
|
4541
4809
|
}
|
|
4542
4810
|
function normalizeCodexLogFormat(value) {
|
|
@@ -4552,6 +4820,70 @@ function normalizeCodexLogFormat(value) {
|
|
|
4552
4820
|
}
|
|
4553
4821
|
throw new Error("codex log format must be 'summary' or 'json'");
|
|
4554
4822
|
}
|
|
4823
|
+
/**
 * Build the configuration object for a "pi-coding-agent" target.
 *
 * Each setting is looked up under several alternative keys on `target`; the
 * first key that is neither null nor undefined wins. The executable falls
 * back to "pi" when nothing is configured. A log format other than "json" or
 * "summary" is dropped, and a blank system prompt is treated as absent.
 *
 * @param {object} target - Raw target definition.
 * @param {object} env - Environment passed through to the string resolvers.
 * @returns {object} `{ executable, provider, model, apiKey, tools, thinking,
 *   args, cwd, timeoutMs, logDir, logFormat, systemPrompt }`.
 */
function resolvePiCodingAgentConfig(target, env) {
  const sources = {
    executable: target.executable ?? target.command ?? target.binary,
    provider: target.pi_provider ?? target.piProvider ?? target.llm_provider,
    model: target.model ?? target.pi_model ?? target.piModel,
    apiKey: target.api_key ?? target.apiKey,
    tools: target.tools ?? target.pi_tools ?? target.piTools,
    thinking: target.thinking ?? target.pi_thinking ?? target.piThinking,
    args: target.args ?? target.arguments,
    cwd: target.cwd,
    timeout: target.timeout_seconds ?? target.timeoutSeconds,
    logDir: target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory,
    logFormat: target.log_format ?? target.logFormat,
    systemPrompt: target.system_prompt ?? target.systemPrompt
  };

  // Shared shape of the resolver calls below; only the api key disallows literals.
  const resolveSetting = (source2, what, allowLiteral = true) =>
    resolveOptionalString(source2, env, `${target.name} pi ${what}`, {
      allowLiteral,
      optionalEnv: true
    });

  const executable = resolveSetting(sources.executable, "executable") ?? "pi";
  const provider = resolveSetting(sources.provider, "provider");
  const model = resolveSetting(sources.model, "model");
  const apiKey = resolveSetting(sources.apiKey, "api key", false);
  const tools = resolveSetting(sources.tools, "tools");
  const thinking = resolveSetting(sources.thinking, "thinking");
  const args = resolveOptionalStringArray(sources.args, env, `${target.name} pi args`);
  const cwd = resolveSetting(sources.cwd, "cwd");
  const timeoutMs = resolveTimeoutMs(sources.timeout, `${target.name} pi timeout`);
  const logDir = resolveSetting(sources.logDir, "log directory");

  let logFormat;
  if (sources.logFormat === "json" || sources.logFormat === "summary") {
    logFormat = sources.logFormat;
  }

  let systemPrompt;
  if (typeof sources.systemPrompt === "string" && sources.systemPrompt.trim().length > 0) {
    systemPrompt = sources.systemPrompt.trim();
  }

  return {
    executable,
    provider,
    model,
    apiKey,
    tools,
    thinking,
    args,
    cwd,
    timeoutMs,
    logDir,
    logFormat,
    systemPrompt
  };
}
|
|
4555
4887
|
function resolveMockConfig(target) {
|
|
4556
4888
|
const response = typeof target.response === "string" ? target.response : void 0;
|
|
4557
4889
|
return { response };
|
|
@@ -4574,9 +4906,9 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4574
4906
|
const dryRunSource = target.dry_run ?? target.dryRun;
|
|
4575
4907
|
const subagentRootSource = target.subagent_root ?? target.subagentRoot;
|
|
4576
4908
|
const defaultCommand = insiders ? "code-insiders" : "code";
|
|
4577
|
-
const
|
|
4909
|
+
const command7 = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
|
|
4578
4910
|
return {
|
|
4579
|
-
command:
|
|
4911
|
+
command: command7,
|
|
4580
4912
|
waitForResponse: resolveOptionalBoolean(waitSource) ?? true,
|
|
4581
4913
|
dryRun: resolveOptionalBoolean(dryRunSource) ?? false,
|
|
4582
4914
|
subagentRoot: resolveOptionalString(subagentRootSource, env, `${target.name} subagent root`, {
|
|
@@ -4586,46 +4918,35 @@ function resolveVSCodeConfig(target, env, insiders) {
|
|
|
4586
4918
|
workspaceTemplate
|
|
4587
4919
|
};
|
|
4588
4920
|
}
|
|
4589
|
-
|
|
4590
|
-
|
|
4591
|
-
|
|
4592
|
-
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
4593
|
-
);
|
|
4594
|
-
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
4595
|
-
const keepTempFiles = resolveOptionalBoolean(
|
|
4596
|
-
target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
|
|
4597
|
-
);
|
|
4598
|
-
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
4599
|
-
allowLiteral: true,
|
|
4600
|
-
optionalEnv: true
|
|
4601
|
-
});
|
|
4602
|
-
if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
|
|
4603
|
-
cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
|
|
4921
|
+
/**
 * Error map handed to the CLI target schema parse: rewrites selected issue
 * codes into CLI-specific messages and keeps the default message otherwise.
 *
 * @param {object} issue2 - Validation issue (reads `code`, `keys`, `expected`).
 * @param {object} ctx - Error-map context (reads `defaultError`).
 * @returns {{ message: string }} The message to report for the issue.
 */
var cliErrorMap = (issue2, ctx) => {
  const codes = external_exports.ZodIssueCode;
  switch (issue2.code) {
    case codes.unrecognized_keys:
      return { message: `Unknown CLI provider settings: ${issue2.keys.join(", ")}` };
    case codes.invalid_union_discriminator:
      return { message: "healthcheck type must be 'http' or 'command'" };
    case codes.invalid_type:
      if (issue2.expected === "string") {
        return { message: `${ctx.defaultError} (expected a string value)` };
      }
      break;
    default:
      break;
  }
  return { message: ctx.defaultError };
};
|
|
4933
|
+
/**
 * Validate and normalize a CLI target definition.
 *
 * The target is parsed with `CliTargetInputSchema`; only the first validation
 * issue is reported, prefixed with the target name and, when present, the
 * dotted path of the offending field. The normalized command templates are
 * then checked for supported placeholders.
 *
 * @param {object} target - Raw CLI target definition.
 * @param {object} env - Environment passed through to normalization.
 * @param {string} [evalFilePath] - Eval file path passed through to normalization.
 * @returns {object} The normalized CLI configuration.
 * @throws {Error} If validation fails; normalization and the placeholder
 *   checks may also throw.
 */
function resolveCliConfig(target, env, evalFilePath) {
  const parsed = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
  if (!parsed.success) {
    const firstIssue = parsed.error.errors[0];
    const fieldPath = firstIssue?.path.join(".") || "";
    const subject = fieldPath ? `${target.name} ${fieldPath}` : `${target.name}`;
    throw new Error(`${subject}: ${firstIssue?.message}`);
  }

  const config = normalizeCliTargetInput(parsed.data, env, evalFilePath);
  assertSupportedCliPlaceholders(config.commandTemplate, `${target.name} CLI command template`);

  const { healthcheck } = config;
  if (healthcheck?.type === "command") {
    assertSupportedCliPlaceholders(
      healthcheck.commandTemplate,
      `${target.name} healthcheck command template`
    );
  }
  return config;
}
|
|
4630
4951
|
function resolveTimeoutMs(source2, description) {
|
|
4631
4952
|
const seconds = resolveOptionalNumber(source2, `${description} (seconds)`);
|
|
@@ -4637,49 +4958,6 @@ function resolveTimeoutMs(source2, description) {
|
|
|
4637
4958
|
}
|
|
4638
4959
|
return Math.floor(seconds * 1e3);
|
|
4639
4960
|
}
|
|
4640
|
-
function resolveCliHealthcheck(source2, env, targetName, evalFilePath) {
|
|
4641
|
-
if (source2 === void 0 || source2 === null) {
|
|
4642
|
-
return void 0;
|
|
4643
|
-
}
|
|
4644
|
-
if (typeof source2 !== "object" || Array.isArray(source2)) {
|
|
4645
|
-
throw new Error(`${targetName} healthcheck must be an object`);
|
|
4646
|
-
}
|
|
4647
|
-
const candidate = source2;
|
|
4648
|
-
const type = candidate.type;
|
|
4649
|
-
const timeoutMs = resolveTimeoutMs(
|
|
4650
|
-
candidate.timeout_seconds ?? candidate.timeoutSeconds,
|
|
4651
|
-
`${targetName} healthcheck timeout`
|
|
4652
|
-
);
|
|
4653
|
-
if (type === "http") {
|
|
4654
|
-
const url2 = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
|
|
4655
|
-
return {
|
|
4656
|
-
type: "http",
|
|
4657
|
-
url: url2,
|
|
4658
|
-
timeoutMs
|
|
4659
|
-
};
|
|
4660
|
-
}
|
|
4661
|
-
if (type === "command") {
|
|
4662
|
-
const commandTemplate = resolveString(
|
|
4663
|
-
candidate.command_template ?? candidate.commandTemplate,
|
|
4664
|
-
env,
|
|
4665
|
-
`${targetName} healthcheck command template`,
|
|
4666
|
-
true
|
|
4667
|
-
);
|
|
4668
|
-
assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
|
|
4669
|
-
const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
|
|
4670
|
-
allowLiteral: true,
|
|
4671
|
-
optionalEnv: true
|
|
4672
|
-
});
|
|
4673
|
-
const resolvedCwd = cwd && evalFilePath && !path2.isAbsolute(cwd) ? path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd) : cwd;
|
|
4674
|
-
return {
|
|
4675
|
-
type: "command",
|
|
4676
|
-
commandTemplate,
|
|
4677
|
-
timeoutMs,
|
|
4678
|
-
cwd: resolvedCwd
|
|
4679
|
-
};
|
|
4680
|
-
}
|
|
4681
|
-
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
4682
|
-
}
|
|
4683
4961
|
function assertSupportedCliPlaceholders(template, description) {
|
|
4684
4962
|
const placeholders = extractCliPlaceholders(template);
|
|
4685
4963
|
for (const placeholder of placeholders) {
|
|
@@ -4845,6 +5123,7 @@ function resolveOptionalNumberArray(source2, description) {
|
|
|
4845
5123
|
}
|
|
4846
5124
|
var AGENT_PROVIDER_KINDS = [
|
|
4847
5125
|
"codex",
|
|
5126
|
+
"pi-coding-agent",
|
|
4848
5127
|
"vscode",
|
|
4849
5128
|
"vscode-insiders"
|
|
4850
5129
|
];
|
|
@@ -4853,6 +5132,7 @@ var KNOWN_PROVIDERS = [
|
|
|
4853
5132
|
"anthropic",
|
|
4854
5133
|
"gemini",
|
|
4855
5134
|
"codex",
|
|
5135
|
+
"pi-coding-agent",
|
|
4856
5136
|
"cli",
|
|
4857
5137
|
"mock",
|
|
4858
5138
|
"vscode",
|
|
@@ -4867,6 +5147,8 @@ var PROVIDER_ALIASES = [
|
|
|
4867
5147
|
// alias for "gemini"
|
|
4868
5148
|
"codex-cli",
|
|
4869
5149
|
// alias for "codex"
|
|
5150
|
+
"pi",
|
|
5151
|
+
// alias for "pi-coding-agent"
|
|
4870
5152
|
"openai",
|
|
4871
5153
|
// legacy/future support
|
|
4872
5154
|
"bedrock",
|
|
@@ -5502,9 +5784,9 @@ __export(external_exports2, {
|
|
|
5502
5784
|
null: () => _null3,
|
|
5503
5785
|
nullable: () => nullable,
|
|
5504
5786
|
nullish: () => nullish2,
|
|
5505
|
-
number: () =>
|
|
5787
|
+
number: () => number3,
|
|
5506
5788
|
object: () => object,
|
|
5507
|
-
optional: () =>
|
|
5789
|
+
optional: () => optional2,
|
|
5508
5790
|
overwrite: () => _overwrite,
|
|
5509
5791
|
parse: () => parse2,
|
|
5510
5792
|
parseAsync: () => parseAsync2,
|
|
@@ -5529,7 +5811,7 @@ __export(external_exports2, {
|
|
|
5529
5811
|
size: () => _size,
|
|
5530
5812
|
startsWith: () => _startsWith,
|
|
5531
5813
|
strictObject: () => strictObject,
|
|
5532
|
-
string: () =>
|
|
5814
|
+
string: () => string3,
|
|
5533
5815
|
stringFormat: () => stringFormat,
|
|
5534
5816
|
stringbool: () => stringbool,
|
|
5535
5817
|
success: () => success,
|
|
@@ -6614,9 +6896,9 @@ __export(regexes_exports, {
|
|
|
6614
6896
|
lowercase: () => lowercase,
|
|
6615
6897
|
nanoid: () => nanoid,
|
|
6616
6898
|
null: () => _null,
|
|
6617
|
-
number: () =>
|
|
6899
|
+
number: () => number2,
|
|
6618
6900
|
rfc5322Email: () => rfc5322Email,
|
|
6619
|
-
string: () =>
|
|
6901
|
+
string: () => string2,
|
|
6620
6902
|
time: () => time,
|
|
6621
6903
|
ulid: () => ulid,
|
|
6622
6904
|
undefined: () => _undefined,
|
|
@@ -6683,13 +6965,13 @@ function datetime(args) {
|
|
|
6683
6965
|
const timeRegex2 = `${time3}(?:${opts.join("|")})`;
|
|
6684
6966
|
return new RegExp(`^${dateSource}T(?:${timeRegex2})$`);
|
|
6685
6967
|
}
|
|
6686
|
-
var
|
|
6968
|
+
var string2 = (params) => {
|
|
6687
6969
|
const regex = params ? `[\\s\\S]{${params?.minimum ?? 0},${params?.maximum ?? ""}}` : `[\\s\\S]*`;
|
|
6688
6970
|
return new RegExp(`^${regex}$`);
|
|
6689
6971
|
};
|
|
6690
6972
|
var bigint = /^\d+n?$/;
|
|
6691
6973
|
var integer = /^\d+$/;
|
|
6692
|
-
var
|
|
6974
|
+
var number2 = /^-?\d+(?:\.\d+)?/i;
|
|
6693
6975
|
var boolean = /true|false/i;
|
|
6694
6976
|
var _null = /null/i;
|
|
6695
6977
|
var _undefined = /undefined/i;
|
|
@@ -7364,7 +7646,7 @@ var $ZodType = /* @__PURE__ */ $constructor("$ZodType", (inst, def) => {
|
|
|
7364
7646
|
});
|
|
7365
7647
|
var $ZodString = /* @__PURE__ */ $constructor("$ZodString", (inst, def) => {
|
|
7366
7648
|
$ZodType.init(inst, def);
|
|
7367
|
-
inst._zod.pattern = [...inst?._zod.bag?.patterns ?? []].pop() ??
|
|
7649
|
+
inst._zod.pattern = [...inst?._zod.bag?.patterns ?? []].pop() ?? string2(inst._zod.bag);
|
|
7368
7650
|
inst._zod.parse = (payload, _) => {
|
|
7369
7651
|
if (def.coerce)
|
|
7370
7652
|
try {
|
|
@@ -7677,7 +7959,7 @@ var $ZodCustomStringFormat = /* @__PURE__ */ $constructor("$ZodCustomStringForma
|
|
|
7677
7959
|
});
|
|
7678
7960
|
var $ZodNumber = /* @__PURE__ */ $constructor("$ZodNumber", (inst, def) => {
|
|
7679
7961
|
$ZodType.init(inst, def);
|
|
7680
|
-
inst._zod.pattern = inst._zod.bag.pattern ??
|
|
7962
|
+
inst._zod.pattern = inst._zod.bag.pattern ?? number2;
|
|
7681
7963
|
inst._zod.parse = (payload, _ctx) => {
|
|
7682
7964
|
if (def.coerce)
|
|
7683
7965
|
try {
|
|
@@ -8104,7 +8386,7 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
|
|
|
8104
8386
|
defineLazy(inst._zod, "optout", () => def.options.some((o) => o._zod.optout === "optional") ? "optional" : void 0);
|
|
8105
8387
|
defineLazy(inst._zod, "values", () => {
|
|
8106
8388
|
if (def.options.every((o) => o._zod.values)) {
|
|
8107
|
-
return new Set(def.options.flatMap((
|
|
8389
|
+
return new Set(def.options.flatMap((option6) => Array.from(option6._zod.values)));
|
|
8108
8390
|
}
|
|
8109
8391
|
return void 0;
|
|
8110
8392
|
});
|
|
@@ -8118,8 +8400,8 @@ var $ZodUnion = /* @__PURE__ */ $constructor("$ZodUnion", (inst, def) => {
|
|
|
8118
8400
|
inst._zod.parse = (payload, ctx) => {
|
|
8119
8401
|
let async = false;
|
|
8120
8402
|
const results = [];
|
|
8121
|
-
for (const
|
|
8122
|
-
const result =
|
|
8403
|
+
for (const option6 of def.options) {
|
|
8404
|
+
const result = option6._zod.run({
|
|
8123
8405
|
value: payload.value,
|
|
8124
8406
|
issues: []
|
|
8125
8407
|
}, ctx);
|
|
@@ -8144,10 +8426,10 @@ var $ZodDiscriminatedUnion = /* @__PURE__ */ $constructor("$ZodDiscriminatedUnio
|
|
|
8144
8426
|
const _super = inst._zod.parse;
|
|
8145
8427
|
defineLazy(inst._zod, "propValues", () => {
|
|
8146
8428
|
const propValues = {};
|
|
8147
|
-
for (const
|
|
8148
|
-
const pv =
|
|
8429
|
+
for (const option6 of def.options) {
|
|
8430
|
+
const pv = option6._zod.propValues;
|
|
8149
8431
|
if (!pv || Object.keys(pv).length === 0)
|
|
8150
|
-
throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(
|
|
8432
|
+
throw new Error(`Invalid discriminated union option at index "${def.options.indexOf(option6)}"`);
|
|
8151
8433
|
for (const [k, v] of Object.entries(pv)) {
|
|
8152
8434
|
if (!propValues[k])
|
|
8153
8435
|
propValues[k] = /* @__PURE__ */ new Set();
|
|
@@ -15351,8 +15633,8 @@ function isTransforming(_schema, _ctx) {
|
|
|
15351
15633
|
return false;
|
|
15352
15634
|
}
|
|
15353
15635
|
case "union": {
|
|
15354
|
-
for (const
|
|
15355
|
-
if (isTransforming(
|
|
15636
|
+
for (const option6 of def.options) {
|
|
15637
|
+
if (isTransforming(option6, ctx))
|
|
15356
15638
|
return true;
|
|
15357
15639
|
}
|
|
15358
15640
|
return false;
|
|
@@ -15529,9 +15811,9 @@ var ZodType2 = /* @__PURE__ */ $constructor("ZodType", (inst, def) => {
|
|
|
15529
15811
|
inst.refine = (check2, params) => inst.check(refine(check2, params));
|
|
15530
15812
|
inst.superRefine = (refinement) => inst.check(superRefine(refinement));
|
|
15531
15813
|
inst.overwrite = (fn) => inst.check(_overwrite(fn));
|
|
15532
|
-
inst.optional = () =>
|
|
15814
|
+
inst.optional = () => optional2(inst);
|
|
15533
15815
|
inst.nullable = () => nullable(inst);
|
|
15534
|
-
inst.nullish = () =>
|
|
15816
|
+
inst.nullish = () => optional2(nullable(inst));
|
|
15535
15817
|
inst.nonoptional = (params) => nonoptional(inst, params);
|
|
15536
15818
|
inst.array = () => array(inst);
|
|
15537
15819
|
inst.or = (arg) => union([inst, arg]);
|
|
@@ -15618,7 +15900,7 @@ var ZodString2 = /* @__PURE__ */ $constructor("ZodString", (inst, def) => {
|
|
|
15618
15900
|
inst.time = (params) => inst.check(time2(params));
|
|
15619
15901
|
inst.duration = (params) => inst.check(duration2(params));
|
|
15620
15902
|
});
|
|
15621
|
-
function
|
|
15903
|
+
function string3(params) {
|
|
15622
15904
|
return _string(ZodString2, params);
|
|
15623
15905
|
}
|
|
15624
15906
|
var ZodStringFormat = /* @__PURE__ */ $constructor("ZodStringFormat", (inst, def) => {
|
|
@@ -15799,7 +16081,7 @@ var ZodNumber2 = /* @__PURE__ */ $constructor("ZodNumber", (inst, def) => {
|
|
|
15799
16081
|
inst.isFinite = true;
|
|
15800
16082
|
inst.format = bag.format ?? null;
|
|
15801
16083
|
});
|
|
15802
|
-
function
|
|
16084
|
+
function number3(params) {
|
|
15803
16085
|
return _number(ZodNumber2, params);
|
|
15804
16086
|
}
|
|
15805
16087
|
var ZodNumberFormat = /* @__PURE__ */ $constructor("ZodNumberFormat", (inst, def) => {
|
|
@@ -16219,7 +16501,7 @@ var ZodOptional2 = /* @__PURE__ */ $constructor("ZodOptional", (inst, def) => {
|
|
|
16219
16501
|
ZodType2.init(inst, def);
|
|
16220
16502
|
inst.unwrap = () => inst._zod.def.innerType;
|
|
16221
16503
|
});
|
|
16222
|
-
function
|
|
16504
|
+
function optional2(innerType) {
|
|
16223
16505
|
return new ZodOptional2({
|
|
16224
16506
|
type: "optional",
|
|
16225
16507
|
innerType
|
|
@@ -16237,7 +16519,7 @@ function nullable(innerType) {
|
|
|
16237
16519
|
});
|
|
16238
16520
|
}
|
|
16239
16521
|
function nullish2(innerType) {
|
|
16240
|
-
return
|
|
16522
|
+
return optional2(nullable(innerType));
|
|
16241
16523
|
}
|
|
16242
16524
|
var ZodDefault2 = /* @__PURE__ */ $constructor("ZodDefault", (inst, def) => {
|
|
16243
16525
|
$ZodDefault.init(inst, def);
|
|
@@ -16427,7 +16709,7 @@ var stringbool = (...args) => _stringbool({
|
|
|
16427
16709
|
}, ...args);
|
|
16428
16710
|
function json(params) {
|
|
16429
16711
|
const jsonSchema2 = lazy(() => {
|
|
16430
|
-
return union([
|
|
16712
|
+
return union([string3(params), number3(), boolean2(), _null3(), array(jsonSchema2), record(string3(), jsonSchema2)]);
|
|
16431
16713
|
});
|
|
16432
16714
|
return jsonSchema2;
|
|
16433
16715
|
}
|
|
@@ -16464,13 +16746,13 @@ __export(coerce_exports, {
|
|
|
16464
16746
|
bigint: () => bigint3,
|
|
16465
16747
|
boolean: () => boolean3,
|
|
16466
16748
|
date: () => date4,
|
|
16467
|
-
number: () =>
|
|
16468
|
-
string: () =>
|
|
16749
|
+
number: () => number4,
|
|
16750
|
+
string: () => string4
|
|
16469
16751
|
});
|
|
16470
|
-
function
|
|
16752
|
+
function string4(params) {
|
|
16471
16753
|
return _coercedString(ZodString2, params);
|
|
16472
16754
|
}
|
|
16473
|
-
function
|
|
16755
|
+
function number4(params) {
|
|
16474
16756
|
return _coercedNumber(ZodNumber2, params);
|
|
16475
16757
|
}
|
|
16476
16758
|
function boolean3(params) {
|
|
@@ -32509,7 +32791,13 @@ import { tmpdir } from "node:os";
|
|
|
32509
32791
|
import path92 from "node:path";
|
|
32510
32792
|
import { promisify as promisify22 } from "node:util";
|
|
32511
32793
|
import path82 from "node:path";
|
|
32794
|
+
import { spawn as spawn22 } from "node:child_process";
|
|
32795
|
+
import { randomUUID as randomUUID2 } from "node:crypto";
|
|
32796
|
+
import { createWriteStream as createWriteStream2 } from "node:fs";
|
|
32797
|
+
import { mkdir as mkdir22, mkdtemp as mkdtemp2, rm as rm22, writeFile as writeFile22 } from "node:fs/promises";
|
|
32798
|
+
import { tmpdir as tmpdir2 } from "node:os";
|
|
32512
32799
|
import path102 from "node:path";
|
|
32800
|
+
import path112 from "node:path";
|
|
32513
32801
|
|
|
32514
32802
|
// ../../node_modules/.bun/subagent@0.5.6/node_modules/subagent/dist/vscode/agentDispatch.js
|
|
32515
32803
|
import { stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
|
|
@@ -34532,11 +34820,11 @@ async function provisionSubagents(options) {
|
|
|
34532
34820
|
// ../../packages/core/dist/index.js
|
|
34533
34821
|
import { constants as constants32 } from "node:fs";
|
|
34534
34822
|
import { access as access32, readFile as readFile6 } from "node:fs/promises";
|
|
34535
|
-
import path112 from "node:path";
|
|
34536
|
-
import { parse as parse32 } from "yaml";
|
|
34537
|
-
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
34538
|
-
import { mkdir as mkdir22, writeFile as writeFile22 } from "node:fs/promises";
|
|
34539
34823
|
import path122 from "node:path";
|
|
34824
|
+
import { parse as parse32 } from "yaml";
|
|
34825
|
+
import { createHash, randomUUID as randomUUID3 } from "node:crypto";
|
|
34826
|
+
import { mkdir as mkdir32, writeFile as writeFile32 } from "node:fs/promises";
|
|
34827
|
+
import path132 from "node:path";
|
|
34540
34828
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
34541
34829
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
34542
34830
|
function isTestMessageRole(value) {
|
|
@@ -34611,6 +34899,15 @@ function computeTraceSummary(messages) {
|
|
|
34611
34899
|
errorCount: 0
|
|
34612
34900
|
};
|
|
34613
34901
|
}
|
|
34902
|
+
function mergeExecutionMetrics(summary, metrics) {
|
|
34903
|
+
if (!metrics) return summary;
|
|
34904
|
+
return {
|
|
34905
|
+
...summary,
|
|
34906
|
+
tokenUsage: metrics.tokenUsage,
|
|
34907
|
+
costUsd: metrics.costUsd,
|
|
34908
|
+
durationMs: metrics.durationMs
|
|
34909
|
+
};
|
|
34910
|
+
}
|
|
34614
34911
|
function extractCodeBlocks(segments) {
|
|
34615
34912
|
const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
|
|
34616
34913
|
const codeBlocks = [];
|
|
@@ -35093,7 +35390,13 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35093
35390
|
expected = [];
|
|
35094
35391
|
for (const item of rawExpected) {
|
|
35095
35392
|
if (isJsonObject2(item) && typeof item.tool === "string") {
|
|
35096
|
-
|
|
35393
|
+
let args;
|
|
35394
|
+
if (item.args === "any") {
|
|
35395
|
+
args = "any";
|
|
35396
|
+
} else if (isJsonObject2(item.args)) {
|
|
35397
|
+
args = item.args;
|
|
35398
|
+
}
|
|
35399
|
+
expected.push({ tool: item.tool, ...args !== void 0 ? { args } : {} });
|
|
35097
35400
|
}
|
|
35098
35401
|
}
|
|
35099
35402
|
}
|
|
@@ -36168,7 +36471,7 @@ async function withRetry(fn, retryConfig, signal) {
|
|
|
36168
36471
|
}
|
|
36169
36472
|
var execAsync2 = promisify2(execWithCallback);
|
|
36170
36473
|
var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
|
|
36171
|
-
async function defaultCommandRunner(
|
|
36474
|
+
async function defaultCommandRunner(command7, options) {
|
|
36172
36475
|
const execOptions = {
|
|
36173
36476
|
cwd: options.cwd,
|
|
36174
36477
|
env: options.env,
|
|
@@ -36178,7 +36481,7 @@ async function defaultCommandRunner(command6, options) {
|
|
|
36178
36481
|
shell: process.platform === "win32" ? "powershell.exe" : void 0
|
|
36179
36482
|
};
|
|
36180
36483
|
try {
|
|
36181
|
-
const { stdout, stderr } = await execAsync2(
|
|
36484
|
+
const { stdout, stderr } = await execAsync2(command7, execOptions);
|
|
36182
36485
|
return {
|
|
36183
36486
|
stdout,
|
|
36184
36487
|
stderr,
|
|
@@ -36230,12 +36533,14 @@ var CliProvider = class {
|
|
|
36230
36533
|
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36231
36534
|
);
|
|
36232
36535
|
}
|
|
36536
|
+
const startTime = Date.now();
|
|
36233
36537
|
const result = await this.runCommand(renderedCommand, {
|
|
36234
36538
|
cwd: this.config.cwd,
|
|
36235
36539
|
env: process.env,
|
|
36236
36540
|
timeoutMs: this.config.timeoutMs,
|
|
36237
36541
|
signal: request.signal
|
|
36238
36542
|
});
|
|
36543
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
36239
36544
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
36240
36545
|
if (request.signal?.aborted) {
|
|
36241
36546
|
throw new Error("CLI provider request was aborted");
|
|
@@ -36254,6 +36559,9 @@ var CliProvider = class {
|
|
|
36254
36559
|
const parsed = this.parseOutputContent(responseContent);
|
|
36255
36560
|
return {
|
|
36256
36561
|
outputMessages: parsed.outputMessages,
|
|
36562
|
+
tokenUsage: parsed.tokenUsage,
|
|
36563
|
+
costUsd: parsed.costUsd,
|
|
36564
|
+
durationMs: parsed.durationMs ?? measuredDurationMs,
|
|
36257
36565
|
raw: {
|
|
36258
36566
|
command: renderedCommand,
|
|
36259
36567
|
stderr: result.stderr,
|
|
@@ -36301,12 +36609,14 @@ var CliProvider = class {
|
|
|
36301
36609
|
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36302
36610
|
);
|
|
36303
36611
|
}
|
|
36612
|
+
const startTime = Date.now();
|
|
36304
36613
|
const result = await this.runCommand(renderedCommand, {
|
|
36305
36614
|
cwd: this.config.cwd,
|
|
36306
36615
|
env: process.env,
|
|
36307
36616
|
timeoutMs: this.config.timeoutMs,
|
|
36308
36617
|
signal: controller.signal
|
|
36309
36618
|
});
|
|
36619
|
+
const measuredDurationMs = Date.now() - startTime;
|
|
36310
36620
|
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
36311
36621
|
if (controller.signal.aborted) {
|
|
36312
36622
|
throw new Error("CLI provider request was aborted");
|
|
@@ -36328,11 +36638,13 @@ var CliProvider = class {
|
|
|
36328
36638
|
if (missingIds.length > 0) {
|
|
36329
36639
|
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
36330
36640
|
}
|
|
36641
|
+
const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
|
|
36331
36642
|
const responses = requests.map((request) => {
|
|
36332
36643
|
const evalCaseId = request.evalCaseId;
|
|
36333
36644
|
if (!evalCaseId) {
|
|
36334
36645
|
return {
|
|
36335
36646
|
outputMessages: [],
|
|
36647
|
+
durationMs: perRequestFallbackMs,
|
|
36336
36648
|
raw: {
|
|
36337
36649
|
command: renderedCommand,
|
|
36338
36650
|
stderr: result.stderr,
|
|
@@ -36346,6 +36658,7 @@ var CliProvider = class {
|
|
|
36346
36658
|
if (!parsed) {
|
|
36347
36659
|
return {
|
|
36348
36660
|
outputMessages: [],
|
|
36661
|
+
durationMs: perRequestFallbackMs,
|
|
36349
36662
|
raw: {
|
|
36350
36663
|
command: renderedCommand,
|
|
36351
36664
|
stderr: result.stderr,
|
|
@@ -36357,6 +36670,9 @@ var CliProvider = class {
|
|
|
36357
36670
|
}
|
|
36358
36671
|
return {
|
|
36359
36672
|
outputMessages: parsed.outputMessages,
|
|
36673
|
+
tokenUsage: parsed.tokenUsage,
|
|
36674
|
+
costUsd: parsed.costUsd,
|
|
36675
|
+
durationMs: parsed.durationMs ?? perRequestFallbackMs,
|
|
36360
36676
|
raw: {
|
|
36361
36677
|
command: renderedCommand,
|
|
36362
36678
|
stderr: result.stderr,
|
|
@@ -36374,25 +36690,55 @@ var CliProvider = class {
|
|
|
36374
36690
|
* If the content is valid JSON with 'output_messages' or 'text' field, extract them.
|
|
36375
36691
|
* If only 'text' is provided, wrap it in outputMessages.
|
|
36376
36692
|
* Otherwise, treat the entire content as plain text wrapped in outputMessages.
|
|
36693
|
+
*
|
|
36694
|
+
* Also extracts optional execution metrics:
|
|
36695
|
+
* - token_usage: { input, output, cached? }
|
|
36696
|
+
* - cost_usd: number
|
|
36697
|
+
* - duration_ms: number
|
|
36377
36698
|
*/
|
|
36378
36699
|
parseOutputContent(content) {
|
|
36379
36700
|
try {
|
|
36380
36701
|
const parsed = JSON.parse(content);
|
|
36381
36702
|
if (typeof parsed === "object" && parsed !== null) {
|
|
36382
36703
|
const obj = parsed;
|
|
36704
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
36705
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
36706
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
36383
36707
|
const outputMessages = this.parseOutputMessages(obj.output_messages);
|
|
36384
36708
|
if (outputMessages && outputMessages.length > 0) {
|
|
36385
|
-
return { outputMessages };
|
|
36709
|
+
return { outputMessages, tokenUsage, costUsd, durationMs };
|
|
36386
36710
|
}
|
|
36387
36711
|
if ("text" in obj) {
|
|
36388
36712
|
const text2 = typeof obj.text === "string" ? obj.text : String(obj.text);
|
|
36389
|
-
return {
|
|
36713
|
+
return {
|
|
36714
|
+
outputMessages: [{ role: "assistant", content: text2 }],
|
|
36715
|
+
tokenUsage,
|
|
36716
|
+
costUsd,
|
|
36717
|
+
durationMs
|
|
36718
|
+
};
|
|
36390
36719
|
}
|
|
36391
36720
|
}
|
|
36392
36721
|
} catch {
|
|
36393
36722
|
}
|
|
36394
36723
|
return { outputMessages: [{ role: "assistant", content }] };
|
|
36395
36724
|
}
|
|
36725
|
+
/**
|
|
36726
|
+
* Parse token_usage from CLI output.
|
|
36727
|
+
*/
|
|
36728
|
+
parseTokenUsage(tokenUsage) {
|
|
36729
|
+
if (typeof tokenUsage !== "object" || tokenUsage === null) {
|
|
36730
|
+
return void 0;
|
|
36731
|
+
}
|
|
36732
|
+
const obj = tokenUsage;
|
|
36733
|
+
if (typeof obj.input !== "number" || typeof obj.output !== "number") {
|
|
36734
|
+
return void 0;
|
|
36735
|
+
}
|
|
36736
|
+
return {
|
|
36737
|
+
input: obj.input,
|
|
36738
|
+
output: obj.output,
|
|
36739
|
+
cached: typeof obj.cached === "number" ? obj.cached : void 0
|
|
36740
|
+
};
|
|
36741
|
+
}
|
|
36396
36742
|
/**
|
|
36397
36743
|
* Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
|
|
36398
36744
|
*/
|
|
@@ -36469,6 +36815,9 @@ var CliProvider = class {
|
|
|
36469
36815
|
if (records.has(id)) {
|
|
36470
36816
|
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
36471
36817
|
}
|
|
36818
|
+
const tokenUsage = this.parseTokenUsage(obj.token_usage);
|
|
36819
|
+
const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
|
|
36820
|
+
const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
|
|
36472
36821
|
const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
|
|
36473
36822
|
let outputMessages;
|
|
36474
36823
|
if (parsedOutputMessages && parsedOutputMessages.length > 0) {
|
|
@@ -36478,7 +36827,10 @@ var CliProvider = class {
|
|
|
36478
36827
|
outputMessages = text2 ? [{ role: "assistant", content: text2 }] : [];
|
|
36479
36828
|
}
|
|
36480
36829
|
records.set(id, {
|
|
36481
|
-
outputMessages
|
|
36830
|
+
outputMessages,
|
|
36831
|
+
tokenUsage,
|
|
36832
|
+
costUsd,
|
|
36833
|
+
durationMs
|
|
36482
36834
|
});
|
|
36483
36835
|
}
|
|
36484
36836
|
return records;
|
|
@@ -36771,6 +37123,11 @@ var execAsync22 = promisify22(execCallback);
|
|
|
36771
37123
|
var WORKSPACE_PREFIX = "agentv-codex-";
|
|
36772
37124
|
var PROMPT_FILENAME = "prompt.md";
|
|
36773
37125
|
var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
|
|
37126
|
+
var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
|
|
37127
|
+
- Do NOT create any additional output files in the workspace.
|
|
37128
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
37129
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
37130
|
+
This is required for evaluation scoring.`;
|
|
36774
37131
|
var CodexProvider = class {
|
|
36775
37132
|
id;
|
|
36776
37133
|
kind = "codex";
|
|
@@ -36795,7 +37152,11 @@ var CodexProvider = class {
|
|
|
36795
37152
|
const workspaceRoot = await this.createWorkspace();
|
|
36796
37153
|
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
36797
37154
|
try {
|
|
36798
|
-
const
|
|
37155
|
+
const basePrompt = buildPromptDocument(request, inputFiles);
|
|
37156
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
|
|
37157
|
+
const promptContent = `${systemPrompt}
|
|
37158
|
+
|
|
37159
|
+
${basePrompt}`;
|
|
36799
37160
|
const promptFile = path92.join(workspaceRoot, PROMPT_FILENAME);
|
|
36800
37161
|
await writeFile5(promptFile, promptContent, "utf8");
|
|
36801
37162
|
const args = this.buildCodexArgs();
|
|
@@ -37476,6 +37837,666 @@ var MockProvider = class {
|
|
|
37476
37837
|
return this.delayMs;
|
|
37477
37838
|
}
|
|
37478
37839
|
};
|
|
37840
|
+
var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
|
|
37841
|
+
var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
|
|
37842
|
+
function getPiLogStore() {
|
|
37843
|
+
const globalObject = globalThis;
|
|
37844
|
+
const existing = globalObject[GLOBAL_LOGS_KEY2];
|
|
37845
|
+
if (existing) {
|
|
37846
|
+
return existing;
|
|
37847
|
+
}
|
|
37848
|
+
const created = [];
|
|
37849
|
+
globalObject[GLOBAL_LOGS_KEY2] = created;
|
|
37850
|
+
return created;
|
|
37851
|
+
}
|
|
37852
|
+
function getSubscriberStore2() {
|
|
37853
|
+
const globalObject = globalThis;
|
|
37854
|
+
const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
|
|
37855
|
+
if (existing) {
|
|
37856
|
+
return existing;
|
|
37857
|
+
}
|
|
37858
|
+
const created = /* @__PURE__ */ new Set();
|
|
37859
|
+
globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
|
|
37860
|
+
return created;
|
|
37861
|
+
}
|
|
37862
|
+
function notifySubscribers2(entry) {
|
|
37863
|
+
const subscribers = Array.from(getSubscriberStore2());
|
|
37864
|
+
for (const listener of subscribers) {
|
|
37865
|
+
try {
|
|
37866
|
+
listener(entry);
|
|
37867
|
+
} catch (error40) {
|
|
37868
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
37869
|
+
console.warn(`Pi log subscriber failed: ${message}`);
|
|
37870
|
+
}
|
|
37871
|
+
}
|
|
37872
|
+
}
|
|
37873
|
+
function recordPiLogEntry(entry) {
|
|
37874
|
+
getPiLogStore().push(entry);
|
|
37875
|
+
notifySubscribers2(entry);
|
|
37876
|
+
}
|
|
37877
|
+
function subscribeToPiLogEntries(listener) {
|
|
37878
|
+
const store = getSubscriberStore2();
|
|
37879
|
+
store.add(listener);
|
|
37880
|
+
return () => {
|
|
37881
|
+
store.delete(listener);
|
|
37882
|
+
};
|
|
37883
|
+
}
|
|
37884
|
+
var WORKSPACE_PREFIX2 = "agentv-pi-";
|
|
37885
|
+
var PROMPT_FILENAME2 = "prompt.md";
|
|
37886
|
+
var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
|
|
37887
|
+
- Do NOT create any additional output files in the workspace.
|
|
37888
|
+
- All intended file outputs/changes MUST be written in your response.
|
|
37889
|
+
- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
|
|
37890
|
+
This is required for evaluation scoring.`;
|
|
37891
|
+
var PiCodingAgentProvider = class {
|
|
37892
|
+
id;
|
|
37893
|
+
kind = "pi-coding-agent";
|
|
37894
|
+
targetName;
|
|
37895
|
+
supportsBatch = false;
|
|
37896
|
+
config;
|
|
37897
|
+
runPi;
|
|
37898
|
+
constructor(targetName, config2, runner = defaultPiRunner) {
|
|
37899
|
+
this.id = `pi-coding-agent:${targetName}`;
|
|
37900
|
+
this.targetName = targetName;
|
|
37901
|
+
this.config = config2;
|
|
37902
|
+
this.runPi = runner;
|
|
37903
|
+
}
|
|
37904
|
+
async invoke(request) {
|
|
37905
|
+
if (request.signal?.aborted) {
|
|
37906
|
+
throw new Error("Pi coding agent request was aborted before execution");
|
|
37907
|
+
}
|
|
37908
|
+
const inputFiles = normalizeInputFiles2(request.inputFiles);
|
|
37909
|
+
const workspaceRoot = await this.createWorkspace();
|
|
37910
|
+
const logger = await this.createStreamLogger(request).catch(() => void 0);
|
|
37911
|
+
try {
|
|
37912
|
+
const promptFile = path102.join(workspaceRoot, PROMPT_FILENAME2);
|
|
37913
|
+
await writeFile22(promptFile, request.question, "utf8");
|
|
37914
|
+
const args = this.buildPiArgs(request.question, inputFiles);
|
|
37915
|
+
const cwd = this.resolveCwd(workspaceRoot);
|
|
37916
|
+
const result = await this.executePi(args, cwd, request.signal, logger);
|
|
37917
|
+
if (result.timedOut) {
|
|
37918
|
+
throw new Error(
|
|
37919
|
+
`Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
|
|
37920
|
+
);
|
|
37921
|
+
}
|
|
37922
|
+
if (result.exitCode !== 0) {
|
|
37923
|
+
const detail = pickDetail2(result.stderr, result.stdout);
|
|
37924
|
+
const prefix = `Pi coding agent exited with code ${result.exitCode}`;
|
|
37925
|
+
throw new Error(detail ? `${prefix}: ${detail}` : prefix);
|
|
37926
|
+
}
|
|
37927
|
+
const parsed = parsePiJsonl(result.stdout);
|
|
37928
|
+
const outputMessages = extractOutputMessages(parsed);
|
|
37929
|
+
const assistantText = extractAssistantText2(outputMessages);
|
|
37930
|
+
return {
|
|
37931
|
+
raw: {
|
|
37932
|
+
response: parsed,
|
|
37933
|
+
stdout: result.stdout,
|
|
37934
|
+
stderr: result.stderr,
|
|
37935
|
+
exitCode: result.exitCode,
|
|
37936
|
+
args,
|
|
37937
|
+
executable: this.config.executable,
|
|
37938
|
+
promptFile,
|
|
37939
|
+
workspace: workspaceRoot,
|
|
37940
|
+
inputFiles,
|
|
37941
|
+
logFile: logger?.filePath
|
|
37942
|
+
},
|
|
37943
|
+
outputMessages
|
|
37944
|
+
};
|
|
37945
|
+
} finally {
|
|
37946
|
+
await logger?.close();
|
|
37947
|
+
await this.cleanupWorkspace(workspaceRoot);
|
|
37948
|
+
}
|
|
37949
|
+
}
|
|
37950
|
+
resolveCwd(workspaceRoot) {
|
|
37951
|
+
if (!this.config.cwd) {
|
|
37952
|
+
return workspaceRoot;
|
|
37953
|
+
}
|
|
37954
|
+
return path102.resolve(this.config.cwd);
|
|
37955
|
+
}
|
|
37956
|
+
buildPiArgs(prompt, inputFiles) {
|
|
37957
|
+
const args = [];
|
|
37958
|
+
if (this.config.provider) {
|
|
37959
|
+
args.push("--provider", this.config.provider);
|
|
37960
|
+
}
|
|
37961
|
+
if (this.config.model) {
|
|
37962
|
+
args.push("--model", this.config.model);
|
|
37963
|
+
}
|
|
37964
|
+
if (this.config.apiKey) {
|
|
37965
|
+
args.push("--api-key", this.config.apiKey);
|
|
37966
|
+
}
|
|
37967
|
+
args.push("--mode", "json");
|
|
37968
|
+
args.push("--print");
|
|
37969
|
+
args.push("--no-session");
|
|
37970
|
+
if (this.config.tools) {
|
|
37971
|
+
args.push("--tools", this.config.tools);
|
|
37972
|
+
}
|
|
37973
|
+
if (this.config.thinking) {
|
|
37974
|
+
args.push("--thinking", this.config.thinking);
|
|
37975
|
+
}
|
|
37976
|
+
if (this.config.args && this.config.args.length > 0) {
|
|
37977
|
+
args.push(...this.config.args);
|
|
37978
|
+
}
|
|
37979
|
+
if (inputFiles && inputFiles.length > 0) {
|
|
37980
|
+
for (const file2 of inputFiles) {
|
|
37981
|
+
args.push(`@${file2}`);
|
|
37982
|
+
}
|
|
37983
|
+
}
|
|
37984
|
+
const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
|
|
37985
|
+
const fullPrompt = `${systemPrompt}
|
|
37986
|
+
|
|
37987
|
+
${prompt}`;
|
|
37988
|
+
const escapedPrompt = escapeAtSymbols(fullPrompt);
|
|
37989
|
+
args.push(escapedPrompt);
|
|
37990
|
+
return args;
|
|
37991
|
+
}
|
|
37992
|
+
async executePi(args, cwd, signal, logger) {
|
|
37993
|
+
try {
|
|
37994
|
+
return await this.runPi({
|
|
37995
|
+
executable: this.config.executable,
|
|
37996
|
+
args,
|
|
37997
|
+
cwd,
|
|
37998
|
+
timeoutMs: this.config.timeoutMs,
|
|
37999
|
+
env: this.buildEnv(),
|
|
38000
|
+
signal,
|
|
38001
|
+
onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
|
|
38002
|
+
onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
|
|
38003
|
+
});
|
|
38004
|
+
} catch (error40) {
|
|
38005
|
+
const err = error40;
|
|
38006
|
+
if (err.code === "ENOENT") {
|
|
38007
|
+
throw new Error(
|
|
38008
|
+
`Pi coding agent executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
|
|
38009
|
+
);
|
|
38010
|
+
}
|
|
38011
|
+
throw error40;
|
|
38012
|
+
}
|
|
38013
|
+
}
|
|
38014
|
+
buildEnv() {
|
|
38015
|
+
const env = { ...process.env };
|
|
38016
|
+
if (this.config.apiKey) {
|
|
38017
|
+
const provider = this.config.provider?.toLowerCase() ?? "google";
|
|
38018
|
+
switch (provider) {
|
|
38019
|
+
case "google":
|
|
38020
|
+
case "gemini":
|
|
38021
|
+
env.GEMINI_API_KEY = this.config.apiKey;
|
|
38022
|
+
break;
|
|
38023
|
+
case "anthropic":
|
|
38024
|
+
env.ANTHROPIC_API_KEY = this.config.apiKey;
|
|
38025
|
+
break;
|
|
38026
|
+
case "openai":
|
|
38027
|
+
env.OPENAI_API_KEY = this.config.apiKey;
|
|
38028
|
+
break;
|
|
38029
|
+
case "groq":
|
|
38030
|
+
env.GROQ_API_KEY = this.config.apiKey;
|
|
38031
|
+
break;
|
|
38032
|
+
case "xai":
|
|
38033
|
+
env.XAI_API_KEY = this.config.apiKey;
|
|
38034
|
+
break;
|
|
38035
|
+
case "openrouter":
|
|
38036
|
+
env.OPENROUTER_API_KEY = this.config.apiKey;
|
|
38037
|
+
break;
|
|
38038
|
+
}
|
|
38039
|
+
}
|
|
38040
|
+
return env;
|
|
38041
|
+
}
|
|
38042
|
+
async createWorkspace() {
|
|
38043
|
+
return await mkdtemp2(path102.join(tmpdir2(), WORKSPACE_PREFIX2));
|
|
38044
|
+
}
|
|
38045
|
+
async cleanupWorkspace(workspaceRoot) {
|
|
38046
|
+
try {
|
|
38047
|
+
await rm22(workspaceRoot, { recursive: true, force: true });
|
|
38048
|
+
} catch {
|
|
38049
|
+
}
|
|
38050
|
+
}
|
|
38051
|
+
resolveLogDirectory() {
|
|
38052
|
+
if (this.config.logDir) {
|
|
38053
|
+
return path102.resolve(this.config.logDir);
|
|
38054
|
+
}
|
|
38055
|
+
return path102.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
|
|
38056
|
+
}
|
|
38057
|
+
async createStreamLogger(request) {
|
|
38058
|
+
const logDir = this.resolveLogDirectory();
|
|
38059
|
+
if (!logDir) {
|
|
38060
|
+
return void 0;
|
|
38061
|
+
}
|
|
38062
|
+
try {
|
|
38063
|
+
await mkdir22(logDir, { recursive: true });
|
|
38064
|
+
} catch (error40) {
|
|
38065
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
38066
|
+
console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
|
|
38067
|
+
return void 0;
|
|
38068
|
+
}
|
|
38069
|
+
const filePath = path102.join(logDir, buildLogFilename2(request, this.targetName));
|
|
38070
|
+
try {
|
|
38071
|
+
const logger = await PiStreamLogger.create({
|
|
38072
|
+
filePath,
|
|
38073
|
+
targetName: this.targetName,
|
|
38074
|
+
evalCaseId: request.evalCaseId,
|
|
38075
|
+
attempt: request.attempt,
|
|
38076
|
+
format: this.config.logFormat ?? "summary"
|
|
38077
|
+
});
|
|
38078
|
+
recordPiLogEntry({
|
|
38079
|
+
filePath,
|
|
38080
|
+
targetName: this.targetName,
|
|
38081
|
+
evalCaseId: request.evalCaseId,
|
|
38082
|
+
attempt: request.attempt
|
|
38083
|
+
});
|
|
38084
|
+
return logger;
|
|
38085
|
+
} catch (error40) {
|
|
38086
|
+
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
38087
|
+
console.warn(`Skipping Pi stream logging for ${filePath}: ${message}`);
|
|
38088
|
+
return void 0;
|
|
38089
|
+
}
|
|
38090
|
+
}
|
|
38091
|
+
};
|
|
38092
|
+
/**
 * Appends a line-oriented, timestamped log of a Pi coding agent run to a file.
 *
 * Output arrives in arbitrary chunks; each stream (stdout/stderr) is buffered
 * until a full line is available, then formatted and written.
 */
var PiStreamLogger = class _PiStreamLogger {
  filePath;
  stream;
  startedAt = Date.now();
  stdoutBuffer = "";
  stderrBuffer = "";
  format;

  /**
   * @param {string} filePath - Log file to append to.
   * @param {string} format - `"json"` pretty-prints JSON lines; anything else summarizes them.
   */
  constructor(filePath, format) {
    this.filePath = filePath;
    this.format = format;
    this.stream = createWriteStream2(filePath, { flags: "a" });
    // A write stream fails asynchronously (e.g. permission denied on open).
    // Without a listener that is an unhandled 'error' event, which would
    // terminate the process just because optional logging broke.
    this.stream.on("error", (error) => {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`Pi stream logging failed for ${filePath}: ${message}`);
    });
  }

  /**
   * Builds a logger and writes the header block.
   *
   * @param {{filePath: string, format: string, targetName: string, evalCaseId?: string, attempt?: number}} options
   * @returns {Promise<PiStreamLogger>}
   */
  static async create(options) {
    const logger = new _PiStreamLogger(options.filePath, options.format);
    const header = [
      "# Pi Coding Agent stream log",
      `# target: ${options.targetName}`,
      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
      ""
      // Drop only the omitted optional entries; the trailing "" must survive so
      // the header is followed by a blank separator line.
    ].filter((line2) => line2 !== void 0);
    logger.writeLines(header);
    return logger;
  }

  /** Buffers a stdout chunk and writes every completed line. */
  handleStdoutChunk(chunk) {
    this.stdoutBuffer += chunk;
    this.flushBuffer("stdout");
  }

  /** Buffers a stderr chunk and writes every completed line. */
  handleStderrChunk(chunk) {
    this.stderrBuffer += chunk;
    this.flushBuffer("stderr");
  }

  /**
   * Writes any buffered output, including unterminated last lines, then ends
   * the stream.
   *
   * @returns {Promise<void>} Resolves when the stream has ended; rejects on a stream error.
   */
  async close() {
    this.flushBuffer("stdout");
    this.flushBuffer("stderr");
    this.flushRemainder();
    await new Promise((resolve2, reject) => {
      this.stream.once("error", reject);
      this.stream.end(() => resolve2());
    });
  }

  /** Writes each entry followed by a newline, without any formatting. */
  writeLines(lines) {
    for (const line2 of lines) {
      this.stream.write(`${line2}
`);
    }
  }

  /** Writes all complete lines of one buffer and keeps the unterminated tail. */
  flushBuffer(source2) {
    const buffer2 = source2 === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
    const lines = buffer2.split(/\r?\n/);
    // The last element is whatever follows the final newline (possibly "").
    const remainder = lines.pop() ?? "";
    if (source2 === "stdout") {
      this.stdoutBuffer = remainder;
    } else {
      this.stderrBuffer = remainder;
    }
    for (const line2 of lines) {
      this.writeFormatted(line2, source2);
    }
  }

  /**
   * Formats one raw line for the log.
   *
   * @returns {string | undefined} The log line, or `undefined` for blank input.
   */
  formatLine(rawLine, source2) {
    const trimmed = rawLine.trim();
    if (trimmed.length === 0) {
      return void 0;
    }
    const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source2);
    return `[+${formatElapsed2(this.startedAt)}] [${source2}] ${message}`;
  }

  /** Formats a raw line and writes it when it is not blank. */
  writeFormatted(rawLine, source2) {
    const formatted = this.formatLine(rawLine, source2);
    if (formatted) {
      this.stream.write(formatted);
      this.stream.write("\n");
    }
  }

  /** Writes the unterminated tails of both buffers (stdout first) and clears them. */
  flushRemainder() {
    const pending = [
      ["stdout", this.stdoutBuffer],
      ["stderr", this.stderrBuffer]
    ];
    for (const [source2, buffered] of pending) {
      const remainder = buffered.trim();
      if (remainder.length > 0) {
        this.writeFormatted(remainder, source2);
      }
    }
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
  }
};
|
|
38186
|
+
/**
 * Builds a unique log file name of the form
 * `<timestamp>_<target>_<evalId>[_attempt-N]_<8 uuid chars>.log`.
 *
 * @param {{evalCaseId?: string, attempt?: number}} request - Supplies the eval id and zero-based attempt.
 * @param {string} targetName - Target name, sanitized for file-name use.
 * @returns {string} The file name (no directory component).
 */
function buildLogFilename2(request, targetName) {
  // ISO timestamps contain ":" and ".", which are replaced with "-".
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  const evalPart = sanitizeForFilename2(request.evalCaseId ?? "pi");
  let attemptPart = "";
  if (request.attempt !== void 0) {
    attemptPart = `_attempt-${request.attempt + 1}`;
  }
  const targetPart = sanitizeForFilename2(targetName);
  const uniquePart = randomUUID2().slice(0, 8);
  return `${stamp}_${targetPart}_${evalPart}${attemptPart}_${uniquePart}.log`;
}
|
|
38193
|
+
/**
 * Makes a string safe for use inside a file name by collapsing every run of
 * characters outside `A-Z a-z 0-9 . _ -` into a single underscore.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} The sanitized text, or `"pi"` when the result is empty.
 */
function sanitizeForFilename2(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/g, "_");
  if (cleaned.length === 0) {
    return "pi";
  }
  return cleaned;
}
|
|
38197
|
+
/**
 * Formats the whole seconds elapsed since `startedAt` as `MM:SS`, or
 * `HH:MM:SS` once at least one hour has passed.
 *
 * @param {number} startedAt - Start time in epoch milliseconds.
 * @returns {string} Zero-padded elapsed time.
 */
function formatElapsed2(startedAt) {
  const totalSeconds = Math.floor((Date.now() - startedAt) / 1e3);
  const pad = (amount) => amount.toString().padStart(2, "0");
  const fields = [Math.floor(totalSeconds % 3600 / 60), totalSeconds % 60];
  const hours = Math.floor(totalSeconds / 3600);
  if (hours > 0) {
    fields.unshift(hours);
  }
  return fields.map((field) => pad(field)).join(":");
}
|
|
38207
|
+
/**
 * Produces the summary-format log text for one output line.
 *
 * A line that parses as JSON and yields an event summary is replaced by that
 * summary; any other line is passed through, prefixed with `stderr: ` when
 * it came from stderr.
 *
 * @param {string} rawLine - One line of agent output.
 * @param {string} source2 - `"stdout"` or `"stderr"`.
 * @returns {string} Text to log.
 */
function formatPiLogMessage(rawLine, source2) {
  const event = tryParseJsonValue2(rawLine);
  const summary = event ? summarizePiEvent(event) : void 0;
  if (summary) {
    return summary;
  }
  return source2 === "stderr" ? `stderr: ${rawLine}` : rawLine;
}
|
|
38220
|
+
/**
 * Produces the json-format log text for one output line: parseable, truthy
 * JSON is pretty-printed with two-space indentation, everything else is
 * returned unchanged.
 *
 * @param {string} rawLine - One line of agent output.
 * @returns {string} Text to log.
 */
function formatPiJsonLog(rawLine) {
  const value = tryParseJsonValue2(rawLine);
  if (value) {
    try {
      return JSON.stringify(value, null, 2);
    } catch {
      // Fall through to the raw line when the value cannot be serialized.
    }
  }
  return rawLine;
}
|
|
38231
|
+
/**
 * Condenses a parsed Pi event into a short, single-line description.
 *
 * @param {unknown} event - Parsed JSON value from the agent's output.
 * @returns {string | undefined} The summary, or `undefined` when `event` is
 *   not an object with a string `type`.
 */
function summarizePiEvent(event) {
  if (!event || typeof event !== "object") {
    return void 0;
  }
  const record2 = event;
  const type = typeof record2.type === "string" ? record2.type : void 0;
  if (!type) {
    return void 0;
  }
  // Appends a detail only when one exists, so a missing field never shows up
  // in the log as the literal text "undefined".
  const withDetail = (label, detail) => detail === void 0 || detail === null ? label : `${label}: ${detail}`;
  switch (type) {
    case "message_start":
    case "message_end":
      return withDetail(type, record2.message?.role);
    case "message_update": {
      const event2 = record2.assistantMessageEvent;
      const eventType = event2?.type;
      if (eventType === "text_delta") {
        const delta = event2?.delta;
        if (typeof delta === "string") {
          // Keep streamed text readable: show at most 50 characters.
          const preview = delta.length > 50 ? `${delta.slice(0, 50)}...` : delta;
          return `text_delta: ${preview}`;
        }
      }
      return withDetail("message_update", eventType);
    }
    default:
      // Includes agent_start/agent_end/turn_start/turn_end: the type is the summary.
      return type;
  }
}
|
|
38271
|
+
/**
 * Parses a string as JSON without throwing.
 *
 * @param {string} rawLine - Candidate JSON text.
 * @returns {unknown} The parsed value, or `undefined` when parsing fails.
 */
function tryParseJsonValue2(rawLine) {
  let value;
  try {
    value = JSON.parse(rawLine);
  } catch {
    value = void 0;
  }
  return value;
}
|
|
38278
|
+
/**
 * Parses JSON-lines output from the Pi coding agent.
 *
 * Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} output - Raw output text.
 * @returns {unknown[]} Parsed values in their original order.
 * @throws {Error} When the output is blank or contains no valid JSON line.
 */
function parsePiJsonl(output) {
  const text = output.trim();
  if (text.length === 0) {
    throw new Error("Pi coding agent produced no output");
  }
  const events = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const candidate = rawLine.trim();
    if (candidate.length === 0) {
      continue;
    }
    try {
      events.push(JSON.parse(candidate));
    } catch {
      // Non-JSON lines are tolerated and simply left out.
    }
  }
  if (events.length === 0) {
    throw new Error("Pi coding agent produced no valid JSON output");
  }
  return events;
}
|
|
38296
|
+
/**
 * Extracts the conversation messages from a list of parsed Pi events.
 *
 * The most recent `agent_end` event that carries a `messages` array is
 * preferred. When there is none, the `message` of every `turn_end` event is
 * collected instead, in event order.
 *
 * @param {unknown[]} events - Parsed agent events.
 * @returns {object[]} Converted messages; entries that fail conversion are dropped.
 */
function extractOutputMessages(events) {
  const isRecord = (value) => Boolean(value) && typeof value === "object";

  // Walk backwards so the last agent_end wins.
  for (let index = events.length; index-- > 0; ) {
    const candidate = events[index];
    if (!isRecord(candidate) || candidate.type !== "agent_end") {
      continue;
    }
    const { messages } = candidate;
    if (Array.isArray(messages)) {
      return messages.map(convertPiMessage).filter((converted) => converted !== void 0);
    }
  }

  return events
    .filter((event) => isRecord(event) && event.type === "turn_end")
    .map((event) => convertPiMessage(event.message))
    .filter((converted) => Boolean(converted));
}
|
|
38328
|
+
/**
 * Converts one Pi message into the evaluator's output-message shape.
 *
 * @param {unknown} message - Raw message from an agent event.
 * @returns {{role: string, content: unknown, toolCalls: object[] | undefined, timestamp: string | undefined, metadata: object | undefined} | undefined}
 *   The converted message, or `undefined` when `message` is not an object
 *   with a string `role`.
 */
function convertPiMessage(message) {
  if (!message || typeof message !== "object") {
    return void 0;
  }
  const { role } = message;
  if (typeof role !== "string") {
    return void 0;
  }

  const content = extractTextContent2(message.content);
  const toolCalls = extractToolCalls(message.content);

  // Numeric timestamps become ISO strings; string timestamps pass through.
  let timestamp;
  if (typeof message.timestamp === "number") {
    timestamp = new Date(message.timestamp).toISOString();
  } else if (typeof message.timestamp === "string") {
    timestamp = message.timestamp;
  }

  // Copy over only the descriptive fields that are present (truthy).
  const metadata = {};
  for (const field of ["api", "provider", "model", "usage", "stopReason"]) {
    if (message[field]) {
      metadata[field] = message[field];
    }
  }

  return {
    role,
    content,
    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
    timestamp,
    metadata: Object.keys(metadata).length > 0 ? metadata : void 0
  };
}
|
|
38354
|
+
/**
 * Pulls the plain text out of a message's content.
 *
 * @param {unknown} content - A string, or an array of content parts.
 * @returns {string | undefined} The string itself; or the `text` of every
 *   `{type: "text"}` part joined with newlines; or `undefined` when there is
 *   no text or `content` is neither a string nor an array.
 */
function extractTextContent2(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
  const texts = content.filter(isTextPart).map((part) => part.text);
  return texts.length === 0 ? void 0 : texts.join("\n");
}
|
|
38373
|
+
/**
 * Collects tool calls from a message's content parts.
 *
 * Each `tool_use` part with a string `name` becomes a call. A later
 * `tool_result` part whose `tool_use_id` matches a collected call's `id`
 * attaches its `content` to that call as `output`.
 *
 * @param {unknown} content - Message content; anything but an array yields no calls.
 * @returns {Array<{tool: string, input: unknown, id: string | undefined, output?: unknown}>}
 */
function extractToolCalls(content) {
  if (!Array.isArray(content)) {
    return [];
  }
  const calls = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    if (part.type === "tool_use" && typeof part.name === "string") {
      calls.push({
        tool: part.name,
        input: part.input,
        id: typeof part.id === "string" ? part.id : void 0
      });
    } else if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
      const position = calls.findIndex((call) => call.id === part.tool_use_id);
      if (position !== -1) {
        // Replace rather than mutate, keeping the call at its position.
        calls[position] = { ...calls[position], output: part.content };
      }
    }
  }
  return calls;
}
|
|
38403
|
+
/**
 * Returns the content of the last assistant message that has any content.
 *
 * @param {Array<{role: string, content?: unknown}>} messages - Conversation messages.
 * @returns {string} The content (JSON-serialized when it is not a string),
 *   or `""` when no assistant message has content.
 */
function extractAssistantText2(messages) {
  for (let index = messages.length; index-- > 0; ) {
    const { role, content } = messages[index];
    if (role !== "assistant" || !content) {
      continue;
    }
    return typeof content === "string" ? content : JSON.stringify(content);
  }
  return "";
}
|
|
38415
|
+
/**
 * Rewrites every `@[label]:` marker in a prompt as `[[label]]:`.
 *
 * @param {string} prompt - Prompt text.
 * @returns {string} The prompt with the markers rewritten; other `@` signs are untouched.
 */
function escapeAtSymbols(prompt) {
  const markerPattern = /@\[([^\]]+)\]:/g;
  return prompt.replace(markerPattern, (_marker, label) => `[[${label}]]:`);
}
|
|
38418
|
+
/**
 * Chooses the most useful process output to show in an error message.
 *
 * @param {string} stderr - Captured standard error.
 * @param {string} stdout - Captured standard output.
 * @returns {string | undefined} Trimmed stderr when non-blank, otherwise
 *   trimmed stdout when non-blank, otherwise `undefined`.
 */
function pickDetail2(stderr, stdout) {
  for (const candidate of [stderr, stdout]) {
    const text = candidate.trim();
    if (text.length > 0) {
      return text;
    }
    if (candidate === stderr) {
      continue;
    }
  }
  return void 0;
}
|
|
38426
|
+
/**
 * Builds the " after Ns" suffix used in timeout messages.
 *
 * @param {number | undefined} timeoutMs - Timeout in milliseconds.
 * @returns {string} `" after <seconds>s"` with the seconds rounded up, or
 *   `""` when the timeout is missing, zero, or negative.
 */
function formatTimeoutSuffix3(timeoutMs) {
  const hasTimeout = Boolean(timeoutMs) && timeoutMs > 0;
  return hasTimeout ? ` after ${Math.ceil(timeoutMs / 1e3)}s` : "";
}
|
|
38433
|
+
/**
 * Runs the Pi coding agent as a child process and captures its output.
 *
 * `options.executable` may carry leading arguments separated by whitespace
 * (e.g. `"npx pi"`); they are placed before `options.args`. The command is
 * spawned without a shell.
 *
 * @param {object} options
 * @param {string} options.executable - Program, optionally followed by arguments.
 * @param {string[]} options.args - Arguments appended after the executable's own.
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {AbortSignal} [options.signal] - Aborting sends SIGTERM to the child.
 * @param {number} [options.timeoutMs] - When positive, SIGTERM is sent after this long.
 * @param {(chunk: string) => void} [options.onStdoutChunk] - Called per stdout chunk.
 * @param {(chunk: string) => void} [options.onStderrChunk] - Called per stderr chunk.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, timedOut: boolean}>}
 *   Resolves when the child's streams close; `exitCode` is -1 when the child
 *   did not exit with a numeric code. Rejects when the executable is blank or
 *   the process cannot be spawned.
 */
async function defaultPiRunner(options) {
  return await new Promise((resolve2, reject) => {
    // Trim and drop empty tokens: leading whitespace would otherwise produce
    // an empty first token and an attempt to spawn "".
    const parts = options.executable.trim().split(/\s+/).filter((part) => part.length > 0);
    if (parts.length === 0) {
      reject(new Error("Pi executable must not be empty"));
      return;
    }
    const [executable, ...executableArgs] = parts;
    const allArgs = [...executableArgs, ...options.args];
    const child = spawn22(executable, allArgs, {
      cwd: options.cwd,
      env: options.env,
      stdio: ["pipe", "pipe", "pipe"],
      shell: false
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    const onAbort = () => {
      child.kill("SIGTERM");
    };
    if (options.signal) {
      if (options.signal.aborted) {
        onAbort();
      } else {
        options.signal.addEventListener("abort", onAbort, { once: true });
      }
    }
    let timeoutHandle;
    if (options.timeoutMs && options.timeoutMs > 0) {
      timeoutHandle = setTimeout(() => {
        timedOut = true;
        child.kill("SIGTERM");
      }, options.timeoutMs);
      // Do not let a pending timeout keep the event loop alive.
      timeoutHandle.unref?.();
    }
    child.stdout.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
      options.onStdoutChunk?.(chunk);
    });
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
      options.onStderrChunk?.(chunk);
    });
    // Nothing is sent on stdin; close it immediately.
    child.stdin.end();
    const cleanup = () => {
      if (timeoutHandle) {
        clearTimeout(timeoutHandle);
      }
      if (options.signal) {
        options.signal.removeEventListener("abort", onAbort);
      }
    };
    child.on("error", (error40) => {
      cleanup();
      reject(error40);
    });
    child.on("close", (code) => {
      cleanup();
      resolve2({
        stdout,
        stderr,
        exitCode: typeof code === "number" ? code : -1,
        timedOut
      });
    });
  });
}
|
|
37479
38500
|
var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
|
|
37480
38501
|
|
|
37481
38502
|
{{userQuery}}
|
|
@@ -37640,7 +38661,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
|
|
|
37640
38661
|
return "";
|
|
37641
38662
|
}
|
|
37642
38663
|
const buildList = (files) => files.map((absolutePath) => {
|
|
37643
|
-
const fileName =
|
|
38664
|
+
const fileName = path112.basename(absolutePath);
|
|
37644
38665
|
const fileUri = pathToFileUri22(absolutePath);
|
|
37645
38666
|
return `* [${fileName}](${fileUri})`;
|
|
37646
38667
|
});
|
|
@@ -37665,8 +38686,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
|
|
|
37665
38686
|
}
|
|
37666
38687
|
const unique = /* @__PURE__ */ new Map();
|
|
37667
38688
|
for (const attachment of attachments) {
|
|
37668
|
-
const absolutePath =
|
|
37669
|
-
const normalized = absolutePath.split(
|
|
38689
|
+
const absolutePath = path112.resolve(attachment);
|
|
38690
|
+
const normalized = absolutePath.split(path112.sep).join("/");
|
|
37670
38691
|
if (isGuidelineFile(normalized, guidelinePatterns)) {
|
|
37671
38692
|
if (!unique.has(absolutePath)) {
|
|
37672
38693
|
unique.set(absolutePath, absolutePath);
|
|
@@ -37681,7 +38702,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
37681
38702
|
}
|
|
37682
38703
|
const unique = /* @__PURE__ */ new Map();
|
|
37683
38704
|
for (const attachment of attachments) {
|
|
37684
|
-
const absolutePath =
|
|
38705
|
+
const absolutePath = path112.resolve(attachment);
|
|
37685
38706
|
if (!unique.has(absolutePath)) {
|
|
37686
38707
|
unique.set(absolutePath, absolutePath);
|
|
37687
38708
|
}
|
|
@@ -37689,7 +38710,7 @@ function collectAttachmentFiles(attachments) {
|
|
|
37689
38710
|
return Array.from(unique.values());
|
|
37690
38711
|
}
|
|
37691
38712
|
function pathToFileUri22(filePath) {
|
|
37692
|
-
const absolutePath =
|
|
38713
|
+
const absolutePath = path112.isAbsolute(filePath) ? filePath : path112.resolve(filePath);
|
|
37693
38714
|
const normalizedPath = absolutePath.replace(/\\/g, "/");
|
|
37694
38715
|
if (/^[a-zA-Z]:\//.test(normalizedPath)) {
|
|
37695
38716
|
return `file:///${normalizedPath}`;
|
|
@@ -37702,7 +38723,7 @@ function normalizeAttachments(attachments) {
|
|
|
37702
38723
|
}
|
|
37703
38724
|
const deduped = /* @__PURE__ */ new Set();
|
|
37704
38725
|
for (const attachment of attachments) {
|
|
37705
|
-
deduped.add(
|
|
38726
|
+
deduped.add(path112.resolve(attachment));
|
|
37706
38727
|
}
|
|
37707
38728
|
return Array.from(deduped);
|
|
37708
38729
|
}
|
|
@@ -37711,7 +38732,7 @@ function mergeAttachments(all) {
|
|
|
37711
38732
|
for (const list of all) {
|
|
37712
38733
|
if (!list) continue;
|
|
37713
38734
|
for (const inputFile of list) {
|
|
37714
|
-
deduped.add(
|
|
38735
|
+
deduped.add(path112.resolve(inputFile));
|
|
37715
38736
|
}
|
|
37716
38737
|
}
|
|
37717
38738
|
return deduped.size > 0 ? Array.from(deduped) : void 0;
|
|
@@ -37791,7 +38812,7 @@ async function fileExists3(filePath) {
|
|
|
37791
38812
|
}
|
|
37792
38813
|
}
|
|
37793
38814
|
async function readTargetDefinitions(filePath) {
|
|
37794
|
-
const absolutePath =
|
|
38815
|
+
const absolutePath = path122.resolve(filePath);
|
|
37795
38816
|
if (!await fileExists3(absolutePath)) {
|
|
37796
38817
|
throw new Error(`targets.yaml not found at ${absolutePath}`);
|
|
37797
38818
|
}
|
|
@@ -37821,6 +38842,8 @@ function createProvider(target) {
|
|
|
37821
38842
|
return new CliProvider(target.name, target.config);
|
|
37822
38843
|
case "codex":
|
|
37823
38844
|
return new CodexProvider(target.name, target.config);
|
|
38845
|
+
case "pi-coding-agent":
|
|
38846
|
+
return new PiCodingAgentProvider(target.name, target.config);
|
|
37824
38847
|
case "mock":
|
|
37825
38848
|
return new MockProvider(target.name, target.config);
|
|
37826
38849
|
case "vscode":
|
|
@@ -37832,6 +38855,70 @@ function createProvider(target) {
|
|
|
37832
38855
|
}
|
|
37833
38856
|
}
|
|
37834
38857
|
}
|
|
38858
|
+
/**
 * Looks up `Bun.spawn` on the global object.
 *
 * @returns {Function | undefined} `globalThis.Bun.spawn` when it is a
 *   function, otherwise `undefined`.
 */
function getBunSpawn() {
  const candidate = globalThis.Bun?.spawn;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
|
|
38862
|
+
/**
 * Runs a shell command, feeds it `stdinPayload` on stdin, and captures its output.
 *
 * Uses `Bun.spawn` when it is available and `node:child_process` otherwise.
 * Both paths behave the same way on timeout: the process is killed and the
 * returned promise rejects.
 *
 * @param {string} command7 - Command line, interpreted by a shell.
 * @param {string} stdinPayload - Text written to the command's stdin.
 * @param {{cwd?: string, timeoutMs?: number}} [options] - Working directory and timeout.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   On the Node path `exitCode` is -1 when the process ended without a
 *   numeric exit code (for example when killed by a signal).
 * @throws {Error} When the process cannot be started or exceeds `timeoutMs`.
 */
async function execShellWithStdin(command7, stdinPayload, options = {}) {
  const timeoutMessage = `Process timed out after ${options.timeoutMs}ms`;
  const bunSpawn = getBunSpawn();
  if (bunSpawn) {
    const encoder = new TextEncoder();
    const proc = bunSpawn({
      cmd: ["sh", "-c", command7],
      cwd: options.cwd,
      stdin: encoder.encode(stdinPayload),
      stdout: "pipe",
      stderr: "pipe"
    });
    let timedOut = false;
    const timeout = options.timeoutMs ? setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, options.timeoutMs) : void 0;
    try {
      // Drain both pipes concurrently: reading stdout to the end first can
      // stall forever once the process blocks on a full stderr pipe.
      const [stdout, stderr, exitCode] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
        proc.exited
      ]);
      if (timedOut) {
        throw new Error(timeoutMessage);
      }
      return { stdout, stderr, exitCode };
    } finally {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    }
  }
  const { spawn: spawn3 } = await import("node:child_process");
  return await new Promise((resolve2, reject) => {
    const child = spawn3(command7, {
      shell: true,
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    const timeout = options.timeoutMs ? setTimeout(() => {
      child.kill();
      reject(new Error(timeoutMessage));
    }, options.timeoutMs) : void 0;
    const clearTimer = () => {
      if (timeout !== void 0) {
        clearTimeout(timeout);
      }
    };
    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });
    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });
    child.on("error", (error40) => {
      clearTimer();
      reject(error40);
    });
    // 'close' (not 'exit') fires only after the stdio streams have ended, so
    // no trailing output is lost.
    child.on("close", (code) => {
      clearTimer();
      // A null code means the process did not exit normally; reporting that
      // as 0 would make a killed process look successful.
      resolve2({ stdout, stderr, exitCode: typeof code === "number" ? code : -1 });
    });
    // The command may exit without reading its input; the resulting EPIPE on
    // stdin is not a failure of the command and must not crash the process.
    child.stdin?.on("error", () => {
    });
    child.stdin?.write(stdinPayload);
    child.stdin?.end();
  });
}
|
|
37835
38922
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
37836
38923
|
|
|
37837
38924
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -38107,17 +39194,17 @@ var CodeEvaluator = class {
|
|
|
38107
39194
|
const inputPayload = JSON.stringify(
|
|
38108
39195
|
{
|
|
38109
39196
|
question: context.evalCase.question,
|
|
38110
|
-
|
|
38111
|
-
|
|
38112
|
-
|
|
38113
|
-
|
|
38114
|
-
|
|
38115
|
-
|
|
38116
|
-
|
|
38117
|
-
(
|
|
39197
|
+
expectedOutcome: context.evalCase.expected_outcome,
|
|
39198
|
+
expectedMessages: context.evalCase.expected_messages,
|
|
39199
|
+
referenceAnswer: context.evalCase.reference_answer,
|
|
39200
|
+
candidateAnswer: context.candidate,
|
|
39201
|
+
outputMessages: context.outputMessages ?? null,
|
|
39202
|
+
guidelineFiles: context.evalCase.guideline_paths,
|
|
39203
|
+
inputFiles: context.evalCase.file_paths.filter(
|
|
39204
|
+
(path142) => !context.evalCase.guideline_paths.includes(path142)
|
|
38118
39205
|
),
|
|
38119
|
-
|
|
38120
|
-
|
|
39206
|
+
inputMessages: context.evalCase.input_messages,
|
|
39207
|
+
traceSummary: context.traceSummary ?? null
|
|
38121
39208
|
},
|
|
38122
39209
|
null,
|
|
38123
39210
|
2
|
|
@@ -38187,43 +39274,17 @@ function calculateRubricScore(result, rubrics) {
|
|
|
38187
39274
|
return { score, verdict, hits, misses };
|
|
38188
39275
|
}
|
|
38189
39276
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
38190
|
-
const {
|
|
38191
|
-
|
|
38192
|
-
|
|
38193
|
-
shell: true,
|
|
38194
|
-
cwd
|
|
38195
|
-
});
|
|
38196
|
-
let stdout = "";
|
|
38197
|
-
let stderr = "";
|
|
38198
|
-
const timeout = agentTimeoutMs ? setTimeout(() => {
|
|
38199
|
-
child.kill();
|
|
38200
|
-
reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
|
|
38201
|
-
}, agentTimeoutMs) : void 0;
|
|
38202
|
-
child.stdout?.on("data", (data) => {
|
|
38203
|
-
stdout += data.toString();
|
|
38204
|
-
});
|
|
38205
|
-
child.stderr?.on("data", (data) => {
|
|
38206
|
-
stderr += data.toString();
|
|
38207
|
-
});
|
|
38208
|
-
child.on("error", (error40) => {
|
|
38209
|
-
if (timeout !== void 0) {
|
|
38210
|
-
clearTimeout(timeout);
|
|
38211
|
-
}
|
|
38212
|
-
reject(error40);
|
|
38213
|
-
});
|
|
38214
|
-
child.on("exit", (code) => {
|
|
38215
|
-
if (timeout !== void 0) {
|
|
38216
|
-
clearTimeout(timeout);
|
|
38217
|
-
}
|
|
38218
|
-
if (code && code !== 0 && stderr.length > 0) {
|
|
38219
|
-
reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
|
|
38220
|
-
return;
|
|
38221
|
-
}
|
|
38222
|
-
resolve2(stdout.trim());
|
|
38223
|
-
});
|
|
38224
|
-
child.stdin?.write(input);
|
|
38225
|
-
child.stdin?.end();
|
|
39277
|
+
const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
|
|
39278
|
+
cwd,
|
|
39279
|
+
timeoutMs: agentTimeoutMs
|
|
38226
39280
|
});
|
|
39281
|
+
if (exitCode !== 0) {
|
|
39282
|
+
const trimmedErr = stderr.trim();
|
|
39283
|
+
throw new Error(
|
|
39284
|
+
trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
|
|
39285
|
+
);
|
|
39286
|
+
}
|
|
39287
|
+
return stdout.trim();
|
|
38227
39288
|
}
|
|
38228
39289
|
function parseJsonSafe(payload) {
|
|
38229
39290
|
try {
|
|
@@ -38237,6 +39298,33 @@ function substituteVariables(template, variables) {
|
|
|
38237
39298
|
return variables[varName] ?? match;
|
|
38238
39299
|
});
|
|
38239
39300
|
}
|
|
39301
|
+
/**
 * Structural equality for JSON-like values.
 *
 * Primitives compare with `===`. Arrays must have equal length and pairwise
 * equal elements. Other objects must have the same number of own enumerable
 * keys, each present on both sides with equal values. An array never equals
 * a non-array.
 *
 * @param {unknown} a - First value.
 * @param {unknown} b - Second value.
 * @returns {boolean} Whether the two values are structurally equal.
 */
function deepEqual(a, b) {
  if (a === b) {
    return true;
  }
  // Past this point two values can only be equal if both are non-null objects.
  const bothObjects = a !== null && b !== null && typeof a === "object" && typeof b === "object";
  if (!bothObjects) {
    return false;
  }
  const leftIsArray = Array.isArray(a);
  if (leftIsArray !== Array.isArray(b)) {
    return false;
  }
  if (leftIsArray) {
    return a.length === b.length && a.every((item, index) => deepEqual(item, b[index]));
  }
  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) {
    return false;
  }
  return leftKeys.every((key2) => Object.hasOwn(b, key2) && deepEqual(a[key2], b[key2]));
}
|
|
39318
|
+
/**
 * Checks whether a tool call's arguments satisfy an expectation.
 *
 * The match is partial: every key listed in `expected` must exist on
 * `actual` with a deeply equal value, and extra keys on `actual` are allowed.
 *
 * @param {object | "any" | undefined | null} expected - Expected arguments.
 *   `undefined`, `null`, and `"any"` all mean "no constraint".
 * @param {object | undefined | null} actual - Arguments of the actual call.
 * @returns {boolean} Whether `actual` satisfies `expected`.
 */
function argsMatch(expected, actual) {
  // `== null` also covers an explicit null expectation, which would
  // otherwise make Object.keys throw.
  if (expected == null) return true;
  if (expected === "any") return true;
  if (actual === void 0) return false;
  for (const key2 of Object.keys(expected)) {
    // Object.hasOwn throws on null, so a null argument object cannot satisfy
    // any expected key.
    if (actual === null || !Object.hasOwn(actual, key2)) return false;
    if (!deepEqual(expected[key2], actual[key2])) return false;
  }
  return true;
}
|
|
38240
39328
|
var ToolTrajectoryEvaluator = class {
|
|
38241
39329
|
kind = "tool_trajectory";
|
|
38242
39330
|
config;
|
|
@@ -38293,7 +39381,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38293
39381
|
for (const message of messages) {
|
|
38294
39382
|
if (message.toolCalls) {
|
|
38295
39383
|
for (const call of message.toolCalls) {
|
|
38296
|
-
toolCalls.push({
|
|
39384
|
+
toolCalls.push({
|
|
39385
|
+
name: call.tool,
|
|
39386
|
+
args: call.input
|
|
39387
|
+
});
|
|
38297
39388
|
}
|
|
38298
39389
|
}
|
|
38299
39390
|
}
|
|
@@ -38362,18 +39453,29 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38362
39453
|
const misses = [];
|
|
38363
39454
|
let actualIndex = 0;
|
|
38364
39455
|
for (let i = 0; i < expected.length; i++) {
|
|
38365
|
-
const
|
|
39456
|
+
const expectedItem = expected[i];
|
|
39457
|
+
const expectedTool = expectedItem.tool;
|
|
38366
39458
|
let found = false;
|
|
39459
|
+
let argsMismatch = false;
|
|
38367
39460
|
while (actualIndex < toolCalls.length) {
|
|
38368
|
-
|
|
38369
|
-
|
|
39461
|
+
const actualCall = toolCalls[actualIndex];
|
|
39462
|
+
if (actualCall.name === expectedTool) {
|
|
39463
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
39464
|
+
hits.push(`Found ${expectedTool} at position ${actualIndex}`);
|
|
39465
|
+
actualIndex++;
|
|
39466
|
+
found = true;
|
|
39467
|
+
break;
|
|
39468
|
+
}
|
|
39469
|
+
misses.push(
|
|
39470
|
+
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`
|
|
39471
|
+
);
|
|
38370
39472
|
actualIndex++;
|
|
38371
|
-
|
|
39473
|
+
argsMismatch = true;
|
|
38372
39474
|
break;
|
|
38373
39475
|
}
|
|
38374
39476
|
actualIndex++;
|
|
38375
39477
|
}
|
|
38376
|
-
if (!found) {
|
|
39478
|
+
if (!found && !argsMismatch) {
|
|
38377
39479
|
misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
|
|
38378
39480
|
}
|
|
38379
39481
|
}
|
|
@@ -38404,10 +39506,16 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38404
39506
|
}
|
|
38405
39507
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
38406
39508
|
for (let i = 0; i < checkLength; i++) {
|
|
38407
|
-
const
|
|
38408
|
-
const
|
|
39509
|
+
const expectedItem = expected[i];
|
|
39510
|
+
const expectedTool = expectedItem.tool;
|
|
39511
|
+
const actualCall = toolCalls[i];
|
|
39512
|
+
const actualTool = actualCall.name;
|
|
38409
39513
|
if (actualTool === expectedTool) {
|
|
38410
|
-
|
|
39514
|
+
if (argsMatch(expectedItem.args, actualCall.args)) {
|
|
39515
|
+
hits.push(`Position ${i}: ${expectedTool}`);
|
|
39516
|
+
} else {
|
|
39517
|
+
misses.push(`Position ${i}: ${expectedTool} args mismatch`);
|
|
39518
|
+
}
|
|
38411
39519
|
} else {
|
|
38412
39520
|
misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
|
|
38413
39521
|
}
|
|
@@ -39038,7 +40146,12 @@ async function runBatchEvaluation(options) {
|
|
|
39038
40146
|
const promptInputs = promptInputsList[i];
|
|
39039
40147
|
const providerResponse = batchResponse[i];
|
|
39040
40148
|
const outputMessages = providerResponse.outputMessages;
|
|
39041
|
-
const
|
|
40149
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
40150
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
40151
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
40152
|
+
costUsd: providerResponse.costUsd,
|
|
40153
|
+
durationMs: providerResponse.durationMs
|
|
40154
|
+
}) : void 0;
|
|
39042
40155
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
39043
40156
|
let result;
|
|
39044
40157
|
try {
|
|
@@ -39159,7 +40272,12 @@ async function runEvalCase(options) {
|
|
|
39159
40272
|
await cache.set(cacheKey, providerResponse);
|
|
39160
40273
|
}
|
|
39161
40274
|
const outputMessages = providerResponse.outputMessages;
|
|
39162
|
-
const
|
|
40275
|
+
const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
|
|
40276
|
+
const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
|
|
40277
|
+
tokenUsage: providerResponse.tokenUsage,
|
|
40278
|
+
costUsd: providerResponse.costUsd,
|
|
40279
|
+
durationMs: providerResponse.durationMs
|
|
40280
|
+
}) : void 0;
|
|
39163
40281
|
const candidate = extractLastAssistantContent(outputMessages);
|
|
39164
40282
|
try {
|
|
39165
40283
|
return await evaluateCandidate({
|
|
@@ -39232,21 +40350,21 @@ async function evaluateCandidate(options) {
|
|
|
39232
40350
|
}
|
|
39233
40351
|
return {
|
|
39234
40352
|
timestamp: completedAt.toISOString(),
|
|
39235
|
-
|
|
40353
|
+
evalId: evalCase.id,
|
|
39236
40354
|
dataset: evalCase.dataset,
|
|
39237
|
-
|
|
40355
|
+
conversationId: evalCase.conversation_id,
|
|
39238
40356
|
score: score.score,
|
|
39239
40357
|
hits: score.hits,
|
|
39240
40358
|
misses: score.misses,
|
|
39241
|
-
|
|
40359
|
+
candidateAnswer: candidate,
|
|
39242
40360
|
target: target.name,
|
|
39243
40361
|
reasoning: score.reasoning,
|
|
39244
|
-
|
|
39245
|
-
|
|
39246
|
-
|
|
39247
|
-
|
|
39248
|
-
|
|
39249
|
-
|
|
40362
|
+
rawAspects: score.rawAspects,
|
|
40363
|
+
agentProviderRequest,
|
|
40364
|
+
lmProviderRequest,
|
|
40365
|
+
evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
|
|
40366
|
+
evaluatorResults,
|
|
40367
|
+
traceSummary
|
|
39250
40368
|
};
|
|
39251
40369
|
}
|
|
39252
40370
|
async function runEvaluatorsForCase(options) {
|
|
@@ -39344,7 +40462,7 @@ async function runEvaluatorList(options) {
|
|
|
39344
40462
|
hits: score2.hits,
|
|
39345
40463
|
misses: score2.misses,
|
|
39346
40464
|
reasoning: score2.reasoning,
|
|
39347
|
-
|
|
40465
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
39348
40466
|
});
|
|
39349
40467
|
}
|
|
39350
40468
|
if (evaluator.type === "code") {
|
|
@@ -39375,11 +40493,11 @@ async function runEvaluatorList(options) {
|
|
|
39375
40493
|
hits: score2.hits,
|
|
39376
40494
|
misses: score2.misses,
|
|
39377
40495
|
reasoning: score2.reasoning,
|
|
39378
|
-
|
|
40496
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest
|
|
39379
40497
|
});
|
|
39380
40498
|
}
|
|
39381
40499
|
if (evaluator.type === "composite") {
|
|
39382
|
-
const evalFileDir = evalCase.guideline_paths[0] ?
|
|
40500
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path132.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
39383
40501
|
const createEvaluator = (memberConfig) => {
|
|
39384
40502
|
switch (memberConfig.type) {
|
|
39385
40503
|
case "llm_judge":
|
|
@@ -39432,8 +40550,8 @@ async function runEvaluatorList(options) {
|
|
|
39432
40550
|
hits: score2.hits,
|
|
39433
40551
|
misses: score2.misses,
|
|
39434
40552
|
reasoning: score2.reasoning,
|
|
39435
|
-
|
|
39436
|
-
|
|
40553
|
+
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
40554
|
+
evaluatorResults: mapChildResults(score2.evaluatorResults)
|
|
39437
40555
|
});
|
|
39438
40556
|
}
|
|
39439
40557
|
if (evaluator.type === "tool_trajectory") {
|
|
@@ -39591,22 +40709,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
|
|
|
39591
40709
|
async function dumpPrompt(directory, evalCase, promptInputs) {
|
|
39592
40710
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
39593
40711
|
const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
|
|
39594
|
-
const filePath =
|
|
39595
|
-
await
|
|
40712
|
+
const filePath = path132.resolve(directory, filename);
|
|
40713
|
+
await mkdir32(path132.dirname(filePath), { recursive: true });
|
|
39596
40714
|
const payload = {
|
|
39597
40715
|
eval_id: evalCase.id,
|
|
39598
40716
|
question: promptInputs.question,
|
|
39599
40717
|
guidelines: promptInputs.guidelines,
|
|
39600
40718
|
guideline_paths: evalCase.guideline_paths
|
|
39601
40719
|
};
|
|
39602
|
-
await
|
|
40720
|
+
await writeFile32(filePath, JSON.stringify(payload, null, 2), "utf8");
|
|
39603
40721
|
}
|
|
39604
40722
|
function sanitizeFilename(value) {
|
|
39605
40723
|
if (!value) {
|
|
39606
40724
|
return "prompt";
|
|
39607
40725
|
}
|
|
39608
40726
|
const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
|
|
39609
|
-
return sanitized.length > 0 ? sanitized :
|
|
40727
|
+
return sanitized.length > 0 ? sanitized : randomUUID3();
|
|
39610
40728
|
}
|
|
39611
40729
|
async function invokeProvider(provider, options) {
|
|
39612
40730
|
const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
|
|
@@ -39663,17 +40781,17 @@ function buildErrorResult(evalCase, targetName, timestamp, error40, promptInputs
|
|
|
39663
40781
|
}
|
|
39664
40782
|
return {
|
|
39665
40783
|
timestamp: timestamp.toISOString(),
|
|
39666
|
-
|
|
40784
|
+
evalId: evalCase.id,
|
|
39667
40785
|
dataset: evalCase.dataset,
|
|
39668
|
-
|
|
40786
|
+
conversationId: evalCase.conversation_id,
|
|
39669
40787
|
score: 0,
|
|
39670
40788
|
hits: [],
|
|
39671
40789
|
misses: [`Error: ${message}`],
|
|
39672
|
-
|
|
40790
|
+
candidateAnswer: `Error occurred: ${message}`,
|
|
39673
40791
|
target: targetName,
|
|
39674
|
-
|
|
39675
|
-
|
|
39676
|
-
|
|
40792
|
+
rawAspects: [],
|
|
40793
|
+
agentProviderRequest,
|
|
40794
|
+
lmProviderRequest,
|
|
39677
40795
|
error: message
|
|
39678
40796
|
};
|
|
39679
40797
|
}
|
|
@@ -39718,8 +40836,8 @@ function mapChildResults(children) {
|
|
|
39718
40836
|
hits: child.hits,
|
|
39719
40837
|
misses: child.misses,
|
|
39720
40838
|
reasoning: child.reasoning,
|
|
39721
|
-
|
|
39722
|
-
|
|
40839
|
+
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
40840
|
+
evaluatorResults: mapChildResults(child.evaluatorResults)
|
|
39723
40841
|
}));
|
|
39724
40842
|
}
|
|
39725
40843
|
function computeWeightedMean(entries) {
|
|
@@ -39810,10 +40928,10 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
|
|
|
39810
40928
|
}
|
|
39811
40929
|
|
|
39812
40930
|
// src/commands/convert/index.ts
|
|
39813
|
-
import { command, option, optional as
|
|
40931
|
+
import { command as command2, option as option2, optional as optional3, positional as positional2, string as string5 } from "cmd-ts";
|
|
39814
40932
|
import { stringify as stringifyYaml } from "yaml";
|
|
39815
40933
|
function convertJsonlToYaml(inputPath, outputPath) {
|
|
39816
|
-
const content =
|
|
40934
|
+
const content = readFileSync2(inputPath, "utf8");
|
|
39817
40935
|
const lines = content.trim().split("\n").filter((line2) => line2.trim());
|
|
39818
40936
|
let yamlOutput = "";
|
|
39819
40937
|
let isFirst = true;
|
|
@@ -39831,17 +40949,17 @@ function convertJsonlToYaml(inputPath, outputPath) {
|
|
|
39831
40949
|
writeFileSync(outputPath, yamlOutput);
|
|
39832
40950
|
return lines.length;
|
|
39833
40951
|
}
|
|
39834
|
-
var convertCommand =
|
|
40952
|
+
var convertCommand = command2({
|
|
39835
40953
|
name: "convert",
|
|
39836
40954
|
description: "Convert evaluation results from JSONL to YAML format",
|
|
39837
40955
|
args: {
|
|
39838
|
-
input:
|
|
39839
|
-
type:
|
|
40956
|
+
input: positional2({
|
|
40957
|
+
type: string5,
|
|
39840
40958
|
displayName: "input",
|
|
39841
40959
|
description: "Path to input JSONL file"
|
|
39842
40960
|
}),
|
|
39843
|
-
out:
|
|
39844
|
-
type:
|
|
40961
|
+
out: option2({
|
|
40962
|
+
type: optional3(string5),
|
|
39845
40963
|
long: "out",
|
|
39846
40964
|
short: "o",
|
|
39847
40965
|
description: "Output file path (defaults to input path with .yaml extension)"
|
|
@@ -39867,13 +40985,13 @@ var convertCommand = command({
|
|
|
39867
40985
|
import { stat as stat4 } from "node:fs/promises";
|
|
39868
40986
|
import path21 from "node:path";
|
|
39869
40987
|
import {
|
|
39870
|
-
command as
|
|
40988
|
+
command as command3,
|
|
39871
40989
|
flag,
|
|
39872
|
-
number as
|
|
39873
|
-
option as
|
|
39874
|
-
optional as
|
|
40990
|
+
number as number5,
|
|
40991
|
+
option as option3,
|
|
40992
|
+
optional as optional4,
|
|
39875
40993
|
restPositionals,
|
|
39876
|
-
string as
|
|
40994
|
+
string as string6
|
|
39877
40995
|
} from "cmd-ts";
|
|
39878
40996
|
import fg from "fast-glob";
|
|
39879
40997
|
|
|
@@ -39955,7 +41073,7 @@ async function loadEnvFromHierarchy(options) {
|
|
|
39955
41073
|
}
|
|
39956
41074
|
|
|
39957
41075
|
// src/commands/eval/jsonl-writer.ts
|
|
39958
|
-
import { createWriteStream as
|
|
41076
|
+
import { createWriteStream as createWriteStream3 } from "node:fs";
|
|
39959
41077
|
import { mkdir as mkdir5 } from "node:fs/promises";
|
|
39960
41078
|
import path16 from "node:path";
|
|
39961
41079
|
import { finished } from "node:stream/promises";
|
|
@@ -40176,7 +41294,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
40176
41294
|
}
|
|
40177
41295
|
static async open(filePath) {
|
|
40178
41296
|
await mkdir5(path16.dirname(filePath), { recursive: true });
|
|
40179
|
-
const stream =
|
|
41297
|
+
const stream = createWriteStream3(filePath, { flags: "w", encoding: "utf8" });
|
|
40180
41298
|
return new _JsonlWriter(stream);
|
|
40181
41299
|
}
|
|
40182
41300
|
async append(record2) {
|
|
@@ -40205,7 +41323,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
40205
41323
|
};
|
|
40206
41324
|
|
|
40207
41325
|
// src/commands/eval/yaml-writer.ts
|
|
40208
|
-
import { createWriteStream as
|
|
41326
|
+
import { createWriteStream as createWriteStream4 } from "node:fs";
|
|
40209
41327
|
import { mkdir as mkdir6 } from "node:fs/promises";
|
|
40210
41328
|
import path17 from "node:path";
|
|
40211
41329
|
import { finished as finished2 } from "node:stream/promises";
|
|
@@ -40220,7 +41338,7 @@ var YamlWriter = class _YamlWriter {
|
|
|
40220
41338
|
}
|
|
40221
41339
|
static async open(filePath) {
|
|
40222
41340
|
await mkdir6(path17.dirname(filePath), { recursive: true });
|
|
40223
|
-
const stream =
|
|
41341
|
+
const stream = createWriteStream4(filePath, { flags: "w", encoding: "utf8" });
|
|
40224
41342
|
return new _YamlWriter(stream);
|
|
40225
41343
|
}
|
|
40226
41344
|
async append(record2) {
|
|
@@ -40336,7 +41454,7 @@ var ProgressDisplay = class {
|
|
|
40336
41454
|
break;
|
|
40337
41455
|
}
|
|
40338
41456
|
}
|
|
40339
|
-
addLogPaths(paths) {
|
|
41457
|
+
addLogPaths(paths, provider) {
|
|
40340
41458
|
const newPaths = [];
|
|
40341
41459
|
for (const path28 of paths) {
|
|
40342
41460
|
if (this.logPathSet.has(path28)) {
|
|
@@ -40351,7 +41469,8 @@ var ProgressDisplay = class {
|
|
|
40351
41469
|
this.logPaths.push(...newPaths);
|
|
40352
41470
|
if (!this.hasPrintedLogHeader) {
|
|
40353
41471
|
console.log("");
|
|
40354
|
-
|
|
41472
|
+
const label = provider === "pi" ? "Pi Coding Agent" : "Codex CLI";
|
|
41473
|
+
console.log(`${label} logs:`);
|
|
40355
41474
|
this.hasPrintedLogHeader = true;
|
|
40356
41475
|
}
|
|
40357
41476
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
@@ -40419,7 +41538,7 @@ function buildHistogram(values) {
|
|
|
40419
41538
|
function calculateEvaluationSummary(results) {
|
|
40420
41539
|
const scores = results.map((result) => result.score);
|
|
40421
41540
|
const total = results.length;
|
|
40422
|
-
const errors = results.filter((result) => result.error !== void 0).map((result) => ({ evalId: result.
|
|
41541
|
+
const errors = results.filter((result) => result.error !== void 0).map((result) => ({ evalId: result.evalId, error: result.error }));
|
|
40423
41542
|
const errorCount = errors.length;
|
|
40424
41543
|
if (total === 0) {
|
|
40425
41544
|
return {
|
|
@@ -40500,11 +41619,11 @@ function formatEvaluationSummary(summary) {
|
|
|
40500
41619
|
}
|
|
40501
41620
|
lines.push("\nTop performing eval cases:");
|
|
40502
41621
|
summary.topResults.forEach((result, index) => {
|
|
40503
|
-
lines.push(` ${index + 1}. ${result.
|
|
41622
|
+
lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`);
|
|
40504
41623
|
});
|
|
40505
41624
|
lines.push("\nLowest performing eval cases:");
|
|
40506
41625
|
summary.bottomResults.forEach((result, index) => {
|
|
40507
|
-
lines.push(` ${index + 1}. ${result.
|
|
41626
|
+
lines.push(` ${index + 1}. ${result.evalId}: ${formatScore(result.score)}`);
|
|
40508
41627
|
});
|
|
40509
41628
|
return lines.join("\n");
|
|
40510
41629
|
}
|
|
@@ -40863,27 +41982,6 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
40863
41982
|
"trace"
|
|
40864
41983
|
// For testing tool_trajectory evaluator
|
|
40865
41984
|
]);
|
|
40866
|
-
var CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
40867
|
-
...COMMON_SETTINGS,
|
|
40868
|
-
"command_template",
|
|
40869
|
-
"commandTemplate",
|
|
40870
|
-
"verbose",
|
|
40871
|
-
"cli_verbose",
|
|
40872
|
-
"cliVerbose",
|
|
40873
|
-
"files_format",
|
|
40874
|
-
"filesFormat",
|
|
40875
|
-
"attachments_format",
|
|
40876
|
-
"attachmentsFormat",
|
|
40877
|
-
"cwd",
|
|
40878
|
-
"env",
|
|
40879
|
-
"timeout_seconds",
|
|
40880
|
-
"timeoutSeconds",
|
|
40881
|
-
"healthcheck",
|
|
40882
|
-
"keep_temp_files",
|
|
40883
|
-
"keepTempFiles",
|
|
40884
|
-
"keep_output_files",
|
|
40885
|
-
"keepOutputFiles"
|
|
40886
|
-
]);
|
|
40887
41985
|
function getKnownSettings(provider) {
|
|
40888
41986
|
const normalizedProvider = provider.toLowerCase();
|
|
40889
41987
|
switch (normalizedProvider) {
|
|
@@ -40905,7 +42003,7 @@ function getKnownSettings(provider) {
|
|
|
40905
42003
|
case "mock":
|
|
40906
42004
|
return MOCK_SETTINGS;
|
|
40907
42005
|
case "cli":
|
|
40908
|
-
return
|
|
42006
|
+
return null;
|
|
40909
42007
|
default:
|
|
40910
42008
|
return null;
|
|
40911
42009
|
}
|
|
@@ -40954,7 +42052,7 @@ async function validateTargetsFile(filePath) {
|
|
|
40954
42052
|
severity: "error",
|
|
40955
42053
|
filePath: absolutePath2,
|
|
40956
42054
|
location: `${location}.commandTemplate`,
|
|
40957
|
-
message: "CLI provider requires 'commandTemplate' as a non-empty string"
|
|
42055
|
+
message: "CLI provider requires 'command_template' or 'commandTemplate' as a non-empty string"
|
|
40958
42056
|
});
|
|
40959
42057
|
} else {
|
|
40960
42058
|
recordUnknownPlaceholders(
|
|
@@ -40964,58 +42062,10 @@ async function validateTargetsFile(filePath) {
|
|
|
40964
42062
|
errors2
|
|
40965
42063
|
);
|
|
40966
42064
|
}
|
|
40967
|
-
const attachmentsFormat = target.attachments_format ?? target.attachmentsFormat;
|
|
40968
|
-
if (attachmentsFormat !== void 0 && typeof attachmentsFormat !== "string") {
|
|
40969
|
-
errors2.push({
|
|
40970
|
-
severity: "error",
|
|
40971
|
-
filePath: absolutePath2,
|
|
40972
|
-
location: `${location}.attachmentsFormat`,
|
|
40973
|
-
message: "'attachmentsFormat' must be a string when provided"
|
|
40974
|
-
});
|
|
40975
|
-
}
|
|
40976
|
-
const filesFormat = target.files_format ?? target.filesFormat;
|
|
40977
|
-
if (filesFormat !== void 0 && typeof filesFormat !== "string") {
|
|
40978
|
-
errors2.push({
|
|
40979
|
-
severity: "error",
|
|
40980
|
-
filePath: absolutePath2,
|
|
40981
|
-
location: `${location}.filesFormat`,
|
|
40982
|
-
message: "'filesFormat' must be a string when provided"
|
|
40983
|
-
});
|
|
40984
|
-
}
|
|
40985
|
-
const cwd = target.cwd;
|
|
40986
|
-
if (cwd !== void 0 && typeof cwd !== "string") {
|
|
40987
|
-
errors2.push({
|
|
40988
|
-
severity: "error",
|
|
40989
|
-
filePath: absolutePath2,
|
|
40990
|
-
location: `${location}.cwd`,
|
|
40991
|
-
message: "'cwd' must be a string when provided"
|
|
40992
|
-
});
|
|
40993
|
-
}
|
|
40994
|
-
const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
|
|
40995
|
-
if (timeoutSeconds !== void 0) {
|
|
40996
|
-
const numericTimeout = Number(timeoutSeconds);
|
|
40997
|
-
if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
|
|
40998
|
-
errors2.push({
|
|
40999
|
-
severity: "error",
|
|
41000
|
-
filePath: absolutePath2,
|
|
41001
|
-
location: `${location}.timeoutSeconds`,
|
|
41002
|
-
message: "'timeoutSeconds' must be a positive number when provided"
|
|
41003
|
-
});
|
|
41004
|
-
}
|
|
41005
|
-
}
|
|
41006
42065
|
const healthcheck = target.healthcheck;
|
|
41007
42066
|
if (healthcheck !== void 0) {
|
|
41008
42067
|
validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
|
|
41009
42068
|
}
|
|
41010
|
-
const verbose = target.verbose ?? target.cli_verbose ?? target.cliVerbose;
|
|
41011
|
-
if (verbose !== void 0 && typeof verbose !== "boolean") {
|
|
41012
|
-
errors2.push({
|
|
41013
|
-
severity: "error",
|
|
41014
|
-
filePath: absolutePath2,
|
|
41015
|
-
location: `${location}.verbose`,
|
|
41016
|
-
message: "'verbose' must be a boolean when provided"
|
|
41017
|
-
});
|
|
41018
|
-
}
|
|
41019
42069
|
}
|
|
41020
42070
|
function validateCliHealthcheck(healthcheck, absolutePath2, location, errors2) {
|
|
41021
42071
|
if (!isObject22(healthcheck)) {
|
|
@@ -41639,12 +42689,12 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
41639
42689
|
const extension = getDefaultExtension(format);
|
|
41640
42690
|
return path20.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
41641
42691
|
}
|
|
41642
|
-
function resolvePromptDirectory(
|
|
41643
|
-
if (
|
|
42692
|
+
function resolvePromptDirectory(option6, cwd) {
|
|
42693
|
+
if (option6 === void 0) {
|
|
41644
42694
|
return void 0;
|
|
41645
42695
|
}
|
|
41646
|
-
if (typeof
|
|
41647
|
-
return path20.resolve(cwd,
|
|
42696
|
+
if (typeof option6 === "string" && option6.trim().length > 0) {
|
|
42697
|
+
return path20.resolve(cwd, option6);
|
|
41648
42698
|
}
|
|
41649
42699
|
return path20.join(cwd, ".agentv", "prompts");
|
|
41650
42700
|
}
|
|
@@ -41667,7 +42717,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
41667
42717
|
setTotal: (total) => display.setTotalTests(total),
|
|
41668
42718
|
update: (workerId, progress) => display.updateWorker({ ...progress, workerId }),
|
|
41669
42719
|
finish: () => display.finish(),
|
|
41670
|
-
addLogPaths: (paths) => display.addLogPaths(paths)
|
|
42720
|
+
addLogPaths: (paths, provider) => display.addLogPaths(paths, provider)
|
|
41671
42721
|
};
|
|
41672
42722
|
}
|
|
41673
42723
|
function makeEvalKey(testFilePath, evalId) {
|
|
@@ -41885,7 +42935,15 @@ async function runEvalCommand(input) {
|
|
|
41885
42935
|
return;
|
|
41886
42936
|
}
|
|
41887
42937
|
seenCodexLogPaths.add(entry.filePath);
|
|
41888
|
-
progressReporter.addLogPaths([entry.filePath]);
|
|
42938
|
+
progressReporter.addLogPaths([entry.filePath], "codex");
|
|
42939
|
+
});
|
|
42940
|
+
const seenPiLogPaths = /* @__PURE__ */ new Set();
|
|
42941
|
+
const unsubscribePiLogs = subscribeToPiLogEntries((entry) => {
|
|
42942
|
+
if (!entry.filePath || seenPiLogPaths.has(entry.filePath)) {
|
|
42943
|
+
return;
|
|
42944
|
+
}
|
|
42945
|
+
seenPiLogPaths.add(entry.filePath);
|
|
42946
|
+
progressReporter.addLogPaths([entry.filePath], "pi");
|
|
41889
42947
|
});
|
|
41890
42948
|
for (const [testFilePath, meta] of fileMetadata.entries()) {
|
|
41891
42949
|
for (const evalId of meta.evalIds) {
|
|
@@ -41939,6 +42997,7 @@ Results written to: ${outputPath}`);
|
|
|
41939
42997
|
}
|
|
41940
42998
|
} finally {
|
|
41941
42999
|
unsubscribeCodexLogs();
|
|
43000
|
+
unsubscribePiLogs();
|
|
41942
43001
|
await outputWriter.close().catch(() => void 0);
|
|
41943
43002
|
}
|
|
41944
43003
|
}
|
|
@@ -41960,44 +43019,44 @@ async function resolveEvaluationRunner() {
|
|
|
41960
43019
|
}
|
|
41961
43020
|
|
|
41962
43021
|
// src/commands/eval/index.ts
|
|
41963
|
-
var evalCommand =
|
|
43022
|
+
var evalCommand = command3({
|
|
41964
43023
|
name: "eval",
|
|
41965
43024
|
description: "Run eval suites and report results",
|
|
41966
43025
|
args: {
|
|
41967
43026
|
evalPaths: restPositionals({
|
|
41968
|
-
type:
|
|
43027
|
+
type: string6,
|
|
41969
43028
|
displayName: "eval-paths",
|
|
41970
43029
|
description: "Path(s) or glob(s) to evaluation .yaml file(s)"
|
|
41971
43030
|
}),
|
|
41972
|
-
target:
|
|
41973
|
-
type:
|
|
43031
|
+
target: option3({
|
|
43032
|
+
type: string6,
|
|
41974
43033
|
long: "target",
|
|
41975
43034
|
description: "Override target name from targets.yaml",
|
|
41976
43035
|
defaultValue: () => "default"
|
|
41977
43036
|
}),
|
|
41978
|
-
targets:
|
|
41979
|
-
type:
|
|
43037
|
+
targets: option3({
|
|
43038
|
+
type: optional4(string6),
|
|
41980
43039
|
long: "targets",
|
|
41981
43040
|
description: "Path to targets.yaml (overrides discovery)"
|
|
41982
43041
|
}),
|
|
41983
|
-
evalId:
|
|
41984
|
-
type:
|
|
43042
|
+
evalId: option3({
|
|
43043
|
+
type: optional4(string6),
|
|
41985
43044
|
long: "eval-id",
|
|
41986
43045
|
description: "Run only the eval case with this identifier"
|
|
41987
43046
|
}),
|
|
41988
|
-
workers:
|
|
41989
|
-
type:
|
|
43047
|
+
workers: option3({
|
|
43048
|
+
type: number5,
|
|
41990
43049
|
long: "workers",
|
|
41991
43050
|
description: "Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml",
|
|
41992
43051
|
defaultValue: () => 3
|
|
41993
43052
|
}),
|
|
41994
|
-
out:
|
|
41995
|
-
type:
|
|
43053
|
+
out: option3({
|
|
43054
|
+
type: optional4(string6),
|
|
41996
43055
|
long: "out",
|
|
41997
43056
|
description: "Write results to the specified path"
|
|
41998
43057
|
}),
|
|
41999
|
-
outputFormat:
|
|
42000
|
-
type:
|
|
43058
|
+
outputFormat: option3({
|
|
43059
|
+
type: string6,
|
|
42001
43060
|
long: "output-format",
|
|
42002
43061
|
description: "Output format: 'jsonl' or 'yaml' (default: jsonl)",
|
|
42003
43062
|
defaultValue: () => "jsonl"
|
|
@@ -42006,32 +43065,32 @@ var evalCommand = command2({
|
|
|
42006
43065
|
long: "dry-run",
|
|
42007
43066
|
description: "Use mock provider responses instead of real LLM calls"
|
|
42008
43067
|
}),
|
|
42009
|
-
dryRunDelay:
|
|
42010
|
-
type:
|
|
43068
|
+
dryRunDelay: option3({
|
|
43069
|
+
type: number5,
|
|
42011
43070
|
long: "dry-run-delay",
|
|
42012
43071
|
description: "Fixed delay in milliseconds for dry-run mode (overridden by delay range if specified)",
|
|
42013
43072
|
defaultValue: () => 0
|
|
42014
43073
|
}),
|
|
42015
|
-
dryRunDelayMin:
|
|
42016
|
-
type:
|
|
43074
|
+
dryRunDelayMin: option3({
|
|
43075
|
+
type: number5,
|
|
42017
43076
|
long: "dry-run-delay-min",
|
|
42018
43077
|
description: "Minimum delay in milliseconds for dry-run mode (requires --dry-run-delay-max)",
|
|
42019
43078
|
defaultValue: () => 0
|
|
42020
43079
|
}),
|
|
42021
|
-
dryRunDelayMax:
|
|
42022
|
-
type:
|
|
43080
|
+
dryRunDelayMax: option3({
|
|
43081
|
+
type: number5,
|
|
42023
43082
|
long: "dry-run-delay-max",
|
|
42024
43083
|
description: "Maximum delay in milliseconds for dry-run mode (requires --dry-run-delay-min)",
|
|
42025
43084
|
defaultValue: () => 0
|
|
42026
43085
|
}),
|
|
42027
|
-
agentTimeout:
|
|
42028
|
-
type:
|
|
43086
|
+
agentTimeout: option3({
|
|
43087
|
+
type: number5,
|
|
42029
43088
|
long: "agent-timeout",
|
|
42030
43089
|
description: "Timeout in seconds for provider responses (default: 120)",
|
|
42031
43090
|
defaultValue: () => 120
|
|
42032
43091
|
}),
|
|
42033
|
-
maxRetries:
|
|
42034
|
-
type:
|
|
43092
|
+
maxRetries: option3({
|
|
43093
|
+
type: number5,
|
|
42035
43094
|
long: "max-retries",
|
|
42036
43095
|
description: "Retry count for timeout recoveries (default: 2)",
|
|
42037
43096
|
defaultValue: () => 2
|
|
@@ -42044,8 +43103,8 @@ var evalCommand = command2({
|
|
|
42044
43103
|
long: "verbose",
|
|
42045
43104
|
description: "Enable verbose logging"
|
|
42046
43105
|
}),
|
|
42047
|
-
dumpPrompts:
|
|
42048
|
-
type:
|
|
43106
|
+
dumpPrompts: option3({
|
|
43107
|
+
type: optional4(string6),
|
|
42049
43108
|
long: "dump-prompts",
|
|
42050
43109
|
description: "Directory path for persisting prompt payloads for debugging"
|
|
42051
43110
|
}),
|
|
@@ -42131,7 +43190,7 @@ async function resolveEvalPaths(evalPaths, cwd) {
|
|
|
42131
43190
|
}
|
|
42132
43191
|
|
|
42133
43192
|
// src/commands/generate/index.ts
|
|
42134
|
-
import { command as
|
|
43193
|
+
import { command as command4, flag as flag2, option as option4, optional as optional5, positional as positional4, string as string7, subcommands } from "cmd-ts";
|
|
42135
43194
|
|
|
42136
43195
|
// src/commands/generate/rubrics.ts
|
|
42137
43196
|
import { readFile as readFile8, writeFile as writeFile6 } from "node:fs/promises";
|
|
@@ -42274,17 +43333,17 @@ function extractQuestion(evalCase) {
|
|
|
42274
43333
|
}
|
|
42275
43334
|
|
|
42276
43335
|
// src/commands/generate/index.ts
|
|
42277
|
-
var rubricsCommand =
|
|
43336
|
+
var rubricsCommand = command4({
|
|
42278
43337
|
name: "rubrics",
|
|
42279
43338
|
description: "Generate rubrics from expected_outcome in YAML eval file",
|
|
42280
43339
|
args: {
|
|
42281
|
-
file:
|
|
42282
|
-
type:
|
|
43340
|
+
file: positional4({
|
|
43341
|
+
type: string7,
|
|
42283
43342
|
displayName: "file",
|
|
42284
43343
|
description: "Path to YAML eval file"
|
|
42285
43344
|
}),
|
|
42286
|
-
target:
|
|
42287
|
-
type:
|
|
43345
|
+
target: option4({
|
|
43346
|
+
type: optional5(string7),
|
|
42288
43347
|
long: "target",
|
|
42289
43348
|
short: "t",
|
|
42290
43349
|
description: "Override target for rubric generation (default: file target or openai:gpt-4o)"
|
|
@@ -42320,10 +43379,10 @@ var generateCommand = subcommands({
|
|
|
42320
43379
|
import { existsSync, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
|
|
42321
43380
|
import path26 from "node:path";
|
|
42322
43381
|
import * as readline from "node:readline/promises";
|
|
42323
|
-
import { command as
|
|
43382
|
+
import { command as command5, option as option5, optional as optional6, string as string8 } from "cmd-ts";
|
|
42324
43383
|
|
|
42325
43384
|
// src/templates/index.ts
|
|
42326
|
-
import { readFileSync as
|
|
43385
|
+
import { readFileSync as readFileSync3, readdirSync, statSync } from "node:fs";
|
|
42327
43386
|
import path25 from "node:path";
|
|
42328
43387
|
import { fileURLToPath } from "node:url";
|
|
42329
43388
|
function getGithubTemplates() {
|
|
@@ -42355,7 +43414,7 @@ function readTemplatesRecursively(dir, relativePath) {
|
|
|
42355
43414
|
if (stat6.isDirectory()) {
|
|
42356
43415
|
templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
|
|
42357
43416
|
} else {
|
|
42358
|
-
const content =
|
|
43417
|
+
const content = readFileSync3(fullPath, "utf-8");
|
|
42359
43418
|
templates.push({
|
|
42360
43419
|
path: entryRelativePath.split(path25.sep).join("/"),
|
|
42361
43420
|
// Normalize to forward slashes
|
|
@@ -42499,12 +43558,12 @@ Files installed to ${path26.relative(targetPath, claudeDir)}:`);
|
|
|
42499
43558
|
console.log(" 2. Configure targets in .agentv/targets.yaml");
|
|
42500
43559
|
console.log(" 3. Create eval files using the schema and prompt templates");
|
|
42501
43560
|
}
|
|
42502
|
-
var initCmdTsCommand =
|
|
43561
|
+
var initCmdTsCommand = command5({
|
|
42503
43562
|
name: "init",
|
|
42504
43563
|
description: "Initialize AgentV in your project (installs prompt templates and schema to .github)",
|
|
42505
43564
|
args: {
|
|
42506
|
-
path:
|
|
42507
|
-
type:
|
|
43565
|
+
path: option5({
|
|
43566
|
+
type: optional6(string8),
|
|
42508
43567
|
long: "path",
|
|
42509
43568
|
description: "Target directory for initialization (default: current directory)"
|
|
42510
43569
|
})
|
|
@@ -42520,7 +43579,7 @@ var initCmdTsCommand = command4({
|
|
|
42520
43579
|
});
|
|
42521
43580
|
|
|
42522
43581
|
// src/commands/validate/index.ts
|
|
42523
|
-
import { command as
|
|
43582
|
+
import { command as command6, restPositionals as restPositionals2, string as string9 } from "cmd-ts";
|
|
42524
43583
|
|
|
42525
43584
|
// src/commands/validate/format-output.ts
|
|
42526
43585
|
var ANSI_RED3 = "\x1B[31m";
|
|
@@ -42706,12 +43765,12 @@ async function runValidateCommand(paths) {
|
|
|
42706
43765
|
process.exit(1);
|
|
42707
43766
|
}
|
|
42708
43767
|
}
|
|
42709
|
-
var validateCommand =
|
|
43768
|
+
var validateCommand = command6({
|
|
42710
43769
|
name: "validate",
|
|
42711
43770
|
description: "Validate AgentV eval and targets YAML files",
|
|
42712
43771
|
args: {
|
|
42713
43772
|
paths: restPositionals2({
|
|
42714
|
-
type:
|
|
43773
|
+
type: string9,
|
|
42715
43774
|
displayName: "paths",
|
|
42716
43775
|
description: "Files or directories to validate"
|
|
42717
43776
|
})
|
|
@@ -42727,12 +43786,13 @@ var validateCommand = command5({
|
|
|
42727
43786
|
});
|
|
42728
43787
|
|
|
42729
43788
|
// src/index.ts
|
|
42730
|
-
var packageJson = JSON.parse(
|
|
43789
|
+
var packageJson = JSON.parse(readFileSync4(new URL("../package.json", import.meta.url), "utf8"));
|
|
42731
43790
|
var app = subcommands2({
|
|
42732
43791
|
name: "agentv",
|
|
42733
43792
|
description: "AgentV CLI",
|
|
42734
43793
|
version: packageJson.version,
|
|
42735
43794
|
cmds: {
|
|
43795
|
+
compare: compareCommand,
|
|
42736
43796
|
convert: convertCommand,
|
|
42737
43797
|
eval: evalCommand,
|
|
42738
43798
|
generate: generateCommand,
|
|
@@ -42748,4 +43808,4 @@ export {
|
|
|
42748
43808
|
app,
|
|
42749
43809
|
runCli
|
|
42750
43810
|
};
|
|
42751
|
-
//# sourceMappingURL=chunk-
|
|
43811
|
+
//# sourceMappingURL=chunk-HU4B6ODF.js.map
|