agentv 1.0.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-RIJO5WBF.js → chunk-6R2YRXCQ.js} +287 -405
- package/dist/chunk-6R2YRXCQ.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index.js +1 -1
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +40 -19
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +288 -0
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +100 -41
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +10 -68
- package/package.json +2 -2
- package/dist/chunk-RIJO5WBF.js.map +0 -1
|
@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
|
|
|
164
164
|
import path19 from "node:path";
|
|
165
165
|
import { pathToFileURL } from "node:url";
|
|
166
166
|
|
|
167
|
-
// ../../packages/core/dist/chunk-
|
|
167
|
+
// ../../packages/core/dist/chunk-4A6L2F6L.js
|
|
168
168
|
import { constants } from "node:fs";
|
|
169
169
|
import { access, readFile } from "node:fs/promises";
|
|
170
170
|
import path from "node:path";
|
|
@@ -4211,7 +4211,7 @@ var coerce = {
|
|
|
4211
4211
|
};
|
|
4212
4212
|
var NEVER = INVALID;
|
|
4213
4213
|
|
|
4214
|
-
// ../../packages/core/dist/chunk-
|
|
4214
|
+
// ../../packages/core/dist/chunk-4A6L2F6L.js
|
|
4215
4215
|
async function fileExists(filePath) {
|
|
4216
4216
|
try {
|
|
4217
4217
|
await access(filePath, constants.F_OK);
|
|
@@ -4612,10 +4612,14 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4612
4612
|
const filesFormat = resolveOptionalLiteralString(
|
|
4613
4613
|
target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
|
|
4614
4614
|
);
|
|
4615
|
+
const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
|
|
4615
4616
|
let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
|
|
4616
4617
|
allowLiteral: true,
|
|
4617
4618
|
optionalEnv: true
|
|
4618
4619
|
});
|
|
4620
|
+
if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
|
|
4621
|
+
cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
|
|
4622
|
+
}
|
|
4619
4623
|
if (!cwd && evalFilePath) {
|
|
4620
4624
|
cwd = path2.dirname(path2.resolve(evalFilePath));
|
|
4621
4625
|
}
|
|
@@ -4623,7 +4627,7 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4623
4627
|
target.timeout_seconds ?? target.timeoutSeconds,
|
|
4624
4628
|
`${target.name} timeout`
|
|
4625
4629
|
);
|
|
4626
|
-
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
|
|
4630
|
+
const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
|
|
4627
4631
|
const commandTemplate = resolveString(
|
|
4628
4632
|
commandTemplateSource,
|
|
4629
4633
|
env,
|
|
@@ -4636,7 +4640,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
4636
4640
|
filesFormat,
|
|
4637
4641
|
cwd,
|
|
4638
4642
|
timeoutMs,
|
|
4639
|
-
healthcheck
|
|
4643
|
+
healthcheck,
|
|
4644
|
+
verbose
|
|
4640
4645
|
};
|
|
4641
4646
|
}
|
|
4642
4647
|
function resolveTimeoutMs(source2, description) {
|
|
@@ -4649,7 +4654,7 @@ function resolveTimeoutMs(source2, description) {
|
|
|
4649
4654
|
}
|
|
4650
4655
|
return Math.floor(seconds * 1e3);
|
|
4651
4656
|
}
|
|
4652
|
-
function resolveCliHealthcheck(source2, env, targetName) {
|
|
4657
|
+
function resolveCliHealthcheck(source2, env, targetName, evalFilePath) {
|
|
4653
4658
|
if (source2 === void 0 || source2 === null) {
|
|
4654
4659
|
return void 0;
|
|
4655
4660
|
}
|
|
@@ -4682,11 +4687,12 @@ function resolveCliHealthcheck(source2, env, targetName) {
|
|
|
4682
4687
|
allowLiteral: true,
|
|
4683
4688
|
optionalEnv: true
|
|
4684
4689
|
});
|
|
4690
|
+
const resolvedCwd = cwd && evalFilePath && !path2.isAbsolute(cwd) ? path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd) : cwd;
|
|
4685
4691
|
return {
|
|
4686
4692
|
type: "command",
|
|
4687
4693
|
commandTemplate,
|
|
4688
4694
|
timeoutMs,
|
|
4689
|
-
cwd
|
|
4695
|
+
cwd: resolvedCwd
|
|
4690
4696
|
};
|
|
4691
4697
|
}
|
|
4692
4698
|
throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
|
|
@@ -34567,18 +34573,23 @@ function isTestMessage(value) {
|
|
|
34567
34573
|
if (typeof candidate.content === "string") {
|
|
34568
34574
|
return true;
|
|
34569
34575
|
}
|
|
34570
|
-
if (
|
|
34571
|
-
return
|
|
34576
|
+
if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
|
|
34577
|
+
return true;
|
|
34578
|
+
}
|
|
34579
|
+
if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
|
|
34580
|
+
return true;
|
|
34581
|
+
}
|
|
34582
|
+
if (isJsonObject(candidate.content)) {
|
|
34583
|
+
return true;
|
|
34572
34584
|
}
|
|
34573
|
-
return
|
|
34585
|
+
return false;
|
|
34574
34586
|
}
|
|
34575
34587
|
var EVALUATOR_KIND_VALUES = [
|
|
34576
34588
|
"code_judge",
|
|
34577
34589
|
"llm_judge",
|
|
34578
34590
|
"rubric",
|
|
34579
34591
|
"composite",
|
|
34580
|
-
"tool_trajectory",
|
|
34581
|
-
"expected_tool_calls"
|
|
34592
|
+
"tool_trajectory"
|
|
34582
34593
|
];
|
|
34583
34594
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34584
34595
|
function isEvaluatorKind(value) {
|
|
@@ -35058,15 +35069,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
35058
35069
|
});
|
|
35059
35070
|
continue;
|
|
35060
35071
|
}
|
|
35061
|
-
if (typeValue === "expected_tool_calls") {
|
|
35062
|
-
const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
|
|
35063
|
-
evaluators.push({
|
|
35064
|
-
name: name16,
|
|
35065
|
-
type: "expected_tool_calls",
|
|
35066
|
-
...weight2 !== void 0 ? { weight: weight2 } : {}
|
|
35067
|
-
});
|
|
35068
|
-
continue;
|
|
35069
|
-
}
|
|
35070
35072
|
if (typeValue === "tool_trajectory") {
|
|
35071
35073
|
const mode = asString2(rawEvaluator.mode);
|
|
35072
35074
|
if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
|
|
@@ -35257,6 +35259,17 @@ async function processMessages(options) {
|
|
|
35257
35259
|
}
|
|
35258
35260
|
continue;
|
|
35259
35261
|
}
|
|
35262
|
+
if (isJsonObject(content)) {
|
|
35263
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
35264
|
+
segments.push({ type: "text", value: rendered });
|
|
35265
|
+
if (textParts) {
|
|
35266
|
+
textParts.push(rendered);
|
|
35267
|
+
}
|
|
35268
|
+
continue;
|
|
35269
|
+
}
|
|
35270
|
+
if (!Array.isArray(content)) {
|
|
35271
|
+
continue;
|
|
35272
|
+
}
|
|
35260
35273
|
for (const rawSegment of content) {
|
|
35261
35274
|
if (!isJsonObject(rawSegment)) {
|
|
35262
35275
|
continue;
|
|
@@ -35317,63 +35330,6 @@ async function processMessages(options) {
|
|
|
35317
35330
|
}
|
|
35318
35331
|
return segments;
|
|
35319
35332
|
}
|
|
35320
|
-
async function resolveAssistantContent(content, searchRoots, verbose) {
|
|
35321
|
-
if (typeof content === "string") {
|
|
35322
|
-
return content;
|
|
35323
|
-
}
|
|
35324
|
-
if (!content) {
|
|
35325
|
-
return "";
|
|
35326
|
-
}
|
|
35327
|
-
const parts = [];
|
|
35328
|
-
for (const entry of content) {
|
|
35329
|
-
if (typeof entry === "string") {
|
|
35330
|
-
parts.push({ content: entry, isFile: false });
|
|
35331
|
-
continue;
|
|
35332
|
-
}
|
|
35333
|
-
if (!isJsonObject(entry)) {
|
|
35334
|
-
continue;
|
|
35335
|
-
}
|
|
35336
|
-
const segmentType = asString3(entry.type);
|
|
35337
|
-
if (segmentType === "file") {
|
|
35338
|
-
const rawValue = asString3(entry.value);
|
|
35339
|
-
if (!rawValue) {
|
|
35340
|
-
continue;
|
|
35341
|
-
}
|
|
35342
|
-
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
35343
|
-
rawValue,
|
|
35344
|
-
searchRoots
|
|
35345
|
-
);
|
|
35346
|
-
if (!resolvedPath) {
|
|
35347
|
-
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
35348
|
-
logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
|
|
35349
|
-
continue;
|
|
35350
|
-
}
|
|
35351
|
-
try {
|
|
35352
|
-
const fileContent = (await readFile32(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
|
|
35353
|
-
parts.push({ content: fileContent, isFile: true, displayPath });
|
|
35354
|
-
if (verbose) {
|
|
35355
|
-
console.log(` [Expected Assistant File] Found: ${displayPath}`);
|
|
35356
|
-
console.log(` Resolved to: ${resolvedPath}`);
|
|
35357
|
-
}
|
|
35358
|
-
} catch (error40) {
|
|
35359
|
-
logWarning3(`Could not read file ${resolvedPath}: ${error40.message}`);
|
|
35360
|
-
}
|
|
35361
|
-
continue;
|
|
35362
|
-
}
|
|
35363
|
-
const textValue = asString3(entry.text);
|
|
35364
|
-
if (typeof textValue === "string") {
|
|
35365
|
-
parts.push({ content: textValue, isFile: false });
|
|
35366
|
-
continue;
|
|
35367
|
-
}
|
|
35368
|
-
const valueValue = asString3(entry.value);
|
|
35369
|
-
if (typeof valueValue === "string") {
|
|
35370
|
-
parts.push({ content: valueValue, isFile: false });
|
|
35371
|
-
continue;
|
|
35372
|
-
}
|
|
35373
|
-
parts.push({ content: JSON.stringify(entry), isFile: false });
|
|
35374
|
-
}
|
|
35375
|
-
return formatFileContents(parts);
|
|
35376
|
-
}
|
|
35377
35333
|
function asString3(value) {
|
|
35378
35334
|
return typeof value === "string" ? value : void 0;
|
|
35379
35335
|
}
|
|
@@ -35406,14 +35362,15 @@ ${detailBlock}${ANSI_RESET4}`);
|
|
|
35406
35362
|
}
|
|
35407
35363
|
}
|
|
35408
35364
|
async function processExpectedMessages(options) {
|
|
35409
|
-
const { messages, searchRoots,
|
|
35365
|
+
const { messages, searchRoots, verbose } = options;
|
|
35410
35366
|
const segments = [];
|
|
35411
35367
|
for (const message of messages) {
|
|
35368
|
+
const extendedMessage = message;
|
|
35412
35369
|
const segment = {
|
|
35413
35370
|
role: message.role
|
|
35414
35371
|
};
|
|
35415
|
-
if (
|
|
35416
|
-
segment.
|
|
35372
|
+
if (extendedMessage.name) {
|
|
35373
|
+
segment.name = extendedMessage.name;
|
|
35417
35374
|
}
|
|
35418
35375
|
const content = message.content;
|
|
35419
35376
|
if (typeof content === "string") {
|
|
@@ -35461,6 +35418,13 @@ async function processExpectedMessages(options) {
|
|
|
35461
35418
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
35462
35419
|
}
|
|
35463
35420
|
segment.content = processedContent;
|
|
35421
|
+
} else if (isJsonObject(content)) {
|
|
35422
|
+
segment.content = cloneJsonObject(content);
|
|
35423
|
+
}
|
|
35424
|
+
if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
|
|
35425
|
+
segment.tool_calls = extendedMessage.tool_calls.map(
|
|
35426
|
+
(tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
|
|
35427
|
+
);
|
|
35464
35428
|
}
|
|
35465
35429
|
segments.push(segment);
|
|
35466
35430
|
}
|
|
@@ -35528,6 +35492,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
|
|
|
35528
35492
|
}
|
|
35529
35493
|
}
|
|
35530
35494
|
}
|
|
35495
|
+
} else if (isJsonObject(message.content)) {
|
|
35496
|
+
const rendered = JSON.stringify(message.content, null, 2);
|
|
35497
|
+
if (rendered.trim().length > 0) {
|
|
35498
|
+
messageSegments.push({ type: "text", value: rendered });
|
|
35499
|
+
}
|
|
35531
35500
|
}
|
|
35532
35501
|
segmentsByMessage.push(messageSegments);
|
|
35533
35502
|
}
|
|
@@ -35749,9 +35718,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35749
35718
|
logError(`No valid expected message found for eval case: ${id}`);
|
|
35750
35719
|
continue;
|
|
35751
35720
|
}
|
|
35752
|
-
if (expectedMessages.length > 1) {
|
|
35753
|
-
logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
|
|
35754
|
-
}
|
|
35755
35721
|
const guidelinePaths = [];
|
|
35756
35722
|
const inputTextParts = [];
|
|
35757
35723
|
const inputSegments = await processMessages({
|
|
@@ -35771,8 +35737,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35771
35737
|
verbose
|
|
35772
35738
|
}) : [];
|
|
35773
35739
|
const codeSnippets = extractCodeBlocks(inputSegments);
|
|
35774
|
-
|
|
35775
|
-
|
|
35740
|
+
let referenceAnswer = "";
|
|
35741
|
+
if (outputSegments.length > 1) {
|
|
35742
|
+
referenceAnswer = JSON.stringify(outputSegments, null, 2);
|
|
35743
|
+
} else if (outputSegments.length === 1) {
|
|
35744
|
+
const singleMessage = outputSegments[0];
|
|
35745
|
+
if (typeof singleMessage.content === "string") {
|
|
35746
|
+
referenceAnswer = singleMessage.content;
|
|
35747
|
+
} else if (singleMessage.content) {
|
|
35748
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
35749
|
+
} else if (singleMessage.tool_calls) {
|
|
35750
|
+
referenceAnswer = JSON.stringify(singleMessage, null, 2);
|
|
35751
|
+
}
|
|
35752
|
+
}
|
|
35776
35753
|
const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
|
|
35777
35754
|
const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
|
|
35778
35755
|
let evaluators;
|
|
@@ -35827,7 +35804,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35827
35804
|
question,
|
|
35828
35805
|
input_messages: inputMessages,
|
|
35829
35806
|
input_segments: inputSegments,
|
|
35830
|
-
|
|
35807
|
+
expected_messages: outputSegments,
|
|
35831
35808
|
reference_answer: referenceAnswer,
|
|
35832
35809
|
guideline_paths: guidelinePaths.map((guidelinePath) => path62.resolve(guidelinePath)),
|
|
35833
35810
|
guideline_patterns: guidelinePatterns,
|
|
@@ -36238,7 +36215,7 @@ var CliProvider = class {
|
|
|
36238
36215
|
id;
|
|
36239
36216
|
kind = "cli";
|
|
36240
36217
|
targetName;
|
|
36241
|
-
supportsBatch = false;
|
|
36218
|
+
supportsBatch = true;
|
|
36242
36219
|
config;
|
|
36243
36220
|
runCommand;
|
|
36244
36221
|
verbose;
|
|
@@ -36258,6 +36235,11 @@ var CliProvider = class {
|
|
|
36258
36235
|
const outputFilePath = generateOutputFilePath(request.evalCaseId);
|
|
36259
36236
|
const templateValues = buildTemplateValues(request, this.config, outputFilePath);
|
|
36260
36237
|
const renderedCommand = renderTemplate2(this.config.commandTemplate, templateValues);
|
|
36238
|
+
if (this.verbose) {
|
|
36239
|
+
console.log(
|
|
36240
|
+
`[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36241
|
+
);
|
|
36242
|
+
}
|
|
36261
36243
|
const result = await this.runCommand(renderedCommand, {
|
|
36262
36244
|
cwd: this.config.cwd,
|
|
36263
36245
|
env: process.env,
|
|
@@ -36292,6 +36274,114 @@ var CliProvider = class {
|
|
|
36292
36274
|
}
|
|
36293
36275
|
};
|
|
36294
36276
|
}
|
|
36277
|
+
async invokeBatch(requests) {
|
|
36278
|
+
if (requests.length === 0) {
|
|
36279
|
+
return [];
|
|
36280
|
+
}
|
|
36281
|
+
for (const request of requests) {
|
|
36282
|
+
if (request.signal?.aborted) {
|
|
36283
|
+
throw new Error("CLI provider batch request was aborted before execution");
|
|
36284
|
+
}
|
|
36285
|
+
}
|
|
36286
|
+
const controller = new AbortController();
|
|
36287
|
+
for (const request of requests) {
|
|
36288
|
+
request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
|
|
36289
|
+
}
|
|
36290
|
+
await this.ensureHealthy(controller.signal);
|
|
36291
|
+
const outputFilePath = generateOutputFilePath("batch", ".jsonl");
|
|
36292
|
+
const batchInputFiles = [];
|
|
36293
|
+
for (const request of requests) {
|
|
36294
|
+
if (request.inputFiles && request.inputFiles.length > 0) {
|
|
36295
|
+
batchInputFiles.push(...request.inputFiles);
|
|
36296
|
+
}
|
|
36297
|
+
}
|
|
36298
|
+
const templateValues = buildTemplateValues(
|
|
36299
|
+
{
|
|
36300
|
+
question: "",
|
|
36301
|
+
guidelines: "",
|
|
36302
|
+
inputFiles: batchInputFiles,
|
|
36303
|
+
evalCaseId: "batch",
|
|
36304
|
+
attempt: 0
|
|
36305
|
+
},
|
|
36306
|
+
this.config,
|
|
36307
|
+
outputFilePath
|
|
36308
|
+
);
|
|
36309
|
+
const renderedCommand = renderTemplate2(this.config.commandTemplate, templateValues);
|
|
36310
|
+
if (this.verbose) {
|
|
36311
|
+
console.log(
|
|
36312
|
+
`[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36313
|
+
);
|
|
36314
|
+
}
|
|
36315
|
+
const result = await this.runCommand(renderedCommand, {
|
|
36316
|
+
cwd: this.config.cwd,
|
|
36317
|
+
env: process.env,
|
|
36318
|
+
timeoutMs: this.config.timeoutMs,
|
|
36319
|
+
signal: controller.signal
|
|
36320
|
+
});
|
|
36321
|
+
if (result.failed || (result.exitCode ?? 0) !== 0) {
|
|
36322
|
+
if (controller.signal.aborted) {
|
|
36323
|
+
throw new Error("CLI provider request was aborted");
|
|
36324
|
+
}
|
|
36325
|
+
if (result.timedOut) {
|
|
36326
|
+
throw new Error(
|
|
36327
|
+
`CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
|
|
36328
|
+
);
|
|
36329
|
+
}
|
|
36330
|
+
const codeText = result.exitCode !== null ? result.exitCode : "unknown";
|
|
36331
|
+
const detail = result.stderr.trim() || result.stdout.trim();
|
|
36332
|
+
const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
|
|
36333
|
+
throw new Error(message);
|
|
36334
|
+
}
|
|
36335
|
+
const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
|
|
36336
|
+
const recordsById = this.parseJsonlBatchOutput(responseContent);
|
|
36337
|
+
const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
|
|
36338
|
+
const missingIds = requestedIds.filter((id) => !recordsById.has(id));
|
|
36339
|
+
if (missingIds.length > 0) {
|
|
36340
|
+
throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
|
|
36341
|
+
}
|
|
36342
|
+
const responses = requests.map((request) => {
|
|
36343
|
+
const evalCaseId = request.evalCaseId;
|
|
36344
|
+
if (!evalCaseId) {
|
|
36345
|
+
return {
|
|
36346
|
+
text: "",
|
|
36347
|
+
raw: {
|
|
36348
|
+
command: renderedCommand,
|
|
36349
|
+
stderr: result.stderr,
|
|
36350
|
+
exitCode: result.exitCode ?? 0,
|
|
36351
|
+
cwd: this.config.cwd,
|
|
36352
|
+
outputFile: outputFilePath
|
|
36353
|
+
}
|
|
36354
|
+
};
|
|
36355
|
+
}
|
|
36356
|
+
const parsed = recordsById.get(evalCaseId);
|
|
36357
|
+
if (!parsed) {
|
|
36358
|
+
return {
|
|
36359
|
+
text: "",
|
|
36360
|
+
raw: {
|
|
36361
|
+
command: renderedCommand,
|
|
36362
|
+
stderr: result.stderr,
|
|
36363
|
+
exitCode: result.exitCode ?? 0,
|
|
36364
|
+
cwd: this.config.cwd,
|
|
36365
|
+
outputFile: outputFilePath
|
|
36366
|
+
}
|
|
36367
|
+
};
|
|
36368
|
+
}
|
|
36369
|
+
return {
|
|
36370
|
+
text: parsed.text,
|
|
36371
|
+
trace: parsed.trace,
|
|
36372
|
+
traceRef: parsed.traceRef,
|
|
36373
|
+
raw: {
|
|
36374
|
+
command: renderedCommand,
|
|
36375
|
+
stderr: result.stderr,
|
|
36376
|
+
exitCode: result.exitCode ?? 0,
|
|
36377
|
+
cwd: this.config.cwd,
|
|
36378
|
+
outputFile: outputFilePath,
|
|
36379
|
+
recordId: evalCaseId
|
|
36380
|
+
}
|
|
36381
|
+
};
|
|
36382
|
+
});
|
|
36383
|
+
return responses;
|
|
36384
|
+
}
|
|
36295
36385
|
/**
|
|
36296
36386
|
* Parse output content from CLI.
|
|
36297
36387
|
* If the content is valid JSON with a 'text' field, extract text and optional trace.
|
|
@@ -36317,6 +36407,38 @@ var CliProvider = class {
|
|
|
36317
36407
|
const validEvents = trace2.filter(isTraceEvent);
|
|
36318
36408
|
return validEvents.length > 0 ? validEvents : void 0;
|
|
36319
36409
|
}
|
|
36410
|
+
parseJsonlBatchOutput(content) {
|
|
36411
|
+
const records = /* @__PURE__ */ new Map();
|
|
36412
|
+
const lines = content.split(/\r?\n/).map((line2) => line2.trim()).filter((line2) => line2.length > 0);
|
|
36413
|
+
for (const line2 of lines) {
|
|
36414
|
+
let parsed;
|
|
36415
|
+
try {
|
|
36416
|
+
parsed = JSON.parse(line2);
|
|
36417
|
+
} catch (error40) {
|
|
36418
|
+
const reason = error40 instanceof Error ? error40.message : String(error40);
|
|
36419
|
+
throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
|
|
36420
|
+
}
|
|
36421
|
+
if (typeof parsed !== "object" || parsed === null) {
|
|
36422
|
+
throw new Error("CLI batch output JSONL line must be an object");
|
|
36423
|
+
}
|
|
36424
|
+
const obj = parsed;
|
|
36425
|
+
const id = typeof obj.id === "string" ? obj.id : void 0;
|
|
36426
|
+
if (!id || id.trim().length === 0) {
|
|
36427
|
+
throw new Error("CLI batch output JSONL line missing required string field: id");
|
|
36428
|
+
}
|
|
36429
|
+
if (records.has(id)) {
|
|
36430
|
+
throw new Error(`CLI batch output contains duplicate id: ${id}`);
|
|
36431
|
+
}
|
|
36432
|
+
const text2 = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
|
|
36433
|
+
const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
|
|
36434
|
+
records.set(id, {
|
|
36435
|
+
text: text2,
|
|
36436
|
+
trace: this.parseTrace(obj.trace),
|
|
36437
|
+
traceRef
|
|
36438
|
+
});
|
|
36439
|
+
}
|
|
36440
|
+
return records;
|
|
36441
|
+
}
|
|
36320
36442
|
async readAndCleanupOutputFile(filePath) {
|
|
36321
36443
|
try {
|
|
36322
36444
|
const content = await readTextFile(filePath);
|
|
@@ -36378,7 +36500,7 @@ var CliProvider = class {
|
|
|
36378
36500
|
);
|
|
36379
36501
|
if (this.verbose) {
|
|
36380
36502
|
console.log(
|
|
36381
|
-
`[cli-provider:${this.targetName}] (healthcheck)
|
|
36503
|
+
`[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
|
|
36382
36504
|
);
|
|
36383
36505
|
}
|
|
36384
36506
|
const result = await this.runCommand(renderedCommand, {
|
|
@@ -36446,11 +36568,11 @@ function shellEscape(value) {
|
|
|
36446
36568
|
}
|
|
36447
36569
|
return `'${value.replace(/'/g, `'"'"'`)}'`;
|
|
36448
36570
|
}
|
|
36449
|
-
function generateOutputFilePath(evalCaseId) {
|
|
36571
|
+
function generateOutputFilePath(evalCaseId, extension = ".json") {
|
|
36450
36572
|
const safeEvalId = evalCaseId || "unknown";
|
|
36451
36573
|
const timestamp = Date.now();
|
|
36452
36574
|
const random = Math.random().toString(36).substring(2, 9);
|
|
36453
|
-
return path72.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
|
|
36575
|
+
return path72.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
|
|
36454
36576
|
}
|
|
36455
36577
|
function formatTimeoutSuffix(timeoutMs) {
|
|
36456
36578
|
if (!timeoutMs || timeoutMs <= 0) {
|
|
@@ -37669,7 +37791,7 @@ function createProvider(target) {
|
|
|
37669
37791
|
}
|
|
37670
37792
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
37671
37793
|
|
|
37672
|
-
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
37794
|
+
Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
37673
37795
|
|
|
37674
37796
|
Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
|
|
37675
37797
|
|
|
@@ -37727,7 +37849,7 @@ var LlmJudgeEvaluator = class {
|
|
|
37727
37849
|
const variables = {
|
|
37728
37850
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
37729
37851
|
[TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
|
|
37730
|
-
context.evalCase.expected_segments,
|
|
37852
|
+
context.evalCase.expected_messages,
|
|
37731
37853
|
null,
|
|
37732
37854
|
2
|
|
37733
37855
|
),
|
|
@@ -37940,13 +38062,16 @@ var CodeEvaluator = class {
|
|
|
37940
38062
|
{
|
|
37941
38063
|
question: context.evalCase.question,
|
|
37942
38064
|
expected_outcome: context.evalCase.expected_outcome,
|
|
38065
|
+
expected_messages: context.evalCase.expected_messages,
|
|
37943
38066
|
reference_answer: context.evalCase.reference_answer,
|
|
37944
38067
|
candidate_answer: context.candidate,
|
|
37945
38068
|
guideline_files: context.evalCase.guideline_paths,
|
|
37946
38069
|
input_files: context.evalCase.file_paths.filter(
|
|
37947
38070
|
(path132) => !context.evalCase.guideline_paths.includes(path132)
|
|
37948
38071
|
),
|
|
37949
|
-
input_messages: context.evalCase.input_messages
|
|
38072
|
+
input_messages: context.evalCase.input_messages,
|
|
38073
|
+
candidate_trace_file: context.candidateTraceRef ?? null,
|
|
38074
|
+
candidate_trace_summary: context.candidateTraceSummary ?? null
|
|
37950
38075
|
},
|
|
37951
38076
|
null,
|
|
37952
38077
|
2
|
|
@@ -38212,105 +38337,6 @@ var ToolTrajectoryEvaluator = class {
|
|
|
38212
38337
|
};
|
|
38213
38338
|
}
|
|
38214
38339
|
};
|
|
38215
|
-
var ExpectedToolCallsEvaluator = class {
|
|
38216
|
-
kind = "expected_tool_calls";
|
|
38217
|
-
evaluate(context) {
|
|
38218
|
-
const { candidateTrace, evalCase } = context;
|
|
38219
|
-
const expectedSegments = evalCase.expected_segments;
|
|
38220
|
-
const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
|
|
38221
|
-
if (expectedToolCalls.length === 0) {
|
|
38222
|
-
return {
|
|
38223
|
-
score: 1,
|
|
38224
|
-
verdict: "pass",
|
|
38225
|
-
hits: ["No tool_calls specified in expected_messages"],
|
|
38226
|
-
misses: [],
|
|
38227
|
-
expectedAspectCount: 1
|
|
38228
|
-
};
|
|
38229
|
-
}
|
|
38230
|
-
if (!candidateTrace || candidateTrace.length === 0) {
|
|
38231
|
-
return {
|
|
38232
|
-
score: 0,
|
|
38233
|
-
verdict: "fail",
|
|
38234
|
-
hits: [],
|
|
38235
|
-
misses: ["No trace available to validate tool_calls"],
|
|
38236
|
-
expectedAspectCount: expectedToolCalls.length
|
|
38237
|
-
};
|
|
38238
|
-
}
|
|
38239
|
-
const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
|
|
38240
|
-
return this.validateToolCalls(expectedToolCalls, actualToolCalls);
|
|
38241
|
-
}
|
|
38242
|
-
extractExpectedToolCalls(segments) {
|
|
38243
|
-
if (!segments) {
|
|
38244
|
-
return [];
|
|
38245
|
-
}
|
|
38246
|
-
const toolCalls = [];
|
|
38247
|
-
for (const segment of segments) {
|
|
38248
|
-
const role = segment.role;
|
|
38249
|
-
const segmentToolCalls = segment.tool_calls;
|
|
38250
|
-
if (role === "assistant" && Array.isArray(segmentToolCalls)) {
|
|
38251
|
-
for (const tc of segmentToolCalls) {
|
|
38252
|
-
if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
|
|
38253
|
-
const toolCall = tc;
|
|
38254
|
-
toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
|
|
38255
|
-
}
|
|
38256
|
-
}
|
|
38257
|
-
}
|
|
38258
|
-
}
|
|
38259
|
-
return toolCalls;
|
|
38260
|
-
}
|
|
38261
|
-
validateToolCalls(expected, actual) {
|
|
38262
|
-
const hits = [];
|
|
38263
|
-
const misses = [];
|
|
38264
|
-
for (let i = 0; i < expected.length; i++) {
|
|
38265
|
-
const expectedCall = expected[i];
|
|
38266
|
-
const actualCall = actual[i];
|
|
38267
|
-
if (!actualCall) {
|
|
38268
|
-
misses.push(
|
|
38269
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
|
|
38270
|
-
);
|
|
38271
|
-
continue;
|
|
38272
|
-
}
|
|
38273
|
-
if (actualCall.name !== expectedCall.tool) {
|
|
38274
|
-
misses.push(
|
|
38275
|
-
`tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
|
|
38276
|
-
);
|
|
38277
|
-
continue;
|
|
38278
|
-
}
|
|
38279
|
-
if (expectedCall.input !== void 0) {
|
|
38280
|
-
if (!this.deepEquals(expectedCall.input, actualCall.input)) {
|
|
38281
|
-
misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
|
|
38282
|
-
continue;
|
|
38283
|
-
}
|
|
38284
|
-
}
|
|
38285
|
-
hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
|
|
38286
|
-
}
|
|
38287
|
-
const totalChecks = expected.length || 1;
|
|
38288
|
-
const score = hits.length / totalChecks;
|
|
38289
|
-
return {
|
|
38290
|
-
score,
|
|
38291
|
-
verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
|
|
38292
|
-
hits,
|
|
38293
|
-
misses,
|
|
38294
|
-
expectedAspectCount: totalChecks
|
|
38295
|
-
};
|
|
38296
|
-
}
|
|
38297
|
-
deepEquals(a, b) {
|
|
38298
|
-
if (a === b) return true;
|
|
38299
|
-
if (typeof a !== typeof b) return false;
|
|
38300
|
-
if (typeof a !== "object" || a === null || b === null) return false;
|
|
38301
|
-
if (Array.isArray(a) && Array.isArray(b)) {
|
|
38302
|
-
if (a.length !== b.length) return false;
|
|
38303
|
-
return a.every((val, i) => this.deepEquals(val, b[i]));
|
|
38304
|
-
}
|
|
38305
|
-
if (Array.isArray(a) || Array.isArray(b)) return false;
|
|
38306
|
-
const aObj = a;
|
|
38307
|
-
const bObj = b;
|
|
38308
|
-
const aKeys = Object.keys(aObj);
|
|
38309
|
-
const bKeys = Object.keys(bObj);
|
|
38310
|
-
if (aKeys.length !== bKeys.length) return false;
|
|
38311
|
-
return aKeys.every((key2) => this.deepEquals(aObj[key2], bObj[key2]));
|
|
38312
|
-
}
|
|
38313
|
-
};
|
|
38314
38340
|
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
|
|
38315
38341
|
{{EVALUATOR_RESULTS_JSON}}
|
|
38316
38342
|
|
|
@@ -39061,6 +39087,7 @@ async function runEvalCase(options) {
|
|
|
39061
39087
|
judgeProvider,
|
|
39062
39088
|
agentTimeoutMs,
|
|
39063
39089
|
candidateTrace,
|
|
39090
|
+
candidateTraceRef: providerResponse.traceRef,
|
|
39064
39091
|
candidateTraceSummary
|
|
39065
39092
|
});
|
|
39066
39093
|
} catch (error40) {
|
|
@@ -39080,6 +39107,7 @@ async function evaluateCandidate(options) {
|
|
|
39080
39107
|
judgeProvider,
|
|
39081
39108
|
agentTimeoutMs,
|
|
39082
39109
|
candidateTrace,
|
|
39110
|
+
candidateTraceRef,
|
|
39083
39111
|
candidateTraceSummary
|
|
39084
39112
|
} = options;
|
|
39085
39113
|
const gradeTimestamp = nowFn();
|
|
@@ -39095,6 +39123,7 @@ async function evaluateCandidate(options) {
|
|
|
39095
39123
|
judgeProvider,
|
|
39096
39124
|
agentTimeoutMs,
|
|
39097
39125
|
candidateTrace,
|
|
39126
|
+
candidateTraceRef,
|
|
39098
39127
|
candidateTraceSummary
|
|
39099
39128
|
});
|
|
39100
39129
|
const completedAt = nowFn();
|
|
@@ -39149,6 +39178,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
39149
39178
|
judgeProvider,
|
|
39150
39179
|
agentTimeoutMs,
|
|
39151
39180
|
candidateTrace,
|
|
39181
|
+
candidateTraceRef,
|
|
39152
39182
|
candidateTraceSummary
|
|
39153
39183
|
} = options;
|
|
39154
39184
|
if (evalCase.evaluators && evalCase.evaluators.length > 0) {
|
|
@@ -39165,6 +39195,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
39165
39195
|
judgeProvider,
|
|
39166
39196
|
agentTimeoutMs,
|
|
39167
39197
|
candidateTrace,
|
|
39198
|
+
candidateTraceRef,
|
|
39168
39199
|
candidateTraceSummary
|
|
39169
39200
|
});
|
|
39170
39201
|
}
|
|
@@ -39183,6 +39214,7 @@ async function runEvaluatorsForCase(options) {
|
|
|
39183
39214
|
now,
|
|
39184
39215
|
judgeProvider,
|
|
39185
39216
|
candidateTrace,
|
|
39217
|
+
candidateTraceRef,
|
|
39186
39218
|
candidateTraceSummary
|
|
39187
39219
|
});
|
|
39188
39220
|
return { score };
|
|
@@ -39201,6 +39233,7 @@ async function runEvaluatorList(options) {
|
|
|
39201
39233
|
judgeProvider,
|
|
39202
39234
|
agentTimeoutMs,
|
|
39203
39235
|
candidateTrace,
|
|
39236
|
+
candidateTraceRef,
|
|
39204
39237
|
candidateTraceSummary
|
|
39205
39238
|
} = options;
|
|
39206
39239
|
const scored = [];
|
|
@@ -39247,7 +39280,9 @@ async function runEvaluatorList(options) {
|
|
|
39247
39280
|
provider,
|
|
39248
39281
|
attempt,
|
|
39249
39282
|
promptInputs,
|
|
39250
|
-
now
|
|
39283
|
+
now,
|
|
39284
|
+
candidateTraceRef,
|
|
39285
|
+
candidateTraceSummary
|
|
39251
39286
|
});
|
|
39252
39287
|
const weight = evaluator.weight ?? 1;
|
|
39253
39288
|
scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
|
|
@@ -39285,8 +39320,6 @@ async function runEvaluatorList(options) {
|
|
|
39285
39320
|
return new ToolTrajectoryEvaluator({
|
|
39286
39321
|
config: memberConfig
|
|
39287
39322
|
});
|
|
39288
|
-
case "expected_tool_calls":
|
|
39289
|
-
return new ExpectedToolCallsEvaluator();
|
|
39290
39323
|
default: {
|
|
39291
39324
|
const unknownConfig = memberConfig;
|
|
39292
39325
|
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
@@ -39336,32 +39369,7 @@ async function runEvaluatorList(options) {
|
|
|
39336
39369
|
promptInputs,
|
|
39337
39370
|
now,
|
|
39338
39371
|
candidateTrace,
|
|
39339
|
-
|
|
39340
|
-
});
|
|
39341
|
-
const weight = evaluator.weight ?? 1;
|
|
39342
|
-
scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
|
|
39343
|
-
evaluatorResults.push({
|
|
39344
|
-
name: evaluator.name,
|
|
39345
|
-
type: evaluator.type,
|
|
39346
|
-
score: score2.score,
|
|
39347
|
-
weight,
|
|
39348
|
-
verdict: score2.verdict,
|
|
39349
|
-
hits: score2.hits,
|
|
39350
|
-
misses: score2.misses,
|
|
39351
|
-
reasoning: score2.reasoning
|
|
39352
|
-
});
|
|
39353
|
-
}
|
|
39354
|
-
if (evaluator.type === "expected_tool_calls") {
|
|
39355
|
-
const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
|
|
39356
|
-
const score2 = expectedToolCallsEvaluator.evaluate({
|
|
39357
|
-
evalCase,
|
|
39358
|
-
candidate,
|
|
39359
|
-
target,
|
|
39360
|
-
provider,
|
|
39361
|
-
attempt,
|
|
39362
|
-
promptInputs,
|
|
39363
|
-
now,
|
|
39364
|
-
candidateTrace,
|
|
39372
|
+
candidateTraceRef,
|
|
39365
39373
|
candidateTraceSummary
|
|
39366
39374
|
});
|
|
39367
39375
|
const weight = evaluator.weight ?? 1;
|
|
@@ -40123,61 +40131,56 @@ function getDefaultExtension(format) {
|
|
|
40123
40131
|
}
|
|
40124
40132
|
|
|
40125
40133
|
// src/commands/eval/progress-display.ts
|
|
40126
|
-
import { stripVTControlCharacters } from "node:util";
|
|
40127
|
-
var ESC = "\x1B[";
|
|
40128
|
-
var CLEAR_LINE = `${ESC}K`;
|
|
40129
|
-
var MOVE_CURSOR_UP = `${ESC}1A`;
|
|
40130
40134
|
var ProgressDisplay = class {
|
|
40131
40135
|
workers = /* @__PURE__ */ new Map();
|
|
40132
|
-
maxWorkers;
|
|
40133
40136
|
totalTests = 0;
|
|
40134
40137
|
completedTests = 0;
|
|
40135
|
-
renderTimer;
|
|
40136
|
-
renderScheduled = false;
|
|
40137
|
-
isInteractive;
|
|
40138
40138
|
logPaths = [];
|
|
40139
40139
|
logPathSet = /* @__PURE__ */ new Set();
|
|
40140
40140
|
hasPrintedLogHeader = false;
|
|
40141
|
-
windowHeight = 0;
|
|
40142
40141
|
started = false;
|
|
40143
40142
|
finished = false;
|
|
40144
|
-
|
|
40145
|
-
|
|
40146
|
-
this.
|
|
40143
|
+
verbose;
|
|
40144
|
+
constructor(_maxWorkers, options) {
|
|
40145
|
+
this.verbose = options?.verbose ?? false;
|
|
40147
40146
|
}
|
|
40148
40147
|
isInteractiveMode() {
|
|
40149
|
-
return this.isInteractive;
|
|
40148
|
+
return false;
|
|
40150
40149
|
}
|
|
40151
40150
|
start() {
|
|
40152
40151
|
this.started = true;
|
|
40153
40152
|
this.finished = false;
|
|
40154
|
-
if (this.isInteractive) {
|
|
40155
|
-
this.write("\n");
|
|
40156
|
-
this.renderTimer = setInterval(() => {
|
|
40157
|
-
this.scheduleRender();
|
|
40158
|
-
}, 1e3);
|
|
40159
|
-
this.renderTimer.unref?.();
|
|
40160
|
-
}
|
|
40161
40153
|
}
|
|
40162
40154
|
setTotalTests(count) {
|
|
40163
40155
|
this.totalTests = count;
|
|
40164
40156
|
}
|
|
40165
40157
|
updateWorker(progress) {
|
|
40158
|
+
const previous = this.workers.get(progress.workerId);
|
|
40166
40159
|
this.workers.set(progress.workerId, progress);
|
|
40167
40160
|
if (progress.status === "completed" || progress.status === "failed") {
|
|
40168
40161
|
this.completedTests++;
|
|
40169
40162
|
}
|
|
40170
|
-
|
|
40171
|
-
|
|
40172
|
-
|
|
40173
|
-
|
|
40174
|
-
|
|
40175
|
-
|
|
40176
|
-
|
|
40163
|
+
const targetSuffix = progress.targetLabel ? ` | ${progress.targetLabel}` : "";
|
|
40164
|
+
const countPrefix = `${this.completedTests}/${this.totalTests}`;
|
|
40165
|
+
switch (progress.status) {
|
|
40166
|
+
case "pending":
|
|
40167
|
+
if (this.verbose && !previous) {
|
|
40168
|
+
console.log(`${countPrefix} \u23F3 ${progress.evalId}${targetSuffix}`);
|
|
40169
|
+
}
|
|
40170
|
+
break;
|
|
40171
|
+
case "running":
|
|
40172
|
+
if (!previous || previous.status === "pending") {
|
|
40173
|
+
console.log(`${countPrefix} \u{1F504} ${progress.evalId}${targetSuffix}`);
|
|
40174
|
+
}
|
|
40175
|
+
break;
|
|
40176
|
+
case "completed":
|
|
40177
|
+
console.log(`${countPrefix} \u2705 ${progress.evalId}${targetSuffix}`);
|
|
40178
|
+
break;
|
|
40179
|
+
case "failed":
|
|
40177
40180
|
console.log(
|
|
40178
|
-
|
|
40181
|
+
`${countPrefix} \u274C ${progress.evalId}${targetSuffix}${progress.error ? `: ${progress.error}` : ""}`
|
|
40179
40182
|
);
|
|
40180
|
-
|
|
40183
|
+
break;
|
|
40181
40184
|
}
|
|
40182
40185
|
}
|
|
40183
40186
|
addLogPaths(paths) {
|
|
@@ -40193,10 +40196,6 @@ var ProgressDisplay = class {
|
|
|
40193
40196
|
return;
|
|
40194
40197
|
}
|
|
40195
40198
|
this.logPaths.push(...newPaths);
|
|
40196
|
-
if (this.isInteractive) {
|
|
40197
|
-
this.scheduleRender();
|
|
40198
|
-
return;
|
|
40199
|
-
}
|
|
40200
40199
|
if (!this.hasPrintedLogHeader) {
|
|
40201
40200
|
console.log("");
|
|
40202
40201
|
console.log("Codex CLI logs:");
|
|
@@ -40207,112 +40206,11 @@ var ProgressDisplay = class {
|
|
|
40207
40206
|
console.log(`${startIndex + offset + 1}. ${path27}`);
|
|
40208
40207
|
});
|
|
40209
40208
|
}
|
|
40210
|
-
scheduleRender() {
|
|
40211
|
-
if (this.renderScheduled || this.finished) {
|
|
40212
|
-
return;
|
|
40213
|
-
}
|
|
40214
|
-
this.renderScheduled = true;
|
|
40215
|
-
setTimeout(() => {
|
|
40216
|
-
this.renderScheduled = false;
|
|
40217
|
-
this.render();
|
|
40218
|
-
}, 100);
|
|
40219
|
-
}
|
|
40220
|
-
write(content) {
|
|
40221
|
-
process.stdout.write(content);
|
|
40222
|
-
}
|
|
40223
|
-
clearWindow() {
|
|
40224
|
-
if (this.windowHeight === 0) {
|
|
40225
|
-
return;
|
|
40226
|
-
}
|
|
40227
|
-
this.write(`\r${CLEAR_LINE}`);
|
|
40228
|
-
for (let i = 1; i < this.windowHeight; i++) {
|
|
40229
|
-
this.write(`${MOVE_CURSOR_UP}\r${CLEAR_LINE}`);
|
|
40230
|
-
}
|
|
40231
|
-
this.windowHeight = 0;
|
|
40232
|
-
}
|
|
40233
|
-
getRenderedRowCount(rows) {
|
|
40234
|
-
const columns = process.stdout.columns || 80;
|
|
40235
|
-
let count = 0;
|
|
40236
|
-
for (const row of rows) {
|
|
40237
|
-
const text2 = stripVTControlCharacters(row);
|
|
40238
|
-
count += Math.max(1, Math.ceil(text2.length / columns));
|
|
40239
|
-
}
|
|
40240
|
-
return count;
|
|
40241
|
-
}
|
|
40242
|
-
render() {
|
|
40243
|
-
if (!this.isInteractive || !this.started || this.finished) {
|
|
40244
|
-
return;
|
|
40245
|
-
}
|
|
40246
|
-
const lines = [];
|
|
40247
|
-
const sortedWorkers = Array.from(this.workers.values()).sort((a, b) => a.workerId - b.workerId);
|
|
40248
|
-
for (const worker of sortedWorkers) {
|
|
40249
|
-
const line2 = this.formatWorkerLine(worker);
|
|
40250
|
-
lines.push(line2);
|
|
40251
|
-
}
|
|
40252
|
-
if (this.logPaths.length > 0) {
|
|
40253
|
-
lines.push("");
|
|
40254
|
-
lines.push("Codex CLI logs:");
|
|
40255
|
-
this.logPaths.forEach((path27, index) => {
|
|
40256
|
-
lines.push(`${index + 1}. ${path27}`);
|
|
40257
|
-
});
|
|
40258
|
-
}
|
|
40259
|
-
const rowCount = this.getRenderedRowCount(lines);
|
|
40260
|
-
this.clearWindow();
|
|
40261
|
-
if (lines.length > 0) {
|
|
40262
|
-
this.write(lines.join("\n"));
|
|
40263
|
-
}
|
|
40264
|
-
this.windowHeight = rowCount;
|
|
40265
|
-
}
|
|
40266
|
-
formatWorkerLine(worker) {
|
|
40267
|
-
const workerLabel = `${worker.workerId}.`.padEnd(4);
|
|
40268
|
-
const statusIcon = this.getStatusIcon(worker.status);
|
|
40269
|
-
const targetLabel = worker.targetLabel ? ` | ${worker.targetLabel}` : "";
|
|
40270
|
-
const columns = process.stdout.columns || 80;
|
|
40271
|
-
const maxLineLength = Math.max(40, columns - 4);
|
|
40272
|
-
const reservedLength = workerLabel.length + statusIcon.length + targetLabel.length + 4;
|
|
40273
|
-
const availableLabelLength = Math.max(15, maxLineLength - reservedLength);
|
|
40274
|
-
let testLabel = worker.evalId;
|
|
40275
|
-
if (testLabel.length > availableLabelLength) {
|
|
40276
|
-
testLabel = `${testLabel.substring(0, Math.max(0, availableLabelLength - 3))}...`;
|
|
40277
|
-
}
|
|
40278
|
-
return `${workerLabel} ${statusIcon} ${testLabel}${targetLabel}`;
|
|
40279
|
-
}
|
|
40280
|
-
getStatusIcon(status) {
|
|
40281
|
-
switch (status) {
|
|
40282
|
-
case "pending":
|
|
40283
|
-
return "\u23F3";
|
|
40284
|
-
case "running":
|
|
40285
|
-
return "\u{1F504}";
|
|
40286
|
-
case "completed":
|
|
40287
|
-
return "\u2705";
|
|
40288
|
-
case "failed":
|
|
40289
|
-
return "\u274C";
|
|
40290
|
-
default:
|
|
40291
|
-
return " ";
|
|
40292
|
-
}
|
|
40293
|
-
}
|
|
40294
40209
|
finish() {
|
|
40295
|
-
if (this.renderTimer) {
|
|
40296
|
-
clearInterval(this.renderTimer);
|
|
40297
|
-
this.renderTimer = void 0;
|
|
40298
|
-
}
|
|
40299
40210
|
this.finished = true;
|
|
40300
|
-
|
|
40301
|
-
this.clearWindow();
|
|
40302
|
-
const sortedWorkers = Array.from(this.workers.values()).sort(
|
|
40303
|
-
(a, b) => a.workerId - b.workerId
|
|
40304
|
-
);
|
|
40305
|
-
for (const worker of sortedWorkers) {
|
|
40306
|
-
this.write(`${this.formatWorkerLine(worker)}
|
|
40307
|
-
`);
|
|
40308
|
-
}
|
|
40309
|
-
this.write("\n");
|
|
40310
|
-
}
|
|
40211
|
+
console.log("");
|
|
40311
40212
|
}
|
|
40312
40213
|
clear() {
|
|
40313
|
-
if (this.isInteractive) {
|
|
40314
|
-
this.clearWindow();
|
|
40315
|
-
}
|
|
40316
40214
|
}
|
|
40317
40215
|
};
|
|
40318
40216
|
|
|
@@ -40649,26 +40547,6 @@ function validateMessages(messages, location, filePath, errors) {
|
|
|
40649
40547
|
message: `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
|
|
40650
40548
|
});
|
|
40651
40549
|
}
|
|
40652
|
-
const toolCalls = message.tool_calls;
|
|
40653
|
-
if (toolCalls !== void 0) {
|
|
40654
|
-
if (role !== "assistant") {
|
|
40655
|
-
errors.push({
|
|
40656
|
-
severity: "error",
|
|
40657
|
-
filePath,
|
|
40658
|
-
location: `${msgLocation}.tool_calls`,
|
|
40659
|
-
message: "tool_calls can only be specified on assistant messages"
|
|
40660
|
-
});
|
|
40661
|
-
} else if (!Array.isArray(toolCalls)) {
|
|
40662
|
-
errors.push({
|
|
40663
|
-
severity: "error",
|
|
40664
|
-
filePath,
|
|
40665
|
-
location: `${msgLocation}.tool_calls`,
|
|
40666
|
-
message: "tool_calls must be an array"
|
|
40667
|
-
});
|
|
40668
|
-
} else {
|
|
40669
|
-
validateToolCalls(toolCalls, `${msgLocation}.tool_calls`, filePath, errors);
|
|
40670
|
-
}
|
|
40671
|
-
}
|
|
40672
40550
|
const content = message.content;
|
|
40673
40551
|
if (typeof content === "string") {
|
|
40674
40552
|
validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
|
|
@@ -40733,30 +40611,6 @@ function validateContentForRoleMarkers(content, location, filePath, errors) {
|
|
|
40733
40611
|
}
|
|
40734
40612
|
}
|
|
40735
40613
|
}
|
|
40736
|
-
function validateToolCalls(toolCalls, location, filePath, errors) {
|
|
40737
|
-
for (let i = 0; i < toolCalls.length; i++) {
|
|
40738
|
-
const toolCall = toolCalls[i];
|
|
40739
|
-
const callLocation = `${location}[${i}]`;
|
|
40740
|
-
if (!isObject2(toolCall)) {
|
|
40741
|
-
errors.push({
|
|
40742
|
-
severity: "error",
|
|
40743
|
-
filePath,
|
|
40744
|
-
location: callLocation,
|
|
40745
|
-
message: "Tool call must be an object"
|
|
40746
|
-
});
|
|
40747
|
-
continue;
|
|
40748
|
-
}
|
|
40749
|
-
const tool2 = toolCall.tool;
|
|
40750
|
-
if (typeof tool2 !== "string" || tool2.trim().length === 0) {
|
|
40751
|
-
errors.push({
|
|
40752
|
-
severity: "error",
|
|
40753
|
-
filePath,
|
|
40754
|
-
location: `${callLocation}.tool`,
|
|
40755
|
-
message: "Missing or invalid 'tool' field (must be a non-empty string)"
|
|
40756
|
-
});
|
|
40757
|
-
}
|
|
40758
|
-
}
|
|
40759
|
-
}
|
|
40760
40614
|
function isObject22(value) {
|
|
40761
40615
|
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
40762
40616
|
}
|
|
@@ -40860,6 +40714,9 @@ var CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
40860
40714
|
...COMMON_SETTINGS,
|
|
40861
40715
|
"command_template",
|
|
40862
40716
|
"commandTemplate",
|
|
40717
|
+
"verbose",
|
|
40718
|
+
"cli_verbose",
|
|
40719
|
+
"cliVerbose",
|
|
40863
40720
|
"files_format",
|
|
40864
40721
|
"filesFormat",
|
|
40865
40722
|
"attachments_format",
|
|
@@ -40993,6 +40850,15 @@ async function validateTargetsFile(filePath) {
|
|
|
40993
40850
|
if (healthcheck !== void 0) {
|
|
40994
40851
|
validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
|
|
40995
40852
|
}
|
|
40853
|
+
const verbose = target.verbose ?? target.cli_verbose ?? target.cliVerbose;
|
|
40854
|
+
if (verbose !== void 0 && typeof verbose !== "boolean") {
|
|
40855
|
+
errors2.push({
|
|
40856
|
+
severity: "error",
|
|
40857
|
+
filePath: absolutePath2,
|
|
40858
|
+
location: `${location}.verbose`,
|
|
40859
|
+
message: "'verbose' must be a boolean when provided"
|
|
40860
|
+
});
|
|
40861
|
+
}
|
|
40996
40862
|
}
|
|
40997
40863
|
function validateCliHealthcheck(healthcheck, absolutePath2, location, errors2) {
|
|
40998
40864
|
if (!isObject22(healthcheck)) {
|
|
@@ -41636,8 +41502,8 @@ function createEvaluationCache() {
|
|
|
41636
41502
|
}
|
|
41637
41503
|
};
|
|
41638
41504
|
}
|
|
41639
|
-
function createProgressReporter(maxWorkers) {
|
|
41640
|
-
const display = new ProgressDisplay(maxWorkers);
|
|
41505
|
+
function createProgressReporter(maxWorkers, options) {
|
|
41506
|
+
const display = new ProgressDisplay(maxWorkers, options);
|
|
41641
41507
|
return {
|
|
41642
41508
|
isInteractive: display.isInteractiveMode(),
|
|
41643
41509
|
start: () => display.start(),
|
|
@@ -41665,6 +41531,22 @@ function createDisplayIdTracker() {
|
|
|
41665
41531
|
}
|
|
41666
41532
|
};
|
|
41667
41533
|
}
|
|
41534
|
+
function applyVerboseOverride(selection, cliVerbose) {
|
|
41535
|
+
const { resolvedTarget } = selection;
|
|
41536
|
+
if (resolvedTarget.kind !== "cli") {
|
|
41537
|
+
return selection;
|
|
41538
|
+
}
|
|
41539
|
+
return {
|
|
41540
|
+
...selection,
|
|
41541
|
+
resolvedTarget: {
|
|
41542
|
+
...resolvedTarget,
|
|
41543
|
+
config: {
|
|
41544
|
+
...resolvedTarget.config,
|
|
41545
|
+
verbose: cliVerbose
|
|
41546
|
+
}
|
|
41547
|
+
}
|
|
41548
|
+
};
|
|
41549
|
+
}
|
|
41668
41550
|
async function prepareFileMetadata(params) {
|
|
41669
41551
|
const { testFilePath, repoRoot, cwd, options } = params;
|
|
41670
41552
|
await ensureFileExists(testFilePath, "Test file");
|
|
@@ -41724,7 +41606,7 @@ async function runSingleEvalFile(params) {
|
|
|
41724
41606
|
evalCases
|
|
41725
41607
|
} = params;
|
|
41726
41608
|
await ensureFileExists(testFilePath, "Test file");
|
|
41727
|
-
const resolvedTargetSelection = selection;
|
|
41609
|
+
const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
|
|
41728
41610
|
const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
|
|
41729
41611
|
const targetMessage = options.verbose ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} [provider=${providerLabel}] via ${resolvedTargetSelection.targetsFilePath}` : `Using target: ${inlineTargetLabel}`;
|
|
41730
41612
|
if (!progressReporter.isInteractive || options.verbose) {
|
|
@@ -41837,7 +41719,7 @@ async function runEvalCommand(input) {
|
|
|
41837
41719
|
if (totalEvalCount === 0) {
|
|
41838
41720
|
throw new Error("No eval cases matched the provided filters.");
|
|
41839
41721
|
}
|
|
41840
|
-
const progressReporter = createProgressReporter(totalWorkers);
|
|
41722
|
+
const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
|
|
41841
41723
|
progressReporter.start();
|
|
41842
41724
|
progressReporter.setTotal(totalEvalCount);
|
|
41843
41725
|
const seenCodexLogPaths = /* @__PURE__ */ new Set();
|
|
@@ -42708,4 +42590,4 @@ export {
|
|
|
42708
42590
|
app,
|
|
42709
42591
|
runCli
|
|
42710
42592
|
};
|
|
42711
|
-
//# sourceMappingURL=chunk-RIJO5WBF.js.map
|
|
42593
|
+
//# sourceMappingURL=chunk-6R2YRXCQ.js.map
|