@agentv/core 4.4.1 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +443 -122
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +104 -1
- package/dist/index.d.ts +104 -1
- package/dist/index.js +415 -103
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -1491,6 +1491,7 @@ __export(index_exports, {
|
|
|
1491
1491
|
deriveProjectId: () => deriveProjectId,
|
|
1492
1492
|
detectFormat: () => detectFormat,
|
|
1493
1493
|
discoverAssertions: () => discoverAssertions,
|
|
1494
|
+
discoverClaudeSessions: () => discoverClaudeSessions,
|
|
1494
1495
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
1495
1496
|
discoverGraders: () => discoverGraders,
|
|
1496
1497
|
discoverJudges: () => discoverGraders,
|
|
@@ -1550,12 +1551,14 @@ __export(index_exports, {
|
|
|
1550
1551
|
negateScore: () => negateScore,
|
|
1551
1552
|
normalizeLineEndings: () => normalizeLineEndings,
|
|
1552
1553
|
parseAgentSkillsEvals: () => parseAgentSkillsEvals,
|
|
1554
|
+
parseClaudeSession: () => parseClaudeSession,
|
|
1553
1555
|
parseJsonFromText: () => parseJsonFromText,
|
|
1554
1556
|
parseJsonSafe: () => parseJsonSafe,
|
|
1555
1557
|
readJsonFile: () => readJsonFile,
|
|
1556
1558
|
readTargetDefinitions: () => readTargetDefinitions,
|
|
1557
1559
|
readTestSuiteMetadata: () => readTestSuiteMetadata,
|
|
1558
1560
|
readTextFile: () => readTextFile,
|
|
1561
|
+
readTranscriptFile: () => readTranscriptFile,
|
|
1559
1562
|
removeProject: () => removeProject,
|
|
1560
1563
|
resolveAndCreateProvider: () => resolveAndCreateProvider,
|
|
1561
1564
|
resolveFileReference: () => resolveFileReference3,
|
|
@@ -8763,6 +8766,7 @@ var CopilotLogProvider = class {
|
|
|
8763
8766
|
// src/evaluation/providers/copilot-sdk.ts
|
|
8764
8767
|
init_cjs_shims();
|
|
8765
8768
|
var import_node_crypto6 = require("crypto");
|
|
8769
|
+
var import_node_fs8 = require("fs");
|
|
8766
8770
|
var import_promises17 = require("fs/promises");
|
|
8767
8771
|
var import_node_path20 = __toESM(require("path"), 1);
|
|
8768
8772
|
|
|
@@ -8879,6 +8883,7 @@ var CopilotSdkProvider = class {
|
|
|
8879
8883
|
const cwd = this.resolveCwd(request.cwd);
|
|
8880
8884
|
if (cwd) {
|
|
8881
8885
|
sessionOptions.workingDirectory = cwd;
|
|
8886
|
+
sessionOptions.skillDirectories = resolveSkillDirectories(cwd);
|
|
8882
8887
|
}
|
|
8883
8888
|
const systemPrompt = this.config.systemPrompt;
|
|
8884
8889
|
if (systemPrompt) {
|
|
@@ -9110,6 +9115,14 @@ var CopilotSdkProvider = class {
|
|
|
9110
9115
|
}
|
|
9111
9116
|
}
|
|
9112
9117
|
};
|
|
9118
|
+
function resolveSkillDirectories(cwd) {
|
|
9119
|
+
const candidates = [
|
|
9120
|
+
import_node_path20.default.join(cwd, ".claude", "skills"),
|
|
9121
|
+
import_node_path20.default.join(cwd, ".agents", "skills"),
|
|
9122
|
+
import_node_path20.default.join(cwd, ".codex", "skills")
|
|
9123
|
+
];
|
|
9124
|
+
return candidates.filter((dir) => (0, import_node_fs8.existsSync)(dir));
|
|
9125
|
+
}
|
|
9113
9126
|
function summarizeSdkEvent(eventType, data) {
|
|
9114
9127
|
if (!data || typeof data !== "object") {
|
|
9115
9128
|
return eventType;
|
|
@@ -9179,7 +9192,7 @@ var MockProvider = class {
|
|
|
9179
9192
|
init_cjs_shims();
|
|
9180
9193
|
var import_node_child_process4 = require("child_process");
|
|
9181
9194
|
var import_node_crypto7 = require("crypto");
|
|
9182
|
-
var
|
|
9195
|
+
var import_node_fs9 = require("fs");
|
|
9183
9196
|
var import_promises18 = require("fs/promises");
|
|
9184
9197
|
var import_node_os5 = require("os");
|
|
9185
9198
|
var import_node_path21 = __toESM(require("path"), 1);
|
|
@@ -9552,7 +9565,7 @@ var PiStreamLogger = class _PiStreamLogger {
|
|
|
9552
9565
|
constructor(filePath, format) {
|
|
9553
9566
|
this.filePath = filePath;
|
|
9554
9567
|
this.format = format;
|
|
9555
|
-
this.stream = (0,
|
|
9568
|
+
this.stream = (0, import_node_fs9.createWriteStream)(filePath, { flags: "a" });
|
|
9556
9569
|
}
|
|
9557
9570
|
static async create(options) {
|
|
9558
9571
|
const logger = new _PiStreamLogger(options.filePath, options.format);
|
|
@@ -10025,7 +10038,7 @@ async function defaultPiRunner(options) {
|
|
|
10025
10038
|
init_cjs_shims();
|
|
10026
10039
|
var import_node_child_process5 = require("child_process");
|
|
10027
10040
|
var import_node_crypto8 = require("crypto");
|
|
10028
|
-
var
|
|
10041
|
+
var import_node_fs10 = require("fs");
|
|
10029
10042
|
var import_promises19 = require("fs/promises");
|
|
10030
10043
|
var import_node_path22 = __toESM(require("path"), 1);
|
|
10031
10044
|
var import_node_readline = require("readline");
|
|
@@ -10053,7 +10066,7 @@ function findAgentvRoot() {
|
|
|
10053
10066
|
for (let i = 0; i < 10; i++) {
|
|
10054
10067
|
try {
|
|
10055
10068
|
const pkg = import_node_path22.default.join(dir, "package.json");
|
|
10056
|
-
(0,
|
|
10069
|
+
(0, import_node_fs10.accessSync)(pkg);
|
|
10057
10070
|
return dir;
|
|
10058
10071
|
} catch {
|
|
10059
10072
|
const parent = import_node_path22.default.dirname(dir);
|
|
@@ -10143,6 +10156,11 @@ var PiCodingAgentProvider = class {
|
|
|
10143
10156
|
const modelId = this.config.model ?? "gemini-2.5-flash";
|
|
10144
10157
|
this.setApiKeyEnv(providerName);
|
|
10145
10158
|
const model = sdk.getModel(providerName, modelId);
|
|
10159
|
+
if (!model) {
|
|
10160
|
+
throw new Error(
|
|
10161
|
+
`pi-coding-agent: getModel('${providerName}', '${modelId}') returned undefined. The model '${modelId}' is not registered for provider '${providerName}' in pi-ai. Check that subprovider and model are correct in your target config.`
|
|
10162
|
+
);
|
|
10163
|
+
}
|
|
10146
10164
|
const tools = this.resolveTools(sdk);
|
|
10147
10165
|
const { session } = await sdk.createAgentSession({
|
|
10148
10166
|
cwd,
|
|
@@ -10257,6 +10275,15 @@ ${fileList}`;
|
|
|
10257
10275
|
await session.prompt(prompt);
|
|
10258
10276
|
}
|
|
10259
10277
|
const agentMessages = session.agent.state.messages;
|
|
10278
|
+
const lastAssistant = [...agentMessages].reverse().find(
|
|
10279
|
+
(m) => !!m && typeof m === "object" && m.role === "assistant"
|
|
10280
|
+
);
|
|
10281
|
+
if (lastAssistant?.stopReason === "error") {
|
|
10282
|
+
const errorMsg = typeof lastAssistant.errorMessage === "string" ? lastAssistant.errorMessage : "unknown SDK error";
|
|
10283
|
+
throw new Error(
|
|
10284
|
+
`pi-coding-agent SDK error (provider: ${lastAssistant.provider ?? providerName}, model: ${lastAssistant.model ?? modelId}): ${errorMsg}`
|
|
10285
|
+
);
|
|
10286
|
+
}
|
|
10260
10287
|
const output = [];
|
|
10261
10288
|
for (const msg of agentMessages) {
|
|
10262
10289
|
output.push(convertAgentMessage(msg, toolTrackers, completedToolResults));
|
|
@@ -10372,7 +10399,7 @@ var PiStreamLogger2 = class _PiStreamLogger {
|
|
|
10372
10399
|
constructor(filePath, format) {
|
|
10373
10400
|
this.filePath = filePath;
|
|
10374
10401
|
this.format = format;
|
|
10375
|
-
this.stream = (0,
|
|
10402
|
+
this.stream = (0, import_node_fs10.createWriteStream)(filePath, { flags: "a" });
|
|
10376
10403
|
}
|
|
10377
10404
|
static async create(options) {
|
|
10378
10405
|
const logger = new _PiStreamLogger(options.filePath, options.format);
|
|
@@ -11565,8 +11592,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11565
11592
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
11566
11593
|
if (!parseResult.success) {
|
|
11567
11594
|
const firstError = parseResult.error.errors[0];
|
|
11568
|
-
const
|
|
11569
|
-
const prefix =
|
|
11595
|
+
const path52 = firstError?.path.join(".") || "";
|
|
11596
|
+
const prefix = path52 ? `${target.name} ${path52}: ` : `${target.name}: `;
|
|
11570
11597
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
11571
11598
|
}
|
|
11572
11599
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -11827,12 +11854,12 @@ var import_node_path33 = __toESM(require("path"), 1);
|
|
|
11827
11854
|
|
|
11828
11855
|
// src/evaluation/providers/vscode/utils/fs.ts
|
|
11829
11856
|
init_cjs_shims();
|
|
11830
|
-
var
|
|
11857
|
+
var import_node_fs11 = require("fs");
|
|
11831
11858
|
var import_promises20 = require("fs/promises");
|
|
11832
11859
|
var import_node_path24 = __toESM(require("path"), 1);
|
|
11833
11860
|
async function pathExists(target) {
|
|
11834
11861
|
try {
|
|
11835
|
-
await (0, import_promises20.access)(target,
|
|
11862
|
+
await (0, import_promises20.access)(target, import_node_fs11.constants.F_OK);
|
|
11836
11863
|
return true;
|
|
11837
11864
|
} catch {
|
|
11838
11865
|
return false;
|
|
@@ -13312,7 +13339,7 @@ function isAgentProvider(provider) {
|
|
|
13312
13339
|
|
|
13313
13340
|
// src/evaluation/providers/targets-file.ts
|
|
13314
13341
|
init_cjs_shims();
|
|
13315
|
-
var
|
|
13342
|
+
var import_node_fs12 = require("fs");
|
|
13316
13343
|
var import_promises27 = require("fs/promises");
|
|
13317
13344
|
var import_node_path36 = __toESM(require("path"), 1);
|
|
13318
13345
|
var import_yaml7 = require("yaml");
|
|
@@ -13344,7 +13371,7 @@ function assertTargetDefinition(value, index, filePath) {
|
|
|
13344
13371
|
}
|
|
13345
13372
|
async function fileExists3(filePath) {
|
|
13346
13373
|
try {
|
|
13347
|
-
await (0, import_promises27.access)(filePath,
|
|
13374
|
+
await (0, import_promises27.access)(filePath, import_node_fs12.constants.F_OK);
|
|
13348
13375
|
return true;
|
|
13349
13376
|
} catch {
|
|
13350
13377
|
return false;
|
|
@@ -13629,15 +13656,15 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
13629
13656
|
});
|
|
13630
13657
|
}
|
|
13631
13658
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
13632
|
-
const { mkdir: mkdir17, readFile:
|
|
13659
|
+
const { mkdir: mkdir17, readFile: readFile17, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
13633
13660
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
13634
|
-
const
|
|
13661
|
+
const path52 = await import("path");
|
|
13635
13662
|
const { randomUUID: randomUUID10 } = await import("crypto");
|
|
13636
|
-
const dir =
|
|
13663
|
+
const dir = path52.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
13637
13664
|
await mkdir17(dir, { recursive: true });
|
|
13638
|
-
const stdinPath =
|
|
13639
|
-
const stdoutPath =
|
|
13640
|
-
const stderrPath =
|
|
13665
|
+
const stdinPath = path52.join(dir, "stdin.txt");
|
|
13666
|
+
const stdoutPath = path52.join(dir, "stdout.txt");
|
|
13667
|
+
const stderrPath = path52.join(dir, "stderr.txt");
|
|
13641
13668
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
13642
13669
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
13643
13670
|
const { spawn: spawn5 } = await import("child_process");
|
|
@@ -13667,8 +13694,8 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
|
13667
13694
|
resolve(code ?? 0);
|
|
13668
13695
|
});
|
|
13669
13696
|
});
|
|
13670
|
-
const stdout = (await
|
|
13671
|
-
const stderr = (await
|
|
13697
|
+
const stdout = (await readFile17(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
|
|
13698
|
+
const stderr = (await readFile17(stderrPath, "utf8")).replace(/\r\n/g, "\n");
|
|
13672
13699
|
return { stdout, stderr, exitCode };
|
|
13673
13700
|
} finally {
|
|
13674
13701
|
await rm6(dir, { recursive: true, force: true });
|
|
@@ -15148,11 +15175,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
15148
15175
|
execute: async (input) => {
|
|
15149
15176
|
try {
|
|
15150
15177
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
15151
|
-
const
|
|
15152
|
-
if (
|
|
15178
|
+
const stat10 = await import_promises29.default.stat(resolved);
|
|
15179
|
+
if (stat10.isDirectory()) {
|
|
15153
15180
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
15154
15181
|
}
|
|
15155
|
-
const buffer = Buffer.alloc(Math.min(
|
|
15182
|
+
const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
|
|
15156
15183
|
const fd = await import_promises29.default.open(resolved, "r");
|
|
15157
15184
|
try {
|
|
15158
15185
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -15160,8 +15187,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
15160
15187
|
await fd.close();
|
|
15161
15188
|
}
|
|
15162
15189
|
const content = buffer.toString("utf-8");
|
|
15163
|
-
const truncated =
|
|
15164
|
-
return { content, truncated, size:
|
|
15190
|
+
const truncated = stat10.size > MAX_FILE_SIZE;
|
|
15191
|
+
return { content, truncated, size: stat10.size };
|
|
15165
15192
|
} catch (error) {
|
|
15166
15193
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
15167
15194
|
}
|
|
@@ -15212,8 +15239,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
15212
15239
|
const ext = import_node_path39.default.extname(entry.name).toLowerCase();
|
|
15213
15240
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
15214
15241
|
try {
|
|
15215
|
-
const
|
|
15216
|
-
if (
|
|
15242
|
+
const stat10 = await import_promises29.default.stat(fullPath);
|
|
15243
|
+
if (stat10.size > MAX_FILE_SIZE) continue;
|
|
15217
15244
|
const content = await import_promises29.default.readFile(fullPath, "utf-8");
|
|
15218
15245
|
const lines = content.split("\n");
|
|
15219
15246
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -15857,115 +15884,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
15857
15884
|
* Evaluate a single field against the expected value.
|
|
15858
15885
|
*/
|
|
15859
15886
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
15860
|
-
const { path:
|
|
15861
|
-
const candidateValue = resolvePath(candidateData,
|
|
15862
|
-
const expectedValue = resolvePath(expectedData,
|
|
15887
|
+
const { path: path52, match, required = true, weight = 1 } = fieldConfig;
|
|
15888
|
+
const candidateValue = resolvePath(candidateData, path52);
|
|
15889
|
+
const expectedValue = resolvePath(expectedData, path52);
|
|
15863
15890
|
if (expectedValue === void 0) {
|
|
15864
15891
|
return {
|
|
15865
|
-
path:
|
|
15892
|
+
path: path52,
|
|
15866
15893
|
score: 1,
|
|
15867
15894
|
// No expected value means no comparison needed
|
|
15868
15895
|
weight,
|
|
15869
15896
|
hit: true,
|
|
15870
|
-
message: `${
|
|
15897
|
+
message: `${path52}: no expected value`
|
|
15871
15898
|
};
|
|
15872
15899
|
}
|
|
15873
15900
|
if (candidateValue === void 0) {
|
|
15874
15901
|
if (required) {
|
|
15875
15902
|
return {
|
|
15876
|
-
path:
|
|
15903
|
+
path: path52,
|
|
15877
15904
|
score: 0,
|
|
15878
15905
|
weight,
|
|
15879
15906
|
hit: false,
|
|
15880
|
-
message: `${
|
|
15907
|
+
message: `${path52} (required, missing)`
|
|
15881
15908
|
};
|
|
15882
15909
|
}
|
|
15883
15910
|
return {
|
|
15884
|
-
path:
|
|
15911
|
+
path: path52,
|
|
15885
15912
|
score: 1,
|
|
15886
15913
|
// Don't penalize missing optional fields
|
|
15887
15914
|
weight: 0,
|
|
15888
15915
|
// Zero weight means it won't affect the score
|
|
15889
15916
|
hit: true,
|
|
15890
|
-
message: `${
|
|
15917
|
+
message: `${path52}: optional field missing`
|
|
15891
15918
|
};
|
|
15892
15919
|
}
|
|
15893
15920
|
switch (match) {
|
|
15894
15921
|
case "exact":
|
|
15895
|
-
return this.compareExact(
|
|
15922
|
+
return this.compareExact(path52, candidateValue, expectedValue, weight);
|
|
15896
15923
|
case "numeric_tolerance":
|
|
15897
15924
|
return this.compareNumericTolerance(
|
|
15898
|
-
|
|
15925
|
+
path52,
|
|
15899
15926
|
candidateValue,
|
|
15900
15927
|
expectedValue,
|
|
15901
15928
|
fieldConfig,
|
|
15902
15929
|
weight
|
|
15903
15930
|
);
|
|
15904
15931
|
case "date":
|
|
15905
|
-
return this.compareDate(
|
|
15932
|
+
return this.compareDate(path52, candidateValue, expectedValue, fieldConfig, weight);
|
|
15906
15933
|
default:
|
|
15907
15934
|
return {
|
|
15908
|
-
path:
|
|
15935
|
+
path: path52,
|
|
15909
15936
|
score: 0,
|
|
15910
15937
|
weight,
|
|
15911
15938
|
hit: false,
|
|
15912
|
-
message: `${
|
|
15939
|
+
message: `${path52}: unknown match type "${match}"`
|
|
15913
15940
|
};
|
|
15914
15941
|
}
|
|
15915
15942
|
}
|
|
15916
15943
|
/**
|
|
15917
15944
|
* Exact equality comparison.
|
|
15918
15945
|
*/
|
|
15919
|
-
compareExact(
|
|
15946
|
+
compareExact(path52, candidateValue, expectedValue, weight) {
|
|
15920
15947
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
15921
15948
|
return {
|
|
15922
|
-
path:
|
|
15949
|
+
path: path52,
|
|
15923
15950
|
score: 1,
|
|
15924
15951
|
weight,
|
|
15925
15952
|
hit: true,
|
|
15926
|
-
message:
|
|
15953
|
+
message: path52
|
|
15927
15954
|
};
|
|
15928
15955
|
}
|
|
15929
15956
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
15930
15957
|
return {
|
|
15931
|
-
path:
|
|
15958
|
+
path: path52,
|
|
15932
15959
|
score: 0,
|
|
15933
15960
|
weight,
|
|
15934
15961
|
hit: false,
|
|
15935
|
-
message: `${
|
|
15962
|
+
message: `${path52} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
15936
15963
|
};
|
|
15937
15964
|
}
|
|
15938
15965
|
return {
|
|
15939
|
-
path:
|
|
15966
|
+
path: path52,
|
|
15940
15967
|
score: 0,
|
|
15941
15968
|
weight,
|
|
15942
15969
|
hit: false,
|
|
15943
|
-
message: `${
|
|
15970
|
+
message: `${path52} (value mismatch)`
|
|
15944
15971
|
};
|
|
15945
15972
|
}
|
|
15946
15973
|
/**
|
|
15947
15974
|
* Numeric comparison with absolute or relative tolerance.
|
|
15948
15975
|
*/
|
|
15949
|
-
compareNumericTolerance(
|
|
15976
|
+
compareNumericTolerance(path52, candidateValue, expectedValue, fieldConfig, weight) {
|
|
15950
15977
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
15951
15978
|
const candidateNum = toNumber(candidateValue);
|
|
15952
15979
|
const expectedNum = toNumber(expectedValue);
|
|
15953
15980
|
if (candidateNum === null || expectedNum === null) {
|
|
15954
15981
|
return {
|
|
15955
|
-
path:
|
|
15982
|
+
path: path52,
|
|
15956
15983
|
score: 0,
|
|
15957
15984
|
weight,
|
|
15958
15985
|
hit: false,
|
|
15959
|
-
message: `${
|
|
15986
|
+
message: `${path52} (non-numeric value)`
|
|
15960
15987
|
};
|
|
15961
15988
|
}
|
|
15962
15989
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
15963
15990
|
return {
|
|
15964
|
-
path:
|
|
15991
|
+
path: path52,
|
|
15965
15992
|
score: 0,
|
|
15966
15993
|
weight,
|
|
15967
15994
|
hit: false,
|
|
15968
|
-
message: `${
|
|
15995
|
+
message: `${path52} (invalid numeric value)`
|
|
15969
15996
|
};
|
|
15970
15997
|
}
|
|
15971
15998
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -15978,61 +16005,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
15978
16005
|
}
|
|
15979
16006
|
if (withinTolerance) {
|
|
15980
16007
|
return {
|
|
15981
|
-
path:
|
|
16008
|
+
path: path52,
|
|
15982
16009
|
score: 1,
|
|
15983
16010
|
weight,
|
|
15984
16011
|
hit: true,
|
|
15985
|
-
message: `${
|
|
16012
|
+
message: `${path52} (within tolerance: diff=${diff.toFixed(2)})`
|
|
15986
16013
|
};
|
|
15987
16014
|
}
|
|
15988
16015
|
return {
|
|
15989
|
-
path:
|
|
16016
|
+
path: path52,
|
|
15990
16017
|
score: 0,
|
|
15991
16018
|
weight,
|
|
15992
16019
|
hit: false,
|
|
15993
|
-
message: `${
|
|
16020
|
+
message: `${path52} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
15994
16021
|
};
|
|
15995
16022
|
}
|
|
15996
16023
|
/**
|
|
15997
16024
|
* Date comparison with format normalization.
|
|
15998
16025
|
*/
|
|
15999
|
-
compareDate(
|
|
16026
|
+
compareDate(path52, candidateValue, expectedValue, fieldConfig, weight) {
|
|
16000
16027
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
16001
16028
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
16002
16029
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
16003
16030
|
if (candidateDate === null) {
|
|
16004
16031
|
return {
|
|
16005
|
-
path:
|
|
16032
|
+
path: path52,
|
|
16006
16033
|
score: 0,
|
|
16007
16034
|
weight,
|
|
16008
16035
|
hit: false,
|
|
16009
|
-
message: `${
|
|
16036
|
+
message: `${path52} (unparseable candidate date)`
|
|
16010
16037
|
};
|
|
16011
16038
|
}
|
|
16012
16039
|
if (expectedDate === null) {
|
|
16013
16040
|
return {
|
|
16014
|
-
path:
|
|
16041
|
+
path: path52,
|
|
16015
16042
|
score: 0,
|
|
16016
16043
|
weight,
|
|
16017
16044
|
hit: false,
|
|
16018
|
-
message: `${
|
|
16045
|
+
message: `${path52} (unparseable expected date)`
|
|
16019
16046
|
};
|
|
16020
16047
|
}
|
|
16021
16048
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
16022
16049
|
return {
|
|
16023
|
-
path:
|
|
16050
|
+
path: path52,
|
|
16024
16051
|
score: 1,
|
|
16025
16052
|
weight,
|
|
16026
16053
|
hit: true,
|
|
16027
|
-
message:
|
|
16054
|
+
message: path52
|
|
16028
16055
|
};
|
|
16029
16056
|
}
|
|
16030
16057
|
return {
|
|
16031
|
-
path:
|
|
16058
|
+
path: path52,
|
|
16032
16059
|
score: 0,
|
|
16033
16060
|
weight,
|
|
16034
16061
|
hit: false,
|
|
16035
|
-
message: `${
|
|
16062
|
+
message: `${path52} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
16036
16063
|
};
|
|
16037
16064
|
}
|
|
16038
16065
|
/**
|
|
@@ -16065,11 +16092,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
16065
16092
|
};
|
|
16066
16093
|
}
|
|
16067
16094
|
};
|
|
16068
|
-
function resolvePath(obj,
|
|
16069
|
-
if (!
|
|
16095
|
+
function resolvePath(obj, path52) {
|
|
16096
|
+
if (!path52 || !obj) {
|
|
16070
16097
|
return void 0;
|
|
16071
16098
|
}
|
|
16072
|
-
const parts =
|
|
16099
|
+
const parts = path52.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
16073
16100
|
let current = obj;
|
|
16074
16101
|
for (const part of parts) {
|
|
16075
16102
|
if (current === null || current === void 0) {
|
|
@@ -16295,6 +16322,14 @@ var SkillTriggerEvaluator = class {
|
|
|
16295
16322
|
evidence = `Read tool loaded skill file via tool name "${toolName}"`;
|
|
16296
16323
|
break;
|
|
16297
16324
|
}
|
|
16325
|
+
if (!triggered && toolCall.output != null) {
|
|
16326
|
+
const outputStr = typeof toolCall.output === "string" ? toolCall.output : JSON.stringify(toolCall.output);
|
|
16327
|
+
if (outputStr.includes(`skills/${skillName}/`)) {
|
|
16328
|
+
triggered = true;
|
|
16329
|
+
evidence = `Tool "${toolName}" output referenced skill file for "${skillName}"`;
|
|
16330
|
+
break;
|
|
16331
|
+
}
|
|
16332
|
+
}
|
|
16298
16333
|
}
|
|
16299
16334
|
const pass = triggered === shouldTrigger;
|
|
16300
16335
|
if (pass) {
|
|
@@ -16558,8 +16593,8 @@ var TokenUsageEvaluator = class {
|
|
|
16558
16593
|
|
|
16559
16594
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
16560
16595
|
init_cjs_shims();
|
|
16561
|
-
function getNestedValue(obj,
|
|
16562
|
-
const parts =
|
|
16596
|
+
function getNestedValue(obj, path52) {
|
|
16597
|
+
const parts = path52.split(".");
|
|
16563
16598
|
let current = obj;
|
|
16564
16599
|
for (const part of parts) {
|
|
16565
16600
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -17957,7 +17992,7 @@ function getTCritical(df) {
|
|
|
17957
17992
|
// src/evaluation/workspace/file-changes.ts
|
|
17958
17993
|
init_cjs_shims();
|
|
17959
17994
|
var import_node_child_process8 = require("child_process");
|
|
17960
|
-
var
|
|
17995
|
+
var import_node_fs13 = require("fs");
|
|
17961
17996
|
var import_node_path43 = __toESM(require("path"), 1);
|
|
17962
17997
|
var import_node_util4 = require("util");
|
|
17963
17998
|
var execAsync4 = (0, import_node_util4.promisify)(import_node_child_process8.exec);
|
|
@@ -17986,7 +18021,7 @@ async function captureFileChanges(workspacePath, baselineCommit) {
|
|
|
17986
18021
|
async function stageNestedRepoChanges(workspacePath) {
|
|
17987
18022
|
let entries;
|
|
17988
18023
|
try {
|
|
17989
|
-
entries = (0,
|
|
18024
|
+
entries = (0, import_node_fs13.readdirSync)(workspacePath);
|
|
17990
18025
|
} catch {
|
|
17991
18026
|
return;
|
|
17992
18027
|
}
|
|
@@ -17994,8 +18029,8 @@ async function stageNestedRepoChanges(workspacePath) {
|
|
|
17994
18029
|
if (entry === ".git" || entry === "node_modules") continue;
|
|
17995
18030
|
const childPath = import_node_path43.default.join(workspacePath, entry);
|
|
17996
18031
|
try {
|
|
17997
|
-
if (!(0,
|
|
17998
|
-
if (!(0,
|
|
18032
|
+
if (!(0, import_node_fs13.statSync)(childPath).isDirectory()) continue;
|
|
18033
|
+
if (!(0, import_node_fs13.statSync)(import_node_path43.default.join(childPath, ".git")).isDirectory()) continue;
|
|
17999
18034
|
} catch {
|
|
18000
18035
|
continue;
|
|
18001
18036
|
}
|
|
@@ -18115,7 +18150,7 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
|
|
|
18115
18150
|
init_cjs_shims();
|
|
18116
18151
|
var import_node_child_process9 = require("child_process");
|
|
18117
18152
|
var import_node_crypto10 = require("crypto");
|
|
18118
|
-
var
|
|
18153
|
+
var import_node_fs14 = require("fs");
|
|
18119
18154
|
var import_promises31 = require("fs/promises");
|
|
18120
18155
|
var import_node_path45 = __toESM(require("path"), 1);
|
|
18121
18156
|
var import_node_util5 = require("util");
|
|
@@ -18221,7 +18256,7 @@ var WorkspacePoolManager = class {
|
|
|
18221
18256
|
if (!locked) {
|
|
18222
18257
|
continue;
|
|
18223
18258
|
}
|
|
18224
|
-
const slotExists = (0,
|
|
18259
|
+
const slotExists = (0, import_node_fs14.existsSync)(slotPath);
|
|
18225
18260
|
if (slotExists) {
|
|
18226
18261
|
await this.resetSlot(slotPath, templatePath, repos, poolReset);
|
|
18227
18262
|
return {
|
|
@@ -18327,7 +18362,7 @@ var WorkspacePoolManager = class {
|
|
|
18327
18362
|
for (const entry of entries) {
|
|
18328
18363
|
if (entry.startsWith("slot-") && !entry.endsWith(".lock")) {
|
|
18329
18364
|
const lockPath = import_node_path45.default.join(poolDir, `${entry}.lock`);
|
|
18330
|
-
if ((0,
|
|
18365
|
+
if ((0, import_node_fs14.existsSync)(lockPath)) {
|
|
18331
18366
|
try {
|
|
18332
18367
|
const pidStr = await (0, import_promises31.readFile)(lockPath, "utf-8");
|
|
18333
18368
|
const pid = Number.parseInt(pidStr.trim(), 10);
|
|
@@ -18358,7 +18393,7 @@ var WorkspacePoolManager = class {
|
|
|
18358
18393
|
async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
|
|
18359
18394
|
for (const repo of repos) {
|
|
18360
18395
|
const repoDir = import_node_path45.default.join(slotPath, repo.path);
|
|
18361
|
-
if (!(0,
|
|
18396
|
+
if (!(0, import_node_fs14.existsSync)(repoDir)) {
|
|
18362
18397
|
continue;
|
|
18363
18398
|
}
|
|
18364
18399
|
if (poolReset === "none") {
|
|
@@ -18384,7 +18419,7 @@ var WorkspacePoolManager = class {
|
|
|
18384
18419
|
// src/evaluation/workspace/repo-manager.ts
|
|
18385
18420
|
init_cjs_shims();
|
|
18386
18421
|
var import_node_child_process10 = require("child_process");
|
|
18387
|
-
var
|
|
18422
|
+
var import_node_fs15 = require("fs");
|
|
18388
18423
|
var import_node_path46 = __toESM(require("path"), 1);
|
|
18389
18424
|
var import_node_util6 = require("util");
|
|
18390
18425
|
var execFileAsync2 = (0, import_node_util6.promisify)(import_node_child_process10.execFile);
|
|
@@ -18436,7 +18471,7 @@ var RepoManager = class {
|
|
|
18436
18471
|
resolvedSourcePath: sourcePath ?? "",
|
|
18437
18472
|
reason: "empty_path"
|
|
18438
18473
|
});
|
|
18439
|
-
} else if (!(0,
|
|
18474
|
+
} else if (!(0, import_node_fs15.existsSync)(sourcePath)) {
|
|
18440
18475
|
errors.push({
|
|
18441
18476
|
repoPath: repo.path,
|
|
18442
18477
|
resolvedSourcePath: sourcePath,
|
|
@@ -18665,8 +18700,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
|
|
|
18665
18700
|
}
|
|
18666
18701
|
|
|
18667
18702
|
// src/evaluation/orchestrator.ts
|
|
18668
|
-
function classifyQualityStatus(score) {
|
|
18669
|
-
return score >=
|
|
18703
|
+
function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
|
|
18704
|
+
return score >= threshold ? "ok" : "quality_failure";
|
|
18670
18705
|
}
|
|
18671
18706
|
function buildSkippedEvaluatorError(scores) {
|
|
18672
18707
|
const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
|
|
@@ -18742,7 +18777,8 @@ async function runEvaluation(options) {
|
|
|
18742
18777
|
retainOnSuccess,
|
|
18743
18778
|
retainOnFailure,
|
|
18744
18779
|
graderTarget: cliGraderTarget,
|
|
18745
|
-
model: cliModel
|
|
18780
|
+
model: cliModel,
|
|
18781
|
+
threshold: scoreThreshold
|
|
18746
18782
|
} = options;
|
|
18747
18783
|
let useCache = options.useCache;
|
|
18748
18784
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -18871,7 +18907,8 @@ async function runEvaluation(options) {
|
|
|
18871
18907
|
resolveGraderProvider,
|
|
18872
18908
|
agentTimeoutMs,
|
|
18873
18909
|
targetResolver,
|
|
18874
|
-
availableTargets
|
|
18910
|
+
availableTargets,
|
|
18911
|
+
threshold: scoreThreshold
|
|
18875
18912
|
});
|
|
18876
18913
|
} catch (error) {
|
|
18877
18914
|
if (verbose) {
|
|
@@ -19240,7 +19277,8 @@ async function runEvaluation(options) {
|
|
|
19240
19277
|
typeRegistry,
|
|
19241
19278
|
repoManager,
|
|
19242
19279
|
evalDir,
|
|
19243
|
-
verbose
|
|
19280
|
+
verbose,
|
|
19281
|
+
threshold: scoreThreshold
|
|
19244
19282
|
};
|
|
19245
19283
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
19246
19284
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -19398,7 +19436,8 @@ async function runBatchEvaluation(options) {
|
|
|
19398
19436
|
resolveGraderProvider,
|
|
19399
19437
|
agentTimeoutMs,
|
|
19400
19438
|
targetResolver,
|
|
19401
|
-
availableTargets
|
|
19439
|
+
availableTargets,
|
|
19440
|
+
threshold: batchThreshold
|
|
19402
19441
|
} = options;
|
|
19403
19442
|
const promptInputsList = [];
|
|
19404
19443
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -19483,7 +19522,8 @@ async function runBatchEvaluation(options) {
|
|
|
19483
19522
|
endTime,
|
|
19484
19523
|
targetResolver,
|
|
19485
19524
|
availableTargets,
|
|
19486
|
-
verbose
|
|
19525
|
+
verbose,
|
|
19526
|
+
threshold: batchThreshold
|
|
19487
19527
|
});
|
|
19488
19528
|
if (providerError) {
|
|
19489
19529
|
result = {
|
|
@@ -19569,7 +19609,8 @@ async function runEvalCase(options) {
|
|
|
19569
19609
|
typeRegistry: providedTypeRegistry,
|
|
19570
19610
|
repoManager,
|
|
19571
19611
|
evalDir,
|
|
19572
|
-
verbose
|
|
19612
|
+
verbose,
|
|
19613
|
+
threshold: caseThreshold
|
|
19573
19614
|
} = options;
|
|
19574
19615
|
const setupDebug = process.env.AGENTV_SETUP_DEBUG === "1";
|
|
19575
19616
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
@@ -19801,7 +19842,9 @@ async function runEvalCase(options) {
|
|
|
19801
19842
|
});
|
|
19802
19843
|
} catch (error) {
|
|
19803
19844
|
lastError = error;
|
|
19804
|
-
if (
|
|
19845
|
+
if (attempt + 1 < attemptBudget) {
|
|
19846
|
+
const delayMs = retryBackoffMs(attempt);
|
|
19847
|
+
await sleep3(delayMs, signal);
|
|
19805
19848
|
attempt += 1;
|
|
19806
19849
|
continue;
|
|
19807
19850
|
}
|
|
@@ -19932,7 +19975,8 @@ async function runEvalCase(options) {
|
|
|
19932
19975
|
availableTargets,
|
|
19933
19976
|
fileChanges,
|
|
19934
19977
|
workspacePath,
|
|
19935
|
-
verbose
|
|
19978
|
+
verbose,
|
|
19979
|
+
threshold: caseThreshold
|
|
19936
19980
|
});
|
|
19937
19981
|
const totalDurationMs = Date.now() - caseStartMs;
|
|
19938
19982
|
const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
|
|
@@ -19947,7 +19991,7 @@ async function runEvalCase(options) {
|
|
|
19947
19991
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
19948
19992
|
};
|
|
19949
19993
|
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
19950
|
-
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
|
|
19994
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
|
|
19951
19995
|
const finalResult = providerError ? {
|
|
19952
19996
|
...result,
|
|
19953
19997
|
evalRun,
|
|
@@ -20118,7 +20162,8 @@ async function evaluateCandidate(options) {
|
|
|
20118
20162
|
targetResolver,
|
|
20119
20163
|
availableTargets,
|
|
20120
20164
|
fileChanges,
|
|
20121
|
-
workspacePath
|
|
20165
|
+
workspacePath,
|
|
20166
|
+
threshold: evalThreshold
|
|
20122
20167
|
} = options;
|
|
20123
20168
|
const gradeTimestamp = nowFn();
|
|
20124
20169
|
const { score, scores } = await runEvaluatorsForCase({
|
|
@@ -20191,7 +20236,7 @@ async function evaluateCandidate(options) {
|
|
|
20191
20236
|
scores,
|
|
20192
20237
|
trace: trace2,
|
|
20193
20238
|
fileChanges,
|
|
20194
|
-
executionStatus: classifyQualityStatus(score.score)
|
|
20239
|
+
executionStatus: classifyQualityStatus(score.score, evalThreshold)
|
|
20195
20240
|
};
|
|
20196
20241
|
}
|
|
20197
20242
|
async function runEvaluatorsForCase(options) {
|
|
@@ -20495,7 +20540,7 @@ async function invokeProvider(provider, options) {
|
|
|
20495
20540
|
}
|
|
20496
20541
|
}
|
|
20497
20542
|
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode, verbose) {
|
|
20498
|
-
const message =
|
|
20543
|
+
const message = extractErrorMessage(error);
|
|
20499
20544
|
let agentRequest;
|
|
20500
20545
|
let lmRequest;
|
|
20501
20546
|
if (isAgentProvider(provider)) {
|
|
@@ -20612,20 +20657,45 @@ function aggregateEvaluatorTokenUsage(scores) {
|
|
|
20612
20657
|
...hasCached ? { cached } : {}
|
|
20613
20658
|
};
|
|
20614
20659
|
}
|
|
20615
|
-
function
|
|
20616
|
-
if (!error) {
|
|
20617
|
-
return false;
|
|
20618
|
-
}
|
|
20619
|
-
if (typeof DOMException !== "undefined" && error instanceof DOMException && error.name === "AbortError") {
|
|
20620
|
-
return true;
|
|
20621
|
-
}
|
|
20660
|
+
/**
 * Turn an arbitrary thrown value into a message string.
 *
 * - `Error` instances yield their `message`.
 * - Other objects yield their non-empty string `message` and/or numeric `code`
 *   (rendered as `(code N)`), joined by a space. Objects carrying neither are
 *   JSON-serialized.
 * - Anything else (and anything that cannot be serialized) is coerced with `String()`.
 *
 * @param {unknown} error - The thrown/rejected value.
 * @returns {string} A printable description of the error.
 */
function extractErrorMessage(error) {
  if (error instanceof Error) {
    return error.message;
  }
  if (error !== null && typeof error === "object") {
    const parts = [];
    if (typeof error.message === "string" && error.message) {
      parts.push(error.message);
    }
    if (typeof error.code === "number") {
      parts.push(`(code ${error.code})`);
    }
    if (parts.length > 0) {
      return parts.join(" ");
    }
    try {
      const serialized = JSON.stringify(error);
      // JSON.stringify yields undefined (not a string) when toJSON() returns
      // undefined; only accept a real string so callers always get one.
      if (typeof serialized === "string") {
        return serialized;
      }
    } catch {
      // Not serializable (circular reference, BigInt, ...): fall through to String().
    }
  }
  return String(error);
}
|
|
20683
|
+
/**
 * Exponential retry delay: 1s, 2s, 4s, ... doubling per attempt and capped at 30s.
 *
 * @param {number} attempt - Zero-based index of the attempt that just failed.
 *   Negative, NaN, or non-numeric values are treated as attempt 0 so the delay
 *   never drops below the 1s base.
 * @returns {number} Delay in milliseconds, between 1000 and 30000.
 */
function retryBackoffMs(attempt) {
  const baseDelayMs = 1e3;
  const maxDelayMs = 3e4;
  // `NaN > 0` is false, so NaN falls back to 0 here as well.
  const exponent = typeof attempt === "number" && attempt > 0 ? attempt : 0;
  return Math.min(2 ** exponent * baseDelayMs, maxDelayMs);
}
|
|
20686
|
+
/**
 * Wait for `ms` milliseconds, ending early if `signal` aborts.
 *
 * The returned promise always resolves (never rejects): an abort simply cuts
 * the wait short, and an already-aborted signal resolves immediately.
 *
 * @param {number} ms - Delay in milliseconds.
 * @param {AbortSignal} [signal] - Optional signal that ends the wait early.
 * @returns {Promise<void>}
 */
function sleep3(ms, signal) {
  if (signal?.aborted) return Promise.resolve();
  return new Promise((resolve) => {
    const onAbort = () => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Detach the listener once the wait completes normally; otherwise every
      // call would leave a listener on a long-lived signal.
      signal?.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal?.addEventListener("abort", onAbort, { once: true });
  });
}
|
|
20630
20700
|
function mapChildResults(children) {
|
|
20631
20701
|
if (!children || children.length === 0) {
|
|
@@ -20657,7 +20727,7 @@ function computeWeightedMean(entries) {
|
|
|
20657
20727
|
|
|
20658
20728
|
// src/evaluation/evaluate.ts
|
|
20659
20729
|
init_cjs_shims();
|
|
20660
|
-
var
|
|
20730
|
+
var import_node_fs16 = require("fs");
|
|
20661
20731
|
var import_node_path49 = __toESM(require("path"), 1);
|
|
20662
20732
|
|
|
20663
20733
|
// src/evaluation/providers/function-provider.ts
|
|
@@ -20814,7 +20884,7 @@ async function discoverDefaultTarget(repoRoot) {
|
|
|
20814
20884
|
for (const dir of chain) {
|
|
20815
20885
|
for (const candidate of TARGET_FILE_CANDIDATES) {
|
|
20816
20886
|
const targetsPath = import_node_path49.default.join(dir, candidate);
|
|
20817
|
-
if (!(0,
|
|
20887
|
+
if (!(0, import_node_fs16.existsSync)(targetsPath)) continue;
|
|
20818
20888
|
try {
|
|
20819
20889
|
const definitions = await readTargetDefinitions(targetsPath);
|
|
20820
20890
|
const defaultTarget = definitions.find((d) => d.name === "default");
|
|
@@ -20831,7 +20901,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
|
|
|
20831
20901
|
const envFiles = [];
|
|
20832
20902
|
for (const dir of chain) {
|
|
20833
20903
|
const envPath = import_node_path49.default.join(dir, ".env");
|
|
20834
|
-
if ((0,
|
|
20904
|
+
if ((0, import_node_fs16.existsSync)(envPath)) envFiles.push(envPath);
|
|
20835
20905
|
}
|
|
20836
20906
|
for (let i = 0; i < envFiles.length; i++) {
|
|
20837
20907
|
try {
|
|
@@ -20908,12 +20978,12 @@ var CONFIG_FILE_NAMES = [
|
|
|
20908
20978
|
".agentv/config.js"
|
|
20909
20979
|
];
|
|
20910
20980
|
async function loadTsConfig(projectRoot) {
|
|
20911
|
-
const { existsSync:
|
|
20981
|
+
const { existsSync: existsSync7 } = await import("fs");
|
|
20912
20982
|
const { pathToFileURL } = await import("url");
|
|
20913
20983
|
const { join: join2 } = await import("path");
|
|
20914
20984
|
for (const fileName of CONFIG_FILE_NAMES) {
|
|
20915
20985
|
const filePath = join2(projectRoot, fileName);
|
|
20916
|
-
if (!
|
|
20986
|
+
if (!existsSync7(filePath)) {
|
|
20917
20987
|
continue;
|
|
20918
20988
|
}
|
|
20919
20989
|
try {
|
|
@@ -21060,7 +21130,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
21060
21130
|
|
|
21061
21131
|
// src/projects.ts
|
|
21062
21132
|
init_cjs_shims();
|
|
21063
|
-
var
|
|
21133
|
+
var import_node_fs17 = require("fs");
|
|
21064
21134
|
var import_node_path51 = __toESM(require("path"), 1);
|
|
21065
21135
|
var import_yaml8 = require("yaml");
|
|
21066
21136
|
function getProjectsRegistryPath() {
|
|
@@ -21068,11 +21138,11 @@ function getProjectsRegistryPath() {
|
|
|
21068
21138
|
}
|
|
21069
21139
|
function loadProjectRegistry() {
|
|
21070
21140
|
const registryPath = getProjectsRegistryPath();
|
|
21071
|
-
if (!(0,
|
|
21141
|
+
if (!(0, import_node_fs17.existsSync)(registryPath)) {
|
|
21072
21142
|
return { projects: [] };
|
|
21073
21143
|
}
|
|
21074
21144
|
try {
|
|
21075
|
-
const raw = (0,
|
|
21145
|
+
const raw = (0, import_node_fs17.readFileSync)(registryPath, "utf-8");
|
|
21076
21146
|
const parsed = (0, import_yaml8.parse)(raw);
|
|
21077
21147
|
if (!parsed || !Array.isArray(parsed.projects)) {
|
|
21078
21148
|
return { projects: [] };
|
|
@@ -21085,10 +21155,10 @@ function loadProjectRegistry() {
|
|
|
21085
21155
|
/**
 * Serialize the registry as YAML and write it to the path returned by
 * getProjectsRegistryPath(), creating the parent directory when needed.
 *
 * @param {object} registry - Registry object to persist.
 */
function saveProjectRegistry(registry) {
  const registryPath = getProjectsRegistryPath();
  const dir = import_node_path51.default.dirname(registryPath);
  // With `recursive: true` mkdirSync does not fail when the directory already
  // exists, so no separate existence check (and no check-then-create race).
  (0, import_node_fs17.mkdirSync)(dir, { recursive: true });
  (0, import_node_fs17.writeFileSync)(registryPath, (0, import_yaml8.stringify)(registry), "utf-8");
}
|
|
21093
21163
|
function deriveProjectId(dirPath, existingIds) {
|
|
21094
21164
|
const base = import_node_path51.default.basename(dirPath).toLowerCase().replace(/[^a-z0-9-]/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "");
|
|
@@ -21102,10 +21172,10 @@ function deriveProjectId(dirPath, existingIds) {
|
|
|
21102
21172
|
}
|
|
21103
21173
|
function addProject(projectPath) {
|
|
21104
21174
|
const absPath = import_node_path51.default.resolve(projectPath);
|
|
21105
|
-
if (!(0,
|
|
21175
|
+
if (!(0, import_node_fs17.existsSync)(absPath)) {
|
|
21106
21176
|
throw new Error(`Directory not found: ${absPath}`);
|
|
21107
21177
|
}
|
|
21108
|
-
if (!(0,
|
|
21178
|
+
if (!(0, import_node_fs17.existsSync)(import_node_path51.default.join(absPath, ".agentv"))) {
|
|
21109
21179
|
throw new Error(`No .agentv/ directory found in ${absPath}. Run an evaluation first.`);
|
|
21110
21180
|
}
|
|
21111
21181
|
const registry = loadProjectRegistry();
|
|
@@ -21149,19 +21219,19 @@ function touchProject(projectId) {
|
|
|
21149
21219
|
}
|
|
21150
21220
|
function discoverProjects(rootDir, maxDepth = 2) {
|
|
21151
21221
|
const absRoot = import_node_path51.default.resolve(rootDir);
|
|
21152
|
-
if (!(0,
|
|
21222
|
+
if (!(0, import_node_fs17.existsSync)(absRoot) || !(0, import_node_fs17.statSync)(absRoot).isDirectory()) {
|
|
21153
21223
|
return [];
|
|
21154
21224
|
}
|
|
21155
21225
|
const results = [];
|
|
21156
21226
|
function scan(dir, depth) {
|
|
21157
21227
|
if (depth > maxDepth) return;
|
|
21158
|
-
if ((0,
|
|
21228
|
+
if ((0, import_node_fs17.existsSync)(import_node_path51.default.join(dir, ".agentv"))) {
|
|
21159
21229
|
results.push(dir);
|
|
21160
21230
|
return;
|
|
21161
21231
|
}
|
|
21162
21232
|
if (depth === maxDepth) return;
|
|
21163
21233
|
try {
|
|
21164
|
-
const entries = (0,
|
|
21234
|
+
const entries = (0, import_node_fs17.readdirSync)(dir, { withFileTypes: true });
|
|
21165
21235
|
for (const entry of entries) {
|
|
21166
21236
|
if (!entry.isDirectory()) continue;
|
|
21167
21237
|
if (entry.name.startsWith(".") || entry.name === "node_modules") continue;
|
|
@@ -21737,6 +21807,254 @@ function toHrTime(iso) {
|
|
|
21737
21807
|
// src/observability/index.ts
|
|
21738
21808
|
init_otlp_json_file_exporter();
|
|
21739
21809
|
|
|
21810
|
+
// src/import/index.ts
|
|
21811
|
+
init_cjs_shims();
|
|
21812
|
+
|
|
21813
|
+
// src/import/claude-parser.ts
|
|
21814
|
+
init_cjs_shims();
|
|
21815
|
+
// Event `type` values that parseClaudeSession does not turn into messages
// (their timestamps are still recorded before the event is skipped).
var SKIPPED_TYPES = /* @__PURE__ */ new Set(["progress", "system", "file-history-snapshot"]);
|
|
21816
|
+
/**
 * Parse the JSONL text of a Claude session into a transcript.
 *
 * Each non-blank line is parsed as one JSON event. Lines that are not valid
 * JSON, are not objects, or have no `type` are ignored. Events whose type is
 * in SKIPPED_TYPES and events flagged `isSidechain` contribute only their
 * timestamp. `user` events become user messages and supply tool results for
 * earlier assistant tool calls; `assistant` events become assistant messages.
 *
 * @param {string} jsonl - Raw newline-delimited JSON.
 * @returns {{
 *   messages: Array<object>,
 *   source: { provider: "claude", sessionId: string, projectPath?: string, startedAt?: string, model?: string },
 *   tokenUsage: { input: number, output: number } | undefined,
 *   durationMs: number | undefined,
 *   costUsd: null
 * }} `tokenUsage` is undefined when no assistant event carried usage together
 *   with a requestId; `durationMs` is undefined when timestamps are missing
 *   or unparseable.
 */
function parseClaudeSession(jsonl) {
  const messages = [];
  let sessionId = "";
  let projectPath;
  let model;
  let startTimestamp;
  let endTimestamp;
  // Keyed by requestId so repeated events for one request are counted once.
  const usageByRequestId = /* @__PURE__ */ new Map();
  let lastAssistantRequestId;
  let lastAssistantIdx = -1;
  // tool_use id -> position of the call inside `messages`, awaiting its result.
  const pendingToolCalls = /* @__PURE__ */ new Map();

  for (const line of jsonl.split("\n")) {
    if (line.trim().length === 0) continue;
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Malformed line: skip it rather than failing the whole transcript.
      continue;
    }
    // `null`, numbers and strings are valid JSON but not events.
    if (event === null || typeof event !== "object" || !event.type) continue;

    // Timestamps are tracked for every typed event, including skipped ones.
    if (event.timestamp) {
      if (!startTimestamp) startTimestamp = event.timestamp;
      endTimestamp = event.timestamp;
    }
    if (SKIPPED_TYPES.has(event.type) || event.isSidechain) continue;

    if (!sessionId && event.sessionId) sessionId = event.sessionId;
    if (!projectPath && event.cwd) projectPath = event.cwd;

    switch (event.type) {
      case "user": {
        const msg = event.message;
        if (!msg) break;
        const contentArr = msg.content;
        if (Array.isArray(contentArr)) {
          for (const block of contentArr) {
            if (!block || block.type !== "tool_result" || !block.tool_use_id) continue;
            const pending = pendingToolCalls.get(block.tool_use_id);
            if (!pending) continue;
            // Copy-on-write: replace the message and its call list instead of mutating.
            const existingMsg = messages[pending.msgIdx];
            const existingCalls = [...existingMsg.toolCalls ?? []];
            existingCalls[pending.toolIdx] = {
              ...existingCalls[pending.toolIdx],
              output: extractToolResultContent(block.content)
            };
            messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
            pendingToolCalls.delete(block.tool_use_id);
          }
        }
        const text = extractTextContent2(contentArr);
        if (text !== void 0) {
          messages.push({ role: "user", content: text });
        }
        break;
      }
      case "assistant": {
        const msg = event.message;
        if (!msg) break;
        if (!model && msg.model) model = msg.model;
        if (msg.usage && event.requestId) {
          usageByRequestId.set(event.requestId, msg.usage);
        }
        const { text, toolCalls } = extractAssistantContent(msg.content);
        const assistantMessage = {
          role: "assistant",
          content: text || void 0,
          toolCalls: toolCalls.length > 0 ? toolCalls : void 0
        };
        const continuesPrevious = Boolean(event.requestId) && event.requestId === lastAssistantRequestId && lastAssistantIdx >= 0;
        if (continuesPrevious) {
          // NOTE(review): a later event with the same requestId replaces the
          // earlier message wholesale — confirm later events carry the full content.
          messages[lastAssistantIdx] = assistantMessage;
          registerPendingToolCalls(toolCalls, lastAssistantIdx, pendingToolCalls);
        } else if (text || toolCalls.length > 0) {
          lastAssistantIdx = messages.length;
          messages.push(assistantMessage);
          registerPendingToolCalls(toolCalls, lastAssistantIdx, pendingToolCalls);
        }
        lastAssistantRequestId = event.requestId;
        break;
      }
    }
  }

  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  for (const usage of usageByRequestId.values()) {
    totalInputTokens += Number(usage.input_tokens ?? 0);
    totalOutputTokens += Number(usage.output_tokens ?? 0);
  }
  const hasUsage = usageByRequestId.size > 0;

  let durationMs;
  if (startTimestamp && endTimestamp) {
    const elapsed = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
    // Unparseable timestamps produce NaN; report "unknown" instead.
    durationMs = Number.isFinite(elapsed) ? elapsed : void 0;
  }

  return {
    messages,
    source: {
      provider: "claude",
      sessionId,
      projectPath,
      startedAt: startTimestamp,
      model
    },
    tokenUsage: hasUsage ? { input: totalInputTokens, output: totalOutputTokens } : void 0,
    durationMs,
    costUsd: null
  };
}
|
|
21935
|
+
/**
 * Record where each identified tool call lives so a later tool result can be
 * attached to it.
 *
 * @param {Array<{ id?: string }>} toolCalls - Tool calls of one assistant message.
 * @param {number} msgIdx - Index of that message in the transcript's message list.
 * @param {Map<string, { msgIdx: number, toolIdx: number }>} pending - Map updated
 *   in place, keyed by tool-call id; calls without an id are not registered.
 */
function registerPendingToolCalls(toolCalls, msgIdx, pending) {
  for (const [toolIdx, call] of toolCalls.entries()) {
    const callId = call.id;
    if (!callId) continue;
    pending.set(callId, { msgIdx, toolIdx });
  }
}
|
|
21943
|
+
/**
 * Collapse message content into plain text.
 *
 * @param {unknown} content - A string, an array of content blocks, or nothing.
 * @returns {string | undefined} The string itself, or the concatenated `text`
 *   of all `type: "text"` blocks. Returns undefined when there is no text, or
 *   when the content is neither a string nor an array.
 */
function extractTextContent2(content) {
  if (content === void 0 || content === null) return void 0;
  if (typeof content === "string") return content;
  // Anything that is not a block array (object, number, ...) carries no text.
  if (!Array.isArray(content)) return void 0;
  const textParts = [];
  for (const block of content) {
    // Skip null/primitive entries instead of throwing on property access.
    if (block && block.type === "text" && block.text) {
      textParts.push(block.text);
    }
  }
  return textParts.length > 0 ? textParts.join("") : void 0;
}
|
|
21954
|
+
/**
 * Split assistant message content into its text and its tool calls.
 *
 * @param {unknown} content - A string, an array of content blocks, or nothing.
 * @returns {{ text: string | undefined, toolCalls: Array<{ tool: string, input: unknown, id: unknown }> }}
 *   `text` is the concatenation of all `type: "text"` blocks (undefined when
 *   there are none). `toolCalls` holds one entry per named `type: "tool_use"`
 *   block, in order. Content that is neither a string nor an array yields no
 *   text and no tool calls.
 */
function extractAssistantContent(content) {
  if (typeof content === "string") {
    return { text: content, toolCalls: [] };
  }
  // Covers undefined/null as well as any non-array value.
  if (!Array.isArray(content)) {
    return { text: void 0, toolCalls: [] };
  }
  const textParts = [];
  const toolCalls = [];
  for (const block of content) {
    // Skip null/primitive entries instead of throwing on property access.
    if (!block) continue;
    if (block.type === "text") {
      if (block.text) textParts.push(block.text);
    } else if (block.type === "tool_use" && block.name) {
      toolCalls.push({
        tool: block.name,
        input: block.input,
        id: block.id
      });
    }
  }
  return {
    text: textParts.length > 0 ? textParts.join("") : void 0,
    toolCalls
  };
}
|
|
21984
|
+
/**
 * Collapse the content of a `tool_result` block into plain text.
 *
 * @param {unknown} content - A string, an array of content blocks, or nothing.
 * @returns {string | undefined} The string itself, or the concatenated `text`
 *   of all `type: "text"` blocks. Returns undefined when there is no text, or
 *   when the content is neither a string nor an array.
 */
function extractToolResultContent(content) {
  if (content === void 0 || content === null) return void 0;
  if (typeof content === "string") return content;
  // Anything that is not a block array carries no extractable text.
  if (!Array.isArray(content)) return void 0;
  const parts = [];
  for (const block of content) {
    // Skip null/primitive entries instead of throwing on property access.
    if (block && block.type === "text" && block.text) {
      parts.push(block.text);
    }
  }
  return parts.length > 0 ? parts.join("") : void 0;
}
|
|
21995
|
+
|
|
21996
|
+
// src/import/session-discovery.ts
|
|
21997
|
+
init_cjs_shims();
|
|
21998
|
+
var import_promises36 = require("fs/promises");
|
|
21999
|
+
var import_node_os8 = require("os");
|
|
22000
|
+
var import_node_path53 = __toESM(require("path"), 1);
|
|
22001
|
+
// Computes, on each call, the ".claude/projects" directory under the current user's home directory.
var DEFAULT_PROJECTS_DIR = () => {
  const homeDirectory = (0, import_node_os8.homedir)();
  return import_node_path53.default.join(homeDirectory, ".claude", "projects");
};
|
|
22002
|
+
/**
 * Flatten a filesystem path into the single-segment name that is compared
 * against the entries of the projects directory.
 *
 * Every `/`, `\` and `:` becomes `-`, so POSIX paths and Windows paths
 * (`C:\Users\me\proj` -> `C--Users-me-proj`) both encode to a usable name.
 * NOTE(review): presumed to mirror how Claude Code names its per-project
 * directories — confirm the Windows form against a real installation.
 *
 * @param {string} projectPath - Absolute project path.
 * @returns {string} The encoded directory name.
 */
function encodeProjectPath(projectPath) {
  return projectPath.replace(/[\\/:]/g, "-");
}
|
|
22005
|
+
/**
 * List Claude session transcripts (`*.jsonl` files) found one level below the
 * projects directory, newest first.
 *
 * @param {object} [opts]
 * @param {string} [opts.projectsDir] - Directory to scan; defaults to DEFAULT_PROJECTS_DIR().
 * @param {string} [opts.projectPath] - Only scan project directories whose name
 *   contains the encoded form of this path.
 * @param {string} [opts.sessionId] - Only return the session with this id (file name without `.jsonl`).
 * @param {boolean} [opts.latest] - Return only the most recently modified session.
 * @param {number} [opts.limit=10] - Maximum number of sessions to return; values
 *   that are not positive yield an empty list.
 * @returns {Promise<Array<{ sessionId: string, filePath: string, projectDir: string, updatedAt: Date }>>}
 *   Sessions sorted by modification time, descending. Resolves to an empty
 *   array when the projects directory cannot be read.
 */
async function discoverClaudeSessions(opts) {
  const projectsDir = opts?.projectsDir ?? DEFAULT_PROJECTS_DIR();
  const requestedLimit = opts?.latest ? 1 : opts?.limit ?? 10;
  // A negative limit would make slice() drop items from the end; treat it as "none".
  const limit = requestedLimit > 0 ? requestedLimit : 0;

  let projectDirs;
  try {
    projectDirs = await (0, import_promises36.readdir)(projectsDir);
  } catch {
    // Missing or unreadable projects directory simply means no sessions.
    return [];
  }
  if (opts?.projectPath) {
    const encoded = encodeProjectPath(opts.projectPath);
    // Substring match (which also covers exact equality).
    projectDirs = projectDirs.filter((dir) => dir.includes(encoded));
  }

  // Directories and files are independent, so read/stat them concurrently.
  // Promise.all keeps input order, so results are collected in scan order.
  const sessionsPerProject = await Promise.all(
    projectDirs.map(async (projectDir) => {
      const dirPath = import_node_path53.default.join(projectsDir, projectDir);
      let entries;
      try {
        entries = await (0, import_promises36.readdir)(dirPath);
      } catch {
        // Not a directory or not readable: contributes no sessions.
        return [];
      }
      const transcripts = entries.filter((entry) => {
        if (!entry.endsWith(".jsonl")) return false;
        return !opts?.sessionId || entry.replace(/\.jsonl$/, "") === opts.sessionId;
      });
      return Promise.all(
        transcripts.map(async (entry) => {
          const filePath = import_node_path53.default.join(dirPath, entry);
          let updatedAt;
          try {
            updatedAt = (await (0, import_promises36.stat)(filePath)).mtime;
          } catch {
            // Keep the session but sort it last when its mtime is unavailable.
            updatedAt = /* @__PURE__ */ new Date(0);
          }
          return {
            sessionId: entry.replace(/\.jsonl$/, ""),
            filePath,
            projectDir,
            updatedAt
          };
        })
      );
    })
  );

  const sessions = sessionsPerProject.flat();
  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
  return sessions.slice(0, limit);
}
|
|
22050
|
+
|
|
22051
|
+
// src/import/types.ts
|
|
22052
|
+
init_cjs_shims();
|
|
22053
|
+
var import_promises37 = require("fs/promises");
|
|
22054
|
+
/**
 * Read a transcript file from disk as UTF-8 text.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string>} The file contents; rejects if the file cannot be read.
 */
async function readTranscriptFile(filePath) {
  const contents = await (0, import_promises37.readFile)(filePath, "utf8");
  return contents;
}
|
|
22057
|
+
|
|
21740
22058
|
// src/index.ts
|
|
21741
22059
|
function createAgentKernel() {
|
|
21742
22060
|
return { status: "stub" };
|
|
@@ -21807,6 +22125,7 @@ function createAgentKernel() {
|
|
|
21807
22125
|
deriveProjectId,
|
|
21808
22126
|
detectFormat,
|
|
21809
22127
|
discoverAssertions,
|
|
22128
|
+
discoverClaudeSessions,
|
|
21810
22129
|
discoverCopilotSessions,
|
|
21811
22130
|
discoverGraders,
|
|
21812
22131
|
discoverJudges,
|
|
@@ -21866,12 +22185,14 @@ function createAgentKernel() {
|
|
|
21866
22185
|
negateScore,
|
|
21867
22186
|
normalizeLineEndings,
|
|
21868
22187
|
parseAgentSkillsEvals,
|
|
22188
|
+
parseClaudeSession,
|
|
21869
22189
|
parseJsonFromText,
|
|
21870
22190
|
parseJsonSafe,
|
|
21871
22191
|
readJsonFile,
|
|
21872
22192
|
readTargetDefinitions,
|
|
21873
22193
|
readTestSuiteMetadata,
|
|
21874
22194
|
readTextFile,
|
|
22195
|
+
readTranscriptFile,
|
|
21875
22196
|
removeProject,
|
|
21876
22197
|
resolveAndCreateProvider,
|
|
21877
22198
|
resolveFileReference,
|