@agentv/core 4.15.9 → 4.16.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HVEQNYTC.js → chunk-6VZY3B6M.js} +55 -165
- package/dist/chunk-6VZY3B6M.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +18 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +13 -12
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +329 -257
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +71 -25
- package/dist/index.d.ts +71 -25
- package/dist/index.js +249 -59
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-HVEQNYTC.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -25,7 +25,7 @@ import {
|
|
|
25
25
|
resolveDelegatedTargetDefinition,
|
|
26
26
|
resolveFileReference,
|
|
27
27
|
resolveTargetDefinition
|
|
28
|
-
} from "./chunk-HVEQNYTC.js";
|
|
28
|
+
} from "./chunk-6VZY3B6M.js";
|
|
29
29
|
import {
|
|
30
30
|
execFileWithStdin,
|
|
31
31
|
execShellWithStdin
|
|
@@ -483,17 +483,76 @@ function extractTargetFromSuite(suite) {
|
|
|
483
483
|
}
|
|
484
484
|
return void 0;
|
|
485
485
|
}
|
|
486
|
-
function extractTargetsFromSuite(suite) {
|
|
486
|
+
function extractTargetRefsFromSuite(suite) {
|
|
487
487
|
const execution = suite.execution;
|
|
488
488
|
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
489
489
|
return void 0;
|
|
490
490
|
}
|
|
491
491
|
const targets = execution.targets;
|
|
492
|
-
if (Array.isArray(targets)) {
|
|
493
|
-
|
|
494
|
-
return valid.length > 0 ? valid.map((t) => t.trim()) : void 0;
|
|
492
|
+
if (!Array.isArray(targets)) {
|
|
493
|
+
return void 0;
|
|
495
494
|
}
|
|
496
|
-
|
|
495
|
+
const refs = [];
|
|
496
|
+
for (const t of targets) {
|
|
497
|
+
if (typeof t === "string" && t.trim().length > 0) {
|
|
498
|
+
refs.push({ name: t.trim() });
|
|
499
|
+
} else if (t && typeof t === "object" && !Array.isArray(t) && "name" in t) {
|
|
500
|
+
const obj = t;
|
|
501
|
+
const name = typeof obj.name === "string" ? obj.name.trim() : "";
|
|
502
|
+
if (name.length === 0) continue;
|
|
503
|
+
const useTarget = typeof obj.use_target === "string" ? obj.use_target.trim() : void 0;
|
|
504
|
+
const hooks = parseTargetHooks(obj.hooks);
|
|
505
|
+
refs.push({
|
|
506
|
+
name,
|
|
507
|
+
...useTarget && { use_target: useTarget },
|
|
508
|
+
...hooks && { hooks }
|
|
509
|
+
});
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
return refs.length > 0 ? refs : void 0;
|
|
513
|
+
}
|
|
514
|
+
function extractTargetsFromSuite(suite) {
|
|
515
|
+
const refs = extractTargetRefsFromSuite(suite);
|
|
516
|
+
if (!refs) return void 0;
|
|
517
|
+
const names = refs.map((r) => r.name);
|
|
518
|
+
return names.length > 0 ? names : void 0;
|
|
519
|
+
}
|
|
520
|
+
function parseHookConfig(raw) {
|
|
521
|
+
if (!raw || typeof raw !== "object") return void 0;
|
|
522
|
+
const obj = raw;
|
|
523
|
+
let command;
|
|
524
|
+
if (typeof obj.command === "string") {
|
|
525
|
+
command = ["sh", "-c", obj.command];
|
|
526
|
+
} else if (Array.isArray(obj.command)) {
|
|
527
|
+
command = obj.command.filter((s) => typeof s === "string");
|
|
528
|
+
} else if (typeof obj.script === "string") {
|
|
529
|
+
command = ["sh", "-c", obj.script];
|
|
530
|
+
} else if (Array.isArray(obj.script)) {
|
|
531
|
+
command = obj.script.filter((s) => typeof s === "string");
|
|
532
|
+
}
|
|
533
|
+
if (!command || command.length === 0) return void 0;
|
|
534
|
+
const timeoutMs = typeof obj.timeout_ms === "number" ? obj.timeout_ms : typeof obj.timeoutMs === "number" ? obj.timeoutMs : void 0;
|
|
535
|
+
const cwd = typeof obj.cwd === "string" ? obj.cwd : void 0;
|
|
536
|
+
return {
|
|
537
|
+
command,
|
|
538
|
+
...timeoutMs !== void 0 && { timeout_ms: timeoutMs },
|
|
539
|
+
...cwd && { cwd }
|
|
540
|
+
};
|
|
541
|
+
}
|
|
542
|
+
function parseTargetHooks(raw) {
|
|
543
|
+
if (!raw || typeof raw !== "object") return void 0;
|
|
544
|
+
const obj = raw;
|
|
545
|
+
const beforeAll = parseHookConfig(obj.before_all);
|
|
546
|
+
const beforeEach = parseHookConfig(obj.before_each);
|
|
547
|
+
const afterEach = parseHookConfig(obj.after_each);
|
|
548
|
+
const afterAll = parseHookConfig(obj.after_all);
|
|
549
|
+
if (!beforeAll && !beforeEach && !afterEach && !afterAll) return void 0;
|
|
550
|
+
return {
|
|
551
|
+
...beforeAll && { before_all: beforeAll },
|
|
552
|
+
...beforeEach && { before_each: beforeEach },
|
|
553
|
+
...afterEach && { after_each: afterEach },
|
|
554
|
+
...afterAll && { after_all: afterAll }
|
|
555
|
+
};
|
|
497
556
|
}
|
|
498
557
|
function extractWorkersFromSuite(suite) {
|
|
499
558
|
const execution = suite.execution;
|
|
@@ -1160,7 +1219,11 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId, defa
|
|
|
1160
1219
|
}
|
|
1161
1220
|
const placeholderIndex = result.indexOf(PLACEHOLDER);
|
|
1162
1221
|
if (strings.length > 0 && placeholderIndex !== -1) {
|
|
1163
|
-
result[placeholderIndex] = {
|
|
1222
|
+
result[placeholderIndex] = {
|
|
1223
|
+
type: "rubrics",
|
|
1224
|
+
criteria: strings,
|
|
1225
|
+
weight: strings.length
|
|
1226
|
+
};
|
|
1164
1227
|
} else if (placeholderIndex !== -1) {
|
|
1165
1228
|
result.splice(placeholderIndex, 1);
|
|
1166
1229
|
}
|
|
@@ -3555,6 +3618,7 @@ async function readTestSuiteMetadata(testFilePath) {
|
|
|
3555
3618
|
return {
|
|
3556
3619
|
target: extractTargetFromSuite(parsed),
|
|
3557
3620
|
targets: extractTargetsFromSuite(parsed),
|
|
3621
|
+
targetRefs: extractTargetRefsFromSuite(parsed),
|
|
3558
3622
|
trials: extractTrialsConfig(parsed)
|
|
3559
3623
|
};
|
|
3560
3624
|
} catch {
|
|
@@ -3581,6 +3645,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
3581
3645
|
tests,
|
|
3582
3646
|
trials: extractTrialsConfig(parsed),
|
|
3583
3647
|
targets: extractTargetsFromSuite(parsed),
|
|
3648
|
+
targetRefs: extractTargetRefsFromSuite(parsed),
|
|
3584
3649
|
workers: extractWorkersFromSuite(parsed),
|
|
3585
3650
|
cacheConfig: extractCacheConfig(parsed),
|
|
3586
3651
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
@@ -5208,7 +5273,7 @@ var ClaudeCliProvider = class {
|
|
|
5208
5273
|
if (options.cwd) {
|
|
5209
5274
|
spawnOptions.cwd = options.cwd;
|
|
5210
5275
|
}
|
|
5211
|
-
const child = spawn(
|
|
5276
|
+
const child = spawn(this.config.executable, options.args, spawnOptions);
|
|
5212
5277
|
let stdout = "";
|
|
5213
5278
|
let stderr = "";
|
|
5214
5279
|
let timedOut = false;
|
|
@@ -5267,7 +5332,7 @@ var ClaudeCliProvider = class {
|
|
|
5267
5332
|
if (err.code === "ENOENT") {
|
|
5268
5333
|
reject(
|
|
5269
5334
|
new Error(
|
|
5270
|
-
`Claude CLI executable '
|
|
5335
|
+
`Claude CLI executable '${this.config.executable}' was not found on PATH. Install claude-code or ensure it is in PATH.`
|
|
5271
5336
|
)
|
|
5272
5337
|
);
|
|
5273
5338
|
} else {
|
|
@@ -9339,6 +9404,9 @@ import { fileURLToPath as fileURLToPath4, pathToFileURL } from "node:url";
|
|
|
9339
9404
|
import os2 from "node:os";
|
|
9340
9405
|
import path22 from "node:path";
|
|
9341
9406
|
var logged = false;
|
|
9407
|
+
function getAgentvConfigDir() {
|
|
9408
|
+
return path22.join(os2.homedir(), ".agentv");
|
|
9409
|
+
}
|
|
9342
9410
|
function getAgentvHome() {
|
|
9343
9411
|
const envHome = process.env.AGENTV_HOME;
|
|
9344
9412
|
if (envHome && envHome !== "undefined") {
|
|
@@ -10010,7 +10078,7 @@ var ProviderRegistry = class {
|
|
|
10010
10078
|
|
|
10011
10079
|
// src/evaluation/providers/vscode-provider.ts
|
|
10012
10080
|
import { exec as exec2 } from "node:child_process";
|
|
10013
|
-
import { constants as constants3, access as access3, stat as stat6 } from "node:fs/promises";
|
|
10081
|
+
import { constants as constants3, access as access3 } from "node:fs/promises";
|
|
10014
10082
|
import path34 from "node:path";
|
|
10015
10083
|
import { promisify as promisify4 } from "node:util";
|
|
10016
10084
|
|
|
@@ -11151,7 +11219,7 @@ var VSCodeProvider = class {
|
|
|
11151
11219
|
await this.ensureEnvironmentReady();
|
|
11152
11220
|
const inputFiles = normalizeAttachments(request.inputFiles);
|
|
11153
11221
|
const promptContent = buildPromptDocument2(request, inputFiles);
|
|
11154
|
-
const workspaceTemplate = request.workspaceFile
|
|
11222
|
+
const workspaceTemplate = request.workspaceFile;
|
|
11155
11223
|
const startTime = Date.now();
|
|
11156
11224
|
const session = await dispatchAgentSession({
|
|
11157
11225
|
userQuery: promptContent,
|
|
@@ -11207,9 +11275,6 @@ var VSCodeProvider = class {
|
|
|
11207
11275
|
const userQueries = normalizedRequests.map(
|
|
11208
11276
|
({ request, inputFiles }) => buildPromptDocument2(request, inputFiles)
|
|
11209
11277
|
);
|
|
11210
|
-
const batchWorkspaceTemplate = await resolveWorkspaceTemplateFile(
|
|
11211
|
-
this.config.workspaceTemplate
|
|
11212
|
-
);
|
|
11213
11278
|
const startTime = Date.now();
|
|
11214
11279
|
const session = await dispatchBatchAgent({
|
|
11215
11280
|
userQueries,
|
|
@@ -11219,7 +11284,7 @@ var VSCodeProvider = class {
|
|
|
11219
11284
|
dryRun: this.config.dryRun,
|
|
11220
11285
|
vscodeCmd: this.config.executable,
|
|
11221
11286
|
subagentRoot: this.config.subagentRoot,
|
|
11222
|
-
workspaceTemplate: batchWorkspaceTemplate,
|
|
11287
|
+
workspaceTemplate: void 0,
|
|
11223
11288
|
silent: true,
|
|
11224
11289
|
timeoutMs: this.config.timeoutMs
|
|
11225
11290
|
});
|
|
@@ -11299,17 +11364,6 @@ async function locateVSCodeExecutable(candidate) {
|
|
|
11299
11364
|
`VS Code executable '${candidate}' was not found on PATH. Check the 'executable' setting in your target configuration.`
|
|
11300
11365
|
);
|
|
11301
11366
|
}
|
|
11302
|
-
async function resolveWorkspaceTemplateFile(template) {
|
|
11303
|
-
if (!template) {
|
|
11304
|
-
return void 0;
|
|
11305
|
-
}
|
|
11306
|
-
try {
|
|
11307
|
-
const stats = await stat6(path34.resolve(template));
|
|
11308
|
-
return stats.isFile() ? template : void 0;
|
|
11309
|
-
} catch {
|
|
11310
|
-
return template;
|
|
11311
|
-
}
|
|
11312
|
-
}
|
|
11313
11367
|
function buildPromptDocument2(request, attachments) {
|
|
11314
11368
|
const parts = [];
|
|
11315
11369
|
if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
|
|
@@ -12507,7 +12561,7 @@ ${context.fileChanges}`;
|
|
|
12507
12561
|
const workspacePath = context.workspacePath;
|
|
12508
12562
|
if (!workspacePath) {
|
|
12509
12563
|
throw new Error(
|
|
12510
|
-
"llm-grader built-in agent mode requires a
|
|
12564
|
+
"llm-grader built-in agent mode requires a workspace (workspacePath is not set)"
|
|
12511
12565
|
);
|
|
12512
12566
|
}
|
|
12513
12567
|
const systemPrompt = this.buildAgentSystemPrompt(context);
|
|
@@ -13246,11 +13300,11 @@ function createFilesystemTools(workspacePath) {
|
|
|
13246
13300
|
execute: async (input) => {
|
|
13247
13301
|
try {
|
|
13248
13302
|
const resolved = resolveSandboxed(workspacePath, input.path);
|
|
13249
|
-
const
|
|
13250
|
-
if (
|
|
13303
|
+
const stat12 = await fs2.stat(resolved);
|
|
13304
|
+
if (stat12.isDirectory()) {
|
|
13251
13305
|
return { error: `'${input.path}' is a directory, not a file` };
|
|
13252
13306
|
}
|
|
13253
|
-
const buffer = Buffer.alloc(Math.min(
|
|
13307
|
+
const buffer = Buffer.alloc(Math.min(stat12.size, MAX_FILE_SIZE));
|
|
13254
13308
|
const fd = await fs2.open(resolved, "r");
|
|
13255
13309
|
try {
|
|
13256
13310
|
await fd.read(buffer, 0, buffer.length, 0);
|
|
@@ -13258,8 +13312,8 @@ function createFilesystemTools(workspacePath) {
|
|
|
13258
13312
|
await fd.close();
|
|
13259
13313
|
}
|
|
13260
13314
|
const content = buffer.toString("utf-8");
|
|
13261
|
-
const truncated =
|
|
13262
|
-
return { content, truncated, size:
|
|
13315
|
+
const truncated = stat12.size > MAX_FILE_SIZE;
|
|
13316
|
+
return { content, truncated, size: stat12.size };
|
|
13263
13317
|
} catch (error) {
|
|
13264
13318
|
return { error: error instanceof Error ? error.message : String(error) };
|
|
13265
13319
|
}
|
|
@@ -13310,8 +13364,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
|
|
|
13310
13364
|
const ext = path37.extname(entry.name).toLowerCase();
|
|
13311
13365
|
if (BINARY_EXTENSIONS.has(ext)) continue;
|
|
13312
13366
|
try {
|
|
13313
|
-
const
|
|
13314
|
-
if (
|
|
13367
|
+
const stat12 = await fs2.stat(fullPath);
|
|
13368
|
+
if (stat12.size > MAX_FILE_SIZE) continue;
|
|
13315
13369
|
const content = await fs2.readFile(fullPath, "utf-8");
|
|
13316
13370
|
const lines = content.split("\n");
|
|
13317
13371
|
for (let i = 0; i < lines.length; i++) {
|
|
@@ -15205,7 +15259,7 @@ function runEqualsAssertion(output, value) {
|
|
|
15205
15259
|
import { execFile as execFile3 } from "node:child_process";
|
|
15206
15260
|
import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
|
|
15207
15261
|
import { existsSync as existsSync5 } from "node:fs";
|
|
15208
|
-
import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as
|
|
15262
|
+
import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat8 } from "node:fs/promises";
|
|
15209
15263
|
import path45 from "node:path";
|
|
15210
15264
|
import { promisify as promisify7 } from "node:util";
|
|
15211
15265
|
import micromatch3 from "micromatch";
|
|
@@ -15983,7 +16037,7 @@ function getTCritical(df) {
|
|
|
15983
16037
|
}
|
|
15984
16038
|
|
|
15985
16039
|
// src/evaluation/workspace/manager.ts
|
|
15986
|
-
import { cp, mkdir as mkdir12, readdir as readdir5, rm as rm4, stat as
|
|
16040
|
+
import { cp, mkdir as mkdir12, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
|
|
15987
16041
|
import path41 from "node:path";
|
|
15988
16042
|
var TemplateNotFoundError = class extends Error {
|
|
15989
16043
|
constructor(templatePath) {
|
|
@@ -16006,7 +16060,7 @@ var WorkspaceCreationError = class extends Error {
|
|
|
16006
16060
|
};
|
|
16007
16061
|
async function isDirectory(filePath) {
|
|
16008
16062
|
try {
|
|
16009
|
-
const stats = await
|
|
16063
|
+
const stats = await stat6(filePath);
|
|
16010
16064
|
return stats.isDirectory();
|
|
16011
16065
|
} catch {
|
|
16012
16066
|
return false;
|
|
@@ -16582,14 +16636,14 @@ ${lines.join("\n")}`;
|
|
|
16582
16636
|
};
|
|
16583
16637
|
|
|
16584
16638
|
// src/evaluation/workspace/resolve.ts
|
|
16585
|
-
import { readdir as readdir7, stat as
|
|
16639
|
+
import { readdir as readdir7, stat as stat7 } from "node:fs/promises";
|
|
16586
16640
|
import path44 from "node:path";
|
|
16587
16641
|
async function resolveWorkspaceTemplate(templatePath) {
|
|
16588
16642
|
if (!templatePath) {
|
|
16589
16643
|
return void 0;
|
|
16590
16644
|
}
|
|
16591
16645
|
const resolved = path44.resolve(templatePath);
|
|
16592
|
-
const stats = await
|
|
16646
|
+
const stats = await stat7(resolved);
|
|
16593
16647
|
if (stats.isFile()) {
|
|
16594
16648
|
return {
|
|
16595
16649
|
dir: path44.dirname(resolved),
|
|
@@ -16729,13 +16783,6 @@ async function resetWorkspaceRoot(workspacePath, resetMode, baselineRef) {
|
|
|
16729
16783
|
await execFileAsync3("git", ["clean", cleanFlag], opts);
|
|
16730
16784
|
return true;
|
|
16731
16785
|
}
|
|
16732
|
-
function getWorkspaceTemplate(target) {
|
|
16733
|
-
const config = target.config;
|
|
16734
|
-
if ("workspaceTemplate" in config && typeof config.workspaceTemplate === "string") {
|
|
16735
|
-
return config.workspaceTemplate;
|
|
16736
|
-
}
|
|
16737
|
-
return void 0;
|
|
16738
|
-
}
|
|
16739
16786
|
function validateDependencyGraph(tests) {
|
|
16740
16787
|
const ids = /* @__PURE__ */ new Set();
|
|
16741
16788
|
for (const test of tests) {
|
|
@@ -17009,7 +17056,7 @@ async function runEvaluation(options) {
|
|
|
17009
17056
|
}
|
|
17010
17057
|
}
|
|
17011
17058
|
const suiteWorkspace = filteredEvalCases[0]?.workspace;
|
|
17012
|
-
const rawTemplate = suiteWorkspace?.template
|
|
17059
|
+
const rawTemplate = suiteWorkspace?.template;
|
|
17013
17060
|
const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
|
|
17014
17061
|
const workspaceTemplate = resolvedTemplate?.dir;
|
|
17015
17062
|
let suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
|
|
@@ -17102,7 +17149,7 @@ async function runEvaluation(options) {
|
|
|
17102
17149
|
let staticMaterialised = false;
|
|
17103
17150
|
const isYamlConfiguredPath = !cliWorkspacePath && !!yamlWorkspacePath;
|
|
17104
17151
|
if (useStaticWorkspace && configuredStaticPath) {
|
|
17105
|
-
const dirExists = await
|
|
17152
|
+
const dirExists = await stat8(configuredStaticPath).then(
|
|
17106
17153
|
(s) => s.isDirectory(),
|
|
17107
17154
|
() => false
|
|
17108
17155
|
);
|
|
@@ -17192,7 +17239,7 @@ async function runEvaluation(options) {
|
|
|
17192
17239
|
if (suiteWorkspaceFile && sharedWorkspacePath) {
|
|
17193
17240
|
const copiedWorkspaceFile = path45.join(sharedWorkspacePath, path45.basename(suiteWorkspaceFile));
|
|
17194
17241
|
try {
|
|
17195
|
-
await
|
|
17242
|
+
await stat8(copiedWorkspaceFile);
|
|
17196
17243
|
suiteWorkspaceFile = copiedWorkspaceFile;
|
|
17197
17244
|
} catch {
|
|
17198
17245
|
}
|
|
@@ -17297,6 +17344,54 @@ async function runEvaluation(options) {
|
|
|
17297
17344
|
}
|
|
17298
17345
|
}
|
|
17299
17346
|
}
|
|
17347
|
+
const targetHooks = options.targetHooks;
|
|
17348
|
+
const targetBeforeAllHook = targetHooks?.before_all;
|
|
17349
|
+
if (sharedWorkspacePath && hasHookCommand(targetBeforeAllHook)) {
|
|
17350
|
+
const beforeAllCommand = (targetBeforeAllHook.command ?? []).join(" ");
|
|
17351
|
+
setupLog(`running target before_all command=${beforeAllCommand}`);
|
|
17352
|
+
const scriptContext = {
|
|
17353
|
+
workspacePath: sharedWorkspacePath,
|
|
17354
|
+
testId: "__target_before_all__",
|
|
17355
|
+
evalRunId,
|
|
17356
|
+
evalDir,
|
|
17357
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
17358
|
+
};
|
|
17359
|
+
try {
|
|
17360
|
+
await executeWorkspaceScript(
|
|
17361
|
+
toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
|
|
17362
|
+
scriptContext
|
|
17363
|
+
);
|
|
17364
|
+
setupLog("target before_all completed");
|
|
17365
|
+
} catch (error) {
|
|
17366
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
17367
|
+
if (sharedWorkspacePath && !useStaticWorkspace) {
|
|
17368
|
+
await cleanupWorkspace(sharedWorkspacePath).catch(() => {
|
|
17369
|
+
});
|
|
17370
|
+
}
|
|
17371
|
+
throw new Error(`target before_all hook failed: ${message}`);
|
|
17372
|
+
}
|
|
17373
|
+
}
|
|
17374
|
+
if (availablePoolSlots.length > 0 && hasHookCommand(targetBeforeAllHook)) {
|
|
17375
|
+
for (const slot of availablePoolSlots) {
|
|
17376
|
+
setupLog(`running target before_all on pool slot ${slot.index}`);
|
|
17377
|
+
const scriptContext = {
|
|
17378
|
+
workspacePath: slot.path,
|
|
17379
|
+
testId: "__target_before_all__",
|
|
17380
|
+
evalRunId,
|
|
17381
|
+
evalDir,
|
|
17382
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
17383
|
+
};
|
|
17384
|
+
try {
|
|
17385
|
+
await executeWorkspaceScript(
|
|
17386
|
+
toScriptConfig(targetBeforeAllHook, "before_all", "target hooks"),
|
|
17387
|
+
scriptContext
|
|
17388
|
+
);
|
|
17389
|
+
} catch (error) {
|
|
17390
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
17391
|
+
throw new Error(`target before_all hook failed on pool slot ${slot.index}: ${message}`);
|
|
17392
|
+
}
|
|
17393
|
+
}
|
|
17394
|
+
}
|
|
17300
17395
|
if (sharedWorkspacePath) {
|
|
17301
17396
|
try {
|
|
17302
17397
|
sharedBaselineCommit = await initializeBaseline(sharedWorkspacePath);
|
|
@@ -17442,6 +17537,7 @@ async function runEvaluation(options) {
|
|
|
17442
17537
|
evalDir,
|
|
17443
17538
|
verbose,
|
|
17444
17539
|
threshold: scoreThreshold,
|
|
17540
|
+
targetHooks: options.targetHooks,
|
|
17445
17541
|
...depResults && Object.keys(depResults).length > 0 ? { dependencyResults: depResults } : {}
|
|
17446
17542
|
};
|
|
17447
17543
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
@@ -17583,6 +17679,26 @@ async function runEvaluation(options) {
|
|
|
17583
17679
|
}
|
|
17584
17680
|
}
|
|
17585
17681
|
const afterAllWorkspaces = poolSlots.length > 1 ? poolSlots.map((s) => s.path) : sharedWorkspacePath ? [sharedWorkspacePath] : [];
|
|
17682
|
+
const targetAfterAllHook = targetHooks?.after_all;
|
|
17683
|
+
if (afterAllWorkspaces.length > 0 && hasHookCommand(targetAfterAllHook)) {
|
|
17684
|
+
for (const wsPath of afterAllWorkspaces) {
|
|
17685
|
+
const scriptContext = {
|
|
17686
|
+
workspacePath: wsPath,
|
|
17687
|
+
testId: "__target_after_all__",
|
|
17688
|
+
evalRunId,
|
|
17689
|
+
evalDir,
|
|
17690
|
+
workspaceFileDir: suiteWorkspace?.workspaceFileDir
|
|
17691
|
+
};
|
|
17692
|
+
try {
|
|
17693
|
+
await executeWorkspaceScript(
|
|
17694
|
+
toScriptConfig(targetAfterAllHook, "after_all", "target hooks"),
|
|
17695
|
+
scriptContext,
|
|
17696
|
+
"warn"
|
|
17697
|
+
);
|
|
17698
|
+
} catch {
|
|
17699
|
+
}
|
|
17700
|
+
}
|
|
17701
|
+
}
|
|
17586
17702
|
const suiteAfterAllHook = suiteWorkspace?.hooks?.after_all;
|
|
17587
17703
|
if (afterAllWorkspaces.length > 0 && suiteHooksEnabled && hasHookCommand(suiteAfterAllHook)) {
|
|
17588
17704
|
const afterAllHook = suiteAfterAllHook;
|
|
@@ -17845,7 +17961,7 @@ async function runEvalCase(options) {
|
|
|
17845
17961
|
let caseWorkspaceFile;
|
|
17846
17962
|
const caseHooksEnabled = hooksEnabled(evalCase.workspace);
|
|
17847
17963
|
if (!workspacePath) {
|
|
17848
|
-
const rawCaseTemplate = evalCase.workspace?.template
|
|
17964
|
+
const rawCaseTemplate = evalCase.workspace?.template;
|
|
17849
17965
|
const resolvedCaseTemplate = await resolveWorkspaceTemplate(rawCaseTemplate);
|
|
17850
17966
|
const caseWorkspaceTemplate = resolvedCaseTemplate?.dir;
|
|
17851
17967
|
caseWorkspaceFile = resolvedCaseTemplate?.workspaceFile;
|
|
@@ -17869,7 +17985,7 @@ async function runEvalCase(options) {
|
|
|
17869
17985
|
if (caseWorkspaceFile && workspacePath) {
|
|
17870
17986
|
const copiedFile = path45.join(workspacePath, path45.basename(caseWorkspaceFile));
|
|
17871
17987
|
try {
|
|
17872
|
-
await
|
|
17988
|
+
await stat8(copiedFile);
|
|
17873
17989
|
caseWorkspaceFile = copiedFile;
|
|
17874
17990
|
} catch {
|
|
17875
17991
|
}
|
|
@@ -18063,6 +18179,38 @@ async function runEvalCase(options) {
|
|
|
18063
18179
|
);
|
|
18064
18180
|
}
|
|
18065
18181
|
}
|
|
18182
|
+
const targetBeforeEachHook = options.targetHooks?.before_each;
|
|
18183
|
+
if (workspacePath && hasHookCommand(targetBeforeEachHook)) {
|
|
18184
|
+
const scriptContext = {
|
|
18185
|
+
workspacePath,
|
|
18186
|
+
testId: evalCase.id,
|
|
18187
|
+
evalRunId: evalRunId ?? "",
|
|
18188
|
+
caseInput: evalCase.question,
|
|
18189
|
+
caseMetadata: evalCase.metadata,
|
|
18190
|
+
evalDir,
|
|
18191
|
+
workspaceFileDir: evalCase.workspace?.workspaceFileDir
|
|
18192
|
+
};
|
|
18193
|
+
try {
|
|
18194
|
+
await executeWorkspaceScript(
|
|
18195
|
+
toScriptConfig(targetBeforeEachHook, "before_each", `target hook for '${evalCase.id}'`),
|
|
18196
|
+
scriptContext
|
|
18197
|
+
);
|
|
18198
|
+
beforeEachNeedsFreshBaseline = true;
|
|
18199
|
+
} catch (error) {
|
|
18200
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
18201
|
+
return buildErrorResult(
|
|
18202
|
+
evalCase,
|
|
18203
|
+
target.name,
|
|
18204
|
+
nowFn(),
|
|
18205
|
+
new Error(`target before_each hook failed: ${message}`),
|
|
18206
|
+
promptInputs,
|
|
18207
|
+
provider,
|
|
18208
|
+
"setup",
|
|
18209
|
+
"script_error",
|
|
18210
|
+
verbose
|
|
18211
|
+
);
|
|
18212
|
+
}
|
|
18213
|
+
}
|
|
18066
18214
|
let baselineCommit = beforeEachNeedsFreshBaseline ? void 0 : sharedBaselineCommit;
|
|
18067
18215
|
if (!baselineCommit && workspacePath) {
|
|
18068
18216
|
try {
|
|
@@ -18217,6 +18365,26 @@ async function runEvalCase(options) {
|
|
|
18217
18365
|
${providerFileChanges}` : providerFileChanges;
|
|
18218
18366
|
}
|
|
18219
18367
|
const providerError = extractProviderError(providerResponse);
|
|
18368
|
+
const targetAfterEachHook = options.targetHooks?.after_each;
|
|
18369
|
+
if (workspacePath && hasHookCommand(targetAfterEachHook)) {
|
|
18370
|
+
const scriptContext = {
|
|
18371
|
+
workspacePath,
|
|
18372
|
+
testId: evalCase.id,
|
|
18373
|
+
evalRunId: evalRunId ?? "",
|
|
18374
|
+
caseInput: evalCase.question,
|
|
18375
|
+
caseMetadata: evalCase.metadata,
|
|
18376
|
+
evalDir,
|
|
18377
|
+
workspaceFileDir: evalCase.workspace?.workspaceFileDir
|
|
18378
|
+
};
|
|
18379
|
+
try {
|
|
18380
|
+
await executeWorkspaceScript(
|
|
18381
|
+
toScriptConfig(targetAfterEachHook, "after_each", `target hook for '${evalCase.id}'`),
|
|
18382
|
+
scriptContext,
|
|
18383
|
+
"warn"
|
|
18384
|
+
);
|
|
18385
|
+
} catch {
|
|
18386
|
+
}
|
|
18387
|
+
}
|
|
18220
18388
|
if (caseHooksEnabled && workspacePath && evalCase.workspace?.hooks?.after_each?.reset && evalCase.workspace.hooks.after_each.reset !== "none") {
|
|
18221
18389
|
try {
|
|
18222
18390
|
if (repoManager && evalCase.workspace.repos?.length) {
|
|
@@ -19838,7 +20006,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
19838
20006
|
// src/evaluation/results-repo.ts
|
|
19839
20007
|
import { execFile as execFile4 } from "node:child_process";
|
|
19840
20008
|
import { existsSync as existsSync7, mkdirSync as mkdirSync2, readFileSync as readFileSync3, rmSync, writeFileSync } from "node:fs";
|
|
19841
|
-
import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as
|
|
20009
|
+
import { cp as cp3, mkdtemp as mkdtemp3, readdir as readdir9, rm as rm6, stat as stat9 } from "node:fs/promises";
|
|
19842
20010
|
import os3 from "node:os";
|
|
19843
20011
|
import path49 from "node:path";
|
|
19844
20012
|
import { promisify as promisify8 } from "node:util";
|
|
@@ -20061,7 +20229,7 @@ function resolveResultsRepoRunsDir(config) {
|
|
|
20061
20229
|
);
|
|
20062
20230
|
}
|
|
20063
20231
|
async function directorySizeBytes(targetPath) {
|
|
20064
|
-
const entry = await
|
|
20232
|
+
const entry = await stat9(targetPath);
|
|
20065
20233
|
if (entry.isFile()) {
|
|
20066
20234
|
return entry.size;
|
|
20067
20235
|
}
|
|
@@ -20117,14 +20285,34 @@ async function createDraftResultsPr(params) {
|
|
|
20117
20285
|
}
|
|
20118
20286
|
|
|
20119
20287
|
// src/benchmarks.ts
|
|
20120
|
-
import {
|
|
20288
|
+
import {
|
|
20289
|
+
copyFileSync,
|
|
20290
|
+
existsSync as existsSync8,
|
|
20291
|
+
mkdirSync as mkdirSync3,
|
|
20292
|
+
readFileSync as readFileSync4,
|
|
20293
|
+
readdirSync as readdirSync3,
|
|
20294
|
+
statSync as statSync2,
|
|
20295
|
+
writeFileSync as writeFileSync2
|
|
20296
|
+
} from "node:fs";
|
|
20121
20297
|
import path50 from "node:path";
|
|
20122
20298
|
import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
|
|
20123
20299
|
function getBenchmarksRegistryPath() {
|
|
20124
|
-
return path50.join(getAgentvHome(), "projects.yaml");
|
|
20300
|
+
return path50.join(getAgentvConfigDir(), "projects.yaml");
|
|
20301
|
+
}
|
|
20302
|
+
function migrateProjectsYaml(targetPath) {
|
|
20303
|
+
const dataHome = getAgentvHome();
|
|
20304
|
+
const configDir = getAgentvConfigDir();
|
|
20305
|
+
if (dataHome === configDir) return;
|
|
20306
|
+
const legacyPath = path50.join(dataHome, "projects.yaml");
|
|
20307
|
+
if (!existsSync8(legacyPath)) return;
|
|
20308
|
+
mkdirSync3(path50.dirname(targetPath), { recursive: true });
|
|
20309
|
+
copyFileSync(legacyPath, targetPath);
|
|
20125
20310
|
}
|
|
20126
20311
|
function loadBenchmarkRegistry() {
|
|
20127
20312
|
const registryPath = getBenchmarksRegistryPath();
|
|
20313
|
+
if (!existsSync8(registryPath)) {
|
|
20314
|
+
migrateProjectsYaml(registryPath);
|
|
20315
|
+
}
|
|
20128
20316
|
if (!existsSync8(registryPath)) {
|
|
20129
20317
|
return { benchmarks: [] };
|
|
20130
20318
|
}
|
|
@@ -21143,7 +21331,7 @@ function extractResponseItemContent(content) {
|
|
|
21143
21331
|
}
|
|
21144
21332
|
|
|
21145
21333
|
// src/import/codex-session-discovery.ts
|
|
21146
|
-
import { readdir as readdir10, stat as
|
|
21334
|
+
import { readdir as readdir10, stat as stat10 } from "node:fs/promises";
|
|
21147
21335
|
import { homedir as homedir5 } from "node:os";
|
|
21148
21336
|
import path51 from "node:path";
|
|
21149
21337
|
var DEFAULT_SESSIONS_DIR = () => path51.join(homedir5(), ".codex", "sessions");
|
|
@@ -21193,7 +21381,7 @@ async function discoverCodexSessions(opts) {
|
|
|
21193
21381
|
const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
|
|
21194
21382
|
let updatedAt;
|
|
21195
21383
|
try {
|
|
21196
|
-
const fileStat = await
|
|
21384
|
+
const fileStat = await stat10(filePath);
|
|
21197
21385
|
updatedAt = fileStat.mtime;
|
|
21198
21386
|
} catch {
|
|
21199
21387
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -21208,7 +21396,7 @@ async function discoverCodexSessions(opts) {
|
|
|
21208
21396
|
}
|
|
21209
21397
|
|
|
21210
21398
|
// src/import/session-discovery.ts
|
|
21211
|
-
import { readdir as readdir11, stat as
|
|
21399
|
+
import { readdir as readdir11, stat as stat11 } from "node:fs/promises";
|
|
21212
21400
|
import { homedir as homedir6 } from "node:os";
|
|
21213
21401
|
import path52 from "node:path";
|
|
21214
21402
|
var DEFAULT_PROJECTS_DIR = () => path52.join(homedir6(), ".claude", "projects");
|
|
@@ -21244,7 +21432,7 @@ async function discoverClaudeSessions(opts) {
|
|
|
21244
21432
|
const filePath = path52.join(dirPath, entry);
|
|
21245
21433
|
let updatedAt;
|
|
21246
21434
|
try {
|
|
21247
|
-
const fileStat = await
|
|
21435
|
+
const fileStat = await stat11(filePath);
|
|
21248
21436
|
updatedAt = fileStat.mtime;
|
|
21249
21437
|
} catch {
|
|
21250
21438
|
updatedAt = /* @__PURE__ */ new Date(0);
|
|
@@ -21500,6 +21688,7 @@ export {
|
|
|
21500
21688
|
extractJsonBlob,
|
|
21501
21689
|
extractLastAssistantContent,
|
|
21502
21690
|
extractTargetFromSuite,
|
|
21691
|
+
extractTargetRefsFromSuite,
|
|
21503
21692
|
extractTargetsFromSuite,
|
|
21504
21693
|
extractTargetsFromTestCase,
|
|
21505
21694
|
extractThreshold,
|
|
@@ -21509,6 +21698,7 @@ export {
|
|
|
21509
21698
|
findGitRoot,
|
|
21510
21699
|
freeformEvaluationSchema,
|
|
21511
21700
|
generateRubrics,
|
|
21701
|
+
getAgentvConfigDir,
|
|
21512
21702
|
getAgentvHome,
|
|
21513
21703
|
getBenchmark,
|
|
21514
21704
|
getBenchmarksRegistryPath,
|