@forwardimpact/libeval 0.1.36 → 0.1.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +32 -7
- package/bin/fit-eval.js +24 -3
- package/bin/fit-trace.js +42 -0
- package/package.json +2 -1
- package/src/benchmark/apm-installer.js +78 -16
- package/src/benchmark/env-loader.js +146 -0
- package/src/benchmark/judge.js +4 -3
- package/src/benchmark/report.js +43 -17
- package/src/benchmark/result.js +9 -3
- package/src/benchmark/runner.js +164 -117
- package/src/benchmark/scorer.js +5 -5
- package/src/benchmark/task-family.js +43 -50
- package/src/benchmark/workdir.js +21 -8
- package/src/commands/assert.js +145 -0
- package/src/commands/benchmark-report.js +1 -2
- package/src/commands/benchmark-run.js +11 -4
- package/src/commands/facilitate.js +4 -2
- package/src/commands/run.js +3 -3
- package/src/commands/supervise.js +5 -2
- package/src/facilitator.js +7 -3
- package/src/supervisor.js +42 -12
package/src/benchmark/workdir.js
CHANGED
|
@@ -11,9 +11,10 @@ import { spawn } from "node:child_process";
|
|
|
11
11
|
import { cp, mkdir } from "node:fs/promises";
|
|
12
12
|
import { createServer } from "node:net";
|
|
13
13
|
import { connect } from "node:net";
|
|
14
|
-
import { join
|
|
14
|
+
import { join } from "node:path";
|
|
15
|
+
|
|
16
|
+
import { loadEnv } from "./env-loader.js";
|
|
15
17
|
|
|
16
|
-
const PREFLIGHT_REL = join("workdir", "scripts");
|
|
17
18
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
18
19
|
|
|
19
20
|
/**
|
|
@@ -24,7 +25,9 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
|
24
25
|
* @property {number} pgid - Process-group id captured from the preflight child.
|
|
25
26
|
* @property {*} scaffold - Reserved per design § Components; v1 sets null.
|
|
26
27
|
* @property {string} agentTracePath
|
|
28
|
+
* @property {string} supervisorTracePath
|
|
27
29
|
* @property {string} judgeTracePath
|
|
30
|
+
* @property {string[]} [envNames] - Env var names loaded from .env files.
|
|
28
31
|
* @property {{phase: string, message: string, exitCode: number}} [preflightError]
|
|
29
32
|
*/
|
|
30
33
|
|
|
@@ -35,12 +38,13 @@ export class WorkdirManager {
|
|
|
35
38
|
* @param {string} deps.stagingDir - Output of `installApm(...)`.
|
|
36
39
|
* @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
|
|
37
40
|
*/
|
|
38
|
-
constructor({ stagingDir, runOutputDir, termGraceMs }) {
|
|
41
|
+
constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
|
|
39
42
|
if (!stagingDir) throw new Error("stagingDir is required");
|
|
40
43
|
if (!runOutputDir) throw new Error("runOutputDir is required");
|
|
41
44
|
this.stagingDir = stagingDir;
|
|
42
45
|
this.runOutputDir = runOutputDir;
|
|
43
46
|
this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
|
|
47
|
+
this.familyRootPath = familyRootPath ?? null;
|
|
44
48
|
}
|
|
45
49
|
|
|
46
50
|
/**
|
|
@@ -55,9 +59,8 @@ export class WorkdirManager {
|
|
|
55
59
|
const cwd = join(runDir, "cwd");
|
|
56
60
|
await mkdir(cwd, { recursive: true });
|
|
57
61
|
|
|
58
|
-
await cp(task.paths.workdir, cwd, {
|
|
59
|
-
|
|
60
|
-
filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
|
|
62
|
+
await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
|
|
63
|
+
if (e.code !== "ENOENT") throw e;
|
|
61
64
|
});
|
|
62
65
|
await cp(task.paths.specs, join(cwd, "specs"), {
|
|
63
66
|
recursive: true,
|
|
@@ -68,12 +71,20 @@ export class WorkdirManager {
|
|
|
68
71
|
recursive: true,
|
|
69
72
|
});
|
|
70
73
|
|
|
74
|
+
const envDirs = [
|
|
75
|
+
...(this.familyRootPath ? [this.familyRootPath] : []),
|
|
76
|
+
...(task.paths.taskDir ? [task.paths.taskDir] : []),
|
|
77
|
+
];
|
|
78
|
+
const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
|
|
79
|
+
|
|
71
80
|
const port = await allocatePort();
|
|
72
81
|
const agentTracePath = join(runDir, "agent.ndjson");
|
|
82
|
+
const supervisorTracePath = join(runDir, "supervisor.ndjson");
|
|
73
83
|
const judgeTracePath = join(runDir, "judge.ndjson");
|
|
74
84
|
|
|
75
|
-
const
|
|
76
|
-
|
|
85
|
+
const preflight = task.paths.preflight
|
|
86
|
+
? await runPreflight(task.paths.preflight, cwd, port)
|
|
87
|
+
: { pgid: 0 };
|
|
77
88
|
|
|
78
89
|
return {
|
|
79
90
|
cwd,
|
|
@@ -82,7 +93,9 @@ export class WorkdirManager {
|
|
|
82
93
|
pgid: preflight.pgid,
|
|
83
94
|
scaffold: null,
|
|
84
95
|
agentTracePath,
|
|
96
|
+
supervisorTracePath,
|
|
85
97
|
judgeTracePath,
|
|
98
|
+
envNames,
|
|
86
99
|
...(preflight.error && { preflightError: preflight.error }),
|
|
87
100
|
};
|
|
88
101
|
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
2
|
+
import { basename } from "node:path";
|
|
3
|
+
import jmespath from "jmespath";
|
|
4
|
+
|
|
5
|
+
/**
 * Evaluate a single assertion and return the structured result.
 *
 * Exactly one mode must be selected: `grep`, `query`, `exists`, or
 * `cites-job`. A mode counts as selected when its value is neither
 * `undefined` nor `false`. Dispatch uses that same rule, so an empty-string
 * value (e.g. `--grep ""`) is routed to its own mode instead of silently
 * falling through to the `--query` branch.
 *
 * @param {object} values - { grep?: string, query?: string, exists?: boolean, "cites-job"?: string, not?: boolean, message?: string }
 * @param {string[]} args - [testName, file]
 * @returns {{ test: string, pass: boolean, message?: string }}
 * @throws {Error} When the test name or the file argument is missing, or when
 *   zero or more than one mode is selected.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
export function evaluateAssertion(values, args) {
  const testName = args[0];
  if (!testName) throw new Error("assert: missing test name");

  const file = args[1];

  // Single source of truth for "was this mode given on the command line".
  const isSelected = (value) => value !== undefined && value !== false;
  const selectedCount = [
    values.grep,
    values.query,
    values.exists,
    values["cites-job"],
  ].filter(isSelected).length;
  if (selectedCount === 0) {
    throw new Error(
      "assert: specify one of --grep, --query, --exists, or --cites-job",
    );
  }
  if (selectedCount > 1) {
    throw new Error(
      "assert: specify only one of --grep, --query, --exists, or --cites-job",
    );
  }

  let result;
  if (isSelected(values.exists)) {
    if (!file) throw new Error("assert: missing file argument");
    result = assertExists(file);
  } else if (isSelected(values.grep)) {
    if (!file) throw new Error("assert: missing file argument for --grep");
    result = assertGrep(values.grep, file);
  } else if (isSelected(values["cites-job"])) {
    if (!file) throw new Error("assert: missing file argument for --cites-job");
    result = assertCitesJob(values["cites-job"], file);
  } else {
    if (!file) throw new Error("assert: missing file argument for --query");
    result = assertQuery(values.query, file);
  }

  let { pass, message } = result;

  if (values.not) {
    // Invert the outcome; a now-passing assertion carries no message, a
    // now-failing one gets a generic message because the underlying check
    // passed and therefore produced none.
    pass = !pass;
    message = pass
      ? undefined
      : (message ?? `inverted assertion failed for ${basename(file)}`);
  }

  // A caller-supplied message replaces the generated one, on failure only.
  if (!pass && values.message) {
    message = values.message;
  }

  const output = { test: testName, pass };
  if (message) output.message = message;
  return output;
}
|
|
67
|
+
|
|
68
|
+
/**
 * CLI entry point for the assert command: evaluate the assertion, print the
 * result as one JSON line on stdout, and mark the process as failed (exit
 * code 1) when the assertion did not pass.
 * @param {object} values - Parsed option values, forwarded to evaluateAssertion.
 * @param {string[]} args - Positional arguments, forwarded to evaluateAssertion.
 */
export async function runAssertCommand(values, args) {
  const outcome = evaluateAssertion(values, args);
  const line = JSON.stringify(outcome);
  process.stdout.write(line + "\n");
  if (outcome.pass) return;
  process.exitCode = 1;
}
|
|
78
|
+
|
|
79
|
+
// Passes when `file` exists on disk; otherwise fails with a message that
// names the path exactly as it was given.
function assertExists(file) {
  return existsSync(file)
    ? { pass: true }
    : { pass: false, message: `${file} not found` };
}
|
|
83
|
+
|
|
84
|
+
// Passes when the regular expression `pattern` (case-insensitive, multiline)
// matches anywhere in the UTF-8 contents of `file`.
function assertGrep(pattern, file) {
  const text = readFileSync(file, "utf8");
  const found = new RegExp(pattern, "im").test(text);
  if (!found) {
    const message = `pattern "${pattern}" not found in ${basename(file)}`;
    return { pass: false, message };
  }
  return { pass: true };
}
|
|
93
|
+
|
|
94
|
+
// Runs the JMESPath `expression` against the parsed contents of `file`.
// The assertion fails when the query yields null, undefined, false, or an
// empty array; every other result counts as a pass.
function assertQuery(expression, file) {
  const parsed = parseJsonOrNdjson(readFileSync(file, "utf8"));
  const found = jmespath.search(parsed, expression);

  const isEmptyArray = Array.isArray(found) && found.length === 0;
  const failed =
    found === null || found === undefined || found === false || isEmptyArray;

  if (!failed) return { pass: true };
  return { pass: false, message: `query returned ${JSON.stringify(found)}` };
}
|
|
109
|
+
|
|
110
|
+
// Matches the first `<job user="..." goal="...">` opening tag, capturing the
// user and goal attribute values.
const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;

// Passes when `file` contains the literal citation "<user>: <goal>" taken
// from the first <job> tag in `jobFile`. Fails without reading `file` when
// `jobFile` has no such tag.
function assertCitesJob(jobFile, file) {
  const tag = readFileSync(jobFile, "utf8").match(JOB_TAG_RE);
  if (tag === null) {
    const message = `no <job> tag found in ${basename(jobFile)}`;
    return { pass: false, message };
  }

  const [, user, goal] = tag;
  const citation = `${user}: ${goal}`;
  const cited = readFileSync(file, "utf8").includes(citation);
  return cited
    ? { pass: true }
    : { pass: false, message: `missing "${citation}"` };
}
|
|
126
|
+
|
|
127
|
+
// Parses `content` as a single JSON document when possible. Otherwise treats
// it as NDJSON: each non-blank line is parsed on its own, unparseable lines
// are skipped, and the parsed values are returned as an array. Throws when
// not a single line yields valid JSON.
function parseJsonOrNdjson(content) {
  const attempt = (text) => {
    try {
      return { ok: true, value: JSON.parse(text) };
    } catch {
      return { ok: false };
    }
  };

  const whole = attempt(content);
  if (whole.ok) return whole.value;

  const records = content
    .split("\n")
    .map((raw) => raw.trim())
    .filter((line) => line !== "")
    .map(attempt)
    .filter((parsed) => parsed.ok)
    .map((parsed) => parsed.value);

  if (records.length === 0) throw new Error("assert: no valid JSON in file");
  return records;
}
|
|
@@ -13,8 +13,7 @@ import { aggregate, renderTextReport } from "../benchmark/report.js";
|
|
|
13
13
|
* @param {string[]} _args
|
|
14
14
|
*/
|
|
15
15
|
export async function runBenchmarkReportCommand(values, _args) {
|
|
16
|
-
const inputDir = values.input;
|
|
17
|
-
if (!inputDir) throw new Error("--input is required");
|
|
16
|
+
const inputDir = values.input ?? "benchmark-runs";
|
|
18
17
|
const kRaw = values.k ?? "1,3,5";
|
|
19
18
|
const kValues = kRaw.split(",").map((t) => {
|
|
20
19
|
const n = Number.parseInt(t.trim(), 10);
|
|
@@ -31,21 +31,28 @@ export async function runBenchmarkRunCommand(values, _args) {
|
|
|
31
31
|
function parseRunOptions(values) {
|
|
32
32
|
const family = values.family;
|
|
33
33
|
if (!family) throw new Error("--family is required");
|
|
34
|
-
const output = values.output;
|
|
35
|
-
|
|
36
|
-
const runs = Number.parseInt(values.runs ?? "1", 10);
|
|
34
|
+
const output = values.output ?? "benchmark-runs";
|
|
35
|
+
const runs = Number.parseInt(values.runs ?? "5", 10);
|
|
37
36
|
if (!Number.isFinite(runs) || runs < 1)
|
|
38
37
|
throw new Error("--runs must be a positive integer");
|
|
39
38
|
return {
|
|
40
39
|
family,
|
|
41
40
|
runs,
|
|
42
41
|
output: resolve(output),
|
|
43
|
-
|
|
42
|
+
agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
|
|
43
|
+
supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
|
|
44
|
+
judgeModel: values["judge-model"] ?? "claude-opus-4-7",
|
|
44
45
|
profiles: {
|
|
45
46
|
agent: values["agent-profile"] ?? null,
|
|
46
47
|
judge: values["judge-profile"] ?? null,
|
|
47
48
|
},
|
|
48
49
|
maxTurns: parseMaxTurns(values["max-turns"]),
|
|
50
|
+
allowedTools: values["allowed-tools"]
|
|
51
|
+
? values["allowed-tools"]
|
|
52
|
+
.split(",")
|
|
53
|
+
.map((s) => s.trim())
|
|
54
|
+
.filter(Boolean)
|
|
55
|
+
: undefined,
|
|
49
56
|
};
|
|
50
57
|
}
|
|
51
58
|
|
|
@@ -45,7 +45,8 @@ function parseFacilitateOptions(values) {
|
|
|
45
45
|
taskAmend,
|
|
46
46
|
agentConfigs,
|
|
47
47
|
facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
|
|
48
|
-
|
|
48
|
+
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
49
|
+
facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
|
|
49
50
|
maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
|
|
50
51
|
outputPath: values.output,
|
|
51
52
|
facilitatorProfile: values["facilitator-profile"] ?? undefined,
|
|
@@ -89,7 +90,8 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
89
90
|
agentConfigs: opts.agentConfigs,
|
|
90
91
|
query,
|
|
91
92
|
output,
|
|
92
|
-
|
|
93
|
+
agentModel: opts.agentModel,
|
|
94
|
+
facilitatorModel: opts.facilitatorModel,
|
|
93
95
|
maxTurns: opts.maxTurns,
|
|
94
96
|
facilitatorProfile: opts.facilitatorProfile,
|
|
95
97
|
taskAmend: opts.taskAmend,
|
package/src/commands/run.js
CHANGED
|
@@ -29,7 +29,7 @@ function parseRunOptions(values) {
|
|
|
29
29
|
taskContent,
|
|
30
30
|
taskAmend,
|
|
31
31
|
cwd: resolve(values.cwd ?? "."),
|
|
32
|
-
|
|
32
|
+
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
33
33
|
maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
|
|
34
34
|
outputPath: values.output,
|
|
35
35
|
agentProfile: values["agent-profile"] ?? undefined,
|
|
@@ -54,7 +54,7 @@ export async function runRunCommand(values, _args) {
|
|
|
54
54
|
taskContent,
|
|
55
55
|
taskAmend,
|
|
56
56
|
cwd,
|
|
57
|
-
|
|
57
|
+
agentModel,
|
|
58
58
|
maxTurns,
|
|
59
59
|
outputPath,
|
|
60
60
|
agentProfile,
|
|
@@ -114,7 +114,7 @@ export async function runRunCommand(values, _args) {
|
|
|
114
114
|
cwd,
|
|
115
115
|
query,
|
|
116
116
|
output: devNull,
|
|
117
|
-
model,
|
|
117
|
+
model: agentModel,
|
|
118
118
|
maxTurns,
|
|
119
119
|
allowedTools,
|
|
120
120
|
onLine,
|
|
@@ -11,6 +11,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
|
11
11
|
* @param {object} values - Parsed option values from cli.parse()
|
|
12
12
|
* @returns {object}
|
|
13
13
|
*/
|
|
14
|
+
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
|
|
14
15
|
function parseSuperviseOptions(values) {
|
|
15
16
|
const taskFile = values["task-file"];
|
|
16
17
|
const taskText = values["task-text"];
|
|
@@ -31,7 +32,8 @@ function parseSuperviseOptions(values) {
|
|
|
31
32
|
agentCwd: resolve(
|
|
32
33
|
values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
33
34
|
),
|
|
34
|
-
|
|
35
|
+
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
36
|
+
supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
|
|
35
37
|
maxTurns: (() => {
|
|
36
38
|
const raw = values["max-turns"] ?? "20";
|
|
37
39
|
return raw === "0" ? 0 : parseInt(raw, 10);
|
|
@@ -102,7 +104,8 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
102
104
|
agentCwd: opts.agentCwd,
|
|
103
105
|
query,
|
|
104
106
|
output,
|
|
105
|
-
|
|
107
|
+
agentModel: opts.agentModel,
|
|
108
|
+
supervisorModel: opts.supervisorModel,
|
|
106
109
|
maxTurns: opts.maxTurns,
|
|
107
110
|
allowedTools: opts.allowedTools,
|
|
108
111
|
supervisorAllowedTools: opts.supervisorAllowedTools,
|
package/src/facilitator.js
CHANGED
|
@@ -390,7 +390,9 @@ const devNull = new Writable({
|
|
|
390
390
|
* @param {Array<{name: string, role: string, cwd?: string, maxTurns?: number, allowedTools?: string[], agentProfile?: string, systemPromptAmend?: string}>} deps.agentConfigs
|
|
391
391
|
* @param {function} deps.query
|
|
392
392
|
* @param {import("stream").Writable} deps.output
|
|
393
|
-
* @param {string} [deps.model]
|
|
393
|
+
* @param {string} [deps.model] - Default model for all participants.
|
|
394
|
+
* @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
|
|
395
|
+
* @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
|
|
394
396
|
* @param {number} [deps.maxTurns]
|
|
395
397
|
* @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
|
|
396
398
|
* @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
|
|
@@ -403,6 +405,8 @@ export function createFacilitator({
|
|
|
403
405
|
query,
|
|
404
406
|
output,
|
|
405
407
|
model,
|
|
408
|
+
agentModel,
|
|
409
|
+
facilitatorModel,
|
|
406
410
|
maxTurns,
|
|
407
411
|
facilitatorProfile,
|
|
408
412
|
profilesDir,
|
|
@@ -450,7 +454,7 @@ export function createFacilitator({
|
|
|
450
454
|
cwd: config.cwd ?? facilitatorCwd,
|
|
451
455
|
query,
|
|
452
456
|
output: devNull,
|
|
453
|
-
model,
|
|
457
|
+
model: agentModel ?? model,
|
|
454
458
|
maxTurns: config.maxTurns ?? 50,
|
|
455
459
|
allowedTools: config.allowedTools,
|
|
456
460
|
onLine: (line) => facilitator.emitLine(config.name, line),
|
|
@@ -467,7 +471,7 @@ export function createFacilitator({
|
|
|
467
471
|
cwd: facilitatorCwd,
|
|
468
472
|
query,
|
|
469
473
|
output: devNull,
|
|
470
|
-
model,
|
|
474
|
+
model: facilitatorModel ?? model,
|
|
471
475
|
maxTurns: maxTurns ?? 20,
|
|
472
476
|
onLine: (line) => facilitator.emitLine("facilitator", line),
|
|
473
477
|
mcpServers: { orchestration: facilitatorServer },
|
package/src/supervisor.js
CHANGED
|
@@ -100,17 +100,18 @@ export class Supervisor {
|
|
|
100
100
|
/**
|
|
101
101
|
* Run the supervisor ↔ agent relay loop.
|
|
102
102
|
* @param {string} task - The initial task for the supervisor
|
|
103
|
-
* @returns {Promise<{success: boolean, turns: number}>}
|
|
103
|
+
* @returns {Promise<{success: boolean, turns: number, concluded: boolean}>}
|
|
104
104
|
*/
|
|
105
105
|
async run(task) {
|
|
106
106
|
const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
|
|
107
|
+
this.taskContext = initialTask;
|
|
107
108
|
this.currentSource = "supervisor";
|
|
108
109
|
this.currentTurn = 0;
|
|
109
110
|
let supervisorResult = await this.supervisorRunner.run(initialTask);
|
|
110
111
|
|
|
111
112
|
if (supervisorResult.error) {
|
|
112
113
|
this.emitSummary({ success: false, turns: 0 });
|
|
113
|
-
return { success: false, turns: 0 };
|
|
114
|
+
return { success: false, turns: 0, concluded: false };
|
|
114
115
|
}
|
|
115
116
|
|
|
116
117
|
if (this.ctx.concluded) {
|
|
@@ -121,7 +122,7 @@ export class Supervisor {
|
|
|
121
122
|
turns: 0,
|
|
122
123
|
summary: this.ctx.summary,
|
|
123
124
|
});
|
|
124
|
-
return { success, turns: 0 };
|
|
125
|
+
return { success, turns: 0, concluded: true };
|
|
125
126
|
}
|
|
126
127
|
|
|
127
128
|
let pendingRelay = null;
|
|
@@ -131,16 +132,20 @@ export class Supervisor {
|
|
|
131
132
|
pendingRelay ?? this.#buildInitialRelay(supervisorResult.text);
|
|
132
133
|
|
|
133
134
|
const turnOutcome = await this.#runAgentTurn(turn, relay);
|
|
134
|
-
if (turnOutcome.exit)
|
|
135
|
+
if (turnOutcome.exit) {
|
|
136
|
+
return { ...turnOutcome.exit, concluded: this.ctx.concluded };
|
|
137
|
+
}
|
|
135
138
|
|
|
136
139
|
const reviewOutcome = await this.#endOfTurnReview(turn);
|
|
137
|
-
if (reviewOutcome.exit)
|
|
140
|
+
if (reviewOutcome.exit) {
|
|
141
|
+
return { ...reviewOutcome.exit, concluded: this.ctx.concluded };
|
|
142
|
+
}
|
|
138
143
|
supervisorResult = reviewOutcome.supervisorResult;
|
|
139
144
|
pendingRelay = reviewOutcome.relay ?? null;
|
|
140
145
|
}
|
|
141
146
|
|
|
142
147
|
this.emitSummary({ success: false, turns: this.maxTurns });
|
|
143
|
-
return { success: false, turns: this.maxTurns };
|
|
148
|
+
return { success: false, turns: this.maxTurns, concluded: false };
|
|
144
149
|
}
|
|
145
150
|
|
|
146
151
|
#buildInitialRelay(fallbackText) {
|
|
@@ -247,6 +252,22 @@ export class Supervisor {
|
|
|
247
252
|
return { type: "continue" };
|
|
248
253
|
}
|
|
249
254
|
|
|
255
|
+
/**
|
|
256
|
+
* Resume the supervisor runner, falling back to a fresh session when the
|
|
257
|
+
* SDK reports that the conversation no longer exists (e.g. session GC'd
|
|
258
|
+
* while the agent was running). The fresh session includes the original
|
|
259
|
+
* task context so the supervisor can still evaluate the agent's work.
|
|
260
|
+
* @param {string} prompt
|
|
261
|
+
* @returns {Promise<object>}
|
|
262
|
+
*/
|
|
263
|
+
async #resumeSupervisor(prompt) {
|
|
264
|
+
const result = await this.supervisorRunner.resume(prompt);
|
|
265
|
+
if (result.error && isSessionNotFound(result.error)) {
|
|
266
|
+
return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
|
|
267
|
+
}
|
|
268
|
+
return result;
|
|
269
|
+
}
|
|
270
|
+
|
|
250
271
|
/**
|
|
251
272
|
* If the agent has an unanswered ask, drain reminders and return a
|
|
252
273
|
* formatted relay string. Returns null when no relay is needed.
|
|
@@ -274,7 +295,7 @@ export class Supervisor {
|
|
|
274
295
|
this.currentSource = "supervisor";
|
|
275
296
|
this.ctx.redirect = null;
|
|
276
297
|
|
|
277
|
-
await this
|
|
298
|
+
await this.#resumeSupervisor(
|
|
278
299
|
`The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
|
|
279
300
|
`Review and use your tools if action is needed.`,
|
|
280
301
|
);
|
|
@@ -312,7 +333,7 @@ export class Supervisor {
|
|
|
312
333
|
`Review and decide how to proceed.`
|
|
313
334
|
: `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
|
|
314
335
|
|
|
315
|
-
let supervisorResult = await this
|
|
336
|
+
let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
|
|
316
337
|
|
|
317
338
|
if (supervisorResult.error) {
|
|
318
339
|
this.emitSummary({ success: false, turns: turn });
|
|
@@ -333,7 +354,7 @@ export class Supervisor {
|
|
|
333
354
|
if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
|
|
334
355
|
const reminders = this.messageBus.drain("supervisor");
|
|
335
356
|
if (reminders.length > 0) {
|
|
336
|
-
supervisorResult = await this
|
|
357
|
+
supervisorResult = await this.#resumeSupervisor(
|
|
337
358
|
formatMessages(reminders),
|
|
338
359
|
);
|
|
339
360
|
if (this.ctx.concluded) {
|
|
@@ -478,7 +499,9 @@ const devNull = new Writable({
|
|
|
478
499
|
* @param {string} deps.agentCwd
|
|
479
500
|
* @param {function} deps.query
|
|
480
501
|
* @param {import("stream").Writable} deps.output
|
|
481
|
-
* @param {string} [deps.model]
|
|
502
|
+
* @param {string} [deps.model] - Default model for both runners.
|
|
503
|
+
* @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
|
|
504
|
+
* @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
|
|
482
505
|
* @param {number} [deps.maxTurns]
|
|
483
506
|
* @param {string[]} [deps.allowedTools]
|
|
484
507
|
* @param {string[]} [deps.supervisorAllowedTools]
|
|
@@ -496,6 +519,8 @@ export function createSupervisor({
|
|
|
496
519
|
query,
|
|
497
520
|
output,
|
|
498
521
|
model,
|
|
522
|
+
agentModel,
|
|
523
|
+
supervisorModel,
|
|
499
524
|
maxTurns,
|
|
500
525
|
allowedTools,
|
|
501
526
|
supervisorDisallowedTools,
|
|
@@ -543,7 +568,7 @@ export function createSupervisor({
|
|
|
543
568
|
cwd: agentCwd,
|
|
544
569
|
query,
|
|
545
570
|
output: devNull,
|
|
546
|
-
model,
|
|
571
|
+
model: agentModel ?? model,
|
|
547
572
|
maxTurns: perInvocationTurns,
|
|
548
573
|
allowedTools,
|
|
549
574
|
onLine,
|
|
@@ -562,7 +587,7 @@ export function createSupervisor({
|
|
|
562
587
|
cwd: supervisorCwd,
|
|
563
588
|
query,
|
|
564
589
|
output: devNull,
|
|
565
|
-
model,
|
|
590
|
+
model: supervisorModel ?? model,
|
|
566
591
|
maxTurns: perInvocationTurns,
|
|
567
592
|
allowedTools: supervisorAllowedTools ?? [
|
|
568
593
|
"Bash",
|
|
@@ -592,3 +617,8 @@ export function createSupervisor({
|
|
|
592
617
|
});
|
|
593
618
|
return supervisor;
|
|
594
619
|
}
|
|
620
|
+
|
|
621
|
+
// True when `error` (an Error-like object or any other value) carries the
// "No conversation found with session ID" marker in its message, or in its
// string form when it has no message.
function isSessionNotFound(error) {
  const marker = "No conversation found with session ID";
  const detail = error?.message;
  const text =
    detail === undefined || detail === null ? String(error) : detail;
  return text.includes(marker);
}
|