@forwardimpact/libeval 0.1.48 → 0.1.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -5
- package/bin/fit-benchmark.js +1 -1
- package/bin/fit-eval.js +27 -48
- package/package.json +2 -2
- package/src/agent-runner.js +3 -1
- package/src/benchmark/npm-installer.js +87 -0
- package/src/benchmark/runner.js +6 -0
- package/src/benchmark/workdir.js +5 -0
- package/src/commands/benchmark-run.js +8 -0
- package/src/commands/discuss.js +3 -11
- package/src/commands/facilitate.js +3 -10
- package/src/commands/run.js +3 -10
- package/src/commands/supervise.js +3 -12
- package/src/commands/task-input.js +49 -0
- package/src/discuss-tools.js +30 -10
- package/src/discusser.js +5 -4
- package/src/events/github.js +133 -0
- package/src/facilitator.js +6 -5
- package/src/index.js +10 -0
- package/src/orchestration-loop.js +5 -1
- package/src/orchestration-toolkit.js +35 -2
- package/src/render/tool-hints.js +2 -1
- package/src/render/turn-renderer.js +1 -2
- package/src/supervisor.js +5 -4
- package/src/tee-writer.js +8 -8
- package/src/trace-collector.js +2 -2
package/README.md
CHANGED
|
@@ -176,11 +176,9 @@ downloadable through retention.
|
|
|
176
176
|
## fit-selfedit
|
|
177
177
|
|
|
178
178
|
A narrow, audited bypass for sessions where `Edit`/`Write` (and bash
|
|
179
|
-
writes) are blocked against paths the project's own allowlist permits
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
original episodes. Reads stdin, writes the target, exits 0 / 2
|
|
183
|
-
(safeguard violation) / 1 (I/O error).
|
|
179
|
+
writes) are blocked against paths the project's own allowlist permits.
|
|
180
|
+
Reads stdin, writes the target, exits 0 / 2 (safeguard violation) / 1
|
|
181
|
+
(I/O error).
|
|
184
182
|
|
|
185
183
|
```sh
|
|
186
184
|
echo "<content>" | bunx fit-selfedit <path>
|
package/bin/fit-benchmark.js
CHANGED
|
@@ -134,7 +134,7 @@ export const definition = {
|
|
|
134
134
|
"fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
|
|
135
135
|
"fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
|
|
136
136
|
"fit-benchmark report --format=text",
|
|
137
|
-
"fit-benchmark report --input=./runs/
|
|
137
|
+
"fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
|
|
138
138
|
],
|
|
139
139
|
documentation: [
|
|
140
140
|
{
|
package/bin/fit-eval.js
CHANGED
|
@@ -34,6 +34,29 @@ const LEAD_OPTIONS = {
|
|
|
34
34
|
},
|
|
35
35
|
};
|
|
36
36
|
|
|
37
|
+
// Shared task-input flags: --task-file (path), --task-text (inline), and
|
|
38
|
+
// --task-event (path to native GitHub event JSON composed into a task via
|
|
39
|
+
// libeval/src/events/github.js). Exactly one of the three is required.
|
|
40
|
+
const TASK_INPUT_OPTIONS = {
|
|
41
|
+
"task-file": {
|
|
42
|
+
type: "string",
|
|
43
|
+
description: "Path to a markdown task file",
|
|
44
|
+
},
|
|
45
|
+
"task-text": {
|
|
46
|
+
type: "string",
|
|
47
|
+
description: "Inline task text (alternative to --task-file)",
|
|
48
|
+
},
|
|
49
|
+
"task-event": {
|
|
50
|
+
type: "string",
|
|
51
|
+
description:
|
|
52
|
+
"Path to a native GitHub event payload JSON, composed into the task via libeval/src/events/github.js (reads $GITHUB_EVENT_NAME)",
|
|
53
|
+
},
|
|
54
|
+
"task-amend": {
|
|
55
|
+
type: "string",
|
|
56
|
+
description: "Additional text appended to the task",
|
|
57
|
+
},
|
|
58
|
+
};
|
|
59
|
+
|
|
37
60
|
const definition = {
|
|
38
61
|
name: "fit-eval",
|
|
39
62
|
version: VERSION,
|
|
@@ -45,18 +68,7 @@ const definition = {
|
|
|
45
68
|
args: "",
|
|
46
69
|
description: "Run a single agent autonomously on a defined task",
|
|
47
70
|
options: {
|
|
48
|
-
|
|
49
|
-
type: "string",
|
|
50
|
-
description: "Path to a markdown task file",
|
|
51
|
-
},
|
|
52
|
-
"task-text": {
|
|
53
|
-
type: "string",
|
|
54
|
-
description: "Inline task text (alternative to --task-file)",
|
|
55
|
-
},
|
|
56
|
-
"task-amend": {
|
|
57
|
-
type: "string",
|
|
58
|
-
description: "Additional text appended to the task",
|
|
59
|
-
},
|
|
71
|
+
...TASK_INPUT_OPTIONS,
|
|
60
72
|
"agent-model": {
|
|
61
73
|
type: "string",
|
|
62
74
|
description:
|
|
@@ -92,18 +104,7 @@ const definition = {
|
|
|
92
104
|
description:
|
|
93
105
|
"Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
|
|
94
106
|
options: {
|
|
95
|
-
|
|
96
|
-
type: "string",
|
|
97
|
-
description: "Path to a markdown task file",
|
|
98
|
-
},
|
|
99
|
-
"task-text": {
|
|
100
|
-
type: "string",
|
|
101
|
-
description: "Inline task text (alternative to --task-file)",
|
|
102
|
-
},
|
|
103
|
-
"task-amend": {
|
|
104
|
-
type: "string",
|
|
105
|
-
description: "Additional text appended to the task",
|
|
106
|
-
},
|
|
107
|
+
...TASK_INPUT_OPTIONS,
|
|
107
108
|
"agent-model": {
|
|
108
109
|
type: "string",
|
|
109
110
|
description:
|
|
@@ -146,18 +147,7 @@ const definition = {
|
|
|
146
147
|
description:
|
|
147
148
|
"Run a facilitator with N participants — typical shape for multi-agent collaboration",
|
|
148
149
|
options: {
|
|
149
|
-
|
|
150
|
-
type: "string",
|
|
151
|
-
description: "Path to a markdown task file",
|
|
152
|
-
},
|
|
153
|
-
"task-text": {
|
|
154
|
-
type: "string",
|
|
155
|
-
description: "Inline task text (alternative to --task-file)",
|
|
156
|
-
},
|
|
157
|
-
"task-amend": {
|
|
158
|
-
type: "string",
|
|
159
|
-
description: "Additional text appended to the task",
|
|
160
|
-
},
|
|
150
|
+
...TASK_INPUT_OPTIONS,
|
|
161
151
|
"agent-model": {
|
|
162
152
|
type: "string",
|
|
163
153
|
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
@@ -192,18 +182,7 @@ const definition = {
|
|
|
192
182
|
description:
|
|
193
183
|
"Run an async, suspendable discussion — Chair + N participants + bridge callback",
|
|
194
184
|
options: {
|
|
195
|
-
|
|
196
|
-
type: "string",
|
|
197
|
-
description: "Path to a markdown task file",
|
|
198
|
-
},
|
|
199
|
-
"task-text": {
|
|
200
|
-
type: "string",
|
|
201
|
-
description: "Inline task text (alternative to --task-file)",
|
|
202
|
-
},
|
|
203
|
-
"task-amend": {
|
|
204
|
-
type: "string",
|
|
205
|
-
description: "Additional text appended to the task",
|
|
206
|
-
},
|
|
185
|
+
...TASK_INPUT_OPTIONS,
|
|
207
186
|
"agent-model": {
|
|
208
187
|
type: "string",
|
|
209
188
|
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.48",
|
|
3
|
+
"version": "0.1.50",
|
|
4
4
|
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
@@ -62,7 +62,7 @@
|
|
|
62
62
|
"zod": "^4.4.3"
|
|
63
63
|
},
|
|
64
64
|
"devDependencies": {
|
|
65
|
-
"@forwardimpact/
|
|
65
|
+
"@forwardimpact/libmock": "^0.1.0"
|
|
66
66
|
},
|
|
67
67
|
"engines": {
|
|
68
68
|
"bun": ">=1.2.0",
|
package/src/agent-runner.js
CHANGED
|
@@ -62,7 +62,9 @@ export class AgentRunner {
|
|
|
62
62
|
const abortController = new AbortController();
|
|
63
63
|
this.currentAbortController = abortController;
|
|
64
64
|
const effectiveTask = this.taskAmend
|
|
65
|
-
?
|
|
65
|
+
? task
|
|
66
|
+
? `${task}\n\n${this.taskAmend}`
|
|
67
|
+
: this.taskAmend
|
|
66
68
|
: task;
|
|
67
69
|
try {
|
|
68
70
|
const iterator = this.query({
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NpmInstaller — runs `bun install` in the family root when a package.json
|
|
3
|
+
* is present, then copies the resulting `node_modules/` into the staging
|
|
4
|
+
* directory so WorkdirManager can seed each per-task CWD.
|
|
5
|
+
*
|
|
6
|
+
* Symmetric to ApmInstaller: constructor injection of `spawn` for testability,
|
|
7
|
+
* factory function, and a free-function shorthand.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { spawn as nodeSpawn } from "node:child_process";
|
|
11
|
+
import { access, cp } from "node:fs/promises";
|
|
12
|
+
import { join } from "node:path";
|
|
13
|
+
|
|
14
|
+
/** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
|
|
15
|
+
export class NpmInstaller {
|
|
16
|
+
/**
|
|
17
|
+
* @param {object} [deps]
|
|
18
|
+
* @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
|
|
19
|
+
* `node:child_process` spawn). Tests inject a fake to avoid shelling out.
|
|
20
|
+
*/
|
|
21
|
+
constructor({ spawn } = {}) {
|
|
22
|
+
this.spawn = spawn ?? nodeSpawn;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* @param {import("./task-family.js").TaskFamily} family
|
|
27
|
+
* @param {string} stagingDir - The staging directory (created by ApmInstaller).
|
|
28
|
+
* @returns {Promise<void>}
|
|
29
|
+
*/
|
|
30
|
+
async install(family, stagingDir) {
|
|
31
|
+
const pkgJson = join(family.rootPath, "package.json");
|
|
32
|
+
const hasPkg = await access(pkgJson)
|
|
33
|
+
.then(() => true)
|
|
34
|
+
.catch(() => false);
|
|
35
|
+
if (!hasPkg) return;
|
|
36
|
+
|
|
37
|
+
await this.#runBunInstall(family.rootPath);
|
|
38
|
+
|
|
39
|
+
const sourceModules = join(family.rootPath, "node_modules");
|
|
40
|
+
try {
|
|
41
|
+
await access(sourceModules);
|
|
42
|
+
} catch {
|
|
43
|
+
throw new Error(
|
|
44
|
+
`bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
|
|
45
|
+
);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
await cp(sourceModules, join(stagingDir, "node_modules"), {
|
|
49
|
+
recursive: true,
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
#runBunInstall(cwd) {
|
|
54
|
+
return new Promise((res, rej) => {
|
|
55
|
+
const child = this.spawn("bun", ["install"], {
|
|
56
|
+
cwd,
|
|
57
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
58
|
+
});
|
|
59
|
+
let stderr = "";
|
|
60
|
+
child.stdout.on("data", () => {});
|
|
61
|
+
child.stderr.on("data", (d) => {
|
|
62
|
+
stderr += d.toString();
|
|
63
|
+
});
|
|
64
|
+
child.on("error", (e) => {
|
|
65
|
+
rej(new Error(`failed to spawn bun: ${e.message}`));
|
|
66
|
+
});
|
|
67
|
+
child.on("close", (code) => {
|
|
68
|
+
if (code === 0) res();
|
|
69
|
+
else rej(new Error(`bun install exited ${code}: ${stderr}`));
|
|
70
|
+
});
|
|
71
|
+
});
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/** Factory function — wires real dependencies. */
|
|
76
|
+
export function createNpmInstaller(deps) {
|
|
77
|
+
return new NpmInstaller(deps);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* Free-function shorthand for callers that don't need to inject a spawn seam.
|
|
82
|
+
* @param {import("./task-family.js").TaskFamily} family
|
|
83
|
+
* @param {string} stagingDir
|
|
84
|
+
*/
|
|
85
|
+
export function installNpm(family, stagingDir) {
|
|
86
|
+
return new NpmInstaller().install(family, stagingDir);
|
|
87
|
+
}
|
package/src/benchmark/runner.js
CHANGED
|
@@ -22,6 +22,7 @@ import { join, resolve as resolvePath } from "node:path";
|
|
|
22
22
|
import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
|
|
23
23
|
import { createSupervisor } from "../supervisor.js";
|
|
24
24
|
import { installApm as defaultInstallApm } from "./apm-installer.js";
|
|
25
|
+
import { installNpm as defaultInstallNpm } from "./npm-installer.js";
|
|
25
26
|
import { runJudge } from "./judge.js";
|
|
26
27
|
import { validateResultRecord } from "./result.js";
|
|
27
28
|
import { runScoring } from "./scorer.js";
|
|
@@ -68,6 +69,8 @@ export class BenchmarkRunner {
|
|
|
68
69
|
* Same contract as `installApm(family, outputDir)`. Lets tests inject a
|
|
69
70
|
* fake `apm` spawn (or skip the install entirely) so the suite never
|
|
70
71
|
* shells out to a real `apm` binary. Internal testing only.
|
|
72
|
+
* @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
|
|
73
|
+
* Same contract as `installNpm(family, stagingDir)`. Internal testing only.
|
|
71
74
|
*/
|
|
72
75
|
constructor({
|
|
73
76
|
family,
|
|
@@ -86,6 +89,7 @@ export class BenchmarkRunner {
|
|
|
86
89
|
runScoring: runScoringHook,
|
|
87
90
|
runJudge: runJudgeHook,
|
|
88
91
|
installApm: installApmHook,
|
|
92
|
+
installNpm: installNpmHook,
|
|
89
93
|
}) {
|
|
90
94
|
if (!family) throw new Error("family is required");
|
|
91
95
|
if (!Number.isInteger(runs) || runs < 1)
|
|
@@ -111,6 +115,7 @@ export class BenchmarkRunner {
|
|
|
111
115
|
this._runScoringHook = runScoringHook ?? runScoring;
|
|
112
116
|
this._runJudgeHook = runJudgeHook ?? runJudge;
|
|
113
117
|
this._installApmHook = installApmHook ?? defaultInstallApm;
|
|
118
|
+
this._installNpmHook = installNpmHook ?? defaultInstallNpm;
|
|
114
119
|
}
|
|
115
120
|
|
|
116
121
|
/**
|
|
@@ -126,6 +131,7 @@ export class BenchmarkRunner {
|
|
|
126
131
|
await mkdir(this.output, { recursive: true });
|
|
127
132
|
const { stagingDir, skillSetHash, judgeProfilesDir } =
|
|
128
133
|
await this._installApmHook(family, this.output);
|
|
134
|
+
await this._installNpmHook(family, stagingDir);
|
|
129
135
|
|
|
130
136
|
const tasks = family.tasks();
|
|
131
137
|
if (this.profiles.judge) {
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -70,6 +70,11 @@ export class WorkdirManager {
|
|
|
70
70
|
await cp(join(this.stagingDir, ".claude"), join(cwd, ".claude"), {
|
|
71
71
|
recursive: true,
|
|
72
72
|
});
|
|
73
|
+
await cp(join(this.stagingDir, "node_modules"), join(cwd, "node_modules"), {
|
|
74
|
+
recursive: true,
|
|
75
|
+
}).catch((e) => {
|
|
76
|
+
if (e.code !== "ENOENT") throw e;
|
|
77
|
+
});
|
|
73
78
|
|
|
74
79
|
const envDirs = [
|
|
75
80
|
...(this.familyRootPath ? [this.familyRootPath] : []),
|
|
@@ -17,6 +17,14 @@ export async function runBenchmarkRunCommand(values, _args) {
|
|
|
17
17
|
const opts = parseRunOptions(values);
|
|
18
18
|
const config = await createConfig("script", "benchmark");
|
|
19
19
|
process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
|
|
20
|
+
|
|
21
|
+
// The Claude Agent SDK spawns a `claude` subprocess that inherits
|
|
22
|
+
// process.env. NODE_EXTRA_CA_CERTS causes undici (the HTTP client
|
|
23
|
+
// inside that subprocess) to fail with UND_ERR_INVALID_ARG on
|
|
24
|
+
// Node 22+, aborting every API call after 10 retries. Strip it
|
|
25
|
+
// before the SDK loads so the subprocess gets a clean environment.
|
|
26
|
+
delete process.env.NODE_EXTRA_CA_CERTS;
|
|
27
|
+
|
|
20
28
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
21
29
|
const runner = createBenchmarkRunner({ ...opts, query });
|
|
22
30
|
|
package/src/commands/discuss.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { resolve } from "node:path";
|
|
3
3
|
import { createDiscusser } from "../discusser.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
6
7
|
|
|
7
8
|
function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
8
9
|
if (!raw) return [];
|
|
@@ -18,17 +19,8 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
|
18
19
|
* @param {object} values - Parsed option values
|
|
19
20
|
* @returns {object}
|
|
20
21
|
*/
|
|
21
|
-
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
|
|
22
22
|
export function parseDiscussOptions(values) {
|
|
23
|
-
const taskFile = values["task-file"];
|
|
24
|
-
const taskText = values["task-text"];
|
|
25
|
-
if (taskFile && taskText)
|
|
26
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
27
|
-
if (!taskFile && !taskText)
|
|
28
|
-
throw new Error("--task-file or --task-text is required");
|
|
29
|
-
|
|
30
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
31
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
23
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
|
|
32
24
|
|
|
33
25
|
const profilesRaw = values["agent-profiles"];
|
|
34
26
|
const agentCwd = resolve(values["agent-cwd"] ?? ".");
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { resolve } from "node:path";
|
|
3
3
|
import { createFacilitator } from "../facilitator.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
6
7
|
|
|
7
8
|
/**
|
|
8
9
|
* Parse comma-separated agent profile names into structured configs.
|
|
@@ -25,15 +26,7 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
|
25
26
|
* @returns {object} Parsed options
|
|
26
27
|
*/
|
|
27
28
|
export function parseFacilitateOptions(values) {
|
|
28
|
-
const taskFile = values["task-file"];
|
|
29
|
-
const taskText = values["task-text"];
|
|
30
|
-
if (taskFile && taskText)
|
|
31
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
32
|
-
if (!taskFile && !taskText)
|
|
33
|
-
throw new Error("--task-file or --task-text is required");
|
|
34
|
-
|
|
35
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
36
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
29
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
|
|
37
30
|
|
|
38
31
|
const profilesRaw = values["agent-profiles"];
|
|
39
32
|
if (!profilesRaw) throw new Error("--agent-profiles is required");
|
package/src/commands/run.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { Writable } from "node:stream";
|
|
3
3
|
import { resolve } from "node:path";
|
|
4
4
|
import { createAgentRunner } from "../agent-runner.js";
|
|
@@ -6,6 +6,7 @@ import { composeProfilePrompt } from "../profile-prompt.js";
|
|
|
6
6
|
import { createRedactor } from "../redaction.js";
|
|
7
7
|
import { createTeeWriter } from "../tee-writer.js";
|
|
8
8
|
import { SequenceCounter } from "../sequence-counter.js";
|
|
9
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
9
10
|
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
10
11
|
|
|
11
12
|
/**
|
|
@@ -14,16 +15,8 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
|
14
15
|
* @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
|
|
15
16
|
*/
|
|
16
17
|
function parseRunOptions(values) {
|
|
17
|
-
const taskFile = values["task-file"];
|
|
18
|
-
const taskText = values["task-text"];
|
|
19
|
-
if (taskFile && taskText)
|
|
20
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
21
|
-
if (!taskFile && !taskText)
|
|
22
|
-
throw new Error("--task-file or --task-text is required");
|
|
23
|
-
|
|
18
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
|
|
24
19
|
const maxTurnsRaw = values["max-turns"] ?? "50";
|
|
25
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
26
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
27
20
|
|
|
28
21
|
return {
|
|
29
22
|
taskContent,
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream, mkdtempSync } from "node:fs";
|
|
2
2
|
import { resolve, join } from "node:path";
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { createSupervisor } from "../supervisor.js";
|
|
5
5
|
import { createRedactor } from "../redaction.js";
|
|
6
6
|
import { createTeeWriter } from "../tee-writer.js";
|
|
7
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
7
8
|
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
8
9
|
|
|
9
10
|
/**
|
|
@@ -11,20 +12,10 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
|
11
12
|
* @param {object} values - Parsed option values from cli.parse()
|
|
12
13
|
* @returns {object}
|
|
13
14
|
*/
|
|
14
|
-
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
|
|
15
15
|
export function parseSuperviseOptions(values) {
|
|
16
|
-
const taskFile = values["task-file"];
|
|
17
|
-
const taskText = values["task-text"];
|
|
18
|
-
if (taskFile && taskText)
|
|
19
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
20
|
-
if (!taskFile && !taskText)
|
|
21
|
-
throw new Error("--task-file or --task-text is required");
|
|
22
|
-
|
|
16
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
|
|
23
17
|
const supervisorAllowedToolsRaw = values["supervisor-allowed-tools"];
|
|
24
18
|
|
|
25
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
26
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
27
|
-
|
|
28
19
|
return {
|
|
29
20
|
taskContent,
|
|
30
21
|
taskAmend,
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import { composeTaskFromGitHubEvent } from "../events/github.js";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Resolve `--task-file` / `--task-text` / `--task-event` into the task pair the
|
|
6
|
+
* runner consumes. Exactly one of the three must be set. For `--task-event`,
|
|
7
|
+
* libeval reads the event payload and extracts both the main task (from the
|
|
8
|
+
* template that matches `$GITHUB_EVENT_NAME` + `payload.action`) and the
|
|
9
|
+
* amendment (from `payload.inputs?.prompt`) — so the workflow doesn't need to
|
|
10
|
+
* wire `--task-amend` separately. For the other two modes, `--task-amend`
|
|
11
|
+
* works as before.
|
|
12
|
+
*
|
|
13
|
+
* @param {object} values - Parsed option values from cli.parse()
|
|
14
|
+
* @returns {{ task: string, amend: string | undefined }}
|
|
15
|
+
*/
|
|
16
|
+
export function resolveTaskContent(values) {
|
|
17
|
+
const taskFile = values["task-file"];
|
|
18
|
+
const taskText = values["task-text"];
|
|
19
|
+
const taskEvent = values["task-event"];
|
|
20
|
+
|
|
21
|
+
const set = [taskFile, taskText, taskEvent].filter(Boolean).length;
|
|
22
|
+
if (set === 0) {
|
|
23
|
+
throw new Error(
|
|
24
|
+
"one of --task-file, --task-text, --task-event is required",
|
|
25
|
+
);
|
|
26
|
+
}
|
|
27
|
+
if (set > 1) {
|
|
28
|
+
throw new Error(
|
|
29
|
+
"--task-file, --task-text, --task-event are mutually exclusive",
|
|
30
|
+
);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
const amendFlag = values["task-amend"] ?? undefined;
|
|
34
|
+
|
|
35
|
+
if (taskFile) {
|
|
36
|
+
return { task: readFileSync(taskFile, "utf8"), amend: amendFlag };
|
|
37
|
+
}
|
|
38
|
+
if (taskText) {
|
|
39
|
+
return { task: taskText, amend: amendFlag };
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
const eventName = process.env.GITHUB_EVENT_NAME;
|
|
43
|
+
if (!eventName) {
|
|
44
|
+
throw new Error("--task-event requires GITHUB_EVENT_NAME to be set");
|
|
45
|
+
}
|
|
46
|
+
const payload = JSON.parse(readFileSync(taskEvent, "utf8"));
|
|
47
|
+
const composed = composeTaskFromGitHubEvent(payload, eventName);
|
|
48
|
+
return { task: composed.task, amend: amendFlag ?? composed.amend };
|
|
49
|
+
}
|
package/src/discuss-tools.js
CHANGED
|
@@ -20,28 +20,44 @@ import { tool } from "@anthropic-ai/claude-agent-sdk";
|
|
|
20
20
|
import { z } from "zod";
|
|
21
21
|
|
|
22
22
|
import {
|
|
23
|
+
ADJOURN_DESC,
|
|
23
24
|
baseTools,
|
|
24
25
|
concludeSession,
|
|
25
26
|
orchestrationServer,
|
|
27
|
+
RECESS_DESC,
|
|
26
28
|
requestForCommentTool,
|
|
29
|
+
requireNoPendingAsks,
|
|
27
30
|
} from "./orchestration-toolkit.js";
|
|
28
31
|
|
|
29
32
|
/** System prompt for discuss-mode agent participants. L0 mechanics only per COALIGNED. */
|
|
30
33
|
export const DISCUSS_AGENT_SYSTEM_PROMPT =
|
|
31
34
|
"You are a participant in a discussion.\n" +
|
|
32
|
-
"Each question arrives as `[ask#N] <name>: <text
|
|
35
|
+
"Each question arrives as `[ask#N] <name>: <text>` in your inbox.\n" +
|
|
33
36
|
"Quote N as askId on your `Answer` to route the reply correctly.\n" +
|
|
34
37
|
"Your `Answer` is posted to the discussion thread as a separate reply.\n" +
|
|
35
38
|
"If the task already contains a completed response with no new human input after it, `Answer` that no further action is needed.\n" +
|
|
36
39
|
"Do not redo completed work.";
|
|
37
40
|
|
|
38
|
-
const RESUME_TRIGGER_SCHEMA = z
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
41
|
+
const RESUME_TRIGGER_SCHEMA = z.discriminatedUnion("kind", [
|
|
42
|
+
z
|
|
43
|
+
.object({
|
|
44
|
+
kind: z.literal("missing_input"),
|
|
45
|
+
replies: z.number().int().positive(),
|
|
46
|
+
})
|
|
47
|
+
.strict(),
|
|
48
|
+
z
|
|
49
|
+
.object({
|
|
50
|
+
kind: z.literal("escalation_needed"),
|
|
51
|
+
signal: z.string().min(1),
|
|
52
|
+
})
|
|
53
|
+
.strict(),
|
|
54
|
+
z
|
|
55
|
+
.object({
|
|
56
|
+
kind: z.literal("elapsed"),
|
|
57
|
+
elapsed: z.string().min(1),
|
|
58
|
+
})
|
|
59
|
+
.strict(),
|
|
60
|
+
]);
|
|
45
61
|
|
|
46
62
|
/** Discuss-mode lead tool server. */
|
|
47
63
|
export function createDiscussLeadToolServer(ctx) {
|
|
@@ -49,13 +65,13 @@ export function createDiscussLeadToolServer(ctx) {
|
|
|
49
65
|
...baseTools(ctx, { from: "lead", defaultTo: undefined, broadcast: true }),
|
|
50
66
|
tool(
|
|
51
67
|
"Recess",
|
|
52
|
-
|
|
68
|
+
RECESS_DESC,
|
|
53
69
|
{ reason: z.string(), trigger: RESUME_TRIGGER_SCHEMA },
|
|
54
70
|
createRecessHandler(ctx),
|
|
55
71
|
),
|
|
56
72
|
tool(
|
|
57
73
|
"Adjourn",
|
|
58
|
-
|
|
74
|
+
ADJOURN_DESC,
|
|
59
75
|
{
|
|
60
76
|
verdict: z.enum(["adjourned", "failed"]),
|
|
61
77
|
summary: z.string(),
|
|
@@ -83,6 +99,8 @@ export function createDiscussAgentToolServer(ctx, { from }) {
|
|
|
83
99
|
*/
|
|
84
100
|
export function createRecessHandler(ctx) {
|
|
85
101
|
return async ({ reason, trigger }) => {
|
|
102
|
+
const guard = requireNoPendingAsks(ctx);
|
|
103
|
+
if (guard) return guard;
|
|
86
104
|
ctx.recessTrigger = trigger;
|
|
87
105
|
concludeSession(ctx, {
|
|
88
106
|
verdict: "recessed",
|
|
@@ -96,6 +114,8 @@ export function createRecessHandler(ctx) {
|
|
|
96
114
|
/** Adjourn handler — ends the discussion with a verdict. */
|
|
97
115
|
export function createAdjournHandler(ctx) {
|
|
98
116
|
return async ({ verdict, summary, outcome }) => {
|
|
117
|
+
const guard = requireNoPendingAsks(ctx);
|
|
118
|
+
if (guard) return guard;
|
|
99
119
|
if (outcome !== undefined) ctx.outcome = outcome;
|
|
100
120
|
concludeSession(ctx, {
|
|
101
121
|
verdict,
|
package/src/discusser.js
CHANGED
|
@@ -36,10 +36,11 @@ export const DISCUSS_SYSTEM_PROMPT =
|
|
|
36
36
|
"Use `Ask` to delegate work to the best-suited participant.\n" +
|
|
37
37
|
"Participants are domain experts; state the task, not how to do it.\n" +
|
|
38
38
|
"Each participant's `Answer` is posted to the discussion thread as a separate reply.\n" +
|
|
39
|
-
"`Ask` returns {askIds:[N,…]} immediately.\n" +
|
|
40
|
-
"Answers arrive on your next turn as `[answer#N] <participant>: <text
|
|
41
|
-
"
|
|
42
|
-
"
|
|
39
|
+
"`Ask` is async and returns {askIds:[N,…]} immediately.\n" +
|
|
40
|
+
"Answers arrive on your next turn as `[answer#N] <participant>: <text>` in your inbox.\n" +
|
|
41
|
+
"End your turn while Asks are pending. The system resumes you when answers arrive.\n" +
|
|
42
|
+
"Multiple `Ask` calls in one turn run participants in parallel.\n" +
|
|
43
|
+
"End the discussion by calling `Adjourn` with a verdict and summary, or `Recess` only to wait on an external reply or duration.";
|
|
43
44
|
|
|
44
45
|
/**
|
|
45
46
|
* Augment a base orchestration context with discuss-mode fields.
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub event → task-prompt composition. Replaces ~70 lines of shell in
|
|
3
|
+
* kata-dispatch.yml's `Compose task text` step. Each branch in the dispatch
|
|
4
|
+
* function corresponds to one (event_name, action) the agent workflows react
|
|
5
|
+
* to; the rendered string is identical to what the shell `case` block
|
|
6
|
+
* produced, so existing facilitator behaviour is preserved.
|
|
7
|
+
*
|
|
8
|
+
* Templates live as named `export const` declarations at the top of the file,
|
|
9
|
+
* mirroring `SUPERVISOR_SYSTEM_PROMPT` / `JUDGE_SYSTEM_PROMPT` / etc., so a
|
|
10
|
+
* reader scanning libeval source can find the exact string that an agent
|
|
11
|
+
* receives. Substitutions use `${KEY}` so the literal placeholders are
|
|
12
|
+
* grep-discoverable.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
export const TASK_TEMPLATE_ISSUE_OPENED =
|
|
16
|
+
'New issue: "${ISSUE_TITLE}" (#${NUMBER}) by @${AUTHOR} (type: ${AUTHOR_TYPE}). Issue URL: ${URL}.';
|
|
17
|
+
|
|
18
|
+
export const TASK_TEMPLATE_ISSUE_LABELED =
|
|
19
|
+
'Label "${LABEL}" was added to issue "${ISSUE_TITLE}" (#${NUMBER}). Issue URL: ${URL}.';
|
|
20
|
+
|
|
21
|
+
export const TASK_TEMPLATE_PR_LABELED =
|
|
22
|
+
'Label "${LABEL}" was added to PR "${PR_TITLE}" (#${NUMBER}). PR URL: ${URL}.';
|
|
23
|
+
|
|
24
|
+
export const TASK_TEMPLATE_PR_MERGED =
|
|
25
|
+
'PR "${PR_TITLE}" (#${NUMBER}) merged. PR URL: ${URL}.';
|
|
26
|
+
|
|
27
|
+
export const TASK_TEMPLATE_ISSUE_COMMENT_ON_ISSUE =
|
|
28
|
+
'New comment on issue "${ISSUE_TITLE}" (#${NUMBER}) by @${AUTHOR} (type: ${AUTHOR_TYPE}). Comment URL: ${URL}.';
|
|
29
|
+
|
|
30
|
+
export const TASK_TEMPLATE_ISSUE_COMMENT_ON_PR =
|
|
31
|
+
"New comment on PR #${NUMBER} by @${AUTHOR} (type: ${AUTHOR_TYPE}). Comment URL: ${URL}.";
|
|
32
|
+
|
|
33
|
+
export const TASK_TEMPLATE_REVIEW_SUBMITTED =
|
|
34
|
+
'Review submitted on PR "${PR_TITLE}" (#${NUMBER}) by @${AUTHOR} (type: ${AUTHOR_TYPE}). Review URL: ${URL}.';
|
|
35
|
+
|
|
36
|
+
function render(template, fields) {
|
|
37
|
+
let out = template;
|
|
38
|
+
for (const [key, value] of Object.entries(fields)) {
|
|
39
|
+
out = out.replaceAll("${" + key + "}", value ?? "");
|
|
40
|
+
}
|
|
41
|
+
return out;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
function extractCommonFields(payload) {
|
|
45
|
+
return {
|
|
46
|
+
NUMBER: String(payload.issue?.number ?? payload.pull_request?.number ?? ""),
|
|
47
|
+
ISSUE_TITLE: payload.issue?.title ?? "",
|
|
48
|
+
PR_TITLE: payload.pull_request?.title ?? "",
|
|
49
|
+
LABEL: payload.label?.name ?? "",
|
|
50
|
+
AUTHOR:
|
|
51
|
+
payload.comment?.user?.login ??
|
|
52
|
+
payload.review?.user?.login ??
|
|
53
|
+
payload.issue?.user?.login ??
|
|
54
|
+
payload.pull_request?.user?.login ??
|
|
55
|
+
"",
|
|
56
|
+
AUTHOR_TYPE:
|
|
57
|
+
payload.comment?.user?.type ??
|
|
58
|
+
payload.review?.user?.type ??
|
|
59
|
+
payload.issue?.user?.type ??
|
|
60
|
+
payload.pull_request?.user?.type ??
|
|
61
|
+
"User",
|
|
62
|
+
URL:
|
|
63
|
+
payload.comment?.html_url ??
|
|
64
|
+
payload.review?.html_url ??
|
|
65
|
+
payload.issue?.html_url ??
|
|
66
|
+
payload.pull_request?.html_url ??
|
|
67
|
+
"",
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// Static `(event_name, action)` → template lookup. The "issue_comment" /
|
|
72
|
+
// "created" entry needs payload context (issue vs PR), so it returns a chooser
|
|
73
|
+
// instead of a template. Anything missing from the table throws downstream.
|
|
74
|
+
const TEMPLATE_DISPATCH = {
|
|
75
|
+
"issues:opened": () => TASK_TEMPLATE_ISSUE_OPENED,
|
|
76
|
+
"issues:labeled": () => TASK_TEMPLATE_ISSUE_LABELED,
|
|
77
|
+
"pull_request:closed": () => TASK_TEMPLATE_PR_MERGED,
|
|
78
|
+
"pull_request:labeled": () => TASK_TEMPLATE_PR_LABELED,
|
|
79
|
+
"pull_request_target:closed": () => TASK_TEMPLATE_PR_MERGED,
|
|
80
|
+
"pull_request_target:labeled": () => TASK_TEMPLATE_PR_LABELED,
|
|
81
|
+
"pull_request_review:submitted": () => TASK_TEMPLATE_REVIEW_SUBMITTED,
|
|
82
|
+
"issue_comment:created": (payload) =>
|
|
83
|
+
payload.issue?.pull_request != null
|
|
84
|
+
? TASK_TEMPLATE_ISSUE_COMMENT_ON_PR
|
|
85
|
+
: TASK_TEMPLATE_ISSUE_COMMENT_ON_ISSUE,
|
|
86
|
+
};
|
|
87
|
+
|
|
88
|
+
/**
 * Look up the template for an event.
 *
 * Builds the `"<eventName>:<action>"` key from `eventName` and
 * `payload.action`, finds the matching chooser in `TEMPLATE_DISPATCH`, and
 * returns whatever that chooser yields for the payload.
 *
 * @param {object} payload - Native GitHub event payload.
 * @param {string} eventName - GitHub event name.
 * @returns {*} The chooser's result, or `null` when the table has no entry.
 */
function pickTemplate(payload, eventName) {
  const key = `${eventName}:${payload.action}`;
  const chooser = TEMPLATE_DISPATCH[key];
  if (!chooser) {
    return null;
  }
  return chooser(payload);
}
|
|
92
|
+
|
|
93
|
+
/**
 * Compose the task a libeval lead receives from a native GitHub event payload.
 * Returns `{ task, amend }`: `task` is the template-rendered context for real
 * events (or empty string for `workflow_dispatch`); `amend` is read from
 * `payload.inputs?.prompt` so an ad-hoc dispatcher (workflow_dispatch trigger
 * or bridge) can layer instructions on top without the workflow wiring
 * `--task-amend` separately. The runner combines them via the existing
 * taskAmend path.
 *
 * Throws on unknown (event_name, action) combos so a typo doesn't silently
 * ship a misleading prompt.
 *
 * @param {object} payload - Native event payload (shape mirrors
 *   `$GITHUB_EVENT_PATH` JSON written by the runner).
 * @param {string} eventName - Value of `$GITHUB_EVENT_NAME` for the run.
 * @returns {{ task: string, amend: string }}
 * @throws {Error} When `eventName` is empty, when a `workflow_dispatch`
 *   payload has no `inputs.prompt`, or when no template matches.
 * @throws {TypeError} When `payload` is not an object (e.g. missing, or
 *   unparsed JSON text).
 */
export function composeTaskFromGitHubEvent(payload, eventName) {
  if (!eventName) {
    throw new Error("composeTaskFromGitHubEvent: eventName is required");
  }

  // Fail with a clear message instead of an opaque "Cannot read properties
  // of undefined" (nullish payload) or a misleading "no template" error
  // (payload passed as a string or other primitive).
  if (payload === null || typeof payload !== "object") {
    throw new TypeError("composeTaskFromGitHubEvent: payload must be an object");
  }

  const amend = payload.inputs?.prompt ?? "";

  // workflow_dispatch has no event context to render; the prompt is the task.
  if (eventName === "workflow_dispatch") {
    if (!amend) {
      throw new Error(
        "composeTaskFromGitHubEvent: workflow_dispatch payload must include inputs.prompt",
      );
    }
    return { task: "", amend };
  }

  const template = pickTemplate(payload, eventName);
  if (!template) {
    throw new Error(
      `composeTaskFromGitHubEvent: no template for event_name="${eventName}" action="${payload.action}"`,
    );
  }
  return { task: render(template, extractCommonFields(payload)), amend };
}
|
package/src/facilitator.js
CHANGED
|
@@ -25,15 +25,16 @@ export const FACILITATOR_SYSTEM_PROMPT =
|
|
|
25
25
|
"Use `RollCall` to list participants.\n" +
|
|
26
26
|
"Use `Ask` to delegate work to the best-suited participant.\n" +
|
|
27
27
|
"Participants are domain experts; state the task, not how to do it.\n" +
|
|
28
|
-
"`Ask` returns {askIds:[N,…]} immediately.\n" +
|
|
29
|
-
"Answers arrive on your next turn as `[answer#N] <participant>: <text
|
|
30
|
-
"
|
|
31
|
-
"
|
|
28
|
+
"`Ask` is async and returns {askIds:[N,…]} immediately.\n" +
|
|
29
|
+
"Answers arrive on your next turn as `[answer#N] <participant>: <text>` in your inbox.\n" +
|
|
30
|
+
"End your turn while Asks are pending. The system resumes you when answers arrive.\n" +
|
|
31
|
+
"Multiple `Ask` calls in one turn run participants in parallel.\n" +
|
|
32
|
+
"End every session by calling `Conclude` with a verdict and summary.";
|
|
32
33
|
|
|
33
34
|
/** System prompt for facilitated agent participants. L0 mechanics only per COALIGNED. */
|
|
34
35
|
export const FACILITATED_AGENT_SYSTEM_PROMPT =
|
|
35
36
|
"You are a participant in a facilitated session.\n" +
|
|
36
|
-
"Each question arrives as `[ask#N] <name>: <text
|
|
37
|
+
"Each question arrives as `[ask#N] <name>: <text>` in your inbox.\n" +
|
|
37
38
|
"Quote N as askId on your `Answer` to route the reply correctly.\n" +
|
|
38
39
|
"If the task already contains a completed response with no new human input after it, `Answer` that no further action is needed.\n" +
|
|
39
40
|
"Do not redo completed work.";
|
package/src/index.js
CHANGED
|
@@ -50,6 +50,16 @@ export {
|
|
|
50
50
|
DISCUSS_AGENT_SYSTEM_PROMPT,
|
|
51
51
|
} from "./discuss-tools.js";
|
|
52
52
|
export { Judge, createJudge, JUDGE_SYSTEM_PROMPT } from "./judge.js";
|
|
53
|
+
export {
|
|
54
|
+
composeTaskFromGitHubEvent,
|
|
55
|
+
TASK_TEMPLATE_ISSUE_OPENED,
|
|
56
|
+
TASK_TEMPLATE_ISSUE_LABELED,
|
|
57
|
+
TASK_TEMPLATE_PR_LABELED,
|
|
58
|
+
TASK_TEMPLATE_PR_MERGED,
|
|
59
|
+
TASK_TEMPLATE_ISSUE_COMMENT_ON_ISSUE,
|
|
60
|
+
TASK_TEMPLATE_ISSUE_COMMENT_ON_PR,
|
|
61
|
+
TASK_TEMPLATE_REVIEW_SUBMITTED,
|
|
62
|
+
} from "./events/github.js";
|
|
53
63
|
export {
|
|
54
64
|
Redactor,
|
|
55
65
|
createRedactor,
|
|
@@ -94,7 +94,11 @@ export class OrchestrationLoop {
|
|
|
94
94
|
*/
|
|
95
95
|
async run(task) {
|
|
96
96
|
this.emitOrchestratorEvent({ type: "session_start" });
|
|
97
|
-
const initialTask = this.taskAmend
|
|
97
|
+
const initialTask = this.taskAmend
|
|
98
|
+
? task
|
|
99
|
+
? `${task}\n\n${this.taskAmend}`
|
|
100
|
+
: this.taskAmend
|
|
101
|
+
: task;
|
|
98
102
|
|
|
99
103
|
let firstError = null;
|
|
100
104
|
const abort = (err) => {
|
|
@@ -46,9 +46,24 @@ export function createOrchestrationContext() {
|
|
|
46
46
|
|
|
47
47
|
// --- Handler factories ---
|
|
48
48
|
|
|
49
|
+
/**
 * Pre-flight check for terminal tools (`Conclude`, `Adjourn`, `Recess`).
 *
 * When `ctx.pendingAsks` is empty the terminal tool may proceed and `null`
 * is returned. Otherwise an error result is returned that tells the caller
 * to end the turn and wait to be resumed once answers arrive.
 *
 * @param {object} ctx - Orchestration context exposing `pendingAsks` (read
 *   via its `size`).
 * @returns {object|null} Error result while Asks are in flight, else `null`.
 */
export function requireNoPendingAsks(ctx) {
  const asksInFlight = ctx.pendingAsks.size !== 0;
  if (!asksInFlight) {
    return null;
  }
  return errorResult(
    "Asks are still pending. End your turn. You will be resumed when answers arrive.",
  );
}
|
|
61
|
+
|
|
49
62
|
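/** Mark the session as concluded, unless Asks are still pending, in which case the guard's error result is returned instead. Concluding cancels any open Asks so askers see the synthetic null on their next turn. */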
|
|
50
63
|
export function createConcludeHandler(ctx) {
|
|
51
64
|
return async ({ verdict, summary }) => {
|
|
65
|
+
const guard = requireNoPendingAsks(ctx);
|
|
66
|
+
if (guard) return guard;
|
|
52
67
|
concludeSession(ctx, { verdict, summary, reason: "session concluded" });
|
|
53
68
|
return { content: [{ type: "text", text: "Session concluded." }] };
|
|
54
69
|
};
|
|
@@ -235,8 +250,18 @@ const ANNOUNCE_DESC = "Broadcast a message with no reply expected.";
|
|
|
235
250
|
|
|
236
251
|
const ROLLCALL_DESC = "List all participants in the session.";
|
|
237
252
|
|
|
253
|
+
// Terminal-tool descriptions. Each one ends the run. Group them so the
|
|
254
|
+
// contrast is visible: Conclude (success/failure), Adjourn (settled in
|
|
255
|
+
// thread), Recess (paused for out-of-session input). Each description
|
|
256
|
+
// leads with the cost.
|
|
238
257
|
const CONCLUDE_DESC =
|
|
239
|
-
"End the session
|
|
258
|
+
"End the session. Provide a verdict ('success' or 'failure') and a summary.";
|
|
259
|
+
|
|
260
|
+
const ADJOURN_DESC =
|
|
261
|
+
"End the discussion. Provide a verdict ('adjourned' or 'failed') and a summary. Cancels any unanswered Asks.";
|
|
262
|
+
|
|
263
|
+
const RECESS_DESC =
|
|
264
|
+
"End the run and schedule an out-of-session re-dispatch. Cancels any unanswered Asks. Use only when waiting on an external reply or duration. Do not use to wait on in-flight Asks.";
|
|
240
265
|
|
|
241
266
|
// --- Tool builders ---
|
|
242
267
|
|
|
@@ -244,6 +269,7 @@ const CONCLUDE_DESC =
|
|
|
244
269
|
/** Build an MCP tool result wrapping a single text message. */
function textResult(text) {
  const content = [{ type: "text", text }];
  return { content };
}
|
|
272
|
+
/** Build an MCP tool error result (`isError: true`) wrapping a single text message. */
function errorResult(text) {
  const content = [{ type: "text", text }];
  return { content, isError: true };
}
|
|
@@ -391,4 +417,11 @@ function requestForCommentTool(ctx) {
|
|
|
391
417
|
|
|
392
418
|
// Re-export the building blocks discuss-tools.js needs to assemble its
|
|
393
419
|
// own lead tool surface (it has two extra terminal tools).
|
|
394
|
-
export {
|
|
420
|
+
export {
|
|
421
|
+
ADJOURN_DESC,
|
|
422
|
+
baseTools,
|
|
423
|
+
errorResult,
|
|
424
|
+
orchestrationServer,
|
|
425
|
+
RECESS_DESC,
|
|
426
|
+
requestForCommentTool,
|
|
427
|
+
};
|
package/src/render/tool-hints.js
CHANGED
|
@@ -101,7 +101,8 @@ export function simplifyToolName(name) {
|
|
|
101
101
|
*
|
|
102
102
|
* Three branches, in priority order:
|
|
103
103
|
* - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
|
|
104
|
-
* `{` / `"` from the input (
|
|
104
|
+
* `{` / `"` from the input (built-in tool hints stay free of JSON
|
|
105
|
+
* punctuation so readers see clean one-liners).
|
|
105
106
|
* - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
|
|
106
107
|
* single-line JSON; `{` and `"` intentionally appear so readers see
|
|
107
108
|
* the actual MCP payload.
|
|
@@ -2,8 +2,7 @@
|
|
|
2
2
|
* Turn renderer — maps a structured turn into formatted text lines.
|
|
3
3
|
*
|
|
4
4
|
* Shared by `TeeWriter.flushTurns()` (live stream) and
|
|
5
|
-
* `TraceCollector.toText()` (offline replay) so both emit identical output
|
|
6
|
-
* (spec 540).
|
|
5
|
+
* `TraceCollector.toText()` (offline replay) so both emit identical output.
|
|
7
6
|
*/
|
|
8
7
|
|
|
9
8
|
import {
|
package/src/supervisor.js
CHANGED
|
@@ -32,15 +32,16 @@ export const SUPERVISOR_SYSTEM_PROMPT =
|
|
|
32
32
|
"You supervise one agent.\n" +
|
|
33
33
|
"You have no tools to perform work yourself.\n" +
|
|
34
34
|
"Use `Ask` to delegate work to the agent.\n" +
|
|
35
|
-
"`Ask` returns {askIds:[N]} immediately.\n" +
|
|
36
|
-
"The reply arrives on your next turn as `[answer#N] agent: <text
|
|
35
|
+
"`Ask` is async and returns {askIds:[N]} immediately.\n" +
|
|
36
|
+
"The reply arrives on your next turn as `[answer#N] agent: <text>` in your inbox.\n" +
|
|
37
|
+
"End your turn while Asks are pending. The system resumes you when an answer arrives.\n" +
|
|
37
38
|
"If the agent goes off-track, send a corrective `Ask`.\n" +
|
|
38
|
-
"End every session by calling `Conclude
|
|
39
|
+
"End every session by calling `Conclude` with a verdict and summary.";
|
|
39
40
|
|
|
40
41
|
/** System prompt for the supervised agent. L0 mechanics only per COALIGNED. */
|
|
41
42
|
export const AGENT_SYSTEM_PROMPT =
|
|
42
43
|
"A supervisor directs your work.\n" +
|
|
43
|
-
"Each question arrives as `[ask#N] supervisor: <text
|
|
44
|
+
"Each question arrives as `[ask#N] supervisor: <text>` in your inbox.\n" +
|
|
44
45
|
"Quote N as askId on your `Answer` to route the reply correctly.\n" +
|
|
45
46
|
"If the task already contains a completed response with no new human input after it, `Answer` that no further action is needed.\n" +
|
|
46
47
|
"Do not redo completed work.";
|
package/src/tee-writer.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Human text rendering is delegated to the pure modules under `./render/`
|
|
11
11
|
* so the live stream and the offline `TraceCollector.toText()` replay share
|
|
12
|
-
* one formatting path
|
|
12
|
+
* one formatting path. The NDJSON going to `fileStream` is
|
|
13
13
|
* untouched — only what reaches `textStream` changes.
|
|
14
14
|
*
|
|
15
15
|
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
@@ -67,10 +67,9 @@ export class TeeWriter extends Writable {
|
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
// Emit the trailing `--- Result: ... ---` footer — the one summary line
|
|
70
|
-
// humans want
|
|
71
|
-
// appends, so the live stream and the offline replay stay in sync
|
|
72
|
-
//
|
|
73
|
-
// footer is gone in every mode.
|
|
70
|
+
// humans want. This is the same tail TraceCollector.toText()
|
|
71
|
+
// appends, so the live stream and the offline replay stay in sync.
|
|
72
|
+
// The superseded `--- Evaluation ... ---` footer is gone in every mode.
|
|
74
73
|
if (this.collector.result) {
|
|
75
74
|
const text = this.collector.toText();
|
|
76
75
|
const idx = text.lastIndexOf("\n---");
|
|
@@ -78,7 +77,7 @@ export class TeeWriter extends Writable {
|
|
|
78
77
|
// Slice past the leading `\n` — the previously-streamed body
|
|
79
78
|
// already ended with its own newline, so re-emitting `\n---` here
|
|
80
79
|
// would produce a blank line before the footer and desync from
|
|
81
|
-
// the offline replay
|
|
80
|
+
// the offline replay.
|
|
82
81
|
this.textStream.write(text.slice(idx + 1) + "\n");
|
|
83
82
|
}
|
|
84
83
|
}
|
|
@@ -107,7 +106,8 @@ export class TeeWriter extends Writable {
|
|
|
107
106
|
this.collector.addLine(line);
|
|
108
107
|
|
|
109
108
|
// Orchestrator lifecycle events are suppressed from the text stream
|
|
110
|
-
// entirely
|
|
109
|
+
// entirely — humans only want agent-visible content. They still
|
|
110
|
+
// reached fileStream above.
|
|
111
111
|
if (
|
|
112
112
|
parsed.source === "orchestrator" &&
|
|
113
113
|
isSuppressedOrchestratorEvent(parsed.event)
|
|
@@ -118,7 +118,7 @@ export class TeeWriter extends Writable {
|
|
|
118
118
|
return;
|
|
119
119
|
}
|
|
120
120
|
|
|
121
|
-
// Bare event (run mode
|
|
121
|
+
// Bare event (unwrapped run mode line or direct feed)
|
|
122
122
|
this.collector.addLine(line);
|
|
123
123
|
this.flushTurns();
|
|
124
124
|
}
|
package/src/trace-collector.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Human text rendering is delegated to the pure modules under `./render/`
|
|
8
8
|
* so the live `TeeWriter` stream and the offline `toText()` replay share
|
|
9
|
-
* one formatting path
|
|
9
|
+
* one formatting path.
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
12
|
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
@@ -293,7 +293,7 @@ export class TraceCollector {
|
|
|
293
293
|
}
|
|
294
294
|
|
|
295
295
|
/**
|
|
296
|
-
* Format the trailing result summary line
|
|
296
|
+
* Format the trailing result summary line. When an orchestrator
|
|
297
297
|
* summary is present (supervised / facilitated mode), the headline word is
|
|
298
298
|
* the supervisor's verdict ("success" / "failure") rather than the SDK's
|
|
299
299
|
* per-runner subtype, so the footer aligns with the CI exit code.
|