@forwardimpact/libeval 0.1.52 → 0.1.54
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +8 -14
- package/bin/fit-eval.js +7 -14
- package/bin/fit-selfedit.js +6 -4
- package/bin/fit-trace.js +7 -14
- package/package.json +1 -1
- package/src/benchmark/result.js +2 -2
- package/src/benchmark/task-family.js +1 -1
- package/src/commands/benchmark-invariants.js +1 -1
- package/src/discusser.js +3 -5
- package/src/events/github.js +7 -1
- package/src/facilitator.js +2 -5
- package/src/inbox-poller.js +5 -8
- package/src/judge.js +10 -14
- package/src/profile-prompt.js +193 -26
- package/src/redaction.js +3 -16
- package/src/supervisor.js +3 -0
package/bin/fit-benchmark.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import {
|
|
5
|
+
import { realpathSync } from "node:fs";
|
|
6
6
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
7
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
8
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
@@ -11,17 +11,8 @@ import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
|
|
|
11
11
|
import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
|
|
12
12
|
import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
|
|
13
13
|
|
|
14
|
-
// `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
|
|
15
|
-
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
16
|
-
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
17
|
-
const VERSION =
|
|
18
|
-
process.env.FIT_BENCHMARK_VERSION ||
|
|
19
|
-
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
20
|
-
.version;
|
|
21
|
-
|
|
22
14
|
export const definition = {
|
|
23
15
|
name: "fit-benchmark",
|
|
24
|
-
version: VERSION,
|
|
25
16
|
description:
|
|
26
17
|
"Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
|
|
27
18
|
commands: [
|
|
@@ -156,11 +147,14 @@ export const definition = {
|
|
|
156
147
|
],
|
|
157
148
|
};
|
|
158
149
|
|
|
159
|
-
const
|
|
150
|
+
const runtime = createDefaultRuntime();
|
|
151
|
+
const logger = createLogger("benchmark", runtime);
|
|
160
152
|
|
|
161
153
|
async function main() {
|
|
162
|
-
const
|
|
163
|
-
|
|
154
|
+
const cli = createCli(definition, {
|
|
155
|
+
runtime,
|
|
156
|
+
packageJsonUrl: new URL("../package.json", import.meta.url),
|
|
157
|
+
});
|
|
164
158
|
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
165
159
|
if (!parsed) return runtime.proc.exit(0);
|
|
166
160
|
|
|
@@ -187,7 +181,7 @@ async function main() {
|
|
|
187
181
|
if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
|
|
188
182
|
main().catch((error) => {
|
|
189
183
|
logger.exception("main", error);
|
|
190
|
-
createCli(definition).error(error.message);
|
|
184
|
+
createCli(definition, { runtime }).error(error.message);
|
|
191
185
|
process.exit(1);
|
|
192
186
|
});
|
|
193
187
|
}
|
package/bin/fit-eval.js
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import { readFileSync } from "node:fs";
|
|
6
5
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
6
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
7
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
@@ -15,14 +14,6 @@ import { runFacilitateCommand } from "../src/commands/facilitate.js";
|
|
|
15
14
|
import { runDiscussCommand } from "../src/commands/discuss.js";
|
|
16
15
|
import { runCallbackCommand } from "../src/commands/callback.js";
|
|
17
16
|
|
|
18
|
-
// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
|
|
19
|
-
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
20
|
-
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
21
|
-
const VERSION =
|
|
22
|
-
process.env.FIT_EVAL_VERSION ||
|
|
23
|
-
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
24
|
-
.version;
|
|
25
|
-
|
|
26
17
|
const LEAD_OPTIONS = {
|
|
27
18
|
"lead-profile": {
|
|
28
19
|
type: "string",
|
|
@@ -60,7 +51,6 @@ const TASK_INPUT_OPTIONS = {
|
|
|
60
51
|
|
|
61
52
|
const definition = {
|
|
62
53
|
name: "fit-eval",
|
|
63
|
-
version: VERSION,
|
|
64
54
|
description:
|
|
65
55
|
"Run agents and capture NDJSON traces — for agent evaluations or multi-agent collaboration",
|
|
66
56
|
commands: [
|
|
@@ -313,11 +303,14 @@ const definition = {
|
|
|
313
303
|
],
|
|
314
304
|
};
|
|
315
305
|
|
|
316
|
-
const
|
|
306
|
+
const runtime = createDefaultRuntime();
|
|
307
|
+
const logger = createLogger("eval", runtime);
|
|
317
308
|
|
|
318
309
|
async function main() {
|
|
319
|
-
const
|
|
320
|
-
|
|
310
|
+
const cli = createCli(definition, {
|
|
311
|
+
runtime,
|
|
312
|
+
packageJsonUrl: new URL("../package.json", import.meta.url),
|
|
313
|
+
});
|
|
321
314
|
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
322
315
|
if (!parsed) return runtime.proc.exit(0);
|
|
323
316
|
|
|
@@ -341,6 +334,6 @@ async function main() {
|
|
|
341
334
|
|
|
342
335
|
main().catch((error) => {
|
|
343
336
|
logger.exception("main", error);
|
|
344
|
-
createCli(definition).error(error.message);
|
|
337
|
+
createCli(definition, { runtime }).error(error.message);
|
|
345
338
|
process.exit(1);
|
|
346
339
|
});
|
package/bin/fit-selfedit.js
CHANGED
|
@@ -7,12 +7,11 @@
|
|
|
7
7
|
|
|
8
8
|
import "@forwardimpact/libpreflight/node22";
|
|
9
9
|
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
|
-
import fsPromises from "node:fs/promises";
|
|
11
10
|
import { parseArgs } from "node:util";
|
|
12
11
|
import { resolve, relative, dirname } from "node:path";
|
|
13
12
|
import { execFileSync } from "node:child_process";
|
|
14
13
|
|
|
15
|
-
import {
|
|
14
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
16
15
|
import { minimatch } from "minimatch";
|
|
17
16
|
|
|
18
17
|
const HELP = `fit-selfedit — write stdin to a settings.json-allowed path on a non-main branch.
|
|
@@ -71,8 +70,11 @@ if (extra.length > 0) fail(`unexpected extra arguments: ${extra.join(" ")}`);
|
|
|
71
70
|
|
|
72
71
|
const absoluteTarget = resolve(process.cwd(), targetArg);
|
|
73
72
|
|
|
74
|
-
// Safeguard 1: settings.json must grant Edit() on this path.
|
|
75
|
-
|
|
73
|
+
// Safeguard 1: settings.json must grant Edit() on this path. The bin is the
|
|
74
|
+
// sole construction site for the runtime; resolve the finder off the bag
|
|
75
|
+
// rather than constructing a Finder here (Success Criterion 9).
|
|
76
|
+
const runtime = createDefaultRuntime();
|
|
77
|
+
const settingsPath = runtime.finder.findUpward(
|
|
76
78
|
dirname(absoluteTarget),
|
|
77
79
|
".claude/settings.json",
|
|
78
80
|
20,
|
package/bin/fit-trace.js
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import { readFileSync } from "node:fs";
|
|
6
5
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
6
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
7
|
import { createScriptConfig } from "@forwardimpact/libconfig";
|
|
@@ -31,17 +30,8 @@ import {
|
|
|
31
30
|
import { runAssertCommand } from "../src/commands/assert.js";
|
|
32
31
|
import { runByDiscussionCommand } from "../src/commands/by-discussion.js";
|
|
33
32
|
|
|
34
|
-
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
35
|
-
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
36
|
-
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
37
|
-
const VERSION =
|
|
38
|
-
process.env.FIT_TRACE_VERSION ||
|
|
39
|
-
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
40
|
-
.version;
|
|
41
|
-
|
|
42
33
|
const definition = {
|
|
43
34
|
name: "fit-trace",
|
|
44
|
-
version: VERSION,
|
|
45
35
|
description:
|
|
46
36
|
"Download, query, and analyze agent execution traces — read NDJSON output from fit-eval as qualitative research",
|
|
47
37
|
commands: [
|
|
@@ -340,15 +330,18 @@ const definition = {
|
|
|
340
330
|
],
|
|
341
331
|
};
|
|
342
332
|
|
|
343
|
-
const
|
|
333
|
+
const runtime = createDefaultRuntime();
|
|
334
|
+
const logger = createLogger("trace", runtime);
|
|
344
335
|
|
|
345
336
|
// Commands that talk to the GitHub API need a config-backed token resolver;
|
|
346
337
|
// the rest only read local trace files through the runtime.
|
|
347
338
|
const NEEDS_CONFIG = new Set(["runs", "download"]);
|
|
348
339
|
|
|
349
340
|
async function main() {
|
|
350
|
-
const
|
|
351
|
-
|
|
341
|
+
const cli = createCli(definition, {
|
|
342
|
+
runtime,
|
|
343
|
+
packageJsonUrl: new URL("../package.json", import.meta.url),
|
|
344
|
+
});
|
|
352
345
|
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
353
346
|
if (!parsed) return runtime.proc.exit(0);
|
|
354
347
|
|
|
@@ -376,6 +369,6 @@ async function main() {
|
|
|
376
369
|
|
|
377
370
|
main().catch((error) => {
|
|
378
371
|
logger.exception("main", error);
|
|
379
|
-
createCli(definition).error(error.message);
|
|
372
|
+
createCli(definition, { runtime }).error(error.message);
|
|
380
373
|
process.exit(1);
|
|
381
374
|
});
|
package/package.json
CHANGED
package/src/benchmark/result.js
CHANGED
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
* - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
|
|
6
6
|
* benchmark run. Has a happy branch (invariants + judge present) and a
|
|
7
7
|
* pre-flight-failure branch (invariants/judgeVerdict/submission absent).
|
|
8
|
-
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants
|
|
9
|
-
*
|
|
8
|
+
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`:
|
|
9
|
+
* ad-hoc grading without a full lifecycle.
|
|
10
10
|
*
|
|
11
11
|
* Validation is throw-on-mismatch so the runner can wrap every JSONL append
|
|
12
12
|
* in a guard and reject schema drift at write time.
|
|
package/src/benchmark/task-family.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Task-family loader. A task family is a directory under
|
|
3
3
|
* <root>/
|
|
4
4
|
* apm.lock.yaml
|
|
5
|
-
* .claude/ # pre-staged skills + agents
|
|
5
|
+
* .claude/ # pre-staged skills + agents
|
|
6
6
|
* tasks/<task_name>/
|
|
7
7
|
* agent.task.md
|
|
8
8
|
* supervisor.task.md # optional; appended to the task as supervisor context
|
|
package/src/commands/benchmark-invariants.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* `fit-benchmark invariants` — check a single task's invariants against a
|
|
3
|
-
* post-run workdir directory without invoking an agent
|
|
3
|
+
* post-run workdir directory without invoking an agent. Useful for
|
|
4
4
|
* re-checking an agent's output against revised grading material.
|
|
5
5
|
*/
|
|
6
6
|
|
package/src/discusser.js
CHANGED
|
@@ -274,6 +274,7 @@ export function createDiscusser({
|
|
|
274
274
|
messageBus,
|
|
275
275
|
leadName: "lead",
|
|
276
276
|
signal: abortController.signal,
|
|
277
|
+
runtime,
|
|
277
278
|
})
|
|
278
279
|
: null;
|
|
279
280
|
|
|
@@ -309,10 +310,6 @@ export function createDiscusser({
|
|
|
309
310
|
from: config.name,
|
|
310
311
|
});
|
|
311
312
|
|
|
312
|
-
const agentTrailer = config.systemPromptAmend
|
|
313
|
-
? `${DISCUSS_AGENT_SYSTEM_PROMPT}\n\n${config.systemPromptAmend}`
|
|
314
|
-
: DISCUSS_AGENT_SYSTEM_PROMPT;
|
|
315
|
-
|
|
316
313
|
const runner = createAgentRunner({
|
|
317
314
|
cwd: config.cwd ?? resolvedLeadCwd,
|
|
318
315
|
query,
|
|
@@ -327,7 +324,8 @@ export function createDiscusser({
|
|
|
327
324
|
role: "agent",
|
|
328
325
|
profile: config.agentProfile,
|
|
329
326
|
profilesDir: resolvedProfilesDir,
|
|
330
|
-
trailer: agentTrailer,
|
|
327
|
+
trailer: DISCUSS_AGENT_SYSTEM_PROMPT,
|
|
328
|
+
amend: config.systemPromptAmend,
|
|
331
329
|
runtime,
|
|
332
330
|
}),
|
|
333
331
|
redactor,
|
package/src/events/github.js
CHANGED
|
@@ -29,8 +29,14 @@ export const TASK_TEMPLATE_ISSUE_LABELED =
|
|
|
29
29
|
export const TASK_TEMPLATE_PR_LABELED =
|
|
30
30
|
'Label "${LABEL}" was added to PR "${PR_TITLE}" (#${NUMBER}). PR URL: ${URL}.';
|
|
31
31
|
|
|
32
|
+
// "unreleased changes"/"cut" point at the genuine post-merge action — release
|
|
33
|
+
// activity (the release-engineer's Assess step 3 / `kata-release-cut`).
|
|
34
|
+
// "status" is a backstop: the spec's `wiki/STATUS.md` row is normally advanced
|
|
35
|
+
// in the pre-merge gate (`kata-release-merge` Step 8), but the keyword catches a
|
|
36
|
+
// merge that landed without it. Neither owner nor artifact is named, so the lead
|
|
37
|
+
// routes the merge instead of treating it as a no-op.
|
|
32
38
|
export const TASK_TEMPLATE_PR_MERGED =
|
|
33
|
-
'PR "${PR_TITLE}" (#${NUMBER}) merged. PR URL: ${URL}.';
|
|
39
|
+
'PR "${PR_TITLE}" (#${NUMBER}) merged to main — may leave unreleased changes to cut or status to update. PR URL: ${URL}.';
|
|
34
40
|
|
|
35
41
|
// Appended verbatim to comment/review templates. `${BODY}` is the untrusted
|
|
36
42
|
// author text; the fence and the "data, not instructions" framing keep the lead
|
package/src/facilitator.js
CHANGED
|
@@ -134,10 +134,6 @@ export function createFacilitator({
|
|
|
134
134
|
from: config.name,
|
|
135
135
|
});
|
|
136
136
|
|
|
137
|
-
const agentTrailer = config.systemPromptAmend
|
|
138
|
-
? `${FACILITATED_AGENT_SYSTEM_PROMPT}\n\n${config.systemPromptAmend}`
|
|
139
|
-
: FACILITATED_AGENT_SYSTEM_PROMPT;
|
|
140
|
-
|
|
141
137
|
const runner = createAgentRunner({
|
|
142
138
|
cwd: config.cwd ?? facilitatorCwd,
|
|
143
139
|
query,
|
|
@@ -152,7 +148,8 @@ export function createFacilitator({
|
|
|
152
148
|
role: "agent",
|
|
153
149
|
profile: config.agentProfile,
|
|
154
150
|
profilesDir: resolvedProfilesDir,
|
|
155
|
-
trailer: agentTrailer,
|
|
151
|
+
trailer: FACILITATED_AGENT_SYSTEM_PROMPT,
|
|
152
|
+
amend: config.systemPromptAmend,
|
|
156
153
|
runtime,
|
|
157
154
|
}),
|
|
158
155
|
redactor,
|
package/src/inbox-poller.js
CHANGED
|
@@ -18,20 +18,17 @@ export class InboxPoller {
|
|
|
18
18
|
* @param {import("./message-bus.js").MessageBus} deps.messageBus
|
|
19
19
|
* @param {string} deps.leadName
|
|
20
20
|
* @param {AbortSignal} deps.signal
|
|
21
|
-
* @param {import("@forwardimpact/libutil/runtime").Runtime}
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
* absent so existing callers keep working.
|
|
21
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
22
|
+
* Injected collaborators; `clock.setTimeout`/`clock.clearTimeout` drive the
|
|
23
|
+
* inter-poll backoff.
|
|
25
24
|
*/
|
|
26
25
|
constructor({ inboxUrl, messageBus, leadName, signal, runtime }) {
|
|
26
|
+
if (!runtime) throw new Error("runtime is required");
|
|
27
27
|
this.#inboxUrl = inboxUrl;
|
|
28
28
|
this.#messageBus = messageBus;
|
|
29
29
|
this.#leadName = leadName;
|
|
30
30
|
this.#signal = signal;
|
|
31
|
-
this.#clock = runtime
|
|
32
|
-
setTimeout: (fn, ms) => globalThis.setTimeout(fn, ms),
|
|
33
|
-
clearTimeout: (h) => globalThis.clearTimeout(h),
|
|
34
|
-
};
|
|
31
|
+
this.#clock = runtime.clock;
|
|
35
32
|
}
|
|
36
33
|
|
|
37
34
|
/** Long-poll the inbox until the abort signal fires. */
|
package/src/judge.js
CHANGED
|
@@ -17,7 +17,7 @@ import { resolve } from "node:path";
|
|
|
17
17
|
import { Writable } from "node:stream";
|
|
18
18
|
|
|
19
19
|
import { createAgentRunner } from "./agent-runner.js";
|
|
20
|
-
import {
|
|
20
|
+
import { composeSystemPrompt } from "./profile-prompt.js";
|
|
21
21
|
import { SequenceCounter } from "./sequence-counter.js";
|
|
22
22
|
import {
|
|
23
23
|
createJudgeToolServer,
|
|
@@ -140,7 +140,7 @@ export class Judge {
|
|
|
140
140
|
/**
|
|
141
141
|
* Factory function — wires the AgentRunner with the judge orchestration server
|
|
142
142
|
* and the JUDGE_SYSTEM_PROMPT trailer. A `judgeProfile` (when supplied) layers
|
|
143
|
-
* on top of the trailer via `
|
|
143
|
+
* on top of the trailer via `composeSystemPrompt`, matching the
|
|
144
144
|
* supervisor/facilitator pattern.
|
|
145
145
|
*
|
|
146
146
|
* @param {object} deps
|
|
@@ -151,7 +151,7 @@ export class Judge {
|
|
|
151
151
|
* @param {string} [deps.model]
|
|
152
152
|
* @param {number} [deps.maxTurns] - Default 5 (the judge is expected to act in turn 1; 5 leaves headroom for tool inspection).
|
|
153
153
|
* @param {string[]} [deps.allowedTools] - Default `["Read","Glob","Grep","Bash"]` — read-only inspection.
|
|
154
|
-
* @param {string} [deps.judgeProfile] - Profile name; resolved into the system prompt via `
|
|
154
|
+
* @param {string} [deps.judgeProfile] - Profile name; resolved into the system prompt via `composeSystemPrompt`.
|
|
155
155
|
* @param {string} [deps.profilesDir] - Defaults to `<cwd>/.claude/agents`.
|
|
156
156
|
* @param {string} [deps.taskAmend]
|
|
157
157
|
* @returns {Judge}
|
|
@@ -176,17 +176,13 @@ export function createJudge({
|
|
|
176
176
|
if (!runtime) throw new Error("runtime is required");
|
|
177
177
|
|
|
178
178
|
const resolvedProfilesDir = profilesDir ?? resolve(cwd, ".claude/agents");
|
|
179
|
-
const systemPrompt =
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
type: "preset",
|
|
187
|
-
preset: "claude_code",
|
|
188
|
-
append: JUDGE_SYSTEM_PROMPT,
|
|
189
|
-
};
|
|
179
|
+
const systemPrompt = composeSystemPrompt({
|
|
180
|
+
role: "agent",
|
|
181
|
+
profile: judgeProfile,
|
|
182
|
+
profilesDir: resolvedProfilesDir,
|
|
183
|
+
trailer: JUDGE_SYSTEM_PROMPT,
|
|
184
|
+
runtime,
|
|
185
|
+
});
|
|
190
186
|
|
|
191
187
|
const ctx = createOrchestrationContext();
|
|
192
188
|
ctx.participants = [{ name: "judge", role: "judge" }];
|
package/src/profile-prompt.js
CHANGED
|
@@ -1,7 +1,39 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* System prompt composition for agent runners.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* libeval assembles every agent system prompt from up to two parallel,
|
|
5
|
+
* sibling-tagged sections (see COALIGNED.md § L0):
|
|
6
|
+
*
|
|
7
|
+
* <agent_profile>
|
|
8
|
+
* …persona body…
|
|
9
|
+
* </agent_profile>
|
|
10
|
+
*
|
|
11
|
+
* <session_protocol>
|
|
12
|
+
* …orchestration mechanics, then any amendment…
|
|
13
|
+
* </session_protocol>
|
|
14
|
+
*
|
|
15
|
+
* The two tags are siblings joined by a blank line — neither nests inside
|
|
16
|
+
* the other. A section appears only when its content is present. The tag
|
|
17
|
+
* convention lives entirely here: profile `.md` files and trailer constants
|
|
18
|
+
* carry no tags.
|
|
19
|
+
*
|
|
20
|
+
* The `<session_protocol>` body is assembled from up to three fragments, in
|
|
21
|
+
* order of decreasing generality:
|
|
22
|
+
*
|
|
23
|
+
* 1. the role-invariant orchestration trailer (libeval-owned);
|
|
24
|
+
* 2. the profile's own hoisted `## Session Protocol` section, if present;
|
|
25
|
+
* 3. a run-specific amendment, if supplied.
|
|
26
|
+
*
|
|
27
|
+
* Fragment 2 is the convention-based hoist: a profile may carry a level-2
|
|
28
|
+
* `## Session Protocol` markdown heading whose body is the role's work
|
|
29
|
+
* routine. When present, that section is lifted out of `<agent_profile>` and
|
|
30
|
+
* folded into `<session_protocol>` next to the orchestration mechanics, so
|
|
31
|
+
* the harness comms protocol and the role's work routine read as one
|
|
32
|
+
* coherent block. The heading line itself is dropped — the tag already names
|
|
33
|
+
* the section. Profiles with no such heading are unaffected (the entire body
|
|
34
|
+
* stays in `<agent_profile>`).
|
|
35
|
+
*
|
|
36
|
+
* Helpers:
|
|
5
37
|
*
|
|
6
38
|
* - `composeProfilePrompt(name, opts)` — profile + `claude_code` preset.
|
|
7
39
|
* Used by agent participants that need the full Claude Code tool surface.
|
|
@@ -10,61 +42,186 @@
|
|
|
10
42
|
* roles (supervisor, facilitator, discuss lead) that should only see
|
|
11
43
|
* the orchestration instructions and optionally a profile body.
|
|
12
44
|
*
|
|
13
|
-
* - `composeSystemPrompt(opts)` — unified entry point.
|
|
45
|
+
* - `composeSystemPrompt(opts)` — unified entry point. Threads `amend` into
|
|
46
|
+
* the protocol section as the run-specific fragment, then delegates to one
|
|
14
47
|
* of the above based on `opts.role`.
|
|
15
48
|
*/
|
|
16
49
|
|
|
17
50
|
import { join } from "node:path";
|
|
18
51
|
|
|
52
|
+
/** Sibling section tags. Neither nests inside the other. */
|
|
53
|
+
const AGENT_PROFILE_TAG = "agent_profile";
|
|
54
|
+
const SESSION_PROTOCOL_TAG = "session_protocol";
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* A level-2 heading that names the profile's hoisted session-protocol
|
|
58
|
+
* section. Case-insensitive, tolerant of trailing whitespace, but the level
|
|
59
|
+
* is fixed at two `#` so a `### Session Protocol` subsection does not trip
|
|
60
|
+
* the hoist.
|
|
61
|
+
*/
|
|
62
|
+
const SESSION_PROTOCOL_HEADING = /^##[ \t]+session protocol[ \t]*$/i;
|
|
63
|
+
|
|
64
|
+
/** A level-1 or level-2 heading — the boundary that ends a hoisted section. */
|
|
65
|
+
const SECTION_BOUNDARY = /^#{1,2}[ \t]+\S/;
|
|
66
|
+
|
|
67
|
+
/** Wrap content in a semantic section tag, each on its own line. */
|
|
68
|
+
function wrapSection(tag, content) {
|
|
69
|
+
return `<${tag}>\n${content}\n</${tag}>`;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Assemble the parallel `<agent_profile>` / `<session_protocol>` sections.
|
|
74
|
+
* The profile section is emitted only when `body` is non-empty. The protocol
|
|
75
|
+
* section is built by joining its fragments (in the order given) with a
|
|
76
|
+
* blank-line separator, dropping any that are empty, and is emitted only
|
|
77
|
+
* when at least one fragment survives. The two tags are siblings joined by a
|
|
78
|
+
* blank line and never nest.
|
|
79
|
+
*
|
|
80
|
+
* @param {object} parts
|
|
81
|
+
* @param {string} [parts.body] - Profile body, frontmatter-stripped and with
|
|
82
|
+
* any `## Session Protocol` section already hoisted out.
|
|
83
|
+
* @param {Array<string | undefined>} [parts.protocolParts] - Ordered session
|
|
84
|
+
* protocol fragments: trailer, hoisted profile section, run amendment.
|
|
85
|
+
* @returns {string}
|
|
86
|
+
*/
|
|
87
|
+
function assembleSections({ body, protocolParts = [] }) {
|
|
88
|
+
const sections = [];
|
|
89
|
+
if (body) sections.push(wrapSection(AGENT_PROFILE_TAG, body));
|
|
90
|
+
const protocol = protocolParts.filter(Boolean).join("\n\n");
|
|
91
|
+
if (protocol) sections.push(wrapSection(SESSION_PROTOCOL_TAG, protocol));
|
|
92
|
+
return sections.join("\n\n");
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Split a frontmatter-stripped profile body into its persona and an optional
|
|
97
|
+
* hoisted `## Session Protocol` section. The section runs from its heading to
|
|
98
|
+
* the next level-1/level-2 heading (or end of body); the heading line is
|
|
99
|
+
* dropped. Anything before and after the section is rejoined into `persona`.
|
|
100
|
+
* When the body carries no `## Session Protocol` heading, the whole body is
|
|
101
|
+
* returned as `persona` and `protocol` is `undefined`.
|
|
102
|
+
*
|
|
103
|
+
* @param {string} body - Frontmatter-stripped, trimmed profile body.
|
|
104
|
+
* @returns {{ persona: string, protocol: string | undefined }}
|
|
105
|
+
*/
|
|
106
|
+
function splitSessionProtocol(body) {
|
|
107
|
+
const lines = body.split("\n");
|
|
108
|
+
const start = lines.findIndex((line) => SESSION_PROTOCOL_HEADING.test(line));
|
|
109
|
+
if (start === -1) return { persona: body, protocol: undefined };
|
|
110
|
+
|
|
111
|
+
let end = lines.length;
|
|
112
|
+
for (let i = start + 1; i < lines.length; i++) {
|
|
113
|
+
if (SECTION_BOUNDARY.test(lines[i])) {
|
|
114
|
+
end = i;
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
const protocol = lines
|
|
120
|
+
.slice(start + 1, end)
|
|
121
|
+
.join("\n")
|
|
122
|
+
.trim();
|
|
123
|
+
const before = lines.slice(0, start).join("\n").trim();
|
|
124
|
+
const after = lines.slice(end).join("\n").trim();
|
|
125
|
+
const persona = [before, after].filter(Boolean).join("\n\n");
|
|
126
|
+
return { persona, protocol: protocol || undefined };
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Read a profile `.md`, strip its frontmatter, and split off any hoisted
|
|
131
|
+
* `## Session Protocol` section. Reads synchronously off the injected
|
|
132
|
+
* `runtime.fsSync` surface — this composer runs inside the synchronous
|
|
133
|
+
* SDK-option builders of the supervisor / facilitator / discusser / judge
|
|
134
|
+
* factories, so it cannot go async without an unbounded cascade.
|
|
135
|
+
*
|
|
136
|
+
* @param {string} name - Profile basename (no `.md` suffix)
|
|
137
|
+
* @param {string} profilesDir - Directory containing `<name>.md`
|
|
138
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
139
|
+
* @returns {{ persona: string, protocol: string | undefined }}
|
|
140
|
+
*/
|
|
141
|
+
function readProfileSections(name, profilesDir, runtime) {
|
|
142
|
+
const path = join(profilesDir, `${name}.md`);
|
|
143
|
+
const raw = runtime.fsSync.readFileSync(path, "utf8");
|
|
144
|
+
return splitSessionProtocol(stripFrontmatter(raw).trim());
|
|
145
|
+
}
|
|
146
|
+
|
|
19
147
|
/**
|
|
20
148
|
* Compose a `claude_code`-preset system prompt from a profile file. The
|
|
21
|
-
*
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
* async without an unbounded cascade.
|
|
149
|
+
* persona is wrapped in `<agent_profile>`; the protocol trailer, the
|
|
150
|
+
* profile's hoisted `## Session Protocol` section, and any amendment are
|
|
151
|
+
* joined (in that order) into a sibling `<session_protocol>`.
|
|
25
152
|
*
|
|
26
153
|
* @param {string} name - Profile basename (no `.md` suffix)
|
|
27
154
|
* @param {object} opts
|
|
28
155
|
* @param {string} opts.profilesDir - Directory containing `<name>.md`
|
|
29
|
-
* @param {string} [opts.trailer] -
|
|
156
|
+
* @param {string} [opts.trailer] - Session protocol orchestration mechanics,
|
|
157
|
+
* the first fragment of the `<session_protocol>` section.
|
|
158
|
+
* @param {string} [opts.amend] - Run-specific amendment, the last fragment of
|
|
159
|
+
* the `<session_protocol>` section.
|
|
30
160
|
* @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime - Ambient collaborators; uses `fsSync.readFileSync`.
|
|
31
161
|
* @returns {{type: "preset", preset: "claude_code", append: string}}
|
|
32
162
|
*/
|
|
33
|
-
export function composeProfilePrompt(
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
const
|
|
38
|
-
return {
|
|
163
|
+
export function composeProfilePrompt(
|
|
164
|
+
name,
|
|
165
|
+
{ profilesDir, trailer, amend, runtime },
|
|
166
|
+
) {
|
|
167
|
+
const { persona, protocol } = readProfileSections(name, profilesDir, runtime);
|
|
168
|
+
return {
|
|
169
|
+
type: "preset",
|
|
170
|
+
preset: "claude_code",
|
|
171
|
+
append: assembleSections({
|
|
172
|
+
body: persona,
|
|
173
|
+
protocolParts: [trailer, protocol, amend],
|
|
174
|
+
}),
|
|
175
|
+
};
|
|
39
176
|
}
|
|
40
177
|
|
|
41
178
|
/**
|
|
42
|
-
* Compose a plain-string system prompt for a lead role (no Claude Code
|
|
179
|
+
* Compose a plain-string system prompt for a lead role (no Claude Code
|
|
180
|
+
* preset). The protocol trailer, an optional profile's hoisted
|
|
181
|
+
* `## Session Protocol` section, and any amendment are joined into
|
|
182
|
+
* `<session_protocol>`; an optional persona is wrapped in a sibling
|
|
183
|
+
* `<agent_profile>` before it.
|
|
184
|
+
*
|
|
43
185
|
* @param {object} opts
|
|
44
186
|
* @param {string} [opts.profile] - Profile basename (no `.md` suffix)
|
|
45
187
|
* @param {string} [opts.profilesDir] - Directory containing profile files
|
|
46
|
-
* @param {string} opts.trailer -
|
|
188
|
+
* @param {string} opts.trailer - Session protocol (orchestration instructions)
|
|
189
|
+
* @param {string} [opts.amend] - Run-specific amendment, the last fragment of
|
|
190
|
+
* the `<session_protocol>` section.
|
|
47
191
|
* @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime - Ambient collaborators; uses `fsSync.readFileSync`.
|
|
48
192
|
* @returns {string}
|
|
49
193
|
*/
|
|
50
|
-
export function composeLeadPrompt({
|
|
194
|
+
export function composeLeadPrompt({
|
|
195
|
+
profile,
|
|
196
|
+
profilesDir,
|
|
197
|
+
trailer,
|
|
198
|
+
amend,
|
|
199
|
+
runtime,
|
|
200
|
+
}) {
|
|
51
201
|
if (!trailer) throw new Error("trailer is required");
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
202
|
+
const { persona, protocol } = profile
|
|
203
|
+
? readProfileSections(profile, profilesDir, runtime)
|
|
204
|
+
: { persona: undefined, protocol: undefined };
|
|
205
|
+
return assembleSections({
|
|
206
|
+
body: persona,
|
|
207
|
+
protocolParts: [trailer, protocol, amend],
|
|
208
|
+
});
|
|
57
209
|
}
|
|
58
210
|
|
|
59
211
|
/**
|
|
60
|
-
* Unified entry point for composing system prompts.
|
|
212
|
+
* Unified entry point for composing system prompts. Threads an optional
|
|
213
|
+
* amendment through as the run-specific fragment of `<session_protocol>`
|
|
214
|
+
* (after the trailer and any hoisted profile section), then delegates by
|
|
215
|
+
* role.
|
|
61
216
|
*
|
|
62
217
|
* @param {object} opts
|
|
63
218
|
* @param {"lead"|"agent"} opts.role - `"lead"` produces a plain string;
|
|
64
219
|
* `"agent"` produces a `claude_code` preset object.
|
|
65
220
|
* @param {string} [opts.profile] - Profile basename
|
|
66
221
|
* @param {string} [opts.profilesDir]
|
|
67
|
-
* @param {string} opts.trailer -
|
|
222
|
+
* @param {string} opts.trailer - Session protocol (orchestration instructions)
|
|
223
|
+
* @param {string} [opts.amend] - Caller-supplied amendment, the last fragment
|
|
224
|
+
* inside `<session_protocol>`, joined with a blank-line separator.
|
|
68
225
|
* @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime - Ambient collaborators; uses `fsSync.readFileSync`.
|
|
69
226
|
* @returns {string | {type: "preset", preset: "claude_code", append: string}}
|
|
70
227
|
*/
|
|
@@ -73,16 +230,26 @@ export function composeSystemPrompt({
|
|
|
73
230
|
profile,
|
|
74
231
|
profilesDir,
|
|
75
232
|
trailer,
|
|
233
|
+
amend,
|
|
76
234
|
runtime,
|
|
77
235
|
}) {
|
|
78
236
|
if (!trailer) throw new Error("trailer is required");
|
|
79
237
|
if (role === "lead") {
|
|
80
|
-
return composeLeadPrompt({ profile, profilesDir, trailer, runtime });
|
|
238
|
+
return composeLeadPrompt({ profile, profilesDir, trailer, amend, runtime });
|
|
81
239
|
}
|
|
82
240
|
if (profile) {
|
|
83
|
-
return composeProfilePrompt(profile, {
|
|
241
|
+
return composeProfilePrompt(profile, {
|
|
242
|
+
profilesDir,
|
|
243
|
+
trailer,
|
|
244
|
+
amend,
|
|
245
|
+
runtime,
|
|
246
|
+
});
|
|
84
247
|
}
|
|
85
|
-
return {
|
|
248
|
+
return {
|
|
249
|
+
type: "preset",
|
|
250
|
+
preset: "claude_code",
|
|
251
|
+
append: assembleSections({ protocolParts: [trailer, amend] }),
|
|
252
|
+
};
|
|
86
253
|
}
|
|
87
254
|
|
|
88
255
|
/**
|
package/src/redaction.js
CHANGED
|
@@ -15,6 +15,7 @@ export const DEFAULT_ENV_ALLOWLIST = Object.freeze([
|
|
|
15
15
|
"DATABASE_PASSWORD",
|
|
16
16
|
"GH_TOKEN",
|
|
17
17
|
"GITHUB_TOKEN",
|
|
18
|
+
"JWT_SECRET",
|
|
18
19
|
"MCP_TOKEN",
|
|
19
20
|
"MICROSOFT_APP_ID",
|
|
20
21
|
"MICROSOFT_APP_PASSWORD",
|
|
@@ -22,7 +23,6 @@ export const DEFAULT_ENV_ALLOWLIST = Object.freeze([
|
|
|
22
23
|
"PRODUCT_LANDMARK_TOKEN",
|
|
23
24
|
"SERVICE_SECRET",
|
|
24
25
|
"SUPABASE_ANON_KEY",
|
|
25
|
-
"SUPABASE_JWT_SECRET",
|
|
26
26
|
"SUPABASE_SERVICE_ROLE_KEY",
|
|
27
27
|
]);
|
|
28
28
|
|
|
@@ -135,7 +135,8 @@ export function createRedactor({
|
|
|
135
135
|
patterns = DEFAULT_PATTERNS,
|
|
136
136
|
enabled,
|
|
137
137
|
} = {}) {
|
|
138
|
-
|
|
138
|
+
if (!runtime) throw new Error("runtime is required");
|
|
139
|
+
const proc = runtime.proc;
|
|
139
140
|
const resolvedEnv = env ?? proc.env;
|
|
140
141
|
const envDisabled = resolvedEnv.LIBEVAL_REDACTION_DISABLED === "1";
|
|
141
142
|
const resolvedEnabled = enabled ?? !envDisabled;
|
|
@@ -151,20 +152,6 @@ export function createRedactor({
|
|
|
151
152
|
return new Redactor({ envSnapshot, patterns, enabled: resolvedEnabled });
|
|
152
153
|
}
|
|
153
154
|
|
|
154
|
-
/**
|
|
155
|
-
* Lazily build the production proc surface so callers that don't inject a
|
|
156
|
-
* runtime keep working. Imported indirectly to avoid pulling the whole
|
|
157
|
-
* runtime bag (and its `node:fs`/`node:child_process` imports) into modules
|
|
158
|
-
* that only ever receive an injected runtime.
|
|
159
|
-
* @returns {{env: Record<string, string|undefined>, stderr: {write: (s: string) => void}}}
|
|
160
|
-
*/
|
|
161
|
-
function defaultProc() {
|
|
162
|
-
return {
|
|
163
|
-
env: globalThis.process?.env ?? {},
|
|
164
|
-
stderr: { write: (s) => globalThis.process?.stderr?.write(s) },
|
|
165
|
-
};
|
|
166
|
-
}
|
|
167
|
-
|
|
168
155
|
/**
|
|
169
156
|
* Parse `LIBEVAL_REDACTION_ENV_VARS` into a trimmed, non-empty name list.
|
|
170
157
|
* Falls back to `DEFAULT_ENV_ALLOWLIST` when unset or empty.
|
package/src/supervisor.js
CHANGED
|
@@ -122,6 +122,7 @@ const devNull = new Writable({
|
|
|
122
122
|
* @param {string[]} [deps.supervisorDisallowedTools]
|
|
123
123
|
* @param {string} [deps.supervisorProfile]
|
|
124
124
|
* @param {string} [deps.agentProfile]
|
|
125
|
+
* @param {string} [deps.agentSystemPromptAmend] - Amendment folded into the agent's `<session_protocol>` section, after the protocol trailer.
|
|
125
126
|
* @param {string} [deps.profilesDir]
|
|
126
127
|
* @param {string} [deps.taskAmend]
|
|
127
128
|
* @param {Record<string, object>} [deps.agentMcpServers]
|
|
@@ -141,6 +142,7 @@ export function createSupervisor({
|
|
|
141
142
|
supervisorDisallowedTools,
|
|
142
143
|
supervisorProfile,
|
|
143
144
|
agentProfile,
|
|
145
|
+
agentSystemPromptAmend,
|
|
144
146
|
profilesDir,
|
|
145
147
|
taskAmend,
|
|
146
148
|
agentMcpServers,
|
|
@@ -182,6 +184,7 @@ export function createSupervisor({
|
|
|
182
184
|
profile: agentProfile,
|
|
183
185
|
profilesDir: resolvedProfilesDir,
|
|
184
186
|
trailer: AGENT_SYSTEM_PROMPT,
|
|
187
|
+
amend: agentSystemPromptAmend,
|
|
185
188
|
runtime,
|
|
186
189
|
}),
|
|
187
190
|
mcpServers: { orchestration: agentServer, ...agentMcpServers },
|