@forwardimpact/libeval 0.1.48 → 0.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -176,11 +176,9 @@ downloadable through retention.
176
176
  ## fit-selfedit
177
177
 
178
178
  A narrow, audited bypass for sessions where `Edit`/`Write` (and bash
179
- writes) are blocked against paths the project's own allowlist permits
180
- see [#1162](https://github.com/forwardimpact/monorepo/issues/1162) and
181
- [#441](https://github.com/forwardimpact/monorepo/issues/441) for the
182
- original episodes. Reads stdin, writes the target, exits 0 / 2
183
- (safeguard violation) / 1 (I/O error).
179
+ writes) are blocked against paths the project's own allowlist permits.
180
+ Reads stdin, writes the target, exits 0 / 2 (safeguard violation) / 1
181
+ (I/O error).
184
182
 
185
183
  ```sh
186
184
  echo "<content>" | bunx fit-selfedit <path>
@@ -134,7 +134,7 @@ export const definition = {
134
134
  "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
135
135
  "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
136
136
  "fit-benchmark report --format=text",
137
- "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
137
+ "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
138
138
  ],
139
139
  documentation: [
140
140
  {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.48",
3
+ "version": "0.1.49",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -20,28 +20,44 @@ import { tool } from "@anthropic-ai/claude-agent-sdk";
20
20
  import { z } from "zod";
21
21
 
22
22
  import {
23
+ ADJOURN_DESC,
23
24
  baseTools,
24
25
  concludeSession,
25
26
  orchestrationServer,
27
+ RECESS_DESC,
26
28
  requestForCommentTool,
29
+ requireNoPendingAsks,
27
30
  } from "./orchestration-toolkit.js";
28
31
 
29
32
  /** System prompt for discuss-mode agent participants. L0 mechanics only per COALIGNED. */
30
33
  export const DISCUSS_AGENT_SYSTEM_PROMPT =
31
34
  "You are a participant in a discussion.\n" +
32
- "Each question arrives as `[ask#N] <name>: <text>`.\n" +
35
+ "Each question arrives as `[ask#N] <name>: <text>` in your inbox.\n" +
33
36
  "Quote N as askId on your `Answer` to route the reply correctly.\n" +
34
37
  "Your `Answer` is posted to the discussion thread as a separate reply.\n" +
35
38
  "If the task already contains a completed response with no new human input after it, `Answer` that no further action is needed.\n" +
36
39
  "Do not redo completed work.";
37
40
 
38
- const RESUME_TRIGGER_SCHEMA = z
39
- .object({
40
- kind: z.enum(["responses", "elapsed", "either"]),
41
- responses: z.number().optional(),
42
- elapsed: z.string().optional(),
43
- })
44
- .strict();
41
+ const RESUME_TRIGGER_SCHEMA = z.discriminatedUnion("kind", [
42
+ z
43
+ .object({
44
+ kind: z.literal("missing_input"),
45
+ replies: z.number().int().positive(),
46
+ })
47
+ .strict(),
48
+ z
49
+ .object({
50
+ kind: z.literal("escalation_needed"),
51
+ signal: z.string().min(1),
52
+ })
53
+ .strict(),
54
+ z
55
+ .object({
56
+ kind: z.literal("elapsed"),
57
+ elapsed: z.string().min(1),
58
+ })
59
+ .strict(),
60
+ ]);
45
61
 
46
62
  /** Discuss-mode lead tool server. */
47
63
  export function createDiscussLeadToolServer(ctx) {
@@ -49,13 +65,13 @@ export function createDiscussLeadToolServer(ctx) {
49
65
  ...baseTools(ctx, { from: "lead", defaultTo: undefined, broadcast: true }),
50
66
  tool(
51
67
  "Recess",
52
- "Suspend the run. The bridge re-dispatches the workflow when the trigger fires.",
68
+ RECESS_DESC,
53
69
  { reason: z.string(), trigger: RESUME_TRIGGER_SCHEMA },
54
70
  createRecessHandler(ctx),
55
71
  ),
56
72
  tool(
57
73
  "Adjourn",
58
- "End the discussion with a verdict ('adjourned' / 'failed') and a summary.",
74
+ ADJOURN_DESC,
59
75
  {
60
76
  verdict: z.enum(["adjourned", "failed"]),
61
77
  summary: z.string(),
@@ -83,6 +99,8 @@ export function createDiscussAgentToolServer(ctx, { from }) {
83
99
  */
84
100
  export function createRecessHandler(ctx) {
85
101
  return async ({ reason, trigger }) => {
102
+ const guard = requireNoPendingAsks(ctx);
103
+ if (guard) return guard;
86
104
  ctx.recessTrigger = trigger;
87
105
  concludeSession(ctx, {
88
106
  verdict: "recessed",
@@ -96,6 +114,8 @@ export function createRecessHandler(ctx) {
96
114
  /** Adjourn handler — ends the discussion with a verdict. */
97
115
  export function createAdjournHandler(ctx) {
98
116
  return async ({ verdict, summary, outcome }) => {
117
+ const guard = requireNoPendingAsks(ctx);
118
+ if (guard) return guard;
99
119
  if (outcome !== undefined) ctx.outcome = outcome;
100
120
  concludeSession(ctx, {
101
121
  verdict,
package/src/discusser.js CHANGED
@@ -36,10 +36,11 @@ export const DISCUSS_SYSTEM_PROMPT =
36
36
  "Use `Ask` to delegate work to the best-suited participant.\n" +
37
37
  "Participants are domain experts; state the task, not how to do it.\n" +
38
38
  "Each participant's `Answer` is posted to the discussion thread as a separate reply.\n" +
39
- "`Ask` returns {askIds:[N,…]} immediately.\n" +
40
- "Answers arrive on your next turn as `[answer#N] <participant>: <text>`.\n" +
41
- "Multiple `Ask` calls in one turn run participants concurrently.\n" +
42
- "Wait for all participants to `Answer` before calling `Adjourn` or `Recess`.";
39
+ "`Ask` is async and returns {askIds:[N,…]} immediately.\n" +
40
+ "Answers arrive on your next turn as `[answer#N] <participant>: <text>` in your inbox.\n" +
41
+ "End your turn while Asks are pending. The system resumes you when answers arrive.\n" +
42
+ "Multiple `Ask` calls in one turn run participants in parallel.\n" +
43
+ "End the discussion by calling `Adjourn` with a verdict and summary, or `Recess` only to wait on an external reply or duration.";
43
44
 
44
45
  /**
45
46
  * Augment a base orchestration context with discuss-mode fields.
@@ -25,15 +25,16 @@ export const FACILITATOR_SYSTEM_PROMPT =
25
25
  "Use `RollCall` to list participants.\n" +
26
26
  "Use `Ask` to delegate work to the best-suited participant.\n" +
27
27
  "Participants are domain experts; state the task, not how to do it.\n" +
28
- "`Ask` returns {askIds:[N,…]} immediately.\n" +
29
- "Answers arrive on your next turn as `[answer#N] <participant>: <text>`.\n" +
30
- "Multiple `Ask` calls in one turn run participants concurrently.\n" +
31
- "Wait for all participants to `Answer` before calling `Conclude`.";
28
+ "`Ask` is async and returns {askIds:[N,…]} immediately.\n" +
29
+ "Answers arrive on your next turn as `[answer#N] <participant>: <text>` in your inbox.\n" +
30
+ "End your turn while Asks are pending. The system resumes you when answers arrive.\n" +
31
+ "Multiple `Ask` calls in one turn run participants in parallel.\n" +
32
+ "End every session by calling `Conclude` with a verdict and summary.";
32
33
 
33
34
  /** System prompt for facilitated agent participants. L0 mechanics only per COALIGNED. */
34
35
  export const FACILITATED_AGENT_SYSTEM_PROMPT =
35
36
  "You are a participant in a facilitated session.\n" +
36
- "Each question arrives as `[ask#N] <name>: <text>`.\n" +
37
+ "Each question arrives as `[ask#N] <name>: <text>` in your inbox.\n" +
37
38
  "Quote N as askId on your `Answer` to route the reply correctly.\n" +
38
39
  "If the task already contains a completed response with no new human input after it, `Answer` that no further action is needed.\n" +
39
40
  "Do not redo completed work.";
@@ -46,9 +46,24 @@ export function createOrchestrationContext() {
46
46
 
47
47
  // --- Handler factories ---
48
48
 
49
+ /**
50
+ * Guard for terminal tools (`Conclude`, `Adjourn`, `Recess`). Returns an
51
+ * error result when the caller still has Asks in flight, telling them to
52
+ * end the turn and wait for the auto-resume. Returns `null` when no Asks
53
+ * are pending and the terminal tool is free to run.
54
+ */
55
+ export function requireNoPendingAsks(ctx) {
56
+ if (ctx.pendingAsks.size === 0) return null;
57
+ return errorResult(
58
+ "Asks are still pending. End your turn. You will be resumed when answers arrive.",
59
+ );
60
+ }
61
+
49
62
  /** Mark the session as concluded; cancel any open Asks so askers see the synthetic null on their next turn. */
50
63
  export function createConcludeHandler(ctx) {
51
64
  return async ({ verdict, summary }) => {
65
+ const guard = requireNoPendingAsks(ctx);
66
+ if (guard) return guard;
52
67
  concludeSession(ctx, { verdict, summary, reason: "session concluded" });
53
68
  return { content: [{ type: "text", text: "Session concluded." }] };
54
69
  };
@@ -235,8 +250,18 @@ const ANNOUNCE_DESC = "Broadcast a message with no reply expected.";
235
250
 
236
251
  const ROLLCALL_DESC = "List all participants in the session.";
237
252
 
253
+ // Terminal-tool descriptions. Each one ends the run. Group them so the
254
+ // contrast is visible: Conclude (success/failure), Adjourn (settled in
255
+ // thread), Recess (paused for out-of-session input). Each description
256
+ // leads with the cost.
238
257
  const CONCLUDE_DESC =
239
- "End the session with a verdict ('success' or 'failure') and a summary.";
258
+ "End the session. Provide a verdict ('success' or 'failure') and a summary.";
259
+
260
+ const ADJOURN_DESC =
261
+ "End the discussion. Provide a verdict ('adjourned' or 'failed') and a summary. Cancels any unanswered Asks.";
262
+
263
+ const RECESS_DESC =
264
+ "End the run and schedule an out-of-session re-dispatch. Cancels any unanswered Asks. Use only when waiting on an external reply or duration. Do not use to wait on in-flight Asks.";
240
265
 
241
266
  // --- Tool builders ---
242
267
 
@@ -244,6 +269,7 @@ const CONCLUDE_DESC =
244
269
  function textResult(text) {
245
270
  return { content: [{ type: "text", text }] };
246
271
  }
272
+ /** Build an MCP tool error result wrapping a single text message. */
247
273
  function errorResult(text) {
248
274
  return { content: [{ type: "text", text }], isError: true };
249
275
  }
@@ -391,4 +417,11 @@ function requestForCommentTool(ctx) {
391
417
 
392
418
  // Re-export the building blocks discuss-tools.js needs to assemble its
393
419
  // own lead tool surface (it has two extra terminal tools).
394
- export { baseTools, orchestrationServer, requestForCommentTool };
420
+ export {
421
+ ADJOURN_DESC,
422
+ baseTools,
423
+ errorResult,
424
+ orchestrationServer,
425
+ RECESS_DESC,
426
+ requestForCommentTool,
427
+ };
@@ -101,7 +101,8 @@ export function simplifyToolName(name) {
101
101
  *
102
102
  * Three branches, in priority order:
103
103
  * - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
104
- * `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
104
+ * `{` / `"` from the input (built-in tool hints stay free of JSON
105
+ * punctuation so readers see clean one-liners).
105
106
  * - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
106
107
  * single-line JSON; `{` and `"` intentionally appear so readers see
107
108
  * the actual MCP payload.
@@ -2,8 +2,7 @@
2
2
  * Turn renderer — maps a structured turn into formatted text lines.
3
3
  *
4
4
  * Shared by `TeeWriter.flushTurns()` (live stream) and
5
- * `TraceCollector.toText()` (offline replay) so both emit identical output
6
- * (spec 540).
5
+ * `TraceCollector.toText()` (offline replay) so both emit identical output.
7
6
  */
8
7
 
9
8
  import {
package/src/supervisor.js CHANGED
@@ -32,15 +32,16 @@ export const SUPERVISOR_SYSTEM_PROMPT =
32
32
  "You supervise one agent.\n" +
33
33
  "You have no tools to perform work yourself.\n" +
34
34
  "Use `Ask` to delegate work to the agent.\n" +
35
- "`Ask` returns {askIds:[N]} immediately.\n" +
36
- "The reply arrives on your next turn as `[answer#N] agent: <text>`.\n" +
35
+ "`Ask` is async and returns {askIds:[N]} immediately.\n" +
36
+ "The reply arrives on your next turn as `[answer#N] agent: <text>` in your inbox.\n" +
37
+ "End your turn while Asks are pending. The system resumes you when an answer arrives.\n" +
37
38
  "If the agent goes off-track, send a corrective `Ask`.\n" +
38
- "End every session by calling `Conclude`.";
39
+ "End every session by calling `Conclude` with a verdict and summary.";
39
40
 
40
41
  /** System prompt for the supervised agent. L0 mechanics only per COALIGNED. */
41
42
  export const AGENT_SYSTEM_PROMPT =
42
43
  "A supervisor directs your work.\n" +
43
- "Each question arrives as `[ask#N] supervisor: <text>`.\n" +
44
+ "Each question arrives as `[ask#N] supervisor: <text>` in your inbox.\n" +
44
45
  "Quote N as askId on your `Answer` to route the reply correctly.\n" +
45
46
  "If the task already contains a completed response with no new human input after it, `Answer` that no further action is needed.\n" +
46
47
  "Do not redo completed work.";
package/src/tee-writer.js CHANGED
@@ -9,7 +9,7 @@
9
9
  *
10
10
  * Human text rendering is delegated to the pure modules under `./render/`
11
11
  * so the live stream and the offline `TraceCollector.toText()` replay share
12
- * one formatting path (spec 540). The NDJSON going to `fileStream` is
12
+ * one formatting path. The NDJSON going to `fileStream` is
13
13
  * untouched — only what reaches `textStream` changes.
14
14
  *
15
15
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
@@ -67,10 +67,9 @@ export class TeeWriter extends Writable {
67
67
  }
68
68
 
69
69
  // Emit the trailing `--- Result: ... ---` footer — the one summary line
70
- // humans want (spec 540). This is the same tail TraceCollector.toText()
71
- // appends, so the live stream and the offline replay stay in sync
72
- // (spec 540 criterion #6). The superseded `--- Evaluation ... ---`
73
- // footer is gone in every mode.
70
+ // humans want. This is the same tail TraceCollector.toText()
71
+ // appends, so the live stream and the offline replay stay in sync.
72
+ // The superseded `--- Evaluation ... ---` footer is gone in every mode.
74
73
  if (this.collector.result) {
75
74
  const text = this.collector.toText();
76
75
  const idx = text.lastIndexOf("\n---");
@@ -78,7 +77,7 @@ export class TeeWriter extends Writable {
78
77
  // Slice past the leading `\n` — the previously-streamed body
79
78
  // already ended with its own newline, so re-emitting `\n---` here
80
79
  // would produce a blank line before the footer and desync from
81
- // the offline replay (spec 540 #6).
80
+ // the offline replay.
82
81
  this.textStream.write(text.slice(idx + 1) + "\n");
83
82
  }
84
83
  }
@@ -107,7 +106,8 @@ export class TeeWriter extends Writable {
107
106
  this.collector.addLine(line);
108
107
 
109
108
  // Orchestrator lifecycle events are suppressed from the text stream
110
- // entirely (spec 540). They still reached fileStream above.
109
+ // entirely — humans only want agent-visible content. They still
110
+ // reached fileStream above.
111
111
  if (
112
112
  parsed.source === "orchestrator" &&
113
113
  isSuppressedOrchestratorEvent(parsed.event)
@@ -118,7 +118,7 @@ export class TeeWriter extends Writable {
118
118
  return;
119
119
  }
120
120
 
121
- // Bare event (run mode pre-migration or direct feed)
121
+ // Bare event (unwrapped run mode line or direct feed)
122
122
  this.collector.addLine(line);
123
123
  this.flushTurns();
124
124
  }
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * Human text rendering is delegated to the pure modules under `./render/`
8
8
  * so the live `TeeWriter` stream and the offline `toText()` replay share
9
- * one formatting path (spec 540).
9
+ * one formatting path.
10
10
  */
11
11
 
12
12
  import { renderTurnLines } from "./render/turn-renderer.js";
@@ -293,7 +293,7 @@ export class TraceCollector {
293
293
  }
294
294
 
295
295
  /**
296
- * Format the trailing result summary line (spec 540). When an orchestrator
296
+ * Format the trailing result summary line. When an orchestrator
297
297
  * summary is present (supervised / facilitated mode), the headline word is
298
298
  * the supervisor's verdict ("success" / "failure") rather than the SDK's
299
299
  * per-runner subtype, so the footer aligns with the CI exit code.