@forwardimpact/libeval 0.1.50 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +36 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/apm-installer.js +48 -44
  8. package/src/benchmark/env-loader.js +35 -23
  9. package/src/benchmark/invariants.js +128 -0
  10. package/src/benchmark/judge.js +18 -19
  11. package/src/benchmark/npm-installer.js +33 -33
  12. package/src/benchmark/report.js +40 -26
  13. package/src/benchmark/result.js +11 -11
  14. package/src/benchmark/runner.js +90 -46
  15. package/src/benchmark/task-family.js +78 -65
  16. package/src/benchmark/workdir.js +100 -93
  17. package/src/commands/assert.js +30 -22
  18. package/src/commands/benchmark-invariants.js +74 -0
  19. package/src/commands/benchmark-report.js +24 -15
  20. package/src/commands/benchmark-run.js +16 -9
  21. package/src/commands/by-discussion.js +33 -23
  22. package/src/commands/callback.js +20 -11
  23. package/src/commands/discuss.js +31 -13
  24. package/src/commands/facilitate.js +21 -14
  25. package/src/commands/output.js +15 -13
  26. package/src/commands/run.js +28 -14
  27. package/src/commands/supervise.js +29 -19
  28. package/src/commands/task-input.js +10 -5
  29. package/src/commands/tee.js +24 -9
  30. package/src/commands/trace.js +181 -99
  31. package/src/discuss-tools.js +48 -2
  32. package/src/discusser.js +53 -2
  33. package/src/events/github.js +27 -5
  34. package/src/facilitator.js +4 -0
  35. package/src/inbox-poller.js +84 -0
  36. package/src/judge.js +4 -1
  37. package/src/message-bus.js +6 -0
  38. package/src/orchestration-loop.js +14 -4
  39. package/src/orchestration-toolkit.js +14 -0
  40. package/src/profile-prompt.js +22 -9
  41. package/src/redaction.js +31 -9
  42. package/src/reply-emitter.js +47 -0
  43. package/src/supervisor.js +4 -0
  44. package/src/tee-writer.js +4 -2
  45. package/src/trace-collector.js +9 -2
  46. package/src/trace-github.js +47 -27
  47. package/src/benchmark/scorer.js +0 -138
  48. package/src/commands/benchmark-score.js +0 -68
@@ -1,6 +1,6 @@
1
- import { createWriteStream } from "node:fs";
2
1
  import { Writable } from "node:stream";
3
2
  import { resolve } from "node:path";
3
+ import { isoTimestamp } from "@forwardimpact/libutil";
4
4
  import { createAgentRunner } from "../agent-runner.js";
5
5
  import { composeProfilePrompt } from "../profile-prompt.js";
6
6
  import { createRedactor } from "../redaction.js";
@@ -12,10 +12,14 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
12
12
  /**
13
13
  * Parse and validate run command options from parsed values.
14
14
  * @param {object} values - Parsed option values from cli.parse()
15
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
15
16
  * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
16
17
  */
17
- function parseRunOptions(values) {
18
- const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
18
+ function parseRunOptions(values, runtime) {
19
+ const { task: taskContent, amend: taskAmend } = resolveTaskContent(
20
+ values,
21
+ runtime,
22
+ );
19
23
  const maxTurnsRaw = values["max-turns"] ?? "50";
20
24
 
21
25
  return {
@@ -39,10 +43,11 @@ function parseRunOptions(values) {
39
43
  *
40
44
  * Usage: fit-eval run [options]
41
45
  *
42
- * @param {object} values - Parsed option values from cli.parse()
43
- * @param {string[]} args - Positional arguments
46
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
47
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
44
48
  */
45
- export async function runRunCommand(values, _args) {
49
+ export async function runRunCommand(ctx) {
50
+ const runtime = ctx.deps.runtime;
46
51
  const {
47
52
  taskContent,
48
53
  taskAmend,
@@ -53,19 +58,26 @@ export async function runRunCommand(values, _args) {
53
58
  agentProfile,
54
59
  allowedTools,
55
60
  mcpServer,
56
- } = parseRunOptions(values);
61
+ } = parseRunOptions(ctx.options, runtime);
57
62
 
58
63
  // Build the redactor as the first observable side-effect after option
59
64
  // parsing — the env snapshot must freeze BEFORE any in-process
60
- // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
61
- const redactor = createRedactor();
65
+ // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
66
+ const redactor = createRedactor({ runtime });
62
67
 
63
68
  // When --output is specified, stream text to stdout while writing NDJSON to file.
64
69
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
65
- const fileStream = outputPath ? createWriteStream(outputPath) : null;
70
+ const fileStream = outputPath
71
+ ? runtime.fs.createWriteStream(outputPath)
72
+ : null;
66
73
  const output = fileStream
67
- ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
68
- : process.stdout;
74
+ ? createTeeWriter({
75
+ fileStream,
76
+ textStream: runtime.proc.stdout,
77
+ mode: "raw",
78
+ now: () => isoTimestamp(runtime.clock.now()),
79
+ })
80
+ : runtime.proc.stdout;
69
81
 
70
82
  const counter = new SequenceCounter();
71
83
  const devNull = new Writable({
@@ -93,12 +105,13 @@ export async function runRunCommand(values, _args) {
93
105
  }
94
106
 
95
107
  if (agentProfile) {
96
- process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
108
+ runtime.proc.env.LIBEVAL_AGENT_PROFILE = agentProfile;
97
109
  }
98
110
 
99
111
  const systemPrompt = agentProfile
100
112
  ? composeProfilePrompt(agentProfile, {
101
113
  profilesDir: resolve(cwd, ".claude/agents"),
114
+ runtime,
102
115
  })
103
116
  : undefined;
104
117
 
@@ -116,6 +129,7 @@ export async function runRunCommand(values, _args) {
116
129
  taskAmend,
117
130
  mcpServers,
118
131
  redactor,
132
+ runtime,
119
133
  });
120
134
 
121
135
  const result = await runner.run(taskContent);
@@ -125,5 +139,5 @@ export async function runRunCommand(values, _args) {
125
139
  await new Promise((r) => fileStream.end(r));
126
140
  }
127
141
 
128
- process.exit(result.success ? 0 : 1);
142
+ return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
129
143
  }
@@ -1,6 +1,5 @@
1
- import { createWriteStream, mkdtempSync } from "node:fs";
2
1
  import { resolve, join } from "node:path";
3
- import { tmpdir } from "node:os";
2
+ import { isoTimestamp } from "@forwardimpact/libutil";
4
3
  import { createSupervisor } from "../supervisor.js";
5
4
  import { createRedactor } from "../redaction.js";
6
5
  import { createTeeWriter } from "../tee-writer.js";
@@ -10,19 +9,27 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
10
9
  /**
11
10
  * Parse all supervise flags from parsed values into an options object.
12
11
  * @param {object} values - Parsed option values from cli.parse()
13
- * @returns {object}
12
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
13
+ * @returns {Promise<object>}
14
14
  */
15
- export function parseSuperviseOptions(values) {
16
- const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
15
+ export async function parseSuperviseOptions(values, runtime) {
16
+ const { task: taskContent, amend: taskAmend } = resolveTaskContent(
17
+ values,
18
+ runtime,
19
+ );
17
20
  const supervisorAllowedToolsRaw = values["supervisor-allowed-tools"];
18
21
 
22
+ const tmpRoot = runtime.proc.env.TMPDIR ?? "/tmp";
23
+ const agentCwd = resolve(
24
+ values["agent-cwd"] ??
25
+ (await runtime.fs.mkdtemp(join(tmpRoot, "fit-eval-agent-"))),
26
+ );
27
+
19
28
  return {
20
29
  taskContent,
21
30
  taskAmend,
22
31
  supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
23
- agentCwd: resolve(
24
- values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
25
- ),
32
+ agentCwd,
26
33
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
27
34
  supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
28
35
  maxTurns: (() => {
@@ -50,29 +57,31 @@ export function parseSuperviseOptions(values) {
50
57
  *
51
58
  * Usage: fit-eval supervise [options]
52
59
  *
53
- * @param {object} values - Parsed option values from cli.parse()
54
- * @param {string[]} args - Positional arguments
60
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
61
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
55
62
  */
56
- export async function runSuperviseCommand(values, _args) {
57
- const opts = parseSuperviseOptions(values);
63
+ export async function runSuperviseCommand(ctx) {
64
+ const runtime = ctx.deps.runtime;
65
+ const opts = await parseSuperviseOptions(ctx.options, runtime);
58
66
 
59
67
  // Build the redactor as the first observable side-effect after option
60
68
  // parsing — the env snapshot must freeze BEFORE any in-process
61
- // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
62
- const redactor = createRedactor();
69
+ // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
70
+ const redactor = createRedactor({ runtime });
63
71
 
64
72
  // When --output is specified, stream text to stdout while writing NDJSON to file.
65
73
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
66
74
  const fileStream = opts.outputPath
67
- ? createWriteStream(opts.outputPath)
75
+ ? runtime.fs.createWriteStream(opts.outputPath)
68
76
  : null;
69
77
  const output = fileStream
70
78
  ? createTeeWriter({
71
79
  fileStream,
72
- textStream: process.stdout,
80
+ textStream: runtime.proc.stdout,
73
81
  mode: "supervised",
82
+ now: () => isoTimestamp(runtime.clock.now()),
74
83
  })
75
- : process.stdout;
84
+ : runtime.proc.stdout;
76
85
 
77
86
  let agentMcpServers = null;
78
87
  if (opts.mcpServer) {
@@ -88,7 +97,7 @@ export async function runSuperviseCommand(values, _args) {
88
97
  }
89
98
 
90
99
  if (opts.agentProfile) {
91
- process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
100
+ runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
92
101
  }
93
102
 
94
103
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -107,6 +116,7 @@ export async function runSuperviseCommand(values, _args) {
107
116
  taskAmend: opts.taskAmend,
108
117
  agentMcpServers,
109
118
  redactor,
119
+ runtime,
110
120
  });
111
121
 
112
122
  const result = await supervisor.run(opts.taskContent);
@@ -116,5 +126,5 @@ export async function runSuperviseCommand(values, _args) {
116
126
  await new Promise((r) => fileStream.end(r));
117
127
  }
118
128
 
119
- process.exit(result.success ? 0 : 1);
129
+ return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
120
130
  }
@@ -1,4 +1,3 @@
1
- import { readFileSync } from "node:fs";
2
1
  import { composeTaskFromGitHubEvent } from "../events/github.js";
3
2
 
4
3
  /**
@@ -11,9 +10,12 @@ import { composeTaskFromGitHubEvent } from "../events/github.js";
11
10
  * works as before.
12
11
  *
13
12
  * @param {object} values - Parsed option values from cli.parse()
13
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
14
+ * collaborators; `fsSync.readFileSync` loads `--task-file`/`--task-event`
15
+ * and `proc.env` resolves `GITHUB_EVENT_NAME`.
14
16
  * @returns {{ task: string, amend: string | undefined }}
15
17
  */
16
- export function resolveTaskContent(values) {
18
+ export function resolveTaskContent(values, runtime) {
17
19
  const taskFile = values["task-file"];
18
20
  const taskText = values["task-text"];
19
21
  const taskEvent = values["task-event"];
@@ -33,17 +35,20 @@ export function resolveTaskContent(values) {
33
35
  const amendFlag = values["task-amend"] ?? undefined;
34
36
 
35
37
  if (taskFile) {
36
- return { task: readFileSync(taskFile, "utf8"), amend: amendFlag };
38
+ return {
39
+ task: runtime.fsSync.readFileSync(taskFile, "utf8"),
40
+ amend: amendFlag,
41
+ };
37
42
  }
38
43
  if (taskText) {
39
44
  return { task: taskText, amend: amendFlag };
40
45
  }
41
46
 
42
- const eventName = process.env.GITHUB_EVENT_NAME;
47
+ const eventName = runtime.proc.env.GITHUB_EVENT_NAME;
43
48
  if (!eventName) {
44
49
  throw new Error("--task-event requires GITHUB_EVENT_NAME to be set");
45
50
  }
46
- const payload = JSON.parse(readFileSync(taskEvent, "utf8"));
51
+ const payload = JSON.parse(runtime.fsSync.readFileSync(taskEvent, "utf8"));
47
52
  const composed = composeTaskFromGitHubEvent(payload, eventName);
48
53
  return { task: composed.task, amend: amendFlag ?? composed.amend };
49
54
  }
@@ -1,32 +1,47 @@
1
- import { createWriteStream } from "fs";
2
1
  import { PassThrough } from "node:stream";
3
2
  import { pipeline } from "node:stream/promises";
3
+ import { isoTimestamp } from "@forwardimpact/libutil";
4
4
  import { createTeeWriter } from "../tee-writer.js";
5
5
 
6
6
  /**
7
7
  * Tee command — stream text output to stdout while optionally saving the raw
8
- * NDJSON to a file. Processes stdin line-by-line for streaming output.
8
+ * NDJSON to a file. Reads stdin line-by-line through the injected runtime and
9
+ * re-delimits each record with a newline so the TeeWriter's line splitter sees
10
+ * the same framing the raw byte stream produced.
9
11
  *
10
12
  * Usage: fit-eval tee [output.ndjson] < trace.ndjson
11
13
  *
12
- * @param {object} values - Parsed option values from cli.parse()
13
- * @param {string[]} args - Positional arguments (optional output file path)
14
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
15
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
14
16
  */
15
- export async function runTeeCommand(values, args) {
16
- const outputPath = args.find((a) => !a.startsWith("-")) ?? null;
17
- const fileStream = outputPath ? createWriteStream(outputPath) : null;
17
+ export async function runTeeCommand(ctx) {
18
+ const runtime = ctx.deps.runtime;
19
+ const outputPath = ctx.args.output ?? null;
20
+ const fileStream = outputPath
21
+ ? runtime.fs.createWriteStream(outputPath)
22
+ : null;
18
23
 
19
24
  // TeeWriter requires a fileStream; when no output file is specified,
20
25
  // use a PassThrough as a no-op sink (NDJSON is not saved).
21
26
  const sink = fileStream ?? new PassThrough();
22
27
  const tee = createTeeWriter({
23
28
  fileStream: sink,
24
- textStream: process.stdout,
29
+ textStream: runtime.proc.stdout,
25
30
  mode: "raw",
31
+ now: () => isoTimestamp(runtime.clock.now()),
26
32
  });
27
33
 
28
34
  try {
29
- await pipeline(process.stdin, tee);
35
+ // `runtime.proc.stdin` yields newline-stripped lines; re-append `\n` so the
36
+ // TeeWriter's `_write` line splitter frames records exactly as it did when
37
+ // the raw byte stream was piped into it directly.
38
+ const lines = (async function* () {
39
+ for await (const line of runtime.proc.stdin) yield `${line}\n`;
40
+ })();
41
+ await pipeline(lines, tee);
42
+ return { ok: true };
43
+ } catch (error) {
44
+ return { ok: false, code: 1, error: error.message };
30
45
  } finally {
31
46
  if (fileStream) {
32
47
  await new Promise((resolve, reject) => {