@forwardimpact/libeval 0.1.27 → 0.1.28
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/bin/fit-eval.js +7 -7
- package/bin/fit-trace.js +7 -7
- package/package.json +12 -10
- package/src/agent-runner.js +1 -0
- package/src/facilitator.js +2 -0
- package/src/message-bus.js +1 -0
- package/src/orchestration-toolkit.js +3 -0
- package/src/orchestrator-helpers.js +1 -0
- package/src/sequence-counter.js +4 -0
- package/src/supervisor.js +4 -2
- package/src/tee-writer.js +1 -0
- package/src/trace-collector.js +1 -0
package/README.md
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# libeval
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
<!-- BEGIN:description — Do not edit. Generated from package.json. -->
|
|
4
|
+
|
|
5
|
+
Agent evaluation framework — prove whether agent changes improved outcomes with
|
|
6
|
+
reproducible evidence.
|
|
7
|
+
|
|
8
|
+
<!-- END:description -->
|
|
4
9
|
|
|
5
10
|
## Getting Started
|
|
6
11
|
|
package/bin/fit-eval.js
CHANGED
|
@@ -177,20 +177,20 @@ const definition = {
|
|
|
177
177
|
],
|
|
178
178
|
documentation: [
|
|
179
179
|
{
|
|
180
|
-
title: "
|
|
181
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
180
|
+
title: "Run an Eval",
|
|
181
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
|
|
182
182
|
description:
|
|
183
183
|
"Author a judge profile, run an eval locally, wire it into CI, and inspect the resulting trace.",
|
|
184
184
|
},
|
|
185
185
|
{
|
|
186
|
-
title: "Agent
|
|
187
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
186
|
+
title: "Prove Agent Changes",
|
|
187
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
|
|
188
188
|
description:
|
|
189
|
-
"
|
|
189
|
+
"End-to-end workflow from dataset generation through evaluation to trace analysis, including multi-agent collaboration sessions.",
|
|
190
190
|
},
|
|
191
191
|
{
|
|
192
|
-
title: "
|
|
193
|
-
url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
|
|
192
|
+
title: "Analyze Traces",
|
|
193
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
|
|
194
194
|
description:
|
|
195
195
|
"Read the NDJSON traces produced by `fit-eval` with `fit-trace` — grounded-theory method and worked examples.",
|
|
196
196
|
},
|
package/bin/fit-trace.js
CHANGED
|
@@ -214,22 +214,22 @@ const definition = {
|
|
|
214
214
|
],
|
|
215
215
|
documentation: [
|
|
216
216
|
{
|
|
217
|
-
title: "
|
|
218
|
-
url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
|
|
217
|
+
title: "Analyze Traces",
|
|
218
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
|
|
219
219
|
description:
|
|
220
220
|
"The full method walkthrough with worked examples (an eval that failed, a multi-agent session that stalled).",
|
|
221
221
|
},
|
|
222
222
|
{
|
|
223
|
-
title: "
|
|
224
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
223
|
+
title: "Run an Eval",
|
|
224
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
|
|
225
225
|
description:
|
|
226
226
|
"How `fit-eval supervise` produces the traces this skill analyzes.",
|
|
227
227
|
},
|
|
228
228
|
{
|
|
229
|
-
title: "Agent
|
|
230
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
229
|
+
title: "Prove Agent Changes",
|
|
230
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
|
|
231
231
|
description:
|
|
232
|
-
"
|
|
232
|
+
"End-to-end workflow including multi-agent collaboration; `split` is the bridge into per-source trace files.",
|
|
233
233
|
},
|
|
234
234
|
],
|
|
235
235
|
};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.
|
|
4
|
-
"description": "Agent evaluation
|
|
3
|
+
"version": "0.1.28",
|
|
4
|
+
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
7
7
|
"agent",
|
|
@@ -17,14 +17,16 @@
|
|
|
17
17
|
},
|
|
18
18
|
"license": "Apache-2.0",
|
|
19
19
|
"author": "D. Olsson <hi@senzilla.io>",
|
|
20
|
-
"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
|
|
27
|
-
|
|
20
|
+
"jobs": [
|
|
21
|
+
{
|
|
22
|
+
"user": "Platform Builders",
|
|
23
|
+
"goal": "Prove Agent Changes",
|
|
24
|
+
"trigger": "An eval passes locally but fails in CI and the only output is 'assertion failed.'",
|
|
25
|
+
"bigHire": "prove whether agent changes improved outcomes with reproducible evidence.",
|
|
26
|
+
"littleHire": "run an eval and get a trace that shows exactly what the agent did.",
|
|
27
|
+
"competesWith": "manual before/after comparison; trusting gut feeling over evidence; skipping evaluation entirely"
|
|
28
|
+
}
|
|
29
|
+
],
|
|
28
30
|
"type": "module",
|
|
29
31
|
"main": "./src/index.js",
|
|
30
32
|
"exports": {
|
package/src/agent-runner.js
CHANGED
package/src/facilitator.js
CHANGED
|
@@ -36,6 +36,7 @@ export const FACILITATED_AGENT_SYSTEM_PROMPT =
|
|
|
36
36
|
"Announce broadcasts a message. " +
|
|
37
37
|
"RollCall lists participants.";
|
|
38
38
|
|
|
39
|
+
/** Orchestrate N agent sessions coordinated by a single facilitator LLM session. */
|
|
39
40
|
export class Facilitator {
|
|
40
41
|
/**
|
|
41
42
|
* @param {object} deps
|
|
@@ -296,6 +297,7 @@ export class Facilitator {
|
|
|
296
297
|
}
|
|
297
298
|
}
|
|
298
299
|
|
|
300
|
+
/** Return the last assistant text block from a runner's buffer, or the fallback if none exists. */
|
|
299
301
|
extractLastText(runner, fallback) {
|
|
300
302
|
const lines = runner.buffer;
|
|
301
303
|
for (let i = lines.length - 1; i >= 0; i--) {
|
package/src/message-bus.js
CHANGED
|
@@ -37,6 +37,7 @@ export function createOrchestrationContext() {
|
|
|
37
37
|
|
|
38
38
|
// --- Handler factories ---
|
|
39
39
|
|
|
40
|
+
/** Create a handler that marks the session as concluded and records the summary. */
|
|
40
41
|
export function createConcludeHandler(ctx) {
|
|
41
42
|
return async ({ summary }) => {
|
|
42
43
|
ctx.concluded = true;
|
|
@@ -45,6 +46,7 @@ export function createConcludeHandler(ctx) {
|
|
|
45
46
|
};
|
|
46
47
|
}
|
|
47
48
|
|
|
49
|
+
/** Create a handler that queues a redirect to interrupt a participant with replacement instructions. */
|
|
48
50
|
export function createRedirectHandler(ctx) {
|
|
49
51
|
return async ({ message, to }) => {
|
|
50
52
|
ctx.redirect = { message, to: to ?? null };
|
|
@@ -52,6 +54,7 @@ export function createRedirectHandler(ctx) {
|
|
|
52
54
|
};
|
|
53
55
|
}
|
|
54
56
|
|
|
57
|
+
/** Create a handler that returns the list of all session participants and their roles. */
|
|
55
58
|
export function createRollCallHandler(ctx) {
|
|
56
59
|
return async () => {
|
|
57
60
|
return {
|
package/src/sequence-counter.js
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
* SequenceCounter — global monotonic counter shared across all participants
|
|
3
3
|
* in a session. Single-threaded JS means no synchronization needed.
|
|
4
4
|
*/
|
|
5
|
+
/** Monotonic counter that assigns globally ordered sequence numbers within a session. */
|
|
5
6
|
export class SequenceCounter {
|
|
7
|
+
/** Initialize the counter at zero. */
|
|
6
8
|
constructor() {
|
|
7
9
|
this.value = 0;
|
|
8
10
|
}
|
|
9
11
|
|
|
12
|
+
/** Return the current value and advance the counter by one. */
|
|
10
13
|
next() {
|
|
11
14
|
return this.value++;
|
|
12
15
|
}
|
|
13
16
|
}
|
|
14
17
|
|
|
18
|
+
/** Create a new SequenceCounter starting at zero. */
|
|
15
19
|
export function createSequenceCounter() {
|
|
16
20
|
return new SequenceCounter();
|
|
17
21
|
}
|
package/src/supervisor.js
CHANGED
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
* introduces itself, and delegates work to the agent. The loop then alternates:
|
|
5
5
|
* agent → supervisor → agent.
|
|
6
6
|
*
|
|
7
|
-
* Signaling uses orchestration tools (Ask /
|
|
8
|
-
*
|
|
7
|
+
* Signaling uses orchestration tools (Ask / Announce / Redirect / Conclude)
|
|
8
|
+
* via in-process MCP servers; the supervisor has no Answer tool — agent replies
|
|
9
|
+
* are routed back through the relay loop. The Ask/Answer contract is enforced
|
|
9
10
|
* at turn boundaries: an unanswered Ask triggers one synthetic reminder and
|
|
10
11
|
* then a `protocol_violation` trace event plus a null-answer injection so the
|
|
11
12
|
* session advances without silent deadlock.
|
|
@@ -52,6 +53,7 @@ export const AGENT_SYSTEM_PROMPT =
|
|
|
52
53
|
*/
|
|
53
54
|
const MAX_INTERVENTIONS_PER_TURN = 5;
|
|
54
55
|
|
|
56
|
+
/** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
|
|
55
57
|
export class Supervisor {
|
|
56
58
|
/**
|
|
57
59
|
* @param {object} deps
|
package/src/tee-writer.js
CHANGED
|
@@ -20,6 +20,7 @@ import { TraceCollector } from "./trace-collector.js";
|
|
|
20
20
|
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
21
21
|
import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
|
|
22
22
|
|
|
23
|
+
/** Writable stream that saves raw NDJSON to a file while streaming human-readable text to a display stream. */
|
|
23
24
|
export class TeeWriter extends Writable {
|
|
24
25
|
/**
|
|
25
26
|
* @param {object} deps
|
package/src/trace-collector.js
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
13
13
|
import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
|
|
14
14
|
|
|
15
|
+
/** Accumulate Claude Code NDJSON stream events into structured traces for analysis or text replay. */
|
|
15
16
|
export class TraceCollector {
|
|
16
17
|
/**
|
|
17
18
|
* @param {object} [deps]
|