opencode-swarm-plugin 0.38.0 → 0.39.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +11 -0
- package/.hive/memories.jsonl +23 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/CHANGELOG.md +130 -0
- package/README.md +29 -12
- package/bin/swarm.test.ts +475 -0
- package/bin/swarm.ts +383 -0
- package/dist/compaction-hook.d.ts +1 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +81 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +370 -13
- package/dist/plugin.js +203 -13
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +4 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +589 -105
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
- package/evals/scorers/coordinator-discipline.ts +13 -13
- package/examples/plugin-wrapper-template.ts +117 -0
- package/package.json +7 -5
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +42 -0
- package/src/compaction-hook.ts +81 -0
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +422 -0
- package/src/eval-capture.ts +94 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/index.ts +61 -1
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.ts +2 -2
- package/src/swarm-prompts.ts +2 -2
- package/src/swarm-review.ts +3 -3
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for data-loader quality filters
|
|
3
|
+
*
|
|
4
|
+
* TDD approach: RED → GREEN → REFACTOR
|
|
5
|
+
*/
|
|
6
|
+
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
7
|
+
import * as fs from "node:fs";
|
|
8
|
+
import * as os from "node:os";
|
|
9
|
+
import * as path from "node:path";
|
|
10
|
+
import type { CoordinatorEvent } from "../../src/eval-capture.js";
|
|
11
|
+
import { loadCapturedSessions } from "./data-loader.js";
|
|
12
|
+
|
|
13
|
+
// Scratch directory that holds the session JSONL fixtures of the current test.
let tempSessionDir: string;

// Every test starts with its own freshly created temp directory.
beforeEach(() => {
  const prefix = path.join(os.tmpdir(), "test-sessions-");
  tempSessionDir = fs.mkdtempSync(prefix);
});

// Remove the scratch directory and its contents, if it is still present.
afterEach(() => {
  if (!fs.existsSync(tempSessionDir)) return;
  fs.rmSync(tempSessionDir, { recursive: true });
});
|
|
25
|
+
|
|
26
|
+
/**
 * Helper: write a session fixture to `<tempSessionDir>/<sessionId>.jsonl`.
 *
 * Each event is serialized with JSON.stringify onto its own line, and the
 * file always ends with a trailing newline.
 *
 * @param sessionId - Base name of the JSONL file to write
 * @param events - Events to serialize, in the order given
 */
function createSessionFile(
  sessionId: string,
  events: CoordinatorEvent[],
): void {
  const target = path.join(tempSessionDir, `${sessionId}.jsonl`);
  const serialized = events.map((event) => JSON.stringify(event));
  fs.writeFileSync(target, `${serialized.join("\n")}\n`, "utf-8");
}
|
|
37
|
+
|
|
38
|
+
/**
 * Helper: build a minimal event of the requested kind.
 *
 * Every event carries the given session and epic IDs, the current time as an
 * ISO timestamp and an empty payload. `subtype` is stored under
 * `decision_type`, `violation_type` or `outcome_type` depending on `type`;
 * it is cast without validation, so callers must pass a meaningful value.
 *
 * @param sessionId - Value for `session_id`
 * @param epicId - Value for `epic_id`
 * @param type - Which event variant to build
 * @param subtype - Variant-specific subtype string
 * @returns The constructed event
 */
function createEvent(
  sessionId: string,
  epicId: string,
  type: "DECISION" | "VIOLATION" | "OUTCOME",
  subtype: string,
): CoordinatorEvent {
  const common = {
    session_id: sessionId,
    epic_id: epicId,
    timestamp: new Date().toISOString(),
    payload: {},
  };

  switch (type) {
    case "DECISION":
      return {
        ...common,
        event_type: "DECISION" as const,
        decision_type: subtype as any,
      };
    case "VIOLATION":
      return {
        ...common,
        event_type: "VIOLATION" as const,
        violation_type: subtype as any,
      };
    default:
      // Anything that is not a decision or violation is built as an outcome.
      return {
        ...common,
        event_type: "OUTCOME" as const,
        outcome_type: subtype as any,
      };
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Quality-filter behavior of loadCapturedSessions, exercised against JSONL
 * fixtures written into the per-test temp directory (passed as `sessionDir`).
 */
describe("loadCapturedSessions - quality filters", () => {
  test("filters out sessions with fewer than minEvents (default: 3)", async () => {
    // Sessions with 2, 3 and 5 events; the 3- and 5-event ones also contain
    // worker_spawned and review_completed so the other filters let them through.
    createSessionFile("session-2-events", [
      createEvent("session-2-events", "epic-1", "DECISION", "worker_spawned"),
      createEvent("session-2-events", "epic-1", "OUTCOME", "subtask_success"),
    ]);

    createSessionFile("session-3-events", [
      createEvent("session-3-events", "epic-2", "DECISION", "worker_spawned"),
      createEvent("session-3-events", "epic-2", "DECISION", "review_completed"),
      createEvent("session-3-events", "epic-2", "OUTCOME", "subtask_success"),
    ]);

    createSessionFile("session-5-events", [
      createEvent("session-5-events", "epic-3", "DECISION", "worker_spawned"),
      createEvent("session-5-events", "epic-3", "DECISION", "review_completed"),
      createEvent("session-5-events", "epic-3", "OUTCOME", "subtask_success"),
      createEvent("session-5-events", "epic-3", "OUTCOME", "subtask_success"),
      createEvent("session-5-events", "epic-3", "OUTCOME", "epic_complete"),
    ]);

    const sessions = await loadCapturedSessions({
      minEvents: 3,
      sessionDir: tempSessionDir,
    });

    // Should only get sessions with >= 3 events
    expect(sessions.length).toBe(2);
    expect(
      sessions.some((s) => s.session.session_id === "session-3-events"),
    ).toBe(true);
    expect(
      sessions.some((s) => s.session.session_id === "session-5-events"),
    ).toBe(true);
    expect(
      sessions.some((s) => s.session.session_id === "session-2-events"),
    ).toBe(false);
  });

  test("filters out sessions without worker_spawned event when requireWorkerSpawn=true", async () => {
    // Session WITH worker_spawned
    createSessionFile("session-with-spawn", [
      createEvent("session-with-spawn", "epic-1", "DECISION", "worker_spawned"),
      createEvent("session-with-spawn", "epic-1", "DECISION", "review_completed"),
      createEvent("session-with-spawn", "epic-1", "OUTCOME", "subtask_success"),
    ]);

    // Session WITHOUT worker_spawned
    createSessionFile("session-no-spawn", [
      createEvent("session-no-spawn", "epic-2", "DECISION", "strategy_selected"),
      createEvent("session-no-spawn", "epic-2", "DECISION", "decomposition_complete"),
      createEvent("session-no-spawn", "epic-2", "OUTCOME", "epic_complete"),
    ]);

    const sessions = await loadCapturedSessions({
      requireWorkerSpawn: true,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-with-spawn");
  });

  test("filters out sessions without review_completed event when requireReview=true", async () => {
    // Session WITH review
    createSessionFile("session-with-review", [
      createEvent("session-with-review", "epic-1", "DECISION", "worker_spawned"),
      createEvent("session-with-review", "epic-1", "DECISION", "review_completed"),
      createEvent("session-with-review", "epic-1", "OUTCOME", "subtask_success"),
    ]);

    // Session WITHOUT review
    createSessionFile("session-no-review", [
      createEvent("session-no-review", "epic-2", "DECISION", "worker_spawned"),
      createEvent("session-no-review", "epic-2", "OUTCOME", "subtask_success"),
      createEvent("session-no-review", "epic-2", "OUTCOME", "epic_complete"),
    ]);

    const sessions = await loadCapturedSessions({
      requireReview: true,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-with-review");
  });

  test("allows disabling filters individually", async () => {
    // Session with only 2 events, no worker_spawned, no review
    createSessionFile("session-low-quality", [
      createEvent("session-low-quality", "epic-1", "DECISION", "strategy_selected"),
      createEvent("session-low-quality", "epic-1", "OUTCOME", "epic_complete"),
    ]);

    // Disable all filters
    const sessions = await loadCapturedSessions({
      minEvents: 0,
      requireWorkerSpawn: false,
      requireReview: false,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-low-quality");
  });

  test("applies limit AFTER filtering", async () => {
    const lowQualityPrefix = "session-0-low-";

    // Create 3 low-quality sessions (will be filtered out). Their names sort
    // BEFORE the high-quality ones, so a loader that walks files in name
    // order and applied the limit before filtering would return too few
    // sessions. NOTE(review): directory listing order is not visible here;
    // confirm the loader sorts files if this ordering should be relied on.
    for (let i = 6; i <= 8; i++) {
      createSessionFile(`${lowQualityPrefix}${i}`, [
        createEvent(
          `${lowQualityPrefix}${i}`,
          `epic-${i}`,
          "DECISION",
          "strategy_selected",
        ),
      ]);
    }

    // Create 5 high-quality sessions
    for (let i = 1; i <= 5; i++) {
      createSessionFile(`session-${i}`, [
        createEvent(`session-${i}`, `epic-${i}`, "DECISION", "worker_spawned"),
        createEvent(`session-${i}`, `epic-${i}`, "DECISION", "review_completed"),
        createEvent(`session-${i}`, `epic-${i}`, "OUTCOME", "subtask_success"),
      ]);
    }

    // Filter first (remove 3 low-quality), then limit to 2
    const sessions = await loadCapturedSessions({
      minEvents: 3,
      requireWorkerSpawn: true,
      requireReview: true,
      limit: 2,
      sessionDir: tempSessionDir,
    });

    // Should get 2 sessions from the 5 high-quality ones
    expect(sessions.length).toBe(2);
    expect(sessions.every((s) => s.session.events.length >= 3)).toBe(true);
    expect(
      sessions.every(
        (s) => !s.session.session_id.startsWith(lowQualityPrefix),
      ),
    ).toBe(true);
  });

  test("combines all filters correctly", async () => {
    // High-quality session (passes all filters)
    createSessionFile("session-high-quality", [
      createEvent("session-high-quality", "epic-1", "DECISION", "worker_spawned"),
      createEvent("session-high-quality", "epic-1", "DECISION", "review_completed"),
      createEvent("session-high-quality", "epic-1", "OUTCOME", "subtask_success"),
      createEvent("session-high-quality", "epic-1", "OUTCOME", "epic_complete"),
    ]);

    // Missing worker_spawned
    createSessionFile("session-no-spawn", [
      createEvent("session-no-spawn", "epic-2", "DECISION", "review_completed"),
      createEvent("session-no-spawn", "epic-2", "OUTCOME", "subtask_success"),
      createEvent("session-no-spawn", "epic-2", "OUTCOME", "epic_complete"),
    ]);

    // Missing review_completed
    createSessionFile("session-no-review", [
      createEvent("session-no-review", "epic-3", "DECISION", "worker_spawned"),
      createEvent("session-no-review", "epic-3", "OUTCOME", "subtask_success"),
      createEvent("session-no-review", "epic-3", "OUTCOME", "epic_complete"),
    ]);

    // Too few events
    createSessionFile("session-too-few", [
      createEvent("session-too-few", "epic-4", "DECISION", "worker_spawned"),
      createEvent("session-too-few", "epic-4", "DECISION", "review_completed"),
    ]);

    const sessions = await loadCapturedSessions({
      minEvents: 3,
      requireWorkerSpawn: true,
      requireReview: true,
      sessionDir: tempSessionDir,
    });

    // Only high-quality session should pass
    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-high-quality");
  });

  test("defaults are: minEvents=3, requireWorkerSpawn=true, requireReview=true", async () => {
    // Create one session that meets defaults
    createSessionFile("session-meets-defaults", [
      createEvent("session-meets-defaults", "epic-1", "DECISION", "worker_spawned"),
      createEvent("session-meets-defaults", "epic-1", "DECISION", "review_completed"),
      createEvent("session-meets-defaults", "epic-1", "OUTCOME", "subtask_success"),
    ]);

    // Create one that doesn't
    createSessionFile("session-fails-defaults", [
      createEvent("session-fails-defaults", "epic-2", "DECISION", "strategy_selected"),
    ]);

    // Call with NO options except sessionDir - should use defaults
    const sessions = await loadCapturedSessions({
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-meets-defaults");
  });
});
|
package/evals/lib/data-loader.ts
CHANGED
|
@@ -111,24 +111,86 @@ export async function getEvalDataSummary(
|
|
|
111
111
|
};
|
|
112
112
|
}
|
|
113
113
|
|
|
114
|
+
/**
 * Decide whether a captured session passes the quality criteria.
 *
 * A session is accepted only when every enabled check holds:
 * - it has at least `criteria.minEvents` events;
 * - when `criteria.requireWorkerSpawn` is set, at least one DECISION event
 *   has `decision_type === "worker_spawned"`;
 * - when `criteria.requireReview` is set, at least one DECISION event has
 *   `decision_type === "review_completed"`.
 *
 * @param session - Session whose `events` are inspected
 * @param criteria - Thresholds and toggles for the individual checks
 * @returns `true` when the session passes all enabled checks
 */
function meetsQualityCriteria(
  session: import("../../src/eval-capture.js").CoordinatorSession,
  criteria: {
    minEvents: number;
    requireWorkerSpawn: boolean;
    requireReview: boolean;
  },
): boolean {
  const { events } = session;

  // Only DECISION events count; other event types never satisfy a check.
  const hasDecision = (decisionType: string): boolean =>
    events.some(
      (event) =>
        event.event_type === "DECISION" &&
        event.decision_type === decisionType,
    );

  // Check 1: enough events
  if (events.length < criteria.minEvents) return false;

  // Check 2: delegated to at least one worker
  if (criteria.requireWorkerSpawn && !hasDecision("worker_spawned")) {
    return false;
  }

  // Check 3: at least one completed review
  if (criteria.requireReview && !hasDecision("review_completed")) {
    return false;
  }

  return true;
}
|
|
153
|
+
|
|
114
154
|
/**
|
|
115
155
|
 * Load captured coordinator sessions from ~/.config/swarm-tools/sessions/ (or from `options.sessionDir` when provided)
|
|
116
156
|
*
|
|
117
157
|
* Reads all JSONL session files and returns CoordinatorSession objects.
|
|
118
158
|
*
|
|
159
|
+
* Quality filters are applied to focus on high-signal coordinator sessions:
|
|
160
|
+
* - minEvents: Filter out incomplete/aborted sessions (default: 3)
|
|
161
|
+
* - requireWorkerSpawn: Ensure session delegated to workers (default: true)
|
|
162
|
+
* - requireReview: Ensure coordinator reviewed work (default: true)
|
|
163
|
+
*
|
|
164
|
+
* Filters are applied BEFORE the limit for accurate sampling.
|
|
165
|
+
*
|
|
119
166
|
* @param options - Filter options
|
|
120
|
-
* @returns Array of coordinator sessions
|
|
167
|
+
* @returns Array of coordinator sessions that meet quality criteria
|
|
121
168
|
*/
|
|
122
169
|
export async function loadCapturedSessions(options?: {
|
|
123
170
|
sessionIds?: string[];
|
|
124
171
|
limit?: number;
|
|
172
|
+
/** Minimum number of events required (default: 3) */
|
|
173
|
+
minEvents?: number;
|
|
174
|
+
/** Require at least one worker_spawned event (default: true) */
|
|
175
|
+
requireWorkerSpawn?: boolean;
|
|
176
|
+
/** Require at least one review_completed event (default: true) */
|
|
177
|
+
requireReview?: boolean;
|
|
178
|
+
/** Override session directory for testing */
|
|
179
|
+
sessionDir?: string;
|
|
125
180
|
}): Promise<
|
|
126
181
|
Array<{ session: import("../../src/eval-capture.js").CoordinatorSession }>
|
|
127
182
|
> {
|
|
128
183
|
const { getSessionDir, readSessionEvents, saveSession } = await import(
|
|
129
184
|
"../../src/eval-capture.js"
|
|
130
185
|
);
|
|
131
|
-
const sessionDir = getSessionDir();
|
|
186
|
+
const sessionDir = options?.sessionDir ?? getSessionDir();
|
|
187
|
+
|
|
188
|
+
// Default quality filters
|
|
189
|
+
const qualityCriteria = {
|
|
190
|
+
minEvents: options?.minEvents ?? 3,
|
|
191
|
+
requireWorkerSpawn: options?.requireWorkerSpawn ?? true,
|
|
192
|
+
requireReview: options?.requireReview ?? true,
|
|
193
|
+
};
|
|
132
194
|
|
|
133
195
|
// If session dir doesn't exist, return empty
|
|
134
196
|
if (!fs.existsSync(sessionDir)) {
|
|
@@ -149,32 +211,71 @@ export async function loadCapturedSessions(options?: {
|
|
|
149
211
|
const sessions: Array<{
|
|
150
212
|
session: import("../../src/eval-capture.js").CoordinatorSession;
|
|
151
213
|
}> = [];
|
|
214
|
+
let filteredOutCount = 0;
|
|
152
215
|
|
|
153
216
|
for (const file of targetFiles) {
|
|
154
217
|
const sessionId = file.replace(".jsonl", "");
|
|
155
218
|
|
|
156
219
|
try {
|
|
157
|
-
|
|
220
|
+
let events: import("../../src/eval-capture.js").CoordinatorEvent[];
|
|
221
|
+
|
|
222
|
+
// If custom sessionDir, read directly; otherwise use eval-capture functions
|
|
223
|
+
if (options?.sessionDir) {
|
|
224
|
+
const sessionPath = `${sessionDir}/${sessionId}.jsonl`;
|
|
225
|
+
if (!fs.existsSync(sessionPath)) continue;
|
|
226
|
+
|
|
227
|
+
const content = fs.readFileSync(sessionPath, "utf-8");
|
|
228
|
+
const lines = content.trim().split("\n").filter(Boolean);
|
|
229
|
+
const { CoordinatorEventSchema } = await import(
|
|
230
|
+
"../../src/eval-capture.js"
|
|
231
|
+
);
|
|
232
|
+
events = lines.map((line) => {
|
|
233
|
+
const parsed = JSON.parse(line);
|
|
234
|
+
return CoordinatorEventSchema.parse(parsed);
|
|
235
|
+
});
|
|
236
|
+
} else {
|
|
237
|
+
events = readSessionEvents(sessionId);
|
|
238
|
+
}
|
|
239
|
+
|
|
158
240
|
if (events.length === 0) continue;
|
|
159
241
|
|
|
160
242
|
// Find epic_id from first event
|
|
161
243
|
const epicId = events[0]?.epic_id;
|
|
162
244
|
if (!epicId) continue;
|
|
163
245
|
|
|
164
|
-
|
|
165
|
-
|
|
246
|
+
// Build session object
|
|
247
|
+
const session: import("../../src/eval-capture.js").CoordinatorSession = {
|
|
248
|
+
session_id: sessionId,
|
|
249
|
+
epic_id: epicId,
|
|
250
|
+
start_time: events[0]?.timestamp ?? new Date().toISOString(),
|
|
251
|
+
end_time: events[events.length - 1]?.timestamp,
|
|
252
|
+
events,
|
|
253
|
+
};
|
|
254
|
+
if (!session) continue;
|
|
255
|
+
|
|
256
|
+
// Apply quality filters BEFORE limit
|
|
257
|
+
if (meetsQualityCriteria(session, qualityCriteria)) {
|
|
166
258
|
sessions.push({ session });
|
|
259
|
+
} else {
|
|
260
|
+
filteredOutCount++;
|
|
167
261
|
}
|
|
168
262
|
} catch (error) {
|
|
169
263
|
// Skip invalid sessions
|
|
170
264
|
console.warn(`Failed to load session ${sessionId}:`, error);
|
|
171
265
|
}
|
|
172
266
|
|
|
173
|
-
// Apply limit
|
|
267
|
+
// Apply limit AFTER filtering
|
|
174
268
|
if (options?.limit && sessions.length >= options.limit) {
|
|
175
269
|
break;
|
|
176
270
|
}
|
|
177
271
|
}
|
|
178
272
|
|
|
273
|
+
// Log filtering stats for visibility
|
|
274
|
+
if (filteredOutCount > 0) {
|
|
275
|
+
console.log(
|
|
276
|
+
`Filtered out ${filteredOutCount} sessions (minEvents=${qualityCriteria.minEvents}, requireWorkerSpawn=${qualityCriteria.requireWorkerSpawn}, requireReview=${qualityCriteria.requireReview})`,
|
|
277
|
+
);
|
|
278
|
+
}
|
|
279
|
+
|
|
179
280
|
return sessions;
|
|
180
281
|
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compaction Prompt Quality Scorers - Evalite Wrappers
|
|
3
|
+
*
|
|
4
|
+
* These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
|
|
5
|
+
* for use with evalite's test runner.
|
|
6
|
+
*
|
|
7
|
+
* Weighted scoring:
|
|
8
|
+
* - epicIdSpecificity (0.20) - real IDs not placeholders
|
|
9
|
+
* - actionability (0.20) - swarm_status/inbox with real values
|
|
10
|
+
* - coordinatorIdentity (0.25) - ASCII header + strong mandates
|
|
11
|
+
* - forbiddenToolsPresent (0.15) - lists forbidden tools by name
|
|
12
|
+
* - postCompactionDiscipline (0.20) - first tool correct, no edit/write
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { createScorer } from "evalite";
|
|
16
|
+
import type { CompactionPrompt } from "../../src/compaction-prompt-scoring.js";
|
|
17
|
+
import {
|
|
18
|
+
scoreActionability,
|
|
19
|
+
scoreCoordinatorIdentity,
|
|
20
|
+
scoreEpicIdSpecificity,
|
|
21
|
+
scoreForbiddenToolsPresent,
|
|
22
|
+
scorePostCompactionDiscipline,
|
|
23
|
+
} from "../../src/compaction-prompt-scoring.js";
|
|
24
|
+
|
|
25
|
+
// Re-export types for convenience
|
|
26
|
+
export type { CompactionPrompt, ScorerResult } from "../../src/compaction-prompt-scoring.js";
|
|
27
|
+
|
|
28
|
+
// Re-export pure functions for direct use
|
|
29
|
+
export {
|
|
30
|
+
scoreActionability,
|
|
31
|
+
scoreCoordinatorIdentity,
|
|
32
|
+
scoreEpicIdSpecificity,
|
|
33
|
+
scoreForbiddenToolsPresent,
|
|
34
|
+
scorePostCompactionDiscipline,
|
|
35
|
+
} from "../../src/compaction-prompt-scoring.js";
|
|
36
|
+
|
|
37
|
+
/**
 * Shared body of every scorer in this file.
 *
 * Converts `output` to a string, parses it as a JSON-encoded CompactionPrompt
 * and passes the result to `score`. Anything thrown along the way (by the
 * parse or by `score` itself) is turned into a zero score whose message
 * carries the error text.
 *
 * @param output - Raw scorer output; expected to be a JSON string
 * @param score - Pure scoring function applied to the parsed prompt
 * @returns The result of `score`, or `{ score: 0, message }` on failure
 */
function scoreParsedPrompt<TResult>(
  output: unknown,
  score: (prompt: CompactionPrompt) => TResult,
): TResult | { score: number; message: string } {
  try {
    const prompt = JSON.parse(String(output)) as CompactionPrompt;
    return score(prompt);
  } catch (error) {
    return {
      score: 0,
      message: `Failed to parse prompt: ${error}`,
    };
  }
}

/**
 * Epic ID Specificity Scorer
 *
 * Validates that epic IDs are REAL, not placeholders.
 * Scoring is delegated to scoreEpicIdSpecificity; unparseable output scores 0.
 */
export const epicIdSpecificity = createScorer({
  name: "Epic ID Specificity",
  description: "Prompt uses real epic IDs, not placeholders",
  scorer: ({ output }) => scoreParsedPrompt(output, scoreEpicIdSpecificity),
});

/**
 * Actionability Scorer
 *
 * Validates that the prompt includes SPECIFIC actionable tool calls.
 * Scoring is delegated to scoreActionability; unparseable output scores 0.
 */
export const actionability = createScorer({
  name: "Actionability",
  description: "Prompt includes specific tool calls with real values",
  scorer: ({ output }) => scoreParsedPrompt(output, scoreActionability),
});

/**
 * Coordinator Identity Scorer
 *
 * Validates that the prompt has STRONG coordinator identity reinforcement.
 * Scoring is delegated to scoreCoordinatorIdentity; unparseable output scores 0.
 */
export const coordinatorIdentity = createScorer({
  name: "Coordinator Identity",
  description: "Prompt has ASCII header and strong mandates",
  scorer: ({ output }) => scoreParsedPrompt(output, scoreCoordinatorIdentity),
});

/**
 * Forbidden Tools Present Scorer
 *
 * Validates that the prompt LISTS forbidden tools by name.
 * Scoring is delegated to scoreForbiddenToolsPresent; unparseable output scores 0.
 */
export const forbiddenToolsPresent = createScorer({
  name: "Forbidden Tools Present",
  description: "Prompt lists forbidden tools by name",
  scorer: ({ output }) => scoreParsedPrompt(output, scoreForbiddenToolsPresent),
});

/**
 * Post-Compaction Discipline Scorer
 *
 * Validates that the FIRST suggested tool is correct.
 * Scoring is delegated to scorePostCompactionDiscipline; unparseable output scores 0.
 */
export const postCompactionDiscipline = createScorer({
  name: "Post-Compaction Discipline",
  description: "First suggested tool is swarm_status or inbox",
  scorer: ({ output }) =>
    scoreParsedPrompt(output, scorePostCompactionDiscipline),
});
|
|
@@ -260,14 +260,14 @@ export const forbiddenPatternsAbsent = createScorer({
|
|
|
260
260
|
export const compactionQuality = createScorer({
|
|
261
261
|
name: "Overall Compaction Quality",
|
|
262
262
|
description: "Composite score for compaction hook correctness",
|
|
263
|
-
scorer: ({ output, expected }) => {
|
|
263
|
+
scorer: async ({ output, expected, input }) => {
|
|
264
264
|
try {
|
|
265
265
|
// Run all scorers
|
|
266
266
|
const scores = {
|
|
267
|
-
confidence: confidenceAccuracy
|
|
268
|
-
injection: contextInjectionCorrectness
|
|
269
|
-
required: requiredPatternsPresent
|
|
270
|
-
forbidden: forbiddenPatternsAbsent
|
|
267
|
+
confidence: await confidenceAccuracy({ output, expected, input }),
|
|
268
|
+
injection: await contextInjectionCorrectness({ output, expected, input }),
|
|
269
|
+
required: await requiredPatternsPresent({ output, expected, input }),
|
|
270
|
+
forbidden: await forbiddenPatternsAbsent({ output, expected, input }),
|
|
271
271
|
};
|
|
272
272
|
|
|
273
273
|
// Weighted average
|
|
@@ -279,16 +279,16 @@ export const compactionQuality = createScorer({
|
|
|
279
279
|
};
|
|
280
280
|
|
|
281
281
|
const totalScore =
|
|
282
|
-
scores.confidence.score * weights.confidence +
|
|
283
|
-
scores.injection.score * weights.injection +
|
|
284
|
-
scores.required.score * weights.required +
|
|
285
|
-
scores.forbidden.score * weights.forbidden;
|
|
282
|
+
(scores.confidence.score ?? 0) * weights.confidence +
|
|
283
|
+
(scores.injection.score ?? 0) * weights.injection +
|
|
284
|
+
(scores.required.score ?? 0) * weights.required +
|
|
285
|
+
(scores.forbidden.score ?? 0) * weights.forbidden;
|
|
286
286
|
|
|
287
287
|
const details = [
|
|
288
|
-
`Confidence: ${(scores.confidence.score * 100).toFixed(0)}%`,
|
|
289
|
-
`Injection: ${(scores.injection.score * 100).toFixed(0)}%`,
|
|
290
|
-
`Required: ${(scores.required.score * 100).toFixed(0)}%`,
|
|
291
|
-
`Forbidden: ${(scores.forbidden.score * 100).toFixed(0)}%`,
|
|
288
|
+
`Confidence: ${((scores.confidence.score ?? 0) * 100).toFixed(0)}%`,
|
|
289
|
+
`Injection: ${((scores.injection.score ?? 0) * 100).toFixed(0)}%`,
|
|
290
|
+
`Required: ${((scores.required.score ?? 0) * 100).toFixed(0)}%`,
|
|
291
|
+
`Forbidden: ${((scores.forbidden.score ?? 0) * 100).toFixed(0)}%`,
|
|
292
292
|
].join(", ");
|
|
293
293
|
|
|
294
294
|
return {
|