opencode-swarm-plugin 0.37.0 → 0.39.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +20 -5
  4. package/.hive/memories.jsonl +35 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/.turbo/turbo-build.log +4 -4
  7. package/.turbo/turbo-test.log +319 -319
  8. package/CHANGELOG.md +258 -0
  9. package/README.md +50 -0
  10. package/bin/swarm.test.ts +475 -0
  11. package/bin/swarm.ts +385 -208
  12. package/dist/compaction-hook.d.ts +1 -1
  13. package/dist/compaction-hook.d.ts.map +1 -1
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +81 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts +59 -0
  25. package/dist/hive.d.ts.map +1 -1
  26. package/dist/index.d.ts +87 -0
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +823 -131
  29. package/dist/plugin.js +655 -131
  30. package/dist/post-compaction-tracker.d.ts +133 -0
  31. package/dist/post-compaction-tracker.d.ts.map +1 -0
  32. package/dist/swarm-decompose.d.ts +30 -0
  33. package/dist/swarm-decompose.d.ts.map +1 -1
  34. package/dist/swarm-orchestrate.d.ts +23 -0
  35. package/dist/swarm-orchestrate.d.ts.map +1 -1
  36. package/dist/swarm-prompts.d.ts +25 -1
  37. package/dist/swarm-prompts.d.ts.map +1 -1
  38. package/dist/swarm.d.ts +19 -0
  39. package/dist/swarm.d.ts.map +1 -1
  40. package/evals/README.md +595 -94
  41. package/evals/compaction-prompt.eval.ts +149 -0
  42. package/evals/coordinator-behavior.eval.ts +8 -8
  43. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  44. package/evals/lib/compaction-loader.test.ts +248 -0
  45. package/evals/lib/compaction-loader.ts +320 -0
  46. package/evals/lib/data-loader.test.ts +345 -0
  47. package/evals/lib/data-loader.ts +107 -6
  48. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  49. package/evals/scorers/compaction-scorers.ts +13 -13
  50. package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
  51. package/evals/scorers/coordinator-discipline.ts +13 -13
  52. package/examples/plugin-wrapper-template.ts +177 -8
  53. package/package.json +7 -2
  54. package/scripts/migrate-unknown-sessions.ts +349 -0
  55. package/src/compaction-capture.integration.test.ts +257 -0
  56. package/src/compaction-hook.test.ts +139 -2
  57. package/src/compaction-hook.ts +113 -2
  58. package/src/compaction-prompt-scorers.test.ts +299 -0
  59. package/src/compaction-prompt-scoring.ts +298 -0
  60. package/src/eval-capture.test.ts +422 -0
  61. package/src/eval-capture.ts +94 -2
  62. package/src/eval-gates.test.ts +306 -0
  63. package/src/eval-gates.ts +218 -0
  64. package/src/eval-history.test.ts +508 -0
  65. package/src/eval-history.ts +214 -0
  66. package/src/eval-learning.test.ts +378 -0
  67. package/src/eval-learning.ts +360 -0
  68. package/src/index.ts +61 -1
  69. package/src/post-compaction-tracker.test.ts +251 -0
  70. package/src/post-compaction-tracker.ts +237 -0
  71. package/src/swarm-decompose.test.ts +40 -47
  72. package/src/swarm-decompose.ts +2 -2
  73. package/src/swarm-orchestrate.test.ts +270 -7
  74. package/src/swarm-orchestrate.ts +100 -13
  75. package/src/swarm-prompts.test.ts +121 -0
  76. package/src/swarm-prompts.ts +297 -4
  77. package/src/swarm-research.integration.test.ts +157 -0
  78. package/src/swarm-review.ts +3 -3
  79. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -0,0 +1,345 @@
1
+ /**
2
+ * Tests for data-loader quality filters
3
+ *
4
+ * TDD approach: RED → GREEN → REFACTOR
5
+ */
6
+ import { afterEach, beforeEach, describe, expect, test } from "bun:test";
7
+ import * as fs from "node:fs";
8
+ import * as os from "node:os";
9
+ import * as path from "node:path";
10
+ import type { CoordinatorEvent } from "../../src/eval-capture.js";
11
+ import { loadCapturedSessions } from "./data-loader.js";
12
+
13
// Scratch directory that holds the session fixtures of the test currently running.
let tempSessionDir: string;

// Give every test its own freshly created directory under the OS temp root.
beforeEach(() => {
  const prefix = path.join(os.tmpdir(), "test-sessions-");
  tempSessionDir = fs.mkdtempSync(prefix);
});

// Delete the scratch directory, and everything written into it, after each test.
afterEach(() => {
  if (!fs.existsSync(tempSessionDir)) {
    return;
  }
  fs.rmSync(tempSessionDir, { recursive: true });
});
25
+
26
/**
 * Helper: write a session fixture into the temp session directory.
 *
 * The file is named `<sessionId>.jsonl` and contains one JSON-serialized
 * event per line, followed by a trailing newline.
 *
 * @param sessionId - Used as the file's base name
 * @param events - Events to serialize, written in the given order
 */
function createSessionFile(
  sessionId: string,
  events: CoordinatorEvent[],
): void {
  const target = path.join(tempSessionDir, `${sessionId}.jsonl`);

  const serialized: string[] = [];
  for (const event of events) {
    serialized.push(JSON.stringify(event));
  }

  fs.writeFileSync(target, `${serialized.join("\n")}\n`, "utf-8");
}
37
+
38
/**
 * Helper: build a minimal coordinator event for fixtures.
 *
 * Every event shares the same envelope: the given session and epic ids, an
 * ISO-8601 timestamp taken at call time, and an empty payload. `type` selects
 * the event variant and `subtype` is stored in that variant's own field:
 * `decision_type`, `violation_type` or `outcome_type`.
 *
 * `subtype` is deliberately a plain string so tests can pass any value. It is
 * cast to the target field's own type (looked up from `CoordinatorEvent`)
 * instead of `any`, so the rest of each returned object stays type-checked.
 *
 * @param sessionId - Value for the event's `session_id`
 * @param epicId - Value for the event's `epic_id`
 * @param type - Which event variant to build
 * @param subtype - Variant-specific subtype, e.g. "worker_spawned"
 * @returns The constructed event
 */
function createEvent(
  sessionId: string,
  epicId: string,
  type: "DECISION" | "VIOLATION" | "OUTCOME",
  subtype: string,
): CoordinatorEvent {
  const base = {
    session_id: sessionId,
    epic_id: epicId,
    timestamp: new Date().toISOString(),
    payload: {},
  };

  switch (type) {
    case "DECISION":
      return {
        ...base,
        event_type: "DECISION" as const,
        decision_type: subtype as Extract<
          CoordinatorEvent,
          { event_type: "DECISION" }
        >["decision_type"],
      };
    case "VIOLATION":
      return {
        ...base,
        event_type: "VIOLATION" as const,
        violation_type: subtype as Extract<
          CoordinatorEvent,
          { event_type: "VIOLATION" }
        >["violation_type"],
      };
    default:
      // Anything that is not a decision or a violation is built as an outcome.
      return {
        ...base,
        event_type: "OUTCOME" as const,
        outcome_type: subtype as Extract<
          CoordinatorEvent,
          { event_type: "OUTCOME" }
        >["outcome_type"],
      };
  }
}
74
+
75
+ describe("loadCapturedSessions - quality filters", () => {
76
test("filters out sessions with fewer than minEvents (default: 3)", async () => {
  // Fixtures holding 2, 3 and 5 events respectively
  createSessionFile("session-2-events", [
    createEvent("session-2-events", "epic-1", "DECISION", "worker_spawned"),
    createEvent("session-2-events", "epic-1", "OUTCOME", "subtask_success"),
  ]);

  createSessionFile("session-3-events", [
    createEvent("session-3-events", "epic-2", "DECISION", "worker_spawned"),
    createEvent("session-3-events", "epic-2", "DECISION", "review_completed"),
    createEvent("session-3-events", "epic-2", "OUTCOME", "subtask_success"),
  ]);

  createSessionFile("session-5-events", [
    createEvent("session-5-events", "epic-3", "DECISION", "worker_spawned"),
    createEvent("session-5-events", "epic-3", "DECISION", "review_completed"),
    createEvent("session-5-events", "epic-3", "OUTCOME", "subtask_success"),
    createEvent("session-5-events", "epic-3", "OUTCOME", "subtask_success"),
    createEvent("session-5-events", "epic-3", "OUTCOME", "epic_complete"),
  ]);

  // Switch the spawn/review filters off so minEvents is the only filter under
  // test. The 2-event fixture has no review_completed event, so with the
  // default requireReview=true it would be rejected even if minEvents were
  // ignored, and this test could not detect a broken minEvents filter.
  const sessions = await loadCapturedSessions({
    minEvents: 3,
    requireWorkerSpawn: false,
    requireReview: false,
    sessionDir: tempSessionDir,
  });

  // Should only get sessions with >= 3 events
  const loadedIds = sessions.map((s) => s.session.session_id);
  expect(sessions.length).toBe(2);
  expect(loadedIds.includes("session-3-events")).toBe(true);
  expect(loadedIds.includes("session-5-events")).toBe(true);
  expect(loadedIds.includes("session-2-events")).toBe(false);
});
114
+
115
test("filters out sessions without worker_spawned event when requireWorkerSpawn=true", async () => {
  // Session WITH worker_spawned
  createSessionFile("session-with-spawn", [
    createEvent("session-with-spawn", "epic-1", "DECISION", "worker_spawned"),
    createEvent("session-with-spawn", "epic-1", "DECISION", "review_completed"),
    createEvent("session-with-spawn", "epic-1", "OUTCOME", "subtask_success"),
  ]);

  // Session WITHOUT worker_spawned (it also has no review_completed event)
  createSessionFile("session-no-spawn", [
    createEvent("session-no-spawn", "epic-2", "DECISION", "strategy_selected"),
    createEvent(
      "session-no-spawn",
      "epic-2",
      "DECISION",
      "decomposition_complete",
    ),
    createEvent("session-no-spawn", "epic-2", "OUTCOME", "epic_complete"),
  ]);

  // requireReview is switched off so the missing worker_spawned event is the
  // only possible reason for rejecting the second fixture. With the default
  // requireReview=true that fixture would be dropped for lacking a review,
  // and the test would pass even if requireWorkerSpawn were ignored.
  const sessions = await loadCapturedSessions({
    requireWorkerSpawn: true,
    requireReview: false,
    sessionDir: tempSessionDir,
  });

  expect(sessions.length).toBe(1);
  expect(sessions[0]?.session.session_id).toBe("session-with-spawn");
});
153
+
154
test("filters out sessions without review_completed event when requireReview=true", async () => {
  const reviewedId = "session-with-review";
  const unreviewedId = "session-no-review";

  // Fixture that records a review_completed decision
  createSessionFile(reviewedId, [
    createEvent(reviewedId, "epic-1", "DECISION", "worker_spawned"),
    createEvent(reviewedId, "epic-1", "DECISION", "review_completed"),
    createEvent(reviewedId, "epic-1", "OUTCOME", "subtask_success"),
  ]);

  // Fixture that spawns a worker but never records a review
  createSessionFile(unreviewedId, [
    createEvent(unreviewedId, "epic-2", "DECISION", "worker_spawned"),
    createEvent(unreviewedId, "epic-2", "OUTCOME", "subtask_success"),
    createEvent(unreviewedId, "epic-2", "OUTCOME", "epic_complete"),
  ]);

  const loaded = await loadCapturedSessions({
    sessionDir: tempSessionDir,
    requireReview: true,
  });

  // Only the reviewed fixture is returned
  const [first] = loaded;
  expect(loaded.length).toBe(1);
  expect(first?.session.session_id).toBe(reviewedId);
});
187
+
188
test("allows disabling filters individually", async () => {
  const lowQualityId = "session-low-quality";

  // Two events only, with neither a worker_spawned nor a review_completed decision
  createSessionFile(lowQualityId, [
    createEvent(lowQualityId, "epic-1", "DECISION", "strategy_selected"),
    createEvent(lowQualityId, "epic-1", "OUTCOME", "epic_complete"),
  ]);

  // Every quality filter is switched off for this load
  const loaded = await loadCapturedSessions({
    sessionDir: tempSessionDir,
    minEvents: 0,
    requireWorkerSpawn: false,
    requireReview: false,
  });

  const [first] = loaded;
  expect(loaded.length).toBe(1);
  expect(first?.session.session_id).toBe(lowQualityId);
});
211
+
212
test("applies limit AFTER filtering", async () => {
  // Create the 3 low-quality sessions FIRST, with ids that sort ahead of the
  // high-quality ones, so the loader meets them before any session that can
  // pass the filters. A loader that cut the list down to `limit` before
  // filtering would then return nothing and fail this test.
  // NOTE(review): the order in which the loader visits files is not visible
  // from this test; the fixtures lead under both creation order and name
  // order - confirm against the loader if it orders files differently.
  for (let i = 1; i <= 3; i++) {
    createSessionFile(`low-quality-${i}`, [
      createEvent(
        `low-quality-${i}`,
        `epic-low-${i}`,
        "DECISION",
        "strategy_selected",
      ),
    ]);
  }

  // Create 5 high-quality sessions
  for (let i = 1; i <= 5; i++) {
    createSessionFile(`session-${i}`, [
      createEvent(`session-${i}`, `epic-${i}`, "DECISION", "worker_spawned"),
      createEvent(`session-${i}`, `epic-${i}`, "DECISION", "review_completed"),
      createEvent(`session-${i}`, `epic-${i}`, "OUTCOME", "subtask_success"),
    ]);
  }

  // Filter first (remove 3 low-quality), then limit to 2
  const sessions = await loadCapturedSessions({
    minEvents: 3,
    requireWorkerSpawn: true,
    requireReview: true,
    limit: 2,
    sessionDir: tempSessionDir,
  });

  // Should get 2 sessions, both taken from the 5 high-quality ones
  expect(sessions.length).toBe(2);
  expect(sessions.every((s) => s.session.events.length >= 3)).toBe(true);
  expect(
    sessions.every((s) => s.session.session_id.startsWith("session-")),
  ).toBe(true);
});
247
+
248
test("combines all filters correctly", async () => {
  const highQualityId = "session-high-quality";
  const noSpawnId = "session-no-spawn";
  const noReviewId = "session-no-review";
  const tooFewId = "session-too-few";

  // Satisfies every filter: spawn + review + four events
  createSessionFile(highQualityId, [
    createEvent(highQualityId, "epic-1", "DECISION", "worker_spawned"),
    createEvent(highQualityId, "epic-1", "DECISION", "review_completed"),
    createEvent(highQualityId, "epic-1", "OUTCOME", "subtask_success"),
    createEvent(highQualityId, "epic-1", "OUTCOME", "epic_complete"),
  ]);

  // Fails only on the missing worker_spawned decision
  createSessionFile(noSpawnId, [
    createEvent(noSpawnId, "epic-2", "DECISION", "review_completed"),
    createEvent(noSpawnId, "epic-2", "OUTCOME", "subtask_success"),
    createEvent(noSpawnId, "epic-2", "OUTCOME", "epic_complete"),
  ]);

  // Fails only on the missing review_completed decision
  createSessionFile(noReviewId, [
    createEvent(noReviewId, "epic-3", "DECISION", "worker_spawned"),
    createEvent(noReviewId, "epic-3", "OUTCOME", "subtask_success"),
    createEvent(noReviewId, "epic-3", "OUTCOME", "epic_complete"),
  ]);

  // Fails only on the event count (two events)
  createSessionFile(tooFewId, [
    createEvent(tooFewId, "epic-4", "DECISION", "worker_spawned"),
    createEvent(tooFewId, "epic-4", "DECISION", "review_completed"),
  ]);

  const loaded = await loadCapturedSessions({
    sessionDir: tempSessionDir,
    minEvents: 3,
    requireWorkerSpawn: true,
    requireReview: true,
  });

  // The high-quality fixture is the sole survivor
  const [first] = loaded;
  expect(loaded.length).toBe(1);
  expect(first?.session.session_id).toBe(highQualityId);
});
303
+
304
test("defaults are: minEvents=3, requireWorkerSpawn=true, requireReview=true", async () => {
  const passingId = "session-meets-defaults";
  const failingId = "session-fails-defaults";

  // Three events including both a worker_spawned and a review_completed decision
  createSessionFile(passingId, [
    createEvent(passingId, "epic-1", "DECISION", "worker_spawned"),
    createEvent(passingId, "epic-1", "DECISION", "review_completed"),
    createEvent(passingId, "epic-1", "OUTCOME", "subtask_success"),
  ]);

  // A single strategy_selected decision and nothing else
  createSessionFile(failingId, [
    createEvent(failingId, "epic-2", "DECISION", "strategy_selected"),
  ]);

  // Only sessionDir is supplied, so the loader falls back to its default filters
  const loaded = await loadCapturedSessions({ sessionDir: tempSessionDir });

  const [first] = loaded;
  expect(loaded.length).toBe(1);
  expect(first?.session.session_id).toBe(passingId);
});
345
+ });
@@ -111,24 +111,86 @@ export async function getEvalDataSummary(
111
111
  };
112
112
  }
113
113
 
114
/**
 * Decide whether a session passes the configured quality filters.
 *
 * The checks run in order and stop at the first failure:
 * 1. the session has at least `criteria.minEvents` events;
 * 2. when `criteria.requireWorkerSpawn` is set, it contains a DECISION event
 *    whose `decision_type` is "worker_spawned";
 * 3. when `criteria.requireReview` is set, it contains a DECISION event
 *    whose `decision_type` is "review_completed".
 *
 * @param session - Session whose events are inspected
 * @param criteria - Thresholds and toggles for the three filters
 * @returns true when every enabled filter is satisfied
 */
function meetsQualityCriteria(
  session: import("../../src/eval-capture.js").CoordinatorSession,
  criteria: {
    minEvents: number;
    requireWorkerSpawn: boolean;
    requireReview: boolean;
  },
): boolean {
  const { events } = session;

  // True when at least one DECISION event carries the given decision type.
  const hasDecision = (wanted: string): boolean =>
    events.some(
      (event) =>
        event.event_type === "DECISION" && event.decision_type === wanted,
    );

  // Filter 1: minEvents
  if (events.length < criteria.minEvents) {
    return false;
  }

  // Filter 2: requireWorkerSpawn
  if (criteria.requireWorkerSpawn && !hasDecision("worker_spawned")) {
    return false;
  }

  // Filter 3: requireReview
  return !criteria.requireReview || hasDecision("review_completed");
}
153
+
114
154
  /**
115
155
  * Load captured coordinator sessions from ~/.config/swarm-tools/sessions/
116
156
  *
117
157
  * Reads all JSONL session files and returns CoordinatorSession objects.
118
158
  *
159
+ * Quality filters are applied to focus on high-signal coordinator sessions:
160
+ * - minEvents: Filter out incomplete/aborted sessions (default: 3)
161
+ * - requireWorkerSpawn: Ensure session delegated to workers (default: true)
162
+ * - requireReview: Ensure coordinator reviewed work (default: true)
163
+ *
164
+ * Filters are applied BEFORE the limit for accurate sampling.
165
+ *
119
166
  * @param options - Filter options
120
- * @returns Array of coordinator sessions
167
+ * @returns Array of coordinator sessions that meet quality criteria
121
168
  */
122
169
  export async function loadCapturedSessions(options?: {
123
170
  sessionIds?: string[];
124
171
  limit?: number;
172
+ /** Minimum number of events required (default: 3) */
173
+ minEvents?: number;
174
+ /** Require at least one worker_spawned event (default: true) */
175
+ requireWorkerSpawn?: boolean;
176
+ /** Require at least one review_completed event (default: true) */
177
+ requireReview?: boolean;
178
+ /** Override session directory for testing */
179
+ sessionDir?: string;
125
180
  }): Promise<
126
181
  Array<{ session: import("../../src/eval-capture.js").CoordinatorSession }>
127
182
  > {
128
183
  const { getSessionDir, readSessionEvents, saveSession } = await import(
129
184
  "../../src/eval-capture.js"
130
185
  );
131
- const sessionDir = getSessionDir();
186
+ const sessionDir = options?.sessionDir ?? getSessionDir();
187
+
188
+ // Default quality filters
189
+ const qualityCriteria = {
190
+ minEvents: options?.minEvents ?? 3,
191
+ requireWorkerSpawn: options?.requireWorkerSpawn ?? true,
192
+ requireReview: options?.requireReview ?? true,
193
+ };
132
194
 
133
195
  // If session dir doesn't exist, return empty
134
196
  if (!fs.existsSync(sessionDir)) {
@@ -149,32 +211,71 @@ export async function loadCapturedSessions(options?: {
149
211
  const sessions: Array<{
150
212
  session: import("../../src/eval-capture.js").CoordinatorSession;
151
213
  }> = [];
214
+ let filteredOutCount = 0;
152
215
 
153
216
  for (const file of targetFiles) {
154
217
  const sessionId = file.replace(".jsonl", "");
155
218
 
156
219
  try {
157
- const events = readSessionEvents(sessionId);
220
+ let events: import("../../src/eval-capture.js").CoordinatorEvent[];
221
+
222
+ // If custom sessionDir, read directly; otherwise use eval-capture functions
223
+ if (options?.sessionDir) {
224
+ const sessionPath = `${sessionDir}/${sessionId}.jsonl`;
225
+ if (!fs.existsSync(sessionPath)) continue;
226
+
227
+ const content = fs.readFileSync(sessionPath, "utf-8");
228
+ const lines = content.trim().split("\n").filter(Boolean);
229
+ const { CoordinatorEventSchema } = await import(
230
+ "../../src/eval-capture.js"
231
+ );
232
+ events = lines.map((line) => {
233
+ const parsed = JSON.parse(line);
234
+ return CoordinatorEventSchema.parse(parsed);
235
+ });
236
+ } else {
237
+ events = readSessionEvents(sessionId);
238
+ }
239
+
158
240
  if (events.length === 0) continue;
159
241
 
160
242
  // Find epic_id from first event
161
243
  const epicId = events[0]?.epic_id;
162
244
  if (!epicId) continue;
163
245
 
164
- const session = saveSession({ session_id: sessionId, epic_id: epicId });
165
- if (session) {
246
+ // Build session object
247
+ const session: import("../../src/eval-capture.js").CoordinatorSession = {
248
+ session_id: sessionId,
249
+ epic_id: epicId,
250
+ start_time: events[0]?.timestamp ?? new Date().toISOString(),
251
+ end_time: events[events.length - 1]?.timestamp,
252
+ events,
253
+ };
254
+ if (!session) continue;
255
+
256
+ // Apply quality filters BEFORE limit
257
+ if (meetsQualityCriteria(session, qualityCriteria)) {
166
258
  sessions.push({ session });
259
+ } else {
260
+ filteredOutCount++;
167
261
  }
168
262
  } catch (error) {
169
263
  // Skip invalid sessions
170
264
  console.warn(`Failed to load session ${sessionId}:`, error);
171
265
  }
172
266
 
173
- // Apply limit if specified
267
+ // Apply limit AFTER filtering
174
268
  if (options?.limit && sessions.length >= options.limit) {
175
269
  break;
176
270
  }
177
271
  }
178
272
 
273
+ // Log filtering stats for visibility
274
+ if (filteredOutCount > 0) {
275
+ console.log(
276
+ `Filtered out ${filteredOutCount} sessions (minEvents=${qualityCriteria.minEvents}, requireWorkerSpawn=${qualityCriteria.requireWorkerSpawn}, requireReview=${qualityCriteria.requireReview})`,
277
+ );
278
+ }
279
+
179
280
  return sessions;
180
281
  }
@@ -0,0 +1,145 @@
1
+ /**
2
+ * Compaction Prompt Quality Scorers - Evalite Wrappers
3
+ *
4
+ * These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
5
+ * for use with evalite's test runner.
6
+ *
7
+ * Weighted scoring:
8
+ * - epicIdSpecificity (0.20) - real IDs not placeholders
9
+ * - actionability (0.20) - swarm_status/inbox with real values
10
+ * - coordinatorIdentity (0.25) - ASCII header + strong mandates
11
+ * - forbiddenToolsPresent (0.15) - lists forbidden tools by name
12
+ * - postCompactionDiscipline (0.20) - first tool correct, no edit/write
13
+ */
14
+
15
+ import { createScorer } from "evalite";
16
+ import type { CompactionPrompt } from "../../src/compaction-prompt-scoring.js";
17
+ import {
18
+ scoreActionability,
19
+ scoreCoordinatorIdentity,
20
+ scoreEpicIdSpecificity,
21
+ scoreForbiddenToolsPresent,
22
+ scorePostCompactionDiscipline,
23
+ } from "../../src/compaction-prompt-scoring.js";
24
+
25
+ // Re-export types for convenience
26
+ export type { CompactionPrompt, ScorerResult } from "../../src/compaction-prompt-scoring.js";
27
+
28
+ // Re-export pure functions for direct use
29
+ export {
30
+ scoreActionability,
31
+ scoreCoordinatorIdentity,
32
+ scoreEpicIdSpecificity,
33
+ scoreForbiddenToolsPresent,
34
+ scorePostCompactionDiscipline,
35
+ } from "../../src/compaction-prompt-scoring.js";
36
+
37
/**
 * Shared driver for the evalite wrappers below.
 *
 * Turns the evalite `output` into a CompactionPrompt and hands it to the
 * given pure scoring function. An output that is already an object is used
 * as-is; anything else is stringified and parsed as JSON. (Stringifying an
 * object would yield "[object Object]", which can never parse.)
 *
 * The prompt's shape is not validated here; the result is only cast.
 *
 * Any exception is converted into a zero score. This covers both a parse
 * failure and an error thrown by the scoring function itself, so the
 * "Failed to parse prompt" message is reported in either case.
 *
 * @param output - Raw evalite output: a JSON string or an already-parsed prompt
 * @param scorePrompt - Pure scoring function to apply to the prompt
 * @returns The scoring function's result, or `{ score: 0, message }` on error
 */
function scorePromptOutput<TResult>(
  output: unknown,
  scorePrompt: (prompt: CompactionPrompt) => TResult,
): TResult | { score: number; message: string } {
  try {
    const prompt = (
      typeof output === "object" && output !== null
        ? output
        : JSON.parse(String(output))
    ) as CompactionPrompt;
    return scorePrompt(prompt);
  } catch (error) {
    return {
      score: 0,
      message: `Failed to parse prompt: ${error}`,
    };
  }
}

/**
 * Epic ID Specificity Scorer
 *
 * Validates that epic IDs are REAL, not placeholders.
 * Score: 1.0 if real IDs, 0.0 if placeholders found
 */
export const epicIdSpecificity = createScorer({
  name: "Epic ID Specificity",
  description: "Prompt uses real epic IDs, not placeholders",
  scorer: ({ output }) => scorePromptOutput(output, scoreEpicIdSpecificity),
});

/**
 * Actionability Scorer
 *
 * Validates that the prompt includes SPECIFIC actionable tool calls.
 * Score: 1.0 if actionable tool calls with real values, 0.0 otherwise
 */
export const actionability = createScorer({
  name: "Actionability",
  description: "Prompt includes specific tool calls with real values",
  scorer: ({ output }) => scorePromptOutput(output, scoreActionability),
});

/**
 * Coordinator Identity Scorer
 *
 * Validates that the prompt has STRONG coordinator identity reinforcement.
 * Score: 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
 */
export const coordinatorIdentity = createScorer({
  name: "Coordinator Identity",
  description: "Prompt has ASCII header and strong mandates",
  scorer: ({ output }) => scorePromptOutput(output, scoreCoordinatorIdentity),
});

/**
 * Forbidden Tools Present Scorer
 *
 * Validates that the prompt LISTS forbidden tools by name.
 * Score: ratio of forbidden tools mentioned (0.0 to 1.0)
 */
export const forbiddenToolsPresent = createScorer({
  name: "Forbidden Tools Present",
  description: "Prompt lists forbidden tools by name",
  scorer: ({ output }) => scorePromptOutput(output, scoreForbiddenToolsPresent),
});

/**
 * Post-Compaction Discipline Scorer
 *
 * Validates that the FIRST suggested tool is correct.
 * Score: 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
 */
export const postCompactionDiscipline = createScorer({
  name: "Post-Compaction Discipline",
  description: "First suggested tool is swarm_status or inbox",
  scorer: ({ output }) =>
    scorePromptOutput(output, scorePostCompactionDiscipline),
});
@@ -260,14 +260,14 @@ export const forbiddenPatternsAbsent = createScorer({
260
260
  export const compactionQuality = createScorer({
261
261
  name: "Overall Compaction Quality",
262
262
  description: "Composite score for compaction hook correctness",
263
- scorer: ({ output, expected }) => {
263
+ scorer: async ({ output, expected, input }) => {
264
264
  try {
265
265
  // Run all scorers
266
266
  const scores = {
267
- confidence: confidenceAccuracy.scorer({ output, expected }),
268
- injection: contextInjectionCorrectness.scorer({ output, expected }),
269
- required: requiredPatternsPresent.scorer({ output, expected }),
270
- forbidden: forbiddenPatternsAbsent.scorer({ output, expected }),
267
+ confidence: await confidenceAccuracy({ output, expected, input }),
268
+ injection: await contextInjectionCorrectness({ output, expected, input }),
269
+ required: await requiredPatternsPresent({ output, expected, input }),
270
+ forbidden: await forbiddenPatternsAbsent({ output, expected, input }),
271
271
  };
272
272
 
273
273
  // Weighted average
@@ -279,16 +279,16 @@ export const compactionQuality = createScorer({
279
279
  };
280
280
 
281
281
  const totalScore =
282
- scores.confidence.score * weights.confidence +
283
- scores.injection.score * weights.injection +
284
- scores.required.score * weights.required +
285
- scores.forbidden.score * weights.forbidden;
282
+ (scores.confidence.score ?? 0) * weights.confidence +
283
+ (scores.injection.score ?? 0) * weights.injection +
284
+ (scores.required.score ?? 0) * weights.required +
285
+ (scores.forbidden.score ?? 0) * weights.forbidden;
286
286
 
287
287
  const details = [
288
- `Confidence: ${(scores.confidence.score * 100).toFixed(0)}%`,
289
- `Injection: ${(scores.injection.score * 100).toFixed(0)}%`,
290
- `Required: ${(scores.required.score * 100).toFixed(0)}%`,
291
- `Forbidden: ${(scores.forbidden.score * 100).toFixed(0)}%`,
288
+ `Confidence: ${((scores.confidence.score ?? 0) * 100).toFixed(0)}%`,
289
+ `Injection: ${((scores.injection.score ?? 0) * 100).toFixed(0)}%`,
290
+ `Required: ${((scores.required.score ?? 0) * 100).toFixed(0)}%`,
291
+ `Forbidden: ${((scores.forbidden.score ?? 0) * 100).toFixed(0)}%`,
292
292
  ].join(", ");
293
293
 
294
294
  return {