opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -0,0 +1,299 @@
1
+ /**
2
+ * Tests for compaction prompt quality scorers
3
+ *
4
+ * TDD approach - tests written FIRST to define scorer behavior
5
+ * Tests the PURE scoring functions (not evalite wrappers)
6
+ */
7
+
8
+ import { describe, expect, test } from "bun:test";
9
+ import type { CompactionPrompt } from "./compaction-prompt-scoring.js";
10
+ import {
11
+ scoreActionability,
12
+ scoreCoordinatorIdentity,
13
+ scoreEpicIdSpecificity,
14
+ scoreForbiddenToolsPresent,
15
+ scorePostCompactionDiscipline,
16
+ } from "./compaction-prompt-scoring.js";
17
+
18
+ describe("epicIdSpecificity scorer", () => { // exercises scoreEpicIdSpecificity: 1.0 for a real ID, 0.0 for placeholders or no ID
19
+ test("scores 1.0 for real epic IDs", () => {
20
+ const prompt: CompactionPrompt = {
21
+ content: "Continue coordinating epic mjkw81rkq4c",
22
+ };
23
+
24
+ const result = scoreEpicIdSpecificity(prompt);
25
+
26
+ expect(result.score).toBe(1.0);
27
+ expect(result.message).toContain("real epic ID");
28
+ });
29
+
30
+ test("scores 0.0 for placeholder IDs like <epic-id>", () => {
31
+ const prompt: CompactionPrompt = {
32
+ content: "Continue coordinating epic <epic-id>",
33
+ };
34
+
35
+ const result = scoreEpicIdSpecificity(prompt);
36
+
37
+ expect(result.score).toBe(0.0);
38
+ expect(result.message).toContain("placeholder");
39
+ });
40
+
41
+ test("scores 0.0 for bd-xxx placeholders", () => {
42
+ const prompt: CompactionPrompt = {
43
+ content: "Check status of bd-xxx",
44
+ };
45
+
46
+ const result = scoreEpicIdSpecificity(prompt);
47
+
48
+ expect(result.score).toBe(0.0);
49
+ expect(result.message).toContain("placeholder");
50
+ });
51
+
52
+ test("scores 0.0 for generic <path> placeholders", () => { // content has no epic ID at all; the <path> placeholder alone yields 0.0
53
+ const prompt: CompactionPrompt = {
54
+ content: "Project at <path>",
55
+ };
56
+
57
+ const result = scoreEpicIdSpecificity(prompt);
58
+
59
+ expect(result.score).toBe(0.0);
60
+ });
61
+
62
+ test("scores 0.0 when no epic ID found", () => { // neither a placeholder nor a real ID is present
63
+ const prompt: CompactionPrompt = {
64
+ content: "Continue working on the task",
65
+ };
66
+
67
+ const result = scoreEpicIdSpecificity(prompt);
68
+
69
+ expect(result.score).toBe(0.0);
70
+ expect(result.message).toContain("No epic ID");
71
+ });
72
+ });
73
+
74
+ describe("actionability scorer", () => { // exercises scoreActionability: concrete tool calls score 1.0, prose or placeholder calls score 0.0
75
+ test("scores 1.0 when swarm_status has real epic ID", () => {
76
+ const prompt: CompactionPrompt = {
77
+ content: `First action:
78
+ swarm_status(epic_id='mjkw81rkq4c', project_key='/path/to/project')`,
79
+ };
80
+
81
+ const result = scoreActionability(prompt);
82
+
83
+ expect(result.score).toBe(1.0);
84
+ expect(result.message).toContain("actionable tool call");
85
+ });
86
+
87
+ test("scores 1.0 when swarmmail_inbox is present", () => { // an argument-less swarmmail_inbox() call counts as actionable without any epic ID
88
+ const prompt: CompactionPrompt = {
89
+ content: `Check messages:
90
+ swarmmail_inbox()`,
91
+ };
92
+
93
+ const result = scoreActionability(prompt);
94
+
95
+ expect(result.score).toBe(1.0);
96
+ expect(result.message).toContain("actionable tool call");
97
+ });
98
+
99
+ test("scores 0.0 for generic instructions without tool calls", () => {
100
+ const prompt: CompactionPrompt = {
101
+ content: "Check the status of workers and review progress",
102
+ };
103
+
104
+ const result = scoreActionability(prompt);
105
+
106
+ expect(result.score).toBe(0.0);
107
+ expect(result.message).toContain("No actionable");
108
+ });
109
+
110
+ test("scores 0.0 for swarm_status with placeholders", () => { // the call syntax is present but its arguments are unfilled template placeholders
111
+ const prompt: CompactionPrompt = {
112
+ content: `swarm_status(epic_id='<epic-id>', project_key='<path>')`,
113
+ };
114
+
115
+ const result = scoreActionability(prompt);
116
+
117
+ expect(result.score).toBe(0.0);
118
+ expect(result.message).toContain("placeholder");
119
+ });
120
+ });
121
+
122
+ describe("coordinatorIdentity scorer", () => { // exercises scoreCoordinatorIdentity: box header + mandates = 1.0, header only = 0.5, no header = 0.0
123
+ test("scores 1.0 with ASCII header and strong mandates", () => {
124
+ const prompt: CompactionPrompt = {
125
+ content: `┌─────────────────────────────────────────┐
126
+ │ YOU ARE THE COORDINATOR │
127
+ │ │
128
+ │ NEVER spawn workers yourself │
129
+ │ ALWAYS review worker output │
130
+ └─────────────────────────────────────────┘
131
+
132
+ Continue coordinating the swarm.`,
133
+ };
134
+
135
+ const result = scoreCoordinatorIdentity(prompt);
136
+
137
+ expect(result.score).toBe(1.0);
138
+ expect(result.message).toContain("ASCII header");
139
+ expect(result.message).toContain("strong mandates");
140
+ });
141
+
142
+ test("scores 0.5 with ASCII header but weak language", () => { // "should consider" contains none of NEVER / ALWAYS / NON-NEGOTIABLE
143
+ const prompt: CompactionPrompt = {
144
+ content: `┌─────────────────────────────────────────┐
145
+ │ COORDINATOR MODE │
146
+ └─────────────────────────────────────────┘
147
+
148
+ You should consider delegating work.`,
149
+ };
150
+
151
+ const result = scoreCoordinatorIdentity(prompt);
152
+
153
+ expect(result.score).toBe(0.5);
154
+ expect(result.message).toContain("weak language");
155
+ });
156
+
157
+ test("scores 0.0 without ASCII header", () => { // strong mandates alone are not enough; the box-drawing header is checked first
158
+ const prompt: CompactionPrompt = {
159
+ content: `You are the coordinator. NEVER do work directly. ALWAYS delegate.`,
160
+ };
161
+
162
+ const result = scoreCoordinatorIdentity(prompt);
163
+
164
+ expect(result.score).toBe(0.0);
165
+ expect(result.message).toContain("No ASCII header");
166
+ });
167
+ });
168
+
169
+ describe("forbiddenToolsPresent scorer", () => { // exercises scoreForbiddenToolsPresent: score is the fraction of the 4 forbidden tools named
170
+ test("scores 1.0 when all forbidden tools listed", () => {
171
+ const prompt: CompactionPrompt = {
172
+ content: `🚫 FORBIDDEN TOOLS - NEVER call these:
173
+ - Edit (use swarm_spawn_subtask)
174
+ - Write (use swarm_spawn_subtask)
175
+ - swarmmail_reserve (only workers reserve)
176
+ - bash with git commit (workers commit)`,
177
+ };
178
+
179
+ const result = scoreForbiddenToolsPresent(prompt);
180
+
181
+ expect(result.score).toBe(1.0);
182
+ expect(result.message).toContain("All 4 forbidden tools");
183
+ });
184
+
185
+ test("scores 0.75 when 3 out of 4 tools listed", () => { // "git commit" is the one missing entry
186
+ const prompt: CompactionPrompt = {
187
+ content: `🚫 FORBIDDEN TOOLS:
188
+ - Edit
189
+ - Write
190
+ - swarmmail_reserve`,
191
+ };
192
+
193
+ const result = scoreForbiddenToolsPresent(prompt);
194
+
195
+ expect(result.score).toBe(0.75);
196
+ expect(result.message).toContain("3/4");
197
+ });
198
+
199
+ test("scores 0.5 when 2 out of 4 tools listed", () => { // "Edit" and "Write" are matched as bare words, even inside ordinary prose
200
+ const prompt: CompactionPrompt = {
201
+ content: `Don't use Edit or Write directly.`,
202
+ };
203
+
204
+ const result = scoreForbiddenToolsPresent(prompt);
205
+
206
+ expect(result.score).toBe(0.5);
207
+ expect(result.message).toContain("2/4");
208
+ });
209
+
210
+ test("scores 0.0 when no forbidden tools listed", () => {
211
+ const prompt: CompactionPrompt = {
212
+ content: "Continue coordinating the epic",
213
+ };
214
+
215
+ const result = scoreForbiddenToolsPresent(prompt);
216
+
217
+ expect(result.score).toBe(0.0);
218
+ expect(result.message).toContain("0/4");
219
+ });
220
+ });
221
+
222
+ describe("postCompactionDiscipline scorer", () => { // exercises scorePostCompactionDiscipline: only the FIRST tool named in the content decides the score
223
+ test("scores 1.0 when first tool is swarm_status", () => {
224
+ const prompt: CompactionPrompt = {
225
+ content: `Resume coordination:
226
+
227
+ 1. swarm_status(epic_id='mjkw81rkq4c')
228
+ 2. Check inbox
229
+ 3. Review progress`,
230
+ };
231
+
232
+ const result = scorePostCompactionDiscipline(prompt);
233
+
234
+ expect(result.score).toBe(1.0);
235
+ expect(result.message).toContain("swarm_status");
236
+ expect(result.message).toContain("correct");
237
+ });
238
+
239
+ test("scores 1.0 when first tool is swarmmail_inbox", () => { // the success message says "inbox", not the full tool name
240
+ const prompt: CompactionPrompt = {
241
+ content: `Next steps:
242
+ 1. swarmmail_inbox()
243
+ 2. Review messages`,
244
+ };
245
+
246
+ const result = scorePostCompactionDiscipline(prompt);
247
+
248
+ expect(result.score).toBe(1.0);
249
+ expect(result.message).toContain("inbox");
250
+ expect(result.message).toContain("correct");
251
+ });
252
+
253
+ test("scores 0.0 when first tool is Edit", () => {
254
+ const prompt: CompactionPrompt = {
255
+ content: `Resume:
256
+ 1. Edit(file='src/auth.ts', ...)
257
+ 2. Check status`,
258
+ };
259
+
260
+ const result = scorePostCompactionDiscipline(prompt);
261
+
262
+ expect(result.score).toBe(0.0);
263
+ expect(result.message).toContain("Edit");
264
+ });
265
+
266
+ test("scores 0.0 when first tool is Write", () => {
267
+ const prompt: CompactionPrompt = {
268
+ content: `1. Write(file='README.md', ...)`,
269
+ };
270
+
271
+ const result = scorePostCompactionDiscipline(prompt);
272
+
273
+ expect(result.score).toBe(0.0);
274
+ expect(result.message).toContain("Write");
275
+ });
276
+
277
+ test("scores 0.0 when first tool is Read", () => { // Read comes first, so the later swarm_status() does not rescue the score
278
+ const prompt: CompactionPrompt = {
279
+ content: `1. Read(file='src/index.ts')
280
+ 2. swarm_status()`,
281
+ };
282
+
283
+ const result = scorePostCompactionDiscipline(prompt);
284
+
285
+ expect(result.score).toBe(0.0);
286
+ expect(result.message).toContain("Read");
287
+ });
288
+
289
+ test("scores 0.0 when no tool calls mentioned", () => {
290
+ const prompt: CompactionPrompt = {
291
+ content: "Continue coordinating the epic",
292
+ };
293
+
294
+ const result = scorePostCompactionDiscipline(prompt);
295
+
296
+ expect(result.score).toBe(0.0);
297
+ expect(result.message).toContain("No tool");
298
+ });
299
+ });
@@ -0,0 +1,298 @@
1
+ /**
2
+ * Compaction Prompt Quality Scoring - Pure Functions
3
+ *
4
+ * Evaluates the quality of continuation prompts generated after context compaction.
5
+ * **Problem**: Post-compaction coordinators often "wake up" confused, forget their role,
6
+ * and start editing files instead of checking worker status.
7
+ *
8
+ * **Solution**: Score prompts on 5 dimensions that predict coordinator success:
9
+ *
10
+ * 1. **Epic ID Specificity (0.20)**: Real IDs (`mjkw...`) not placeholders (`<epic-id>`, `bd-xxx`)
11
+ * - Placeholders = coordinator can't check actual swarm status
12
+ *
13
+ * 2. **Actionability (0.20)**: Tool calls with real values (e.g., `swarm_status(epic_id='mjkw81rkq4c')`)
14
+ * - Generic instructions like "check status" don't work
15
+ *
16
+ * 3. **Coordinator Identity (0.25)**: ASCII header + strong mandates (NEVER/ALWAYS)
17
+ * - Visual + semantic cues reinforce role post-compaction
18
+ *
19
+ * 4. **Forbidden Tools Listed (0.15)**: Explicitly lists Edit, Write, swarmmail_reserve, git commit
20
+ * - Naming forbidden tools reduces violations
21
+ *
22
+ * 5. **Post-Compaction Discipline (0.20)**: First suggested tool is swarm_status or inbox (not Edit)
23
+ * - First tool sets the pattern - "check status" vs "dive into code"
24
+ *
25
+ * **Pure functions**: These can be tested without evalite. The evalite wrappers are in
26
+ * `evals/scorers/compaction-prompt-scorers.ts`.
27
+ *
28
+ * **Data source**: Captured from `captureCompactionEvent()` with `compaction_type: "prompt_generated"`.
29
+ * The payload includes the FULL prompt content (not truncated) for scoring.
30
+ *
31
+ * **Integration**: `compaction-prompt.eval.ts` uses these scorers to track prompt quality over time.
32
+ * Progressive gates enforce quality: bootstrap → stabilization → production.
33
+ *
34
+ * @module compaction-prompt-scoring
35
+ */
36
+
37
/**
 * Input accepted by every scorer in this module.
 *
 * Only the raw prompt text is carried; the scorers inspect it with regular
 * expressions and need no other metadata.
 */
export interface CompactionPrompt {
  /** Text of the continuation prompt to be scored. */
  content: string;
}
43
+
44
/**
 * Value returned by every scorer in this module.
 */
export interface ScorerResult {
  /** Numeric score; the scorers in this module return values between 0.0 and 1.0. */
  score: number;
  /** Human-readable explanation of why the score was assigned. */
  message: string;
}
51
+
52
// ====== Shared Regex Patterns ======

/**
 * Recognises a concrete epic/cell ID: the literal `mjkw` followed by at least
 * seven lowercase alphanumeric characters.
 *
 * NOTE(review): the `mjkw` prefix is hard-coded, so an ID with any other
 * prefix is not recognised — confirm that every real ID shares this prefix.
 */
export const REAL_EPIC_ID: RegExp = /mjkw[a-z0-9]{7,}/;

/**
 * Template tokens that indicate a value was never substituted into the prompt.
 * The angle-bracket forms are case-insensitive; `bd-xxx` is matched exactly.
 */
export const PLACEHOLDERS: RegExp[] = [
  /<epic-id>/i,
  /bd-xxx/,
  /<path>/i,
  /<project>/i,
];

/**
 * Detects a run of three or more box-drawing characters, as used to frame a
 * header. (Despite the name, these are Unicode box-drawing glyphs, not ASCII.)
 */
export const ASCII_BOX: RegExp = /[┌┐└┘─│]{3,}/;

/** Upper-case mandate words that count as strong language (case-sensitive, whole word). */
export const STRONG_LANGUAGE: RegExp[] = [
  /\bNEVER\b/,
  /\bALWAYS\b/,
  /\bNON-NEGOTIABLE\b/,
];
70
+
71
+ // ====== Pure Scoring Functions ======
72
+
73
/**
 * Rate how concrete the identifiers in a prompt are.
 *
 * Placeholder tokens are checked first: if any pattern in `PLACEHOLDERS`
 * matches, the prompt scores 0.0 even when a real epic ID is also present.
 * Otherwise the prompt scores 1.0 when `REAL_EPIC_ID` matches and 0.0 when
 * no ID is found.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 1.0 for a real ID with no placeholders, 0.0 otherwise; `message`
 *          names the first placeholder pattern that matched, if any.
 */
export function scoreEpicIdSpecificity(prompt: CompactionPrompt): ScorerResult {
  const text = prompt.content;

  // First placeholder pattern (in declaration order) that occurs in the text.
  const offending = PLACEHOLDERS.find((candidate) => candidate.test(text));
  if (offending !== undefined) {
    return { score: 0.0, message: `Found placeholder: ${offending.source}` };
  }

  return REAL_EPIC_ID.test(text)
    ? { score: 1.0, message: "Contains real epic ID" }
    : { score: 0.0, message: "No epic ID found" };
}
106
+
107
/**
 * Score actionability of tool calls.
 *
 * A prompt is actionable when it contains a concrete call the coordinator can
 * run as-is:
 * - `swarm_status(...)` whose `epic_id` argument is a real ID, or
 * - an argument-less `swarmmail_inbox()` call.
 *
 * Generic instructions such as "Check the status of workers" score 0.0.
 * A `swarm_status(...)` call whose arguments still contain a template
 * placeholder (`<epic-id>`, `<path>`, `<project>`, `bd-xxx`) also scores 0.0,
 * with a message that says so rather than claiming no tool call was found.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 1.0 if an actionable tool call is present, 0.0 otherwise.
 */
export function scoreActionability(prompt: CompactionPrompt): ScorerResult {
  const text = prompt.content;

  // Calls that can be executed verbatim.
  const actionableTools = [
    /swarm_status\([^)]*epic_id\s*=\s*['"]mjkw[a-z0-9]{7,}['"]/,
    /swarmmail_inbox\(\)/,
  ];

  if (actionableTools.some((pattern) => pattern.test(text))) {
    return {
      score: 1.0,
      message: "Contains actionable tool call with real values",
    };
  }

  // swarm_status(...) is present but its arguments are unfilled placeholders.
  // Covers the same four tokens as the module-level PLACEHOLDERS list
  // (keep the two in sync); previously `bd-xxx` and `<project>` were missed.
  const placeholderCalls = [
    /swarm_status\([^)]*<(?:epic-id|path|project)>/i,
    /swarm_status\([^)]*bd-xxx/,
  ];

  if (placeholderCalls.some((pattern) => pattern.test(text))) {
    return {
      score: 0.0,
      message: "Tool call has placeholders",
    };
  }

  return {
    score: 0.0,
    message: "No actionable tool calls found",
  };
}
149
+
150
/**
 * Rate how strongly a prompt reasserts the coordinator role.
 *
 * Two signals are checked, in order:
 * 1. A header: the content matches `ASCII_BOX` AND contains the phrase
 *    "YOU ARE THE COORDINATOR" or "COORDINATOR MODE" (case-insensitive).
 *    Both may appear anywhere in the content.
 * 2. Mandate wording: at least one `STRONG_LANGUAGE` pattern matches.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 0.0 without the header, 0.5 with the header but no mandate
 *          wording, 1.0 with both.
 */
export function scoreCoordinatorIdentity(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;
  const roleBanner = /(YOU ARE THE COORDINATOR|COORDINATOR MODE)/i;

  // Without both the box characters and the role banner nothing else matters.
  if (!ASCII_BOX.test(text) || !roleBanner.test(text)) {
    return { score: 0.0, message: "No ASCII header found" };
  }

  // Look for any one of the mandate words.
  let mandateFound = false;
  for (const mandate of STRONG_LANGUAGE) {
    if (mandate.test(text)) {
      mandateFound = true;
      break;
    }
  }

  return mandateFound
    ? { score: 1.0, message: "ASCII header + strong mandates present" }
    : { score: 0.5, message: "ASCII header present but weak language" };
}
194
+
195
/**
 * Rate how completely a prompt names the tools a coordinator must not call.
 *
 * Four names are looked for anywhere in the content:
 * - `Edit` (whole word)
 * - `Write` (whole word)
 * - `swarmmail_reserve`
 * - `git commit`
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns The fraction of the four names that appear (0.0, 0.25, 0.5, 0.75
 *          or 1.0), with a message reporting the count.
 */
export function scoreForbiddenToolsPresent(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;
  const required: RegExp[] = [
    /\bEdit\b/,
    /\bWrite\b/,
    /swarmmail_reserve/,
    /git commit/,
  ];

  // Count how many of the required names occur at least once.
  let hits = 0;
  for (const tool of required) {
    if (tool.test(text)) {
      hits += 1;
    }
  }

  if (hits === required.length) {
    return { score: 1.0, message: "All 4 forbidden tools listed" };
  }
  if (hits === 0) {
    return { score: 0.0, message: "No forbidden tools listed (0/4)" };
  }
  return {
    score: hits / required.length,
    message: `${hits}/4 forbidden tools listed`,
  };
}
245
+
246
/**
 * Score post-compaction discipline (first tool correctness).
 *
 * Validates that the FIRST suggested tool is correct.
 * Coordinators should check status FIRST, not edit files.
 *
 * Good first tools:
 * - swarm_status
 * - swarmmail_inbox
 *
 * Bad first tools:
 * - Edit
 * - Write
 * - Read (should check status first)
 *
 * Detection prefers call syntax (`Name(` with no space before the paren).
 * This matters because a well-formed prompt may name `Edit` / `Write` in a
 * "forbidden tools" list before its first real instruction; those bare
 * mentions must not be mistaken for the first tool call. Only when the
 * content contains no call-shaped occurrence does the scorer fall back to
 * the first bare-word mention of a known tool name.
 *
 * Tool names are matched case-insensitively.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
 */
export function scorePostCompactionDiscipline(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;

  // 1) First call-shaped occurrence, e.g. `swarm_status(` or `Edit(`.
  const callMatch = /\b(swarm_status|swarmmail_inbox|Edit|Write|Read)\(/i.exec(
    text,
  );
  // 2) Fallback: first bare mention, for prompts that name tools in prose only.
  const match =
    callMatch ||
    /\b(swarm_status|swarmmail_inbox|Edit|Write|Read)\b/i.exec(text);

  const toolName = match ? match[1] : undefined;

  if (!toolName) {
    return {
      score: 0.0,
      message: "No tool calls found",
    };
  }

  const firstTool = toolName.toLowerCase();

  if (firstTool === "swarm_status") {
    return {
      score: 1.0,
      message: "First tool is swarm_status (correct)",
    };
  }

  if (firstTool === "swarmmail_inbox") {
    return {
      score: 1.0,
      message: "First tool is inbox (correct)",
    };
  }

  return {
    score: 0.0,
    message: `First tool is ${toolName} (should be swarm_status or inbox)`,
  };
}