opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -0,0 +1,149 @@
1
+ /**
2
+ * Compaction Prompt Quality Evaluation
3
+ *
4
+ * Tests that continuation prompts generated after context compaction meet
5
+ * quality criteria for coordinator resumption:
6
+ *
7
+ * 1. Epic ID Specificity (20%) - Real IDs not placeholders
8
+ * 2. Actionability (20%) - Specific tool calls with real values
9
+ * 3. Coordinator Identity (25%) - ASCII header + strong mandates
10
+ * 4. Forbidden Tools (15%) - Lists forbidden tools by name
11
+ * 5. Post-Compaction Discipline (20%) - First tool is correct
12
+ *
13
+ * ## Why This Matters
14
+ *
15
+ * After compaction, coordinators lose context. The continuation prompt is
16
+ * their ONLY guide to resume. Bad prompts cause:
17
+ * - Coordinators editing files (should delegate to workers)
18
+ * - Generic "check status" instead of actual tool calls
19
+ * - Lost epic IDs (can't resume coordination)
20
+ *
21
+ * ## Test Strategy
22
+ *
23
+ * - 7 synthetic fixtures covering perfect, bad, and edge-case prompts
24
+ * - Each fixture tests specific failure modes
25
+ * - Five per-criterion scorers validate each quality dimension (no composite scorer is wired into this eval)
26
+ *
27
+ * Run with: bun run eval:compaction
28
+ */
29
+
30
+ import { evalite } from "evalite";
31
+ import { compactionPromptCases } from "./fixtures/compaction-prompt-cases.js";
32
+ import {
33
+ actionability,
34
+ coordinatorIdentity,
35
+ epicIdSpecificity,
36
+ forbiddenToolsPresent,
37
+ postCompactionDiscipline,
38
+ } from "./scorers/compaction-prompt-scorers.js";
39
+
40
+ /**
41
+ * Main eval: Compaction Prompt Quality
42
+ *
43
+ * Tests all cases from fixtures/compaction-prompt-cases.ts
44
+ */
45
+ evalite("Compaction Prompt Quality", {
46
+ data: async () =>
47
+ compactionPromptCases.map((testCase) => ({
48
+ input: testCase.prompt,
49
+ expected: testCase.expected,
50
+ })),
51
+
52
+ task: async (input) => {
53
+ // Identity task - fixture already has the prompt
54
+ // In real usage, this would call the LLM to generate the prompt
55
+ return JSON.stringify(input);
56
+ },
57
+
58
+ scorers: [
59
+ epicIdSpecificity,
60
+ actionability,
61
+ coordinatorIdentity,
62
+ forbiddenToolsPresent,
63
+ postCompactionDiscipline,
64
+ ],
65
+ });
66
+
67
+ /**
68
+ * Perfect Prompt Verification
69
+ *
70
+ * Ensures our "perfect" fixture actually scores 100%
71
+ */
72
+ evalite("Perfect Prompt Scores 100%", {
73
+ data: async () => [
74
+ {
75
+ input: compactionPromptCases[0].prompt, // First case is "perfect"
76
+ expected: {
77
+ hasRealEpicId: true,
78
+ isActionable: true,
79
+ hasCoordinatorIdentity: true,
80
+ listsForbiddenTools: true,
81
+ hasCorrectFirstTool: true,
82
+ },
83
+ },
84
+ ],
85
+
86
+ task: async (input) => JSON.stringify(input),
87
+
88
+ scorers: [
89
+ epicIdSpecificity,
90
+ actionability,
91
+ coordinatorIdentity,
92
+ forbiddenToolsPresent,
93
+ postCompactionDiscipline,
94
+ ],
95
+ });
96
+
97
+ /**
98
+ * Placeholder Detection
99
+ *
100
+ * Ensures we catch common placeholder patterns
101
+ */
102
+ evalite("Placeholder Detection", {
103
+ data: async () => [
104
+ {
105
+ input: compactionPromptCases[1].prompt, // Placeholder case
106
+ expected: { hasRealEpicId: false },
107
+ },
108
+ ],
109
+
110
+ task: async (input) => JSON.stringify(input),
111
+
112
+ scorers: [epicIdSpecificity],
113
+ });
114
+
115
+ /**
116
+ * Generic Instructions Detection
117
+ *
118
+ * Ensures we fail prompts with vague language instead of tool calls
119
+ */
120
+ evalite("Generic Instructions Fail", {
121
+ data: async () => [
122
+ {
123
+ input: compactionPromptCases[2].prompt, // Generic case
124
+ expected: { isActionable: false },
125
+ },
126
+ ],
127
+
128
+ task: async (input) => JSON.stringify(input),
129
+
130
+ scorers: [actionability],
131
+ });
132
+
133
+ /**
134
+ * First Tool Discipline
135
+ *
136
+ * Ensures first suggested tool is correct (swarm_status/inbox, not edit)
137
+ */
138
+ evalite("First Tool Discipline", {
139
+ data: async () => [
140
+ {
141
+ input: compactionPromptCases[5].prompt, // Wrong first tool
142
+ expected: { hasCorrectFirstTool: false },
143
+ },
144
+ ],
145
+
146
+ task: async (input) => JSON.stringify(input),
147
+
148
+ scorers: [postCompactionDiscipline],
149
+ });
@@ -187,20 +187,20 @@ export const coordinatorMindset = createScorer({
187
187
  export const overallCoordinatorBehavior = createScorer({
188
188
  name: "Overall Coordinator Behavior",
189
189
  description: "Composite score: does the LLM behave like a coordinator?",
190
- scorer: ({ output }) => {
191
- const toolsResult = mentionsCoordinatorTools.scorer({ output, expected: undefined });
192
- const avoidsResult = avoidsWorkerBehaviors.scorer({ output, expected: undefined });
193
- const mindsetResult = coordinatorMindset.scorer({ output, expected: undefined });
190
+ scorer: async ({ output, expected, input }) => {
191
+ const toolsResult = await mentionsCoordinatorTools({ output, expected, input });
192
+ const avoidsResult = await avoidsWorkerBehaviors({ output, expected, input });
193
+ const mindsetResult = await coordinatorMindset({ output, expected, input });
194
194
 
195
195
  // Weighted average: avoiding worker behavior is most important
196
196
  const score =
197
- toolsResult.score * 0.3 +
198
- avoidsResult.score * 0.4 +
199
- mindsetResult.score * 0.3;
197
+ (toolsResult.score ?? 0) * 0.3 +
198
+ (avoidsResult.score ?? 0) * 0.4 +
199
+ (mindsetResult.score ?? 0) * 0.3;
200
200
 
201
201
  return {
202
202
  score,
203
- message: `Tools: ${(toolsResult.score * 100).toFixed(0)}%, Avoids Worker: ${(avoidsResult.score * 100).toFixed(0)}%, Mindset: ${(mindsetResult.score * 100).toFixed(0)}%`,
203
+ message: `Tools: ${((toolsResult.score ?? 0) * 100).toFixed(0)}%, Avoids Worker: ${((avoidsResult.score ?? 0) * 100).toFixed(0)}%, Mindset: ${((mindsetResult.score ?? 0) * 100).toFixed(0)}%`,
204
204
  };
205
205
  },
206
206
  });
@@ -0,0 +1,305 @@
1
+ /**
2
+ * Test cases for compaction prompt quality evaluation
3
+ *
4
+ * Each case represents a continuation prompt (good or deliberately bad) that could be generated
5
+ * after context compaction. Tests validate that prompts have:
6
+ * - Real epic IDs (not placeholders)
7
+ * - Actionable tool calls with specific values
8
+ * - Strong coordinator identity
9
+ * - Explicit forbidden tools list
10
+ * - Correct first tool suggestion
11
+ */
12
+
13
+ import type { CompactionPrompt } from "../../src/compaction-prompt-scoring.js";
14
+
15
+ /**
16
+ * Compaction prompt test case structure
17
+ */
18
+ export interface CompactionPromptTestCase {
19
+ name: string;
20
+ description: string;
21
+ /**
22
+ * The generated continuation prompt
23
+ */
24
+ prompt: CompactionPrompt;
25
+ /**
26
+ * Expected scoring outcomes
27
+ */
28
+ expected: {
29
+ /**
30
+ * Should have real epic IDs (not placeholders)
31
+ */
32
+ hasRealEpicId: boolean;
33
+ /**
34
+ * Should have actionable tool calls
35
+ */
36
+ isActionable: boolean;
37
+ /**
38
+ * Should have strong coordinator identity
39
+ */
40
+ hasCoordinatorIdentity: boolean;
41
+ /**
42
+ * Should list forbidden tools by name
43
+ */
44
+ listsForbiddenTools: boolean;
45
+ /**
46
+ * First suggested tool should be correct
47
+ */
48
+ hasCorrectFirstTool: boolean;
49
+ };
50
+ }
51
+
52
+ export const compactionPromptCases: CompactionPromptTestCase[] = [
53
+ // ============================================================================
54
+ // PERFECT PROMPT: All criteria met
55
+ // ============================================================================
56
+ {
57
+ name: "Perfect coordinator resumption prompt",
58
+ description:
59
+ "Ideal continuation prompt with all quality criteria met: real IDs, actionable tools, strong identity, forbidden list, correct first tool",
60
+ prompt: {
61
+ content: `
62
+ ┌─────────────────────────────────────────────────────────────┐
63
+ │ 🐝 COORDINATOR RESUMPTION │
64
+ │ Context Compacted │
65
+ └─────────────────────────────────────────────────────────────┘
66
+
67
+ You are the COORDINATOR of swarm epic mjkweh2p4u5.
68
+
69
+ ## IMMEDIATE ACTIONS (Do These FIRST)
70
+
71
+ 1. swarm_status(epic_id="mjkweh2p4u5", project_key="/Users/joel/Code/myapp")
72
+ 2. swarmmail_inbox(limit=5)
73
+ 3. Review any completed work
74
+
75
+ ## FORBIDDEN TOOLS (NEVER Use These)
76
+
77
+ Coordinators do NOT edit code directly. These tools are FORBIDDEN:
78
+ - edit
79
+ - write
80
+ - bash (for file modifications)
81
+
82
+ Use swarm_spawn_subtask to delegate work to workers.
83
+
84
+ ## Your Role
85
+
86
+ You orchestrate. You do NOT implement. Spawn workers, monitor progress, unblock, ship.
87
+
88
+ ALWAYS spawn workers for file modifications.
89
+ NEVER edit files yourself.
90
+ NON-NEGOTIABLE: Check status and inbox before making decisions.
91
+ `,
92
+ },
93
+ expected: {
94
+ hasRealEpicId: true,
95
+ isActionable: true,
96
+ hasCoordinatorIdentity: true,
97
+ listsForbiddenTools: true,
98
+ hasCorrectFirstTool: true,
99
+ },
100
+ },
101
+
102
+ // ============================================================================
103
+ // BAD PROMPT: Placeholder epic ID
104
+ // ============================================================================
105
+ {
106
+ name: "Prompt with placeholder epic ID",
107
+ description:
108
+ "Contains placeholder <epic-id> instead of real ID - fails specificity check",
109
+ prompt: {
110
+ content: `
111
+ ## Coordinator Resumption
112
+
113
+ You are coordinating epic <epic-id>.
114
+
115
+ Check the status with:
116
+ 1. swarm_status(epic_id="<epic-id>", project_key="<path>")
117
+ 2. swarmmail_inbox()
118
+
119
+ Continue orchestrating the swarm.
120
+ `,
121
+ },
122
+ expected: {
123
+ hasRealEpicId: false, // <epic-id> is a placeholder
124
+ isActionable: false, // Has placeholders in tool calls
125
+ hasCoordinatorIdentity: false, // No ASCII header or strong language
126
+ listsForbiddenTools: false, // Doesn't list forbidden tools
127
+ hasCorrectFirstTool: true, // First tool is swarm_status (correct)
128
+ },
129
+ },
130
+
131
+ // ============================================================================
132
+ // BAD PROMPT: Generic instructions, no actionable tools
133
+ // ============================================================================
134
+ {
135
+ name: "Generic instructions without specific tools",
136
+ description:
137
+ "Vague language like 'check status' without actual tool calls - fails actionability",
138
+ prompt: {
139
+ content: `
140
+ You were coordinating a swarm before compaction.
141
+
142
+ To resume:
143
+ - Check the status of workers
144
+ - Read your messages
145
+ - Continue where you left off
146
+
147
+ Remember, you're the coordinator. Keep the work moving forward.
148
+ `,
149
+ },
150
+ expected: {
151
+ hasRealEpicId: false, // No epic ID at all
152
+ isActionable: false, // No specific tool calls
153
+ hasCoordinatorIdentity: false, // No strong identity reinforcement
154
+ listsForbiddenTools: false, // No forbidden tools list
155
+ hasCorrectFirstTool: false, // No first tool specified
156
+ },
157
+ },
158
+
159
+ // ============================================================================
160
+ // BAD PROMPT: Weak coordinator identity
161
+ // ============================================================================
162
+ {
163
+ name: "Weak coordinator identity",
164
+ description:
165
+ "Has real ID and tools but lacks strong identity reinforcement - fails coordinator identity check",
166
+ prompt: {
167
+ content: `
168
+ ## Swarm Resumption
169
+
170
+ Epic ID: mjkweh9x2a1
171
+ Project: /Users/joel/Code/myapp
172
+
173
+ You can check status with:
174
+ swarm_status(epic_id="mjkweh9x2a1", project_key="/Users/joel/Code/myapp")
175
+
176
+ And read messages:
177
+ swarmmail_inbox(limit=5)
178
+
179
+ Please continue coordinating.
180
+ `,
181
+ },
182
+ expected: {
183
+ hasRealEpicId: true, // Has real ID
184
+ isActionable: true, // Has specific tool calls
185
+ hasCoordinatorIdentity: false, // No ASCII header, no NEVER/ALWAYS/NON-NEGOTIABLE
186
+ listsForbiddenTools: false, // No forbidden tools list
187
+ hasCorrectFirstTool: true, // First tool is swarm_status
188
+ },
189
+ },
190
+
191
+ // ============================================================================
192
+ // BAD PROMPT: Missing forbidden tools list
193
+ // ============================================================================
194
+ {
195
+ name: "Missing forbidden tools list",
196
+ description:
197
+ "Good prompt but doesn't explicitly list forbidden tools - coordinators need this reminder",
198
+ prompt: {
199
+ content: `
200
+ ┌─────────────────────────────────────────────────────────────┐
201
+ │ 🐝 COORDINATOR RESUMPTION │
202
+ └─────────────────────────────────────────────────────────────┘
203
+
204
+ You are the COORDINATOR of epic mjkweh3k8p2.
205
+
206
+ ## IMMEDIATE ACTIONS
207
+
208
+ 1. swarm_status(epic_id="mjkweh3k8p2", project_key="/Users/joel/Code/myapp")
209
+ 2. swarmmail_inbox(limit=5)
210
+
211
+ ## Your Role
212
+
213
+ ALWAYS delegate to workers.
214
+ NEVER edit files directly.
215
+
216
+ Coordinators orchestrate, workers implement.
217
+ `,
218
+ },
219
+ expected: {
220
+ hasRealEpicId: true,
221
+ isActionable: true,
222
+ hasCoordinatorIdentity: true, // Has ASCII + NEVER/ALWAYS
223
+ listsForbiddenTools: false, // Doesn't list "edit", "write", "bash" by name
224
+ hasCorrectFirstTool: true,
225
+ },
226
+ },
227
+
228
+ // ============================================================================
229
+ // BAD PROMPT: Wrong first tool (edit instead of swarm_status)
230
+ // ============================================================================
231
+ {
232
+ name: "Wrong first tool suggestion",
233
+ description:
234
+ "Suggests edit/write as first action - coordinator discipline failure",
235
+ prompt: {
236
+ content: `
237
+ ┌─────────────────────────────────────────────────────────────┐
238
+ │ 🐝 COORDINATOR RESUMPTION │
239
+ └─────────────────────────────────────────────────────────────┘
240
+
241
+ You are the COORDINATOR of epic mjkweh7q9n4.
242
+
243
+ ## IMMEDIATE ACTIONS
244
+
245
+ 1. edit(filePath="/src/app.ts", oldString="...", newString="...")
246
+ 2. swarm_status(epic_id="mjkweh7q9n4", project_key="/Users/joel/Code/myapp")
247
+
248
+ ## FORBIDDEN TOOLS
249
+ - edit
250
+ - write
251
+ - bash (for file mods)
252
+
253
+ NEVER edit files yourself.
254
+ ALWAYS delegate to workers.
255
+ `,
256
+ },
257
+ expected: {
258
+ hasRealEpicId: true,
259
+ isActionable: true,
260
+ hasCoordinatorIdentity: true,
261
+ listsForbiddenTools: true,
262
+ hasCorrectFirstTool: false, // First tool is edit, should be swarm_status/inbox
263
+ },
264
+ },
265
+
266
+ // ============================================================================
267
+ // EDGE CASE: Multiple epics mentioned
268
+ // ============================================================================
269
+ {
270
+ name: "Multiple epic IDs in prompt",
271
+ description:
272
+ "Prompt references multiple epics - should still pass if at least one is real",
273
+ prompt: {
274
+ content: `
275
+ ┌─────────────────────────────────────────────────────────────┐
276
+ │ 🐝 COORDINATOR RESUMPTION │
277
+ └─────────────────────────────────────────────────────────────┘
278
+
279
+ You are coordinating epics:
280
+ - mjkweh5t2x8 (in progress)
281
+ - mjkweh6u3y9 (blocked)
282
+
283
+ ## IMMEDIATE ACTIONS
284
+
285
+ 1. swarm_status(epic_id="mjkweh5t2x8", project_key="/Users/joel/Code/myapp")
286
+ 2. swarmmail_inbox(limit=5)
287
+
288
+ ## FORBIDDEN TOOLS
289
+ - edit
290
+ - write
291
+ - bash
292
+
293
+ ALWAYS check status first.
294
+ NEVER edit files directly.
295
+ `,
296
+ },
297
+ expected: {
298
+ hasRealEpicId: true, // Has real IDs
299
+ isActionable: true,
300
+ hasCoordinatorIdentity: true,
301
+ listsForbiddenTools: true,
302
+ hasCorrectFirstTool: true,
303
+ },
304
+ },
305
+ ];