opencode-swarm-plugin 0.38.0 → 0.39.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +11 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +130 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +475 -0
  9. package/bin/swarm.ts +383 -0
  10. package/dist/compaction-hook.d.ts +1 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-prompt-scoring.d.ts +124 -0
  13. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  14. package/dist/eval-capture.d.ts +81 -1
  15. package/dist/eval-capture.d.ts.map +1 -1
  16. package/dist/eval-gates.d.ts +84 -0
  17. package/dist/eval-gates.d.ts.map +1 -0
  18. package/dist/eval-history.d.ts +117 -0
  19. package/dist/eval-history.d.ts.map +1 -0
  20. package/dist/eval-learning.d.ts +216 -0
  21. package/dist/eval-learning.d.ts.map +1 -0
  22. package/dist/index.d.ts +44 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +370 -13
  25. package/dist/plugin.js +203 -13
  26. package/dist/post-compaction-tracker.d.ts +133 -0
  27. package/dist/post-compaction-tracker.d.ts.map +1 -0
  28. package/dist/swarm-orchestrate.d.ts +23 -0
  29. package/dist/swarm-orchestrate.d.ts.map +1 -1
  30. package/dist/swarm-prompts.d.ts +25 -1
  31. package/dist/swarm-prompts.d.ts.map +1 -1
  32. package/dist/swarm.d.ts +4 -0
  33. package/dist/swarm.d.ts.map +1 -1
  34. package/evals/README.md +589 -105
  35. package/evals/compaction-prompt.eval.ts +149 -0
  36. package/evals/coordinator-behavior.eval.ts +8 -8
  37. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  38. package/evals/lib/compaction-loader.test.ts +248 -0
  39. package/evals/lib/compaction-loader.ts +320 -0
  40. package/evals/lib/data-loader.test.ts +345 -0
  41. package/evals/lib/data-loader.ts +107 -6
  42. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  43. package/evals/scorers/compaction-scorers.ts +13 -13
  44. package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
  45. package/evals/scorers/coordinator-discipline.ts +13 -13
  46. package/examples/plugin-wrapper-template.ts +117 -0
  47. package/package.json +7 -5
  48. package/scripts/migrate-unknown-sessions.ts +349 -0
  49. package/src/compaction-capture.integration.test.ts +257 -0
  50. package/src/compaction-hook.test.ts +42 -0
  51. package/src/compaction-hook.ts +81 -0
  52. package/src/compaction-prompt-scorers.test.ts +299 -0
  53. package/src/compaction-prompt-scoring.ts +298 -0
  54. package/src/eval-capture.test.ts +422 -0
  55. package/src/eval-capture.ts +94 -2
  56. package/src/eval-gates.test.ts +306 -0
  57. package/src/eval-gates.ts +218 -0
  58. package/src/eval-history.test.ts +508 -0
  59. package/src/eval-history.ts +214 -0
  60. package/src/eval-learning.test.ts +378 -0
  61. package/src/eval-learning.ts +360 -0
  62. package/src/index.ts +61 -1
  63. package/src/post-compaction-tracker.test.ts +251 -0
  64. package/src/post-compaction-tracker.ts +237 -0
  65. package/src/swarm-decompose.ts +2 -2
  66. package/src/swarm-orchestrate.ts +2 -2
  67. package/src/swarm-prompts.ts +2 -2
  68. package/src/swarm-review.ts +3 -3
  69. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -0,0 +1,298 @@
1
+ /**
2
+ * Compaction Prompt Quality Scoring - Pure Functions
3
+ *
4
+ * Evaluates the quality of continuation prompts generated after context compaction.
5
+ * **Problem**: Post-compaction coordinators often "wake up" confused, forget their role,
6
+ * and start editing files instead of checking worker status.
7
+ *
8
+ * **Solution**: Score prompts on 5 dimensions that predict coordinator success:
9
+ *
10
+ * 1. **Epic ID Specificity (0.20)**: Real IDs (`mjkw...`) not placeholders (`<epic-id>`, `bd-xxx`)
11
+ * - Placeholders = coordinator can't check actual swarm status
12
+ *
13
+ * 2. **Actionability (0.20)**: Tool calls with real values (e.g., `swarm_status(epic_id='mjkw81rkq4c')`)
14
+ * - Generic instructions like "check status" don't work
15
+ *
16
+ * 3. **Coordinator Identity (0.25)**: ASCII header + strong mandates (NEVER/ALWAYS)
17
+ * - Visual + semantic cues reinforce role post-compaction
18
+ *
19
+ * 4. **Forbidden Tools Listed (0.15)**: Explicitly lists Edit, Write, swarmmail_reserve, git commit
20
+ * - Naming forbidden tools reduces violations
21
+ *
22
+ * 5. **Post-Compaction Discipline (0.20)**: First suggested tool is swarm_status or inbox (not Edit)
23
+ * - First tool sets the pattern - "check status" vs "dive into code"
24
+ *
25
+ * **Pure functions**: These can be tested without evalite. The evalite wrappers are in
26
+ * `evals/scorers/compaction-prompt-scorers.ts`.
27
+ *
28
+ * **Data source**: Captured from `captureCompactionEvent()` with `compaction_type: "prompt_generated"`.
29
+ * The payload includes the FULL prompt content (not truncated) for scoring.
30
+ *
31
+ * **Integration**: `compaction-prompt.eval.ts` uses these scorers to track prompt quality over time.
32
+ * Progressive gates enforce quality: bootstrap → stabilization → production.
33
+ *
34
+ * @module compaction-prompt-scoring
35
+ */
36
+
37
/**
 * A continuation prompt produced after context compaction, in the shape the
 * scorers in this module consume.
 */
export interface CompactionPrompt {
  /** Prompt text that the scoring functions run their pattern checks against. */
  content: string;
}
43
+
44
/**
 * Outcome returned by every scoring function in this module.
 */
export interface ScorerResult {
  /** Numeric grade; the scorers in this module return values from 0.0 to 1.0. */
  score: number;
  /** Short human-readable explanation of why that grade was given. */
  message: string;
}
51
+
52
// ====== Shared Regex Patterns ======

/**
 * Recognizes a concrete epic/cell ID: the literal prefix `mjkw` followed by at
 * least seven lowercase letters or digits. The pattern is unanchored, so it
 * matches anywhere inside a longer string.
 *
 * NOTE(review): the `mjkw` prefix is hard-coded — confirm that every real ID
 * actually begins with it.
 */
export const REAL_EPIC_ID = /mjkw[a-z0-9]{7,}/;

/**
 * Template slots that signal a value was never filled in.
 * The angle-bracket forms match case-insensitively; `bd-xxx` is case-sensitive.
 */
export const PLACEHOLDERS = [/<epic-id>/i, /bd-xxx/, /<path>/i, /<project>/i];

/**
 * Detects a run of three or more consecutive box-drawing characters
 * (corners, horizontal and vertical bars), as used in banner-style headers.
 */
export const ASCII_BOX = /[┌┐└┘─│]{3,}/;

/**
 * Whole-word, case-sensitive mandate keywords that count as "strong" language.
 */
export const STRONG_LANGUAGE = [
  /\bNEVER\b/,
  /\bALWAYS\b/,
  /\bNON-NEGOTIABLE\b/,
];
70
+
71
+ // ====== Pure Scoring Functions ======
72
+
73
/**
 * Grade whether the prompt names a concrete epic instead of a template slot.
 *
 * Placeholders take precedence: if any pattern in `PLACEHOLDERS` matches
 * anywhere in the text, the prompt fails even when a real ID is also present.
 * Otherwise the prompt passes only if `REAL_EPIC_ID` matches somewhere.
 *
 * @param prompt - The compaction prompt to inspect.
 * @returns Score 1.0 when a real ID is present and no placeholder is;
 *   score 0.0 otherwise, with a message naming the first placeholder pattern
 *   that matched or stating that no ID was found.
 */
export function scoreEpicIdSpecificity(prompt: CompactionPrompt): ScorerResult {
  const text = prompt.content;

  // The first matching placeholder (in PLACEHOLDERS order) is the one reported.
  const leftover = PLACEHOLDERS.find((candidate) => candidate.test(text));
  if (leftover !== undefined) {
    return {
      score: 0.0,
      message: `Found placeholder: ${leftover.source}`,
    };
  }

  return REAL_EPIC_ID.test(text)
    ? { score: 1.0, message: "Contains real epic ID" }
    : { score: 0.0, message: "No epic ID found" };
}
106
+
107
/**
 * Grade whether the prompt contains a concrete, ready-to-run tool call.
 *
 * Two forms count as actionable:
 *  - `swarm_status(...)` whose argument list contains `epic_id` assigned a
 *    quoted ID starting with `mjkw` plus at least seven lowercase
 *    alphanumerics, e.g. `swarm_status(epic_id='mjkw81rkq4c', project_key='/path')`
 *  - a literal `swarmmail_inbox()` call
 *
 * Prose such as "Check the status of workers" does not count.
 *
 * @param prompt - The compaction prompt to inspect.
 * @returns Score 1.0 when either actionable form is present. Otherwise 0.0,
 *   with a message that distinguishes a `swarm_status(` call still containing
 *   `<epic-id>` or `<path>` from the absence of any actionable call.
 */
export function scoreActionability(prompt: CompactionPrompt): ScorerResult {
  const text = prompt.content;

  const concreteCalls = [
    /swarm_status\([^)]*epic_id\s*=\s*['"]mjkw[a-z0-9]{7,}['"]/,
    /swarmmail_inbox\(\)/,
  ];
  if (concreteCalls.some((call) => call.test(text))) {
    return {
      score: 1.0,
      message: "Contains actionable tool call with real values",
    };
  }

  // Still a failure, but worth reporting separately: the tool is referenced
  // with an unfilled template slot inside its argument list.
  const templatedCalls = [
    /swarm_status\([^)]*<epic-id>/,
    /swarm_status\([^)]*<path>/,
  ];
  const message = templatedCalls.some((call) => call.test(text))
    ? "Tool call has placeholders"
    : "No actionable tool calls found";

  return { score: 0.0, message };
}
149
+
150
/**
 * Grade how strongly the prompt re-establishes the coordinator role.
 *
 * Two signals are examined, in this order:
 *  1. A header: the text must match `ASCII_BOX` AND contain either
 *     "YOU ARE THE COORDINATOR" or "COORDINATOR MODE" (case-insensitive).
 *     Both are searched across the whole text; they need not be adjacent.
 *  2. Mandate wording: at least one pattern in `STRONG_LANGUAGE` must match.
 *
 * @param prompt - The compaction prompt to inspect.
 * @returns Score 0.0 when the header check fails (the message reads
 *   "No ASCII header found" whether the box or the title phrase is the
 *   missing part), 0.5 when the header is present without strong wording,
 *   and 1.0 when both signals are present.
 */
export function scoreCoordinatorIdentity(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;
  const titlePhrase = /(YOU ARE THE COORDINATOR|COORDINATOR MODE)/i;

  // Box characters are checked first; the title phrase only if the box exists.
  const headerMissing = !ASCII_BOX.test(text) || !titlePhrase.test(text);
  if (headerMissing) {
    return {
      score: 0.0,
      message: "No ASCII header found",
    };
  }

  for (const mandate of STRONG_LANGUAGE) {
    if (mandate.test(text)) {
      return {
        score: 1.0,
        message: "ASCII header + strong mandates present",
      };
    }
  }

  return {
    score: 0.5,
    message: "ASCII header present but weak language",
  };
}
194
+
195
/**
 * Grade how many of the coordinator's forbidden tools the prompt names.
 *
 * The four names looked for are:
 *  - `Edit` (whole word, case-sensitive)
 *  - `Write` (whole word, case-sensitive)
 *  - `swarmmail_reserve`
 *  - `git commit`
 *
 * Each name counts at most once no matter how often it appears.
 *
 * @param prompt - The compaction prompt to inspect.
 * @returns The fraction of the four names found (0.0, 0.25, 0.5, 0.75 or 1.0)
 *   together with a message stating the count.
 */
export function scoreForbiddenToolsPresent(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;
  const requiredMentions = [
    /\bEdit\b/,
    /\bWrite\b/,
    /swarmmail_reserve/,
    /git commit/,
  ];

  let hits = 0;
  for (const mention of requiredMentions) {
    if (mention.test(text)) {
      hits += 1;
    }
  }

  if (hits === requiredMentions.length) {
    return { score: 1.0, message: "All 4 forbidden tools listed" };
  }
  if (hits === 0) {
    return { score: 0.0, message: "No forbidden tools listed (0/4)" };
  }

  return {
    score: hits / requiredMentions.length,
    message: `${hits}/4 forbidden tools listed`,
  };
}
245
+
246
/**
 * Score post-compaction discipline (first tool correctness).
 *
 * Validates that the FIRST tool the prompt suggests is a status check rather
 * than a file operation.
 *
 * Good first tools: `swarm_status`, `swarmmail_inbox`.
 * Bad first tools: `Edit`, `Write`, `Read` (status should be checked first).
 *
 * How the "first tool" is located:
 *  1. Prefer the first call-shaped mention, i.e. a tool name immediately
 *     followed by `(`. This keeps a forbidden-tools list ("NEVER use Edit or
 *     Write") or ordinary prose ("Read this carefully") that appears before
 *     the first real call from being mistaken for the first suggested tool.
 *  2. If the prompt contains no call-shaped mention at all, fall back to the
 *     first bare whole-word mention of any of the five names.
 *
 * Tool names are matched case-insensitively in both passes.
 *
 * @param prompt - The compaction prompt to inspect.
 * @returns Score 1.0 if the first tool is `swarm_status` or `swarmmail_inbox`;
 *   0.0 if it is another tracked tool or no tracked tool is mentioned.
 */
export function scorePostCompactionDiscipline(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;

  // Pass 1: function-like mentions only (name directly followed by "(").
  const callShaped = /\b(swarm_status|swarmmail_inbox|Edit|Write|Read)\(/i;
  // Pass 2: any whole-word mention, used only when pass 1 finds nothing.
  const bareWord = /\b(swarm_status|swarmmail_inbox|Edit|Write|Read)\b/i;

  const match = text.match(callShaped) || text.match(bareWord);
  const rawName = match ? match[1] : undefined;

  if (rawName === undefined) {
    return {
      score: 0.0,
      message: "No tool calls found",
    };
  }

  const firstTool = rawName.toLowerCase();

  if (firstTool === "swarm_status") {
    return {
      score: 1.0,
      message: "First tool is swarm_status (correct)",
    };
  }

  if (firstTool === "swarmmail_inbox") {
    return {
      score: 1.0,
      message: "First tool is inbox (correct)",
    };
  }

  // Report the name with the casing it had in the prompt.
  return {
    score: 0.0,
    message: `First tool is ${rawName} (should be swarm_status or inbox)`,
  };
}