opencode-swarm-plugin 0.37.0 → 0.39.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +20 -5
- package/.hive/memories.jsonl +35 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/.turbo/turbo-build.log +4 -4
- package/.turbo/turbo-test.log +319 -319
- package/CHANGELOG.md +258 -0
- package/README.md +50 -0
- package/bin/swarm.test.ts +475 -0
- package/bin/swarm.ts +385 -208
- package/dist/compaction-hook.d.ts +1 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +81 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/hive.d.ts +59 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +87 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +823 -131
- package/dist/plugin.js +655 -131
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/swarm-decompose.d.ts +30 -0
- package/dist/swarm-decompose.d.ts.map +1 -1
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +19 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +595 -94
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
- package/evals/scorers/coordinator-discipline.ts +13 -13
- package/examples/plugin-wrapper-template.ts +177 -8
- package/package.json +7 -2
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +139 -2
- package/src/compaction-hook.ts +113 -2
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +422 -0
- package/src/eval-capture.ts +94 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/index.ts +61 -1
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.test.ts +40 -47
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.test.ts +270 -7
- package/src/swarm-orchestrate.ts +100 -13
- package/src/swarm-prompts.test.ts +121 -0
- package/src/swarm-prompts.ts +297 -4
- package/src/swarm-research.integration.test.ts +157 -0
- package/src/swarm-review.ts +3 -3
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compaction Prompt Quality Scoring - Pure Functions
|
|
3
|
+
*
|
|
4
|
+
* Evaluates the quality of continuation prompts generated after context compaction.
|
|
5
|
+
* **Problem**: Post-compaction coordinators often "wake up" confused, forget their role,
|
|
6
|
+
* and start editing files instead of checking worker status.
|
|
7
|
+
*
|
|
8
|
+
* **Solution**: Score prompts on 5 dimensions that predict coordinator success:
|
|
9
|
+
*
|
|
10
|
+
* 1. **Epic ID Specificity (0.20)**: Real IDs (`mjkw...`) not placeholders (`<epic-id>`, `bd-xxx`)
|
|
11
|
+
* - Placeholders = coordinator can't check actual swarm status
|
|
12
|
+
*
|
|
13
|
+
* 2. **Actionability (0.20)**: Tool calls with real values (e.g., `swarm_status(epic_id='mjkw81rkq4c')`)
|
|
14
|
+
* - Generic instructions like "check status" don't work
|
|
15
|
+
*
|
|
16
|
+
* 3. **Coordinator Identity (0.25)**: ASCII header + strong mandates (NEVER/ALWAYS)
|
|
17
|
+
* - Visual + semantic cues reinforce role post-compaction
|
|
18
|
+
*
|
|
19
|
+
* 4. **Forbidden Tools Listed (0.15)**: Explicitly lists Edit, Write, swarmmail_reserve, git commit
|
|
20
|
+
* - Naming forbidden tools reduces violations
|
|
21
|
+
*
|
|
22
|
+
* 5. **Post-Compaction Discipline (0.20)**: First suggested tool is swarm_status or inbox (not Edit)
|
|
23
|
+
* - First tool sets the pattern - "check status" vs "dive into code"
|
|
24
|
+
*
|
|
25
|
+
* **Pure functions**: These can be tested without evalite. The evalite wrappers are in
|
|
26
|
+
* `evals/scorers/compaction-prompt-scorers.ts`.
|
|
27
|
+
*
|
|
28
|
+
* **Data source**: Captured from `captureCompactionEvent()` with `compaction_type: "prompt_generated"`.
|
|
29
|
+
* The payload includes the FULL prompt content (not truncated) for scoring.
|
|
30
|
+
*
|
|
31
|
+
* **Integration**: `compaction-prompt.eval.ts` uses these scorers to track prompt quality over time.
|
|
32
|
+
* Progressive gates enforce quality: bootstrap → stabilization → production.
|
|
33
|
+
*
|
|
34
|
+
* @module compaction-prompt-scoring
|
|
35
|
+
*/
|
|
36
|
+
|
|
37
|
+
/**
 * A continuation prompt produced by LLM generation after context compaction.
 *
 * This is the single input type accepted by every scoring function in this module.
 */
export interface CompactionPrompt {
  /** Prompt text; the scorers run their pattern checks against this string. */
  content: string;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Outcome returned by each scoring function in this module.
 */
export interface ScorerResult {
  /** Numeric score; the scorers in this module return values from 0.0 to 1.0. */
  score: number;
  /** Short human-readable explanation of why that score was given. */
  message: string;
}
|
|
51
|
+
|
|
52
|
+
// ====== Shared Regex Patterns ======
|
|
53
|
+
|
|
54
|
+
/**
 * Matches real epic/cell IDs (mjkw prefix + 7+ base36 chars).
 *
 * The pattern has no `g` flag, so `.test()` keeps no `lastIndex` state between
 * calls; keep it that way, because the scorers reuse this shared instance.
 *
 * NOTE(review): the literal `mjkw` prefix is hard-coded. Confirm that every
 * real ID starts with it, otherwise valid IDs will be scored as missing.
 */
export const REAL_EPIC_ID = /mjkw[a-z0-9]{7,}/;

/**
 * Matches common placeholder patterns.
 *
 * Every pattern is case-insensitive so that upper-case variants such as
 * `<EPIC-ID>` or `bd-XXX` are detected as well.
 */
export const PLACEHOLDERS = [
  /<epic-id>/i,
  /bd-xxx/i,
  /<path>/i,
  /<project>/i,
];

/** Matches a run of 3 or more box-drawing characters (for headers) */
export const ASCII_BOX = /[┌┐└┘─│]{3,}/;

/** Matches strong mandate language (case-sensitive: upper-case words only) */
export const STRONG_LANGUAGE = [/\bNEVER\b/, /\bALWAYS\b/, /\bNON-NEGOTIABLE\b/];
|
|
70
|
+
|
|
71
|
+
// ====== Pure Scoring Functions ======
|
|
72
|
+
|
|
73
|
+
/**
 * Score whether the prompt carries a real epic ID rather than a template stub.
 *
 * The placeholder check runs first, so a prompt containing any entry of
 * `PLACEHOLDERS` scores 0.0 even when a real ID is present elsewhere in it.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 0.0 with the offending pattern source when a placeholder is found,
 *   1.0 when `REAL_EPIC_ID` matches, and 0.0 when no ID is found at all.
 */
export function scoreEpicIdSpecificity(prompt: CompactionPrompt): ScorerResult {
  const text = prompt.content;

  // First placeholder (in declaration order) wins and is reported by its source.
  const placeholder = PLACEHOLDERS.find((candidate) => candidate.test(text));
  if (placeholder !== undefined) {
    return { score: 0.0, message: `Found placeholder: ${placeholder.source}` };
  }

  return REAL_EPIC_ID.test(text)
    ? { score: 1.0, message: "Contains real epic ID" }
    : { score: 0.0, message: "No epic ID found" };
}
|
|
106
|
+
|
|
107
|
+
/**
 * Score whether the prompt contains a concrete, ready-to-run tool call.
 *
 * Two forms count as actionable:
 * - `swarm_status(...)` whose `epic_id` argument is a quoted `mjkw…` ID
 * - `swarmmail_inbox()` written with empty parentheses
 *
 * Prose such as "Check the status of workers" does not count.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 1.0 when an actionable call is present; otherwise 0.0, with a
 *   message that distinguishes a `swarm_status` call carrying `<epic-id>` or
 *   `<path>` from no actionable call at all.
 */
export function scoreActionability(prompt: CompactionPrompt): ScorerResult {
  const text = prompt.content;

  const hasRealStatusCall =
    /swarm_status\([^)]*epic_id\s*=\s*['"]mjkw[a-z0-9]{7,}['"]/.test(text);
  const hasInboxCall = /swarmmail_inbox\(\)/.test(text);

  if (hasRealStatusCall || hasInboxCall) {
    return {
      score: 1.0,
      message: "Contains actionable tool call with real values",
    };
  }

  // Both remaining outcomes score zero; only the diagnostic message differs.
  const hasPlaceholderCall = [
    /swarm_status\([^)]*<epic-id>/,
    /swarm_status\([^)]*<path>/,
  ].some((stub) => stub.test(text));

  return {
    score: 0.0,
    message: hasPlaceholderCall
      ? "Tool call has placeholders"
      : "No actionable tool calls found",
  };
}
|
|
149
|
+
|
|
150
|
+
/**
 * Score how strongly the prompt re-establishes the coordinator role.
 *
 * Two signals are checked:
 * 1. A visual header: `ASCII_BOX` matches AND the text contains
 *    "YOU ARE THE COORDINATOR" or "COORDINATOR MODE" (case-insensitive).
 *    The two are tested independently, so they may appear anywhere in the prompt.
 * 2. Mandate wording: any pattern in `STRONG_LANGUAGE` matches.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 0.0 without the header, 0.5 with the header but no mandate wording,
 *   1.0 when both are present.
 */
export function scoreCoordinatorIdentity(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;

  // The role phrase is only looked for once box-drawing characters are confirmed.
  const boxDrawn = ASCII_BOX.test(text);
  const headerPresent =
    boxDrawn && /(YOU ARE THE COORDINATOR|COORDINATOR MODE)/i.test(text);

  if (!headerPresent) {
    return { score: 0.0, message: "No ASCII header found" };
  }

  for (const mandate of STRONG_LANGUAGE) {
    if (mandate.test(text)) {
      return { score: 1.0, message: "ASCII header + strong mandates present" };
    }
  }

  return { score: 0.5, message: "ASCII header present but weak language" };
}
|
|
194
|
+
|
|
195
|
+
/**
 * Score how many of the forbidden tools the prompt mentions by name.
 *
 * The four names looked for are:
 * - `Edit` (whole word, case-sensitive)
 * - `Write` (whole word, case-sensitive)
 * - `swarmmail_reserve`
 * - `git commit`
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns the fraction of the four names found (0.0, 0.25, 0.5, 0.75 or 1.0)
 *   together with a message stating the count.
 */
export function scoreForbiddenToolsPresent(
  prompt: CompactionPrompt,
): ScorerResult {
  const text = prompt.content;
  const required = [/\bEdit\b/, /\bWrite\b/, /swarmmail_reserve/, /git commit/];

  let mentioned = 0;
  for (const tool of required) {
    if (tool.test(text)) {
      mentioned += 1;
    }
  }

  if (mentioned === required.length) {
    return { score: 1.0, message: "All 4 forbidden tools listed" };
  }

  if (mentioned === 0) {
    return { score: 0.0, message: "No forbidden tools listed (0/4)" };
  }

  return {
    score: mentioned / required.length,
    message: `${mentioned}/4 forbidden tools listed`,
  };
}
|
|
245
|
+
|
|
246
|
+
/**
 * Score whether the first tool named in the prompt is a status check.
 *
 * The earliest whole-word occurrence of `swarm_status`, `swarmmail_inbox`,
 * `Edit`, `Write` or `Read` is taken as the "first tool". Matching is
 * case-insensitive and does not require parentheses after the name.
 *
 * NOTE(review): because of the case-insensitive match, ordinary prose words
 * such as "read" or "edit" appearing before the first real tool call are
 * treated as the first tool. Confirm whether that is intended.
 *
 * @param prompt - Prompt whose `content` is inspected.
 * @returns 1.0 when the first tool is `swarm_status` or `swarmmail_inbox`,
 *   0.0 when it is any other listed tool or when no listed tool is mentioned.
 */
export function scorePostCompactionDiscipline(
  prompt: CompactionPrompt,
): ScorerResult {
  const found = /\b(swarm_status|swarmmail_inbox|Edit|Write|Read)\b/i.exec(
    prompt.content,
  );

  if (found === null) {
    return { score: 0.0, message: "No tool calls found" };
  }

  // Keep the text exactly as written for the message; compare in lower case.
  const asWritten = found[1];

  switch (asWritten.toLowerCase()) {
    case "swarm_status":
      return { score: 1.0, message: "First tool is swarm_status (correct)" };
    case "swarmmail_inbox":
      return { score: 1.0, message: "First tool is inbox (correct)" };
    default:
      return {
        score: 0.0,
        message: `First tool is ${asWritten} (should be swarm_status or inbox)`,
      };
  }
}
|