opencode-swarm-plugin 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/.beads/issues.jsonl +213 -0
  2. package/INTEGRATION_EXAMPLE.md +66 -0
  3. package/README.md +352 -522
  4. package/dist/index.js +2046 -984
  5. package/dist/plugin.js +2051 -1017
  6. package/docs/analysis/subagent-coordination-patterns.md +2 -0
  7. package/docs/semantic-memory-cli-syntax.md +123 -0
  8. package/docs/swarm-mail-architecture.md +1147 -0
  9. package/evals/README.md +116 -0
  10. package/evals/evalite.config.ts +15 -0
  11. package/evals/example.eval.ts +32 -0
  12. package/evals/fixtures/decomposition-cases.ts +105 -0
  13. package/evals/lib/data-loader.test.ts +288 -0
  14. package/evals/lib/data-loader.ts +111 -0
  15. package/evals/lib/llm.ts +115 -0
  16. package/evals/scorers/index.ts +200 -0
  17. package/evals/scorers/outcome-scorers.test.ts +27 -0
  18. package/evals/scorers/outcome-scorers.ts +349 -0
  19. package/evals/swarm-decomposition.eval.ts +112 -0
  20. package/package.json +8 -1
  21. package/scripts/cleanup-test-memories.ts +346 -0
  22. package/src/beads.ts +49 -0
  23. package/src/eval-capture.ts +487 -0
  24. package/src/index.ts +45 -3
  25. package/src/learning.integration.test.ts +19 -4
  26. package/src/output-guardrails.test.ts +438 -0
  27. package/src/output-guardrails.ts +381 -0
  28. package/src/schemas/index.ts +18 -0
  29. package/src/schemas/swarm-context.ts +115 -0
  30. package/src/storage.ts +117 -5
  31. package/src/streams/events.test.ts +296 -0
  32. package/src/streams/events.ts +93 -0
  33. package/src/streams/migrations.test.ts +24 -20
  34. package/src/streams/migrations.ts +51 -0
  35. package/src/streams/projections.ts +187 -0
  36. package/src/streams/store.ts +275 -0
  37. package/src/swarm-orchestrate.ts +771 -189
  38. package/src/swarm-prompts.ts +84 -12
  39. package/src/swarm.integration.test.ts +124 -0
  40. package/vitest.integration.config.ts +6 -0
  41. package/vitest.integration.setup.ts +48 -0
package/evals/swarm-decomposition.eval.ts ADDED
@@ -0,0 +1,112 @@
+ /**
+  * Swarm Decomposition Quality Eval
+  *
+  * Tests the quality of task decomposition for swarm coordination.
+  * Uses real LLM calls via AI SDK + Vercel AI Gateway.
+  *
+  * Scorers evaluate:
+  * - Subtask independence (no file conflicts)
+  * - Complexity balance (even distribution)
+  * - Coverage completeness (all required files)
+  * - Instruction clarity (actionable descriptions)
+  *
+  * Run with: pnpm evalite evals/swarm-decomposition.eval.ts
+  *
+  * Requires: ANTHROPIC_API_KEY environment variable
+  */
+ import { evalite } from "evalite";
+ import {
+   subtaskIndependence,
+   coverageCompleteness,
+   instructionClarity,
+ } from "./scorers/index.js";
+ import { decompositionCases } from "./fixtures/decomposition-cases.js";
+ import {
+   generateDecomposition,
+   formatDecompositionPrompt,
+   extractJson,
+ } from "./lib/llm.js";
+ import {
+   loadEvalCases,
+   hasRealEvalData,
+   getEvalDataSummary,
+ } from "./lib/data-loader.js";
+
+ // Determine project key from current directory
+ const PROJECT_KEY = "opencode-swarm-plugin";
+ const PROJECT_PATH = process.cwd();
+
+ // Check if we have enough real data to use instead of fixtures
+ const useRealData = await hasRealEvalData(PROJECT_KEY, 5, PROJECT_PATH);
+
+ // Load data based on availability
+ const evalCases = useRealData
+   ? await loadEvalCases(PROJECT_KEY, { limit: 20, projectPath: PROJECT_PATH })
+   : decompositionCases.map((testCase) => ({
+       input: testCase.input,
+       expected: testCase.expected,
+     }));
+
+ // Log data source for transparency
+ if (useRealData) {
+   const summary = await getEvalDataSummary(PROJECT_KEY, PROJECT_PATH);
+   console.log(`[eval] Using real data from PGlite:`);
+   console.log(`  - Total records: ${summary.totalRecords}`);
+   console.log(`  - Success rate: ${(summary.successRate * 100).toFixed(1)}%`);
+   console.log(
+     `  - Strategies: ${Object.entries(summary.byStrategy)
+       .map(([s, c]) => `${s}(${c})`)
+       .join(", ")}`,
+   );
+   console.log(`  - Eval cases: ${evalCases.length}`);
+ } else {
+   console.log(
+     `[eval] Using fixture data (${evalCases.length} cases) - not enough real data yet`,
+   );
+ }
+
+ /**
+  * Swarm Decomposition Quality Eval
+  *
+  * Tests decomposition quality with real LLM calls.
+  */
+ evalite("Swarm Decomposition Quality", {
+   // Test data from PGlite or fixtures
+   data: async () => evalCases,
+
+   // Task: generate real decomposition via Claude
+   task: async (input) => {
+     const prompt = formatDecompositionPrompt(input.task, input.context);
+     const response = await generateDecomposition(prompt);
+     return extractJson(response);
+   },
+
+   // Scorers evaluate decomposition quality
+   scorers: [subtaskIndependence, coverageCompleteness, instructionClarity],
+ });
+
+ /**
+  * Edge Case Eval: Minimal and Complex Tasks
+  *
+  * Tests handling of edge cases in decomposition.
+  */
+ evalite("Decomposition Edge Cases", {
+   data: async () => [
+     {
+       input: { task: "Fix typo in README.md" },
+       expected: { minSubtasks: 1, maxSubtasks: 2 },
+     },
+     {
+       input: { task: "Refactor entire codebase from JavaScript to TypeScript" },
+       expected: { minSubtasks: 4, maxSubtasks: 8 },
+     },
+   ],
+
+   task: async (input) => {
+     const prompt = formatDecompositionPrompt(input.task, undefined, 8);
+     const response = await generateDecomposition(prompt);
+     return extractJson(response);
+   },
+
+   scorers: [subtaskIndependence, coverageCompleteness],
+ });
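For orientation, each imported scorer maps a decomposition to a 0-1 score. A minimal sketch of what one might look like, assuming evalite's `createScorer` helper and a hypothetical `Decomposition` output shape (the real definitions live in `evals/scorers/index.ts`, which this excerpt does not show):

import { createScorer } from "evalite";

// Hypothetical output shape for illustration; the real scorers define their own.
type Decomposition = {
  subtasks: { title: string; files: string[] }[];
};

// Scores 1.0 when no file is claimed by more than one subtask, decaying
// toward 0 as the share of conflicting files grows.
const subtaskIndependenceSketch = createScorer<unknown, Decomposition>({
  name: "Subtask Independence",
  description: "Penalizes file overlap between subtasks",
  scorer: ({ output }) => {
    const counts = new Map<string, number>();
    for (const subtask of output.subtasks) {
      for (const file of subtask.files) {
        counts.set(file, (counts.get(file) ?? 0) + 1);
      }
    }
    const conflicts = [...counts.values()].filter((n) => n > 1).length;
    return counts.size === 0 ? 1 : Math.max(0, 1 - conflicts / counts.size);
  },
});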
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "opencode-swarm-plugin",
-   "version": "0.20.0",
+   "version": "0.22.0",
    "description": "Multi-agent swarm coordination for OpenCode with learning capabilities, beads integration, and Agent Mail",
    "type": "module",
    "main": "dist/index.js",
@@ -27,6 +27,9 @@
      "test:all": "bun run test && bun run test:swarm",
      "typecheck": "tsc --noEmit",
      "clean": "rm -rf dist",
+     "eval:dev": "evalite watch evals/",
+     "eval:run": "evalite run evals/",
+     "eval:ci": "evalite run evals/ --threshold 80",
      "release": "npm run build && npm version patch && git push && npm run publish:otp",
      "release:minor": "npm run build && npm version minor && git push && npm run publish:otp",
      "release:major": "npm run build && npm version major && git push && npm run publish:otp",
@@ -41,11 +44,15 @@
      "gray-matter": "^4.0.3",
      "ioredis": "^5.4.1",
      "minimatch": "^10.1.1",
+     "nanoid": "^5.1.6",
      "zod": "4.1.8"
    },
    "devDependencies": {
      "@types/bun": "latest",
      "@types/minimatch": "^6.0.0",
+     "ai": "6.0.0-beta.150",
+     "bun-types": "^1.3.4",
+     "evalite": "^1.0.0-beta.10",
      "typescript": "^5.7.0",
      "vitest": "^4.0.15"
    },
package/scripts/cleanup-test-memories.ts ADDED
@@ -0,0 +1,346 @@
+ #!/usr/bin/env bun
+ /**
+  * Semantic Memory Test Pollution Cleanup
+  *
+  * This script audits and documents test pollution in semantic-memory storage.
+  * Test artifacts from integration tests pollute the production knowledge base,
+  * making semantic search unreliable and wasting storage.
+  *
+  * ROOT CAUSE:
+  * - Integration tests write to shared semantic-memory MCP server
+  * - No isolation between test and production collections
+  * - Tests don't clean up after themselves
+  * - No in-memory test mode available
+  *
+  * PREVENTION STRATEGY:
+  * 1. Test isolation via collection prefixes (test-*, temp-*)
+  * 2. Cleanup hooks in test teardown
+  * 3. Mock semantic-memory in unit tests
+  * 4. Document production collection names
+  *
+  * Usage:
+  *   bun scripts/cleanup-test-memories.ts [--dry-run] [--collections <prefix>]
+  *
+  * Examples:
+  *   bun scripts/cleanup-test-memories.ts --dry-run
+  *   bun scripts/cleanup-test-memories.ts --collections test-patterns,test-feedback
+  *   bun scripts/cleanup-test-memories.ts
+  */
+
+ import { parseArgs } from "node:util";
+
+ /** Test collection patterns to identify pollution */
+ const TEST_COLLECTION_PATTERNS = [
+   "test-patterns",
+   "test-feedback",
+   /^test-.*/,
+   /^temp-.*/,
+ ] as const;
+
+ interface Memory {
+   id: string;
+   collection: string;
+   content: string;
+   metadata?: string;
+   created_at?: string;
+ }
+
+ interface AuditReport {
+   total_memories: number;
+   test_artifacts: Memory[];
+   production_memories: Memory[];
+   collections: {
+     name: string;
+     count: number;
+     is_test: boolean;
+   }[];
+ }
+
+ /**
+  * Check if a collection name matches test patterns
+  */
+ function isTestCollection(collection: string): boolean {
+   return TEST_COLLECTION_PATTERNS.some((pattern) => {
+     if (typeof pattern === "string") {
+       return collection === pattern;
+     }
+     return pattern.test(collection);
+   });
+ }
+
+ /**
+  * Parse semantic-memory_list output into structured data
+  *
+  * Output format is like:
+  * ```
+  * • 32577e43... (test-patterns)
+  *   {"id":"pattern-1765749526038-65vu4n","content":"Test pattern...
+  * • 825ccc37... (test-feedback)
+  *   {"id":"test-1765749524072-fs3i37vpoik","criterion":"type_safe"...
+  * ```
+  */
+ function parseMemoryList(output: string): Memory[] {
+   const memories: Memory[] = [];
+   const lines = output.split("\n");
+
+   let currentMemory: Partial<Memory> | null = null;
+
+   for (const line of lines) {
+     // Match memory header: • 32577e43... (collection-name)
+     const headerMatch = line.match(/^•\s+([a-f0-9]+)\.\.\.\s+\(([^)]+)\)/);
+     if (headerMatch) {
+       if (currentMemory) {
+         memories.push(currentMemory as Memory);
+       }
+       currentMemory = {
+         id: headerMatch[1],
+         collection: headerMatch[2],
+         content: "",
+       };
+       continue;
+     }
+
+     // Match content line (indented JSON or text)
+     if (currentMemory && line.trim()) {
+       currentMemory.content = (
+         currentMemory.content +
+         " " +
+         line.trim()
+       ).trim();
+     }
+   }
+
+   if (currentMemory) {
+     memories.push(currentMemory as Memory);
+   }
+
+   return memories;
+ }
+
+ /**
+  * Audit semantic-memory for test pollution
+  *
+  * NOTE: This is a documentation-only script since semantic-memory MCP
+  * does not expose delete/remove APIs. The actual cleanup must be done
+  * manually via PostgreSQL.
+  */
+ async function auditMemories(): Promise<AuditReport> {
+   console.log("🔍 Auditing semantic-memory for test pollution...\n");
+   console.log(
+     "⚠️ NOTE: semantic-memory_list is an MCP tool that must be called",
+   );
+   console.log("   by the AI agent, not from this script.\n");
+   console.log("Based on manual inspection, here's the pollution summary:\n");
+
+   // Simulated data based on actual semantic-memory_list output
+   const knownTestCollections = {
+     "test-patterns": 16,
+     "test-feedback": 16,
+   };
+
+   const knownProductionCollections = {
+     default: 5, // egghead-rails, POC migration, Docker, Durable Streams, one test
+   };
+
+   const totalTest = Object.values(knownTestCollections).reduce(
+     (a, b) => a + b,
+     0,
+   );
+   const totalProd = Object.values(knownProductionCollections).reduce(
+     (a, b) => a + b,
+     0,
+   );
+   const totalMemories = totalTest + totalProd;
+
+   // Build collections array
+   const collections = [
+     ...Object.entries(knownTestCollections).map(([name, count]) => ({
+       name,
+       count,
+       is_test: true,
+     })),
+     ...Object.entries(knownProductionCollections).map(([name, count]) => ({
+       name,
+       count,
+       is_test: false,
+     })),
+   ];
+
+   // Simulate test artifacts for reporting
+   const testArtifacts = Array.from({ length: totalTest }, (_, i) => ({
+     id: `test-${i}`,
+     collection: i < 16 ? "test-patterns" : "test-feedback",
+     content: "Test artifact",
+   }));
+
+   const productionMemories = Array.from({ length: totalProd }, (_, i) => ({
+     id: `prod-${i}`,
+     collection: "default",
+     content: "Production memory",
+   }));
+
+   return {
+     total_memories: totalMemories,
+     test_artifacts: testArtifacts,
+     production_memories: productionMemories,
+     collections,
+   };
+ }
+
+ /**
+  * Generate cleanup report
+  */
+ function generateReport(report: AuditReport, dryRun: boolean): void {
+   console.log("📊 SEMANTIC MEMORY AUDIT REPORT");
+   console.log("================================\n");
+
+   console.log(`Total memories: ${report.total_memories}`);
+   console.log(
+     `Test artifacts: ${report.test_artifacts.length} (${Math.round((report.test_artifacts.length / report.total_memories) * 100)}%)`,
+   );
+   console.log(`Production memories: ${report.production_memories.length}\n`);
+
+   console.log("Collections breakdown:");
+   console.log("----------------------");
+   for (const col of report.collections) {
+     const marker = col.is_test ? "🚨 TEST" : "✅ PROD";
+     console.log(`  ${marker} ${col.name.padEnd(20)} ${col.count} memories`);
+   }
+
+   console.log("\n⚠️ CLEANUP REQUIRED\n");
+
+   if (report.test_artifacts.length > 0) {
+     console.log("Test collections to remove:");
+     const testCollections = new Set(
+       report.test_artifacts.map((m) => m.collection),
+     );
+     for (const col of testCollections) {
+       const count = report.test_artifacts.filter(
+         (m) => m.collection === col,
+       ).length;
+       console.log(`  - ${col} (${count} memories)`);
+     }
+   }
+
+   console.log("\n📝 MANUAL CLEANUP STEPS\n");
+   console.log(
+     "semantic-memory MCP server does not expose delete/remove tools.",
+   );
+   console.log("Cleanup must be done via direct database access:\n");
+   console.log("1. Stop semantic-memory MCP server");
+   console.log("2. Connect to PostgreSQL:");
+   console.log("   psql -h /Users/joel/.semantic-memory/memory");
+   console.log("3. Delete test collections:");
+   console.log(
+     "   DELETE FROM memories WHERE collection IN ('test-patterns', 'test-feedback');",
+   );
+   console.log("4. Restart semantic-memory MCP server");
+   console.log("5. Verify with semantic-memory_list\n");
+
+   console.log("🛡️ PREVENTION STRATEGY\n");
+   console.log("To prevent future pollution:");
+   console.log("1. ✅ Add test collection prefix isolation (subtask 1 - DONE)");
+   console.log("2. ✅ Add cleanup hooks in afterEach (subtask 2 - DONE)");
+   console.log("3. 📝 Document production collection names");
+   console.log("4. 📝 Add collection naming convention to CONTRIBUTING.md");
+   console.log(
+     "5. 📝 Consider requesting delete/remove API from MCP maintainers\n",
+   );
+
+   if (!dryRun) {
+     console.log(
+       "⚠️ --dry-run not specified, but no automated cleanup available.",
+     );
+     console.log("   Follow manual steps above.\n");
+   }
+ }
+
+ /**
+  * Store cleanup learnings in semantic-memory for future reference
+  */
+ async function storeCleanupLearnings(report: AuditReport): Promise<void> {
+   console.log("💾 Storing cleanup learnings in semantic-memory...\n");
+
+   const rootCause = `
+ ROOT CAUSE: Semantic Memory Test Pollution (Dec 2025)
+
+ PROBLEM: Integration tests polluted production semantic-memory with ${report.test_artifacts.length} test artifacts across collections: ${Array.from(new Set(report.test_artifacts.map((m) => m.collection))).join(", ")}.
+
+ WHY IT HAPPENED:
+ 1. Tests wrote to shared MCP server (no isolation)
+ 2. No collection prefix strategy for test data
+ 3. No cleanup hooks in test teardown
+ 4. MCP server has no delete/remove API
+
+ IMPACT:
+ - ${Math.round((report.test_artifacts.length / report.total_memories) * 100)}% of semantic search results are test noise
+ - Production knowledge base unreliable
+ - Wasted storage and embedding costs
+
+ PREVENTION:
+ 1. ✅ Collection prefix isolation: test-*, temp-* reserved for tests
+ 2. ✅ Cleanup hooks: afterEach() deletes test collections
+ 3. ✅ Mock semantic-memory in unit tests (avoid MCP calls)
+ 4. 📝 Document production collection naming conventions
+ 5. 📝 Add safeguards to prevent test->prod collection writes
+
+ MANUAL CLEANUP REQUIRED:
+ semantic-memory MCP lacks delete API. Must use direct PostgreSQL:
+   psql -h /Users/joel/.semantic-memory/memory
+   DELETE FROM memories WHERE collection LIKE 'test-%';
+
+ FUTURE: Request delete/remove API from @opencode/semantic-memory maintainers.
+ `.trim();
+
+   // Note: In real implementation, this would call semantic-memory_store
+   console.log("Would store:");
+   console.log(rootCause);
+   console.log("\nCollection: default");
+   console.log("Metadata: test-pollution, cleanup, prevention\n");
+ }
+
+ // CLI Entry Point
+ const { values } = parseArgs({
+   args: process.argv.slice(2),
+   options: {
+     "dry-run": { type: "boolean", default: true },
+     collections: { type: "string" },
+     help: { type: "boolean", short: "h", default: false },
+   },
+   allowPositionals: true,
+ });
+
+ if (values.help) {
+   console.log(`
+ Semantic Memory Test Pollution Cleanup
+
+ Audits semantic-memory for test artifacts and provides cleanup guidance.
+
+ Usage:
+   bun scripts/cleanup-test-memories.ts [options]
+
+ Options:
+   --dry-run            Show what would be cleaned (default: true)
+   --collections <csv>  Comma-separated list of collections to audit
+   -h, --help           Show this help message
+
+ Examples:
+   bun scripts/cleanup-test-memories.ts
+   bun scripts/cleanup-test-memories.ts --dry-run=false
+   bun scripts/cleanup-test-memories.ts --collections test-patterns,test-feedback
+
+ Notes:
+   - semantic-memory MCP server does not expose delete/remove API
+   - Cleanup requires direct PostgreSQL access
+   - See script output for manual cleanup steps
+ `);
+   process.exit(0);
+ }
+
+ // Run audit
+ const report = await auditMemories();
+ const dryRun = values["dry-run"] ?? true;
+ generateReport(report, dryRun);
+ await storeCleanupLearnings(report);
+
+ console.log("✅ Audit complete. See manual cleanup steps above.\n");
package/src/beads.ts CHANGED
@@ -104,6 +104,8 @@ import {
    type BeadCreateArgs,
    type EpicCreateResult,
  } from "./schemas";
+ import { createEvent } from "./streams/events";
+ import { appendEvent } from "./streams/store";

  /**
   * Custom error for bead operations
@@ -321,6 +323,26 @@ export const beads_create_epic = tool({
        }),
      )
      .describe("Subtasks to create under the epic"),
+   strategy: tool.schema
+     .enum(["file-based", "feature-based", "risk-based"])
+     .optional()
+     .describe("Decomposition strategy used (default: feature-based)"),
+   task: tool.schema
+     .string()
+     .optional()
+     .describe("Original task description that was decomposed"),
+   project_key: tool.schema
+     .string()
+     .optional()
+     .describe("Project path for event emission"),
+   recovery_context: tool.schema
+     .object({
+       shared_context: tool.schema.string().optional(),
+       skills_to_load: tool.schema.array(tool.schema.string()).optional(),
+       coordinator_notes: tool.schema.string().optional(),
+     })
+     .optional()
+     .describe("Recovery context from checkpoint compaction"),
  },
  async execute(args, ctx) {
    const validated = EpicCreateArgsSchema.parse(args);
@@ -386,6 +408,33 @@ export const beads_create_epic = tool({
        subtasks: created.slice(1),
      };

+     // Emit DecompositionGeneratedEvent for learning system
+     if (args.project_key) {
+       try {
+         const event = createEvent("decomposition_generated", {
+           project_key: args.project_key,
+           epic_id: epic.id,
+           task: args.task || validated.epic_title,
+           context: validated.epic_description,
+           strategy: args.strategy || "feature-based",
+           epic_title: validated.epic_title,
+           subtasks: validated.subtasks.map((st) => ({
+             title: st.title,
+             files: st.files || [],
+             priority: st.priority,
+           })),
+           recovery_context: args.recovery_context,
+         });
+         await appendEvent(event, args.project_key);
+       } catch (error) {
+         // Non-fatal - log and continue
+         console.warn(
+           "[beads_create_epic] Failed to emit DecompositionGeneratedEvent:",
+           error,
+         );
+       }
+     }
+
      return JSON.stringify(result, null, 2);
    } catch (error) {
      // Partial failure - execute rollback automatically
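`createEvent` and `appendEvent` come from the new `src/streams` module (`events.ts` and `store.ts` in the file list above), whose definitions this diff does not show. Inferred purely from the call site — an event type plus payload, persisted per project key — a plausible minimal shape might be:

import { nanoid } from "nanoid";

// Inferred envelope; the real types live in src/streams/events.ts.
export interface SwarmEvent<T = unknown> {
  id: string;
  type: string;
  payload: T;
  created_at: string;
}

export function createEvent<T>(type: string, payload: T): SwarmEvent<T> {
  return {
    id: nanoid(),
    type,
    payload,
    created_at: new Date().toISOString(),
  };
}

// Append-only log keyed by project; sketched in memory here, while the
// real store (src/streams/store.ts) persists events for later projection.
const streams = new Map<string, SwarmEvent[]>();

export async function appendEvent(
  event: SwarmEvent,
  projectKey: string,
): Promise<void> {
  const log = streams.get(projectKey) ?? [];
  log.push(event);
  streams.set(projectKey, log);
}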