npm - opencode-swarm-plugin - Version diff - 0.40.0 → 0.42.1 - Mend

opencode-swarm-plugin 0.40.0 → 0.42.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (61)

package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
package/.hive/analysis/session-data-quality-audit.md +320 -0
package/.hive/eval-results.json +481 -24
package/.hive/issues.jsonl +67 -16
package/.hive/memories.jsonl +159 -1
package/.opencode/eval-history.jsonl +315 -0
package/.turbo/turbo-build.log +5 -5
package/CHANGELOG.md +165 -0
package/README.md +2 -0
package/SCORER-ANALYSIS.md +598 -0
package/bin/eval-gate.test.ts +158 -0
package/bin/eval-gate.ts +74 -0
package/bin/swarm.serve.test.ts +46 -0
package/bin/swarm.test.ts +661 -732
package/bin/swarm.ts +335 -0
package/dist/compaction-hook.d.ts +7 -5
package/dist/compaction-hook.d.ts.map +1 -1
package/dist/compaction-prompt-scoring.d.ts +1 -0
package/dist/compaction-prompt-scoring.d.ts.map +1 -1
package/dist/eval-runner.d.ts +134 -0
package/dist/eval-runner.d.ts.map +1 -0
package/dist/hive.d.ts.map +1 -1
package/dist/index.d.ts +29 -0
package/dist/index.d.ts.map +1 -1
package/dist/index.js +99741 -58858
package/dist/memory-tools.d.ts +70 -2
package/dist/memory-tools.d.ts.map +1 -1
package/dist/memory.d.ts +37 -0
package/dist/memory.d.ts.map +1 -1
package/dist/observability-tools.d.ts +64 -0
package/dist/observability-tools.d.ts.map +1 -1
package/dist/plugin.js +99356 -58318
package/dist/swarm-orchestrate.d.ts.map +1 -1
package/dist/swarm-prompts.d.ts +32 -1
package/dist/swarm-prompts.d.ts.map +1 -1
package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
package/evals/ARCHITECTURE.md +1189 -0
package/evals/example.eval.ts +3 -4
package/evals/fixtures/compaction-prompt-cases.ts +6 -0
package/evals/scorers/coordinator-discipline.evalite-test.ts +1 -162
package/evals/scorers/coordinator-discipline.ts +0 -323
package/evals/swarm-decomposition.eval.ts +4 -2
package/package.json +4 -3
package/src/compaction-prompt-scorers.test.ts +185 -9
package/src/compaction-prompt-scoring.ts +7 -5
package/src/eval-runner.test.ts +128 -1
package/src/eval-runner.ts +46 -0
package/src/hive.ts +43 -42
package/src/memory-tools.test.ts +84 -0
package/src/memory-tools.ts +68 -3
package/src/memory.test.ts +2 -112
package/src/memory.ts +88 -49
package/src/observability-tools.test.ts +13 -0
package/src/observability-tools.ts +277 -0
package/src/swarm-orchestrate.test.ts +162 -0
package/src/swarm-orchestrate.ts +7 -5
package/src/swarm-prompts.test.ts +168 -4
package/src/swarm-prompts.ts +228 -7
package/.env +0 -2
package/.turbo/turbo-test.log +0 -481
package/.turbo/turbo-typecheck.log +0 -1

package/src/compaction-prompt-scorers.test.ts CHANGED Viewed

@@ -3,6 +3,15 @@
  *
  * TDD approach - tests written FIRST to define scorer behavior
  * Tests the PURE scoring functions (not evalite wrappers)
+ *
+ * **Case-Sensitivity Verification**:
+ * All tool name regexes MUST be case-insensitive (/i flag) because:
+ * - LLMs generate inconsistent casing (Edit vs edit, Read vs read)
+ * - Fixtures contain mixed case examples
+ * - Scoring must be robust to case variations
+ *
+ * Fixed in commit adding /i flags to Edit, Write, bash patterns.
+ * Tests added to prevent regression.
  */
 import { describe, expect, test } from "bun:test";
@@ -15,6 +24,109 @@ import {
 	scorePostCompactionDiscipline,
 } from "./compaction-prompt-scoring.js";
+describe("Case-Insensitive Tool Detection (Regression Prevention)", () => {
+	test("all scorers handle mixed-case tool names correctly", () => {
+		// Real-world example with mixed casing from LLM output
+		const prompt: CompactionPrompt = {
+			content: `┌─────────────────────────────────────────┐
+│     YOU ARE THE COORDINATOR             │
+└─────────────────────────────────────────┘
+You are coordinating epic mjkw81rkq4c.
+## IMMEDIATE ACTIONS
+1. swarm_status(epic_id='mjkw81rkq4c', project_key='/path')
+2. swarmmail_inbox()
+## FORBIDDEN TOOLS
+NEVER use these tools - delegate to workers:
+- edit (file modifications)
+- write (file creation)
+- BASH (shell commands for file mods)
+- swarmmail_reserve (only workers)
+- git commit (workers handle)
+ALWAYS spawn workers for code changes.`,
+		};
+		// Epic ID detection should work
+		const epicResult = scoreEpicIdSpecificity(prompt);
+		expect(epicResult.score).toBe(1.0);
+		// Actionability should detect swarm_status
+		const actionResult = scoreActionability(prompt);
+		expect(actionResult.score).toBe(1.0);
+		// Coordinator identity should detect ASCII + NEVER/ALWAYS
+		const identityResult = scoreCoordinatorIdentity(prompt);
+		expect(identityResult.score).toBe(1.0);
+		// Forbidden tools should detect all 5 despite mixed case
+		const forbiddenResult = scoreForbiddenToolsPresent(prompt);
+		expect(forbiddenResult.score).toBe(1.0);
+		expect(forbiddenResult.message).toContain("All 5");
+		// Post-compaction discipline should detect swarm_status as first tool
+		const disciplineResult = scorePostCompactionDiscipline(prompt);
+		expect(disciplineResult.score).toBe(1.0);
+	});
+	test("forbidden tools scorer detects lowercase tool names", () => {
+		// Previously failed before /i flags were added
+		const prompt: CompactionPrompt = {
+			content: `Don't use: edit, write, bash, swarmmail_reserve, git commit`,
+		};
+		const result = scoreForbiddenToolsPresent(prompt);
+		// Should detect all 5 tools regardless of case
+		expect(result.score).toBe(1.0);
+		expect(result.message).toContain("All 5");
+	});
+	test("forbidden tools scorer detects UPPERCASE tool names", () => {
+		const prompt: CompactionPrompt = {
+			content: `Forbidden: EDIT, WRITE, BASH, swarmmail_reserve, git commit`,
+		};
+		const result = scoreForbiddenToolsPresent(prompt);
+		expect(result.score).toBe(1.0);
+		expect(result.message).toContain("All 5");
+	});
+	test("post-compaction discipline detects mixed-case first tools", () => {
+		const testCases = [
+			{ tool: "EDIT", shouldPass: false },
+			{ tool: "edit", shouldPass: false },
+			{ tool: "Edit", shouldPass: false },
+			{ tool: "WRITE", shouldPass: false },
+			{ tool: "write", shouldPass: false },
+			{ tool: "READ", shouldPass: false },
+			{ tool: "read", shouldPass: false },
+			{ tool: "swarm_status", shouldPass: true },
+			{ tool: "SWARM_STATUS", shouldPass: true },
+			{ tool: "swarmmail_inbox", shouldPass: true },
+		];
+		for (const { tool, shouldPass } of testCases) {
+			const prompt: CompactionPrompt = {
+				content: `1. ${tool}()`,
+			};
+			const result = scorePostCompactionDiscipline(prompt);
+			if (shouldPass) {
+				expect(result.score).toBe(1.0);
+			} else {
+				expect(result.score).toBe(0.0);
+			}
+		}
+	});
+});
 describe("epicIdSpecificity scorer", () => {
 	test("scores 1.0 for real epic IDs", () => {
 		const prompt: CompactionPrompt = {
@@ -173,16 +285,17 @@ describe("forbiddenToolsPresent scorer", () => {
 - Edit (use swarm_spawn_subtask)
 - Write (use swarm_spawn_subtask)
 - swarmmail_reserve (only workers reserve)
-- bash with git commit (workers commit)`,
+- git commit (workers commit)
+- bash (for file modifications)`,
 		};
 		const result = scoreForbiddenToolsPresent(prompt);
 		expect(result.score).toBe(1.0);
-		expect(result.message).toContain("All 4 forbidden tools");
+		expect(result.message).toContain("All 5 forbidden tools");
 	});
-	test("scores 0.75 when 3 out of 4 tools listed", () => {
+	test("scores 0.6 when 3 out of 5 tools listed", () => {
 		const prompt: CompactionPrompt = {
 			content: `🚫 FORBIDDEN TOOLS:
 - Edit
@@ -192,19 +305,19 @@ describe("forbiddenToolsPresent scorer", () => {
 		const result = scoreForbiddenToolsPresent(prompt);
-		expect(result.score).toBe(0.75);
-		expect(result.message).toContain("3/4");
+		expect(result.score).toBe(0.6);
+		expect(result.message).toContain("3/5");
 	});
-	test("scores 0.5 when 2 out of 4 tools listed", () => {
+	test("scores 0.4 when 2 out of 5 tools listed", () => {
 		const prompt: CompactionPrompt = {
 			content: `Don't use Edit or Write directly.`,
 		};
 		const result = scoreForbiddenToolsPresent(prompt);
-		expect(result.score).toBe(0.5);
-		expect(result.message).toContain("2/4");
+		expect(result.score).toBe(0.4);
+		expect(result.message).toContain("2/5");
 	});
 	test("scores 0.0 when no forbidden tools listed", () => {
@@ -215,7 +328,34 @@ describe("forbiddenToolsPresent scorer", () => {
 		const result = scoreForbiddenToolsPresent(prompt);
 		expect(result.score).toBe(0.0);
-		expect(result.message).toContain("0/4");
+		expect(result.message).toContain("0/5");
+	});
+	test("scores 1.0 with lowercase forbidden tools (case-insensitive)", () => {
+		const prompt: CompactionPrompt = {
+			content: `🚫 FORBIDDEN TOOLS - NEVER call these:
+- edit (use swarm_spawn_subtask)
+- write (use swarm_spawn_subtask)
+- swarmmail_reserve (only workers reserve)
+- git commit (workers commit)
+- bash (for file modifications)`,
+		};
+		const result = scoreForbiddenToolsPresent(prompt);
+		expect(result.score).toBe(1.0);
+		expect(result.message).toContain("All 5 forbidden tools");
+	});
+	test("scores correctly with mixed case forbidden tools", () => {
+		const prompt: CompactionPrompt = {
+			content: `Avoid: edit, Write, BASH`,
+		};
+		const result = scoreForbiddenToolsPresent(prompt);
+		expect(result.score).toBe(0.6);
+		expect(result.message).toContain("3/5");
 	});
 });
@@ -296,4 +436,40 @@ describe("postCompactionDiscipline scorer", () => {
 		expect(result.score).toBe(0.0);
 		expect(result.message).toContain("No tool");
 	});
+	test("scores 0.0 when first tool is lowercase 'read' (case-insensitive)", () => {
+		const prompt: CompactionPrompt = {
+			content: `1. read(file='src/index.ts')
+2. swarm_status()`,
+		};
+		const result = scorePostCompactionDiscipline(prompt);
+		expect(result.score).toBe(0.0);
+		expect(result.message).toContain("read");
+	});
+	test("scores 0.0 when first tool is lowercase 'edit'", () => {
+		const prompt: CompactionPrompt = {
+			content: `1. edit(file='src/auth.ts', ...)
+2. swarm_status()`,
+		};
+		const result = scorePostCompactionDiscipline(prompt);
+		expect(result.score).toBe(0.0);
+		expect(result.message).toContain("edit");
+	});
+	test("scores 0.0 when first tool is lowercase 'write'", () => {
+		const prompt: CompactionPrompt = {
+			content: `1. write(file='README.md', content='...')
+2. swarm_status()`,
+		};
+		const result = scorePostCompactionDiscipline(prompt);
+		expect(result.score).toBe(0.0);
+		expect(result.message).toContain("write");
+	});
 });

package/src/compaction-prompt-scoring.ts CHANGED Viewed

@@ -203,6 +203,7 @@ export function scoreCoordinatorIdentity(
  * 2. Write
  * 3. swarmmail_reserve (only workers reserve)
  * 4. git commit (workers commit)
+ * 5. bash (for file modifications)
  *
  * @returns ratio of forbidden tools mentioned (0.0 to 1.0)
  */
@@ -211,10 +212,11 @@ export function scoreForbiddenToolsPresent(
 ): ScorerResult {
 	// Check for forbidden tool mentions
 	const forbiddenTools = [
-		/\bEdit\b/,
-		/\bWrite\b/,
+		/\bEdit\b/i,
+		/\bWrite\b/i,
 		/swarmmail_reserve/,
 		/git commit/,
+		/\bbash\b/i,
 	];
 	const foundTools = forbiddenTools.filter((pattern) =>
@@ -226,20 +228,20 @@ export function scoreForbiddenToolsPresent(
 	if (score === 1.0) {
 		return {
 			score: 1.0,
-			message: "All 4 forbidden tools listed",
+			message: "All 5 forbidden tools listed",
 		};
 	}
 	if (score === 0) {
 		return {
 			score: 0.0,
-			message: "No forbidden tools listed (0/4)",
+			message: "No forbidden tools listed (0/5)",
 		};
 	}
 	return {
 		score,
-		message: `${foundTools.length}/4 forbidden tools listed`,
+		message: `${foundTools.length}/5 forbidden tools listed`,
 	};
 }

package/src/eval-runner.test.ts CHANGED Viewed

@@ -4,9 +4,11 @@
  * TDD: These tests MUST fail initially, then pass after implementation.
  */
-import { describe, test, expect, beforeAll } from "bun:test";
+import { describe, test, expect, beforeAll, afterEach } from "bun:test";
 import { runEvals } from "./eval-runner";
 import path from "node:path";
+import fs from "node:fs";
+import { getEvalHistoryPath } from "./eval-history";
 // Use project root for all tests
 const PROJECT_ROOT = path.resolve(import.meta.dir, "..");
@@ -93,4 +95,129 @@ describe("runEvals", () => {
     expect(result.totalSuites).toBe(0);
     expect(result.suites).toEqual([]);
   }, 10000);
+  test("records eval run to history after execution", async () => {
+    // Clean up any existing history before test
+    const historyPath = getEvalHistoryPath(PROJECT_ROOT);
+    const historyBackup = historyPath + ".backup";
+    // Backup existing history
+    if (fs.existsSync(historyPath)) {
+      fs.copyFileSync(historyPath, historyBackup);
+    }
+    try {
+      // Remove history file to get clean state
+      if (fs.existsSync(historyPath)) {
+        fs.unlinkSync(historyPath);
+      }
+      // Run evals
+      const result = await runEvals({
+        cwd: PROJECT_ROOT,
+        suiteFilter: "example",
+      });
+      // Should have succeeded
+      expect(result.success).toBe(true);
+      expect(result.suites.length).toBeGreaterThan(0);
+      // History file should have been created
+      expect(fs.existsSync(historyPath)).toBe(true);
+      // Read history file
+      const historyContent = fs.readFileSync(historyPath, "utf-8");
+      const lines = historyContent.trim().split("\n");
+      // Should have one line per suite
+      expect(lines.length).toBe(result.suites.length);
+      // Parse first line and verify structure
+      const firstRecord = JSON.parse(lines[0]);
+      // Verify structure has all required fields
+      expect(typeof firstRecord.timestamp).toBe("string");
+      expect(typeof firstRecord.eval_name).toBe("string");
+      expect(typeof firstRecord.score).toBe("number");
+      expect(typeof firstRecord.run_count).toBe("number");
+      // Verify eval_name matches suite name
+      expect(firstRecord.eval_name).toBe(result.suites[0].name);
+      // Verify score matches suite averageScore
+      expect(firstRecord.score).toBe(result.suites[0].averageScore);
+      // First run should have run_count = 1
+      expect(firstRecord.run_count).toBe(1);
+    } finally {
+      // Restore backup
+      if (fs.existsSync(historyBackup)) {
+        fs.copyFileSync(historyBackup, historyPath);
+        fs.unlinkSync(historyBackup);
+      }
+    }
+  }, 30000);
+  test("checks gates for each suite after recording", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example",
+    });
+    expect(result.success).toBe(true);
+    expect(result.gateResults).toBeDefined();
+    expect(Array.isArray(result.gateResults)).toBe(true);
+    // Should have gate result for each suite
+    expect(result.gateResults?.length).toBe(result.suites.length);
+    // Each gate result should have required fields
+    if (result.gateResults && result.gateResults.length > 0) {
+      const gateResult = result.gateResults[0];
+      expect(gateResult).toHaveProperty("suite");
+      expect(gateResult).toHaveProperty("passed");
+      expect(gateResult).toHaveProperty("phase");
+      expect(gateResult).toHaveProperty("message");
+      expect(gateResult).toHaveProperty("currentScore");
+    }
+  }, 30000);
+  test("calls learnFromEvalFailure when gate fails", async () => {
+    // This test requires manually creating a history with regression
+    // For now, we just verify the code path exists
+    // In practice, this would be tested with mocked checkGate returning failed=true
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example",
+    });
+    // Gate results should be present even if no failures
+    expect(result.gateResults).toBeDefined();
+  }, 30000);
+  test("does NOT call learnFromEvalFailure when gate passes", async () => {
+    // Similar to above - verifies the happy path
+    // Real test would mock checkGate and verify learnFromEvalFailure NOT called
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example",
+    });
+    // Should succeed with gate results
+    expect(result.success).toBe(true);
+    expect(result.gateResults).toBeDefined();
+  }, 30000);
+  test("includes gateResults in return value", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example",
+    });
+    // gateResults should be array (even if empty)
+    expect(result).toHaveProperty("gateResults");
+    expect(Array.isArray(result.gateResults)).toBe(true);
+  }, 30000);
 });

package/src/eval-runner.ts CHANGED Viewed

@@ -13,6 +13,10 @@ import { createInMemoryStorage } from "evalite/in-memory-storage";
 import type { Evalite } from "evalite/types";
 import fs from "node:fs/promises";
 import path from "node:path";
+import { recordEvalRun, getScoreHistory } from "./eval-history.js";
+import { checkGate } from "./eval-gates.js";
+import { learnFromEvalFailure } from "./eval-learning.js";
+import { getMemoryAdapter } from "./memory-tools.js";
 /**
  * Options for running evals programmatically
@@ -97,6 +101,17 @@ export interface RunEvalsResult {
   /** Error message if run failed */
   error?: string;
+  /** Gate check results per suite */
+  gateResults?: Array<{
+    suite: string;
+    passed: boolean;
+    phase: string;
+    message: string;
+    baseline?: number;
+    currentScore: number;
+    regressionPercent?: number;
+  }>;
 }
 /**
@@ -246,6 +261,36 @@ export async function runEvals(
       })),
     }));
+    // Record eval runs to history
+    for (const suite of suites) {
+      const history = getScoreHistory(projectRoot, suite.name);
+      recordEvalRun(projectRoot, {
+        timestamp: new Date().toISOString(),
+        eval_name: suite.name,
+        score: suite.averageScore,
+        run_count: history.length + 1,
+      });
+    }
+    // Check gates for each suite
+    const gateResults = [];
+    for (const suite of suites) {
+      const history = getScoreHistory(projectRoot, suite.name);
+      const gate = checkGate(projectRoot, suite.name, suite.averageScore);
+      gateResults.push({ suite: suite.name, ...gate });
+      // If gate failed, trigger learning
+      if (!gate.passed) {
+        try {
+          const memoryAdapter = await getMemoryAdapter();
+          await learnFromEvalFailure(suite.name, suite.averageScore, history, memoryAdapter);
+        } catch (e) {
+          // Learning is best-effort, don't fail the eval run
+          console.warn(`Failed to store learning for ${suite.name}:`, e);
+        }
+      }
+    }
     // Calculate overall metrics
     const totalEvals = suites.reduce((sum, s) => sum + s.evalCount, 0);
     const averageScore =
@@ -263,6 +308,7 @@ export async function runEvals(
       totalEvals,
       averageScore,
       suites,
+      gateResults,
     };
   } catch (error) {
     // Return error result

package/src/hive.ts CHANGED Viewed

@@ -741,42 +741,44 @@ export const hive_create_epic = tool({
       };
       // Emit DecompositionGeneratedEvent for learning system
-      if (args.project_key) {
-        try {
-          const event = createEvent("decomposition_generated", {
-            project_key: args.project_key,
-            epic_id: epic.id,
-            task: args.task || validated.epic_title,
-            context: validated.epic_description,
-            strategy: args.strategy || "feature-based",
-            epic_title: validated.epic_title,
-            subtasks: validated.subtasks.map((st) => ({
-              title: st.title,
-              files: st.files || [],
-              priority: st.priority,
-            })),
-            recovery_context: args.recovery_context,
-          });
-          await appendEvent(event, args.project_key);
-        } catch (error) {
-          // Non-fatal - log and continue
-          console.warn(
-            "[hive_create_epic] Failed to emit DecompositionGeneratedEvent:",
-            error,
-          );
-        }
+      // Always emit using projectKey (from getHiveWorkingDirectory), not args.project_key
+      // This fixes the bug where events weren't emitted when callers didn't pass project_key
+      const effectiveProjectKey = args.project_key || projectKey;
+      try {
+        const event = createEvent("decomposition_generated", {
+          project_key: effectiveProjectKey,
+          epic_id: epic.id,
+          task: args.task || validated.epic_title,
+          context: validated.epic_description,
+          strategy: args.strategy || "feature-based",
+          epic_title: validated.epic_title,
+          subtasks: validated.subtasks.map((st) => ({
+            title: st.title,
+            files: st.files || [],
+            priority: st.priority,
+          })),
+          recovery_context: args.recovery_context,
+        });
+        await appendEvent(event, effectiveProjectKey);
+      } catch (error) {
+        // Non-fatal - log and continue
+        console.warn(
+          "[hive_create_epic] Failed to emit DecompositionGeneratedEvent:",
+          error,
+        );
+      }
-        // Capture decomposition_complete event for eval scoring
-        try {
-          const { captureCoordinatorEvent } = await import("./eval-capture.js");
-          // Build files_per_subtask map (indexed by subtask index)
-          const filesPerSubtask: Record<number, string[]> = {};
-          validated.subtasks.forEach((subtask, index) => {
-            if (subtask.files && subtask.files.length > 0) {
-              filesPerSubtask[index] = subtask.files;
-            }
-          });
+      // Capture decomposition_complete event for eval scoring
+      try {
+        const { captureCoordinatorEvent } = await import("./eval-capture.js");
+        // Build files_per_subtask map (indexed by subtask index)
+        const filesPerSubtask: Record<number, string[]> = {};
+        validated.subtasks.forEach((subtask, index) => {
+          if (subtask.files && subtask.files.length > 0) {
+            filesPerSubtask[index] = subtask.files;
+          }
+        });
           captureCoordinatorEvent({
             session_id: ctx.sessionID || "unknown",
@@ -792,13 +794,12 @@ export const hive_create_epic = tool({
               task: args.task,
             },
           });
-        } catch (error) {
-          // Non-fatal - log and continue
-          console.warn(
-            "[hive_create_epic] Failed to capture decomposition_complete event:",
-            error,
-          );
-        }
+      } catch (error) {
+        // Non-fatal - log and continue
+        console.warn(
+          "[hive_create_epic] Failed to capture decomposition_complete event:",
+          error,
+        );
       }
       // Sync cells to JSONL so spawned workers can see them immediately