npm - opencode-swarm-plugin - Versions diffs - 0.44.0 → 0.44.1 - Mend

opencode-swarm-plugin 0.44.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (205) hide show

package/bin/swarm.serve.test.ts +6 -4
package/bin/swarm.ts +16 -10
package/dist/compaction-prompt-scoring.js +139 -0
package/dist/eval-capture.js +12811 -0
package/dist/hive.d.ts.map +1 -1
package/dist/index.js +7644 -62599
package/dist/plugin.js +23766 -78721
package/dist/swarm-orchestrate.d.ts.map +1 -1
package/dist/swarm-prompts.d.ts.map +1 -1
package/dist/swarm-review.d.ts.map +1 -1
package/package.json +17 -5
package/.changeset/swarm-insights-data-layer.md +0 -63
package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
package/.hive/analysis/session-data-quality-audit.md +0 -320
package/.hive/eval-results.json +0 -483
package/.hive/issues.jsonl +0 -138
package/.hive/memories.jsonl +0 -729
package/.opencode/eval-history.jsonl +0 -327
package/.turbo/turbo-build.log +0 -9
package/CHANGELOG.md +0 -2286
package/SCORER-ANALYSIS.md +0 -598
package/docs/analysis/subagent-coordination-patterns.md +0 -902
package/docs/analysis-socratic-planner-pattern.md +0 -504
package/docs/planning/ADR-001-monorepo-structure.md +0 -171
package/docs/planning/ADR-002-package-extraction.md +0 -393
package/docs/planning/ADR-003-performance-improvements.md +0 -451
package/docs/planning/ADR-004-message-queue-features.md +0 -187
package/docs/planning/ADR-005-devtools-observability.md +0 -202
package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
package/docs/planning/ROADMAP.md +0 -368
package/docs/semantic-memory-cli-syntax.md +0 -123
package/docs/swarm-mail-architecture.md +0 -1147
package/docs/testing/context-recovery-test.md +0 -470
package/evals/ARCHITECTURE.md +0 -1189
package/evals/README.md +0 -768
package/evals/compaction-prompt.eval.ts +0 -149
package/evals/compaction-resumption.eval.ts +0 -289
package/evals/coordinator-behavior.eval.ts +0 -307
package/evals/coordinator-session.eval.ts +0 -154
package/evals/evalite.config.ts.bak +0 -15
package/evals/example.eval.ts +0 -31
package/evals/fixtures/cass-baseline.ts +0 -217
package/evals/fixtures/compaction-cases.ts +0 -350
package/evals/fixtures/compaction-prompt-cases.ts +0 -311
package/evals/fixtures/coordinator-sessions.ts +0 -328
package/evals/fixtures/decomposition-cases.ts +0 -105
package/evals/lib/compaction-loader.test.ts +0 -248
package/evals/lib/compaction-loader.ts +0 -320
package/evals/lib/data-loader.evalite-test.ts +0 -289
package/evals/lib/data-loader.test.ts +0 -345
package/evals/lib/data-loader.ts +0 -281
package/evals/lib/llm.ts +0 -115
package/evals/scorers/compaction-prompt-scorers.ts +0 -145
package/evals/scorers/compaction-scorers.ts +0 -305
package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
package/evals/scorers/coordinator-discipline.ts +0 -325
package/evals/scorers/index.test.ts +0 -146
package/evals/scorers/index.ts +0 -328
package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
package/evals/scorers/outcome-scorers.ts +0 -349
package/evals/swarm-decomposition.eval.ts +0 -121
package/examples/commands/swarm.md +0 -745
package/examples/plugin-wrapper-template.ts +0 -2515
package/examples/skills/hive-workflow/SKILL.md +0 -212
package/examples/skills/skill-creator/SKILL.md +0 -223
package/examples/skills/swarm-coordination/SKILL.md +0 -292
package/global-skills/cli-builder/SKILL.md +0 -344
package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
package/global-skills/learning-systems/SKILL.md +0 -644
package/global-skills/skill-creator/LICENSE.txt +0 -202
package/global-skills/skill-creator/SKILL.md +0 -352
package/global-skills/skill-creator/references/output-patterns.md +0 -82
package/global-skills/skill-creator/references/workflows.md +0 -28
package/global-skills/swarm-coordination/SKILL.md +0 -995
package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
package/global-skills/swarm-coordination/references/strategies.md +0 -138
package/global-skills/system-design/SKILL.md +0 -213
package/global-skills/testing-patterns/SKILL.md +0 -430
package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
package/opencode-swarm-plugin-0.30.7.tgz +0 -0
package/opencode-swarm-plugin-0.31.0.tgz +0 -0
package/scripts/cleanup-test-memories.ts +0 -346
package/scripts/init-skill.ts +0 -222
package/scripts/migrate-unknown-sessions.ts +0 -349
package/scripts/validate-skill.ts +0 -204
package/src/agent-mail.ts +0 -1724
package/src/anti-patterns.test.ts +0 -1167
package/src/anti-patterns.ts +0 -448
package/src/compaction-capture.integration.test.ts +0 -257
package/src/compaction-hook.test.ts +0 -838
package/src/compaction-hook.ts +0 -1204
package/src/compaction-observability.integration.test.ts +0 -139
package/src/compaction-observability.test.ts +0 -187
package/src/compaction-observability.ts +0 -324
package/src/compaction-prompt-scorers.test.ts +0 -475
package/src/compaction-prompt-scoring.ts +0 -300
package/src/contributor-tools.test.ts +0 -133
package/src/contributor-tools.ts +0 -201
package/src/dashboard.test.ts +0 -611
package/src/dashboard.ts +0 -462
package/src/error-enrichment.test.ts +0 -403
package/src/error-enrichment.ts +0 -219
package/src/eval-capture.test.ts +0 -1015
package/src/eval-capture.ts +0 -929
package/src/eval-gates.test.ts +0 -306
package/src/eval-gates.ts +0 -218
package/src/eval-history.test.ts +0 -508
package/src/eval-history.ts +0 -214
package/src/eval-learning.test.ts +0 -378
package/src/eval-learning.ts +0 -360
package/src/eval-runner.test.ts +0 -223
package/src/eval-runner.ts +0 -402
package/src/export-tools.test.ts +0 -476
package/src/export-tools.ts +0 -257
package/src/hive.integration.test.ts +0 -2241
package/src/hive.ts +0 -1628
package/src/index.ts +0 -940
package/src/learning.integration.test.ts +0 -1815
package/src/learning.ts +0 -1079
package/src/logger.test.ts +0 -189
package/src/logger.ts +0 -135
package/src/mandate-promotion.test.ts +0 -473
package/src/mandate-promotion.ts +0 -239
package/src/mandate-storage.integration.test.ts +0 -601
package/src/mandate-storage.test.ts +0 -578
package/src/mandate-storage.ts +0 -794
package/src/mandates.ts +0 -540
package/src/memory-tools.test.ts +0 -195
package/src/memory-tools.ts +0 -344
package/src/memory.integration.test.ts +0 -334
package/src/memory.test.ts +0 -158
package/src/memory.ts +0 -527
package/src/model-selection.test.ts +0 -188
package/src/model-selection.ts +0 -68
package/src/observability-tools.test.ts +0 -359
package/src/observability-tools.ts +0 -871
package/src/output-guardrails.test.ts +0 -438
package/src/output-guardrails.ts +0 -381
package/src/pattern-maturity.test.ts +0 -1160
package/src/pattern-maturity.ts +0 -525
package/src/planning-guardrails.test.ts +0 -491
package/src/planning-guardrails.ts +0 -438
package/src/plugin.ts +0 -23
package/src/post-compaction-tracker.test.ts +0 -251
package/src/post-compaction-tracker.ts +0 -237
package/src/query-tools.test.ts +0 -636
package/src/query-tools.ts +0 -324
package/src/rate-limiter.integration.test.ts +0 -466
package/src/rate-limiter.ts +0 -774
package/src/replay-tools.test.ts +0 -496
package/src/replay-tools.ts +0 -240
package/src/repo-crawl.integration.test.ts +0 -441
package/src/repo-crawl.ts +0 -610
package/src/schemas/cell-events.test.ts +0 -347
package/src/schemas/cell-events.ts +0 -807
package/src/schemas/cell.ts +0 -257
package/src/schemas/evaluation.ts +0 -166
package/src/schemas/index.test.ts +0 -199
package/src/schemas/index.ts +0 -286
package/src/schemas/mandate.ts +0 -232
package/src/schemas/swarm-context.ts +0 -115
package/src/schemas/task.ts +0 -161
package/src/schemas/worker-handoff.test.ts +0 -302
package/src/schemas/worker-handoff.ts +0 -131
package/src/sessions/agent-discovery.test.ts +0 -137
package/src/sessions/agent-discovery.ts +0 -112
package/src/sessions/index.ts +0 -15
package/src/skills.integration.test.ts +0 -1192
package/src/skills.test.ts +0 -643
package/src/skills.ts +0 -1549
package/src/storage.integration.test.ts +0 -341
package/src/storage.ts +0 -884
package/src/structured.integration.test.ts +0 -817
package/src/structured.test.ts +0 -1046
package/src/structured.ts +0 -762
package/src/swarm-decompose.test.ts +0 -188
package/src/swarm-decompose.ts +0 -1302
package/src/swarm-deferred.integration.test.ts +0 -157
package/src/swarm-deferred.test.ts +0 -38
package/src/swarm-insights.test.ts +0 -214
package/src/swarm-insights.ts +0 -459
package/src/swarm-mail.integration.test.ts +0 -970
package/src/swarm-mail.ts +0 -739
package/src/swarm-orchestrate.integration.test.ts +0 -282
package/src/swarm-orchestrate.test.ts +0 -548
package/src/swarm-orchestrate.ts +0 -3084
package/src/swarm-prompts.test.ts +0 -1270
package/src/swarm-prompts.ts +0 -2077
package/src/swarm-research.integration.test.ts +0 -701
package/src/swarm-research.test.ts +0 -698
package/src/swarm-research.ts +0 -472
package/src/swarm-review.integration.test.ts +0 -285
package/src/swarm-review.test.ts +0 -879
package/src/swarm-review.ts +0 -709
package/src/swarm-strategies.ts +0 -407
package/src/swarm-worktree.test.ts +0 -501
package/src/swarm-worktree.ts +0 -575
package/src/swarm.integration.test.ts +0 -2377
package/src/swarm.ts +0 -38
package/src/tool-adapter.integration.test.ts +0 -1221
package/src/tool-availability.ts +0 -461
package/tsconfig.json +0 -28

package/evals/scorers/index.ts DELETED Viewed

@@ -1,328 +0,0 @@
-import { createScorer } from "evalite";
-import { generateText, gateway } from "ai";
-import type { GatewayModelId } from "ai";
-import type { CellTree } from "../../src/schemas/index.js";
-const JUDGE_MODEL: GatewayModelId = "anthropic/claude-haiku-4-5";
-/**
- * Custom scorers for evaluating swarm task decomposition quality
- */
-/**
- * Scores whether a decomposition keeps every file inside a single subtask.
- *
- * Independent subtasks are critical for parallel execution: a file claimed by
- * two subtasks causes merge conflicts and coordination overhead.
- *
- * A path that is repeated inside ONE subtask's `files` list is counted once,
- * so a duplicated entry is not misreported as a cross-subtask conflict.
- *
- * Score: 1.0 if no file is claimed by more than one subtask, 0.0 if any file
- * is shared or if `output` cannot be parsed as a CellTree.
- */
-export const subtaskIndependence = createScorer({
-  name: "Subtask Independence",
-  description: "Checks that no files appear in multiple subtasks",
-  scorer: ({ output }) => {
-    try {
-      const beadTree = JSON.parse(String(output)) as CellTree;
-      // file path -> number of DISTINCT subtasks that list it
-      const claimCounts = new Map<string, number>();
-      for (const subtask of beadTree.subtasks) {
-        // Dedupe within the subtask so a repeated path counts as one claim
-        for (const file of new Set(subtask.files ?? [])) {
-          claimCounts.set(file, (claimCounts.get(file) ?? 0) + 1);
-        }
-      }
-      // A file claimed by two or more subtasks is a conflict
-      const conflicts = Array.from(claimCounts.entries()).filter(
-        ([, count]) => count > 1,
-      );
-      if (conflicts.length > 0) {
-        return {
-          score: 0,
-          message: `File conflicts found: ${conflicts.map(([f]) => f).join(", ")}`,
-        };
-      }
-      return {
-        score: 1,
-        message: "No file conflicts - subtasks are independent",
-      };
-    } catch (error) {
-      // Malformed JSON or a tree without `subtasks` lands here
-      return {
-        score: 0,
-        message: `Failed to parse CellTree: ${error}`,
-      };
-    }
-  },
-});
-// ============================================================================
-// Outcome-based scorers
-// ============================================================================
-export {
-  executionSuccess,
-  timeBalance,
-  scopeAccuracy,
-  scopeDrift,
-  noRework,
-} from "./outcome-scorers.js";
-// ============================================================================
-// Compaction-specific scorers
-// ============================================================================
-export {
-  confidenceAccuracy,
-  contextInjectionCorrectness,
-  requiredPatternsPresent,
-  forbiddenPatternsAbsent,
-  compactionQuality,
-} from "./compaction-scorers.js";
-// ============================================================================
-// Coordinator discipline scorers
-// ============================================================================
-export {
-  violationCount,
-  spawnEfficiency,
-  reviewThoroughness,
-  timeToFirstSpawn,
-  overallDiscipline,
-} from "./coordinator-discipline.js";
-/**
- * Scores how completely the subtasks cover the task scope.
- *
- * Incomplete coverage means missing functionality, required follow-up work,
- * and a task that is not actually complete.
- *
- * Mode is chosen from `expected`:
- * - `expected.requiredFiles` is a non-empty array: the score is the fraction
- *   of those files that appear in at least one subtask (0.0 to 1.0).
- * - otherwise: the subtask count is checked against `expected.minSubtasks`
- *   (default 1) and `expected.maxSubtasks` (default 10): 0 when too few,
- *   0.5 when too many, 1 when within range.
- *
- * An empty `requiredFiles` array falls through to the count check instead of
- * producing a 0/0 (NaN) score, and an explicit bound of 0 is honored rather
- * than being replaced by the default.
- *
- * Returns score 0 when `output` cannot be parsed as a CellTree.
- */
-export const coverageCompleteness = createScorer({
-  name: "Coverage Completeness",
-  description: "Checks that subtasks cover the full task scope",
-  scorer: ({ output, expected }) => {
-    // Use `value` when it is a real number, otherwise the default bound
-    const numberOr = (value: unknown, fallback: number): number =>
-      typeof value === "number" && !Number.isNaN(value) ? value : fallback;
-    try {
-      const beadTree = JSON.parse(String(output)) as CellTree;
-      const expectedData = expected as Record<string, unknown> | undefined;
-      const rawRequired = expectedData?.requiredFiles;
-      const requiredFiles = Array.isArray(rawRequired)
-        ? (rawRequired as string[])
-        : [];
-      // If expected files specified, check coverage
-      if (requiredFiles.length > 0) {
-        const allFiles = new Set(
-          beadTree.subtasks.flatMap((st) => st.files ?? []),
-        );
-        const coveredFiles = requiredFiles.filter((f) => allFiles.has(f));
-        const coverage = coveredFiles.length / requiredFiles.length;
-        return {
-          score: coverage,
-          message: `${coveredFiles.length}/${requiredFiles.length} required files covered`,
-        };
-      }
-      // Otherwise, check min/max subtask count
-      const minSubtasks = numberOr(expectedData?.minSubtasks, 1);
-      const maxSubtasks = numberOr(expectedData?.maxSubtasks, 10);
-      const count = beadTree.subtasks.length;
-      if (count < minSubtasks) {
-        return {
-          score: 0,
-          message: `Too few subtasks: ${count} < ${minSubtasks}`,
-        };
-      }
-      if (count > maxSubtasks) {
-        // Over-decomposition is penalized less than under-decomposition
-        return {
-          score: 0.5,
-          message: `Too many subtasks: ${count} > ${maxSubtasks} (over-decomposed)`,
-        };
-      }
-      return {
-        score: 1,
-        message: `Good subtask count: ${count} (${minSubtasks}-${maxSubtasks})`,
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse CellTree: ${error}`,
-      };
-    }
-  },
-});
-/**
- * Heuristic score for how clear and actionable each subtask's instructions are.
- *
- * Vague instructions lead to agent confusion and blocking, incorrect
- * implementations, and extra coordinator intervention.
- *
- * Each subtask starts at 0.5 and earns:
- * - +0.2 for a description longer than 20 characters
- * - +0.2 for listing at least one file
- * - +0.1 for a title that is not just (or does not start with) a generic verb
- *
- * Score: the mean of the per-subtask values (each capped at 1.0); 0 when there
- * are no subtasks or `output` cannot be parsed as a CellTree.
- */
-export const instructionClarity = createScorer({
-  name: "Instruction Clarity",
-  description: "Checks that subtasks have clear, actionable instructions",
-  scorer: ({ output }) => {
-    try {
-      const tree = JSON.parse(String(output)) as CellTree;
-      const total = tree.subtasks.length;
-      if (total === 0) {
-        return { score: 0, message: "No subtasks found" };
-      }
-      // Titles that are one of these words, or begin with one, are "generic"
-      const vagueVerbs = ["update", "fix", "add", "change", "modify"];
-      let runningSum = 0;
-      for (const subtask of tree.subtasks) {
-        let points = 0.5; // baseline
-        const hasRealDescription =
-          subtask.description && subtask.description.length > 20;
-        if (hasRealDescription) {
-          points += 0.2;
-        }
-        const listsFiles = subtask.files && subtask.files.length > 0;
-        if (listsFiles) {
-          points += 0.2;
-        }
-        const normalizedTitle = subtask.title.toLowerCase();
-        const titleIsSpecific = !vagueVerbs.some(
-          (verb) =>
-            normalizedTitle === verb || normalizedTitle.startsWith(`${verb} `),
-        );
-        if (titleIsSpecific) {
-          points += 0.1;
-        }
-        runningSum += Math.min(1.0, points);
-      }
-      const avgScore = runningSum / total;
-      return {
-        score: avgScore,
-        message: `Average instruction clarity: ${(avgScore * 100).toFixed(0)}%`,
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse CellTree: ${error}`,
-      };
-    }
-  },
-});
-// ============================================================================
-// LLM-as-Judge Scorers
-// ============================================================================
-/**
- * LLM-as-judge scorer for decomposition coherence
- *
- * Sends the original task and the decomposition to the judge model
- * (JUDGE_MODEL) and asks for a 0-100 score on independence, scope,
- * completeness and clarity. This catches nuances that heuristics miss:
- * - Semantic dependencies between subtasks
- * - Scope that's too big or too trivial
- * - Missing pieces that would block completion
- *
- * The judge's reply is parsed defensively: a surrounding markdown code fence
- * (with or without a language tag) is removed, the score must be a finite
- * number and is clamped to 0-100, and missing or non-array `issues` /
- * `strengths` are treated as empty lists.
- *
- * Score: judge score / 100 (0.0 to 1.0). Any failure (network, invalid JSON,
- * unusable score) yields a neutral 0.5 so the eval run itself does not fail.
- *
- * Only use for decomposition evals - this is where it matters.
- */
-export const decompositionCoherence = createScorer({
-  name: "Decomposition Coherence (LLM Judge)",
-  description:
-    "LLM evaluates whether subtasks are truly independent and well-scoped",
-  scorer: async ({ output, input }) => {
-    // Coerce an untrusted JSON value into a list of strings
-    const toStrings = (value: unknown): string[] =>
-      Array.isArray(value) ? value.map((item) => String(item)) : [];
-    try {
-      const decomposition =
-        typeof output === "string" ? output : JSON.stringify(output, null, 2);
-      // Get original task from input if available
-      const originalTask =
-        typeof input === "object" && input !== null && "task" in input
-          ? String((input as { task: string }).task)
-          : "Unknown task";
-      const { text } = await generateText({
-        model: gateway(JUDGE_MODEL),
-        prompt: `You are evaluating a task decomposition for parallel agent execution.
-ORIGINAL TASK:
-${originalTask}
-DECOMPOSITION:
-${decomposition}
-Evaluate on these criteria (be harsh - bad decompositions waste expensive parallel work):
-1. INDEPENDENCE (25%): Can subtasks truly run in parallel? Look for:
-   - Shared state dependencies (one writes, another reads)
-   - Ordering requirements hidden in the task descriptions
-   - Shared files that will cause merge conflicts
-2. SCOPE (25%): Is each subtask right-sized?
-   - Too big: Should be split further (>2 hours of work)
-   - Too small: Trivial tasks that waste agent spawn overhead
-   - Goldilocks: 30min-2hr of focused work
-3. COMPLETENESS (25%): Does the sum equal the whole?
-   - Missing pieces that would leave the task incomplete
-   - Gaps between subtasks (who handles X?)
-   - Implicit work not captured in any subtask
-4. CLARITY (25%): Would an agent know what to do?
-   - Vague descriptions that invite interpretation
-   - Missing context needed to start work
-   - Ambiguous boundaries between subtasks
-Return ONLY valid JSON (no markdown, no explanation):
-{"score": <0-100>, "issues": ["issue1", "issue2"], "strengths": ["strength1"]}`,
-        maxOutputTokens: 512,
-      });
-      // Strip an optional markdown fence: opening ``` plus any language tag,
-      // and the closing ``` at the very end
-      const jsonText = text
-        .trim()
-        .replace(/^```[a-zA-Z]*\s*/, "")
-        .replace(/\s*```$/, "");
-      const parsed: unknown = JSON.parse(jsonText);
-      if (typeof parsed !== "object" || parsed === null) {
-        throw new Error("Judge response is not a JSON object");
-      }
-      const result = parsed as {
-        score?: unknown;
-        issues?: unknown;
-        strengths?: unknown;
-      };
-      if (typeof result.score !== "number" || !Number.isFinite(result.score)) {
-        throw new Error("Judge response has no numeric score");
-      }
-      const issues = toStrings(result.issues);
-      const strengths = toStrings(result.strengths);
-      const issueText = issues.length > 0 ? issues.join("; ") : "No issues";
-      const strengthText =
-        strengths.length > 0 ? ` | Strengths: ${strengths.join("; ")}` : "";
-      // Keep the normalized score inside the 0-1 range scorers must return
-      const normalized = Math.min(1, Math.max(0, result.score / 100));
-      return {
-        score: normalized,
-        message: `${issueText}${strengthText}`,
-      };
-    } catch (error) {
-      // Don't fail the eval if judge fails - return neutral score
-      return {
-        score: 0.5,
-        message: `LLM judge error: ${error instanceof Error ? error.message : String(error)}`,
-      };
-    }
-  },
-});

package/evals/scorers/outcome-scorers.evalite-test.ts DELETED Viewed

@@ -1,27 +0,0 @@
-/**
- * Outcome-based Scorers Tests
- *
- * Tests the 5 new outcome-based scorers by verifying their exports.
- * Full functional testing happens via Evalite integration.
- */
-import { describe, it, expect } from "bun:test";
-describe("Outcome Scorers", () => {
-  // Every scorer that must be reachable from both the source module and the
-  // scorers index; order matches the export list.
-  const scorerNames = [
-    "executionSuccess",
-    "timeBalance",
-    "scopeAccuracy",
-    "scopeDrift",
-    "noRework",
-  ] as const;
-  it("exports all 5 outcome scorers from outcome-scorers.ts", async () => {
-    const scorers = await import("./outcome-scorers.js");
-    for (const scorerName of scorerNames) {
-      expect(scorers[scorerName]).toBeDefined();
-    }
-  });
-  it("re-exports all 5 outcome scorers from index.ts", async () => {
-    const barrel = await import("./index.js");
-    for (const scorerName of scorerNames) {
-      expect(barrel[scorerName]).toBeDefined();
-    }
-  });
-});

package/evals/scorers/outcome-scorers.ts DELETED Viewed

@@ -1,349 +0,0 @@
-import { createScorer } from "evalite";
-import type { EvalRecord } from "../../src/eval-capture.js";
-/**
- * Outcome-based scorers for evaluating decomposition quality
- *
- * These scorers evaluate based on ACTUAL execution outcomes,
- * not just the structure of the decomposition.
- *
- * Requires EvalRecord with outcomes populated.
- */
-/**
- * Execution Success Scorer
- *
- * Answers the bottom-line question: did every subtask of the decomposition
- * finish successfully?
- *
- * Score: 1.0 when every entry in `outcomes` has a truthy `success`;
- *        0.0 when any entry failed, when there are no outcomes, or when
- *        `output` cannot be parsed as an EvalRecord
- */
-export const executionSuccess = createScorer({
-  name: "Execution Success",
-  description: "All subtasks completed successfully without errors",
-  scorer: ({ output }) => {
-    try {
-      const record = JSON.parse(String(output)) as EvalRecord;
-      const outcomes = record.outcomes;
-      // Nothing to judge without outcome data
-      if (!outcomes || outcomes.length === 0) {
-        return { score: 0, message: "No outcome data available" };
-      }
-      // Collect the failed subtasks; an empty list means full success
-      const failed = outcomes.filter((outcome) => !outcome.success);
-      if (failed.length === 0) {
-        return {
-          score: 1,
-          message: `All ${outcomes.length} subtasks succeeded`,
-        };
-      }
-      // Name each failure by title, falling back to its bead id
-      const failedNames = failed
-        .map((outcome) => outcome.title || outcome.bead_id)
-        .join(", ");
-      return {
-        score: 0,
-        message: `${failed.length}/${outcomes.length} subtasks failed: ${failedNames}`,
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse EvalRecord: ${error}`,
-      };
-    }
-  },
-});
-/**
- * Time Balance Scorer
- *
- * Compares the longest and shortest subtask durations. Uneven durations mean
- * some agents sit idle while the slowest subtask becomes the bottleneck.
- * Durations that are not greater than zero are ignored.
- *
- * Score: 1.0 if max/min ratio < 2.0 (well balanced)
- *        0.5 if ratio < 4.0 (moderately balanced)
- *        0.0 if ratio >= 4.0 (poorly balanced)
- * A record with a single outcome scores 1.0; a record with no outcomes, no
- * positive durations, or unparseable `output` scores 0.0.
- */
-export const timeBalance = createScorer({
-  name: "Time Balance",
-  description: "Work is evenly distributed across subtasks (max/min duration)",
-  scorer: ({ output }) => {
-    try {
-      const record = JSON.parse(String(output)) as EvalRecord;
-      const outcomes = record.outcomes;
-      if (!outcomes || outcomes.length === 0) {
-        return { score: 0, message: "No outcome data available" };
-      }
-      // Balance is only meaningful across two or more subtasks
-      if (outcomes.length < 2) {
-        return { score: 1, message: "Only one subtask - perfect balance" };
-      }
-      // Keep only durations that were actually recorded (> 0)
-      const recorded: number[] = [];
-      for (const { duration_ms } of outcomes) {
-        if (duration_ms > 0) {
-          recorded.push(duration_ms);
-        }
-      }
-      if (recorded.length === 0) {
-        return { score: 0, message: "No duration data available" };
-      }
-      const slowest = Math.max(...recorded);
-      const fastest = Math.min(...recorded);
-      const ratio = slowest / fastest;
-      // Map the ratio onto the three score bands
-      const [score, assessment]: [number, string] =
-        ratio < 2.0
-          ? [1.0, "well balanced"]
-          : ratio < 4.0
-            ? [0.5, "moderately balanced"]
-            : [0.0, "poorly balanced"];
-      const slowestSec = Math.round(slowest / 1000);
-      const fastestSec = Math.round(fastest / 1000);
-      return {
-        score,
-        message: `Ratio ${ratio.toFixed(1)}x (${slowestSec}s / ${fastestSec}s) - ${assessment}`,
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse EvalRecord: ${error}`,
-      };
-    }
-  },
-});
-/**
- * Scope Accuracy Scorer
- *
- * Measures how many of the files the plan predicted were actually touched.
- * Files touched outside the plan do not lower this score.
- *
- * Score: (planned files that were also touched) / (planned files), summed
- *        over all subtasks
- *        1.0 = every planned file was touched
- *        0.5 = half the planned files were touched
- *        0.0 = none were touched, nothing was planned, there are no
- *              outcomes, or `output` cannot be parsed as an EvalRecord
- */
-export const scopeAccuracy = createScorer({
-  name: "Scope Accuracy",
-  description:
-    "Planned files match actual files touched (accuracy of scope prediction)",
-  scorer: ({ output }) => {
-    try {
-      const record = JSON.parse(String(output)) as EvalRecord;
-      if (!record.outcomes || record.outcomes.length === 0) {
-        return { score: 0, message: "No outcome data available" };
-      }
-      // Totals across every subtask
-      let plannedTotal = 0;
-      let touchedTotal = 0;
-      record.outcomes.forEach(({ planned_files, actual_files }) => {
-        const plannedSet = new Set(planned_files);
-        const actualSet = new Set(actual_files);
-        plannedTotal += plannedSet.size;
-        // Count planned files that also show up in the actual set
-        plannedSet.forEach((file) => {
-          if (actualSet.has(file)) {
-            touchedTotal += 1;
-          }
-        });
-      });
-      if (plannedTotal === 0) {
-        return { score: 0, message: "No planned files to measure against" };
-      }
-      const accuracy = touchedTotal / plannedTotal;
-      const percent = (accuracy * 100).toFixed(0);
-      return {
-        score: accuracy,
-        message: `${touchedTotal}/${plannedTotal} planned files touched (${percent}% accuracy)`,
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse EvalRecord: ${error}`,
-      };
-    }
-  },
-});
-/**
- * Scope Drift Scorer
- *
- * Penalizes touched files that were not part of the subtask's planned scope.
- * Drift points to poor planning or dependencies nobody anticipated.
- *
- * Score: 1.0 when every touched file was planned (or no files were touched)
- *        falls linearly as the unplanned share grows
- *        0.0 once half or more of the touched files were unplanned, when
- *        there are no outcomes, or when `output` cannot be parsed
- */
-export const scopeDrift = createScorer({
-  name: "Scope Drift",
-  description:
-    "Agents stayed within their planned file scope (no unexpected files)",
-  scorer: ({ output }) => {
-    try {
-      const record = JSON.parse(String(output)) as EvalRecord;
-      if (!record.outcomes || record.outcomes.length === 0) {
-        return { score: 0, message: "No outcome data available" };
-      }
-      // Totals across every subtask
-      let touchedTotal = 0;
-      let unplannedTotal = 0;
-      record.outcomes.forEach(({ planned_files, actual_files }) => {
-        const plannedSet = new Set(planned_files);
-        const actualSet = new Set(actual_files);
-        touchedTotal += actualSet.size;
-        // Touched files that the plan never mentioned
-        actualSet.forEach((file) => {
-          if (!plannedSet.has(file)) {
-            unplannedTotal += 1;
-          }
-        });
-      });
-      if (touchedTotal === 0) {
-        return { score: 1, message: "No files touched" };
-      }
-      const driftRatio = unplannedTotal / touchedTotal;
-      // Doubling the ratio makes the score hit zero at 50% drift
-      const score = Math.max(0, 1.0 - driftRatio * 2);
-      const driftPct = (driftRatio * 100).toFixed(0);
-      return {
-        score,
-        message: `${unplannedTotal}/${touchedTotal} files were unplanned (${driftPct}% drift)`,
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse EvalRecord: ${error}`,
-      };
-    }
-  },
-});
-/**
- * No Rework Scorer
- *
- * Flags any subtask whose touched files overlap the files planned for a
- * different subtask. Such overlap suggests the decomposition missed a
- * dependency or split the work badly.
- *
- * Score: 1.0 when no subtask touched another subtask's planned files
- *        0.0 when any overlap is found, when there are no outcomes, or
- *        when `output` cannot be parsed as an EvalRecord
- */
-export const noRework = createScorer({
-  name: "No Rework",
-  description: "No subtask touched files assigned to another subtask",
-  scorer: ({ output }) => {
-    try {
-      const record = JSON.parse(String(output)) as EvalRecord;
-      if (!record.outcomes || record.outcomes.length === 0) {
-        return { score: 0, message: "No outcome data available" };
-      }
-      // bead id -> set of files planned for that subtask
-      const plannedByBead = new Map<string, Set<string>>();
-      record.outcomes.forEach((outcome) => {
-        plannedByBead.set(outcome.bead_id, new Set(outcome.planned_files));
-      });
-      // One entry per (subtask, other subtask) pair that overlaps
-      const violations: string[] = [];
-      for (const outcome of record.outcomes) {
-        const touched = Array.from(new Set(outcome.actual_files));
-        for (const [ownerId, ownerPlanned] of plannedByBead) {
-          // A subtask touching its own planned files is not rework
-          if (ownerId !== outcome.bead_id) {
-            const sharedCount = touched.filter((file) =>
-              ownerPlanned.has(file),
-            ).length;
-            if (sharedCount > 0) {
-              violations.push(
-                `${outcome.title || outcome.bead_id} touched ${sharedCount} file(s) from ${ownerId}`,
-              );
-            }
-          }
-        }
-      }
-      if (violations.length > 0) {
-        return {
-          score: 0,
-          message: `Rework detected: ${violations.join("; ")}`,
-        };
-      }
-      return {
-        score: 1,
-        message: "No rework - all subtasks stayed in their lanes",
-      };
-    } catch (error) {
-      return {
-        score: 0,
-        message: `Failed to parse EvalRecord: ${error}`,
-      };
-    }
-  },
-});