opencode-swarm-plugin 0.43.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (208)
  1. package/bin/cass.characterization.test.ts +422 -0
  2. package/bin/swarm.serve.test.ts +6 -4
  3. package/bin/swarm.test.ts +68 -0
  4. package/bin/swarm.ts +81 -8
  5. package/dist/compaction-prompt-scoring.js +139 -0
  6. package/dist/contributor-tools.d.ts +42 -0
  7. package/dist/contributor-tools.d.ts.map +1 -0
  8. package/dist/eval-capture.js +12811 -0
  9. package/dist/hive.d.ts.map +1 -1
  10. package/dist/index.d.ts +12 -0
  11. package/dist/index.d.ts.map +1 -1
  12. package/dist/index.js +7728 -62590
  13. package/dist/plugin.js +23833 -78695
  14. package/dist/sessions/agent-discovery.d.ts +59 -0
  15. package/dist/sessions/agent-discovery.d.ts.map +1 -0
  16. package/dist/sessions/index.d.ts +10 -0
  17. package/dist/sessions/index.d.ts.map +1 -0
  18. package/dist/swarm-orchestrate.d.ts.map +1 -1
  19. package/dist/swarm-prompts.d.ts.map +1 -1
  20. package/dist/swarm-review.d.ts.map +1 -1
  21. package/package.json +17 -5
  22. package/.changeset/swarm-insights-data-layer.md +0 -63
  23. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
  24. package/.hive/analysis/session-data-quality-audit.md +0 -320
  25. package/.hive/eval-results.json +0 -483
  26. package/.hive/issues.jsonl +0 -138
  27. package/.hive/memories.jsonl +0 -729
  28. package/.opencode/eval-history.jsonl +0 -327
  29. package/.turbo/turbo-build.log +0 -9
  30. package/CHANGELOG.md +0 -2255
  31. package/SCORER-ANALYSIS.md +0 -598
  32. package/docs/analysis/subagent-coordination-patterns.md +0 -902
  33. package/docs/analysis-socratic-planner-pattern.md +0 -504
  34. package/docs/planning/ADR-001-monorepo-structure.md +0 -171
  35. package/docs/planning/ADR-002-package-extraction.md +0 -393
  36. package/docs/planning/ADR-003-performance-improvements.md +0 -451
  37. package/docs/planning/ADR-004-message-queue-features.md +0 -187
  38. package/docs/planning/ADR-005-devtools-observability.md +0 -202
  39. package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
  40. package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
  41. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
  42. package/docs/planning/ROADMAP.md +0 -368
  43. package/docs/semantic-memory-cli-syntax.md +0 -123
  44. package/docs/swarm-mail-architecture.md +0 -1147
  45. package/docs/testing/context-recovery-test.md +0 -470
  46. package/evals/ARCHITECTURE.md +0 -1189
  47. package/evals/README.md +0 -768
  48. package/evals/compaction-prompt.eval.ts +0 -149
  49. package/evals/compaction-resumption.eval.ts +0 -289
  50. package/evals/coordinator-behavior.eval.ts +0 -307
  51. package/evals/coordinator-session.eval.ts +0 -154
  52. package/evals/evalite.config.ts.bak +0 -15
  53. package/evals/example.eval.ts +0 -31
  54. package/evals/fixtures/compaction-cases.ts +0 -350
  55. package/evals/fixtures/compaction-prompt-cases.ts +0 -311
  56. package/evals/fixtures/coordinator-sessions.ts +0 -328
  57. package/evals/fixtures/decomposition-cases.ts +0 -105
  58. package/evals/lib/compaction-loader.test.ts +0 -248
  59. package/evals/lib/compaction-loader.ts +0 -320
  60. package/evals/lib/data-loader.evalite-test.ts +0 -289
  61. package/evals/lib/data-loader.test.ts +0 -345
  62. package/evals/lib/data-loader.ts +0 -281
  63. package/evals/lib/llm.ts +0 -115
  64. package/evals/scorers/compaction-prompt-scorers.ts +0 -145
  65. package/evals/scorers/compaction-scorers.ts +0 -305
  66. package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
  67. package/evals/scorers/coordinator-discipline.ts +0 -325
  68. package/evals/scorers/index.test.ts +0 -146
  69. package/evals/scorers/index.ts +0 -328
  70. package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
  71. package/evals/scorers/outcome-scorers.ts +0 -349
  72. package/evals/swarm-decomposition.eval.ts +0 -121
  73. package/examples/commands/swarm.md +0 -745
  74. package/examples/plugin-wrapper-template.ts +0 -2426
  75. package/examples/skills/hive-workflow/SKILL.md +0 -212
  76. package/examples/skills/skill-creator/SKILL.md +0 -223
  77. package/examples/skills/swarm-coordination/SKILL.md +0 -292
  78. package/global-skills/cli-builder/SKILL.md +0 -344
  79. package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
  80. package/global-skills/learning-systems/SKILL.md +0 -644
  81. package/global-skills/skill-creator/LICENSE.txt +0 -202
  82. package/global-skills/skill-creator/SKILL.md +0 -352
  83. package/global-skills/skill-creator/references/output-patterns.md +0 -82
  84. package/global-skills/skill-creator/references/workflows.md +0 -28
  85. package/global-skills/swarm-coordination/SKILL.md +0 -995
  86. package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
  87. package/global-skills/swarm-coordination/references/strategies.md +0 -138
  88. package/global-skills/system-design/SKILL.md +0 -213
  89. package/global-skills/testing-patterns/SKILL.md +0 -430
  90. package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
  91. package/opencode-swarm-plugin-0.30.7.tgz +0 -0
  92. package/opencode-swarm-plugin-0.31.0.tgz +0 -0
  93. package/scripts/cleanup-test-memories.ts +0 -346
  94. package/scripts/init-skill.ts +0 -222
  95. package/scripts/migrate-unknown-sessions.ts +0 -349
  96. package/scripts/validate-skill.ts +0 -204
  97. package/src/agent-mail.ts +0 -1724
  98. package/src/anti-patterns.test.ts +0 -1167
  99. package/src/anti-patterns.ts +0 -448
  100. package/src/compaction-capture.integration.test.ts +0 -257
  101. package/src/compaction-hook.test.ts +0 -838
  102. package/src/compaction-hook.ts +0 -1204
  103. package/src/compaction-observability.integration.test.ts +0 -139
  104. package/src/compaction-observability.test.ts +0 -187
  105. package/src/compaction-observability.ts +0 -324
  106. package/src/compaction-prompt-scorers.test.ts +0 -475
  107. package/src/compaction-prompt-scoring.ts +0 -300
  108. package/src/dashboard.test.ts +0 -611
  109. package/src/dashboard.ts +0 -462
  110. package/src/error-enrichment.test.ts +0 -403
  111. package/src/error-enrichment.ts +0 -219
  112. package/src/eval-capture.test.ts +0 -1015
  113. package/src/eval-capture.ts +0 -929
  114. package/src/eval-gates.test.ts +0 -306
  115. package/src/eval-gates.ts +0 -218
  116. package/src/eval-history.test.ts +0 -508
  117. package/src/eval-history.ts +0 -214
  118. package/src/eval-learning.test.ts +0 -378
  119. package/src/eval-learning.ts +0 -360
  120. package/src/eval-runner.test.ts +0 -223
  121. package/src/eval-runner.ts +0 -402
  122. package/src/export-tools.test.ts +0 -476
  123. package/src/export-tools.ts +0 -257
  124. package/src/hive.integration.test.ts +0 -2241
  125. package/src/hive.ts +0 -1628
  126. package/src/index.ts +0 -935
  127. package/src/learning.integration.test.ts +0 -1815
  128. package/src/learning.ts +0 -1079
  129. package/src/logger.test.ts +0 -189
  130. package/src/logger.ts +0 -135
  131. package/src/mandate-promotion.test.ts +0 -473
  132. package/src/mandate-promotion.ts +0 -239
  133. package/src/mandate-storage.integration.test.ts +0 -601
  134. package/src/mandate-storage.test.ts +0 -578
  135. package/src/mandate-storage.ts +0 -794
  136. package/src/mandates.ts +0 -540
  137. package/src/memory-tools.test.ts +0 -195
  138. package/src/memory-tools.ts +0 -344
  139. package/src/memory.integration.test.ts +0 -334
  140. package/src/memory.test.ts +0 -158
  141. package/src/memory.ts +0 -527
  142. package/src/model-selection.test.ts +0 -188
  143. package/src/model-selection.ts +0 -68
  144. package/src/observability-tools.test.ts +0 -359
  145. package/src/observability-tools.ts +0 -871
  146. package/src/output-guardrails.test.ts +0 -438
  147. package/src/output-guardrails.ts +0 -381
  148. package/src/pattern-maturity.test.ts +0 -1160
  149. package/src/pattern-maturity.ts +0 -525
  150. package/src/planning-guardrails.test.ts +0 -491
  151. package/src/planning-guardrails.ts +0 -438
  152. package/src/plugin.ts +0 -23
  153. package/src/post-compaction-tracker.test.ts +0 -251
  154. package/src/post-compaction-tracker.ts +0 -237
  155. package/src/query-tools.test.ts +0 -636
  156. package/src/query-tools.ts +0 -324
  157. package/src/rate-limiter.integration.test.ts +0 -466
  158. package/src/rate-limiter.ts +0 -774
  159. package/src/replay-tools.test.ts +0 -496
  160. package/src/replay-tools.ts +0 -240
  161. package/src/repo-crawl.integration.test.ts +0 -441
  162. package/src/repo-crawl.ts +0 -610
  163. package/src/schemas/cell-events.test.ts +0 -347
  164. package/src/schemas/cell-events.ts +0 -807
  165. package/src/schemas/cell.ts +0 -257
  166. package/src/schemas/evaluation.ts +0 -166
  167. package/src/schemas/index.test.ts +0 -199
  168. package/src/schemas/index.ts +0 -286
  169. package/src/schemas/mandate.ts +0 -232
  170. package/src/schemas/swarm-context.ts +0 -115
  171. package/src/schemas/task.ts +0 -161
  172. package/src/schemas/worker-handoff.test.ts +0 -302
  173. package/src/schemas/worker-handoff.ts +0 -131
  174. package/src/skills.integration.test.ts +0 -1192
  175. package/src/skills.test.ts +0 -643
  176. package/src/skills.ts +0 -1549
  177. package/src/storage.integration.test.ts +0 -341
  178. package/src/storage.ts +0 -884
  179. package/src/structured.integration.test.ts +0 -817
  180. package/src/structured.test.ts +0 -1046
  181. package/src/structured.ts +0 -762
  182. package/src/swarm-decompose.test.ts +0 -188
  183. package/src/swarm-decompose.ts +0 -1302
  184. package/src/swarm-deferred.integration.test.ts +0 -157
  185. package/src/swarm-deferred.test.ts +0 -38
  186. package/src/swarm-insights.test.ts +0 -214
  187. package/src/swarm-insights.ts +0 -459
  188. package/src/swarm-mail.integration.test.ts +0 -970
  189. package/src/swarm-mail.ts +0 -739
  190. package/src/swarm-orchestrate.integration.test.ts +0 -282
  191. package/src/swarm-orchestrate.test.ts +0 -548
  192. package/src/swarm-orchestrate.ts +0 -3084
  193. package/src/swarm-prompts.test.ts +0 -1270
  194. package/src/swarm-prompts.ts +0 -2077
  195. package/src/swarm-research.integration.test.ts +0 -701
  196. package/src/swarm-research.test.ts +0 -698
  197. package/src/swarm-research.ts +0 -472
  198. package/src/swarm-review.integration.test.ts +0 -285
  199. package/src/swarm-review.test.ts +0 -879
  200. package/src/swarm-review.ts +0 -709
  201. package/src/swarm-strategies.ts +0 -407
  202. package/src/swarm-worktree.test.ts +0 -501
  203. package/src/swarm-worktree.ts +0 -575
  204. package/src/swarm.integration.test.ts +0 -2377
  205. package/src/swarm.ts +0 -38
  206. package/src/tool-adapter.integration.test.ts +0 -1221
  207. package/src/tool-availability.ts +0 -461
  208. package/tsconfig.json +0 -28
package/evals/lib/llm.ts DELETED
@@ -1,115 +0,0 @@
1
- /**
2
- * LLM Client for Evalite Evals
3
- *
4
- * Uses AI SDK v6 with Vercel AI Gateway.
5
- * Gateway handles provider routing - just pass "provider/model" string.
6
- *
7
- * @module evals/lib/llm
8
- */
9
- import { generateText, gateway } from "ai";
10
- import type { GatewayModelId } from "ai";
11
-
12
- /**
13
- * Default model for decomposition evals
14
- * Using Claude Sonnet for good balance of quality and cost
15
- */
16
- export const DEFAULT_MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";
17
-
18
- /**
19
- * Generate a decomposition from a task description
20
- *
21
- * @param prompt - The full decomposition prompt
22
- * @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
23
- * @returns The raw text response from the LLM
24
- */
25
- export async function generateDecomposition(
26
- prompt: string,
27
- model: GatewayModelId = DEFAULT_MODEL,
28
- ): Promise<string> {
29
- const { text } = await generateText({
30
- model: gateway(model),
31
- prompt,
32
- maxOutputTokens: 4096,
33
- });
34
-
35
- return text;
36
- }
37
-
38
- /**
39
- * Format a decomposition prompt from task and context
40
- *
41
- * Uses the same prompt template as swarm_plan_prompt
42
- */
43
- export function formatDecompositionPrompt(
44
- task: string,
45
- context?: string,
46
- maxSubtasks: number = 6,
47
- ): string {
48
- const contextSection = context ? `## Context\n${context}` : "";
49
-
50
- return `You are decomposing a task into parallelizable subtasks for a swarm of agents.
51
-
52
- ## Task
53
- ${task}
54
-
55
- ${contextSection}
56
-
57
- ## Requirements
58
-
59
- 1. **Break into 2-${maxSubtasks} independent subtasks** that can run in parallel
60
- 2. **Assign files** - each subtask must specify which files it will modify
61
- 3. **No file overlap** - files cannot appear in multiple subtasks (they get exclusive locks)
62
- 4. **Order by dependency** - if subtask B needs subtask A's output, A must come first in the array
63
- 5. **Estimate complexity** - 1 (trivial) to 5 (complex)
64
-
65
- ## Response Format
66
-
67
- Respond with ONLY a JSON object matching this schema (no markdown, no explanation):
68
-
69
- {
70
- "epic": {
71
- "title": "string",
72
- "description": "string"
73
- },
74
- "subtasks": [
75
- {
76
- "title": "string",
77
- "description": "string",
78
- "files": ["string"],
79
- "dependencies": [0],
80
- "estimated_complexity": 1
81
- }
82
- ]
83
- }
84
-
85
- ## Guidelines
86
-
87
- - **Plan aggressively** - when in doubt, split further
88
- - **Prefer smaller, focused subtasks** over large complex ones
89
- - **Include test files** in the same subtask as the code they test
90
- - **Be specific about files** - use actual file paths, not placeholders
91
-
92
- Now decompose the task. Respond with JSON only:`;
93
- }
94
-
95
- /**
96
- * Extract JSON from LLM response
97
- *
98
- * Handles responses that may have markdown code blocks or extra text
99
- */
100
- export function extractJson(text: string): string {
101
- // Try to find JSON in code blocks first
102
- const codeBlockMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/);
103
- if (codeBlockMatch) {
104
- return codeBlockMatch[1].trim();
105
- }
106
-
107
- // Try to find raw JSON object
108
- const jsonMatch = text.match(/\{[\s\S]*\}/);
109
- if (jsonMatch) {
110
- return jsonMatch[0];
111
- }
112
-
113
- // Return as-is if no JSON found
114
- return text;
115
- }
package/evals/scorers/compaction-prompt-scorers.ts DELETED
@@ -1,145 +0,0 @@
1
- /**
2
- * Compaction Prompt Quality Scorers - Evalite Wrappers
3
- *
4
- * These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
5
- * for use with evalite's test runner.
6
- *
7
- * Weighted scoring:
8
- * - epicIdSpecificity (0.20) - real IDs not placeholders
9
- * - actionability (0.20) - swarm_status/inbox with real values
10
- * - coordinatorIdentity (0.25) - ASCII header + strong mandates
11
- * - forbiddenToolsPresent (0.15) - lists forbidden tools by name
12
- * - postCompactionDiscipline (0.20) - first tool correct, no edit/write
13
- */
14
-
15
- import { createScorer } from "evalite";
16
- import type { CompactionPrompt } from "../../src/compaction-prompt-scoring.js";
17
- import {
18
- scoreActionability,
19
- scoreCoordinatorIdentity,
20
- scoreEpicIdSpecificity,
21
- scoreForbiddenToolsPresent,
22
- scorePostCompactionDiscipline,
23
- } from "../../src/compaction-prompt-scoring.js";
24
-
25
- // Re-export types for convenience
26
- export type { CompactionPrompt, ScorerResult } from "../../src/compaction-prompt-scoring.js";
27
-
28
- // Re-export pure functions for direct use
29
- export {
30
- scoreActionability,
31
- scoreCoordinatorIdentity,
32
- scoreEpicIdSpecificity,
33
- scoreForbiddenToolsPresent,
34
- scorePostCompactionDiscipline,
35
- } from "../../src/compaction-prompt-scoring.js";
36
-
37
- /**
38
- * Epic ID Specificity Scorer
39
- *
40
- * Validates that epic IDs are REAL, not placeholders.
41
- * Score: 1.0 if real IDs, 0.0 if placeholders found
42
- */
43
- export const epicIdSpecificity = createScorer({
44
- name: "Epic ID Specificity",
45
- description: "Prompt uses real epic IDs, not placeholders",
46
- scorer: ({ output }) => {
47
- try {
48
- const prompt = JSON.parse(String(output)) as CompactionPrompt;
49
- return scoreEpicIdSpecificity(prompt);
50
- } catch (error) {
51
- return {
52
- score: 0,
53
- message: `Failed to parse prompt: ${error}`,
54
- };
55
- }
56
- },
57
- });
58
-
59
- /**
60
- * Actionability Scorer
61
- *
62
- * Validates that the prompt includes SPECIFIC actionable tool calls.
63
- * Score: 1.0 if actionable tool calls with real values, 0.0 otherwise
64
- */
65
- export const actionability = createScorer({
66
- name: "Actionability",
67
- description: "Prompt includes specific tool calls with real values",
68
- scorer: ({ output }) => {
69
- try {
70
- const prompt = JSON.parse(String(output)) as CompactionPrompt;
71
- return scoreActionability(prompt);
72
- } catch (error) {
73
- return {
74
- score: 0,
75
- message: `Failed to parse prompt: ${error}`,
76
- };
77
- }
78
- },
79
- });
80
-
81
- /**
82
- * Coordinator Identity Scorer
83
- *
84
- * Validates that the prompt has STRONG coordinator identity reinforcement.
85
- * Score: 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
86
- */
87
- export const coordinatorIdentity = createScorer({
88
- name: "Coordinator Identity",
89
- description: "Prompt has ASCII header and strong mandates",
90
- scorer: ({ output }) => {
91
- try {
92
- const prompt = JSON.parse(String(output)) as CompactionPrompt;
93
- return scoreCoordinatorIdentity(prompt);
94
- } catch (error) {
95
- return {
96
- score: 0,
97
- message: `Failed to parse prompt: ${error}`,
98
- };
99
- }
100
- },
101
- });
102
-
103
- /**
104
- * Forbidden Tools Present Scorer
105
- *
106
- * Validates that the prompt LISTS forbidden tools by name.
107
- * Score: ratio of forbidden tools mentioned (0.0 to 1.0)
108
- */
109
- export const forbiddenToolsPresent = createScorer({
110
- name: "Forbidden Tools Present",
111
- description: "Prompt lists forbidden tools by name",
112
- scorer: ({ output }) => {
113
- try {
114
- const prompt = JSON.parse(String(output)) as CompactionPrompt;
115
- return scoreForbiddenToolsPresent(prompt);
116
- } catch (error) {
117
- return {
118
- score: 0,
119
- message: `Failed to parse prompt: ${error}`,
120
- };
121
- }
122
- },
123
- });
124
-
125
- /**
126
- * Post-Compaction Discipline Scorer
127
- *
128
- * Validates that the FIRST suggested tool is correct.
129
- * Score: 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
130
- */
131
- export const postCompactionDiscipline = createScorer({
132
- name: "Post-Compaction Discipline",
133
- description: "First suggested tool is swarm_status or inbox",
134
- scorer: ({ output }) => {
135
- try {
136
- const prompt = JSON.parse(String(output)) as CompactionPrompt;
137
- return scorePostCompactionDiscipline(prompt);
138
- } catch (error) {
139
- return {
140
- score: 0,
141
- message: `Failed to parse prompt: ${error}`,
142
- };
143
- }
144
- },
145
- });
package/evals/scorers/compaction-scorers.ts DELETED
@@ -1,305 +0,0 @@
1
- /**
2
- * Custom scorers for compaction hook evaluation
3
- *
4
- * These scorers validate that the compaction hook correctly:
5
- * 1. Detects swarm state (confidence level)
6
- * 2. Injects appropriate context (full/fallback/none)
7
- * 3. Includes required patterns in context
8
- * 4. Excludes placeholder/generic content
9
- */
10
-
11
- import { createScorer } from "evalite";
12
-
13
- /**
14
- * Expected output from compaction hook tests
15
- */
16
- export interface CompactionResult {
17
- detected: boolean;
18
- confidence: "high" | "medium" | "low" | "none";
19
- contextInjected: boolean;
20
- contextType: "full" | "fallback" | "none";
21
- injectedContext: string;
22
- }
23
-
24
- /**
25
- * Expected criteria from test case
26
- */
27
- export interface CompactionExpected {
28
- confidence: "high" | "medium" | "low" | "none";
29
- contextInjected: boolean;
30
- contextType: "full" | "fallback" | "none";
31
- mustContain?: string[];
32
- mustNotContain?: string[];
33
- }
34
-
35
- /**
36
- * Validates that detection confidence matches expected level
37
- *
38
- * Confidence determines what gets injected:
39
- * - HIGH/MEDIUM: Full coordinator context
40
- * - LOW: Fallback detection prompt
41
- * - NONE: No injection
42
- *
43
- * Score: 1.0 if confidence matches, 0.0 otherwise
44
- */
45
- export const confidenceAccuracy = createScorer({
46
- name: "Confidence Accuracy",
47
- description: "Validates detection confidence matches expected level",
48
- scorer: ({ output, expected }) => {
49
- try {
50
- const result = JSON.parse(String(output)) as CompactionResult;
51
- const exp = expected as CompactionExpected;
52
-
53
- if (result.confidence === exp.confidence) {
54
- return {
55
- score: 1,
56
- message: `Correct confidence: ${result.confidence}`,
57
- };
58
- }
59
-
60
- return {
61
- score: 0,
62
- message: `Wrong confidence: got ${result.confidence}, expected ${exp.confidence}`,
63
- };
64
- } catch (error) {
65
- return {
66
- score: 0,
67
- message: `Failed to parse result: ${error}`,
68
- };
69
- }
70
- },
71
- });
72
-
73
- /**
74
- * Validates that context injection matches expected behavior
75
- *
76
- * Checks:
77
- * - Whether context was injected (boolean)
78
- * - What type of context (full/fallback/none)
79
- *
80
- * Score: 1.0 if both match, 0.5 if only injection status matches, 0.0 otherwise
81
- */
82
- export const contextInjectionCorrectness = createScorer({
83
- name: "Context Injection Correctness",
84
- description: "Validates context injection matches expected behavior",
85
- scorer: ({ output, expected }) => {
86
- try {
87
- const result = JSON.parse(String(output)) as CompactionResult;
88
- const exp = expected as CompactionExpected;
89
-
90
- const injectionMatches = result.contextInjected === exp.contextInjected;
91
- const typeMatches = result.contextType === exp.contextType;
92
-
93
- if (injectionMatches && typeMatches) {
94
- return {
95
- score: 1,
96
- message: `Correct injection: ${result.contextType}`,
97
- };
98
- }
99
-
100
- if (injectionMatches) {
101
- return {
102
- score: 0.5,
103
- message: `Injection status correct but wrong type: got ${result.contextType}, expected ${exp.contextType}`,
104
- };
105
- }
106
-
107
- return {
108
- score: 0,
109
- message: `Wrong injection: got ${result.contextInjected ? result.contextType : "none"}, expected ${exp.contextInjected ? exp.contextType : "none"}`,
110
- };
111
- } catch (error) {
112
- return {
113
- score: 0,
114
- message: `Failed to parse result: ${error}`,
115
- };
116
- }
117
- },
118
- });
119
-
120
- /**
121
- * Validates that injected context contains required patterns
122
- *
123
- * For coordinator resumption, context MUST include:
124
- * - Swarm continuation instructions
125
- * - Tool names (swarm_status, swarmmail_inbox)
126
- * - Actionable language ("COORDINATOR", "Keep Cooking")
127
- *
128
- * Score: ratio of required patterns found (0.0 to 1.0)
129
- */
130
- export const requiredPatternsPresent = createScorer({
131
- name: "Required Patterns Present",
132
- description: "Validates injected context contains required patterns",
133
- scorer: ({ output, expected }) => {
134
- try {
135
- const result = JSON.parse(String(output)) as CompactionResult;
136
- const exp = expected as CompactionExpected;
137
-
138
- // If no context injected, check that mustContain is empty
139
- if (!result.contextInjected) {
140
- if (!exp.mustContain || exp.mustContain.length === 0) {
141
- return {
142
- score: 1,
143
- message: "No context injected (expected)",
144
- };
145
- }
146
- return {
147
- score: 0,
148
- message: "No context injected but patterns were expected",
149
- };
150
- }
151
-
152
- // Check required patterns
153
- if (!exp.mustContain || exp.mustContain.length === 0) {
154
- return {
155
- score: 1,
156
- message: "No required patterns to check",
157
- };
158
- }
159
-
160
- const found = exp.mustContain.filter((pattern) =>
161
- result.injectedContext.includes(pattern),
162
- );
163
-
164
- const score = found.length / exp.mustContain.length;
165
-
166
- if (score === 1) {
167
- return {
168
- score: 1,
169
- message: `All ${exp.mustContain.length} required patterns found`,
170
- };
171
- }
172
-
173
- const missing = exp.mustContain.filter(
174
- (pattern) => !result.injectedContext.includes(pattern),
175
- );
176
-
177
- return {
178
- score,
179
- message: `${found.length}/${exp.mustContain.length} patterns found. Missing: ${missing.join(", ")}`,
180
- };
181
- } catch (error) {
182
- return {
183
- score: 0,
184
- message: `Failed to parse result: ${error}`,
185
- };
186
- }
187
- },
188
- });
189
-
190
- /**
191
- * Validates that injected context excludes forbidden patterns
192
- *
193
- * Context should NOT contain:
194
- * - Placeholder IDs ("bd-xxx")
195
- * - Generic/template language
196
- * - Wrong context type markers
197
- *
198
- * Score: 1.0 if no forbidden patterns found, 0.0 if any found
199
- */
200
- export const forbiddenPatternsAbsent = createScorer({
201
- name: "Forbidden Patterns Absent",
202
- description: "Validates injected context excludes forbidden patterns",
203
- scorer: ({ output, expected }) => {
204
- try {
205
- const result = JSON.parse(String(output)) as CompactionResult;
206
- const exp = expected as CompactionExpected;
207
-
208
- // If no context injected, all checks pass
209
- if (!result.contextInjected) {
210
- return {
211
- score: 1,
212
- message: "No context injected (no forbidden patterns possible)",
213
- };
214
- }
215
-
216
- // Check forbidden patterns
217
- if (!exp.mustNotContain || exp.mustNotContain.length === 0) {
218
- return {
219
- score: 1,
220
- message: "No forbidden patterns to check",
221
- };
222
- }
223
-
224
- const foundForbidden = exp.mustNotContain.filter((pattern) =>
225
- result.injectedContext.includes(pattern),
226
- );
227
-
228
- if (foundForbidden.length === 0) {
229
- return {
230
- score: 1,
231
- message: "No forbidden patterns found",
232
- };
233
- }
234
-
235
- return {
236
- score: 0,
237
- message: `Forbidden patterns found: ${foundForbidden.join(", ")}`,
238
- };
239
- } catch (error) {
240
- return {
241
- score: 0,
242
- message: `Failed to parse result: ${error}`,
243
- };
244
- }
245
- },
246
- });
247
-
248
- /**
249
- * Composite scorer: Overall compaction quality
250
- *
251
- * Combines all compaction-specific checks into single score.
252
- * Weighted average:
253
- * - Confidence accuracy: 25%
254
- * - Context injection: 25%
255
- * - Required patterns: 30%
256
- * - Forbidden patterns: 20%
257
- *
258
- * Score: 0.0 to 1.0
259
- */
260
- export const compactionQuality = createScorer({
261
- name: "Overall Compaction Quality",
262
- description: "Composite score for compaction hook correctness",
263
- scorer: async ({ output, expected, input }) => {
264
- try {
265
- // Run all scorers
266
- const scores = {
267
- confidence: await confidenceAccuracy({ output, expected, input }),
268
- injection: await contextInjectionCorrectness({ output, expected, input }),
269
- required: await requiredPatternsPresent({ output, expected, input }),
270
- forbidden: await forbiddenPatternsAbsent({ output, expected, input }),
271
- };
272
-
273
- // Weighted average
274
- const weights = {
275
- confidence: 0.25,
276
- injection: 0.25,
277
- required: 0.3,
278
- forbidden: 0.2,
279
- };
280
-
281
- const totalScore =
282
- (scores.confidence.score ?? 0) * weights.confidence +
283
- (scores.injection.score ?? 0) * weights.injection +
284
- (scores.required.score ?? 0) * weights.required +
285
- (scores.forbidden.score ?? 0) * weights.forbidden;
286
-
287
- const details = [
288
- `Confidence: ${((scores.confidence.score ?? 0) * 100).toFixed(0)}%`,
289
- `Injection: ${((scores.injection.score ?? 0) * 100).toFixed(0)}%`,
290
- `Required: ${((scores.required.score ?? 0) * 100).toFixed(0)}%`,
291
- `Forbidden: ${((scores.forbidden.score ?? 0) * 100).toFixed(0)}%`,
292
- ].join(", ");
293
-
294
- return {
295
- score: totalScore,
296
- message: `Overall: ${(totalScore * 100).toFixed(0)}% (${details})`,
297
- };
298
- } catch (error) {
299
- return {
300
- score: 0,
301
- message: `Failed to compute composite score: ${error}`,
302
- };
303
- }
304
- },
305
- });