opencode-swarm-plugin 0.43.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cass.characterization.test.ts +422 -0
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.test.ts +68 -0
- package/bin/swarm.ts +81 -8
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/contributor-tools.d.ts +42 -0
- package/dist/contributor-tools.d.ts.map +1 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7728 -62590
- package/dist/plugin.js +23833 -78695
- package/dist/sessions/agent-discovery.d.ts +59 -0
- package/dist/sessions/agent-discovery.d.ts.map +1 -0
- package/dist/sessions/index.d.ts +10 -0
- package/dist/sessions/index.d.ts.map +1 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2255
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2426
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -935
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
package/evals/lib/llm.ts
DELETED
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* LLM Client for Evalite Evals
|
|
3
|
-
*
|
|
4
|
-
* Uses AI SDK v6 with Vercel AI Gateway.
|
|
5
|
-
* Gateway handles provider routing - just pass "provider/model" string.
|
|
6
|
-
*
|
|
7
|
-
* @module evals/lib/llm
|
|
8
|
-
*/
|
|
9
|
-
import { generateText, gateway } from "ai";
|
|
10
|
-
import type { GatewayModelId } from "ai";
|
|
11
|
-
|
|
12
|
-
/**
 * Default model for decomposition evals.
 *
 * Claude Sonnet, chosen as a balance of quality and cost.
 */
export const DEFAULT_MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";

/**
 * Output-token cap used when the caller does not supply one.
 */
const DEFAULT_MAX_OUTPUT_TOKENS = 4096;

/**
 * Generate a decomposition from a task description.
 *
 * Sends the prompt through the AI gateway and returns the model's text
 * response unmodified.
 *
 * @param prompt - The full decomposition prompt
 * @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
 * @param maxOutputTokens - Upper bound on generated tokens (default 4096).
 *   Raise it if large decompositions come back truncated.
 * @returns The raw text response from the LLM
 */
export async function generateDecomposition(
  prompt: string,
  model: GatewayModelId = DEFAULT_MODEL,
  maxOutputTokens: number = DEFAULT_MAX_OUTPUT_TOKENS,
): Promise<string> {
  const { text } = await generateText({
    model: gateway(model),
    prompt,
    maxOutputTokens,
  });

  return text;
}
|
|
37
|
-
|
|
38
|
-
/**
 * Format a decomposition prompt from task and context.
 *
 * Uses the same prompt template as swarm_plan_prompt.
 *
 * @param task - Task description, inserted verbatim under "## Task"
 * @param context - Optional extra context. Omitted, empty, or whitespace-only
 *   values produce no "## Context" section.
 * @param maxSubtasks - Upper bound advertised to the model (default 6).
 *   Values below 2 are raised to 2 because the prompt always asks for at
 *   least two subtasks; fractional values are floored; non-finite values
 *   fall back to the default.
 * @returns The complete prompt text
 */
export function formatDecompositionPrompt(
  task: string,
  context?: string,
  maxSubtasks: number = 6,
): string {
  // A whitespace-only context would otherwise render a heading with no body.
  const contextSection = context?.trim() ? `## Context\n${context}` : "";

  // Keep the advertised range well-formed: "2-1" or "2-NaN" would be a
  // contradictory instruction to the model.
  const subtaskLimit = Number.isFinite(maxSubtasks)
    ? Math.max(2, Math.floor(maxSubtasks))
    : 6;

  return `You are decomposing a task into parallelizable subtasks for a swarm of agents.

## Task
${task}

${contextSection}

## Requirements

1. **Break into 2-${subtaskLimit} independent subtasks** that can run in parallel
2. **Assign files** - each subtask must specify which files it will modify
3. **No file overlap** - files cannot appear in multiple subtasks (they get exclusive locks)
4. **Order by dependency** - if subtask B needs subtask A's output, A must come first in the array
5. **Estimate complexity** - 1 (trivial) to 5 (complex)

## Response Format

Respond with ONLY a JSON object matching this schema (no markdown, no explanation):

{
"epic": {
"title": "string",
"description": "string"
},
"subtasks": [
{
"title": "string",
"description": "string",
"files": ["string"],
"dependencies": [0],
"estimated_complexity": 1
}
]
}

## Guidelines

- **Plan aggressively** - when in doubt, split further
- **Prefer smaller, focused subtasks** over large complex ones
- **Include test files** in the same subtask as the code they test
- **Be specific about files** - use actual file paths, not placeholders

Now decompose the task. Respond with JSON only:`;
}
|
|
94
|
-
|
|
95
|
-
/**
 * Extract JSON from an LLM response.
 *
 * Handles responses that wrap the payload in a markdown code block or
 * surround it with extra text. Resolution order:
 *
 * 1. The first fenced code block (any language tag, case-insensitive),
 *    returned trimmed.
 * 2. The first brace-balanced `{...}` span that parses as JSON.
 * 3. The span from the first `{` to the last `}` (legacy fallback).
 * 4. The input unchanged when no `{...}` is present.
 *
 * @param text - Raw model response
 * @returns The best candidate JSON string; not guaranteed to be valid JSON
 */
export function extractJson(text: string): string {
  // A language tag is skipped either as the literal "json" or as any tag that
  // sits alone on the fence line, so "JSON", "json5", "ts" do not leak into
  // the captured payload.
  const fenced = text.match(
    /```(?:json\b|[\w-]+[ \t]*\r?\n)?\s*([\s\S]*?)```/i,
  );
  if (fenced) {
    return (fenced[1] ?? "").trim();
  }

  // Prefer a span that actually parses, so trailing prose containing "}"
  // does not get glued onto the object.
  const balanced = findFirstJsonObject(text);
  if (balanced !== null) {
    return balanced;
  }

  // Legacy behavior: widest brace-delimited span, valid or not.
  const greedy = text.match(/\{[\s\S]*\}/);
  if (greedy) {
    return greedy[0];
  }

  // Return as-is if no JSON found
  return text;
}

/**
 * Return the first brace-balanced substring of `text` that `JSON.parse`
 * accepts, or `null` if there is none.
 */
function findFirstJsonObject(text: string): string | null {
  let start = text.indexOf("{");
  while (start !== -1) {
    const end = findMatchingBrace(text, start);
    if (end !== -1) {
      const candidate = text.slice(start, end + 1);
      try {
        JSON.parse(candidate);
        return candidate;
      } catch {
        // Not valid JSON; try the next opening brace.
      }
    }
    start = text.indexOf("{", start + 1);
  }
  return null;
}

/**
 * Find the index of the `}` that closes the `{` at `start`, ignoring braces
 * inside double-quoted strings. Returns -1 when the brace is never closed.
 */
function findMatchingBrace(text: string, start: number): number {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];

    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }

    if (ch === '"') {
      inString = true;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }

  return -1;
}
|
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Compaction Prompt Quality Scorers - Evalite Wrappers
|
|
3
|
-
*
|
|
4
|
-
* These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
|
|
5
|
-
* for use with evalite's test runner.
|
|
6
|
-
*
|
|
7
|
-
* Weighted scoring:
|
|
8
|
-
* - epicIdSpecificity (0.20) - real IDs not placeholders
|
|
9
|
-
* - actionability (0.20) - swarm_status/inbox with real values
|
|
10
|
-
* - coordinatorIdentity (0.25) - ASCII header + strong mandates
|
|
11
|
-
* - forbiddenToolsPresent (0.15) - lists forbidden tools by name
|
|
12
|
-
* - postCompactionDiscipline (0.20) - first tool correct, no edit/write
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
import { createScorer } from "evalite";
|
|
16
|
-
import type { CompactionPrompt } from "../../src/compaction-prompt-scoring.js";
|
|
17
|
-
import {
|
|
18
|
-
scoreActionability,
|
|
19
|
-
scoreCoordinatorIdentity,
|
|
20
|
-
scoreEpicIdSpecificity,
|
|
21
|
-
scoreForbiddenToolsPresent,
|
|
22
|
-
scorePostCompactionDiscipline,
|
|
23
|
-
} from "../../src/compaction-prompt-scoring.js";
|
|
24
|
-
|
|
25
|
-
// Re-export types for convenience
|
|
26
|
-
export type { CompactionPrompt, ScorerResult } from "../../src/compaction-prompt-scoring.js";
|
|
27
|
-
|
|
28
|
-
// Re-export pure functions for direct use
|
|
29
|
-
export {
|
|
30
|
-
scoreActionability,
|
|
31
|
-
scoreCoordinatorIdentity,
|
|
32
|
-
scoreEpicIdSpecificity,
|
|
33
|
-
scoreForbiddenToolsPresent,
|
|
34
|
-
scorePostCompactionDiscipline,
|
|
35
|
-
} from "../../src/compaction-prompt-scoring.js";
|
|
36
|
-
|
|
37
|
-
/** Signature shared by the pure prompt-scoring functions imported above. */
type PromptScoreFn = (
  prompt: CompactionPrompt,
) => ReturnType<typeof scoreEpicIdSpecificity>;

/**
 * Turn a scorer's `output` into a CompactionPrompt.
 *
 * Objects are passed through unchanged; anything else is stringified and
 * parsed as JSON. Throws if the string form is not valid JSON.
 */
function parseCompactionPrompt(output: unknown): CompactionPrompt {
  // String(obj) would be "[object Object]" and never parse, so already-parsed
  // outputs are accepted as-is.
  if (typeof output === "object" && output !== null) {
    return output as CompactionPrompt;
  }
  return JSON.parse(String(output)) as CompactionPrompt;
}

/**
 * Build an evalite scorer that parses the task output into a
 * CompactionPrompt and delegates to one pure scoring function.
 *
 * Any error thrown while parsing or scoring is reported as score 0 with a
 * "Failed to parse prompt" message instead of propagating.
 */
function createPromptScorer(
  name: string,
  description: string,
  scorePrompt: PromptScoreFn,
) {
  return createScorer({
    name,
    description,
    scorer: ({ output }) => {
      try {
        return scorePrompt(parseCompactionPrompt(output));
      } catch (error) {
        return {
          score: 0,
          message: `Failed to parse prompt: ${error}`,
        };
      }
    },
  });
}

/**
 * Epic ID Specificity Scorer
 *
 * Validates that epic IDs are REAL, not placeholders.
 * Score: 1.0 if real IDs, 0.0 if placeholders found
 */
export const epicIdSpecificity = createPromptScorer(
  "Epic ID Specificity",
  "Prompt uses real epic IDs, not placeholders",
  scoreEpicIdSpecificity,
);

/**
 * Actionability Scorer
 *
 * Validates that the prompt includes SPECIFIC actionable tool calls.
 * Score: 1.0 if actionable tool calls with real values, 0.0 otherwise
 */
export const actionability = createPromptScorer(
  "Actionability",
  "Prompt includes specific tool calls with real values",
  scoreActionability,
);

/**
 * Coordinator Identity Scorer
 *
 * Validates that the prompt has STRONG coordinator identity reinforcement.
 * Score: 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
 */
export const coordinatorIdentity = createPromptScorer(
  "Coordinator Identity",
  "Prompt has ASCII header and strong mandates",
  scoreCoordinatorIdentity,
);

/**
 * Forbidden Tools Present Scorer
 *
 * Validates that the prompt LISTS forbidden tools by name.
 * Score: ratio of forbidden tools mentioned (0.0 to 1.0)
 */
export const forbiddenToolsPresent = createPromptScorer(
  "Forbidden Tools Present",
  "Prompt lists forbidden tools by name",
  scoreForbiddenToolsPresent,
);

/**
 * Post-Compaction Discipline Scorer
 *
 * Validates that the FIRST suggested tool is correct.
 * Score: 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
 */
export const postCompactionDiscipline = createPromptScorer(
  "Post-Compaction Discipline",
  "First suggested tool is swarm_status or inbox",
  scorePostCompactionDiscipline,
);
|
|
@@ -1,305 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Custom scorers for compaction hook evaluation
|
|
3
|
-
*
|
|
4
|
-
* These scorers validate that the compaction hook correctly:
|
|
5
|
-
* 1. Detects swarm state (confidence level)
|
|
6
|
-
* 2. Injects appropriate context (full/fallback/none)
|
|
7
|
-
* 3. Includes required patterns in context
|
|
8
|
-
* 4. Excludes placeholder/generic content
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
import { createScorer } from "evalite";
|
|
12
|
-
|
|
13
|
-
/** Detection confidence levels reported by the compaction hook. */
type CompactionConfidence = "high" | "medium" | "low" | "none";

/** Kinds of context the compaction hook can inject. */
type CompactionContextType = "full" | "fallback" | "none";

/**
 * Output produced by a compaction hook test run.
 */
export interface CompactionResult {
  detected: boolean;
  confidence: CompactionConfidence;
  contextInjected: boolean;
  contextType: CompactionContextType;
  injectedContext: string;
}

/**
 * Criteria a test case expects the compaction result to satisfy.
 */
export interface CompactionExpected {
  confidence: CompactionConfidence;
  contextInjected: boolean;
  contextType: CompactionContextType;
  // Substrings that must appear in the injected context.
  mustContain?: string[];
  // Substrings that must not appear in the injected context.
  mustNotContain?: string[];
}
|
|
34
|
-
|
|
35
|
-
/**
 * Scores whether the detected confidence equals the expected confidence.
 *
 * Confidence drives what gets injected:
 * - HIGH/MEDIUM: Full coordinator context
 * - LOW: Fallback detection prompt
 * - NONE: No injection
 *
 * Score: 1.0 on an exact match, 0.0 otherwise (including when the output
 * cannot be parsed).
 */
export const confidenceAccuracy = createScorer({
  name: "Confidence Accuracy",
  description: "Validates detection confidence matches expected level",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const criteria = expected as CompactionExpected;

      const got = actual.confidence;
      const wanted = criteria.confidence;
      const matched = got === wanted;

      return {
        score: matched ? 1 : 0,
        message: matched
          ? `Correct confidence: ${got}`
          : `Wrong confidence: got ${got}, expected ${wanted}`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
72
|
-
|
|
73
|
-
/**
 * Scores whether context injection behaved as expected.
 *
 * Two things are compared against the expectation:
 * - whether any context was injected (boolean)
 * - which type of context was injected (full/fallback/none)
 *
 * Score: 1.0 if both match, 0.5 if only the injection flag matches,
 * 0.0 otherwise (including when the output cannot be parsed).
 */
export const contextInjectionCorrectness = createScorer({
  name: "Context Injection Correctness",
  description: "Validates context injection matches expected behavior",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const criteria = expected as CompactionExpected;

      const sameInjection =
        actual.contextInjected === criteria.contextInjected;
      const sameType = actual.contextType === criteria.contextType;

      // Injection flag is wrong: nothing else can earn credit.
      if (!sameInjection) {
        const got = actual.contextInjected ? actual.contextType : "none";
        const wanted = criteria.contextInjected
          ? criteria.contextType
          : "none";
        return {
          score: 0,
          message: `Wrong injection: got ${got}, expected ${wanted}`,
        };
      }

      return sameType
        ? {
            score: 1,
            message: `Correct injection: ${actual.contextType}`,
          }
        : {
            score: 0.5,
            message: `Injection status correct but wrong type: got ${actual.contextType}, expected ${criteria.contextType}`,
          };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
119
|
-
|
|
120
|
-
/**
 * Scores how many of the expected `mustContain` substrings appear in the
 * injected context.
 *
 * For coordinator resumption, context MUST include:
 * - Swarm continuation instructions
 * - Tool names (swarm_status, swarmmail_inbox)
 * - Actionable language ("COORDINATOR", "Keep Cooking")
 *
 * Score: ratio of required patterns found (0.0 to 1.0). When nothing was
 * injected the score is 1.0 only if no patterns were required.
 */
export const requiredPatternsPresent = createScorer({
  name: "Required Patterns Present",
  description: "Validates injected context contains required patterns",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const criteria = expected as CompactionExpected;

      const injected = actual.contextInjected;
      const nothingRequired =
        !criteria.mustContain || criteria.mustContain.length === 0;

      // Without injected context the only passing case is "nothing required".
      if (!injected) {
        return nothingRequired
          ? { score: 1, message: "No context injected (expected)" }
          : {
              score: 0,
              message: "No context injected but patterns were expected",
            };
      }

      if (nothingRequired) {
        return { score: 1, message: "No required patterns to check" };
      }

      const patterns = criteria.mustContain ?? [];

      // Split the required patterns into present/absent in a single pass.
      const { present, absent } = patterns.reduce(
        (buckets, pattern) => {
          const bucket = actual.injectedContext.includes(pattern)
            ? buckets.present
            : buckets.absent;
          bucket.push(pattern);
          return buckets;
        },
        { present: [] as string[], absent: [] as string[] },
      );

      const score = present.length / patterns.length;

      if (score === 1) {
        return {
          score: 1,
          message: `All ${patterns.length} required patterns found`,
        };
      }

      return {
        score,
        message: `${present.length}/${patterns.length} patterns found. Missing: ${absent.join(", ")}`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
189
|
-
|
|
190
|
-
/**
 * Scores whether the injected context is free of the expected
 * `mustNotContain` substrings.
 *
 * Context should NOT contain:
 * - Placeholder IDs ("bd-xxx")
 * - Generic/template language
 * - Wrong context type markers
 *
 * Score: 1.0 if no forbidden patterns found (or nothing was injected),
 * 0.0 if any are found or the output cannot be parsed.
 */
export const forbiddenPatternsAbsent = createScorer({
  name: "Forbidden Patterns Absent",
  description: "Validates injected context excludes forbidden patterns",
  scorer: ({ output, expected }) => {
    // Every passing outcome has the same shape; only the message differs.
    const pass = (message: string) => ({ score: 1, message });

    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const criteria = expected as CompactionExpected;

      if (!actual.contextInjected) {
        return pass("No context injected (no forbidden patterns possible)");
      }

      const banned = criteria.mustNotContain;
      if (!banned || banned.length === 0) {
        return pass("No forbidden patterns to check");
      }

      const offenders = banned.filter((pattern) =>
        actual.injectedContext.includes(pattern),
      );

      return offenders.length === 0
        ? pass("No forbidden patterns found")
        : {
            score: 0,
            message: `Forbidden patterns found: ${offenders.join(", ")}`,
          };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
247
|
-
|
|
248
|
-
/**
 * Composite scorer: Overall compaction quality
 *
 * Combines all compaction-specific checks into a single score.
 * Weighted average:
 * - Confidence accuracy: 25%
 * - Context injection: 25%
 * - Required patterns: 30%
 * - Forbidden patterns: 20%
 *
 * A component whose score is null/undefined counts as 0.
 *
 * Score: 0.0 to 1.0 (0.0 if any component scorer throws)
 */
export const compactionQuality = createScorer({
  name: "Overall Compaction Quality",
  description: "Composite score for compaction hook correctness",
  scorer: async ({ output, expected, input }) => {
    try {
      // Single source of truth for label, weight and scorer, so the total and
      // the details string cannot drift apart. Order matters: it fixes both
      // the summation order and the order of the details string.
      const components = [
        { label: "Confidence", weight: 0.25, run: confidenceAccuracy },
        { label: "Injection", weight: 0.25, run: contextInjectionCorrectness },
        { label: "Required", weight: 0.3, run: requiredPatternsPresent },
        { label: "Forbidden", weight: 0.2, run: forbiddenPatternsAbsent },
      ];

      // The component scorers are independent, so run them concurrently.
      const scored = await Promise.all(
        components.map(async ({ label, weight, run }) => {
          const result = await run({ output, expected, input });
          return { label, weight, score: result.score ?? 0 };
        }),
      );

      // Weighted average
      const totalScore = scored.reduce(
        (sum, { score, weight }) => sum + score * weight,
        0,
      );

      const details = scored
        .map(({ label, score }) => `${label}: ${(score * 100).toFixed(0)}%`)
        .join(", ");

      return {
        score: totalScore,
        message: `Overall: ${(totalScore * 100).toFixed(0)}% (${details})`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to compute composite score: ${error}`,
      };
    }
  },
});
|