@swarmtools/evals 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -0
- package/dist/compaction-prompt.eval.d.ts +30 -0
- package/dist/compaction-prompt.eval.d.ts.map +1 -0
- package/dist/compaction-resumption.eval.d.ts +23 -0
- package/dist/compaction-resumption.eval.d.ts.map +1 -0
- package/dist/coordinator-behavior.eval.d.ts +29 -0
- package/dist/coordinator-behavior.eval.d.ts.map +1 -0
- package/dist/coordinator-session.eval.d.ts +26 -0
- package/dist/coordinator-session.eval.d.ts.map +1 -0
- package/dist/example.eval.d.ts +10 -0
- package/dist/example.eval.d.ts.map +1 -0
- package/dist/fixtures/cass-baseline.d.ts +147 -0
- package/dist/fixtures/cass-baseline.d.ts.map +1 -0
- package/dist/fixtures/compaction-cases.d.ts +44 -0
- package/dist/fixtures/compaction-cases.d.ts.map +1 -0
- package/dist/fixtures/compaction-prompt-cases.d.ts +50 -0
- package/dist/fixtures/compaction-prompt-cases.d.ts.map +1 -0
- package/dist/fixtures/coordinator-sessions.d.ts +73 -0
- package/dist/fixtures/coordinator-sessions.d.ts.map +1 -0
- package/dist/fixtures/decomposition-cases.d.ts +20 -0
- package/dist/fixtures/decomposition-cases.d.ts.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +0 -0
- package/dist/lib/compaction-loader.d.ts +103 -0
- package/dist/lib/compaction-loader.d.ts.map +1 -0
- package/dist/lib/data-loader.d.ts +79 -0
- package/dist/lib/data-loader.d.ts.map +1 -0
- package/dist/lib/llm.d.ts +27 -0
- package/dist/lib/llm.d.ts.map +1 -0
- package/dist/scorers/compaction-prompt-scorers.d.ts +51 -0
- package/dist/scorers/compaction-prompt-scorers.d.ts.map +1 -0
- package/dist/scorers/compaction-scorers.d.ts +86 -0
- package/dist/scorers/compaction-scorers.d.ts.map +1 -0
- package/dist/scorers/coordinator-discipline.d.ts +77 -0
- package/dist/scorers/coordinator-discipline.d.ts.map +1 -0
- package/dist/scorers/index.d.ts +51 -0
- package/dist/scorers/index.d.ts.map +1 -0
- package/dist/scorers/outcome-scorers.d.ts +62 -0
- package/dist/scorers/outcome-scorers.d.ts.map +1 -0
- package/dist/swarm-decomposition.eval.d.ts +2 -0
- package/dist/swarm-decomposition.eval.d.ts.map +1 -0
- package/package.json +45 -0
package/README.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# @swarmtools/evals
|
|
2
|
+
|
|
3
|
+
```
|
|
4
|
+
🐝 EVAL SUITE 🐝
|
|
5
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
6
|
+
Swarm Intelligence QA
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Evaluation suite for swarm-tools multi-agent coordination. Uses [Evalite](https://evalite.dev) to measure coordinator behavior, decomposition quality, and compaction correctness.
|
|
10
|
+
|
|
11
|
+
## Purpose
|
|
12
|
+
|
|
13
|
+
This package contains the evaluation framework for the swarm-tools ecosystem. Extracting evals into a separate package ensures:
|
|
14
|
+
|
|
15
|
+
1. **Clean Dependencies** - Main plugin doesn't need evalite/vitest in production
|
|
16
|
+
2. **Faster Installs** - Eval deps only needed for development/CI
|
|
17
|
+
3. **Isolated Testing** - Eval suite can evolve independently from plugin
|
|
18
|
+
|
|
19
|
+
## What Gets Evaluated
|
|
20
|
+
|
|
21
|
+
- **Coordinator Protocol** - Does the coordinator spawn workers vs doing work itself?
|
|
22
|
+
- **Task Decomposition** - Quality of task splitting, file conflict detection
|
|
23
|
+
- **Compaction** - Context compression correctness
|
|
24
|
+
- **Review Thoroughness** - Does coordinator review worker output properly?
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Run all evals
|
|
30
|
+
bun run test
|
|
31
|
+
|
|
32
|
+
# Build for publishing
|
|
33
|
+
bun run build
|
|
34
|
+
|
|
35
|
+
# Type check
|
|
36
|
+
bun run typecheck
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Package Structure
|
|
40
|
+
|
|
41
|
+
This package is part of the swarm-tools monorepo:
|
|
42
|
+
|
|
43
|
+
- `opencode-swarm-plugin` - Main plugin (peer dependency)
|
|
44
|
+
- `swarm-mail` - Event sourcing primitives (peer dependency)
|
|
45
|
+
- `@swarmtools/evals` - This package
|
|
46
|
+
|
|
47
|
+
## Development
|
|
48
|
+
|
|
49
|
+
Evals use real coordinator sessions captured to `~/.config/swarm-tools/sessions/*.jsonl`. See the main plugin's `evals/README.md` for details on session capture.
|
|
50
|
+
|
|
51
|
+
## License
|
|
52
|
+
|
|
53
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compaction Prompt Quality Evaluation
|
|
3
|
+
*
|
|
4
|
+
* Tests that continuation prompts generated after context compaction meet
|
|
5
|
+
* quality criteria for coordinator resumption:
|
|
6
|
+
*
|
|
7
|
+
* 1. Epic ID Specificity (20%) - Real IDs not placeholders
|
|
8
|
+
* 2. Actionability (20%) - Specific tool calls with real values
|
|
9
|
+
* 3. Coordinator Identity (25%) - ASCII header + strong mandates
|
|
10
|
+
* 4. Forbidden Tools (15%) - Lists forbidden tools by name
|
|
11
|
+
* 5. Post-Compaction Discipline (20%) - First tool is correct
|
|
12
|
+
*
|
|
13
|
+
* ## Why This Matters
|
|
14
|
+
*
|
|
15
|
+
* After compaction, coordinators lose context. The continuation prompt is
|
|
16
|
+
* their ONLY guide to resume. Bad prompts cause:
|
|
17
|
+
* - Coordinators editing files (should delegate to workers)
|
|
18
|
+
* - Generic "check status" instead of actual tool calls
|
|
19
|
+
* - Lost epic IDs (can't resume coordination)
|
|
20
|
+
*
|
|
21
|
+
* ## Test Strategy
|
|
22
|
+
*
|
|
23
|
+
* - 6 synthetic fixtures covering perfect/bad prompts
|
|
24
|
+
* - Each fixture tests specific failure modes
|
|
25
|
+
* - Composite scorer validates overall quality
|
|
26
|
+
*
|
|
27
|
+
* Run with: bun run eval:compaction
|
|
28
|
+
*/
|
|
29
|
+
export {};
|
|
30
|
+
//# sourceMappingURL=compaction-prompt.eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-prompt.eval.d.ts","sourceRoot":"","sources":["../src/compaction-prompt.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compaction Hook Coordinator Resumption Eval
|
|
3
|
+
*
|
|
4
|
+
* Tests that the compaction hook correctly detects swarm state and injects
|
|
5
|
+
* appropriate context for coordinator resumption.
|
|
6
|
+
*
|
|
7
|
+
* ## Bug Being Tested
|
|
8
|
+
*
|
|
9
|
+
* Root cause: The compaction hook injects generic "you are a coordinator"
|
|
10
|
+
* context but doesn't include the SPECIFIC epic ID, subtask status, or
|
|
11
|
+
* project path. This causes coordinators to lose identity after compaction.
|
|
12
|
+
*
|
|
13
|
+
* ## Test Cases
|
|
14
|
+
*
|
|
15
|
+
* 1. Active swarm with in_progress epic - should inject full context with epic ID
|
|
16
|
+
* 2. Multiple epics - should identify the in_progress one
|
|
17
|
+
* 3. No active swarm - should not inject coordinator context
|
|
18
|
+
* 4. Blocked epic - should still detect as active swarm
|
|
19
|
+
*
|
|
20
|
+
* Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
|
|
21
|
+
*/
|
|
22
|
+
export {};
|
|
23
|
+
//# sourceMappingURL=compaction-resumption.eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-resumption.eval.d.ts","sourceRoot":"","sources":["../src/compaction-resumption.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Coordinator Behavior After Compaction Eval
|
|
3
|
+
*
|
|
4
|
+
* LLM-as-judge eval that tests whether the compaction context actually
|
|
5
|
+
* causes Claude to behave like a coordinator (spawn workers, check status)
|
|
6
|
+
* rather than a worker (run tests, edit files directly).
|
|
7
|
+
*
|
|
8
|
+
* This is the missing piece - we test the CONTEXT CONTENT in unit tests,
|
|
9
|
+
* but we need to test whether the LLM BEHAVES CORRECTLY given that context.
|
|
10
|
+
*
|
|
11
|
+
* Run with: bunx evalite run evals/coordinator-behavior.eval.ts
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* Scores whether the response mentions coordinator tools
|
|
15
|
+
*/
|
|
16
|
+
export declare const mentionsCoordinatorTools: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
17
|
+
/**
|
|
18
|
+
* Scores whether the response avoids worker behaviors
|
|
19
|
+
*/
|
|
20
|
+
export declare const avoidsWorkerBehaviors: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
21
|
+
/**
|
|
22
|
+
* Scores whether the response shows coordinator mindset
|
|
23
|
+
*/
|
|
24
|
+
export declare const coordinatorMindset: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
25
|
+
/**
|
|
26
|
+
* Composite scorer for overall coordinator behavior
|
|
27
|
+
*/
|
|
28
|
+
export declare const overallCoordinatorBehavior: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
29
|
+
//# sourceMappingURL=coordinator-behavior.eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"coordinator-behavior.eval.d.ts","sourceRoot":"","sources":["../src/coordinator-behavior.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AA2DH;;GAEG;AACH,eAAO,MAAM,wBAAwB,6DAwBnC,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,qBAAqB,6DAiChC,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,kBAAkB,6DAyC7B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,0BAA0B,6DAmBrC,CAAC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Coordinator Session Eval - Scores Real Captured Sessions
|
|
3
|
+
*
|
|
4
|
+
* Tests that coordinators follow protocol:
|
|
5
|
+
* 1. Don't edit files directly (spawn workers)
|
|
6
|
+
* 2. Don't run tests directly (workers do verification)
|
|
7
|
+
* 3. Spawn workers for all subtasks
|
|
8
|
+
* 4. Review worker output before accepting
|
|
9
|
+
* 5. Minimize time to first spawn (don't overthink)
|
|
10
|
+
*
|
|
11
|
+
* ## Data Sources
|
|
12
|
+
*
|
|
13
|
+
* - **Real sessions**: Captured from ~/.config/swarm-tools/sessions/*.jsonl
|
|
14
|
+
* - **Synthetic fixtures**: Test cases in fixtures/coordinator-sessions.ts
|
|
15
|
+
*
|
|
16
|
+
* ## Test Flow
|
|
17
|
+
*
|
|
18
|
+
* 1. Load captured sessions from disk (via loadCapturedSessions)
|
|
19
|
+
* 2. Load synthetic fixtures for baseline validation
|
|
20
|
+
* 3. Run coordinator-discipline scorers on all sessions
|
|
21
|
+
* 4. Output scores and violation details
|
|
22
|
+
*
|
|
23
|
+
* Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
|
|
24
|
+
*/
|
|
25
|
+
export {};
|
|
26
|
+
//# sourceMappingURL=coordinator-session.eval.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"coordinator-session.eval.d.ts","sourceRoot":"","sources":["../src/coordinator-session.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"example.eval.d.ts","sourceRoot":"","sources":["../src/example.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG"}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CASS Baseline Response Fixtures
|
|
3
|
+
*
|
|
4
|
+
* These fixtures capture the ACTUAL behavior of the CASS binary tools.
|
|
5
|
+
* DO NOT modify to match desired behavior - these document what the binary DOES.
|
|
6
|
+
*
|
|
7
|
+
* Purpose: Characterization tests for ADR-010 (CASS inhousing).
|
|
8
|
+
* These ensure our in-house implementation matches the binary's behavior.
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* cass stats --json
|
|
12
|
+
* Captured: 2025-12-25
|
|
13
|
+
*/
|
|
14
|
+
export declare const cassStatsBaseline: {
|
|
15
|
+
readonly by_agent: readonly [{
|
|
16
|
+
readonly agent: "claude_code";
|
|
17
|
+
readonly count: 137;
|
|
18
|
+
}, {
|
|
19
|
+
readonly agent: "cursor";
|
|
20
|
+
readonly count: 23;
|
|
21
|
+
}, {
|
|
22
|
+
readonly agent: "codex";
|
|
23
|
+
readonly count: 2;
|
|
24
|
+
}];
|
|
25
|
+
readonly conversations: 162;
|
|
26
|
+
readonly date_range: {
|
|
27
|
+
readonly newest: "2025-12-08T04:20:36.526+00:00";
|
|
28
|
+
readonly oldest: "2025-07-14T01:14:44.997+00:00";
|
|
29
|
+
};
|
|
30
|
+
readonly db_path: "/Users/joel/Library/Application Support/com.coding-agent-search.coding-agent-search/agent_search.db";
|
|
31
|
+
readonly messages: 4213;
|
|
32
|
+
readonly top_workspaces: readonly [{
|
|
33
|
+
readonly count: 28;
|
|
34
|
+
readonly workspace: "/Users/joel/Code/vercel/academy-vectr-workflow-course-content/external/workflow-builder-starter";
|
|
35
|
+
}, {
|
|
36
|
+
readonly count: 22;
|
|
37
|
+
readonly workspace: "/Users/joel/Code/vercel/slack-agents-course";
|
|
38
|
+
}];
|
|
39
|
+
};
|
|
40
|
+
/**
|
|
41
|
+
* cass search "swarm" --limit 2 --json
|
|
42
|
+
* Captured: 2025-12-25
|
|
43
|
+
*/
|
|
44
|
+
export declare const cassSearchBaseline: {
|
|
45
|
+
readonly count: 2;
|
|
46
|
+
readonly cursor: null;
|
|
47
|
+
readonly hits: readonly [{
|
|
48
|
+
readonly agent: "claude_code";
|
|
49
|
+
readonly content: "Fixed. The `plugins` key is invalid - OpenCode auto-loads plugins from directories instead.\n\n**Changes:**\n1. ✅ Removed invalid `plugins` array from `opencode.jsonc`\n2. ✅ Created `~/.config/opencode/plugin/` directory\n3. ✅ Symlinked your swarm plugin → `~/.config/opencode/plugin/swarm.js`\n\nThe plugin will now auto-load on startup. Restart OpenCode to pick it up.\n\nSources:\n- [OpenCode Plugins Documentation](https://opencode.ai/docs/plugins/)\n- [OpenCode Config Documentation](https://opencode.ai/docs/config/)";
|
|
50
|
+
readonly created_at: 1765161767083;
|
|
51
|
+
readonly line_number: 9;
|
|
52
|
+
readonly match_type: "exact";
|
|
53
|
+
readonly score: 15.536974906921387;
|
|
54
|
+
readonly snippet: "Symlinked your swarm plugin → `~/.config/opencode/plugin/swarm.js`\n\nThe plugin will now auto-load on startup. Restart OpenCode to pick it up.\n\nSources:\n- [OpenC…";
|
|
55
|
+
readonly source_path: "/Users/joel/.claude/projects/-Users-joel--config-opencode/ccd64ac6-bca7-40e5-9150-cea58c3788ae.jsonl";
|
|
56
|
+
readonly title: "@opencode.jsonc has an invalid plugins key https://opencode.ai/docs/plugins/ https://opencode.ai/doc";
|
|
57
|
+
readonly workspace: "/Users/joel/.config/opencode";
|
|
58
|
+
}, {
|
|
59
|
+
readonly agent: "claude_code";
|
|
60
|
+
readonly content: "I'm ready to help you explore the codebase and design implementation plans. I'm in **READ-ONLY mode** - I can explore files, understand architecture, and create detailed plans, but I cannot and will not modify any files.\n\nI have access to the beads issue tracker (`bd` commands) and can see your current working directory is `/Users/joel/.config/opencode`.\n\n**Current git status shows:**\n- Modified: `.beads/issues.jsonl`, `AGENTS.md`, `command/swarm.md`, `opencode.jsonc`\n- Untracked: `command/swarm-collect.md`, `command/swarm-status.md`, `plugin/`\n\n**What would you like me to explore and plan?**\n\nCommon scenarios I can help with:\n- Designing new command implementations\n- Planning plugin architecture\n- Exploring existing patterns for feature additions\n- Creating implementation strategies for beads issues\n\nLet me know what you need, and I'll dive into the codebase, understand the current architecture, and provide a detailed implementation plan.";
|
|
61
|
+
readonly created_at: 1765161814722;
|
|
62
|
+
readonly line_number: 1;
|
|
63
|
+
readonly match_type: "exact";
|
|
64
|
+
readonly score: 14.522254943847656;
|
|
65
|
+
readonly snippet: ".md`, `command/swarm.md`, `opencode.jsonc`\n- Untracked: `command/swarm-collect.md`, `command/swarm-status.md`, `plugin/`\n\n**What would you like me to explore an…";
|
|
66
|
+
readonly source_path: "/Users/joel/.claude/projects/-Users-joel--config-opencode/agent-ee2a73ee.jsonl";
|
|
67
|
+
readonly title: "opencode";
|
|
68
|
+
readonly workspace: "/Users/joel/.config/opencode";
|
|
69
|
+
}];
|
|
70
|
+
readonly hits_clamped: false;
|
|
71
|
+
readonly limit: 2;
|
|
72
|
+
readonly max_tokens: null;
|
|
73
|
+
readonly offset: 0;
|
|
74
|
+
readonly query: "swarm";
|
|
75
|
+
readonly request_id: null;
|
|
76
|
+
readonly total_matches: 2;
|
|
77
|
+
};
|
|
78
|
+
/**
|
|
79
|
+
* cass health (human-readable output)
|
|
80
|
+
* Captured: 2025-12-25
|
|
81
|
+
*/
|
|
82
|
+
export declare const cassHealthHumanBaseline = "\u2713 Healthy (3ms)\n Note: index stale (older than 300s)";
|
|
83
|
+
/**
|
|
84
|
+
* cass stats (human-readable output)
|
|
85
|
+
* Captured: 2025-12-25
|
|
86
|
+
*/
|
|
87
|
+
export declare const cassStatsHumanBaseline = "CASS Index Statistics\n=====================\nDatabase: /Users/joel/Library/Application Support/com.coding-agent-search.coding-agent-search/agent_search.db\n\nTotals:\n Conversations: 162\n Messages: 4213\n\nBy Agent:\n claude_code: 137\n cursor: 23\n codex: 2\n\nTop Workspaces:\n /Users/joel/Code/vercel/academy-vectr-workflow-course-content/external/workflow-builder-starter: 28\n /Users/joel/Code/vercel/slack-agents-course: 22\n /Users/joel/Code/vercel/academy-vectr-workflow-course-content: 22\n /Users/joel/Code/vercel/academy-content: 13\n /Users/joel/Code/joelhooks/trt-buddy: 13\n /Users/joel/Code/vercel/front: 11\n /Users/joel/.config/opencode: 9\n /Users/joel: 6\n /Users/joel/Code/badass-courses/course-builder/apps/ai-hero: 5\n /Users/joel/Code/vercel/front/apps/vercel-academy: 4\n\nDate Range: 2025-07-14 to 2025-12-08";
|
|
88
|
+
/**
|
|
89
|
+
* cass view <file> -n <line>
|
|
90
|
+
* Captured: 2025-12-25
|
|
91
|
+
*
|
|
92
|
+
* Format: File path header, line indicator with context window, separator, content with line numbers
|
|
93
|
+
*/
|
|
94
|
+
export declare const cassViewBaseline = "File: /Users/joel/.config/swarm-tools/sessions/ses_19yz2iaMpHxY1ddvVq2voC.jsonl\nLine: 1 (context: 5)\n----------------------------------------\n> 1 | {\"session_id\":\"ses_19yz2iaMpHxY1ddvVq2voC\",\"epic_id\":\"cell-f2p61v-mjko4d89zdt\",\"timestamp\":\"2025-12-24T23:51:52.896Z\",\"event_type\":\"OUTCOME\",\"outcome_type\":\"subtask_success\",\"payload\":{\"bead_id\":\"cell-f2p61v-mjko4d89zdt\",\"duration_ms\":0,\"files_touched\":[],\"verification_passed\":false,\"verification_skipped\":true}}\n----------------------------------------";
|
|
95
|
+
/**
|
|
96
|
+
* Error responses (captured from actual failures)
|
|
97
|
+
*/
|
|
98
|
+
export declare const cassErrorBaseline: {
|
|
99
|
+
readonly fileNotFound: {
|
|
100
|
+
readonly error: {
|
|
101
|
+
readonly code: 3;
|
|
102
|
+
readonly hint: null;
|
|
103
|
+
readonly kind: "file-not-found";
|
|
104
|
+
readonly message: "File not found: /Users/joel/.config/swarm-tools/sessions/ses_fRrFb7WrNr9K89JBCKd6GV.jsonl";
|
|
105
|
+
readonly retryable: false;
|
|
106
|
+
};
|
|
107
|
+
};
|
|
108
|
+
readonly invalidArgument: {
|
|
109
|
+
readonly error: {
|
|
110
|
+
readonly code: 2;
|
|
111
|
+
readonly hint: {
|
|
112
|
+
readonly common_mistakes: readonly [{
|
|
113
|
+
readonly correct: "cass robot-docs";
|
|
114
|
+
readonly wrong: "cass --robot-docs";
|
|
115
|
+
}, {
|
|
116
|
+
readonly correct: "cass robot-docs commands";
|
|
117
|
+
readonly wrong: "cass --robot-docs=commands";
|
|
118
|
+
}, {
|
|
119
|
+
readonly correct: "cass robot-docs";
|
|
120
|
+
readonly wrong: "cass robot-docs --robot";
|
|
121
|
+
}];
|
|
122
|
+
readonly error: "error: unexpected argument '--robot' found\\n\\nUsage: cass stats [OPTIONS]\\n\\nFor more information, try '--help'.\\n";
|
|
123
|
+
readonly examples: readonly ["cass robot-docs commands", "cass robot-docs schemas", "cass robot-docs examples", "cass --robot-help"];
|
|
124
|
+
readonly flag_syntax: {
|
|
125
|
+
readonly correct: readonly ["--limit 5", "--robot", "--json"];
|
|
126
|
+
readonly incorrect: readonly ["-limit 5", "limit=5", "--Limit"];
|
|
127
|
+
};
|
|
128
|
+
readonly hints: readonly ["For get robot-mode documentation, try: cass --robot-help"];
|
|
129
|
+
readonly kind: "argument_parsing";
|
|
130
|
+
readonly status: "error";
|
|
131
|
+
};
|
|
132
|
+
readonly kind: "usage";
|
|
133
|
+
readonly message: "Could not parse arguments";
|
|
134
|
+
readonly retryable: false;
|
|
135
|
+
};
|
|
136
|
+
};
|
|
137
|
+
};
|
|
138
|
+
/**
|
|
139
|
+
* Schema definitions extracted from actual responses
|
|
140
|
+
*/
|
|
141
|
+
export type CassStatsResponse = typeof cassStatsBaseline;
|
|
142
|
+
export type CassSearchResponse = typeof cassSearchBaseline;
|
|
143
|
+
export type CassSearchHit = CassSearchResponse["hits"][number];
|
|
144
|
+
export type CassAgentStats = CassStatsResponse["by_agent"][number];
|
|
145
|
+
export type CassWorkspaceStats = CassStatsResponse["top_workspaces"][number];
|
|
146
|
+
export type CassError = typeof cassErrorBaseline.fileNotFound | typeof cassErrorBaseline.invalidArgument;
|
|
147
|
+
//# sourceMappingURL=cass-baseline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cass-baseline.d.ts","sourceRoot":"","sources":["../../src/fixtures/cass-baseline.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH;;;GAGG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;;;;;;CAkCpB,CAAC;AAEX;;;GAGG;AACH,eAAO,MAAM,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA2CrB,CAAC;AAEX;;;GAGG;AACH,eAAO,MAAM,uBAAuB,gEACE,CAAC;AAEvC;;;GAGG;AACH,eAAO,MAAM,sBAAsB,u1BAyBE,CAAC;AAEtC;;;;;GAKG;AACH,eAAO,MAAM,gBAAgB,oiBAIY,CAAC;AAE1C;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAoDpB,CAAC;AAEX;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,OAAO,iBAAiB,CAAC;AACzD,MAAM,MAAM,kBAAkB,GAAG,OAAO,kBAAkB,CAAC;AAC3D,MAAM,MAAM,aAAa,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC;AAC/D,MAAM,MAAM,cAAc,GAAG,iBAAiB,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;AACnE,MAAM,MAAM,kBAAkB,GAAG,iBAAiB,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAC;AAC7E,MAAM,MAAM,SAAS,GACjB,OAAO,iBAAiB,CAAC,YAAY,GACrC,OAAO,iBAAiB,CAAC,eAAe,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test cases for compaction hook coordinator resumption
|
|
3
|
+
*
|
|
4
|
+
* Each case simulates a different swarm state and verifies that
|
|
5
|
+
* the compaction hook injects the correct context for resumption.
|
|
6
|
+
*/
|
|
7
|
+
import type { Cell } from "swarm-mail";
|
|
8
|
+
/**
|
|
9
|
+
* Compaction test case structure
|
|
10
|
+
*/
|
|
11
|
+
export interface CompactionTestCase {
|
|
12
|
+
name: string;
|
|
13
|
+
description: string;
|
|
14
|
+
/**
|
|
15
|
+
* Simulated hive state (cells to create)
|
|
16
|
+
*/
|
|
17
|
+
hiveCells: Array<Omit<Cell, "created_at" | "updated_at" | "closed_at">>;
|
|
18
|
+
/**
|
|
19
|
+
* Simulated swarm-mail state
|
|
20
|
+
*/
|
|
21
|
+
swarmMailState: {
|
|
22
|
+
agents: number;
|
|
23
|
+
reservations: number;
|
|
24
|
+
messages: number;
|
|
25
|
+
};
|
|
26
|
+
/**
|
|
27
|
+
* Expected detection outcome: confidence level, whether and which kind of context is injected, and required/forbidden patterns
|
|
28
|
+
*/
|
|
29
|
+
expected: {
|
|
30
|
+
confidence: "high" | "medium" | "low" | "none";
|
|
31
|
+
contextInjected: boolean;
|
|
32
|
+
contextType: "full" | "fallback" | "none";
|
|
33
|
+
/**
|
|
34
|
+
* Patterns that MUST appear in injected context (if injected)
|
|
35
|
+
*/
|
|
36
|
+
mustContain?: string[];
|
|
37
|
+
/**
|
|
38
|
+
* Patterns that MUST NOT appear
|
|
39
|
+
*/
|
|
40
|
+
mustNotContain?: string[];
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
export declare const compactionCases: CompactionTestCase[];
|
|
44
|
+
//# sourceMappingURL=compaction-cases.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-cases.d.ts","sourceRoot":"","sources":["../../src/fixtures/compaction-cases.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,YAAY,GAAG,YAAY,GAAG,WAAW,CAAC,CAAC,CAAC;IACxE;;OAEG;IACH,cAAc,EAAE;QACd,MAAM,EAAE,MAAM,CAAC;QACf,YAAY,EAAE,MAAM,CAAC;QACrB,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC;IACF;;OAEG;IACH,QAAQ,EAAE;QACR,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,GAAG,MAAM,CAAC;QAC/C,eAAe,EAAE,OAAO,CAAC;QACzB,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,MAAM,CAAC;QAC1C;;WAEG;QACH,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;QACvB;;WAEG;QACH,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;KAC3B,CAAC;CACH;AAED,eAAO,MAAM,eAAe,EAAE,kBAAkB,EAgT/C,CAAC"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test cases for compaction prompt quality evaluation
|
|
3
|
+
*
|
|
4
|
+
* Each case represents a continuation prompt that should be generated
|
|
5
|
+
* after context compaction. Tests validate that prompts have:
|
|
6
|
+
* - Real epic IDs (not placeholders)
|
|
7
|
+
* - Actionable tool calls with specific values
|
|
8
|
+
* - Strong coordinator identity
|
|
9
|
+
* - Explicit forbidden tools list
|
|
10
|
+
* - Correct first tool suggestion
|
|
11
|
+
*/
|
|
12
|
+
import type { CompactionPrompt } from "opencode-swarm-plugin/compaction-prompt-scoring";
|
|
13
|
+
/**
|
|
14
|
+
* Compaction prompt test case structure
|
|
15
|
+
*/
|
|
16
|
+
export interface CompactionPromptTestCase {
|
|
17
|
+
name: string;
|
|
18
|
+
description: string;
|
|
19
|
+
/**
|
|
20
|
+
* The generated continuation prompt
|
|
21
|
+
*/
|
|
22
|
+
prompt: CompactionPrompt;
|
|
23
|
+
/**
|
|
24
|
+
* Expected scoring outcomes
|
|
25
|
+
*/
|
|
26
|
+
expected: {
|
|
27
|
+
/**
|
|
28
|
+
* Should have real epic IDs (not placeholders)
|
|
29
|
+
*/
|
|
30
|
+
hasRealEpicId: boolean;
|
|
31
|
+
/**
|
|
32
|
+
* Should have actionable tool calls
|
|
33
|
+
*/
|
|
34
|
+
isActionable: boolean;
|
|
35
|
+
/**
|
|
36
|
+
* Should have strong coordinator identity
|
|
37
|
+
*/
|
|
38
|
+
hasCoordinatorIdentity: boolean;
|
|
39
|
+
/**
|
|
40
|
+
* Should list forbidden tools by name
|
|
41
|
+
*/
|
|
42
|
+
listsForbiddenTools: boolean;
|
|
43
|
+
/**
|
|
44
|
+
* First suggested tool should be correct
|
|
45
|
+
*/
|
|
46
|
+
hasCorrectFirstTool: boolean;
|
|
47
|
+
};
|
|
48
|
+
}
|
|
49
|
+
export declare const compactionPromptCases: CompactionPromptTestCase[];
|
|
50
|
+
//# sourceMappingURL=compaction-prompt-cases.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-prompt-cases.d.ts","sourceRoot":"","sources":["../../src/fixtures/compaction-prompt-cases.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iDAAiD,CAAC;AAExF;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACxC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,MAAM,EAAE,gBAAgB,CAAC;IACzB;;OAEG;IACH,QAAQ,EAAE;QACT;;WAEG;QACH,aAAa,EAAE,OAAO,CAAC;QACvB;;WAEG;QACH,YAAY,EAAE,OAAO,CAAC;QACtB;;WAEG;QACH,sBAAsB,EAAE,OAAO,CAAC;QAChC;;WAEG;QACH,mBAAmB,EAAE,OAAO,CAAC;QAC7B;;WAEG;QACH,mBAAmB,EAAE,OAAO,CAAC;KAC7B,CAAC;CACF;AAED,eAAO,MAAM,qBAAqB,EAAE,wBAAwB,EAmQ3D,CAAC"}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Coordinator Session Test Fixtures
|
|
3
|
+
*
|
|
4
|
+
* Synthetic coordinator sessions for testing coordinator-discipline scorers.
|
|
5
|
+
* Each fixture demonstrates good or bad coordinator behavior.
|
|
6
|
+
*/
|
|
7
|
+
import type { CoordinatorSession } from "opencode-swarm-plugin/eval-capture";
|
|
8
|
+
/**
|
|
9
|
+
* PERFECT COORDINATOR
|
|
10
|
+
*
|
|
11
|
+
* - No violations (no direct edits, tests, or reservations)
|
|
12
|
+
* - 100% spawn efficiency (3/3 workers spawned)
|
|
13
|
+
* - 100% review thoroughness (all workers reviewed)
|
|
14
|
+
* - Fast time to first spawn (30s)
|
|
15
|
+
*/
|
|
16
|
+
export declare const perfectCoordinator: CoordinatorSession;
|
|
17
|
+
/**
|
|
18
|
+
* BAD COORDINATOR - Multiple Violations
|
|
19
|
+
*
|
|
20
|
+
* - 3 violations (edited file, ran tests, reserved files)
|
|
21
|
+
* - 33% spawn efficiency (only 1/3 workers spawned)
|
|
22
|
+
* - 0% review thoroughness (no reviews)
|
|
23
|
+
* - Slow time to first spawn (10 minutes)
|
|
24
|
+
*/
|
|
25
|
+
export declare const badCoordinator: CoordinatorSession;
|
|
26
|
+
/**
|
|
27
|
+
* DECENT COORDINATOR - Some Issues
|
|
28
|
+
*
|
|
29
|
+
* - 1 violation (ran tests once)
|
|
30
|
+
* - 100% spawn efficiency (2/2 workers spawned)
|
|
31
|
+
* - 50% review thoroughness (reviewed only 1/2)
|
|
32
|
+
* - Good time to first spawn (45s)
|
|
33
|
+
*/
|
|
34
|
+
export declare const decentCoordinator: CoordinatorSession;
|
|
35
|
+
/**
|
|
36
|
+
* All test fixtures
|
|
37
|
+
*/
|
|
38
|
+
export declare const coordinatorSessionFixtures: {
|
|
39
|
+
session_id: string;
|
|
40
|
+
epic_id: string;
|
|
41
|
+
start_time: string;
|
|
42
|
+
events: ({
|
|
43
|
+
session_id: string;
|
|
44
|
+
epic_id: string;
|
|
45
|
+
timestamp: string;
|
|
46
|
+
event_type: "DECISION";
|
|
47
|
+
decision_type: "strategy_selected" | "worker_spawned" | "review_completed" | "decomposition_complete" | "researcher_spawned" | "skill_loaded" | "inbox_checked" | "blocker_resolved" | "scope_change_approved" | "scope_change_rejected";
|
|
48
|
+
payload: any;
|
|
49
|
+
} | {
|
|
50
|
+
session_id: string;
|
|
51
|
+
epic_id: string;
|
|
52
|
+
timestamp: string;
|
|
53
|
+
event_type: "VIOLATION";
|
|
54
|
+
violation_type: "coordinator_edited_file" | "coordinator_ran_tests" | "coordinator_reserved_files" | "no_worker_spawned";
|
|
55
|
+
payload: any;
|
|
56
|
+
} | {
|
|
57
|
+
session_id: string;
|
|
58
|
+
epic_id: string;
|
|
59
|
+
timestamp: string;
|
|
60
|
+
event_type: "OUTCOME";
|
|
61
|
+
outcome_type: "subtask_success" | "subtask_retry" | "subtask_failed" | "epic_complete" | "blocker_detected";
|
|
62
|
+
payload: any;
|
|
63
|
+
} | {
|
|
64
|
+
session_id: string;
|
|
65
|
+
epic_id: string;
|
|
66
|
+
timestamp: string;
|
|
67
|
+
event_type: "COMPACTION";
|
|
68
|
+
compaction_type: "detection_complete" | "prompt_generated" | "context_injected" | "resumption_started" | "tool_call_tracked";
|
|
69
|
+
payload: any;
|
|
70
|
+
})[];
|
|
71
|
+
end_time?: string | undefined;
|
|
72
|
+
}[];
|
|
73
|
+
//# sourceMappingURL=coordinator-sessions.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"coordinator-sessions.d.ts","sourceRoot":"","sources":["../../src/fixtures/coordinator-sessions.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,oCAAoC,CAAC;AAE7E;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,EAAE,kBAsHhC,CAAC;AAEF;;;;;;;GAOG;AACH,eAAO,MAAM,cAAc,EAAE,kBA+E5B,CAAC;AAEF;;;;;;;GAOG;AACH,eAAO,MAAM,iBAAiB,EAAE,kBAoF/B,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,0BAA0B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAItC,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test cases for swarm task decomposition
|
|
3
|
+
*
|
|
4
|
+
* Each case includes:
|
|
5
|
+
* - input: task description and optional context
|
|
6
|
+
* - expected: validation criteria (min/max subtasks, required files)
|
|
7
|
+
*/
|
|
8
|
+
export interface DecompositionTestCase {
|
|
9
|
+
input: {
|
|
10
|
+
task: string;
|
|
11
|
+
context?: string;
|
|
12
|
+
};
|
|
13
|
+
expected: {
|
|
14
|
+
minSubtasks: number;
|
|
15
|
+
maxSubtasks: number;
|
|
16
|
+
requiredFiles?: string[];
|
|
17
|
+
};
|
|
18
|
+
}
|
|
19
|
+
export declare const decompositionCases: DecompositionTestCase[];
|
|
20
|
+
//# sourceMappingURL=decomposition-cases.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"decomposition-cases.d.ts","sourceRoot":"","sources":["../../src/fixtures/decomposition-cases.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;IACF,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;KAC1B,CAAC;CACH;AAED,eAAO,MAAM,kBAAkB,EAAE,qBAAqB,EAoFrD,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,CAAC"}
|
package/dist/index.js
ADDED
|
File without changes
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import type { CoordinatorEvent } from "opencode-swarm-plugin/eval-capture";
|
|
2
|
+
/**
|
|
3
|
+
* Compaction event - subset of CoordinatorEvent with event_type === "COMPACTION"
|
|
4
|
+
*/
|
|
5
|
+
export type CompactionEvent = Extract<CoordinatorEvent, {
|
|
6
|
+
event_type: "COMPACTION";
|
|
7
|
+
}>;
|
|
8
|
+
/**
|
|
9
|
+
* Compaction session - session with only COMPACTION events
|
|
10
|
+
*/
|
|
11
|
+
export interface CompactionSession {
|
|
12
|
+
session_id: string;
|
|
13
|
+
epic_id: string;
|
|
14
|
+
start_time: string;
|
|
15
|
+
end_time: string;
|
|
16
|
+
events: CompactionEvent[];
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Load options
|
|
20
|
+
*/
|
|
21
|
+
export interface LoadOptions {
|
|
22
|
+
/** Filter by compaction_type */
|
|
23
|
+
compaction_type?: "detection_complete" | "prompt_generated" | "context_injected" | "resumption_started" | "tool_call_tracked";
|
|
24
|
+
/** Filter by session IDs */
|
|
25
|
+
sessionIds?: string[];
|
|
26
|
+
/** Limit number of results */
|
|
27
|
+
limit?: number;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Load COMPACTION events from session JSONL files
|
|
31
|
+
*
|
|
32
|
+
* Reads all .jsonl files in the session directory, parses events,
|
|
33
|
+
* and returns only COMPACTION events matching the filters.
|
|
34
|
+
*
|
|
35
|
+
* @param sessionDir - Path to session directory (required; typically ~/.config/swarm-tools/sessions)
|
|
36
|
+
* @param options - Filter options
|
|
37
|
+
* @returns Array of compaction events
|
|
38
|
+
*
|
|
39
|
+
* @example
|
|
40
|
+
* // Load all COMPACTION events
|
|
41
|
+
* const events = await loadCompactionEvents("/path/to/sessions");
|
|
42
|
+
*
|
|
43
|
+
* @example
|
|
44
|
+
* // Load only detection_complete events
|
|
45
|
+
* const events = await loadCompactionEvents("/path/to/sessions", {
|
|
46
|
+
* compaction_type: "detection_complete",
|
|
47
|
+
* });
|
|
48
|
+
*
|
|
49
|
+
* @example
|
|
50
|
+
* // Load events from specific sessions
|
|
51
|
+
* const events = await loadCompactionEvents("/path/to/sessions", {
|
|
52
|
+
* sessionIds: ["session-1", "session-2"],
|
|
53
|
+
* limit: 10,
|
|
54
|
+
* });
|
|
55
|
+
*/
|
|
56
|
+
export declare function loadCompactionEvents(sessionDir: string, options?: LoadOptions): Promise<CompactionEvent[]>;
|
|
57
|
+
/**
|
|
58
|
+
* Load COMPACTION sessions grouped by session_id
|
|
59
|
+
*
|
|
60
|
+
* Groups COMPACTION events by session_id and returns session metadata.
|
|
61
|
+
*
|
|
62
|
+
* @param sessionDir - Path to session directory
|
|
63
|
+
* @param options - Filter options
|
|
64
|
+
* @returns Array of compaction sessions
|
|
65
|
+
*
|
|
66
|
+
* @example
|
|
67
|
+
* // Load all sessions with COMPACTION events
|
|
68
|
+
* const sessions = await loadCompactionSessions("/path/to/sessions");
|
|
69
|
+
*
|
|
70
|
+
* @example
|
|
71
|
+
* // Load sessions with specific compaction_type
|
|
72
|
+
* const sessions = await loadCompactionSessions("/path/to/sessions", {
|
|
73
|
+
* compaction_type: "prompt_generated",
|
|
74
|
+
* });
|
|
75
|
+
*/
|
|
76
|
+
export declare function loadCompactionSessions(sessionDir: string, options?: LoadOptions): Promise<CompactionSession[]>;
|
|
77
|
+
/**
|
|
78
|
+
* Load COMPACTION events from default session directory
|
|
79
|
+
*
|
|
80
|
+
* Convenience wrapper that uses the default ~/.config/swarm-tools/sessions directory.
|
|
81
|
+
*
|
|
82
|
+
* @param options - Filter options
|
|
83
|
+
* @returns Array of compaction events
|
|
84
|
+
*
|
|
85
|
+
* @example
|
|
86
|
+
* // Load recent compaction events
|
|
87
|
+
* const events = await loadDefaultCompactionEvents({ limit: 10 });
|
|
88
|
+
*/
|
|
89
|
+
export declare function loadDefaultCompactionEvents(options?: LoadOptions): Promise<CompactionEvent[]>;
|
|
90
|
+
/**
|
|
91
|
+
* Load COMPACTION sessions from default session directory
|
|
92
|
+
*
|
|
93
|
+
* Convenience wrapper that uses the default ~/.config/swarm-tools/sessions directory.
|
|
94
|
+
*
|
|
95
|
+
* @param options - Filter options
|
|
96
|
+
* @returns Array of compaction sessions
|
|
97
|
+
*
|
|
98
|
+
* @example
|
|
99
|
+
* // Load all compaction sessions
|
|
100
|
+
* const sessions = await loadDefaultCompactionSessions();
|
|
101
|
+
*/
|
|
102
|
+
export declare function loadDefaultCompactionSessions(options?: LoadOptions): Promise<CompactionSession[]>;
|
|
103
|
+
//# sourceMappingURL=compaction-loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-loader.d.ts","sourceRoot":"","sources":["../../src/lib/compaction-loader.ts"],"names":[],"mappings":"AAgBA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AAG3E;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,OAAO,CACnC,gBAAgB,EAChB;IAAE,UAAU,EAAE,YAAY,CAAA;CAAE,CAC7B,CAAC;AAEF;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,eAAe,EAAE,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,gCAAgC;IAChC,eAAe,CAAC,EACZ,oBAAoB,GACpB,kBAAkB,GAClB,kBAAkB,GAClB,oBAAoB,GACpB,mBAAmB,CAAC;IACxB,4BAA4B;IAC5B,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,wBAAsB,oBAAoB,CACxC,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC,CAuE5B;AAyDD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,sBAAsB,CAC1C,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,iBAAiB,EAAE,CAAC,CA8C9B;AAED;;;;;;;;;;;GAWG;AACH,wBAAsB,2BAA2B,CAC/C,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC,CAG5B;AAED;;;;;;;;;;;GAWG;AACH,wBAAsB,6BAA6B,CACjD,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,iBAAiB,EAAE,CAAC,CAG9B"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { type EvalRecord } from "swarm-mail";
|
|
2
|
+
export interface EvalCase {
|
|
3
|
+
input: {
|
|
4
|
+
task: string;
|
|
5
|
+
context?: string;
|
|
6
|
+
};
|
|
7
|
+
expected: {
|
|
8
|
+
minSubtasks: number;
|
|
9
|
+
maxSubtasks: number;
|
|
10
|
+
requiredFiles?: string[];
|
|
11
|
+
overallSuccess?: boolean;
|
|
12
|
+
};
|
|
13
|
+
actual?: EvalRecord;
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Load eval cases from PGlite
|
|
17
|
+
*
|
|
18
|
+
* @param projectKey - Project key for filtering records
|
|
19
|
+
* @param options - Filter options
|
|
20
|
+
* @returns Array of eval cases ready for Evalite
|
|
21
|
+
*/
|
|
22
|
+
export declare function loadEvalCases(projectKey: string, options?: {
|
|
23
|
+
limit?: number;
|
|
24
|
+
strategy?: "file-based" | "feature-based" | "risk-based";
|
|
25
|
+
successOnly?: boolean;
|
|
26
|
+
projectPath?: string;
|
|
27
|
+
}): Promise<EvalCase[]>;
|
|
28
|
+
/**
|
|
29
|
+
* Check if we have enough real data to run evals
|
|
30
|
+
*
|
|
31
|
+
* @param projectKey - Project key to check
|
|
32
|
+
* @param minRecords - Minimum number of records required (default: 5)
|
|
33
|
+
* @param projectPath - Optional project path for database lookup
|
|
34
|
+
* @returns True if enough data exists
|
|
35
|
+
*/
|
|
36
|
+
export declare function hasRealEvalData(projectKey: string, minRecords?: number, projectPath?: string): Promise<boolean>;
|
|
37
|
+
/**
|
|
38
|
+
* Get eval data stats for reporting
|
|
39
|
+
*
|
|
40
|
+
* @param projectKey - Project key to query
|
|
41
|
+
* @param projectPath - Optional project path for database lookup
|
|
42
|
+
* @returns Summary of available eval data
|
|
43
|
+
*/
|
|
44
|
+
export declare function getEvalDataSummary(projectKey: string, projectPath?: string): Promise<{
|
|
45
|
+
totalRecords: number;
|
|
46
|
+
successRate: number;
|
|
47
|
+
byStrategy: Record<string, number>;
|
|
48
|
+
hasEnoughData: boolean;
|
|
49
|
+
}>;
|
|
50
|
+
/**
|
|
51
|
+
* Load captured coordinator sessions from ~/.config/swarm-tools/sessions/
|
|
52
|
+
*
|
|
53
|
+
* Reads all JSONL session files and returns CoordinatorSession objects.
|
|
54
|
+
*
|
|
55
|
+
* Quality filters are applied to focus on high-signal coordinator sessions:
|
|
56
|
+
* - minEvents: Filter out incomplete/aborted sessions (default: 3)
|
|
57
|
+
* - requireWorkerSpawn: Ensure session delegated to workers (default: true)
|
|
58
|
+
* - requireReview: Ensure coordinator reviewed work (default: true)
|
|
59
|
+
*
|
|
60
|
+
* Filters are applied BEFORE the limit for accurate sampling.
|
|
61
|
+
*
|
|
62
|
+
* @param options - Filter options
|
|
63
|
+
* @returns Array of coordinator sessions that meet quality criteria
|
|
64
|
+
*/
|
|
65
|
+
export declare function loadCapturedSessions(options?: {
|
|
66
|
+
sessionIds?: string[];
|
|
67
|
+
limit?: number;
|
|
68
|
+
/** Minimum number of events required (default: 3) */
|
|
69
|
+
minEvents?: number;
|
|
70
|
+
/** Require at least one worker_spawned event (default: true) */
|
|
71
|
+
requireWorkerSpawn?: boolean;
|
|
72
|
+
/** Require at least one review_completed event (default: true) */
|
|
73
|
+
requireReview?: boolean;
|
|
74
|
+
/** Override session directory for testing */
|
|
75
|
+
sessionDir?: string;
|
|
76
|
+
}): Promise<Array<{
|
|
77
|
+
session: import("opencode-swarm-plugin/eval-capture").CoordinatorSession;
|
|
78
|
+
}>>;
|
|
79
|
+
//# sourceMappingURL=data-loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"data-loader.d.ts","sourceRoot":"","sources":["../../src/lib/data-loader.ts"],"names":[],"mappings":"AAOA,OAAO,EAGL,KAAK,UAAU,EAChB,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAC1C,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;QACzB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,CAAC;IACF,MAAM,CAAC,EAAE,UAAU,CAAC;CACrB;AAED;;;;;;GAMG;AACH,wBAAsB,aAAa,CACjC,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE;IACR,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,YAAY,GAAG,eAAe,GAAG,YAAY,CAAC;IACzD,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GACA,OAAO,CAAC,QAAQ,EAAE,CAAC,CA6BrB;AAED;;;;;;;GAOG;AACH,wBAAsB,eAAe,CACnC,UAAU,EAAE,MAAM,EAClB,UAAU,GAAE,MAAU,EACtB,WAAW,CAAC,EAAE,MAAM,GACnB,OAAO,CAAC,OAAO,CAAC,CAGlB;AAED;;;;;;GAMG;AACH,wBAAsB,kBAAkB,CACtC,UAAU,EAAE,MAAM,EAClB,WAAW,CAAC,EAAE,MAAM,GACnB,OAAO,CAAC;IACT,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,aAAa,EAAE,OAAO,CAAC;CACxB,CAAC,CASD;AA0CD;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,oBAAoB,CAAC,OAAO,CAAC,EAAE;IACnD,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,qDAAqD;IACrD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gEAAgE;IAChE,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,kEAAkE;IAClE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,6CAA6C;IAC7C,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB,GAAG,OAAO,CACT,KAAK,CAAC;IAAE,OAAO,EAAE,OAAO,oCAAoC,EAAE,kBAAkB,CAAA;CAAE,CAAC,CACpF,CAmGA"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { GatewayModelId } from "ai";
|
|
2
|
+
/**
|
|
3
|
+
* Default model for decomposition evals
|
|
4
|
+
* Using Claude Sonnet for a good balance of quality and cost
|
|
5
|
+
*/
|
|
6
|
+
export declare const DEFAULT_MODEL: GatewayModelId;
|
|
7
|
+
/**
|
|
8
|
+
* Generate a decomposition from a task description
|
|
9
|
+
*
|
|
10
|
+
* @param prompt - The full decomposition prompt
|
|
11
|
+
* @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
|
|
12
|
+
* @returns The raw text response from the LLM
|
|
13
|
+
*/
|
|
14
|
+
export declare function generateDecomposition(prompt: string, model?: GatewayModelId): Promise<string>;
|
|
15
|
+
/**
|
|
16
|
+
* Format a decomposition prompt from task and context
|
|
17
|
+
*
|
|
18
|
+
* Uses the same prompt template as swarm_plan_prompt
|
|
19
|
+
*/
|
|
20
|
+
export declare function formatDecompositionPrompt(task: string, context?: string, maxSubtasks?: number): string;
|
|
21
|
+
/**
|
|
22
|
+
* Extract JSON from LLM response
|
|
23
|
+
*
|
|
24
|
+
* Handles responses that may have markdown code blocks or extra text
|
|
25
|
+
*/
|
|
26
|
+
export declare function extractJson(text: string): string;
|
|
27
|
+
//# sourceMappingURL=llm.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm.d.ts","sourceRoot":"","sources":["../../src/lib/llm.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AAEzC;;;GAGG;AACH,eAAO,MAAM,aAAa,EAAE,cAA8C,CAAC;AAE3E;;;;;;GAMG;AACH,wBAAsB,qBAAqB,CACzC,MAAM,EAAE,MAAM,EACd,KAAK,GAAE,cAA8B,GACpC,OAAO,CAAC,MAAM,CAAC,CAQjB;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,EAChB,WAAW,GAAE,MAAU,GACtB,MAAM,CA8CR;AAED;;;;GAIG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAehD"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compaction Prompt Quality Scorers - Evalite Wrappers
|
|
3
|
+
*
|
|
4
|
+
* These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
|
|
5
|
+
* for use with evalite's test runner.
|
|
6
|
+
*
|
|
7
|
+
* Weighted scoring:
|
|
8
|
+
* - epicIdSpecificity (0.20) - real IDs not placeholders
|
|
9
|
+
* - actionability (0.20) - swarm_status/inbox with real values
|
|
10
|
+
* - coordinatorIdentity (0.25) - ASCII header + strong mandates
|
|
11
|
+
* - forbiddenToolsPresent (0.15) - lists forbidden tools by name
|
|
12
|
+
* - postCompactionDiscipline (0.20) - first tool correct, no edit/write
|
|
13
|
+
*/
|
|
14
|
+
export type { CompactionPrompt, ScorerResult } from "opencode-swarm-plugin/compaction-prompt-scoring";
|
|
15
|
+
export { scoreActionability, scoreCoordinatorIdentity, scoreEpicIdSpecificity, scoreForbiddenToolsPresent, scorePostCompactionDiscipline, } from "opencode-swarm-plugin/compaction-prompt-scoring";
|
|
16
|
+
/**
|
|
17
|
+
* Epic ID Specificity Scorer
|
|
18
|
+
*
|
|
19
|
+
* Validates that epic IDs are REAL, not placeholders.
|
|
20
|
+
* Score: 1.0 if real IDs, 0.0 if placeholders found
|
|
21
|
+
*/
|
|
22
|
+
export declare const epicIdSpecificity: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
23
|
+
/**
|
|
24
|
+
* Actionability Scorer
|
|
25
|
+
*
|
|
26
|
+
* Validates that the prompt includes SPECIFIC actionable tool calls.
|
|
27
|
+
* Score: 1.0 if actionable tool calls with real values, 0.0 otherwise
|
|
28
|
+
*/
|
|
29
|
+
export declare const actionability: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
30
|
+
/**
|
|
31
|
+
* Coordinator Identity Scorer
|
|
32
|
+
*
|
|
33
|
+
* Validates that the prompt has STRONG coordinator identity reinforcement.
|
|
34
|
+
* Score: 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
|
|
35
|
+
*/
|
|
36
|
+
export declare const coordinatorIdentity: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
37
|
+
/**
|
|
38
|
+
* Forbidden Tools Present Scorer
|
|
39
|
+
*
|
|
40
|
+
* Validates that the prompt LISTS forbidden tools by name.
|
|
41
|
+
* Score: ratio of forbidden tools mentioned (0.0 to 1.0)
|
|
42
|
+
*/
|
|
43
|
+
export declare const forbiddenToolsPresent: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
44
|
+
/**
|
|
45
|
+
* Post-Compaction Discipline Scorer
|
|
46
|
+
*
|
|
47
|
+
* Validates that the FIRST suggested tool is correct.
|
|
48
|
+
* Score: 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
|
|
49
|
+
*/
|
|
50
|
+
export declare const postCompactionDiscipline: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
51
|
+
//# sourceMappingURL=compaction-prompt-scorers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-prompt-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/compaction-prompt-scorers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAaH,YAAY,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,iDAAiD,CAAC;AAGtG,OAAO,EACN,kBAAkB,EAClB,wBAAwB,EACxB,sBAAsB,EACtB,0BAA0B,EAC1B,6BAA6B,GAC7B,MAAM,iDAAiD,CAAC;AAEzD;;;;;GAKG;AACH,eAAO,MAAM,iBAAiB,6DAc5B,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,aAAa,6DAcxB,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,mBAAmB,6DAc9B,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,qBAAqB,6DAchC,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,wBAAwB,6DAcnC,CAAC"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Custom scorers for compaction hook evaluation
|
|
3
|
+
*
|
|
4
|
+
* These scorers validate that the compaction hook correctly:
|
|
5
|
+
* 1. Detects swarm state (confidence level)
|
|
6
|
+
* 2. Injects appropriate context (full/fallback/none)
|
|
7
|
+
* 3. Includes required patterns in context
|
|
8
|
+
* 4. Excludes placeholder/generic content
|
|
9
|
+
*/
|
|
10
|
+
/**
|
|
11
|
+
* Expected output from compaction hook tests
|
|
12
|
+
*/
|
|
13
|
+
export interface CompactionResult {
|
|
14
|
+
detected: boolean;
|
|
15
|
+
confidence: "high" | "medium" | "low" | "none";
|
|
16
|
+
contextInjected: boolean;
|
|
17
|
+
contextType: "full" | "fallback" | "none";
|
|
18
|
+
injectedContext: string;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Expected criteria from test case
|
|
22
|
+
*/
|
|
23
|
+
export interface CompactionExpected {
|
|
24
|
+
confidence: "high" | "medium" | "low" | "none";
|
|
25
|
+
contextInjected: boolean;
|
|
26
|
+
contextType: "full" | "fallback" | "none";
|
|
27
|
+
mustContain?: string[];
|
|
28
|
+
mustNotContain?: string[];
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Validates that detection confidence matches expected level
|
|
32
|
+
*
|
|
33
|
+
* Confidence determines what gets injected:
|
|
34
|
+
* - HIGH/MEDIUM: Full coordinator context
|
|
35
|
+
* - LOW: Fallback detection prompt
|
|
36
|
+
* - NONE: No injection
|
|
37
|
+
*
|
|
38
|
+
* Score: 1.0 if confidence matches, 0.0 otherwise
|
|
39
|
+
*/
|
|
40
|
+
export declare const confidenceAccuracy: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
41
|
+
/**
|
|
42
|
+
* Validates that context injection matches expected behavior
|
|
43
|
+
*
|
|
44
|
+
* Checks:
|
|
45
|
+
* - Whether context was injected (boolean)
|
|
46
|
+
* - What type of context (full/fallback/none)
|
|
47
|
+
*
|
|
48
|
+
* Score: 1.0 if both match, 0.5 if only injection status matches, 0.0 otherwise
|
|
49
|
+
*/
|
|
50
|
+
export declare const contextInjectionCorrectness: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
51
|
+
/**
|
|
52
|
+
* Validates that injected context contains required patterns
|
|
53
|
+
*
|
|
54
|
+
* For coordinator resumption, context MUST include:
|
|
55
|
+
* - Swarm continuation instructions
|
|
56
|
+
* - Tool names (swarm_status, swarmmail_inbox)
|
|
57
|
+
* - Actionable language ("COORDINATOR", "Keep Cooking")
|
|
58
|
+
*
|
|
59
|
+
* Score: ratio of required patterns found (0.0 to 1.0)
|
|
60
|
+
*/
|
|
61
|
+
export declare const requiredPatternsPresent: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
62
|
+
/**
|
|
63
|
+
* Validates that injected context excludes forbidden patterns
|
|
64
|
+
*
|
|
65
|
+
* Context should NOT contain:
|
|
66
|
+
* - Placeholder IDs ("bd-xxx")
|
|
67
|
+
* - Generic/template language
|
|
68
|
+
* - Wrong context type markers
|
|
69
|
+
*
|
|
70
|
+
* Score: 1.0 if no forbidden patterns found, 0.0 if any found
|
|
71
|
+
*/
|
|
72
|
+
export declare const forbiddenPatternsAbsent: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
73
|
+
/**
|
|
74
|
+
* Composite scorer: Overall compaction quality
|
|
75
|
+
*
|
|
76
|
+
* Combines all compaction-specific checks into single score.
|
|
77
|
+
* Weighted average:
|
|
78
|
+
* - Confidence accuracy: 25%
|
|
79
|
+
* - Context injection: 25%
|
|
80
|
+
* - Required patterns: 30%
|
|
81
|
+
* - Forbidden patterns: 20%
|
|
82
|
+
*
|
|
83
|
+
* Score: 0.0 to 1.0
|
|
84
|
+
*/
|
|
85
|
+
export declare const compactionQuality: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
86
|
+
//# sourceMappingURL=compaction-scorers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compaction-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/compaction-scorers.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,OAAO,CAAC;IAClB,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,GAAG,MAAM,CAAC;IAC/C,eAAe,EAAE,OAAO,CAAC;IACzB,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,MAAM,CAAC;IAC1C,eAAe,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,GAAG,MAAM,CAAC;IAC/C,eAAe,EAAE,OAAO,CAAC;IACzB,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,MAAM,CAAC;IAC1C,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,6DA0B7B,CAAC;AAEH;;;;;;;;GAQG;AACH,eAAO,MAAM,2BAA2B,6DAoCtC,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,uBAAuB,6DA0DlC,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,uBAAuB,6DA8ClC,CAAC;AAEH;;;;;;;;;;;GAWG;AACH,eAAO,MAAM,iBAAiB,6DA6C5B,CAAC"}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Coordinator Discipline Scorers - Evaluate coordinator behavior
|
|
3
|
+
*
|
|
4
|
+
* These scorers measure whether a coordinator follows the protocol:
|
|
5
|
+
* 1. Don't edit files directly (spawn workers)
|
|
6
|
+
* 2. Don't run tests directly (workers do verification)
|
|
7
|
+
* 3. Spawn workers for all subtasks
|
|
8
|
+
* 4. Review worker output before accepting
|
|
9
|
+
* 5. Minimize time to first spawn (don't overthink)
|
|
10
|
+
*
|
|
11
|
+
* Inputs: CoordinatorSession from eval-capture
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* Violation Count Scorer
|
|
15
|
+
*
|
|
16
|
+
* Counts VIOLATION events in the session.
|
|
17
|
+
* Each violation reduces score by 0.2.
|
|
18
|
+
*
|
|
19
|
+
* Violations tracked:
|
|
20
|
+
* - coordinator_edited_file (should spawn worker instead)
|
|
21
|
+
* - coordinator_ran_tests (workers do verification)
|
|
22
|
+
* - coordinator_reserved_files (only workers reserve)
|
|
23
|
+
* - no_worker_spawned (subtask exists but no worker)
|
|
24
|
+
*
|
|
25
|
+
* Score: 1.0 - (0.2 * violation_count), floored at 0.0
|
|
26
|
+
*/
|
|
27
|
+
export declare const violationCount: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
28
|
+
/**
|
|
29
|
+
* Spawn Efficiency Scorer
|
|
30
|
+
*
|
|
31
|
+
* Measures whether workers were spawned for all subtasks.
|
|
32
|
+
* Coordinators should delegate work, not do it themselves.
|
|
33
|
+
*
|
|
34
|
+
* Score: workers_spawned / subtasks_planned
|
|
35
|
+
*
|
|
36
|
+
* If no decomposition_complete event exists, falls back to counting spawns
|
|
37
|
+
* and returns 1.0 if any workers were spawned (better than nothing).
|
|
38
|
+
*/
|
|
39
|
+
export declare const spawnEfficiency: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
40
|
+
/**
|
|
41
|
+
* Review Thoroughness Scorer
|
|
42
|
+
*
|
|
43
|
+
* Measures whether coordinator reviewed worker output.
|
|
44
|
+
* Should have review_completed events for all finished subtasks.
|
|
45
|
+
*
|
|
46
|
+
* Score: reviews_completed / workers_finished
|
|
47
|
+
*/
|
|
48
|
+
export declare const reviewThoroughness: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
49
|
+
/**
|
|
50
|
+
* Time to First Spawn Scorer
|
|
51
|
+
*
|
|
52
|
+
* Measures how fast the coordinator spawned the first worker.
|
|
53
|
+
* Overthinking and perfectionism delay workers and block progress.
|
|
54
|
+
*
|
|
55
|
+
* Normalization:
|
|
56
|
+
* - < 60s: 1.0 (excellent)
|
|
57
|
+
* - 60-300s: linear decay to 0.5
|
|
58
|
+
* - > 300s: 0.0 (way too slow)
|
|
59
|
+
*
|
|
60
|
+
* Score: normalized to 0-1 (faster is better)
|
|
61
|
+
*/
|
|
62
|
+
export declare const timeToFirstSpawn: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
63
|
+
/**
|
|
64
|
+
* Overall Discipline Scorer
|
|
65
|
+
*
|
|
66
|
+
* Weighted composite of all coordinator discipline metrics.
|
|
67
|
+
*
|
|
68
|
+
* Weights:
|
|
69
|
+
* - Violations: 30% (most critical - breaking protocol)
|
|
70
|
+
* - Spawn efficiency: 25% (delegation is key)
|
|
71
|
+
* - Review thoroughness: 25% (quality gate)
|
|
72
|
+
* - Time to first spawn: 20% (bias toward action)
|
|
73
|
+
*
|
|
74
|
+
* Score: 0.0 to 1.0
|
|
75
|
+
*/
|
|
76
|
+
export declare const overallDiscipline: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
77
|
+
//# sourceMappingURL=coordinator-discipline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"coordinator-discipline.d.ts","sourceRoot":"","sources":["../../src/scorers/coordinator-discipline.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAKH;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,cAAc,6DAiCzB,CAAC;AAEH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,eAAe,6DAwD1B,CAAC;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,6DAyC7B,CAAC;AAEH;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,gBAAgB,6DAkE3B,CAAC;AAEH;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,iBAAiB,6DA6C5B,CAAC"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Custom scorers for evaluating swarm task decomposition quality
|
|
3
|
+
*/
|
|
4
|
+
/**
|
|
5
|
+
* Checks that no files appear in multiple subtasks
|
|
6
|
+
*
|
|
7
|
+
* Independent subtasks are critical for parallel execution.
|
|
8
|
+
* File conflicts cause merge conflicts and coordination overhead.
|
|
9
|
+
*
|
|
10
|
+
* Score: 1.0 if no conflicts, 0.0 if conflicts found
|
|
11
|
+
*/
|
|
12
|
+
export declare const subtaskIndependence: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
13
|
+
export { executionSuccess, timeBalance, scopeAccuracy, scopeDrift, noRework, } from "./outcome-scorers.js";
|
|
14
|
+
export { confidenceAccuracy, contextInjectionCorrectness, requiredPatternsPresent, forbiddenPatternsAbsent, compactionQuality, } from "./compaction-scorers.js";
|
|
15
|
+
export { violationCount, spawnEfficiency, reviewThoroughness, timeToFirstSpawn, overallDiscipline, } from "./coordinator-discipline.js";
|
|
16
|
+
/**
|
|
17
|
+
* Checks that subtasks cover the full task scope
|
|
18
|
+
*
|
|
19
|
+
* Incomplete coverage means:
|
|
20
|
+
* - Missing functionality
|
|
21
|
+
* - Follow-up work required
|
|
22
|
+
* - Task not actually complete
|
|
23
|
+
*
|
|
24
|
+
* Score: ratio of expected files covered (0.0 to 1.0)
|
|
25
|
+
* If no expected files specified, checks that subtasks exist
|
|
26
|
+
*/
|
|
27
|
+
export declare const coverageCompleteness: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
28
|
+
/**
|
|
29
|
+
* Checks that each subtask has clear, actionable instructions
|
|
30
|
+
*
|
|
31
|
+
* Vague instructions lead to:
|
|
32
|
+
* - Agent confusion and blocking
|
|
33
|
+
* - Incorrect implementations
|
|
34
|
+
* - Need for coordinator intervention
|
|
35
|
+
*
|
|
36
|
+
* Score: Average of per-subtask instruction quality
|
|
37
|
+
*/
|
|
38
|
+
export declare const instructionClarity: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
39
|
+
/**
|
|
40
|
+
* LLM-as-judge scorer for decomposition coherence
|
|
41
|
+
*
|
|
42
|
+
* Uses Claude Haiku to evaluate whether subtasks are truly independent,
|
|
43
|
+
* well-scoped, and complete. This catches nuances that heuristics miss:
|
|
44
|
+
* - Semantic dependencies between subtasks
|
|
45
|
+
* - Scope that's too big or too trivial
|
|
46
|
+
* - Missing pieces that would block completion
|
|
47
|
+
*
|
|
48
|
+
* Only use for decomposition evals - this is where it matters.
|
|
49
|
+
*/
|
|
50
|
+
export declare const decompositionCoherence: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
51
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scorers/index.ts"],"names":[],"mappings":"AAOA;;GAEG;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,mBAAmB,6DAuC9B,CAAC;AAMH,OAAO,EACL,gBAAgB,EAChB,WAAW,EACX,aAAa,EACb,UAAU,EACV,QAAQ,GACT,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACL,kBAAkB,EAClB,2BAA2B,EAC3B,uBAAuB,EACvB,uBAAuB,EACvB,iBAAiB,GAClB,MAAM,yBAAyB,CAAC;AAMjC,OAAO,EACL,cAAc,EACd,eAAe,EACf,kBAAkB,EAClB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,6BAA6B,CAAC;AAErC;;;;;;;;;;GAUG;AACH,eAAO,MAAM,oBAAoB,6DAsD/B,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,6DAsD7B,CAAC;AAMH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,sBAAsB,6DAmFjC,CAAC"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Outcome-based scorers for evaluating decomposition quality
|
|
3
|
+
*
|
|
4
|
+
* These scorers evaluate based on ACTUAL execution outcomes,
|
|
5
|
+
* not just the structure of the decomposition.
|
|
6
|
+
*
|
|
7
|
+
* Requires EvalRecord with outcomes populated.
|
|
8
|
+
*/
|
|
9
|
+
/**
|
|
10
|
+
* Execution Success Scorer
|
|
11
|
+
*
|
|
12
|
+
* Measures whether all subtasks succeeded without errors.
|
|
13
|
+
* This is the ultimate measure - did the decomposition actually work?
|
|
14
|
+
*
|
|
15
|
+
* Score: 1.0 if all outcomes.success === true, 0.0 otherwise
|
|
16
|
+
*/
|
|
17
|
+
export declare const executionSuccess: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
18
|
+
/**
|
|
19
|
+
* Time Balance Scorer
|
|
20
|
+
*
|
|
21
|
+
* Measures how evenly balanced the work was across subtasks.
|
|
22
|
+
* Unbalanced work means some agents finish early while others are bottlenecked.
|
|
23
|
+
*
|
|
24
|
+
* Score: 1.0 if max/min ratio < 2.0 (well balanced)
|
|
25
|
+
* 0.5 if ratio < 4.0 (moderately balanced)
|
|
26
|
+
* 0.0 if ratio >= 4.0 (poorly balanced)
|
|
27
|
+
*/
|
|
28
|
+
export declare const timeBalance: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
29
|
+
/**
|
|
30
|
+
* Scope Accuracy Scorer
|
|
31
|
+
*
|
|
32
|
+
* Measures how accurately the decomposition predicted which files would be touched.
|
|
33
|
+
* High accuracy means the planner understood the work scope correctly.
|
|
34
|
+
*
|
|
35
|
+
* Score: intersection(actual, planned) / planned.length
|
|
36
|
+
* 1.0 = all planned files were touched (unplanned extras do not lower this ratio)
|
|
37
|
+
* 0.5 = half the planned files were touched
|
|
38
|
+
* 0.0 = none of the planned files were touched
|
|
39
|
+
*/
|
|
40
|
+
export declare const scopeAccuracy: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
41
|
+
/**
|
|
42
|
+
* Scope Drift Scorer
|
|
43
|
+
*
|
|
44
|
+
* Penalizes when agents touch files NOT in their planned scope.
|
|
45
|
+
* Scope drift indicates poor planning or unexpected dependencies.
|
|
46
|
+
*
|
|
47
|
+
* Score: 1.0 if no drift (all actual files were planned)
|
|
48
|
+
* Decreases linearly with drift percentage
|
|
49
|
+
* 0.0 if drift > 50%
|
|
50
|
+
*/
|
|
51
|
+
export declare const scopeDrift: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
52
|
+
/**
|
|
53
|
+
* No Rework Scorer
|
|
54
|
+
*
|
|
55
|
+
* Checks that no subtask touched files assigned to another subtask.
|
|
56
|
+
* Rework indicates poor decomposition or missing dependencies.
|
|
57
|
+
*
|
|
58
|
+
* Score: 1.0 if no rework (no subtask touched another's planned files)
|
|
59
|
+
* 0.0 if rework detected
|
|
60
|
+
*/
|
|
61
|
+
export declare const noRework: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
|
|
62
|
+
//# sourceMappingURL=outcome-scorers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"outcome-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/outcome-scorers.ts"],"names":[],"mappings":"AAGA;;;;;;;GAOG;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,gBAAgB,6DAwC3B,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,WAAW,6DAoEtB,CAAC;AAEH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,aAAa,6DAmDxB,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,UAAU,6DAwDrB,CAAC;AAEH;;;;;;;;GAQG;AACH,eAAO,MAAM,QAAQ,6DAiEnB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"swarm-decomposition.eval.d.ts","sourceRoot":"","sources":["../src/swarm-decomposition.eval.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@swarmtools/evals",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Evaluation suite for swarm-tools multi-agent coordination",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"files": [
|
|
9
|
+
"dist",
|
|
10
|
+
"README.md"
|
|
11
|
+
],
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"import": "./dist/index.js",
|
|
15
|
+
"types": "./dist/index.d.ts"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"publishConfig": {
|
|
19
|
+
"access": "public",
|
|
20
|
+
"registry": "https://registry.npmjs.org/"
|
|
21
|
+
},
|
|
22
|
+
"repository": {
|
|
23
|
+
"type": "git",
|
|
24
|
+
"url": "https://github.com/joelhooks/opencode-swarm-plugin"
|
|
25
|
+
},
|
|
26
|
+
"author": "Joel Hooks",
|
|
27
|
+
"license": "MIT",
|
|
28
|
+
"scripts": {
|
|
29
|
+
"build": "bun build ./src/index.ts --outdir ./dist --target node && tsc",
|
|
30
|
+
"test": "bun test src/**/*.test.ts",
|
|
31
|
+
"typecheck": "tsc --noEmit"
|
|
32
|
+
},
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"evalite": "^1.0.0-beta.10",
|
|
35
|
+
"ai": "6.0.0-beta.150",
|
|
36
|
+
"opencode-swarm-plugin": "0.44.1",
|
|
37
|
+
"swarm-mail": "1.6.0"
|
|
38
|
+
},
|
|
39
|
+
"devDependencies": {
|
|
40
|
+
"@types/node": "^22.19.3",
|
|
41
|
+
"bun-types": "^1.3.4",
|
|
42
|
+
"typescript": "^5.7.2",
|
|
43
|
+
"vitest": "^4.0.15"
|
|
44
|
+
}
|
|
45
|
+
}
|