@swarmtools/evals 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +53 -0
  2. package/dist/compaction-prompt.eval.d.ts +30 -0
  3. package/dist/compaction-prompt.eval.d.ts.map +1 -0
  4. package/dist/compaction-resumption.eval.d.ts +23 -0
  5. package/dist/compaction-resumption.eval.d.ts.map +1 -0
  6. package/dist/coordinator-behavior.eval.d.ts +29 -0
  7. package/dist/coordinator-behavior.eval.d.ts.map +1 -0
  8. package/dist/coordinator-session.eval.d.ts +26 -0
  9. package/dist/coordinator-session.eval.d.ts.map +1 -0
  10. package/dist/example.eval.d.ts +10 -0
  11. package/dist/example.eval.d.ts.map +1 -0
  12. package/dist/fixtures/cass-baseline.d.ts +147 -0
  13. package/dist/fixtures/cass-baseline.d.ts.map +1 -0
  14. package/dist/fixtures/compaction-cases.d.ts +44 -0
  15. package/dist/fixtures/compaction-cases.d.ts.map +1 -0
  16. package/dist/fixtures/compaction-prompt-cases.d.ts +50 -0
  17. package/dist/fixtures/compaction-prompt-cases.d.ts.map +1 -0
  18. package/dist/fixtures/coordinator-sessions.d.ts +73 -0
  19. package/dist/fixtures/coordinator-sessions.d.ts.map +1 -0
  20. package/dist/fixtures/decomposition-cases.d.ts +20 -0
  21. package/dist/fixtures/decomposition-cases.d.ts.map +1 -0
  22. package/dist/index.d.ts +8 -0
  23. package/dist/index.d.ts.map +1 -0
  24. package/dist/index.js +0 -0
  25. package/dist/lib/compaction-loader.d.ts +103 -0
  26. package/dist/lib/compaction-loader.d.ts.map +1 -0
  27. package/dist/lib/data-loader.d.ts +79 -0
  28. package/dist/lib/data-loader.d.ts.map +1 -0
  29. package/dist/lib/llm.d.ts +27 -0
  30. package/dist/lib/llm.d.ts.map +1 -0
  31. package/dist/scorers/compaction-prompt-scorers.d.ts +51 -0
  32. package/dist/scorers/compaction-prompt-scorers.d.ts.map +1 -0
  33. package/dist/scorers/compaction-scorers.d.ts +86 -0
  34. package/dist/scorers/compaction-scorers.d.ts.map +1 -0
  35. package/dist/scorers/coordinator-discipline.d.ts +77 -0
  36. package/dist/scorers/coordinator-discipline.d.ts.map +1 -0
  37. package/dist/scorers/index.d.ts +51 -0
  38. package/dist/scorers/index.d.ts.map +1 -0
  39. package/dist/scorers/outcome-scorers.d.ts +62 -0
  40. package/dist/scorers/outcome-scorers.d.ts.map +1 -0
  41. package/dist/swarm-decomposition.eval.d.ts +2 -0
  42. package/dist/swarm-decomposition.eval.d.ts.map +1 -0
  43. package/package.json +45 -0
package/README.md ADDED
@@ -0,0 +1,53 @@
1
+ # @swarmtools/evals
2
+
3
+ ```
4
+ 🐝 EVAL SUITE 🐝
5
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━
6
+ Swarm Intelligence QA
7
+ ```
8
+
9
+ Evaluation suite for swarm-tools multi-agent coordination. Uses [Evalite](https://evalite.dev) to measure coordinator behavior, decomposition quality, and compaction correctness.
10
+
11
+ ## Purpose
12
+
13
+ This package contains the evaluation framework for the swarm-tools ecosystem. Extracting evals into a separate package ensures:
14
+
15
+ 1. **Clean Dependencies** - Main plugin doesn't need evalite/vitest in production
16
+ 2. **Faster Installs** - Eval deps only needed for development/CI
17
+ 3. **Isolated Testing** - Eval suite can evolve independently from plugin
18
+
19
+ ## What Gets Evaluated
20
+
21
+ - **Coordinator Protocol** - Does the coordinator spawn workers vs doing work itself?
22
+ - **Task Decomposition** - Quality of task splitting, file conflict detection
23
+ - **Compaction** - Context compression correctness
24
+ - **Review Thoroughness** - Does coordinator review worker output properly?
25
+
26
+ ## Usage
27
+
28
+ ```bash
29
+ # Run all evals
30
+ bun run test
31
+
32
+ # Build for publishing
33
+ bun run build
34
+
35
+ # Type check
36
+ bun run typecheck
37
+ ```
38
+
39
+ ## Package Structure
40
+
41
+ This package is part of the swarm-tools monorepo:
42
+
43
+ - `opencode-swarm-plugin` - Main plugin (peer dependency)
44
+ - `swarm-mail` - Event sourcing primitives (peer dependency)
45
+ - `@swarmtools/evals` - This package
46
+
47
+ ## Development
48
+
49
+ Evals use real coordinator sessions captured to `~/.config/swarm-tools/sessions/*.jsonl`. See the main plugin's `evals/README.md` for details on session capture.
50
+
51
+ ## License
52
+
53
+ MIT
@@ -0,0 +1,30 @@
1
+ /**
2
+ * Compaction Prompt Quality Evaluation
3
+ *
4
+ * Tests that continuation prompts generated after context compaction meet
5
+ * quality criteria for coordinator resumption:
6
+ *
7
+ * 1. Epic ID Specificity (20%) - Real IDs not placeholders
8
+ * 2. Actionability (20%) - Specific tool calls with real values
9
+ * 3. Coordinator Identity (25%) - ASCII header + strong mandates
10
+ * 4. Forbidden Tools (15%) - Lists forbidden tools by name
11
+ * 5. Post-Compaction Discipline (20%) - First tool is correct
12
+ *
13
+ * ## Why This Matters
14
+ *
15
+ * After compaction, coordinators lose context. The continuation prompt is
16
+ * their ONLY guide to resume. Bad prompts cause:
17
+ * - Coordinators editing files (should delegate to workers)
18
+ * - Generic "check status" instead of actual tool calls
19
+ * - Lost epic IDs (can't resume coordination)
20
+ *
21
+ * ## Test Strategy
22
+ *
23
+ * - 6 synthetic fixtures covering perfect/bad prompts
24
+ * - Each fixture tests specific failure modes
25
+ * - Composite scorer validates overall quality
26
+ *
27
+ * Run with: bun run eval:compaction
28
+ */
29
+ export {};
30
+ //# sourceMappingURL=compaction-prompt.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-prompt.eval.d.ts","sourceRoot":"","sources":["../src/compaction-prompt.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG"}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Compaction Hook Coordinator Resumption Eval
3
+ *
4
+ * Tests that the compaction hook correctly detects swarm state and injects
5
+ * appropriate context for coordinator resumption.
6
+ *
7
+ * ## Bug Being Tested
8
+ *
9
+ * Root cause: The compaction hook injects generic "you are a coordinator"
10
+ * context but doesn't include the SPECIFIC epic ID, subtask status, or
11
+ * project path. This causes coordinators to lose identity after compaction.
12
+ *
13
+ * ## Test Cases
14
+ *
15
+ * 1. Active swarm with in_progress epic - should inject full context with epic ID
16
+ * 2. Multiple epics - should identify the in_progress one
17
+ * 3. No active swarm - should not inject coordinator context
18
+ * 4. Blocked epic - should still detect as active swarm
19
+ *
20
+ * Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
21
+ */
22
+ export {};
23
+ //# sourceMappingURL=compaction-resumption.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-resumption.eval.d.ts","sourceRoot":"","sources":["../src/compaction-resumption.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG"}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Coordinator Behavior After Compaction Eval
3
+ *
4
+ * LLM-as-judge eval that tests whether the compaction context actually
5
+ * causes Claude to behave like a coordinator (spawn workers, check status)
6
+ * rather than a worker (run tests, edit files directly).
7
+ *
8
+ * This is the missing piece - we test the CONTEXT CONTENT in unit tests,
9
+ * but we need to test whether the LLM BEHAVES CORRECTLY given that context.
10
+ *
11
+ * Run with: bunx evalite run evals/coordinator-behavior.eval.ts
12
+ */
13
+ /**
14
+ * Scores whether the response mentions coordinator tools
15
+ */
16
+ export declare const mentionsCoordinatorTools: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
17
+ /**
18
+ * Scores whether the response avoids worker behaviors
19
+ */
20
+ export declare const avoidsWorkerBehaviors: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
21
+ /**
22
+ * Scores whether the response shows coordinator mindset
23
+ */
24
+ export declare const coordinatorMindset: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
25
+ /**
26
+ * Composite scorer for overall coordinator behavior
27
+ */
28
+ export declare const overallCoordinatorBehavior: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
29
+ //# sourceMappingURL=coordinator-behavior.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"coordinator-behavior.eval.d.ts","sourceRoot":"","sources":["../src/coordinator-behavior.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AA2DH;;GAEG;AACH,eAAO,MAAM,wBAAwB,6DAwBnC,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,qBAAqB,6DAiChC,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,kBAAkB,6DAyC7B,CAAC;AAEH;;GAEG;AACH,eAAO,MAAM,0BAA0B,6DAmBrC,CAAC"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Coordinator Session Eval - Scores Real Captured Sessions
3
+ *
4
+ * Tests that coordinators follow protocol:
5
+ * 1. Don't edit files directly (spawn workers)
6
+ * 2. Don't run tests directly (workers do verification)
7
+ * 3. Spawn workers for all subtasks
8
+ * 4. Review worker output before accepting
9
+ * 5. Minimize time to first spawn (don't overthink)
10
+ *
11
+ * ## Data Sources
12
+ *
13
+ * - **Real sessions**: Captured from ~/.config/swarm-tools/sessions/*.jsonl
14
+ * - **Synthetic fixtures**: Test cases in fixtures/coordinator-sessions.ts
15
+ *
16
+ * ## Test Flow
17
+ *
18
+ * 1. Load captured sessions from disk (via loadCapturedSessions)
19
+ * 2. Load synthetic fixtures for baseline validation
20
+ * 3. Run coordinator-discipline scorers on all sessions
21
+ * 4. Output scores and violation details
22
+ *
23
+ * Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
24
+ */
25
+ export {};
26
+ //# sourceMappingURL=coordinator-session.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"coordinator-session.eval.d.ts","sourceRoot":"","sources":["../src/coordinator-session.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;GAuBG"}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Example eval file to test Evalite setup
3
+ *
4
+ * This is a minimal test to verify:
5
+ * 1. Evalite CLI can discover .eval.ts files
6
+ * 2. createScorer works
7
+ * 3. evalite() function works
8
+ */
9
+ export {};
10
+ //# sourceMappingURL=example.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"example.eval.d.ts","sourceRoot":"","sources":["../src/example.eval.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG"}
@@ -0,0 +1,147 @@
1
+ /**
2
+ * CASS Baseline Response Fixtures
3
+ *
4
+ * These fixtures capture the ACTUAL behavior of the CASS binary tools.
5
+ * DO NOT modify to match desired behavior - these document what the binary DOES.
6
+ *
7
+ * Purpose: Characterization tests for ADR-010 (CASS inhousing).
8
+ * These ensure our in-house implementation matches the binary's behavior.
9
+ */
10
+ /**
11
+ * cass stats --json
12
+ * Captured: 2025-12-25
13
+ */
14
+ export declare const cassStatsBaseline: {
15
+ readonly by_agent: readonly [{
16
+ readonly agent: "claude_code";
17
+ readonly count: 137;
18
+ }, {
19
+ readonly agent: "cursor";
20
+ readonly count: 23;
21
+ }, {
22
+ readonly agent: "codex";
23
+ readonly count: 2;
24
+ }];
25
+ readonly conversations: 162;
26
+ readonly date_range: {
27
+ readonly newest: "2025-12-08T04:20:36.526+00:00";
28
+ readonly oldest: "2025-07-14T01:14:44.997+00:00";
29
+ };
30
+ readonly db_path: "/Users/joel/Library/Application Support/com.coding-agent-search.coding-agent-search/agent_search.db";
31
+ readonly messages: 4213;
32
+ readonly top_workspaces: readonly [{
33
+ readonly count: 28;
34
+ readonly workspace: "/Users/joel/Code/vercel/academy-vectr-workflow-course-content/external/workflow-builder-starter";
35
+ }, {
36
+ readonly count: 22;
37
+ readonly workspace: "/Users/joel/Code/vercel/slack-agents-course";
38
+ }];
39
+ };
40
+ /**
41
+ * cass search "swarm" --limit 2 --json
42
+ * Captured: 2025-12-25
43
+ */
44
+ export declare const cassSearchBaseline: {
45
+ readonly count: 2;
46
+ readonly cursor: null;
47
+ readonly hits: readonly [{
48
+ readonly agent: "claude_code";
49
+ readonly content: "Fixed. The `plugins` key is invalid - OpenCode auto-loads plugins from directories instead.\n\n**Changes:**\n1. ✅ Removed invalid `plugins` array from `opencode.jsonc`\n2. ✅ Created `~/.config/opencode/plugin/` directory\n3. ✅ Symlinked your swarm plugin → `~/.config/opencode/plugin/swarm.js`\n\nThe plugin will now auto-load on startup. Restart OpenCode to pick it up.\n\nSources:\n- [OpenCode Plugins Documentation](https://opencode.ai/docs/plugins/)\n- [OpenCode Config Documentation](https://opencode.ai/docs/config/)";
50
+ readonly created_at: 1765161767083;
51
+ readonly line_number: 9;
52
+ readonly match_type: "exact";
53
+ readonly score: 15.536974906921387;
54
+ readonly snippet: "Symlinked your swarm plugin → `~/.config/opencode/plugin/swarm.js`\n\nThe plugin will now auto-load on startup. Restart OpenCode to pick it up.\n\nSources:\n- [OpenC…";
55
+ readonly source_path: "/Users/joel/.claude/projects/-Users-joel--config-opencode/ccd64ac6-bca7-40e5-9150-cea58c3788ae.jsonl";
56
+ readonly title: "@opencode.jsonc has an invalid plugins key https://opencode.ai/docs/plugins/ https://opencode.ai/doc";
57
+ readonly workspace: "/Users/joel/.config/opencode";
58
+ }, {
59
+ readonly agent: "claude_code";
60
+ readonly content: "I'm ready to help you explore the codebase and design implementation plans. I'm in **READ-ONLY mode** - I can explore files, understand architecture, and create detailed plans, but I cannot and will not modify any files.\n\nI have access to the beads issue tracker (`bd` commands) and can see your current working directory is `/Users/joel/.config/opencode`.\n\n**Current git status shows:**\n- Modified: `.beads/issues.jsonl`, `AGENTS.md`, `command/swarm.md`, `opencode.jsonc`\n- Untracked: `command/swarm-collect.md`, `command/swarm-status.md`, `plugin/`\n\n**What would you like me to explore and plan?**\n\nCommon scenarios I can help with:\n- Designing new command implementations\n- Planning plugin architecture\n- Exploring existing patterns for feature additions\n- Creating implementation strategies for beads issues\n\nLet me know what you need, and I'll dive into the codebase, understand the current architecture, and provide a detailed implementation plan.";
61
+ readonly created_at: 1765161814722;
62
+ readonly line_number: 1;
63
+ readonly match_type: "exact";
64
+ readonly score: 14.522254943847656;
65
+ readonly snippet: ".md`, `command/swarm.md`, `opencode.jsonc`\n- Untracked: `command/swarm-collect.md`, `command/swarm-status.md`, `plugin/`\n\n**What would you like me to explore an…";
66
+ readonly source_path: "/Users/joel/.claude/projects/-Users-joel--config-opencode/agent-ee2a73ee.jsonl";
67
+ readonly title: "opencode";
68
+ readonly workspace: "/Users/joel/.config/opencode";
69
+ }];
70
+ readonly hits_clamped: false;
71
+ readonly limit: 2;
72
+ readonly max_tokens: null;
73
+ readonly offset: 0;
74
+ readonly query: "swarm";
75
+ readonly request_id: null;
76
+ readonly total_matches: 2;
77
+ };
78
+ /**
79
+ * cass health (human-readable output)
80
+ * Captured: 2025-12-25
81
+ */
82
+ export declare const cassHealthHumanBaseline = "\u2713 Healthy (3ms)\n Note: index stale (older than 300s)";
83
+ /**
84
+ * cass stats (human-readable output)
85
+ * Captured: 2025-12-25
86
+ */
87
+ export declare const cassStatsHumanBaseline = "CASS Index Statistics\n=====================\nDatabase: /Users/joel/Library/Application Support/com.coding-agent-search.coding-agent-search/agent_search.db\n\nTotals:\n Conversations: 162\n Messages: 4213\n\nBy Agent:\n claude_code: 137\n cursor: 23\n codex: 2\n\nTop Workspaces:\n /Users/joel/Code/vercel/academy-vectr-workflow-course-content/external/workflow-builder-starter: 28\n /Users/joel/Code/vercel/slack-agents-course: 22\n /Users/joel/Code/vercel/academy-vectr-workflow-course-content: 22\n /Users/joel/Code/vercel/academy-content: 13\n /Users/joel/Code/joelhooks/trt-buddy: 13\n /Users/joel/Code/vercel/front: 11\n /Users/joel/.config/opencode: 9\n /Users/joel: 6\n /Users/joel/Code/badass-courses/course-builder/apps/ai-hero: 5\n /Users/joel/Code/vercel/front/apps/vercel-academy: 4\n\nDate Range: 2025-07-14 to 2025-12-08";
88
+ /**
89
+ * cass view <file> -n <line>
90
+ * Captured: 2025-12-25
91
+ *
92
+ * Format: File path header, line indicator with context window, separator, content with line numbers
93
+ */
94
+ export declare const cassViewBaseline = "File: /Users/joel/.config/swarm-tools/sessions/ses_19yz2iaMpHxY1ddvVq2voC.jsonl\nLine: 1 (context: 5)\n----------------------------------------\n> 1 | {\"session_id\":\"ses_19yz2iaMpHxY1ddvVq2voC\",\"epic_id\":\"cell-f2p61v-mjko4d89zdt\",\"timestamp\":\"2025-12-24T23:51:52.896Z\",\"event_type\":\"OUTCOME\",\"outcome_type\":\"subtask_success\",\"payload\":{\"bead_id\":\"cell-f2p61v-mjko4d89zdt\",\"duration_ms\":0,\"files_touched\":[],\"verification_passed\":false,\"verification_skipped\":true}}\n----------------------------------------";
95
+ /**
96
+ * Error responses (captured from actual failures)
97
+ */
98
+ export declare const cassErrorBaseline: {
99
+ readonly fileNotFound: {
100
+ readonly error: {
101
+ readonly code: 3;
102
+ readonly hint: null;
103
+ readonly kind: "file-not-found";
104
+ readonly message: "File not found: /Users/joel/.config/swarm-tools/sessions/ses_fRrFb7WrNr9K89JBCKd6GV.jsonl";
105
+ readonly retryable: false;
106
+ };
107
+ };
108
+ readonly invalidArgument: {
109
+ readonly error: {
110
+ readonly code: 2;
111
+ readonly hint: {
112
+ readonly common_mistakes: readonly [{
113
+ readonly correct: "cass robot-docs";
114
+ readonly wrong: "cass --robot-docs";
115
+ }, {
116
+ readonly correct: "cass robot-docs commands";
117
+ readonly wrong: "cass --robot-docs=commands";
118
+ }, {
119
+ readonly correct: "cass robot-docs";
120
+ readonly wrong: "cass robot-docs --robot";
121
+ }];
122
+ readonly error: "error: unexpected argument '--robot' found\\n\\nUsage: cass stats [OPTIONS]\\n\\nFor more information, try '--help'.\\n";
123
+ readonly examples: readonly ["cass robot-docs commands", "cass robot-docs schemas", "cass robot-docs examples", "cass --robot-help"];
124
+ readonly flag_syntax: {
125
+ readonly correct: readonly ["--limit 5", "--robot", "--json"];
126
+ readonly incorrect: readonly ["-limit 5", "limit=5", "--Limit"];
127
+ };
128
+ readonly hints: readonly ["For get robot-mode documentation, try: cass --robot-help"];
129
+ readonly kind: "argument_parsing";
130
+ readonly status: "error";
131
+ };
132
+ readonly kind: "usage";
133
+ readonly message: "Could not parse arguments";
134
+ readonly retryable: false;
135
+ };
136
+ };
137
+ };
138
+ /**
139
+ * Schema definitions extracted from actual responses
140
+ */
141
+ export type CassStatsResponse = typeof cassStatsBaseline;
142
+ export type CassSearchResponse = typeof cassSearchBaseline;
143
+ export type CassSearchHit = CassSearchResponse["hits"][number];
144
+ export type CassAgentStats = CassStatsResponse["by_agent"][number];
145
+ export type CassWorkspaceStats = CassStatsResponse["top_workspaces"][number];
146
+ export type CassError = typeof cassErrorBaseline.fileNotFound | typeof cassErrorBaseline.invalidArgument;
147
+ //# sourceMappingURL=cass-baseline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cass-baseline.d.ts","sourceRoot":"","sources":["../../src/fixtures/cass-baseline.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH;;;GAGG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;;;;;;CAkCpB,CAAC;AAEX;;;GAGG;AACH,eAAO,MAAM,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA2CrB,CAAC;AAEX;;;GAGG;AACH,eAAO,MAAM,uBAAuB,gEACE,CAAC;AAEvC;;;GAGG;AACH,eAAO,MAAM,sBAAsB,u1BAyBE,CAAC;AAEtC;;;;;GAKG;AACH,eAAO,MAAM,gBAAgB,oiBAIY,CAAC;AAE1C;;GAEG;AACH,eAAO,MAAM,iBAAiB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAoDpB,CAAC;AAEX;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,OAAO,iBAAiB,CAAC;AACzD,MAAM,MAAM,kBAAkB,GAAG,OAAO,kBAAkB,CAAC;AAC3D,MAAM,MAAM,aAAa,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC;AAC/D,MAAM,MAAM,cAAc,GAAG,iBAAiB,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC,CAAC;AACnE,MAAM,MAAM,kBAAkB,GAAG,iBAAiB,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAC;AAC7E,MAAM,MAAM,SAAS,GACjB,OAAO,iBAAiB,CAAC,YAAY,GACrC,OAAO,iBAAiB,CAAC,eAAe,CAAC"}
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Test cases for compaction hook coordinator resumption
3
+ *
4
+ * Each case simulates a different swarm state and verifies that
5
+ * the compaction hook injects the correct context for resumption.
6
+ */
7
+ import type { Cell } from "swarm-mail";
8
+ /**
9
+ * Compaction test case structure
10
+ */
11
+ export interface CompactionTestCase {
12
+ name: string;
13
+ description: string;
14
+ /**
15
+ * Simulated hive state (cells to create)
16
+ */
17
+ hiveCells: Array<Omit<Cell, "created_at" | "updated_at" | "closed_at">>;
18
+ /**
19
+ * Simulated swarm-mail state
20
+ */
21
+ swarmMailState: {
22
+ agents: number;
23
+ reservations: number;
24
+ messages: number;
25
+ };
26
+ /**
27
+ * Expected detection confidence
28
+ */
29
+ expected: {
30
+ confidence: "high" | "medium" | "low" | "none";
31
+ contextInjected: boolean;
32
+ contextType: "full" | "fallback" | "none";
33
+ /**
34
+ * Patterns that MUST appear in injected context (if injected)
35
+ */
36
+ mustContain?: string[];
37
+ /**
38
+ * Patterns that MUST NOT appear
39
+ */
40
+ mustNotContain?: string[];
41
+ };
42
+ }
43
+ export declare const compactionCases: CompactionTestCase[];
44
+ //# sourceMappingURL=compaction-cases.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-cases.d.ts","sourceRoot":"","sources":["../../src/fixtures/compaction-cases.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAEvC;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,EAAE,YAAY,GAAG,YAAY,GAAG,WAAW,CAAC,CAAC,CAAC;IACxE;;OAEG;IACH,cAAc,EAAE;QACd,MAAM,EAAE,MAAM,CAAC;QACf,YAAY,EAAE,MAAM,CAAC;QACrB,QAAQ,EAAE,MAAM,CAAC;KAClB,CAAC;IACF;;OAEG;IACH,QAAQ,EAAE;QACR,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,GAAG,MAAM,CAAC;QAC/C,eAAe,EAAE,OAAO,CAAC;QACzB,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,MAAM,CAAC;QAC1C;;WAEG;QACH,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;QACvB;;WAEG;QACH,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;KAC3B,CAAC;CACH;AAED,eAAO,MAAM,eAAe,EAAE,kBAAkB,EAgT/C,CAAC"}
@@ -0,0 +1,50 @@
1
+ /**
2
+ * Test cases for compaction prompt quality evaluation
3
+ *
4
+ * Each case represents a continuation prompt that should be generated
5
+ * after context compaction. Tests validate that prompts have:
6
+ * - Real epic IDs (not placeholders)
7
+ * - Actionable tool calls with specific values
8
+ * - Strong coordinator identity
9
+ * - Explicit forbidden tools list
10
+ * - Correct first tool suggestion
11
+ */
12
+ import type { CompactionPrompt } from "opencode-swarm-plugin/compaction-prompt-scoring";
13
+ /**
14
+ * Compaction prompt test case structure
15
+ */
16
+ export interface CompactionPromptTestCase {
17
+ name: string;
18
+ description: string;
19
+ /**
20
+ * The generated continuation prompt
21
+ */
22
+ prompt: CompactionPrompt;
23
+ /**
24
+ * Expected scoring outcomes
25
+ */
26
+ expected: {
27
+ /**
28
+ * Should have real epic IDs (not placeholders)
29
+ */
30
+ hasRealEpicId: boolean;
31
+ /**
32
+ * Should have actionable tool calls
33
+ */
34
+ isActionable: boolean;
35
+ /**
36
+ * Should have strong coordinator identity
37
+ */
38
+ hasCoordinatorIdentity: boolean;
39
+ /**
40
+ * Should list forbidden tools by name
41
+ */
42
+ listsForbiddenTools: boolean;
43
+ /**
44
+ * First suggested tool should be correct
45
+ */
46
+ hasCorrectFirstTool: boolean;
47
+ };
48
+ }
49
+ export declare const compactionPromptCases: CompactionPromptTestCase[];
50
+ //# sourceMappingURL=compaction-prompt-cases.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-prompt-cases.d.ts","sourceRoot":"","sources":["../../src/fixtures/compaction-prompt-cases.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iDAAiD,CAAC;AAExF;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACxC,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB;;OAEG;IACH,MAAM,EAAE,gBAAgB,CAAC;IACzB;;OAEG;IACH,QAAQ,EAAE;QACT;;WAEG;QACH,aAAa,EAAE,OAAO,CAAC;QACvB;;WAEG;QACH,YAAY,EAAE,OAAO,CAAC;QACtB;;WAEG;QACH,sBAAsB,EAAE,OAAO,CAAC;QAChC;;WAEG;QACH,mBAAmB,EAAE,OAAO,CAAC;QAC7B;;WAEG;QACH,mBAAmB,EAAE,OAAO,CAAC;KAC7B,CAAC;CACF;AAED,eAAO,MAAM,qBAAqB,EAAE,wBAAwB,EAmQ3D,CAAC"}
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Coordinator Session Test Fixtures
3
+ *
4
+ * Synthetic coordinator sessions for testing coordinator-discipline scorers.
5
+ * Each fixture demonstrates good or bad coordinator behavior.
6
+ */
7
+ import type { CoordinatorSession } from "opencode-swarm-plugin/eval-capture";
8
+ /**
9
+ * PERFECT COORDINATOR
10
+ *
11
+ * - No violations (no direct edits, tests, or reservations)
12
+ * - 100% spawn efficiency (3/3 workers spawned)
13
+ * - 100% review thoroughness (all workers reviewed)
14
+ * - Fast time to first spawn (30s)
15
+ */
16
+ export declare const perfectCoordinator: CoordinatorSession;
17
+ /**
18
+ * BAD COORDINATOR - Multiple Violations
19
+ *
20
+ * - 3 violations (edited file, ran tests, reserved files)
21
+ * - 33% spawn efficiency (only 1/3 workers spawned)
22
+ * - 0% review thoroughness (no reviews)
23
+ * - Slow time to first spawn (10 minutes)
24
+ */
25
+ export declare const badCoordinator: CoordinatorSession;
26
+ /**
27
+ * DECENT COORDINATOR - Some Issues
28
+ *
29
+ * - 1 violation (ran tests once)
30
+ * - 100% spawn efficiency (2/2 workers spawned)
31
+ * - 50% review thoroughness (reviewed only 1/2)
32
+ * - Good time to first spawn (45s)
33
+ */
34
+ export declare const decentCoordinator: CoordinatorSession;
35
+ /**
36
+ * All test fixtures
37
+ */
38
+ export declare const coordinatorSessionFixtures: {
39
+ session_id: string;
40
+ epic_id: string;
41
+ start_time: string;
42
+ events: ({
43
+ session_id: string;
44
+ epic_id: string;
45
+ timestamp: string;
46
+ event_type: "DECISION";
47
+ decision_type: "strategy_selected" | "worker_spawned" | "review_completed" | "decomposition_complete" | "researcher_spawned" | "skill_loaded" | "inbox_checked" | "blocker_resolved" | "scope_change_approved" | "scope_change_rejected";
48
+ payload: any;
49
+ } | {
50
+ session_id: string;
51
+ epic_id: string;
52
+ timestamp: string;
53
+ event_type: "VIOLATION";
54
+ violation_type: "coordinator_edited_file" | "coordinator_ran_tests" | "coordinator_reserved_files" | "no_worker_spawned";
55
+ payload: any;
56
+ } | {
57
+ session_id: string;
58
+ epic_id: string;
59
+ timestamp: string;
60
+ event_type: "OUTCOME";
61
+ outcome_type: "subtask_success" | "subtask_retry" | "subtask_failed" | "epic_complete" | "blocker_detected";
62
+ payload: any;
63
+ } | {
64
+ session_id: string;
65
+ epic_id: string;
66
+ timestamp: string;
67
+ event_type: "COMPACTION";
68
+ compaction_type: "detection_complete" | "prompt_generated" | "context_injected" | "resumption_started" | "tool_call_tracked";
69
+ payload: any;
70
+ })[];
71
+ end_time?: string | undefined;
72
+ }[];
73
+ //# sourceMappingURL=coordinator-sessions.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"coordinator-sessions.d.ts","sourceRoot":"","sources":["../../src/fixtures/coordinator-sessions.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,oCAAoC,CAAC;AAE7E;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,EAAE,kBAsHhC,CAAC;AAEF;;;;;;;GAOG;AACH,eAAO,MAAM,cAAc,EAAE,kBA+E5B,CAAC;AAEF;;;;;;;GAOG;AACH,eAAO,MAAM,iBAAiB,EAAE,kBAoF/B,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,0BAA0B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAItC,CAAC"}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Test cases for swarm task decomposition
3
+ *
4
+ * Each case includes:
5
+ * - input: task description and optional context
6
+ * - expected: validation criteria (min/max subtasks, required files)
7
+ */
8
+ export interface DecompositionTestCase {
9
+ input: {
10
+ task: string;
11
+ context?: string;
12
+ };
13
+ expected: {
14
+ minSubtasks: number;
15
+ maxSubtasks: number;
16
+ requiredFiles?: string[];
17
+ };
18
+ }
19
+ export declare const decompositionCases: DecompositionTestCase[];
20
+ //# sourceMappingURL=decomposition-cases.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"decomposition-cases.d.ts","sourceRoot":"","sources":["../../src/fixtures/decomposition-cases.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,WAAW,qBAAqB;IACpC,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;IACF,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;KAC1B,CAAC;CACH;AAED,eAAO,MAAM,kBAAkB,EAAE,qBAAqB,EAoFrD,CAAC"}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * @swarmtools/evals - Evaluation suite for swarm-tools
3
+ *
4
+ * Placeholder entry point. Actual eval implementations will be added when
5
+ * evals are migrated from opencode-swarm-plugin/evals/
6
+ */
7
+ export {};
8
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,EAAE,CAAC"}
package/dist/index.js ADDED
File without changes
@@ -0,0 +1,103 @@
1
+ import type { CoordinatorEvent } from "opencode-swarm-plugin/eval-capture";
2
+ /**
3
+ * Compaction event - subset of CoordinatorEvent with event_type === "COMPACTION"
4
+ */
5
+ export type CompactionEvent = Extract<CoordinatorEvent, {
6
+ event_type: "COMPACTION";
7
+ }>;
8
+ /**
9
+ * Compaction session - session with only COMPACTION events
10
+ */
11
+ export interface CompactionSession {
12
+ session_id: string;
13
+ epic_id: string;
14
+ start_time: string;
15
+ end_time: string;
16
+ events: CompactionEvent[];
17
+ }
18
+ /**
19
+ * Load options
20
+ */
21
+ export interface LoadOptions {
22
+ /** Filter by compaction_type */
23
+ compaction_type?: "detection_complete" | "prompt_generated" | "context_injected" | "resumption_started" | "tool_call_tracked";
24
+ /** Filter by session IDs */
25
+ sessionIds?: string[];
26
+ /** Limit number of results */
27
+ limit?: number;
28
+ }
29
+ /**
30
+ * Load COMPACTION events from session JSONL files
31
+ *
32
+ * Reads all .jsonl files in the session directory, parses events,
33
+ * and returns only COMPACTION events matching the filters.
34
+ *
35
+ * @param sessionDir - Path to session directory (required; e.g. ~/.config/swarm-tools/sessions — use loadDefaultCompactionEvents to read from that default location)
36
+ * @param options - Filter options
37
+ * @returns Array of compaction events
38
+ *
39
+ * @example
40
+ * // Load all COMPACTION events
41
+ * const events = await loadCompactionEvents("/path/to/sessions");
42
+ *
43
+ * @example
44
+ * // Load only detection_complete events
45
+ * const events = await loadCompactionEvents("/path/to/sessions", {
46
+ * compaction_type: "detection_complete",
47
+ * });
48
+ *
49
+ * @example
50
+ * // Load events from specific sessions
51
+ * const events = await loadCompactionEvents("/path/to/sessions", {
52
+ * sessionIds: ["session-1", "session-2"],
53
+ * limit: 10,
54
+ * });
55
+ */
56
+ export declare function loadCompactionEvents(sessionDir: string, options?: LoadOptions): Promise<CompactionEvent[]>;
57
+ /**
58
+ * Load COMPACTION sessions grouped by session_id
59
+ *
60
+ * Groups COMPACTION events by session_id and returns session metadata.
61
+ *
62
+ * @param sessionDir - Path to session directory
63
+ * @param options - Filter options
64
+ * @returns Array of compaction sessions
65
+ *
66
+ * @example
67
+ * // Load all sessions with COMPACTION events
68
+ * const sessions = await loadCompactionSessions("/path/to/sessions");
69
+ *
70
+ * @example
71
+ * // Load sessions with specific compaction_type
72
+ * const sessions = await loadCompactionSessions("/path/to/sessions", {
73
+ * compaction_type: "prompt_generated",
74
+ * });
75
+ */
76
+ export declare function loadCompactionSessions(sessionDir: string, options?: LoadOptions): Promise<CompactionSession[]>;
77
+ /**
78
+ * Load COMPACTION events from default session directory
79
+ *
80
+ * Convenience wrapper that uses the default ~/.config/swarm-tools/sessions directory.
81
+ *
82
+ * @param options - Filter options
83
+ * @returns Array of compaction events
84
+ *
85
+ * @example
86
+ * // Load recent compaction events
87
+ * const events = await loadDefaultCompactionEvents({ limit: 10 });
88
+ */
89
+ export declare function loadDefaultCompactionEvents(options?: LoadOptions): Promise<CompactionEvent[]>;
90
+ /**
91
+ * Load COMPACTION sessions from default session directory
92
+ *
93
+ * Convenience wrapper that uses the default ~/.config/swarm-tools/sessions directory.
94
+ *
95
+ * @param options - Filter options
96
+ * @returns Array of compaction sessions
97
+ *
98
+ * @example
99
+ * // Load all compaction sessions
100
+ * const sessions = await loadDefaultCompactionSessions();
101
+ */
102
+ export declare function loadDefaultCompactionSessions(options?: LoadOptions): Promise<CompactionSession[]>;
103
+ //# sourceMappingURL=compaction-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-loader.d.ts","sourceRoot":"","sources":["../../src/lib/compaction-loader.ts"],"names":[],"mappings":"AAgBA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,oCAAoC,CAAC;AAG3E;;GAEG;AACH,MAAM,MAAM,eAAe,GAAG,OAAO,CACnC,gBAAgB,EAChB;IAAE,UAAU,EAAE,YAAY,CAAA;CAAE,CAC7B,CAAC;AAEF;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,eAAe,EAAE,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,gCAAgC;IAChC,eAAe,CAAC,EACZ,oBAAoB,GACpB,kBAAkB,GAClB,kBAAkB,GAClB,oBAAoB,GACpB,mBAAmB,CAAC;IACxB,4BAA4B;IAC5B,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AACH,wBAAsB,oBAAoB,CACxC,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC,CAuE5B;AAyDD;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAsB,sBAAsB,CAC1C,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,iBAAiB,EAAE,CAAC,CA8C9B;AAED;;;;;;;;;;;GAWG;AACH,wBAAsB,2BAA2B,CAC/C,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,eAAe,EAAE,CAAC,CAG5B;AAED;;;;;;;;;;;GAWG;AACH,wBAAsB,6BAA6B,CACjD,OAAO,CAAC,EAAE,WAAW,GACpB,OAAO,CAAC,iBAAiB,EAAE,CAAC,CAG9B"}
@@ -0,0 +1,79 @@
1
+ import { type EvalRecord } from "swarm-mail";
2
+ export interface EvalCase {
3
+ input: {
4
+ task: string;
5
+ context?: string;
6
+ };
7
+ expected: {
8
+ minSubtasks: number;
9
+ maxSubtasks: number;
10
+ requiredFiles?: string[];
11
+ overallSuccess?: boolean;
12
+ };
13
+ actual?: EvalRecord;
14
+ }
15
+ /**
16
+ * Load eval cases from PGlite
17
+ *
18
+ * @param projectKey - Project key for filtering records
19
+ * @param options - Filter options
20
+ * @returns Array of eval cases ready for Evalite
21
+ */
22
+ export declare function loadEvalCases(projectKey: string, options?: {
23
+ limit?: number;
24
+ strategy?: "file-based" | "feature-based" | "risk-based";
25
+ successOnly?: boolean;
26
+ projectPath?: string;
27
+ }): Promise<EvalCase[]>;
28
+ /**
29
+ * Check if we have enough real data to run evals
30
+ *
31
+ * @param projectKey - Project key to check
32
+ * @param minRecords - Minimum number of records required (default: 5)
33
+ * @param projectPath - Optional project path for database lookup
34
+ * @returns True if enough data exists
35
+ */
36
+ export declare function hasRealEvalData(projectKey: string, minRecords?: number, projectPath?: string): Promise<boolean>;
37
+ /**
38
+ * Get eval data stats for reporting
39
+ *
40
+ * @param projectKey - Project key to query
41
+ * @param projectPath - Optional project path for database lookup
42
+ * @returns Summary of available eval data
43
+ */
44
+ export declare function getEvalDataSummary(projectKey: string, projectPath?: string): Promise<{
45
+ totalRecords: number;
46
+ successRate: number;
47
+ byStrategy: Record<string, number>;
48
+ hasEnoughData: boolean;
49
+ }>;
50
+ /**
51
+ * Load captured coordinator sessions from ~/.config/swarm-tools/sessions/
52
+ *
53
+ * Reads all JSONL session files and returns CoordinatorSession objects.
54
+ *
55
+ * Quality filters are applied to focus on high-signal coordinator sessions:
56
+ * - minEvents: Filter out incomplete/aborted sessions (default: 3)
57
+ * - requireWorkerSpawn: Ensure session delegated to workers (default: true)
58
+ * - requireReview: Ensure coordinator reviewed work (default: true)
59
+ *
60
+ * Filters are applied BEFORE the limit for accurate sampling.
61
+ *
62
+ * @param options - Filter options
63
+ * @returns Array of coordinator sessions that meet quality criteria
64
+ */
65
+ export declare function loadCapturedSessions(options?: {
66
+ sessionIds?: string[];
67
+ limit?: number;
68
+ /** Minimum number of events required (default: 3) */
69
+ minEvents?: number;
70
+ /** Require at least one worker_spawned event (default: true) */
71
+ requireWorkerSpawn?: boolean;
72
+ /** Require at least one review_completed event (default: true) */
73
+ requireReview?: boolean;
74
+ /** Override session directory for testing */
75
+ sessionDir?: string;
76
+ }): Promise<Array<{
77
+ session: import("opencode-swarm-plugin/eval-capture").CoordinatorSession;
78
+ }>>;
79
+ //# sourceMappingURL=data-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"data-loader.d.ts","sourceRoot":"","sources":["../../src/lib/data-loader.ts"],"names":[],"mappings":"AAOA,OAAO,EAGL,KAAK,UAAU,EAChB,MAAM,YAAY,CAAC;AAEpB,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAC1C,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;QACzB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,CAAC;IACF,MAAM,CAAC,EAAE,UAAU,CAAC;CACrB;AAED;;;;;;GAMG;AACH,wBAAsB,aAAa,CACjC,UAAU,EAAE,MAAM,EAClB,OAAO,CAAC,EAAE;IACR,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,YAAY,GAAG,eAAe,GAAG,YAAY,CAAC;IACzD,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GACA,OAAO,CAAC,QAAQ,EAAE,CAAC,CA6BrB;AAED;;;;;;;GAOG;AACH,wBAAsB,eAAe,CACnC,UAAU,EAAE,MAAM,EAClB,UAAU,GAAE,MAAU,EACtB,WAAW,CAAC,EAAE,MAAM,GACnB,OAAO,CAAC,OAAO,CAAC,CAGlB;AAED;;;;;;GAMG;AACH,wBAAsB,kBAAkB,CACtC,UAAU,EAAE,MAAM,EAClB,WAAW,CAAC,EAAE,MAAM,GACnB,OAAO,CAAC;IACT,YAAY,EAAE,MAAM,CAAC;IACrB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACnC,aAAa,EAAE,OAAO,CAAC;CACxB,CAAC,CASD;AA0CD;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,oBAAoB,CAAC,OAAO,CAAC,EAAE;IACnD,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,qDAAqD;IACrD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gEAAgE;IAChE,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,kEAAkE;IAClE,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,6CAA6C;IAC7C,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB,GAAG,OAAO,CACT,KAAK,CAAC;IAAE,OAAO,EAAE,OAAO,oCAAoC,EAAE,kBAAkB,CAAA;CAAE,CAAC,CACpF,CAmGA"}
@@ -0,0 +1,27 @@
1
+ import type { GatewayModelId } from "ai";
2
+ /**
3
+ * Default model for decomposition evals
4
+ * Using Claude Sonnet for good balance of quality and cost
5
+ */
6
+ export declare const DEFAULT_MODEL: GatewayModelId;
7
+ /**
8
+ * Generate a decomposition from a task description
9
+ *
10
+ * @param prompt - The full decomposition prompt
11
+ * @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
12
+ * @returns The raw text response from the LLM
13
+ */
14
+ export declare function generateDecomposition(prompt: string, model?: GatewayModelId): Promise<string>;
15
+ /**
16
+ * Format a decomposition prompt from task and context
17
+ *
18
+ * Uses the same prompt template as swarm_plan_prompt
19
+ */
20
+ export declare function formatDecompositionPrompt(task: string, context?: string, maxSubtasks?: number): string;
21
+ /**
22
+ * Extract JSON from LLM response
23
+ *
24
+ * Handles responses that may have markdown code blocks or extra text
25
+ */
26
+ export declare function extractJson(text: string): string;
27
+ //# sourceMappingURL=llm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm.d.ts","sourceRoot":"","sources":["../../src/lib/llm.ts"],"names":[],"mappings":"AASA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AAEzC;;;GAGG;AACH,eAAO,MAAM,aAAa,EAAE,cAA8C,CAAC;AAE3E;;;;;;GAMG;AACH,wBAAsB,qBAAqB,CACzC,MAAM,EAAE,MAAM,EACd,KAAK,GAAE,cAA8B,GACpC,OAAO,CAAC,MAAM,CAAC,CAQjB;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CACvC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,EAChB,WAAW,GAAE,MAAU,GACtB,MAAM,CA8CR;AAED;;;;GAIG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAehD"}
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Compaction Prompt Quality Scorers - Evalite Wrappers
3
+ *
4
+ * These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
5
+ * for use with evalite's test runner.
6
+ *
7
+ * Weighted scoring:
8
+ * - epicIdSpecificity (0.20) - real IDs not placeholders
9
+ * - actionability (0.20) - swarm_status/inbox with real values
10
+ * - coordinatorIdentity (0.25) - ASCII header + strong mandates
11
+ * - forbiddenToolsPresent (0.15) - lists forbidden tools by name
12
+ * - postCompactionDiscipline (0.20) - first tool correct, no edit/write
13
+ */
14
+ export type { CompactionPrompt, ScorerResult } from "opencode-swarm-plugin/compaction-prompt-scoring";
15
+ export { scoreActionability, scoreCoordinatorIdentity, scoreEpicIdSpecificity, scoreForbiddenToolsPresent, scorePostCompactionDiscipline, } from "opencode-swarm-plugin/compaction-prompt-scoring";
16
+ /**
17
+ * Epic ID Specificity Scorer
18
+ *
19
+ * Validates that epic IDs are REAL, not placeholders.
20
+ * Score: 1.0 if real IDs, 0.0 if placeholders found
21
+ */
22
+ export declare const epicIdSpecificity: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
23
+ /**
24
+ * Actionability Scorer
25
+ *
26
+ * Validates that the prompt includes SPECIFIC actionable tool calls.
27
+ * Score: 1.0 if actionable tool calls with real values, 0.0 otherwise
28
+ */
29
+ export declare const actionability: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
30
+ /**
31
+ * Coordinator Identity Scorer
32
+ *
33
+ * Validates that the prompt has STRONG coordinator identity reinforcement.
34
+ * Score: 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
35
+ */
36
+ export declare const coordinatorIdentity: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
37
+ /**
38
+ * Forbidden Tools Present Scorer
39
+ *
40
+ * Validates that the prompt LISTS forbidden tools by name.
41
+ * Score: ratio of forbidden tools mentioned (0.0 to 1.0)
42
+ */
43
+ export declare const forbiddenToolsPresent: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
44
+ /**
45
+ * Post-Compaction Discipline Scorer
46
+ *
47
+ * Validates that the FIRST suggested tool is correct.
48
+ * Score: 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
49
+ */
50
+ export declare const postCompactionDiscipline: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
51
+ //# sourceMappingURL=compaction-prompt-scorers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-prompt-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/compaction-prompt-scorers.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAaH,YAAY,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,iDAAiD,CAAC;AAGtG,OAAO,EACN,kBAAkB,EAClB,wBAAwB,EACxB,sBAAsB,EACtB,0BAA0B,EAC1B,6BAA6B,GAC7B,MAAM,iDAAiD,CAAC;AAEzD;;;;;GAKG;AACH,eAAO,MAAM,iBAAiB,6DAc5B,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,aAAa,6DAcxB,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,mBAAmB,6DAc9B,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,qBAAqB,6DAchC,CAAC;AAEH;;;;;GAKG;AACH,eAAO,MAAM,wBAAwB,6DAcnC,CAAC"}
@@ -0,0 +1,86 @@
1
+ /**
2
+ * Custom scorers for compaction hook evaluation
3
+ *
4
+ * These scorers validate that the compaction hook correctly:
5
+ * 1. Detects swarm state (confidence level)
6
+ * 2. Injects appropriate context (full/fallback/none)
7
+ * 3. Includes required patterns in context
8
+ * 4. Excludes placeholder/generic content
9
+ */
10
+ /**
11
+ * Expected output from compaction hook tests
12
+ */
13
+ export interface CompactionResult {
14
+ detected: boolean;
15
+ confidence: "high" | "medium" | "low" | "none";
16
+ contextInjected: boolean;
17
+ contextType: "full" | "fallback" | "none";
18
+ injectedContext: string;
19
+ }
20
+ /**
21
+ * Expected criteria from test case
22
+ */
23
+ export interface CompactionExpected {
24
+ confidence: "high" | "medium" | "low" | "none";
25
+ contextInjected: boolean;
26
+ contextType: "full" | "fallback" | "none";
27
+ mustContain?: string[];
28
+ mustNotContain?: string[];
29
+ }
30
+ /**
31
+ * Validates that detection confidence matches expected level
32
+ *
33
+ * Confidence determines what gets injected:
34
+ * - HIGH/MEDIUM: Full coordinator context
35
+ * - LOW: Fallback detection prompt
36
+ * - NONE: No injection
37
+ *
38
+ * Score: 1.0 if confidence matches, 0.0 otherwise
39
+ */
40
+ export declare const confidenceAccuracy: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
41
+ /**
42
+ * Validates that context injection matches expected behavior
43
+ *
44
+ * Checks:
45
+ * - Whether context was injected (boolean)
46
+ * - What type of context (full/fallback/none)
47
+ *
48
+ * Score: 1.0 if both match, 0.5 if only injection status matches, 0.0 otherwise
49
+ */
50
+ export declare const contextInjectionCorrectness: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
51
+ /**
52
+ * Validates that injected context contains required patterns
53
+ *
54
+ * For coordinator resumption, context MUST include:
55
+ * - Swarm continuation instructions
56
+ * - Tool names (swarm_status, swarmmail_inbox)
57
+ * - Actionable language ("COORDINATOR", "Keep Cooking")
58
+ *
59
+ * Score: ratio of required patterns found (0.0 to 1.0)
60
+ */
61
+ export declare const requiredPatternsPresent: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
62
+ /**
63
+ * Validates that injected context excludes forbidden patterns
64
+ *
65
+ * Context should NOT contain:
66
+ * - Placeholder IDs ("bd-xxx")
67
+ * - Generic/template language
68
+ * - Wrong context type markers
69
+ *
70
+ * Score: 1.0 if no forbidden patterns found, 0.0 if any found
71
+ */
72
+ export declare const forbiddenPatternsAbsent: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
73
+ /**
74
+ * Composite scorer: Overall compaction quality
75
+ *
76
+ * Combines all compaction-specific checks into single score.
77
+ * Weighted average:
78
+ * - Confidence accuracy: 25%
79
+ * - Context injection: 25%
80
+ * - Required patterns: 30%
81
+ * - Forbidden patterns: 20%
82
+ *
83
+ * Score: 0.0 to 1.0
84
+ */
85
+ export declare const compactionQuality: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
86
+ //# sourceMappingURL=compaction-scorers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/compaction-scorers.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAIH;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,EAAE,OAAO,CAAC;IAClB,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,GAAG,MAAM,CAAC;IAC/C,eAAe,EAAE,OAAO,CAAC;IACzB,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,MAAM,CAAC;IAC1C,eAAe,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,GAAG,MAAM,CAAC;IAC/C,eAAe,EAAE,OAAO,CAAC;IACzB,WAAW,EAAE,MAAM,GAAG,UAAU,GAAG,MAAM,CAAC;IAC1C,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,6DA0B7B,CAAC;AAEH;;;;;;;;GAQG;AACH,eAAO,MAAM,2BAA2B,6DAoCtC,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,uBAAuB,6DA0DlC,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,uBAAuB,6DA8ClC,CAAC;AAEH;;;;;;;;;;;GAWG;AACH,eAAO,MAAM,iBAAiB,6DA6C5B,CAAC"}
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Coordinator Discipline Scorers - Evaluate coordinator behavior
3
+ *
4
+ * These scorers measure whether a coordinator follows the protocol:
5
+ * 1. Don't edit files directly (spawn workers)
6
+ * 2. Don't run tests directly (workers do verification)
7
+ * 3. Spawn workers for all subtasks
8
+ * 4. Review worker output before accepting
9
+ * 5. Minimize time to first spawn (don't overthink)
10
+ *
11
+ * Inputs: CoordinatorSession from eval-capture
12
+ */
13
+ /**
14
+ * Violation Count Scorer
15
+ *
16
+ * Counts VIOLATION events in the session.
17
+ * Each violation reduces score by 0.2.
18
+ *
19
+ * Violations tracked:
20
+ * - coordinator_edited_file (should spawn worker instead)
21
+ * - coordinator_ran_tests (workers do verification)
22
+ * - coordinator_reserved_files (only workers reserve)
23
+ * - no_worker_spawned (subtask exists but no worker)
24
+ *
25
+ * Score: 1.0 - (0.2 * violation_count), floored at 0.0
26
+ */
27
+ export declare const violationCount: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
28
+ /**
29
+ * Spawn Efficiency Scorer
30
+ *
31
+ * Measures whether workers were spawned for all subtasks.
32
+ * Coordinators should delegate work, not do it themselves.
33
+ *
34
+ * Score: workers_spawned / subtasks_planned
35
+ *
36
+ * If no decomposition_complete event exists, falls back to counting spawns
37
+ * and returns 1.0 if any workers were spawned (better than nothing).
38
+ */
39
+ export declare const spawnEfficiency: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
40
+ /**
41
+ * Review Thoroughness Scorer
42
+ *
43
+ * Measures whether coordinator reviewed worker output.
44
+ * Should have review_completed events for all finished subtasks.
45
+ *
46
+ * Score: reviews_completed / workers_finished
47
+ */
48
+ export declare const reviewThoroughness: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
49
+ /**
50
+ * Time to First Spawn Scorer
51
+ *
52
+ * Measures how fast the coordinator spawned the first worker.
53
+ * Overthinking and perfectionism delays workers and blocks progress.
54
+ *
55
+ * Normalization:
56
+ * - < 60s: 1.0 (excellent)
57
+ * - 60-300s: linear decay to 0.5
58
+ * - > 300s: 0.0 (way too slow)
59
+ *
60
+ * Score: normalized to 0-1 (faster is better)
61
+ */
62
+ export declare const timeToFirstSpawn: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
63
+ /**
64
+ * Overall Discipline Scorer
65
+ *
66
+ * Weighted composite of all coordinator discipline metrics.
67
+ *
68
+ * Weights:
69
+ * - Violations: 30% (most critical - breaking protocol)
70
+ * - Spawn efficiency: 25% (delegation is key)
71
+ * - Review thoroughness: 25% (quality gate)
72
+ * - Time to first spawn: 20% (bias toward action)
73
+ *
74
+ * Score: 0.0 to 1.0
75
+ */
76
+ export declare const overallDiscipline: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
77
+ //# sourceMappingURL=coordinator-discipline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"coordinator-discipline.d.ts","sourceRoot":"","sources":["../../src/scorers/coordinator-discipline.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAKH;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,cAAc,6DAiCzB,CAAC;AAEH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,eAAe,6DAwD1B,CAAC;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,kBAAkB,6DAyC7B,CAAC;AAEH;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,gBAAgB,6DAkE3B,CAAC;AAEH;;;;;;;;;;;;GAYG;AACH,eAAO,MAAM,iBAAiB,6DA6C5B,CAAC"}
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Custom scorers for evaluating swarm task decomposition quality
3
+ */
4
+ /**
5
+ * Checks that no files appear in multiple subtasks
6
+ *
7
+ * Independent subtasks are critical for parallel execution.
8
+ * File conflicts cause merge conflicts and coordination overhead.
9
+ *
10
+ * Score: 1.0 if no conflicts, 0.0 if conflicts found
11
+ */
12
+ export declare const subtaskIndependence: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
13
+ export { executionSuccess, timeBalance, scopeAccuracy, scopeDrift, noRework, } from "./outcome-scorers.js";
14
+ export { confidenceAccuracy, contextInjectionCorrectness, requiredPatternsPresent, forbiddenPatternsAbsent, compactionQuality, } from "./compaction-scorers.js";
15
+ export { violationCount, spawnEfficiency, reviewThoroughness, timeToFirstSpawn, overallDiscipline, } from "./coordinator-discipline.js";
16
+ /**
17
+ * Checks that subtasks cover the full task scope
18
+ *
19
+ * Incomplete coverage means:
20
+ * - Missing functionality
21
+ * - Follow-up work required
22
+ * - Task not actually complete
23
+ *
24
+ * Score: ratio of expected files covered (0.0 to 1.0)
25
+ * If no expected files specified, checks that subtasks exist
26
+ */
27
+ export declare const coverageCompleteness: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
28
+ /**
29
+ * Checks that each subtask has clear, actionable instructions
30
+ *
31
+ * Vague instructions lead to:
32
+ * - Agent confusion and blocking
33
+ * - Incorrect implementations
34
+ * - Need for coordinator intervention
35
+ *
36
+ * Score: Average of per-subtask instruction quality
37
+ */
38
+ export declare const instructionClarity: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
39
+ /**
40
+ * LLM-as-judge scorer for decomposition coherence
41
+ *
42
+ * Uses Claude Haiku to evaluate whether subtasks are truly independent,
43
+ * well-scoped, and complete. This catches nuances that heuristics miss:
44
+ * - Semantic dependencies between subtasks
45
+ * - Scope that's too big or too trivial
46
+ * - Missing pieces that would block completion
47
+ *
48
+ * Only use for decomposition evals - this is where it matters.
49
+ */
50
+ export declare const decompositionCoherence: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
51
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scorers/index.ts"],"names":[],"mappings":"AAOA;;GAEG;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,mBAAmB,6DAuC9B,CAAC;AAMH,OAAO,EACL,gBAAgB,EAChB,WAAW,EACX,aAAa,EACb,UAAU,EACV,QAAQ,GACT,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACL,kBAAkB,EAClB,2BAA2B,EAC3B,uBAAuB,EACvB,uBAAuB,EACvB,iBAAiB,GAClB,MAAM,yBAAyB,CAAC;AAMjC,OAAO,EACL,cAAc,EACd,eAAe,EACf,kBAAkB,EAClB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,6BAA6B,CAAC;AAErC;;;;;;;;;;GAUG;AACH,eAAO,MAAM,oBAAoB,6DAsD/B,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,6DAsD7B,CAAC;AAMH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,sBAAsB,6DAmFjC,CAAC"}
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Outcome-based scorers for evaluating decomposition quality
3
+ *
4
+ * These scorers evaluate based on ACTUAL execution outcomes,
5
+ * not just the structure of the decomposition.
6
+ *
7
+ * Requires EvalRecord with outcomes populated.
8
+ */
9
+ /**
10
+ * Execution Success Scorer
11
+ *
12
+ * Measures whether all subtasks succeeded without errors.
13
+ * This is the ultimate measure - did the decomposition actually work?
14
+ *
15
+ * Score: 1.0 if all outcomes.success === true, 0.0 otherwise
16
+ */
17
+ export declare const executionSuccess: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
18
+ /**
19
+ * Time Balance Scorer
20
+ *
21
+ * Measures how evenly balanced the work was across subtasks.
22
+ * Unbalanced work means some agents finish early while others are bottlenecked.
23
+ *
24
+ * Score: 1.0 if max/min ratio < 2.0 (well balanced)
25
+ * 0.5 if ratio < 4.0 (moderately balanced)
26
+ * 0.0 if ratio >= 4.0 (poorly balanced)
27
+ */
28
+ export declare const timeBalance: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
29
+ /**
30
+ * Scope Accuracy Scorer
31
+ *
32
+ * Measures how accurately the decomposition predicted which files would be touched.
33
+ * High accuracy means the planner understood the work scope correctly.
34
+ *
35
+ * Score: intersection(actual, planned) / planned.length
36
+ * 1.0 = all planned files were touched (unplanned extra files do not lower this score; Scope Drift penalizes those)
37
+ * 0.5 = half the planned files were touched
38
+ * 0.0 = none of the planned files were touched
39
+ */
40
+ export declare const scopeAccuracy: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
41
+ /**
42
+ * Scope Drift Scorer
43
+ *
44
+ * Penalizes when agents touch files NOT in their planned scope.
45
+ * Scope drift indicates poor planning or unexpected dependencies.
46
+ *
47
+ * Score: 1.0 if no drift (all actual files were planned)
48
+ * Decreases linearly with drift percentage
49
+ * 0.0 if drift > 50%
50
+ */
51
+ export declare const scopeDrift: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
52
+ /**
53
+ * No Rework Scorer
54
+ *
55
+ * Checks that no subtask touched files assigned to another subtask.
56
+ * Rework indicates poor decomposition or missing dependencies.
57
+ *
58
+ * Score: 1.0 if no rework (no subtask touched another's planned files)
59
+ * 0.0 if rework detected
60
+ */
61
+ export declare const noRework: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
62
+ //# sourceMappingURL=outcome-scorers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"outcome-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/outcome-scorers.ts"],"names":[],"mappings":"AAGA;;;;;;;GAOG;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,gBAAgB,6DAwC3B,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,WAAW,6DAoEtB,CAAC;AAEH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,aAAa,6DAmDxB,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,UAAU,6DAwDrB,CAAC;AAEH;;;;;;;;GAQG;AACH,eAAO,MAAM,QAAQ,6DAiEnB,CAAC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=swarm-decomposition.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"swarm-decomposition.eval.d.ts","sourceRoot":"","sources":["../src/swarm-decomposition.eval.ts"],"names":[],"mappings":""}
package/package.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "name": "@swarmtools/evals",
3
+ "version": "0.2.0",
4
+ "description": "Evaluation suite for swarm-tools multi-agent coordination",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "files": [
9
+ "dist",
10
+ "README.md"
11
+ ],
12
+ "exports": {
13
+ ".": {
14
+ "import": "./dist/index.js",
15
+ "types": "./dist/index.d.ts"
16
+ }
17
+ },
18
+ "publishConfig": {
19
+ "access": "public",
20
+ "registry": "https://registry.npmjs.org/"
21
+ },
22
+ "repository": {
23
+ "type": "git",
24
+ "url": "https://github.com/joelhooks/opencode-swarm-plugin"
25
+ },
26
+ "author": "Joel Hooks",
27
+ "license": "MIT",
28
+ "scripts": {
29
+ "build": "bun build ./src/index.ts --outdir ./dist --target node && tsc",
30
+ "test": "bun test src/**/*.test.ts",
31
+ "typecheck": "tsc --noEmit"
32
+ },
33
+ "dependencies": {
34
+ "evalite": "^1.0.0-beta.10",
35
+ "ai": "6.0.0-beta.150",
36
+ "opencode-swarm-plugin": "0.44.1",
37
+ "swarm-mail": "1.6.0"
38
+ },
39
+ "devDependencies": {
40
+ "@types/node": "^22.19.3",
41
+ "bun-types": "^1.3.4",
42
+ "typescript": "^5.7.2",
43
+ "vitest": "^4.0.15"
44
+ }
45
+ }