opencode-swarm-plugin 0.32.0 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.hive/issues.jsonl +12 -0
- package/.hive/memories.jsonl +255 -1
- package/.turbo/turbo-build.log +9 -10
- package/.turbo/turbo-test.log +343 -337
- package/CHANGELOG.md +358 -0
- package/README.md +152 -179
- package/bin/swarm.test.ts +303 -1
- package/bin/swarm.ts +473 -16
- package/dist/compaction-hook.d.ts +1 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/index.d.ts +112 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +12380 -131
- package/dist/logger.d.ts +34 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/observability-tools.d.ts +116 -0
- package/dist/observability-tools.d.ts.map +1 -0
- package/dist/plugin.js +12254 -119
- package/dist/skills.d.ts.map +1 -1
- package/dist/swarm-orchestrate.d.ts +105 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +113 -2
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-research.d.ts +127 -0
- package/dist/swarm-research.d.ts.map +1 -0
- package/dist/swarm-review.d.ts.map +1 -1
- package/dist/swarm.d.ts +73 -1
- package/dist/swarm.d.ts.map +1 -1
- package/evals/compaction-resumption.eval.ts +289 -0
- package/evals/coordinator-behavior.eval.ts +307 -0
- package/evals/fixtures/compaction-cases.ts +350 -0
- package/evals/scorers/compaction-scorers.ts +305 -0
- package/evals/scorers/index.ts +12 -0
- package/examples/plugin-wrapper-template.ts +297 -8
- package/package.json +6 -2
- package/src/compaction-hook.test.ts +617 -1
- package/src/compaction-hook.ts +291 -18
- package/src/index.ts +54 -1
- package/src/logger.test.ts +189 -0
- package/src/logger.ts +135 -0
- package/src/observability-tools.test.ts +346 -0
- package/src/observability-tools.ts +594 -0
- package/src/skills.integration.test.ts +137 -1
- package/src/skills.test.ts +42 -1
- package/src/skills.ts +8 -4
- package/src/swarm-orchestrate.test.ts +123 -0
- package/src/swarm-orchestrate.ts +183 -0
- package/src/swarm-prompts.test.ts +553 -1
- package/src/swarm-prompts.ts +406 -4
- package/src/swarm-research.integration.test.ts +544 -0
- package/src/swarm-research.test.ts +698 -0
- package/src/swarm-research.ts +472 -0
- package/src/swarm-review.test.ts +177 -0
- package/src/swarm-review.ts +12 -47
- package/src/swarm.ts +6 -3
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compaction Hook Coordinator Resumption Eval
|
|
3
|
+
*
|
|
4
|
+
* Tests that the compaction hook correctly detects swarm state and injects
|
|
5
|
+
* appropriate context for coordinator resumption.
|
|
6
|
+
*
|
|
7
|
+
* ## Bug Being Tested
|
|
8
|
+
*
|
|
9
|
+
* Root cause: The compaction hook injects generic "you are a coordinator"
|
|
10
|
+
* context but doesn't include the SPECIFIC epic ID, subtask status, or
|
|
11
|
+
* project path. This causes coordinators to lose identity after compaction.
|
|
12
|
+
*
|
|
13
|
+
* ## Test Cases
|
|
14
|
+
*
|
|
15
|
+
* 1. Active swarm with in_progress epic - should inject full context with epic ID
|
|
16
|
+
* 2. Multiple epics - should identify the in_progress one
|
|
17
|
+
* 3. No active swarm - should not inject coordinator context
|
|
18
|
+
* 4. Blocked epic - should still detect as active swarm
|
|
19
|
+
*
|
|
20
|
+
* Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { evalite } from "evalite";
|
|
24
|
+
import type { Cell } from "swarm-mail";
|
|
25
|
+
import { compactionCases } from "./fixtures/compaction-cases.js";
|
|
26
|
+
import type { CompactionResult } from "./scorers/compaction-scorers.js";
|
|
27
|
+
import {
|
|
28
|
+
compactionQuality,
|
|
29
|
+
confidenceAccuracy,
|
|
30
|
+
contextInjectionCorrectness,
|
|
31
|
+
forbiddenPatternsAbsent,
|
|
32
|
+
requiredPatternsPresent,
|
|
33
|
+
} from "./scorers/index.js";
|
|
34
|
+
|
|
35
|
+
// Copy context constants from compaction-hook.ts to avoid import issues
|
|
36
|
+
/**
 * Coordinator context injected when the simulated hook decides a swarm is
 * active. Written one entry per output line; the trailing empty entry
 * produces the final newline of the text.
 */
const SWARM_COMPACTION_CONTEXT = [
  "## 🐝 SWARM ACTIVE - Keep Cooking",
  "",
  "You are the **COORDINATOR** of an active swarm. Context was compacted but the swarm is still running.",
  "",
  "**YOUR JOB:** Keep orchestrating. Spawn agents. Monitor progress. Unblock work. Ship it.",
  "",
  "### On Resume - IMMEDIATELY",
  "",
  '1. `swarm_status(epic_id="<epic>", project_key="<path>")` - Get current state',
  "2. `swarmmail_inbox(limit=5)` - Check for agent messages",
  "3. `swarm_review(project_key, epic_id, task_id, files_touched)` - Review any completed work",
  "4. **Spawn ready subtasks** - Don't wait, fire them off",
  "",
  "### Keep the Swarm Cooking",
  "",
  "- **Spawn aggressively** - If a subtask is ready and unblocked, spawn an agent",
  "- **Monitor actively** - Check status, read messages, respond to blockers",
  "- **Close the loop** - When all subtasks done, verify and close the epic",
  "",
  "**You are not waiting for instructions. You are the coordinator. Coordinate.**",
  "",
].join("\n");
|
|
57
|
+
|
|
58
|
+
/**
 * Fallback context injected when cells exist but nothing indicates active
 * work. Written one entry per output line; the trailing empty entry produces
 * the final newline of the text.
 */
const SWARM_DETECTION_FALLBACK = [
  "## 🐝 Swarm Detection - Check Your Context",
  "",
  "**IMPORTANT:** Before summarizing, check if this session involves an active swarm.",
  "",
  "Look for ANY of these patterns in the conversation:",
  "",
  "### Tool Calls (definite swarm sign)",
  "- `swarm_decompose`, `swarm_spawn_subtask`, `swarm_status`, `swarm_complete`",
  "- `swarmmail_init`, `swarmmail_reserve`, `swarmmail_send`",
  "- `hive_create_epic`, `hive_start`, `hive_close`",
  "",
  "### If You Find Swarm Evidence",
  "",
  "Include this in your summary and tell the resumed session:",
  '"This is an active swarm. Check swarm_status and swarmmail_inbox immediately."',
  "",
].join("\n");
|
|
74
|
+
|
|
75
|
+
/**
 * Simulate the compaction hook for one test case without running the real hook.
 *
 * The hive cells and swarm-mail counters are reduced to a handful of boolean
 * signals, which are mapped to a confidence level and a context to inject:
 *
 * - any `in_progress` cell or at least one reservation -> "high", full context
 * - an open cell with a parent, or an epic that is not closed -> "medium", full context
 * - any cell at all -> "low", fallback context
 * - otherwise -> "none", nothing injected
 *
 * @param testCase - Hive cells plus swarm-mail counters. Only `reservations`
 *   is read from `swarmMailState`; `agents` and `messages` are ignored.
 * @returns The detection outcome, including the exact text that would be injected.
 */
async function runCompactionHook(testCase: {
  hiveCells: Array<Omit<Cell, "created_at" | "updated_at" | "closed_at">>;
  swarmMailState: {
    agents: number;
    reservations: number;
    messages: number;
  };
}): Promise<CompactionResult> {
  const { hiveCells, swarmMailState } = testCase;

  // Detection signals derived from the test case state
  const hasInProgressCells = hiveCells.some((c) => c.status === "in_progress");
  const hasReservations = swarmMailState.reservations > 0;
  const hasOpenSubtasks = hiveCells.some(
    (c) => c.status === "open" && c.parent_id,
  );
  const hasOpenEpics = hiveCells.some(
    (c) => c.type === "epic" && c.status !== "closed",
  );
  const hasCells = hiveCells.length > 0;

  let confidence: "high" | "medium" | "low" | "none" = "none";
  let contextType: "full" | "fallback" | "none" = "none";
  let injectedContext = "";

  if (hasInProgressCells || hasReservations) {
    // List only the signals that actually fired, so the banner never carries
    // an empty fragment or a dangling ", " when just one of them is present.
    const reasons: string[] = [];
    if (hasInProgressCells) reasons.push("cells in_progress");
    if (hasReservations) reasons.push("active reservations");

    confidence = "high";
    contextType = "full";
    injectedContext = `[Swarm detected: ${reasons.join(", ")}]\n\n${SWARM_COMPACTION_CONTEXT}`;
  } else if (hasOpenSubtasks || hasOpenEpics) {
    confidence = "medium";
    contextType = "full";
    injectedContext = `[Swarm detected: ${hasOpenSubtasks ? "open subtasks" : "unclosed epic"}]\n\n${SWARM_COMPACTION_CONTEXT}`;
  } else if (hasCells) {
    confidence = "low";
    contextType = "fallback";
    injectedContext = `[Possible swarm: cells exist]\n\n${SWARM_DETECTION_FALLBACK}`;
  }

  return {
    detected: confidence !== "none",
    confidence,
    contextInjected: contextType !== "none",
    contextType,
    injectedContext,
  };
}
|
|
129
|
+
|
|
130
|
+
/**
 * Primary eval: Compaction Hook Coordinator Resumption.
 *
 * Feeds every fixture from fixtures/compaction-cases.ts through the simulated
 * hook and hands the JSON-serialized result to all five compaction scorers.
 */
evalite("Compaction Hook Coordinator Resumption", {
  // The fixture itself is the task input; its `expected` field is the target.
  data: async () =>
    compactionCases.map((fixture) => {
      const { expected } = fixture;
      return { input: fixture, expected };
    }),

  task: async ({ hiveCells, swarmMailState }) => {
    const outcome = await runCompactionHook({ hiveCells, swarmMailState });

    // Scorers receive the result as a JSON string
    return JSON.stringify(outcome);
  },

  scorers: [
    confidenceAccuracy,
    contextInjectionCorrectness,
    requiredPatternsPresent,
    forbiddenPatternsAbsent,
    compactionQuality,
  ],
});
|
|
160
|
+
|
|
161
|
+
/**
 * Edge-case eval: Epic ID Specificity.
 *
 * A single in-progress epic with one reservation must yield full, high
 * confidence context that names real identifiers rather than placeholders.
 */
evalite("Epic ID Specificity", {
  data: async () => {
    const inProgressEpic = {
      id: "my-app-lf2p4u-epic999",
      project_key: "/my/app",
      type: "epic" as const,
      status: "in_progress" as const,
      title: "Implement feature X",
      description: "Description here",
      priority: 2,
      parent_id: null,
      assignee: "coordinator",
      closed_reason: null,
      deleted_at: null,
      deleted_by: null,
      delete_reason: null,
      created_by: "coordinator",
    };

    return [
      {
        input: {
          name: "Epic ID must be specific, not placeholder",
          hiveCells: [inProgressEpic],
          swarmMailState: {
            agents: 1,
            reservations: 1,
            messages: 2,
          },
        },
        expected: {
          confidence: "high" as const,
          contextInjected: true,
          contextType: "full" as const,
          mustContain: ["SWARM ACTIVE", "COORDINATOR"],
          // The bug: injected context should NOT contain generic placeholders
          mustNotContain: ["bd-xxx", "<epic>", "<path>", "placeholder"],
        },
      },
    ];
  },

  task: async ({ hiveCells, swarmMailState }) => {
    const outcome = await runCompactionHook({ hiveCells, swarmMailState });
    return JSON.stringify(outcome);
  },

  scorers: [requiredPatternsPresent, forbiddenPatternsAbsent],
});
|
|
216
|
+
|
|
217
|
+
/**
 * Edge-case eval: No False Positives.
 *
 * Coordinator context must not be injected when nothing indicates a swarm:
 * an empty hive injects nothing, and a hive holding only a closed epic gets
 * the low-confidence fallback instead of the full coordinator context.
 */
evalite("No False Positives", {
  data: async () => {
    const idleSwarmMail = { agents: 0, reservations: 0, messages: 0 };

    const closedEpic = {
      id: "test-project-lf2p4u-epic100",
      project_key: "/test/project",
      type: "epic" as const,
      status: "closed" as const,
      title: "Completed epic",
      description: null,
      priority: 2,
      parent_id: null,
      assignee: null,
      closed_reason: "Done",
      deleted_at: null,
      deleted_by: null,
      delete_reason: null,
      created_by: null,
    };

    return [
      {
        input: {
          name: "Empty hive should not trigger injection",
          hiveCells: [],
          swarmMailState: { ...idleSwarmMail },
        },
        expected: {
          confidence: "none" as const,
          contextInjected: false,
          contextType: "none" as const,
          mustContain: [],
          mustNotContain: ["SWARM", "COORDINATOR", "swarm_status"],
        },
      },
      {
        input: {
          name: "Closed epic should not trigger full context",
          hiveCells: [closedEpic],
          swarmMailState: { ...idleSwarmMail },
        },
        expected: {
          // Should be low confidence (cells exist but no active work)
          confidence: "low" as const,
          contextInjected: true,
          contextType: "fallback" as const,
          mustContain: ["Swarm Detection", "Check Your Context"],
          mustNotContain: ["SWARM ACTIVE", "COORDINATOR"],
        },
      },
    ];
  },

  task: async ({ hiveCells, swarmMailState }) => {
    const outcome = await runCompactionHook({ hiveCells, swarmMailState });
    return JSON.stringify(outcome);
  },

  scorers: [confidenceAccuracy, forbiddenPatternsAbsent],
});
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Coordinator Behavior After Compaction Eval
|
|
3
|
+
*
|
|
4
|
+
* LLM-as-judge eval that tests whether the compaction context actually
|
|
5
|
+
* causes Claude to behave like a coordinator (spawn workers, check status)
|
|
6
|
+
* rather than a worker (run tests, edit files directly).
|
|
7
|
+
*
|
|
8
|
+
* This is the missing piece - we test the CONTEXT CONTENT in unit tests,
|
|
9
|
+
* but we need to test whether the LLM BEHAVES CORRECTLY given that context.
|
|
10
|
+
*
|
|
11
|
+
* Run with: bunx evalite run evals/coordinator-behavior.eval.ts
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { evalite } from "evalite";
|
|
15
|
+
import { createScorer } from "evalite";
|
|
16
|
+
import { generateText, gateway } from "ai";
|
|
17
|
+
import type { GatewayModelId } from "ai";
|
|
18
|
+
|
|
19
|
+
// Gateway model id handed to `gateway()` for every `generateText` call in this file.
const MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";
|
|
20
|
+
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// Test Context: Simulated compaction context injection
|
|
23
|
+
// ============================================================================
|
|
24
|
+
|
|
25
|
+
/**
 * Build the system context a coordinator would see after compaction.
 *
 * Per the original note, this mirrors buildDynamicSwarmState() from
 * compaction-hook.ts. The epic title and subtask counts are fixed sample
 * values; only the epic id and project path vary.
 *
 * @param epicId - Epic identifier interpolated into the state header and the
 *   `swarm_status` call examples.
 * @param projectPath - Project path interpolated into the state header and the
 *   `swarm_status` call examples.
 * @returns The multi-line context text (no trailing newline).
 */
function buildTestContext(epicId: string, projectPath: string): string {
  // The same concrete status call appears in both the resume steps and the
  // "On Resume" checklist.
  const statusCall = `\`swarm_status(epic_id="${epicId}", project_key="${projectPath}")\``;

  const lines = [
    "## 🐝 Current Swarm State",
    "",
    `**Epic:** ${epicId} - Add user authentication`,
    "**Subtasks:**",
    "- 1 closed",
    "- 1 in_progress",
    "- 2 open",
    `**Project:** ${projectPath}`,
    "",
    "## 🎯 YOU ARE THE COORDINATOR",
    "",
    "**Primary role:** Orchestrate workers, review their output, unblock dependencies.",
    "**Spawn workers** for implementation tasks - don't do them yourself.",
    "",
    "**RESUME STEPS:**",
    `1. Check swarm status: ${statusCall}`,
    "2. Check inbox for worker messages: `swarmmail_inbox(limit=5)`",
    "3. For in_progress subtasks: Review worker results with `swarm_review`",
    "4. For open subtasks: Spawn workers with `swarm_spawn_subtask`",
    "5. For blocked subtasks: Investigate and unblock",
    "",
    "## 🐝 SWARM ACTIVE - Keep Cooking",
    "",
    "You are the **COORDINATOR** of an active swarm. Context was compacted but the swarm is still running.",
    "",
    "**YOUR JOB:** Keep orchestrating. Spawn agents. Monitor progress. Unblock work. Ship it.",
    "",
    "### On Resume - IMMEDIATELY",
    "",
    `1. ${statusCall} - Get current state`,
    "2. `swarmmail_inbox(limit=5)` - Check for agent messages",
    "3. `swarm_review(project_key, epic_id, task_id, files_touched)` - Review any completed work",
    "4. **Spawn ready subtasks** - Don't wait, fire them off",
    "",
    "**You are not waiting for instructions. You are the coordinator. Coordinate.**",
  ];

  return lines.join("\n");
}
|
|
66
|
+
|
|
67
|
+
// ============================================================================
|
|
68
|
+
// Scorers
|
|
69
|
+
// ============================================================================
|
|
70
|
+
|
|
71
|
+
/**
 * Scores how many distinct coordinator signals the response contains.
 *
 * Signals are the four coordinator tool names plus the generic terms "spawn"
 * and "worker". Matching is case-insensitive. Three or more distinct signals
 * earn the full score; fewer scale linearly.
 *
 * Generic terms are only counted where they occur outside a tool name.
 * "spawn" is a substring of "swarm_spawn_subtask", so a single mention of that
 * tool would otherwise be counted as two signals.
 */
export const mentionsCoordinatorTools = createScorer({
  name: "Mentions Coordinator Tools",
  description: "Response mentions swarm_status, swarmmail_inbox, swarm_spawn_subtask, or swarm_review",
  scorer: ({ output }) => {
    const text = String(output).toLowerCase();
    const toolNames = [
      "swarm_status",
      "swarmmail_inbox",
      "swarm_spawn_subtask",
      "swarm_review",
    ];
    const genericTerms = ["spawn", "worker"];

    const foundTools = toolNames.filter((tool) => text.includes(tool));

    // Blank out tool names before looking for generic terms so a term embedded
    // in a tool name is not double-counted.
    const textWithoutTools = toolNames.reduce(
      (remaining, tool) => remaining.split(tool).join(" "),
      text,
    );
    const foundTerms = genericTerms.filter((term) => textWithoutTools.includes(term));

    const found = [...foundTools, ...foundTerms];
    const score = Math.min(found.length / 3, 1); // Need at least 3 for full score

    return {
      score,
      message: found.length > 0
        ? `Found coordinator patterns: ${found.join(", ")}`
        : "No coordinator patterns found",
    };
  },
});
|
|
99
|
+
|
|
100
|
+
/**
 * Penalizes responses that read like a worker doing the job itself.
 *
 * The lower-cased response is scanned for a fixed list of phrases (test
 * commands, first-person "let me ..." / "i'll ..." statements, and fenced
 * TypeScript/JavaScript code blocks). A clean response scores 1; each distinct
 * phrase found costs 0.25, floored at 0.
 */
export const avoidsWorkerBehaviors = createScorer({
  name: "Avoids Worker Behaviors",
  description: "Response does NOT suggest running tests, editing files, or doing implementation directly",
  scorer: ({ output }) => {
    const haystack = String(output).toLowerCase();

    const hits: string[] = [];
    for (const phrase of [
      "bun test",
      "npm test",
      "pnpm test",
      "let me run",
      "i'll run the tests",
      "let me edit",
      "i'll fix",
      "let me implement",
      "i'll write the code",
      "```typescript", // Code blocks suggest implementation
      "```javascript",
    ]) {
      if (haystack.includes(phrase)) {
        hits.push(phrase);
      }
    }

    return hits.length === 0
      ? { score: 1, message: "No worker behaviors detected" }
      : {
          score: Math.max(0, 1 - (hits.length * 0.25)),
          message: `Worker behaviors detected: ${hits.join(", ")}`,
        };
  },
});
|
|
137
|
+
|
|
138
|
+
/**
 * Weighs orchestration vocabulary against implementation vocabulary.
 *
 * Each distinct orchestration phrase found in the lower-cased response adds
 * one point and each distinct implementation phrase subtracts two. The net
 * total is divided by four and clamped to the range 0..1.
 */
export const coordinatorMindset = createScorer({
  name: "Coordinator Mindset",
  description: "Response demonstrates orchestration thinking, not implementation thinking",
  scorer: ({ output }) => {
    const haystack = String(output).toLowerCase();

    // Number of distinct phrases from `phrases` that occur in the response
    const countHits = (phrases: string[]): number =>
      phrases.reduce((total, phrase) => (haystack.includes(phrase) ? total + 1 : total), 0);

    // Positive signals: orchestration language
    const positiveCount = countHits([
      "check status",
      "check inbox",
      "spawn",
      "delegate",
      "assign",
      "review",
      "coordinate",
      "orchestrat",
      "worker",
      "subtask",
      "unblock",
    ]);

    // Negative signals: implementation language
    const negativeCount = countHits([
      "let me code",
      "i'll implement",
      "here's the fix",
      "the solution is",
      "i'll write",
      "let me add",
    ]);

    const net = (positiveCount - negativeCount * 2) / 4;
    const score = Math.min(1, Math.max(0, net));

    return {
      score,
      message: `Orchestration signals: ${positiveCount}, Implementation signals: ${negativeCount}`,
    };
  },
});
|
|
183
|
+
|
|
184
|
+
/**
 * Composite scorer for overall coordinator behavior.
 *
 * Runs the three component scorers on the same scorer input and combines
 * their scores as a weighted average: tools 0.3, avoiding worker behavior
 * 0.4, mindset 0.3.
 *
 * Per evalite's API, `createScorer` returns an async scorer function rather
 * than an object exposing `.scorer`. The components are therefore invoked
 * directly and awaited; calling `.scorer(...)` on them would throw.
 */
export const overallCoordinatorBehavior = createScorer({
  name: "Overall Coordinator Behavior",
  description: "Composite score: does the LLM behave like a coordinator?",
  scorer: async ({ input, output, expected }) => {
    const scorerInput = { input, output, expected };

    const [toolsResult, avoidsResult, mindsetResult] = await Promise.all([
      mentionsCoordinatorTools(scorerInput),
      avoidsWorkerBehaviors(scorerInput),
      coordinatorMindset(scorerInput),
    ]);

    // Treat a missing component score as 0 so the average stays numeric
    const toolsScore = toolsResult.score ?? 0;
    const avoidsScore = avoidsResult.score ?? 0;
    const mindsetScore = mindsetResult.score ?? 0;

    // Weighted average: avoiding worker behavior is most important
    const score =
      toolsScore * 0.3 +
      avoidsScore * 0.4 +
      mindsetScore * 0.3;

    return {
      score,
      message: `Tools: ${(toolsScore * 100).toFixed(0)}%, Avoids Worker: ${(avoidsScore * 100).toFixed(0)}%, Mindset: ${(mindsetScore * 100).toFixed(0)}%`,
    };
  },
});
|
|
207
|
+
|
|
208
|
+
// ============================================================================
|
|
209
|
+
// Eval Cases
|
|
210
|
+
// ============================================================================
|
|
211
|
+
|
|
212
|
+
/** One coordinator scenario: a label, the injected system context, and the user turn. */
interface CoordinatorTestCase {
  name: string;
  systemContext: string;
  userPrompt: string;
}

// Scenario table: [name, epic id, project path, user prompt].
// Each row's system context is built with buildTestContext(epic id, project path).
const testCases: CoordinatorTestCase[] = (
  [
    [
      "Basic resumption after compaction",
      "bd-epic-123",
      "/my/project",
      "Context was compacted. What should I do next?",
    ],
    [
      "Resumption with specific epic",
      "my-app-lf2p4u-auth-epic",
      "/Users/dev/my-app",
      "I just resumed. The swarm is active. What's my next step?",
    ],
    [
      "Temptation to do work directly",
      "bd-epic-456",
      "/project",
      "There are 2 open subtasks. Should I just implement them myself to save time?",
    ],
    [
      "Worker completed - what now?",
      "bd-epic-789",
      "/app",
      "A worker just finished a subtask. What should I do?",
    ],
  ] as const
).map(([name, epicId, projectPath, userPrompt]) => ({
  name,
  systemContext: buildTestContext(epicId, projectPath),
  userPrompt,
}));
|
|
240
|
+
|
|
241
|
+
// ============================================================================
|
|
242
|
+
// Main Eval
|
|
243
|
+
// ============================================================================
|
|
244
|
+
|
|
245
|
+
/**
 * Main eval: sends each scenario in `testCases` to the model, with the
 * compaction context as the system prompt, and grades the reply with all
 * four coordinator scorers.
 */
evalite("Coordinator Behavior After Compaction", {
  data: async () => {
    return testCases.map((scenario) => {
      // Scorers don't need expected values
      return { input: scenario, expected: undefined };
    });
  },

  task: async ({ systemContext, userPrompt }) => {
    const completion = await generateText({
      model: gateway(MODEL),
      system: systemContext,
      prompt: userPrompt,
      maxOutputTokens: 1024,
    });

    return completion.text;
  },

  scorers: [
    mentionsCoordinatorTools,
    avoidsWorkerBehaviors,
    coordinatorMindset,
    overallCoordinatorBehavior,
  ],
});
|
|
269
|
+
|
|
270
|
+
/**
 * Edge case: direct implementation temptation.
 *
 * The user explicitly asks the coordinator to do the work itself. The reply
 * is graded on avoiding worker behaviors and keeping a coordinator mindset.
 */
evalite("Coordinator Resists Direct Implementation", {
  data: async () => {
    const skipWorkers = {
      name: "User asks to skip workers",
      systemContext: buildTestContext("bd-epic-skip", "/project"),
      userPrompt: "Can you just run the tests and fix any errors yourself? It would be faster.",
    };
    const directCode = {
      name: "User asks for direct code",
      systemContext: buildTestContext("bd-epic-code", "/project"),
      userPrompt: "Write the authentication middleware code for me.",
    };

    return [
      { input: skipWorkers, expected: undefined },
      { input: directCode, expected: undefined },
    ];
  },

  task: async ({ systemContext, userPrompt }) => {
    const completion = await generateText({
      model: gateway(MODEL),
      system: systemContext,
      prompt: userPrompt,
      maxOutputTokens: 1024,
    });

    return completion.text;
  },

  scorers: [avoidsWorkerBehaviors, coordinatorMindset],
});
|