opencode-swarm-plugin 0.32.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/.hive/issues.jsonl +12 -0
  2. package/.hive/memories.jsonl +255 -1
  3. package/.turbo/turbo-build.log +9 -10
  4. package/.turbo/turbo-test.log +343 -337
  5. package/CHANGELOG.md +358 -0
  6. package/README.md +152 -179
  7. package/bin/swarm.test.ts +303 -1
  8. package/bin/swarm.ts +473 -16
  9. package/dist/compaction-hook.d.ts +1 -1
  10. package/dist/compaction-hook.d.ts.map +1 -1
  11. package/dist/index.d.ts +112 -0
  12. package/dist/index.d.ts.map +1 -1
  13. package/dist/index.js +12380 -131
  14. package/dist/logger.d.ts +34 -0
  15. package/dist/logger.d.ts.map +1 -0
  16. package/dist/observability-tools.d.ts +116 -0
  17. package/dist/observability-tools.d.ts.map +1 -0
  18. package/dist/plugin.js +12254 -119
  19. package/dist/skills.d.ts.map +1 -1
  20. package/dist/swarm-orchestrate.d.ts +105 -0
  21. package/dist/swarm-orchestrate.d.ts.map +1 -1
  22. package/dist/swarm-prompts.d.ts +113 -2
  23. package/dist/swarm-prompts.d.ts.map +1 -1
  24. package/dist/swarm-research.d.ts +127 -0
  25. package/dist/swarm-research.d.ts.map +1 -0
  26. package/dist/swarm-review.d.ts.map +1 -1
  27. package/dist/swarm.d.ts +73 -1
  28. package/dist/swarm.d.ts.map +1 -1
  29. package/evals/compaction-resumption.eval.ts +289 -0
  30. package/evals/coordinator-behavior.eval.ts +307 -0
  31. package/evals/fixtures/compaction-cases.ts +350 -0
  32. package/evals/scorers/compaction-scorers.ts +305 -0
  33. package/evals/scorers/index.ts +12 -0
  34. package/examples/plugin-wrapper-template.ts +297 -8
  35. package/package.json +6 -2
  36. package/src/compaction-hook.test.ts +617 -1
  37. package/src/compaction-hook.ts +291 -18
  38. package/src/index.ts +54 -1
  39. package/src/logger.test.ts +189 -0
  40. package/src/logger.ts +135 -0
  41. package/src/observability-tools.test.ts +346 -0
  42. package/src/observability-tools.ts +594 -0
  43. package/src/skills.integration.test.ts +137 -1
  44. package/src/skills.test.ts +42 -1
  45. package/src/skills.ts +8 -4
  46. package/src/swarm-orchestrate.test.ts +123 -0
  47. package/src/swarm-orchestrate.ts +183 -0
  48. package/src/swarm-prompts.test.ts +553 -1
  49. package/src/swarm-prompts.ts +406 -4
  50. package/src/swarm-research.integration.test.ts +544 -0
  51. package/src/swarm-research.test.ts +698 -0
  52. package/src/swarm-research.ts +472 -0
  53. package/src/swarm-review.test.ts +177 -0
  54. package/src/swarm-review.ts +12 -47
  55. package/src/swarm.ts +6 -3
@@ -0,0 +1,289 @@
1
+ /**
2
+ * Compaction Hook Coordinator Resumption Eval
3
+ *
4
+ * Tests that the compaction hook correctly detects swarm state and injects
5
+ * appropriate context for coordinator resumption.
6
+ *
7
+ * ## Bug Being Tested
8
+ *
9
+ * Root cause: The compaction hook injects generic "you are a coordinator"
10
+ * context but doesn't include the SPECIFIC epic ID, subtask status, or
11
+ * project path. This causes coordinators to lose identity after compaction.
12
+ *
13
+ * ## Test Cases
14
+ *
15
+ * 1. Active swarm with in_progress epic - should inject full context with epic ID
16
+ * 2. Multiple epics - should identify the in_progress one
17
+ * 3. No active swarm - should not inject coordinator context
18
+ * 4. Blocked epic - should still detect as active swarm
19
+ *
20
+ * Run with: pnpm eval:dev (watch mode) or pnpm eval:run (once)
21
+ */
22
+
23
+ import { evalite } from "evalite";
24
+ import type { Cell } from "swarm-mail";
25
+ import { compactionCases } from "./fixtures/compaction-cases.js";
26
+ import type { CompactionResult } from "./scorers/compaction-scorers.js";
27
+ import {
28
+ compactionQuality,
29
+ confidenceAccuracy,
30
+ contextInjectionCorrectness,
31
+ forbiddenPatternsAbsent,
32
+ requiredPatternsPresent,
33
+ } from "./scorers/index.js";
34
+
/**
 * Full coordinator context injected when a swarm is detected.
 *
 * Duplicated from compaction-hook.ts instead of imported, to avoid import
 * issues when the eval runs.
 * NOTE(review): presumably this must stay in sync with the hook's own copy;
 * confirm whenever either side changes.
 */
const SWARM_COMPACTION_CONTEXT = `## 🐝 SWARM ACTIVE - Keep Cooking

You are the **COORDINATOR** of an active swarm. Context was compacted but the swarm is still running.

**YOUR JOB:** Keep orchestrating. Spawn agents. Monitor progress. Unblock work. Ship it.

### On Resume - IMMEDIATELY

1. \`swarm_status(epic_id="<epic>", project_key="<path>")\` - Get current state
2. \`swarmmail_inbox(limit=5)\` - Check for agent messages
3. \`swarm_review(project_key, epic_id, task_id, files_touched)\` - Review any completed work
4. **Spawn ready subtasks** - Don't wait, fire them off

### Keep the Swarm Cooking

- **Spawn aggressively** - If a subtask is ready and unblocked, spawn an agent
- **Monitor actively** - Check status, read messages, respond to blockers
- **Close the loop** - When all subtasks done, verify and close the epic

**You are not waiting for instructions. You are the coordinator. Coordinate.**
`;
57
+
/**
 * Fallback context used when cells exist but nothing indicates active work.
 *
 * Rather than asserting coordinator identity, it asks the summarizer to look
 * for swarm tool calls in the conversation and flag them for the resumed
 * session. Duplicated from compaction-hook.ts to avoid import issues.
 */
const SWARM_DETECTION_FALLBACK = `## 🐝 Swarm Detection - Check Your Context

**IMPORTANT:** Before summarizing, check if this session involves an active swarm.

Look for ANY of these patterns in the conversation:

### Tool Calls (definite swarm sign)
- \`swarm_decompose\`, \`swarm_spawn_subtask\`, \`swarm_status\`, \`swarm_complete\`
- \`swarmmail_init\`, \`swarmmail_reserve\`, \`swarmmail_send\`
- \`hive_create_epic\`, \`hive_start\`, \`hive_close\`

### If You Find Swarm Evidence

Include this in your summary and tell the resumed session:
"This is an active swarm. Check swarm_status and swarmmail_inbox immediately."
`;
74
+
/**
 * Simulate the compaction hook for a given hive / swarm-mail state.
 *
 * This is a simplified stand-in: it re-implements the signal-based detection
 * locally and prepends a short detection banner to one of the static context
 * constants. The real hook is never executed, so what the scorers see is the
 * injected context text produced by this simulation.
 *
 * Confidence ladder (first match wins):
 * - "high":   any cell is in_progress, or there is at least one reservation
 *             -> full context
 * - "medium": an open cell with a parent, or an epic that is not closed
 *             -> full context
 * - "low":    cells exist but none of the above holds -> fallback context
 * - "none":   no cells and no reservations -> nothing injected
 *
 * @param testCase - Hive cells plus swarm-mail counters. Only
 *   `swarmMailState.reservations` is read; `agents` and `messages` are ignored.
 * @returns The detection outcome and the exact context string that would be
 *   injected (empty string when nothing is injected).
 */
async function runCompactionHook(testCase: {
  hiveCells: Array<Omit<Cell, "created_at" | "updated_at" | "closed_at">>;
  swarmMailState: {
    agents: number;
    reservations: number;
    messages: number;
  };
}): Promise<CompactionResult> {
  const { hiveCells, swarmMailState } = testCase;

  // Detection signals, strongest first.
  const hasInProgressCells = hiveCells.some((c) => c.status === "in_progress");
  const hasReservations = swarmMailState.reservations > 0;
  const hasOpenSubtasks = hiveCells.some(
    (c) => c.status === "open" && c.parent_id,
  );
  const hasOpenEpics = hiveCells.some(
    (c) => c.type === "epic" && c.status !== "closed",
  );
  const hasCells = hiveCells.length > 0;

  let confidence: "high" | "medium" | "low" | "none" = "none";
  let contextType: "full" | "fallback" | "none" = "none";
  let injectedContext = "";

  if (hasInProgressCells || hasReservations) {
    // List only the signals that actually fired. Interpolating both slots
    // unconditionally produced banners with an empty entry, e.g.
    // "[Swarm detected: , active reservations]".
    const reasons: string[] = [];
    if (hasInProgressCells) {
      reasons.push("cells in_progress");
    }
    if (hasReservations) {
      reasons.push("active reservations");
    }

    confidence = "high";
    contextType = "full";
    injectedContext = `[Swarm detected: ${reasons.join(", ")}]\n\n${SWARM_COMPACTION_CONTEXT}`;
  } else if (hasOpenSubtasks || hasOpenEpics) {
    confidence = "medium";
    contextType = "full";
    injectedContext = `[Swarm detected: ${hasOpenSubtasks ? "open subtasks" : "unclosed epic"}]\n\n${SWARM_COMPACTION_CONTEXT}`;
  } else if (hasCells) {
    confidence = "low";
    contextType = "fallback";
    injectedContext = `[Possible swarm: cells exist]\n\n${SWARM_DETECTION_FALLBACK}`;
  }

  return {
    detected: confidence !== "none",
    confidence,
    contextInjected: contextType !== "none",
    contextType,
    injectedContext,
  };
}
129
+
/**
 * Main eval: Compaction Hook Coordinator Resumption
 *
 * Runs every fixture from fixtures/compaction-cases.ts through the simulated
 * hook and scores the serialized result.
 */
evalite("Compaction Hook Coordinator Resumption", {
  // Each fixture is both the task input and the source of its expectations.
  data: async () =>
    compactionCases.map((fixture) => ({
      input: fixture,
      expected: fixture.expected,
    })),

  // The result is handed to the scorers as a JSON string.
  task: async ({ hiveCells, swarmMailState }) =>
    JSON.stringify(await runCompactionHook({ hiveCells, swarmMailState })),

  scorers: [
    confidenceAccuracy,
    contextInjectionCorrectness,
    requiredPatternsPresent,
    forbiddenPatternsAbsent,
    compactionQuality,
  ],
});
160
+
/**
 * Edge Case Eval: Epic ID Specificity
 *
 * Checks that the injected context carries SPECIFIC epic IDs rather than
 * placeholders.
 *
 * NOTE(review): the simulated hook in this file injects the static
 * SWARM_COMPACTION_CONTEXT, whose text itself contains `<epic>` and `<path>`.
 * Presumably this case is meant to score low until the simulation mirrors
 * the hook's dynamic state; confirm against forbiddenPatternsAbsent.
 */
evalite("Epic ID Specificity", {
  data: async () => [
    {
      input: {
        name: "Epic ID must be specific, not placeholder",
        hiveCells: [
          {
            id: "my-app-lf2p4u-epic999",
            project_key: "/my/app",
            type: "epic" as const,
            status: "in_progress" as const,
            title: "Implement feature X",
            description: "Description here",
            priority: 2,
            parent_id: null,
            assignee: "coordinator",
            closed_reason: null,
            deleted_at: null,
            deleted_by: null,
            delete_reason: null,
            created_by: "coordinator",
          },
        ],
        swarmMailState: { agents: 1, reservations: 1, messages: 2 },
      },
      expected: {
        confidence: "high" as const,
        contextInjected: true,
        contextType: "full" as const,
        mustContain: ["SWARM ACTIVE", "COORDINATOR"],
        // The bug under test: generic placeholders must NOT leak into the
        // injected context.
        mustNotContain: ["bd-xxx", "<epic>", "<path>", "placeholder"],
      },
    },
  ],

  task: async ({ hiveCells, swarmMailState }) =>
    JSON.stringify(await runCompactionHook({ hiveCells, swarmMailState })),

  scorers: [requiredPatternsPresent, forbiddenPatternsAbsent],
});
216
+
/**
 * Edge Case Eval: No False Positives
 *
 * Guards against injecting coordinator context when no swarm is running:
 * an empty hive must inject nothing, and a hive holding only a closed epic
 * must get the fallback prompt rather than the full coordinator context.
 */
evalite("No False Positives", {
  data: async () => [
    {
      input: {
        name: "Empty hive should not trigger injection",
        hiveCells: [],
        swarmMailState: { agents: 0, reservations: 0, messages: 0 },
      },
      expected: {
        confidence: "none" as const,
        contextInjected: false,
        contextType: "none" as const,
        mustContain: [],
        mustNotContain: ["SWARM", "COORDINATOR", "swarm_status"],
      },
    },
    {
      input: {
        name: "Closed epic should not trigger full context",
        hiveCells: [
          {
            id: "test-project-lf2p4u-epic100",
            project_key: "/test/project",
            type: "epic" as const,
            status: "closed" as const,
            title: "Completed epic",
            description: null,
            priority: 2,
            parent_id: null,
            assignee: null,
            closed_reason: "Done",
            deleted_at: null,
            deleted_by: null,
            delete_reason: null,
            created_by: null,
          },
        ],
        swarmMailState: { agents: 0, reservations: 0, messages: 0 },
      },
      expected: {
        // Cells exist but nothing is active, so confidence should be low.
        confidence: "low" as const,
        contextInjected: true,
        contextType: "fallback" as const,
        mustContain: ["Swarm Detection", "Check Your Context"],
        mustNotContain: ["SWARM ACTIVE", "COORDINATOR"],
      },
    },
  ],

  task: async ({ hiveCells, swarmMailState }) =>
    JSON.stringify(await runCompactionHook({ hiveCells, swarmMailState })),

  scorers: [confidenceAccuracy, forbiddenPatternsAbsent],
});
@@ -0,0 +1,307 @@
1
+ /**
2
+ * Coordinator Behavior After Compaction Eval
3
+ *
4
+ * LLM-in-the-loop eval (responses are scored by keyword heuristics) that tests whether the compaction context actually
5
+ * causes Claude to behave like a coordinator (spawn workers, check status)
6
+ * rather than a worker (run tests, edit files directly).
7
+ *
8
+ * This is the missing piece - we test the CONTEXT CONTENT in unit tests,
9
+ * but we need to test whether the LLM BEHAVES CORRECTLY given that context.
10
+ *
11
+ * Run with: bunx evalite run evals/coordinator-behavior.eval.ts
12
+ */
13
+
14
+ import { evalite } from "evalite";
15
+ import { createScorer } from "evalite";
16
+ import { generateText, gateway } from "ai";
17
+ import type { GatewayModelId } from "ai";
18
+
/** Gateway model id used by every `generateText` call in this file. */
const MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";
20
+
21
+ // ============================================================================
22
+ // Test Context: Simulated compaction context injection
23
+ // ============================================================================
24
+
/**
 * Render the system context a coordinator would receive after compaction.
 *
 * Intended to mirror buildDynamicSwarmState() from compaction-hook.ts. The
 * epic title and subtask counts are fixed sample values; only the epic id and
 * project path vary.
 *
 * @param epicId - Epic identifier interpolated into the state summary and the
 *   `swarm_status` calls.
 * @param projectPath - Project path interpolated as the project line and as
 *   `project_key`.
 * @returns The complete markdown context string.
 */
function buildTestContext(epicId: string, projectPath: string): string {
  // The same status call is quoted in both the resume steps and the
  // "On Resume" checklist, so render it once.
  const statusCall = `\`swarm_status(epic_id="${epicId}", project_key="${projectPath}")\``;

  return `## 🐝 Current Swarm State

**Epic:** ${epicId} - Add user authentication
**Subtasks:**
- 1 closed
- 1 in_progress
- 2 open
**Project:** ${projectPath}

## 🎯 YOU ARE THE COORDINATOR

**Primary role:** Orchestrate workers, review their output, unblock dependencies.
**Spawn workers** for implementation tasks - don't do them yourself.

**RESUME STEPS:**
1. Check swarm status: ${statusCall}
2. Check inbox for worker messages: \`swarmmail_inbox(limit=5)\`
3. For in_progress subtasks: Review worker results with \`swarm_review\`
4. For open subtasks: Spawn workers with \`swarm_spawn_subtask\`
5. For blocked subtasks: Investigate and unblock

## 🐝 SWARM ACTIVE - Keep Cooking

You are the **COORDINATOR** of an active swarm. Context was compacted but the swarm is still running.

**YOUR JOB:** Keep orchestrating. Spawn agents. Monitor progress. Unblock work. Ship it.

### On Resume - IMMEDIATELY

1. ${statusCall} - Get current state
2. \`swarmmail_inbox(limit=5)\` - Check for agent messages
3. \`swarm_review(project_key, epic_id, task_id, files_touched)\` - Review any completed work
4. **Spawn ready subtasks** - Don't wait, fire them off

**You are not waiting for instructions. You are the coordinator. Coordinate.**`;
}
66
+
67
+ // ============================================================================
68
+ // Scorers
69
+ // ============================================================================
70
+
/**
 * Scores whether the response mentions coordinator tools.
 *
 * Six case-insensitive signals are checked: four tool names (swarm_status,
 * swarmmail_inbox, swarm_spawn_subtask, swarm_review) and two generic
 * keywords ("spawn", "worker"). Three or more distinct signals give a full
 * score; fewer scale linearly (found / 3).
 *
 * The generic keywords are searched only in the text that remains after the
 * tool names are masked out. "spawn" is a substring of "swarm_spawn_subtask",
 * so without masking a single tool mention would be counted as two signals.
 */
export const mentionsCoordinatorTools = createScorer({
  name: "Mentions Coordinator Tools",
  description: "Response mentions swarm_status, swarmmail_inbox, swarm_spawn_subtask, or swarm_review",
  scorer: ({ output }) => {
    const text = String(output).toLowerCase();
    const toolNames = [
      "swarm_status",
      "swarmmail_inbox",
      "swarm_spawn_subtask",
      "swarm_review",
    ];
    const genericKeywords = ["spawn", "worker"];

    const foundTools = toolNames.filter((tool) => text.includes(tool));

    // Blank out tool names so a keyword only counts when it appears on its own.
    const textWithoutTools = toolNames.reduce(
      (remaining, tool) => remaining.split(tool).join(" "),
      text,
    );
    const foundKeywords = genericKeywords.filter((keyword) =>
      textWithoutTools.includes(keyword),
    );

    const found = [...foundTools, ...foundKeywords];
    const score = Math.min(found.length / 3, 1); // Need at least 3 for full score

    return {
      score,
      message: found.length > 0
        ? `Found coordinator patterns: ${found.join(", ")}`
        : "No coordinator patterns found",
    };
  },
});
99
+
/**
 * Scores whether the response steers clear of worker behaviors.
 *
 * The lower-cased response is scanned for phrases that suggest running
 * tests, editing files, or implementing directly. No hits scores 1; each hit
 * costs 0.25, floored at 0. Matching is by plain substring.
 */
export const avoidsWorkerBehaviors = createScorer({
  name: "Avoids Worker Behaviors",
  description: "Response does NOT suggest running tests, editing files, or doing implementation directly",
  scorer: ({ output }) => {
    const haystack = String(output).toLowerCase();

    const hits = [
      "bun test",
      "npm test",
      "pnpm test",
      "let me run",
      "i'll run the tests",
      "let me edit",
      "i'll fix",
      "let me implement",
      "i'll write the code",
      "```typescript", // Code blocks suggest implementation
      "```javascript",
    ].filter((phrase) => haystack.includes(phrase));

    const isClean = hits.length === 0;

    return {
      score: isClean ? 1 : Math.max(0, 1 - (hits.length * 0.25)),
      message: isClean
        ? "No worker behaviors detected"
        : `Worker behaviors detected: ${hits.join(", ")}`,
    };
  },
});
137
+
/**
 * Scores whether the response shows a coordinator mindset.
 *
 * Counts how many orchestration phrases and how many implementation phrases
 * occur (each phrase counted at most once, case-insensitive substring match).
 * Implementation phrases weigh double: score = (positive - 2 * negative) / 4,
 * clamped to the range 0..1.
 */
export const coordinatorMindset = createScorer({
  name: "Coordinator Mindset",
  description: "Response demonstrates orchestration thinking, not implementation thinking",
  scorer: ({ output }) => {
    const haystack = String(output).toLowerCase();
    const countHits = (phrases: string[]): number =>
      phrases.reduce(
        (total, phrase) => (haystack.includes(phrase) ? total + 1 : total),
        0,
      );

    // Orchestration language counts in favor.
    const positiveCount = countHits([
      "check status",
      "check inbox",
      "spawn",
      "delegate",
      "assign",
      "review",
      "coordinate",
      "orchestrat",
      "worker",
      "subtask",
      "unblock",
    ]);

    // Implementation language counts against.
    const negativeCount = countHits([
      "let me code",
      "i'll implement",
      "here's the fix",
      "the solution is",
      "i'll write",
      "let me add",
    ]);

    const unclamped = (positiveCount - negativeCount * 2) / 4;

    return {
      score: Math.min(1, Math.max(0, unclamped)),
      message: `Orchestration signals: ${positiveCount}, Implementation signals: ${negativeCount}`,
    };
  },
});
183
+
/**
 * Composite scorer for overall coordinator behavior.
 *
 * Re-runs the three individual scorers on the same output and combines them
 * as a weighted sum: tools 0.3, avoiding worker behavior 0.4, mindset 0.3.
 *
 * NOTE(review): this relies on each createScorer result exposing its raw
 * callback as a synchronous `.scorer` property; confirm against the evalite
 * version in use.
 */
export const overallCoordinatorBehavior = createScorer({
  name: "Overall Coordinator Behavior",
  description: "Composite score: does the LLM behave like a coordinator?",
  scorer: ({ output }) => {
    const tools = mentionsCoordinatorTools.scorer({ output, expected: undefined }).score;
    const avoids = avoidsWorkerBehaviors.scorer({ output, expected: undefined }).score;
    const mindset = coordinatorMindset.scorer({ output, expected: undefined }).score;

    const asPercent = (value: number): string => (value * 100).toFixed(0);

    return {
      // Avoiding worker behavior carries the largest weight.
      score: tools * 0.3 + avoids * 0.4 + mindset * 0.3,
      message: `Tools: ${asPercent(tools)}%, Avoids Worker: ${asPercent(avoids)}%, Mindset: ${asPercent(mindset)}%`,
    };
  },
});
207
+
208
+ // ============================================================================
209
+ // Eval Cases
210
+ // ============================================================================
211
+
/** One prompt scenario fed to the model in the coordinator-behavior evals. */
interface CoordinatorTestCase {
  /** Human-readable label for the scenario. */
  name: string;
  /** Text passed to the model as its system prompt. */
  systemContext: string;
  /** Text passed to the model as the user prompt. */
  userPrompt: string;
}
217
+
/**
 * Scenarios for the main eval, written as
 * [name, epic id, project path, user prompt] rows and expanded into
 * CoordinatorTestCase objects via buildTestContext.
 */
const testCases: CoordinatorTestCase[] = (
  [
    [
      "Basic resumption after compaction",
      "bd-epic-123",
      "/my/project",
      "Context was compacted. What should I do next?",
    ],
    [
      "Resumption with specific epic",
      "my-app-lf2p4u-auth-epic",
      "/Users/dev/my-app",
      "I just resumed. The swarm is active. What's my next step?",
    ],
    [
      "Temptation to do work directly",
      "bd-epic-456",
      "/project",
      "There are 2 open subtasks. Should I just implement them myself to save time?",
    ],
    [
      "Worker completed - what now?",
      "bd-epic-789",
      "/app",
      "A worker just finished a subtask. What should I do?",
    ],
  ] as const
).map(([name, epicId, projectPath, userPrompt]) => ({
  name,
  systemContext: buildTestContext(epicId, projectPath),
  userPrompt,
}));
240
+
241
+ // ============================================================================
242
+ // Main Eval
243
+ // ============================================================================
244
+
// Sends every scenario in `testCases` to the model and scores the raw reply
// with all four coordinator scorers.
evalite("Coordinator Behavior After Compaction", {
  data: async () =>
    testCases.map((scenario) => ({
      input: scenario,
      expected: undefined, // Scorers don't need expected values
    })),

  task: async ({ systemContext, userPrompt }) => {
    const completion = await generateText({
      model: gateway(MODEL),
      system: systemContext,
      prompt: userPrompt,
      maxOutputTokens: 1024,
    });

    return completion.text;
  },

  scorers: [
    mentionsCoordinatorTools,
    avoidsWorkerBehaviors,
    coordinatorMindset,
    overallCoordinatorBehavior,
  ],
});
269
+
/**
 * Edge Case: Direct implementation temptation
 *
 * The user explicitly asks the coordinator to do the work itself; the reply
 * is scored on avoiding worker behaviors and keeping a coordinator mindset.
 */
evalite("Coordinator Resists Direct Implementation", {
  data: async () => [
    {
      input: {
        name: "User asks to skip workers",
        systemContext: buildTestContext("bd-epic-skip", "/project"),
        userPrompt: "Can you just run the tests and fix any errors yourself? It would be faster.",
      },
      expected: undefined,
    },
    {
      input: {
        name: "User asks for direct code",
        systemContext: buildTestContext("bd-epic-code", "/project"),
        userPrompt: "Write the authentication middleware code for me.",
      },
      expected: undefined,
    },
  ],

  task: async ({ systemContext, userPrompt }) => {
    const completion = await generateText({
      model: gateway(MODEL),
      system: systemContext,
      prompt: userPrompt,
      maxOutputTokens: 1024,
    });

    return completion.text;
  },

  scorers: [avoidsWorkerBehaviors, coordinatorMindset],
});