opencode-swarm-plugin 0.33.0 → 0.35.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.hive/issues.jsonl +12 -0
- package/.hive/memories.jsonl +255 -1
- package/.turbo/turbo-build.log +4 -4
- package/.turbo/turbo-test.log +289 -289
- package/CHANGELOG.md +133 -0
- package/README.md +29 -1
- package/bin/swarm.test.ts +342 -1
- package/bin/swarm.ts +351 -4
- package/dist/compaction-hook.d.ts +1 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/index.d.ts +95 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11848 -124
- package/dist/logger.d.ts +34 -0
- package/dist/logger.d.ts.map +1 -0
- package/dist/plugin.js +11722 -112
- package/dist/swarm-orchestrate.d.ts +105 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +54 -2
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-research.d.ts +127 -0
- package/dist/swarm-research.d.ts.map +1 -0
- package/dist/swarm-review.d.ts.map +1 -1
- package/dist/swarm.d.ts +56 -1
- package/dist/swarm.d.ts.map +1 -1
- package/evals/compaction-resumption.eval.ts +289 -0
- package/evals/coordinator-behavior.eval.ts +307 -0
- package/evals/fixtures/compaction-cases.ts +350 -0
- package/evals/scorers/compaction-scorers.ts +305 -0
- package/evals/scorers/index.ts +12 -0
- package/package.json +5 -2
- package/src/compaction-hook.test.ts +639 -1
- package/src/compaction-hook.ts +488 -18
- package/src/index.ts +29 -0
- package/src/logger.test.ts +189 -0
- package/src/logger.ts +135 -0
- package/src/swarm-decompose.ts +0 -7
- package/src/swarm-prompts.test.ts +164 -1
- package/src/swarm-prompts.ts +179 -12
- package/src/swarm-review.test.ts +177 -0
- package/src/swarm-review.ts +12 -47
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test cases for compaction hook coordinator resumption
|
|
3
|
+
*
|
|
4
|
+
* Each case simulates a different swarm state and verifies that
|
|
5
|
+
* the compaction hook injects the correct context for resumption.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Cell } from "swarm-mail";
|
|
9
|
+
|
|
10
|
+
/**
 * One compaction-hook fixture.
 *
 * Pairs a simulated swarm state (hive cells plus swarm-mail numbers) with the
 * result the compaction hook is expected to produce for that state.
 */
export interface CompactionTestCase {
  name: string;
  description: string;
  /**
   * Simulated hive state: the cells to create, given without
   * `created_at`, `updated_at` and `closed_at`.
   */
  hiveCells: Omit<Cell, "created_at" | "updated_at" | "closed_at">[];
  /** Simulated swarm-mail state. */
  swarmMailState: {
    agents: number;
    reservations: number;
    messages: number;
  };
  /** Expected detection confidence and injected-context properties. */
  expected: {
    confidence: "high" | "medium" | "low" | "none";
    contextInjected: boolean;
    contextType: "full" | "fallback" | "none";
    /** Patterns that MUST appear in injected context (if injected). */
    mustContain?: string[];
    /** Patterns that MUST NOT appear. */
    mustNotContain?: string[];
  };
}
|
|
45
|
+
|
|
46
|
+
/** Fixture cell shape: one element of `CompactionTestCase["hiveCells"]`. */
type FixtureCell = CompactionTestCase["hiveCells"][number];

/**
 * Values that differ from cell to cell. `parent_id`, `assignee` and
 * `closed_reason` may be left out, in which case they are stored as `null`.
 */
type FixtureCellFields = Pick<
  FixtureCell,
  "id" | "type" | "status" | "title" | "description" | "priority" | "created_by"
> &
  Partial<Pick<FixtureCell, "parent_id" | "assignee" | "closed_reason">>;

/**
 * Builds a complete fixture cell.
 *
 * Every fixture shares the same `project_key` and has all three deletion
 * fields set to `null`, so only the varying fields are passed in. Keys are
 * written in a fixed order so the resulting objects serialise identically
 * from case to case.
 */
function fixtureCell(fields: FixtureCellFields): FixtureCell {
  return {
    id: fields.id,
    project_key: "/test/project",
    type: fields.type,
    status: fields.status,
    title: fields.title,
    description: fields.description,
    priority: fields.priority,
    parent_id: fields.parent_id ?? null,
    assignee: fields.assignee ?? null,
    closed_reason: fields.closed_reason ?? null,
    deleted_at: null,
    deleted_by: null,
    delete_reason: null,
    created_by: fields.created_by,
  };
}

/**
 * Compaction-hook fixtures, one per swarm state of interest:
 * high / medium / low / none confidence plus a blocked-epic edge case.
 */
export const compactionCases: CompactionTestCase[] = [
  // --------------------------------------------------------------------------
  // HIGH CONFIDENCE: active swarm with an in_progress epic
  // --------------------------------------------------------------------------
  {
    name: "Active swarm with in_progress epic",
    description:
      "Compaction happens mid-swarm with an active epic and subtasks. Should inject full context with specific epic ID.",
    hiveCells: [
      fixtureCell({
        id: "test-project-lf2p4u-epic123",
        type: "epic",
        status: "in_progress",
        title: "Add user authentication",
        description: "Implement OAuth with NextAuth.js",
        priority: 2,
        assignee: "coordinator",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic123.1",
        type: "task",
        status: "closed",
        title: "OAuth provider config",
        description: "Configure GitHub OAuth provider",
        priority: 2,
        parent_id: "test-project-lf2p4u-epic123",
        assignee: "BlueLake",
        closed_reason: "Done: configured GitHub provider",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic123.2",
        type: "task",
        status: "in_progress",
        title: "Auth middleware",
        description: "Create middleware for protecting routes",
        priority: 2,
        parent_id: "test-project-lf2p4u-epic123",
        assignee: "RedMountain",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic123.3",
        type: "task",
        status: "open",
        title: "Session management",
        description: "Implement session persistence with Redis",
        priority: 2,
        parent_id: "test-project-lf2p4u-epic123",
        created_by: "coordinator",
      }),
    ],
    swarmMailState: { agents: 2, reservations: 1, messages: 5 },
    expected: {
      confidence: "high",
      contextInjected: true,
      contextType: "full",
      mustContain: [
        "SWARM ACTIVE",
        "COORDINATOR",
        "swarm_status",
        "swarmmail_inbox",
        "Keep Cooking",
      ],
      mustNotContain: [
        "bd-xxx", // Should NOT contain placeholder IDs
        "Check Your Context", // Should NOT be fallback detection
      ],
    },
  },

  // --------------------------------------------------------------------------
  // MEDIUM CONFIDENCE: multiple epics, need to identify the active one
  // --------------------------------------------------------------------------
  {
    name: "Multiple epics with one in_progress",
    description:
      "Multiple epics exist, but only one is in_progress. Should detect and inject context for the active one.",
    hiveCells: [
      fixtureCell({
        id: "test-project-lf2p4u-epic100",
        type: "epic",
        status: "closed",
        title: "Refactor auth system",
        description: "Old completed epic",
        priority: 2,
        closed_reason: "Done",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic200",
        type: "epic",
        status: "in_progress",
        title: "Add rate limiting",
        description: "Implement Redis-based rate limiting",
        priority: 2,
        assignee: "coordinator",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic200.1",
        type: "task",
        status: "open",
        title: "Rate limit middleware",
        description: "Create Express middleware",
        priority: 2,
        parent_id: "test-project-lf2p4u-epic200",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic300",
        type: "epic",
        status: "open",
        title: "Future epic",
        description: "Not started yet",
        priority: 1,
        created_by: "coordinator",
      }),
    ],
    swarmMailState: { agents: 1, reservations: 0, messages: 2 },
    expected: {
      confidence: "medium",
      contextInjected: true,
      contextType: "full",
      mustContain: ["SWARM ACTIVE", "COORDINATOR"],
      mustNotContain: ["bd-xxx"],
    },
  },

  // --------------------------------------------------------------------------
  // LOW CONFIDENCE: cells exist but no active work
  // --------------------------------------------------------------------------
  {
    name: "Cells exist but no active swarm",
    description:
      "Hive has some cells but no in_progress work. Should inject fallback detection prompt.",
    hiveCells: [
      fixtureCell({
        id: "test-project-lf2p4u-task001",
        type: "task",
        status: "open",
        title: "Fix typo in README",
        description: null,
        priority: 0,
        created_by: null,
      }),
    ],
    swarmMailState: { agents: 0, reservations: 0, messages: 0 },
    expected: {
      confidence: "low",
      contextInjected: true,
      contextType: "fallback",
      mustContain: [
        "Swarm Detection",
        "Check Your Context",
        "swarm_decompose",
        "swarmmail_init",
      ],
      mustNotContain: ["SWARM ACTIVE", "COORDINATOR"],
    },
  },

  // --------------------------------------------------------------------------
  // NONE: empty hive, no swarm activity
  // --------------------------------------------------------------------------
  {
    name: "Empty hive - no swarm activity",
    description:
      "No cells, no swarm-mail activity. Should NOT inject any context.",
    hiveCells: [],
    swarmMailState: { agents: 0, reservations: 0, messages: 0 },
    expected: {
      confidence: "none",
      contextInjected: false,
      contextType: "none",
      mustContain: [],
      mustNotContain: ["SWARM", "COORDINATOR", "swarm_status"],
    },
  },

  // --------------------------------------------------------------------------
  // EDGE CASE: blocked epic (should still detect as active swarm)
  // --------------------------------------------------------------------------
  {
    name: "Blocked epic with subtasks",
    description:
      "Epic is blocked but has in_progress subtasks. Should detect as active swarm.",
    hiveCells: [
      fixtureCell({
        id: "test-project-lf2p4u-epic400",
        type: "epic",
        status: "blocked",
        title: "Migration to TypeScript",
        description: "Full codebase migration",
        priority: 3,
        assignee: "coordinator",
        created_by: "coordinator",
      }),
      fixtureCell({
        id: "test-project-lf2p4u-epic400.1",
        type: "task",
        status: "in_progress",
        title: "Migrate utils",
        description: "Convert utils to TypeScript",
        priority: 2,
        parent_id: "test-project-lf2p4u-epic400",
        assignee: "GreenValley",
        created_by: "coordinator",
      }),
    ],
    swarmMailState: { agents: 1, reservations: 1, messages: 3 },
    expected: {
      confidence: "high",
      contextInjected: true,
      contextType: "full",
      mustContain: ["SWARM ACTIVE", "COORDINATOR"],
      mustNotContain: ["bd-xxx"],
    },
  },
];
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Custom scorers for compaction hook evaluation
|
|
3
|
+
*
|
|
4
|
+
* These scorers validate that the compaction hook correctly:
|
|
5
|
+
* 1. Detects swarm state (confidence level)
|
|
6
|
+
* 2. Injects appropriate context (full/fallback/none)
|
|
7
|
+
* 3. Includes required patterns in context
|
|
8
|
+
* 4. Excludes placeholder/generic content
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { createScorer } from "evalite";
|
|
12
|
+
|
|
13
|
+
/**
 * Result object a compaction-hook test produces.
 *
 * The scorers in this file obtain it by JSON-parsing the eval `output`.
 */
export interface CompactionResult {
  /** Whether a swarm was detected. */
  detected: boolean;
  /** Detection confidence level. */
  confidence: "high" | "medium" | "low" | "none";
  /** Whether any context was injected. */
  contextInjected: boolean;
  /** Which kind of context was injected. */
  contextType: "full" | "fallback" | "none";
  /** The injected context text that pattern checks are run against. */
  injectedContext: string;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Expectations a test case supplies for a compaction result.
 *
 * The scorers in this file cast the eval `expected` value to this shape.
 */
export interface CompactionExpected {
  /** Confidence level the hook should report. */
  confidence: "high" | "medium" | "low" | "none";
  /** Whether the hook should inject context at all. */
  contextInjected: boolean;
  /** Kind of context the hook should inject. */
  contextType: "full" | "fallback" | "none";
  /** Substrings that must be present in the injected context. */
  mustContain?: Array<string>;
  /** Substrings that must be absent from the injected context. */
  mustNotContain?: Array<string>;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Scores whether the detected confidence equals the expected confidence.
 *
 * Confidence decides what is injected:
 * - HIGH/MEDIUM: full coordinator context
 * - LOW: fallback detection prompt
 * - NONE: no injection
 *
 * `output` is stringified and JSON-parsed into a `CompactionResult`;
 * `expected` is treated as a `CompactionExpected`.
 *
 * Score: 1 on an exact match, 0 on a mismatch or when the output cannot be
 * parsed/read.
 */
export const confidenceAccuracy = createScorer({
  name: "Confidence Accuracy",
  description: "Validates detection confidence matches expected level",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const wanted = expected as CompactionExpected;

      return actual.confidence === wanted.confidence
        ? {
            score: 1,
            message: `Correct confidence: ${actual.confidence}`,
          }
        : {
            score: 0,
            message: `Wrong confidence: got ${actual.confidence}, expected ${wanted.confidence}`,
          };
    } catch (error) {
      // Malformed JSON or a missing result/expectation counts as a failure.
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
72
|
+
|
|
73
|
+
/**
 * Scores whether context injection behaved as expected.
 *
 * Two things are compared against the expectation:
 * - the injected / not-injected flag
 * - the context type (full / fallback / none)
 *
 * Score: 1 when both agree, 0.5 when only the flag agrees, 0 otherwise
 * (including when the output cannot be parsed/read).
 */
export const contextInjectionCorrectness = createScorer({
  name: "Context Injection Correctness",
  description: "Validates context injection matches expected behavior",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const wanted = expected as CompactionExpected;

      const sameFlag = actual.contextInjected === wanted.contextInjected;
      const sameType = actual.contextType === wanted.contextType;

      // Worst case first: the injection flag itself is wrong.
      if (!sameFlag) {
        const got = actual.contextInjected ? actual.contextType : "none";
        const want = wanted.contextInjected ? wanted.contextType : "none";
        return {
          score: 0,
          message: `Wrong injection: got ${got}, expected ${want}`,
        };
      }

      // Flag is right but the kind of context is not: partial credit.
      if (!sameType) {
        return {
          score: 0.5,
          message: `Injection status correct but wrong type: got ${actual.contextType}, expected ${wanted.contextType}`,
        };
      }

      return {
        score: 1,
        message: `Correct injection: ${actual.contextType}`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
119
|
+
|
|
120
|
+
/**
 * Scores how many of the required patterns occur in the injected context.
 *
 * For coordinator resumption the context MUST include:
 * - Swarm continuation instructions
 * - Tool names (swarm_status, swarmmail_inbox)
 * - Actionable language ("COORDINATOR", "Keep Cooking")
 *
 * Matching is a plain substring test against `injectedContext`.
 *
 * Score: fraction of `mustContain` entries found (0 to 1). With nothing
 * required the score is 1; with patterns required but no context injected it
 * is 0.
 */
export const requiredPatternsPresent = createScorer({
  name: "Required Patterns Present",
  description: "Validates injected context contains required patterns",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const wanted = expected as CompactionExpected;

      const injected = actual.contextInjected;
      const patterns = wanted.mustContain;

      // Nothing is required: pass, with a message reflecting whether
      // anything was injected.
      if (!patterns || patterns.length === 0) {
        return {
          score: 1,
          message: injected
            ? "No required patterns to check"
            : "No context injected (expected)",
        };
      }

      if (!injected) {
        return {
          score: 0,
          message: "No context injected but patterns were expected",
        };
      }

      // A single pass collects the misses; the hit count follows from it.
      const missing = patterns.filter(
        (pattern) => !actual.injectedContext.includes(pattern),
      );
      const foundCount = patterns.length - missing.length;

      if (missing.length === 0) {
        return {
          score: 1,
          message: `All ${patterns.length} required patterns found`,
        };
      }

      return {
        score: foundCount / patterns.length,
        message: `${foundCount}/${patterns.length} patterns found. Missing: ${missing.join(", ")}`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
189
|
+
|
|
190
|
+
/**
 * Scores whether the injected context is free of forbidden patterns.
 *
 * Context should NOT contain:
 * - Placeholder IDs ("bd-xxx")
 * - Generic/template language
 * - Markers belonging to the wrong context type
 *
 * Matching is a plain substring test against `injectedContext`.
 *
 * Score: 1 when no `mustNotContain` entry occurs (or nothing was injected, or
 * nothing is forbidden), 0 as soon as any entry occurs.
 */
export const forbiddenPatternsAbsent = createScorer({
  name: "Forbidden Patterns Absent",
  description: "Validates injected context excludes forbidden patterns",
  scorer: ({ output, expected }) => {
    try {
      const actual = JSON.parse(String(output)) as CompactionResult;
      const wanted = expected as CompactionExpected;

      // Without injected context there is nothing that could offend.
      if (!actual.contextInjected) {
        return {
          score: 1,
          message: "No context injected (no forbidden patterns possible)",
        };
      }

      const banned = wanted.mustNotContain;
      if (!banned || banned.length === 0) {
        return {
          score: 1,
          message: "No forbidden patterns to check",
        };
      }

      const offenders = banned.filter((pattern) =>
        actual.injectedContext.includes(pattern),
      );

      return offenders.length === 0
        ? {
            score: 1,
            message: "No forbidden patterns found",
          }
        : {
            score: 0,
            message: `Forbidden patterns found: ${offenders.join(", ")}`,
          };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to parse result: ${error}`,
      };
    }
  },
});
|
|
247
|
+
|
|
248
|
+
/**
 * Composite scorer: overall compaction quality.
 *
 * Runs the four compaction-specific scorers on the same eval data and
 * combines their scores into a weighted average:
 * - Confidence accuracy: 25%
 * - Context injection: 25%
 * - Required patterns: 30%
 * - Forbidden patterns: 20%
 *
 * Score: 0.0 to 1.0. If any child scorer throws, the composite scores 0 and
 * the message carries the error.
 */
export const compactionQuality = createScorer({
  name: "Overall Compaction Quality",
  description: "Composite score for compaction hook correctness",
  scorer: async ({ input, output, expected }) => {
    try {
      // A scorer built with `createScorer` is itself an async function that
      // takes the eval data; it has no `.scorer` member. Calling `.scorer`
      // would throw here and force every composite score to 0, so the child
      // scorers are invoked directly and awaited.
      const evalData = { input, output, expected };
      const [confidence, injection, required, forbidden] = await Promise.all([
        confidenceAccuracy(evalData),
        contextInjectionCorrectness(evalData),
        requiredPatternsPresent(evalData),
        forbiddenPatternsAbsent(evalData),
      ]);

      // Treat a missing child score as 0 so the arithmetic stays numeric.
      const scores = {
        confidence: confidence.score ?? 0,
        injection: injection.score ?? 0,
        required: required.score ?? 0,
        forbidden: forbidden.score ?? 0,
      };

      // Weights sum to 1, so the total stays within 0..1.
      const weights = {
        confidence: 0.25,
        injection: 0.25,
        required: 0.3,
        forbidden: 0.2,
      };

      const totalScore =
        scores.confidence * weights.confidence +
        scores.injection * weights.injection +
        scores.required * weights.required +
        scores.forbidden * weights.forbidden;

      const details = [
        `Confidence: ${(scores.confidence * 100).toFixed(0)}%`,
        `Injection: ${(scores.injection * 100).toFixed(0)}%`,
        `Required: ${(scores.required * 100).toFixed(0)}%`,
        `Forbidden: ${(scores.forbidden * 100).toFixed(0)}%`,
      ].join(", ");

      return {
        score: totalScore,
        message: `Overall: ${(totalScore * 100).toFixed(0)}% (${details})`,
      };
    } catch (error) {
      return {
        score: 0,
        message: `Failed to compute composite score: ${error}`,
      };
    }
  },
});
|
package/evals/scorers/index.ts
CHANGED
|
@@ -66,6 +66,18 @@ export {
|
|
|
66
66
|
noRework,
|
|
67
67
|
} from "./outcome-scorers.js";
|
|
68
68
|
|
|
69
|
+
// ============================================================================
|
|
70
|
+
// Compaction-specific scorers
|
|
71
|
+
// ============================================================================
|
|
72
|
+
|
|
73
|
+
// Re-export every scorer from the compaction scorer module (alphabetical).
export {
  compactionQuality,
  confidenceAccuracy,
  contextInjectionCorrectness,
  forbiddenPatternsAbsent,
  requiredPatternsPresent,
} from "./compaction-scorers.js";
|
|
80
|
+
|
|
69
81
|
/**
|
|
70
82
|
* Checks that subtasks cover the full task scope
|
|
71
83
|
*
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "opencode-swarm-plugin",
|
|
3
|
-
"version": "0.33.0",
|
|
3
|
+
"version": "0.35.0",
|
|
4
4
|
"description": "Multi-agent swarm coordination for OpenCode with learning capabilities, beads integration, and Agent Mail",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -39,7 +39,9 @@
|
|
|
39
39
|
"gray-matter": "^4.0.3",
|
|
40
40
|
"ioredis": "^5.4.1",
|
|
41
41
|
"minimatch": "^10.1.1",
|
|
42
|
-
"
|
|
42
|
+
"pino": "^9.6.0",
|
|
43
|
+
"pino-roll": "^1.3.0",
|
|
44
|
+
"swarm-mail": "1.5.0",
|
|
43
45
|
"yaml": "^2.8.2",
|
|
44
46
|
"zod": "4.1.8"
|
|
45
47
|
},
|
|
@@ -49,6 +51,7 @@
|
|
|
49
51
|
"ai": "6.0.0-beta.150",
|
|
50
52
|
"bun-types": "^1.3.4",
|
|
51
53
|
"evalite": "^1.0.0-beta.10",
|
|
54
|
+
"pino-pretty": "^13.1.3",
|
|
52
55
|
"turbo": "^2.6.3",
|
|
53
56
|
"typescript": "^5.7.0",
|
|
54
57
|
"vitest": "^4.0.15"
|