opencode-swarm-plugin 0.38.0 → 0.40.0
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/.env +2 -0
- package/.hive/eval-results.json +26 -0
- package/.hive/issues.jsonl +27 -0
- package/.hive/memories.jsonl +23 -1
- package/.opencode/eval-history.jsonl +12 -0
- package/CHANGELOG.md +182 -0
- package/README.md +29 -12
- package/bin/swarm.test.ts +881 -0
- package/bin/swarm.ts +686 -0
- package/dist/compaction-hook.d.ts +8 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-observability.d.ts +173 -0
- package/dist/compaction-observability.d.ts.map +1 -0
- package/dist/compaction-prompt-scoring.d.ts +124 -0
- package/dist/compaction-prompt-scoring.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +174 -1
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/eval-gates.d.ts +84 -0
- package/dist/eval-gates.d.ts.map +1 -0
- package/dist/eval-history.d.ts +117 -0
- package/dist/eval-history.d.ts.map +1 -0
- package/dist/eval-learning.d.ts +216 -0
- package/dist/eval-learning.d.ts.map +1 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +80 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +16098 -651
- package/dist/plugin.js +16012 -756
- package/dist/post-compaction-tracker.d.ts +133 -0
- package/dist/post-compaction-tracker.d.ts.map +1 -0
- package/dist/schemas/task.d.ts +3 -3
- package/dist/swarm-orchestrate.d.ts +23 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts +25 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm.d.ts +4 -0
- package/dist/swarm.d.ts.map +1 -1
- package/evals/README.md +702 -105
- package/evals/compaction-prompt.eval.ts +149 -0
- package/evals/coordinator-behavior.eval.ts +8 -8
- package/evals/fixtures/compaction-prompt-cases.ts +305 -0
- package/evals/lib/compaction-loader.test.ts +248 -0
- package/evals/lib/compaction-loader.ts +320 -0
- package/evals/lib/data-loader.test.ts +345 -0
- package/evals/lib/data-loader.ts +107 -6
- package/evals/scorers/compaction-prompt-scorers.ts +145 -0
- package/evals/scorers/compaction-scorers.ts +13 -13
- package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
- package/evals/scorers/coordinator-discipline.ts +348 -15
- package/evals/scorers/index.test.ts +146 -0
- package/evals/scorers/index.ts +104 -0
- package/evals/swarm-decomposition.eval.ts +9 -2
- package/examples/commands/swarm.md +291 -21
- package/examples/plugin-wrapper-template.ts +117 -0
- package/package.json +7 -5
- package/scripts/migrate-unknown-sessions.ts +349 -0
- package/src/compaction-capture.integration.test.ts +257 -0
- package/src/compaction-hook.test.ts +42 -0
- package/src/compaction-hook.ts +315 -86
- package/src/compaction-observability.integration.test.ts +139 -0
- package/src/compaction-observability.test.ts +187 -0
- package/src/compaction-observability.ts +324 -0
- package/src/compaction-prompt-scorers.test.ts +299 -0
- package/src/compaction-prompt-scoring.ts +298 -0
- package/src/eval-capture.test.ts +626 -1
- package/src/eval-capture.ts +286 -2
- package/src/eval-gates.test.ts +306 -0
- package/src/eval-gates.ts +218 -0
- package/src/eval-history.test.ts +508 -0
- package/src/eval-history.ts +214 -0
- package/src/eval-learning.test.ts +378 -0
- package/src/eval-learning.ts +360 -0
- package/src/eval-runner.test.ts +96 -0
- package/src/eval-runner.ts +356 -0
- package/src/hive.ts +34 -0
- package/src/index.ts +115 -2
- package/src/memory.test.ts +110 -0
- package/src/memory.ts +34 -0
- package/src/post-compaction-tracker.test.ts +251 -0
- package/src/post-compaction-tracker.ts +237 -0
- package/src/swarm-decompose.ts +2 -2
- package/src/swarm-orchestrate.ts +2 -2
- package/src/swarm-prompts.ts +2 -2
- package/src/swarm-review.ts +3 -3
- package/dist/beads.d.ts +0 -386
- package/dist/beads.d.ts.map +0 -1
- package/dist/schemas/bead-events.d.ts +0 -698
- package/dist/schemas/bead-events.d.ts.map +0 -1
- package/dist/schemas/bead.d.ts +0 -255
- package/dist/schemas/bead.d.ts.map +0 -1
- /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0

package/src/eval-runner.ts
ADDED
@@ -0,0 +1,356 @@
+/**
+ * Programmatic Evalite Runner
+ *
+ * Provides a type-safe API for running evalite evals programmatically.
+ * Wraps evalite's runEvalite function with structured result parsing.
+ *
+ * @module eval-runner
+ */
+
+import { tool } from "@opencode-ai/plugin";
+import { runEvalite } from "evalite/runner";
+import { createInMemoryStorage } from "evalite/in-memory-storage";
+import type { Evalite } from "evalite/types";
+import fs from "node:fs/promises";
+import path from "node:path";
+
+/**
+ * Options for running evals programmatically
+ */
+export interface RunEvalsOptions {
+  /**
+   * Working directory containing eval files (defaults to process.cwd())
+   */
+  cwd?: string;
+
+  /**
+   * Optional filter to run specific eval suites (e.g., "coordinator", "compaction")
+   * Matches against eval file paths using substring matching
+   */
+  suiteFilter?: string;
+
+  /**
+   * Minimum average score threshold (0-100)
+   * If average score falls below this, result.success will be false
+   */
+  scoreThreshold?: number;
+
+  /**
+   * Optional path to write raw evalite JSON output
+   */
+  outputPath?: string;
+}
+
+/**
+ * Structured suite result with scores
+ */
+export interface SuiteResult {
+  /** Suite name from evalite() call */
+  name: string;
+
+  /** Absolute path to eval file */
+  filepath: string;
+
+  /** Suite status: success, fail, or running */
+  status: "success" | "fail" | "running";
+
+  /** Total duration in milliseconds */
+  duration: number;
+
+  /** Average score across all evals in suite (0-1 scale) */
+  averageScore: number;
+
+  /** Number of evals in this suite */
+  evalCount: number;
+
+  /** Individual eval results (optional, can be large) */
+  evals?: Array<{
+    input: unknown;
+    output: unknown;
+    expected?: unknown;
+    scores: Array<{
+      name: string;
+      score: number;
+      description?: string;
+    }>;
+  }>;
+}
+
+/**
+ * Structured result from running evals
+ */
+export interface RunEvalsResult {
+  /** Whether the run succeeded (all evals passed threshold) */
+  success: boolean;
+
+  /** Total number of suites executed */
+  totalSuites: number;
+
+  /** Total number of individual evals executed */
+  totalEvals: number;
+
+  /** Average score across all suites (0-1 scale) */
+  averageScore: number;
+
+  /** Individual suite results */
+  suites: SuiteResult[];
+
+  /** Error message if run failed */
+  error?: string;
+}
+
+/**
+ * Run evalite evals programmatically
+ *
+ * @param options - Configuration for eval run
+ * @returns Structured results with scores per suite
+ *
+ * @example
+ * ```typescript
+ * // Run all evals
+ * const result = await runEvals({ cwd: "/path/to/project" });
+ * console.log(`Average score: ${result.averageScore}`);
+ *
+ * // Run specific suite
+ * const coordResult = await runEvals({
+ *   cwd: "/path/to/project",
+ *   suiteFilter: "coordinator"
+ * });
+ *
+ * // Enforce score threshold
+ * const gatedResult = await runEvals({
+ *   cwd: "/path/to/project",
+ *   scoreThreshold: 80
+ * });
+ * if (!gatedResult.success) {
+ *   throw new Error(`Evals failed threshold: ${gatedResult.averageScore}`);
+ * }
+ * ```
+ */
+export async function runEvals(
+  options: RunEvalsOptions = {}
+): Promise<RunEvalsResult> {
+  const {
+    cwd = process.cwd(),
+    suiteFilter,
+    scoreThreshold,
+    outputPath: userOutputPath,
+  } = options;
+
+  try {
+    // Resolve to project root (evals are in evals/ relative to project root)
+    // If cwd is src/, go up one level
+    const projectRoot = cwd.endsWith("src") ? path.dirname(cwd) : cwd;
+    const evalsDir = path.join(projectRoot, "evals");
+    let evalPath: string | undefined;
+
+    if (suiteFilter) {
+      // Find matching eval files
+      try {
+        const files = await fs.readdir(evalsDir);
+        const matchingFiles = files.filter((f) =>
+          f.toLowerCase().includes(suiteFilter.toLowerCase())
+        );
+
+        if (matchingFiles.length === 0) {
+          // No matches - return empty result (not an error)
+          return {
+            success: true,
+            totalSuites: 0,
+            totalEvals: 0,
+            averageScore: 0,
+            suites: [],
+          };
+        }
+
+        // Use first matching file (evalite will discover all via vitest)
+        evalPath = path.join(evalsDir, matchingFiles[0]);
+      } catch (err) {
+        // Directory doesn't exist or can't be read
+        return {
+          success: false,
+          totalSuites: 0,
+          totalEvals: 0,
+          averageScore: 0,
+          suites: [],
+          error: `Failed to read evals directory: ${err instanceof Error ? err.message : String(err)}`,
+        };
+      }
+    } else {
+      // No filter - run all evals in evals/
+      evalPath = evalsDir;
+    }
+
+    // Use temporary output path if user didn't provide one
+    const outputPath =
+      userOutputPath || path.join(projectRoot, `.evalite-results-${Date.now()}.json`);
+    const isTemporaryOutput = !userOutputPath;
+
+    // Run evalite programmatically
+    const storage = createInMemoryStorage();
+
+    await runEvalite({
+      path: evalPath, // undefined = run all
+      cwd: projectRoot, // Use project root as working directory
+      mode: "run-once",
+      scoreThreshold,
+      outputPath,
+      hideTable: true, // Suppress terminal output
+      storage,
+      disableServer: true, // No UI server needed
+    });
+
+    // Parse output file for structured results
+    let outputJson: string;
+    try {
+      outputJson = await fs.readFile(outputPath, "utf-8");
+    } catch (err) {
+      // Output file wasn't written - evalite crashed or no tests ran
+      return {
+        success: false,
+        totalSuites: 0,
+        totalEvals: 0,
+        averageScore: 0,
+        suites: [],
+        error: `No results file generated: ${err instanceof Error ? err.message : String(err)}`,
+      };
+    }
+
+    const output: Evalite.Exported.Output = JSON.parse(outputJson);
+
+    // Clean up temporary output file
+    if (isTemporaryOutput) {
+      await fs.unlink(outputPath).catch(() => {
+        /* ignore cleanup errors */
+      });
+    }
+
+    // Transform to structured result
+    const suites: SuiteResult[] = output.suites.map((suite) => ({
+      name: suite.name,
+      filepath: suite.filepath,
+      status: suite.status,
+      duration: suite.duration,
+      averageScore: suite.averageScore,
+      evalCount: suite.evals.length,
+      // Include evals if user wants detailed results
+      evals: suite.evals.map((e) => ({
+        input: e.input,
+        output: e.output,
+        expected: e.expected,
+        scores: e.scores.map((s) => ({
+          name: s.name,
+          score: s.score,
+          description: s.description,
+        })),
+      })),
+    }));
+
+    // Calculate overall metrics
+    const totalEvals = suites.reduce((sum, s) => sum + s.evalCount, 0);
+    const averageScore =
+      suites.length > 0
+        ? suites.reduce((sum, s) => sum + s.averageScore, 0) / suites.length
+        : 0;
+
+    // Determine success based on threshold
+    const thresholdPassed =
+      scoreThreshold === undefined || averageScore * 100 >= scoreThreshold;
+
+    return {
+      success: thresholdPassed,
+      totalSuites: suites.length,
+      totalEvals,
+      averageScore,
+      suites,
+    };
+  } catch (error) {
+    // Return error result
+    return {
+      success: false,
+      totalSuites: 0,
+      totalEvals: 0,
+      averageScore: 0,
+      suites: [],
+      error:
+        error instanceof Error
+          ? error.message
+          : String(error),
+    };
+  }
+}
+
+// ============================================================================
+// Plugin Tool
+// ============================================================================
+
+/**
+ * Plugin tool for running evals programmatically
+ */
+const eval_run = tool({
+  description: `Run evalite evals programmatically and get structured results with scores.
+
+Use this to:
+- Run all evals in evals/ directory
+- Filter by specific eval suite (e.g., "coordinator", "compaction")
+- Enforce score thresholds for quality gates
+- Get per-suite and per-eval scores
+
+Returns structured JSON with:
+- success: boolean (true if all tests passed threshold)
+- totalSuites: number of eval suites run
+- totalEvals: number of individual test cases
+- averageScore: 0-1 score across all suites
+- suites: array of suite results with scores
+
+Example usage:
+- Run all evals: eval_run()
+- Run coordinator evals: eval_run({ suiteFilter: "coordinator" })
+- Enforce 80% threshold: eval_run({ scoreThreshold: 80 })`,
+
+  args: {
+    suiteFilter: tool.schema
+      .string()
+      .optional()
+      .describe(
+        'Optional filter to run specific eval suite (e.g., "coordinator", "compaction"). Matches against eval file paths using substring matching.'
+      ),
+    scoreThreshold: tool.schema
+      .number()
+      .optional()
+      .describe(
+        "Optional minimum average score threshold (0-100). If average score falls below this, result.success will be false. Useful for CI quality gates."
+      ),
+    includeDetailedResults: tool.schema
+      .boolean()
+      .optional()
+      .describe(
+        "Include individual eval results with input/output/scores in response. Set to false (default) for summary only to save token usage."
+      ),
+  },
+
+  execute: async (args) => {
+    const result = await runEvals({
+      cwd: process.cwd(),
+      suiteFilter: args.suiteFilter as string | undefined,
+      scoreThreshold: args.scoreThreshold as number | undefined,
+    });
+
+    // Remove detailed evals if not requested (saves tokens)
+    const includeDetails = args.includeDetailedResults === true;
+    if (!includeDetails) {
+      for (const suite of result.suites) {
+        delete suite.evals;
+      }
+    }

+    return JSON.stringify(result, null, 2);
+  },
+});
+
+/**
+ * All eval tools exported for registration
+ */
+export const evalTools = {
+  eval_run,
+} as const;
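
The new `eval_run` tool wraps `runEvals` for agent use, but the function can also drive a quality gate directly. The sketch below is illustrative only: `runEvals` and `RunEvalsResult` come from the file above, while the script itself (its name, location, and exit-code policy) is an assumption and not part of the package.

```typescript
// scripts/ci-eval-gate.ts (hypothetical) - fail CI when the eval average drops below 80%.
// Assumes it lives inside the package source tree so the relative import resolves.
import { runEvals } from "../src/eval-runner";

async function main(): Promise<void> {
  const result = await runEvals({
    cwd: process.cwd(),
    suiteFilter: "coordinator", // substring match against files in evals/
    scoreThreshold: 80,         // compared against averageScore * 100
  });

  if (result.error) {
    console.error(`Eval run failed: ${result.error}`);
    process.exit(1);
  }

  console.log(
    `${result.totalSuites} suites, ${result.totalEvals} evals, ` +
      `average ${(result.averageScore * 100).toFixed(1)}%`,
  );
  process.exit(result.success ? 0 : 1);
}

main();
```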
package/src/hive.ts
CHANGED
@@ -765,6 +765,40 @@ export const hive_create_epic = tool({
         error,
       );
     }
+
+    // Capture decomposition_complete event for eval scoring
+    try {
+      const { captureCoordinatorEvent } = await import("./eval-capture.js");
+
+      // Build files_per_subtask map (indexed by subtask index)
+      const filesPerSubtask: Record<number, string[]> = {};
+      validated.subtasks.forEach((subtask, index) => {
+        if (subtask.files && subtask.files.length > 0) {
+          filesPerSubtask[index] = subtask.files;
+        }
+      });
+
+      captureCoordinatorEvent({
+        session_id: ctx.sessionID || "unknown",
+        epic_id: epic.id,
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "decomposition_complete",
+        payload: {
+          subtask_count: validated.subtasks.length,
+          strategy_used: args.strategy || "feature-based",
+          files_per_subtask: filesPerSubtask,
+          epic_title: validated.epic_title,
+          task: args.task,
+        },
+      });
+    } catch (error) {
+      // Non-fatal - log and continue
+      console.warn(
+        "[hive_create_epic] Failed to capture decomposition_complete event:",
+        error,
+      );
+    }
   }

   // Sync cells to JSONL so spawned workers can see them immediately
package/src/index.ts
CHANGED
@@ -49,6 +49,7 @@ import { mandateTools } from "./mandates";
 import { memoryTools } from "./memory-tools";
 import { observabilityTools } from "./observability-tools";
 import { researchTools } from "./swarm-research";
+import { evalTools } from "./eval-runner";
 import {
   guardrailOutput,
   DEFAULT_GUARDRAIL_CONFIG,
@@ -175,6 +176,7 @@ const SwarmPlugin: Plugin = async (
       ...memoryTools,
       ...observabilityTools,
       ...researchTools,
+      ...evalTools,
     },

     /**
@@ -213,7 +215,7 @@ const SwarmPlugin: Plugin = async (
         if (isInCoordinatorContext()) {
           const ctx = getCoordinatorContext();
           const violation = detectCoordinatorViolation({
-            sessionId:
+            sessionId: input.sessionID || "unknown",
             epicId: ctx.epicId || "unknown",
             toolName,
             toolArgs: output.args as Record<string, unknown>,
@@ -729,6 +731,8 @@ export {
  * Includes:
  * - SWARM_COMPACTION_CONTEXT - Prompt text for swarm state preservation
  * - createCompactionHook - Factory function for the compaction hook
+ * - scanSessionMessages - Scan session for swarm state
+ * - ScannedSwarmState - Scanned state interface
  *
  * Usage:
  * ```typescript
@@ -739,7 +743,56 @@ export {
  * };
  * ```
  */
-export {
+export {
+  SWARM_COMPACTION_CONTEXT,
+  createCompactionHook,
+  scanSessionMessages,
+  type ScannedSwarmState,
+} from "./compaction-hook";
+
+/**
+ * Re-export compaction-observability module
+ *
+ * Includes:
+ * - CompactionPhase - Enum of compaction phases
+ * - createMetricsCollector - Create a metrics collector
+ * - recordPhaseStart, recordPhaseComplete - Phase timing
+ * - recordPatternExtracted, recordPatternSkipped - Pattern tracking
+ * - getMetricsSummary - Get metrics summary
+ *
+ * Types:
+ * - CompactionMetrics - Mutable metrics collector
+ * - CompactionMetricsSummary - Read-only summary snapshot
+ *
+ * Features:
+ * - Phase timing breakdown (START, GATHER, DETECT, INJECT, COMPLETE)
+ * - Pattern extraction tracking with reasons
+ * - Success rate calculation
+ * - Debug mode for verbose details
+ * - JSON serializable for persistence
+ *
+ * Usage:
+ * ```typescript
+ * import { createMetricsCollector, CompactionPhase, recordPhaseStart } from "opencode-swarm-plugin";
+ *
+ * const metrics = createMetricsCollector({ session_id: "abc123" });
+ * recordPhaseStart(metrics, CompactionPhase.DETECT);
+ * // ... work ...
+ * recordPhaseComplete(metrics, CompactionPhase.DETECT);
+ * const summary = getMetricsSummary(metrics);
+ * ```
+ */
+export {
+  CompactionPhase,
+  createMetricsCollector,
+  recordPhaseStart,
+  recordPhaseComplete,
+  recordPatternExtracted,
+  recordPatternSkipped,
+  getMetricsSummary,
+  type CompactionMetrics,
+  type CompactionMetricsSummary,
+} from "./compaction-observability";

 /**
  * Re-export memory module
@@ -771,6 +824,66 @@ export {
 } from "./memory-tools";
 export type { Memory, SearchResult, SearchOptions } from "swarm-mail";

+/**
+ * Re-export eval-history module
+ *
+ * Includes:
+ * - recordEvalRun - Record eval run to JSONL history
+ * - getScoreHistory - Get score history for a specific eval
+ * - getPhase - Get current phase based on run count and variance
+ * - calculateVariance - Calculate statistical variance of scores
+ * - ensureEvalHistoryDir - Ensure history directory exists
+ * - getEvalHistoryPath - Get path to eval history file
+ *
+ * Constants:
+ * - DEFAULT_EVAL_HISTORY_PATH - Default path (.opencode/eval-history.jsonl)
+ * - VARIANCE_THRESHOLD - Variance threshold for production phase (0.1)
+ * - BOOTSTRAP_THRESHOLD - Run count for bootstrap phase (10)
+ * - STABILIZATION_THRESHOLD - Run count for stabilization phase (50)
+ *
+ * Types:
+ * - Phase - Progressive phases (bootstrap | stabilization | production)
+ * - EvalRunRecord - Single eval run record
+ */
+export {
+  recordEvalRun,
+  getScoreHistory,
+  getPhase,
+  calculateVariance,
+  ensureEvalHistoryDir,
+  getEvalHistoryPath,
+  DEFAULT_EVAL_HISTORY_PATH,
+  VARIANCE_THRESHOLD,
+  BOOTSTRAP_THRESHOLD,
+  STABILIZATION_THRESHOLD,
+  type Phase,
+  type EvalRunRecord,
+} from "./eval-history";
+
+/**
+ * Re-export eval-gates module
+ *
+ * Includes:
+ * - checkGate - Check if current score passes quality gate
+ * - DEFAULT_THRESHOLDS - Default regression thresholds by phase
+ *
+ * Types:
+ * - GateResult - Result from gate check
+ * - GateConfig - Configuration for gate thresholds
+ *
+ * Features:
+ * - Phase-based regression thresholds (Bootstrap: none, Stabilization: 10%, Production: 5%)
+ * - Configurable thresholds via GateConfig
+ * - Clear pass/fail messages with baseline comparison
+ * - Handles edge cases (division by zero, no history)
+ */
+export {
+  checkGate,
+  DEFAULT_THRESHOLDS,
+  type GateResult,
+  type GateConfig,
+} from "./eval-gates";
+
 /**
  * Re-export logger infrastructure
  *
package/src/memory.test.ts
CHANGED
@@ -120,6 +120,116 @@ describe("memory adapter", () => {
       }
     });
   });
+
+  describe("upsert", () => {
+    test("returns ADD operation for new memory", async () => {
+      const result = await adapter.upsert({
+        information: "Completely new information about quantum computing",
+        tags: "quantum,physics",
+      });
+
+      expect(result.operation).toBe("ADD");
+      expect(result.reason).toBeDefined();
+      expect(result.memoryId).toBeDefined();
+      expect(result.memoryId).toMatch(/^mem_/);
+    });
+
+    test("returns UPDATE operation when refining existing memory", async () => {
+      // Store initial memory
+      await adapter.store({
+        information: "OAuth tokens need buffer",
+        tags: "auth",
+      });
+
+      // Try to upsert refined version
+      const result = await adapter.upsert({
+        information: "OAuth refresh tokens need 5min buffer before expiry to avoid race conditions",
+        tags: "auth,oauth,tokens",
+      });
+
+      expect(result.operation).toBe("UPDATE");
+      expect(result.reason).toBeDefined();
+      expect(result.memoryId).toBeDefined();
+    });
+
+    test("returns NOOP operation when information already exists", async () => {
+      // Store a memory
+      await adapter.store({
+        information: "Next.js 16 requires Suspense for Cache Components",
+        tags: "nextjs",
+      });
+
+      // Try to upsert same information
+      const result = await adapter.upsert({
+        information: "Next.js 16 requires Suspense for Cache Components",
+        tags: "nextjs",
+      });
+
+      expect(result.operation).toBe("NOOP");
+      expect(result.reason).toContain("already captured");
+    });
+
+    test("autoTag generates tags when enabled", async () => {
+      const result = await adapter.upsert({
+        information: "TypeScript interfaces are better than type aliases for object types",
+        autoTag: true,
+      });
+
+      expect(result.autoTags).toBeDefined();
+      expect(result.autoTags?.tags).toBeInstanceOf(Array);
+      expect(result.autoTags?.tags.length).toBeGreaterThan(0);
+    });
+
+    test("autoLink creates links when enabled", async () => {
+      // Store a base memory
+      await adapter.store({
+        information: "TypeScript supports structural typing",
+        tags: "typescript",
+      });
+
+      // Upsert related memory with autoLink
+      const result = await adapter.upsert({
+        information: "TypeScript interfaces use structural typing for shape matching",
+        autoLink: true,
+      });
+
+      if (result.linksCreated !== undefined) {
+        expect(result.linksCreated).toBeGreaterThanOrEqual(0);
+      }
+    });
+
+    test("extractEntities extracts entities when enabled", async () => {
+      const result = await adapter.upsert({
+        information: "React 19 introduces Server Components for Next.js 15",
+        extractEntities: true,
+      });
+
+      if (result.entitiesExtracted !== undefined) {
+        expect(result.entitiesExtracted).toBeGreaterThanOrEqual(0);
+      }
+    });
+
+    test("respects collection parameter", async () => {
+      const result = await adapter.upsert({
+        information: "Test memory in custom collection",
+        collection: "test-collection",
+      });
+
+      expect(result.memoryId).toBeDefined();
+      // Verify memory was stored in correct collection
+      const memory = await adapter.get({ id: result.memoryId! });
+      expect(memory?.collection).toBe("test-collection");
+    });
+
+    test("handles errors gracefully when information is missing", async () => {
+      await expect(async () => {
+        await (adapter.upsert as any)({
+          // Missing required information field
+          tags: "test",
+        });
+      }).toThrow();
+    });
+  });
 });

 describe("auto-migration on createMemoryAdapter", () => {