opencode-swarm-plugin 0.43.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cass.characterization.test.ts +422 -0
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.test.ts +68 -0
- package/bin/swarm.ts +81 -8
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/contributor-tools.d.ts +42 -0
- package/dist/contributor-tools.d.ts.map +1 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +12 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +7728 -62590
- package/dist/plugin.js +23833 -78695
- package/dist/sessions/agent-discovery.d.ts +59 -0
- package/dist/sessions/agent-discovery.d.ts.map +1 -0
- package/dist/sessions/index.d.ts +10 -0
- package/dist/sessions/index.d.ts.map +1 -0
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2255
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2426
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -935
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
|
@@ -1,345 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Tests for data-loader quality filters
|
|
3
|
-
*
|
|
4
|
-
* TDD approach: RED → GREEN → REFACTOR
|
|
5
|
-
*/
|
|
6
|
-
import { afterEach, beforeEach, describe, expect, test } from "bun:test";
|
|
7
|
-
import * as fs from "node:fs";
|
|
8
|
-
import * as os from "node:os";
|
|
9
|
-
import * as path from "node:path";
|
|
10
|
-
import type { CoordinatorEvent } from "../../src/eval-capture.js";
|
|
11
|
-
import { loadCapturedSessions } from "./data-loader.js";
|
|
12
|
-
|
|
13
|
-
// Scratch directory holding the session files of the test currently running.
let tempSessionDir: string;

// Give every test its own freshly created directory under the OS temp dir.
beforeEach(() => {
  const prefix = path.join(os.tmpdir(), "test-sessions-");
  tempSessionDir = fs.mkdtempSync(prefix);
});

// Remove the scratch directory (and everything in it) once the test is done.
afterEach(() => {
  if (!fs.existsSync(tempSessionDir)) {
    return;
  }
  fs.rmSync(tempSessionDir, { recursive: true });
});
|
|
25
|
-
|
|
26
|
-
/**
 * Test helper: write `events` to `<tempSessionDir>/<sessionId>.jsonl`.
 *
 * Each event is serialized as one JSON document per line, and the file always
 * ends with a trailing newline (an empty event list yields a file containing
 * just "\n").
 */
function createSessionFile(
  sessionId: string,
  events: CoordinatorEvent[],
): void {
  const serialized: string[] = [];
  for (const event of events) {
    serialized.push(JSON.stringify(event));
  }

  const target = path.join(tempSessionDir, `${sessionId}.jsonl`);
  fs.writeFileSync(target, `${serialized.join("\n")}\n`, "utf-8");
}
|
|
37
|
-
|
|
38
|
-
/**
 * Test helper: build a minimal coordinator event.
 *
 * Every event carries the session id, epic id, a fresh ISO timestamp and an
 * empty payload. `type` selects the event kind and decides which field
 * receives `subtype`:
 *   - "DECISION"  -> `decision_type`
 *   - "VIOLATION" -> `violation_type`
 *   - anything else (i.e. "OUTCOME") -> `outcome_type`
 *
 * `subtype` is deliberately not validated so tests can pass any string.
 */
function createEvent(
  sessionId: string,
  epicId: string,
  type: "DECISION" | "VIOLATION" | "OUTCOME",
  subtype: string,
): CoordinatorEvent {
  const common = {
    session_id: sessionId,
    epic_id: epicId,
    timestamp: new Date().toISOString(),
    payload: {},
  };

  switch (type) {
    case "DECISION":
      return {
        ...common,
        event_type: "DECISION" as const,
        decision_type: subtype as any,
      };
    case "VIOLATION":
      return {
        ...common,
        event_type: "VIOLATION" as const,
        violation_type: subtype as any,
      };
    default:
      return {
        ...common,
        event_type: "OUTCOME" as const,
        outcome_type: subtype as any,
      };
  }
}
|
|
74
|
-
|
|
75
|
-
describe("loadCapturedSessions - quality filters", () => {
  // An event is described by its kind plus its subtype.
  type EventSpec = readonly ["DECISION" | "VIOLATION" | "OUTCOME", string];

  // The handful of event shapes the fixtures below are built from.
  const SPAWN: EventSpec = ["DECISION", "worker_spawned"];
  const REVIEW: EventSpec = ["DECISION", "review_completed"];
  const STRATEGY: EventSpec = ["DECISION", "strategy_selected"];
  const DECOMPOSED: EventSpec = ["DECISION", "decomposition_complete"];
  const SUBTASK_OK: EventSpec = ["OUTCOME", "subtask_success"];
  const EPIC_DONE: EventSpec = ["OUTCOME", "epic_complete"];

  // Write one session file whose events all share the same session/epic ids.
  const seedSession = (
    sessionId: string,
    epicId: string,
    specs: readonly EventSpec[],
  ): void => {
    createSessionFile(
      sessionId,
      specs.map(([type, subtype]) =>
        createEvent(sessionId, epicId, type, subtype),
      ),
    );
  };

  test("filters out sessions with fewer than minEvents (default: 3)", async () => {
    // Three sessions with 2, 3 and 5 events respectively.
    seedSession("session-2-events", "epic-1", [SPAWN, SUBTASK_OK]);
    seedSession("session-3-events", "epic-2", [SPAWN, REVIEW, SUBTASK_OK]);
    seedSession("session-5-events", "epic-3", [
      SPAWN,
      REVIEW,
      SUBTASK_OK,
      SUBTASK_OK,
      EPIC_DONE,
    ]);

    const sessions = await loadCapturedSessions({
      minEvents: 3,
      sessionDir: tempSessionDir,
    });

    // Only the sessions with at least 3 events survive.
    const ids = sessions.map((s) => s.session.session_id);
    expect(sessions.length).toBe(2);
    expect(ids.includes("session-3-events")).toBe(true);
    expect(ids.includes("session-5-events")).toBe(true);
    expect(ids.includes("session-2-events")).toBe(false);
  });

  test("filters out sessions without worker_spawned event when requireWorkerSpawn=true", async () => {
    // One session that spawned a worker, one that never did.
    seedSession("session-with-spawn", "epic-1", [SPAWN, REVIEW, SUBTASK_OK]);
    seedSession("session-no-spawn", "epic-2", [
      STRATEGY,
      DECOMPOSED,
      EPIC_DONE,
    ]);

    const sessions = await loadCapturedSessions({
      requireWorkerSpawn: true,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-with-spawn");
  });

  test("filters out sessions without review_completed event when requireReview=true", async () => {
    // One reviewed session, one that skipped review.
    seedSession("session-with-review", "epic-1", [SPAWN, REVIEW, SUBTASK_OK]);
    seedSession("session-no-review", "epic-2", [SPAWN, SUBTASK_OK, EPIC_DONE]);

    const sessions = await loadCapturedSessions({
      requireReview: true,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-with-review");
  });

  test("allows disabling filters individually", async () => {
    // Two events only, no worker spawn, no review.
    seedSession("session-low-quality", "epic-1", [STRATEGY, EPIC_DONE]);

    // With every filter switched off the session must still be returned.
    const sessions = await loadCapturedSessions({
      minEvents: 0,
      requireWorkerSpawn: false,
      requireReview: false,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-low-quality");
  });

  test("applies limit AFTER filtering", async () => {
    // Sessions 1-5 pass every filter; sessions 6-8 are too short and get dropped.
    for (let i = 1; i <= 8; i++) {
      const specs = i <= 5 ? [SPAWN, REVIEW, SUBTASK_OK] : [STRATEGY];
      seedSession(`session-${i}`, `epic-${i}`, specs);
    }

    const sessions = await loadCapturedSessions({
      minEvents: 3,
      requireWorkerSpawn: true,
      requireReview: true,
      limit: 2,
      sessionDir: tempSessionDir,
    });

    // The limit picks 2 of the 5 high-quality sessions, never a filtered one.
    expect(sessions.length).toBe(2);
    expect(sessions.every((s) => s.session.events.length >= 3)).toBe(true);
  });

  test("combines all filters correctly", async () => {
    // Passes every filter.
    seedSession("session-high-quality", "epic-1", [
      SPAWN,
      REVIEW,
      SUBTASK_OK,
      EPIC_DONE,
    ]);
    // Fails requireWorkerSpawn.
    seedSession("session-no-spawn", "epic-2", [REVIEW, SUBTASK_OK, EPIC_DONE]);
    // Fails requireReview.
    seedSession("session-no-review", "epic-3", [SPAWN, SUBTASK_OK, EPIC_DONE]);
    // Fails minEvents.
    seedSession("session-too-few", "epic-4", [SPAWN, REVIEW]);

    const sessions = await loadCapturedSessions({
      minEvents: 3,
      requireWorkerSpawn: true,
      requireReview: true,
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-high-quality");
  });

  test("defaults are: minEvents=3, requireWorkerSpawn=true, requireReview=true", async () => {
    // One session satisfying the defaults, one that does not.
    seedSession("session-meets-defaults", "epic-1", [
      SPAWN,
      REVIEW,
      SUBTASK_OK,
    ]);
    seedSession("session-fails-defaults", "epic-2", [STRATEGY]);

    // Only sessionDir is supplied, so the default criteria are in effect.
    const sessions = await loadCapturedSessions({
      sessionDir: tempSessionDir,
    });

    expect(sessions.length).toBe(1);
    expect(sessions[0]?.session.session_id).toBe("session-meets-defaults");
  });
});
|
package/evals/lib/data-loader.ts
DELETED
|
@@ -1,281 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* PGlite-backed eval data loader
|
|
3
|
-
*
|
|
4
|
-
* Loads real decomposition outcomes from the eval_records table
|
|
5
|
-
* for use in Evalite evals.
|
|
6
|
-
*/
|
|
7
|
-
import * as fs from "node:fs";
|
|
8
|
-
import {
|
|
9
|
-
getEvalRecords,
|
|
10
|
-
getEvalStats,
|
|
11
|
-
type EvalRecord,
|
|
12
|
-
} from "swarm-mail";
|
|
13
|
-
|
|
14
|
-
/**
 * One eval case: the task handed to the decomposer, the bounds the result is
 * expected to satisfy, and (optionally) the stored record it was derived from.
 */
export interface EvalCase {
  /** Task description plus optional extra context given alongside it. */
  input: { task: string; context?: string };
  /** Expectations the decomposition is checked against. */
  expected: {
    /** Lower bound on the number of subtasks. */
    minSubtasks: number;
    /** Upper bound on the number of subtasks. */
    maxSubtasks: number;
    /** Files the subtasks are expected to cover, when known. */
    requiredFiles?: string[];
    /** Recorded overall success of the original run, when known. */
    overallSuccess?: boolean;
  };
  /** The underlying eval record, when the case was built from stored data. */
  actual?: EvalRecord;
}
|
|
24
|
-
|
|
25
|
-
/**
 * Load eval cases from PGlite.
 *
 * Fetches eval records for `projectKey` through `getEvalRecords` and maps each
 * record to an `EvalCase`:
 *   - `input` carries the record's task and context,
 *   - `expected.maxSubtasks` is the number of subtasks in the record,
 *   - `expected.minSubtasks` is 2, capped at `maxSubtasks` so the expected
 *     range is never empty for records with fewer than two subtasks,
 *   - `expected.requiredFiles` is every file listed by any subtask,
 *   - `actual` is the record itself.
 *
 * When `successOnly` is set, `limit` is applied AFTER the success filter so
 * callers receive up to `limit` successful cases rather than whatever
 * successes happen to fall inside the first `limit` records.
 *
 * @param projectKey - Project key for filtering records
 * @param options - Filter options (`limit`, `strategy`, `successOnly`,
 *   `projectPath`)
 * @returns Array of eval cases ready for Evalite
 */
export async function loadEvalCases(
  projectKey: string,
  options?: {
    limit?: number;
    strategy?: "file-based" | "feature-based" | "risk-based";
    successOnly?: boolean;
    projectPath?: string;
  },
): Promise<EvalCase[]> {
  const { limit, strategy, successOnly, projectPath } = options ?? {};

  // Only push the limit down into the query when no post-query filter runs;
  // otherwise the filter would shrink an already-truncated result set.
  const records = await getEvalRecords(
    projectKey,
    { limit: successOnly ? undefined : limit, strategy },
    projectPath,
  );

  let selected = records;
  if (successOnly) {
    const successful = records.filter((r) => r.overall_success === true);
    selected =
      typeof limit === "number"
        ? successful.slice(0, Math.max(0, limit))
        : successful;
  }

  // Transform to EvalCase format
  return selected.map((record) => ({
    input: {
      task: record.task,
      context: record.context ?? undefined,
    },
    expected: {
      // Never demand more subtasks than the recorded maximum.
      minSubtasks: Math.min(2, record.subtasks.length),
      maxSubtasks: record.subtasks.length,
      requiredFiles: record.subtasks.flatMap((s) => s.files),
      overallSuccess: record.overall_success ?? undefined,
    },
    actual: record,
  }));
}
|
|
70
|
-
|
|
71
|
-
/**
 * Report whether enough real eval records exist to run evals.
 *
 * Looks up the record statistics via `getEvalStats` and compares the total
 * record count against the required minimum.
 *
 * @param projectKey - Project key to check
 * @param minRecords - Minimum number of records required (default: 5)
 * @param projectPath - Optional project path for database lookup
 * @returns True if at least `minRecords` records exist
 */
export async function hasRealEvalData(
  projectKey: string,
  minRecords: number = 5,
  projectPath?: string,
): Promise<boolean> {
  const { totalRecords } = await getEvalStats(projectKey, projectPath);
  const enough = totalRecords >= minRecords;
  return enough;
}
|
|
87
|
-
|
|
88
|
-
/**
 * Get eval data stats for reporting.
 *
 * Returns the totals reported by `getEvalStats` together with a
 * `hasEnoughData` flag that is true when the record count reaches
 * `minRecords`.
 *
 * @param projectKey - Project key to query
 * @param projectPath - Optional project path for database lookup
 * @param minRecords - Record count at which `hasEnoughData` becomes true
 *   (default: 5, the previously hard-coded threshold)
 * @returns Summary of available eval data
 */
export async function getEvalDataSummary(
  projectKey: string,
  projectPath?: string,
  minRecords: number = 5,
): Promise<{
  totalRecords: number;
  successRate: number;
  byStrategy: Record<string, number>;
  hasEnoughData: boolean;
}> {
  const stats = await getEvalStats(projectKey, projectPath);

  return {
    totalRecords: stats.totalRecords,
    successRate: stats.successRate,
    byStrategy: stats.byStrategy,
    hasEnoughData: stats.totalRecords >= minRecords,
  };
}
|
|
113
|
-
|
|
114
|
-
/**
 * Decide whether a session passes the quality filters.
 *
 * A session passes when all of the following hold:
 *   1. it is not shorter than `criteria.minEvents` events,
 *   2. if `requireWorkerSpawn` is set, it contains a DECISION event whose
 *      decision type is "worker_spawned",
 *   3. if `requireReview` is set, it contains a DECISION event whose decision
 *      type is "review_completed".
 */
function meetsQualityCriteria(
  session: import("../../src/eval-capture.js").CoordinatorSession,
  criteria: {
    minEvents: number;
    requireWorkerSpawn: boolean;
    requireReview: boolean;
  },
): boolean {
  const { events } = session;

  // True when the session holds at least one DECISION of the given type.
  const hasDecision = (decisionType: string): boolean =>
    events.some(
      (e) => e.event_type === "DECISION" && e.decision_type === decisionType,
    );

  const longEnough = !(events.length < criteria.minEvents);

  return (
    longEnough &&
    (!criteria.requireWorkerSpawn || hasDecision("worker_spawned")) &&
    (!criteria.requireReview || hasDecision("review_completed"))
  );
}
|
|
153
|
-
|
|
154
|
-
/**
 * Load captured coordinator sessions from the session directory.
 *
 * Reads every `.jsonl` file in the directory (the one returned by
 * `getSessionDir()` unless `options.sessionDir` overrides it) and returns one
 * `CoordinatorSession` per file that has events, an epic id on its first
 * event, and passes the quality filters.
 *
 * Quality filters focus on high-signal coordinator sessions:
 * - minEvents: Filter out incomplete/aborted sessions (default: 3)
 * - requireWorkerSpawn: Ensure session delegated to workers (default: true)
 * - requireReview: Ensure coordinator reviewed work (default: true)
 *
 * Filters are applied BEFORE the limit, and session files are visited in
 * sorted name order so a given directory plus `limit` always yields the same
 * subset. Sessions that fail to load or validate are skipped with a warning.
 *
 * @param options - Filter options
 * @returns Array of coordinator sessions that meet quality criteria (empty if
 *   the session directory does not exist)
 */
export async function loadCapturedSessions(options?: {
  sessionIds?: string[];
  limit?: number;
  /** Minimum number of events required (default: 3) */
  minEvents?: number;
  /** Require at least one worker_spawned event (default: true) */
  requireWorkerSpawn?: boolean;
  /** Require at least one review_completed event (default: true) */
  requireReview?: boolean;
  /** Override session directory for testing */
  sessionDir?: string;
}): Promise<
  Array<{ session: import("../../src/eval-capture.js").CoordinatorSession }>
> {
  // Resolve everything needed from the capture module once, up front,
  // instead of re-importing the schema for every session file.
  const { getSessionDir, readSessionEvents, CoordinatorEventSchema } =
    await import("../../src/eval-capture.js");
  const sessionDir = options?.sessionDir ?? getSessionDir();

  // Default quality filters
  const qualityCriteria = {
    minEvents: options?.minEvents ?? 3,
    requireWorkerSpawn: options?.requireWorkerSpawn ?? true,
    requireReview: options?.requireReview ?? true,
  };

  // If session dir doesn't exist, return empty
  if (!fs.existsSync(sessionDir)) {
    return [];
  }

  const sessionExt = ".jsonl";
  const wantedIds = options?.sessionIds
    ? new Set(options.sessionIds)
    : undefined;

  // Derive session ids by stripping the extension from the END of the file
  // name, then keep only the requested ids (if any). Sorting makes the
  // iteration order independent of the platform's directory listing order.
  const sessionIds = fs
    .readdirSync(sessionDir)
    .filter((f) => f.endsWith(sessionExt))
    .map((f) => f.slice(0, -sessionExt.length))
    .filter((id) => wantedIds === undefined || wantedIds.has(id))
    .sort();

  const sessions: Array<{
    session: import("../../src/eval-capture.js").CoordinatorSession;
  }> = [];
  let filteredOutCount = 0;

  for (const sessionId of sessionIds) {
    try {
      let events: import("../../src/eval-capture.js").CoordinatorEvent[];

      // If custom sessionDir, read directly; otherwise use eval-capture functions
      if (options?.sessionDir) {
        const sessionPath = `${sessionDir}/${sessionId}${sessionExt}`;
        if (!fs.existsSync(sessionPath)) continue;

        // One JSON document per non-empty line, each validated by the schema.
        events = fs
          .readFileSync(sessionPath, "utf-8")
          .trim()
          .split("\n")
          .filter(Boolean)
          .map((line) => CoordinatorEventSchema.parse(JSON.parse(line)));
      } else {
        events = readSessionEvents(sessionId);
      }

      if (events.length === 0) continue;

      // The session's epic id is taken from its first event.
      const epicId = events[0]?.epic_id;
      if (!epicId) continue;

      const session: import("../../src/eval-capture.js").CoordinatorSession = {
        session_id: sessionId,
        epic_id: epicId,
        start_time: events[0]?.timestamp ?? new Date().toISOString(),
        end_time: events[events.length - 1]?.timestamp,
        events,
      };

      // Apply quality filters BEFORE limit
      if (meetsQualityCriteria(session, qualityCriteria)) {
        sessions.push({ session });
      } else {
        filteredOutCount++;
      }
    } catch (error) {
      // Skip invalid sessions
      console.warn(`Failed to load session ${sessionId}:`, error);
    }

    // Apply limit AFTER filtering
    if (options?.limit && sessions.length >= options.limit) {
      break;
    }
  }

  // Log filtering stats for visibility
  if (filteredOutCount > 0) {
    console.log(
      `Filtered out ${filteredOutCount} sessions (minEvents=${qualityCriteria.minEvents}, requireWorkerSpawn=${qualityCriteria.requireWorkerSpawn}, requireReview=${qualityCriteria.requireReview})`,
    );
  }

  return sessions;
}
|