opencode-swarm-plugin 0.44.0 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/swarm.serve.test.ts +6 -4
- package/bin/swarm.ts +16 -10
- package/dist/compaction-prompt-scoring.js +139 -0
- package/dist/eval-capture.js +12811 -0
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.js +7644 -62599
- package/dist/plugin.js +23766 -78721
- package/dist/swarm-orchestrate.d.ts.map +1 -1
- package/dist/swarm-prompts.d.ts.map +1 -1
- package/dist/swarm-review.d.ts.map +1 -1
- package/package.json +17 -5
- package/.changeset/swarm-insights-data-layer.md +0 -63
- package/.hive/analysis/eval-failure-analysis-2025-12-25.md +0 -331
- package/.hive/analysis/session-data-quality-audit.md +0 -320
- package/.hive/eval-results.json +0 -483
- package/.hive/issues.jsonl +0 -138
- package/.hive/memories.jsonl +0 -729
- package/.opencode/eval-history.jsonl +0 -327
- package/.turbo/turbo-build.log +0 -9
- package/CHANGELOG.md +0 -2286
- package/SCORER-ANALYSIS.md +0 -598
- package/docs/analysis/subagent-coordination-patterns.md +0 -902
- package/docs/analysis-socratic-planner-pattern.md +0 -504
- package/docs/planning/ADR-001-monorepo-structure.md +0 -171
- package/docs/planning/ADR-002-package-extraction.md +0 -393
- package/docs/planning/ADR-003-performance-improvements.md +0 -451
- package/docs/planning/ADR-004-message-queue-features.md +0 -187
- package/docs/planning/ADR-005-devtools-observability.md +0 -202
- package/docs/planning/ADR-007-swarm-enhancements-worktree-review.md +0 -168
- package/docs/planning/ADR-008-worker-handoff-protocol.md +0 -293
- package/docs/planning/ADR-009-oh-my-opencode-patterns.md +0 -353
- package/docs/planning/ADR-010-cass-inhousing.md +0 -1215
- package/docs/planning/ROADMAP.md +0 -368
- package/docs/semantic-memory-cli-syntax.md +0 -123
- package/docs/swarm-mail-architecture.md +0 -1147
- package/docs/testing/context-recovery-test.md +0 -470
- package/evals/ARCHITECTURE.md +0 -1189
- package/evals/README.md +0 -768
- package/evals/compaction-prompt.eval.ts +0 -149
- package/evals/compaction-resumption.eval.ts +0 -289
- package/evals/coordinator-behavior.eval.ts +0 -307
- package/evals/coordinator-session.eval.ts +0 -154
- package/evals/evalite.config.ts.bak +0 -15
- package/evals/example.eval.ts +0 -31
- package/evals/fixtures/cass-baseline.ts +0 -217
- package/evals/fixtures/compaction-cases.ts +0 -350
- package/evals/fixtures/compaction-prompt-cases.ts +0 -311
- package/evals/fixtures/coordinator-sessions.ts +0 -328
- package/evals/fixtures/decomposition-cases.ts +0 -105
- package/evals/lib/compaction-loader.test.ts +0 -248
- package/evals/lib/compaction-loader.ts +0 -320
- package/evals/lib/data-loader.evalite-test.ts +0 -289
- package/evals/lib/data-loader.test.ts +0 -345
- package/evals/lib/data-loader.ts +0 -281
- package/evals/lib/llm.ts +0 -115
- package/evals/scorers/compaction-prompt-scorers.ts +0 -145
- package/evals/scorers/compaction-scorers.ts +0 -305
- package/evals/scorers/coordinator-discipline.evalite-test.ts +0 -539
- package/evals/scorers/coordinator-discipline.ts +0 -325
- package/evals/scorers/index.test.ts +0 -146
- package/evals/scorers/index.ts +0 -328
- package/evals/scorers/outcome-scorers.evalite-test.ts +0 -27
- package/evals/scorers/outcome-scorers.ts +0 -349
- package/evals/swarm-decomposition.eval.ts +0 -121
- package/examples/commands/swarm.md +0 -745
- package/examples/plugin-wrapper-template.ts +0 -2515
- package/examples/skills/hive-workflow/SKILL.md +0 -212
- package/examples/skills/skill-creator/SKILL.md +0 -223
- package/examples/skills/swarm-coordination/SKILL.md +0 -292
- package/global-skills/cli-builder/SKILL.md +0 -344
- package/global-skills/cli-builder/references/advanced-patterns.md +0 -244
- package/global-skills/learning-systems/SKILL.md +0 -644
- package/global-skills/skill-creator/LICENSE.txt +0 -202
- package/global-skills/skill-creator/SKILL.md +0 -352
- package/global-skills/skill-creator/references/output-patterns.md +0 -82
- package/global-skills/skill-creator/references/workflows.md +0 -28
- package/global-skills/swarm-coordination/SKILL.md +0 -995
- package/global-skills/swarm-coordination/references/coordinator-patterns.md +0 -235
- package/global-skills/swarm-coordination/references/strategies.md +0 -138
- package/global-skills/system-design/SKILL.md +0 -213
- package/global-skills/testing-patterns/SKILL.md +0 -430
- package/global-skills/testing-patterns/references/dependency-breaking-catalog.md +0 -586
- package/opencode-swarm-plugin-0.30.7.tgz +0 -0
- package/opencode-swarm-plugin-0.31.0.tgz +0 -0
- package/scripts/cleanup-test-memories.ts +0 -346
- package/scripts/init-skill.ts +0 -222
- package/scripts/migrate-unknown-sessions.ts +0 -349
- package/scripts/validate-skill.ts +0 -204
- package/src/agent-mail.ts +0 -1724
- package/src/anti-patterns.test.ts +0 -1167
- package/src/anti-patterns.ts +0 -448
- package/src/compaction-capture.integration.test.ts +0 -257
- package/src/compaction-hook.test.ts +0 -838
- package/src/compaction-hook.ts +0 -1204
- package/src/compaction-observability.integration.test.ts +0 -139
- package/src/compaction-observability.test.ts +0 -187
- package/src/compaction-observability.ts +0 -324
- package/src/compaction-prompt-scorers.test.ts +0 -475
- package/src/compaction-prompt-scoring.ts +0 -300
- package/src/contributor-tools.test.ts +0 -133
- package/src/contributor-tools.ts +0 -201
- package/src/dashboard.test.ts +0 -611
- package/src/dashboard.ts +0 -462
- package/src/error-enrichment.test.ts +0 -403
- package/src/error-enrichment.ts +0 -219
- package/src/eval-capture.test.ts +0 -1015
- package/src/eval-capture.ts +0 -929
- package/src/eval-gates.test.ts +0 -306
- package/src/eval-gates.ts +0 -218
- package/src/eval-history.test.ts +0 -508
- package/src/eval-history.ts +0 -214
- package/src/eval-learning.test.ts +0 -378
- package/src/eval-learning.ts +0 -360
- package/src/eval-runner.test.ts +0 -223
- package/src/eval-runner.ts +0 -402
- package/src/export-tools.test.ts +0 -476
- package/src/export-tools.ts +0 -257
- package/src/hive.integration.test.ts +0 -2241
- package/src/hive.ts +0 -1628
- package/src/index.ts +0 -940
- package/src/learning.integration.test.ts +0 -1815
- package/src/learning.ts +0 -1079
- package/src/logger.test.ts +0 -189
- package/src/logger.ts +0 -135
- package/src/mandate-promotion.test.ts +0 -473
- package/src/mandate-promotion.ts +0 -239
- package/src/mandate-storage.integration.test.ts +0 -601
- package/src/mandate-storage.test.ts +0 -578
- package/src/mandate-storage.ts +0 -794
- package/src/mandates.ts +0 -540
- package/src/memory-tools.test.ts +0 -195
- package/src/memory-tools.ts +0 -344
- package/src/memory.integration.test.ts +0 -334
- package/src/memory.test.ts +0 -158
- package/src/memory.ts +0 -527
- package/src/model-selection.test.ts +0 -188
- package/src/model-selection.ts +0 -68
- package/src/observability-tools.test.ts +0 -359
- package/src/observability-tools.ts +0 -871
- package/src/output-guardrails.test.ts +0 -438
- package/src/output-guardrails.ts +0 -381
- package/src/pattern-maturity.test.ts +0 -1160
- package/src/pattern-maturity.ts +0 -525
- package/src/planning-guardrails.test.ts +0 -491
- package/src/planning-guardrails.ts +0 -438
- package/src/plugin.ts +0 -23
- package/src/post-compaction-tracker.test.ts +0 -251
- package/src/post-compaction-tracker.ts +0 -237
- package/src/query-tools.test.ts +0 -636
- package/src/query-tools.ts +0 -324
- package/src/rate-limiter.integration.test.ts +0 -466
- package/src/rate-limiter.ts +0 -774
- package/src/replay-tools.test.ts +0 -496
- package/src/replay-tools.ts +0 -240
- package/src/repo-crawl.integration.test.ts +0 -441
- package/src/repo-crawl.ts +0 -610
- package/src/schemas/cell-events.test.ts +0 -347
- package/src/schemas/cell-events.ts +0 -807
- package/src/schemas/cell.ts +0 -257
- package/src/schemas/evaluation.ts +0 -166
- package/src/schemas/index.test.ts +0 -199
- package/src/schemas/index.ts +0 -286
- package/src/schemas/mandate.ts +0 -232
- package/src/schemas/swarm-context.ts +0 -115
- package/src/schemas/task.ts +0 -161
- package/src/schemas/worker-handoff.test.ts +0 -302
- package/src/schemas/worker-handoff.ts +0 -131
- package/src/sessions/agent-discovery.test.ts +0 -137
- package/src/sessions/agent-discovery.ts +0 -112
- package/src/sessions/index.ts +0 -15
- package/src/skills.integration.test.ts +0 -1192
- package/src/skills.test.ts +0 -643
- package/src/skills.ts +0 -1549
- package/src/storage.integration.test.ts +0 -341
- package/src/storage.ts +0 -884
- package/src/structured.integration.test.ts +0 -817
- package/src/structured.test.ts +0 -1046
- package/src/structured.ts +0 -762
- package/src/swarm-decompose.test.ts +0 -188
- package/src/swarm-decompose.ts +0 -1302
- package/src/swarm-deferred.integration.test.ts +0 -157
- package/src/swarm-deferred.test.ts +0 -38
- package/src/swarm-insights.test.ts +0 -214
- package/src/swarm-insights.ts +0 -459
- package/src/swarm-mail.integration.test.ts +0 -970
- package/src/swarm-mail.ts +0 -739
- package/src/swarm-orchestrate.integration.test.ts +0 -282
- package/src/swarm-orchestrate.test.ts +0 -548
- package/src/swarm-orchestrate.ts +0 -3084
- package/src/swarm-prompts.test.ts +0 -1270
- package/src/swarm-prompts.ts +0 -2077
- package/src/swarm-research.integration.test.ts +0 -701
- package/src/swarm-research.test.ts +0 -698
- package/src/swarm-research.ts +0 -472
- package/src/swarm-review.integration.test.ts +0 -285
- package/src/swarm-review.test.ts +0 -879
- package/src/swarm-review.ts +0 -709
- package/src/swarm-strategies.ts +0 -407
- package/src/swarm-worktree.test.ts +0 -501
- package/src/swarm-worktree.ts +0 -575
- package/src/swarm.integration.test.ts +0 -2377
- package/src/swarm.ts +0 -38
- package/src/tool-adapter.integration.test.ts +0 -1221
- package/src/tool-availability.ts +0 -461
- package/tsconfig.json +0 -28
package/evals/lib/data-loader.ts
DELETED
|
@@ -1,281 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* PGlite-backed eval data loader
|
|
3
|
-
*
|
|
4
|
-
* Loads real decomposition outcomes from the eval_records table
|
|
5
|
-
* for use in Evalite evals.
|
|
6
|
-
*/
|
|
7
|
-
import * as fs from "node:fs";
|
|
8
|
-
import {
|
|
9
|
-
getEvalRecords,
|
|
10
|
-
getEvalStats,
|
|
11
|
-
type EvalRecord,
|
|
12
|
-
} from "swarm-mail";
|
|
13
|
-
|
|
14
|
-
export interface EvalCase {
|
|
15
|
-
input: { task: string; context?: string };
|
|
16
|
-
expected: {
|
|
17
|
-
minSubtasks: number;
|
|
18
|
-
maxSubtasks: number;
|
|
19
|
-
requiredFiles?: string[];
|
|
20
|
-
overallSuccess?: boolean;
|
|
21
|
-
};
|
|
22
|
-
actual?: EvalRecord;
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
/**
|
|
26
|
-
* Load eval cases from PGlite
|
|
27
|
-
*
|
|
28
|
-
* @param projectKey - Project key for filtering records
|
|
29
|
-
* @param options - Filter options
|
|
30
|
-
* @returns Array of eval cases ready for Evalite
|
|
31
|
-
*/
|
|
32
|
-
export async function loadEvalCases(
|
|
33
|
-
projectKey: string,
|
|
34
|
-
options?: {
|
|
35
|
-
limit?: number;
|
|
36
|
-
strategy?: "file-based" | "feature-based" | "risk-based";
|
|
37
|
-
successOnly?: boolean;
|
|
38
|
-
projectPath?: string;
|
|
39
|
-
},
|
|
40
|
-
): Promise<EvalCase[]> {
|
|
41
|
-
const { limit, strategy, successOnly, projectPath } = options ?? {};
|
|
42
|
-
|
|
43
|
-
// Query eval records from PGlite
|
|
44
|
-
const records = await getEvalRecords(
|
|
45
|
-
projectKey,
|
|
46
|
-
{ limit, strategy },
|
|
47
|
-
projectPath,
|
|
48
|
-
);
|
|
49
|
-
|
|
50
|
-
// Filter by success if requested
|
|
51
|
-
const filtered = successOnly
|
|
52
|
-
? records.filter((r) => r.overall_success === true)
|
|
53
|
-
: records;
|
|
54
|
-
|
|
55
|
-
// Transform to EvalCase format
|
|
56
|
-
return filtered.map((record) => ({
|
|
57
|
-
input: {
|
|
58
|
-
task: record.task,
|
|
59
|
-
context: record.context ?? undefined,
|
|
60
|
-
},
|
|
61
|
-
expected: {
|
|
62
|
-
minSubtasks: 2,
|
|
63
|
-
maxSubtasks: record.subtasks.length,
|
|
64
|
-
requiredFiles: record.subtasks.flatMap((s) => s.files),
|
|
65
|
-
overallSuccess: record.overall_success ?? undefined,
|
|
66
|
-
},
|
|
67
|
-
actual: record,
|
|
68
|
-
}));
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
/**
|
|
72
|
-
* Check if we have enough real data to run evals
|
|
73
|
-
*
|
|
74
|
-
* @param projectKey - Project key to check
|
|
75
|
-
* @param minRecords - Minimum number of records required (default: 5)
|
|
76
|
-
* @param projectPath - Optional project path for database lookup
|
|
77
|
-
* @returns True if enough data exists
|
|
78
|
-
*/
|
|
79
|
-
export async function hasRealEvalData(
|
|
80
|
-
projectKey: string,
|
|
81
|
-
minRecords: number = 5,
|
|
82
|
-
projectPath?: string,
|
|
83
|
-
): Promise<boolean> {
|
|
84
|
-
const stats = await getEvalStats(projectKey, projectPath);
|
|
85
|
-
return stats.totalRecords >= minRecords;
|
|
86
|
-
}
|
|
87
|
-
|
|
88
|
-
/**
|
|
89
|
-
* Get eval data stats for reporting
|
|
90
|
-
*
|
|
91
|
-
* @param projectKey - Project key to query
|
|
92
|
-
* @param projectPath - Optional project path for database lookup
|
|
93
|
-
* @returns Summary of available eval data
|
|
94
|
-
*/
|
|
95
|
-
export async function getEvalDataSummary(
|
|
96
|
-
projectKey: string,
|
|
97
|
-
projectPath?: string,
|
|
98
|
-
): Promise<{
|
|
99
|
-
totalRecords: number;
|
|
100
|
-
successRate: number;
|
|
101
|
-
byStrategy: Record<string, number>;
|
|
102
|
-
hasEnoughData: boolean;
|
|
103
|
-
}> {
|
|
104
|
-
const stats = await getEvalStats(projectKey, projectPath);
|
|
105
|
-
|
|
106
|
-
return {
|
|
107
|
-
totalRecords: stats.totalRecords,
|
|
108
|
-
successRate: stats.successRate,
|
|
109
|
-
byStrategy: stats.byStrategy,
|
|
110
|
-
hasEnoughData: stats.totalRecords >= 5,
|
|
111
|
-
};
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
/**
|
|
115
|
-
* Check if a session meets quality criteria
|
|
116
|
-
*/
|
|
117
|
-
function meetsQualityCriteria(
|
|
118
|
-
session: import("../../src/eval-capture.js").CoordinatorSession,
|
|
119
|
-
criteria: {
|
|
120
|
-
minEvents: number;
|
|
121
|
-
requireWorkerSpawn: boolean;
|
|
122
|
-
requireReview: boolean;
|
|
123
|
-
},
|
|
124
|
-
): boolean {
|
|
125
|
-
// Filter 1: minEvents
|
|
126
|
-
if (session.events.length < criteria.minEvents) {
|
|
127
|
-
return false;
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
// Filter 2: requireWorkerSpawn
|
|
131
|
-
if (
|
|
132
|
-
criteria.requireWorkerSpawn &&
|
|
133
|
-
!session.events.some(
|
|
134
|
-
(e) => e.event_type === "DECISION" && e.decision_type === "worker_spawned",
|
|
135
|
-
)
|
|
136
|
-
) {
|
|
137
|
-
return false;
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
// Filter 3: requireReview
|
|
141
|
-
if (
|
|
142
|
-
criteria.requireReview &&
|
|
143
|
-
!session.events.some(
|
|
144
|
-
(e) =>
|
|
145
|
-
e.event_type === "DECISION" && e.decision_type === "review_completed",
|
|
146
|
-
)
|
|
147
|
-
) {
|
|
148
|
-
return false;
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
return true;
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
/**
|
|
155
|
-
* Load captured coordinator sessions from ~/.config/swarm-tools/sessions/
|
|
156
|
-
*
|
|
157
|
-
* Reads all JSONL session files and returns CoordinatorSession objects.
|
|
158
|
-
*
|
|
159
|
-
* Quality filters are applied to focus on high-signal coordinator sessions:
|
|
160
|
-
* - minEvents: Filter out incomplete/aborted sessions (default: 3)
|
|
161
|
-
* - requireWorkerSpawn: Ensure session delegated to workers (default: true)
|
|
162
|
-
* - requireReview: Ensure coordinator reviewed work (default: true)
|
|
163
|
-
*
|
|
164
|
-
* Filters are applied BEFORE the limit for accurate sampling.
|
|
165
|
-
*
|
|
166
|
-
* @param options - Filter options
|
|
167
|
-
* @returns Array of coordinator sessions that meet quality criteria
|
|
168
|
-
*/
|
|
169
|
-
export async function loadCapturedSessions(options?: {
|
|
170
|
-
sessionIds?: string[];
|
|
171
|
-
limit?: number;
|
|
172
|
-
/** Minimum number of events required (default: 3) */
|
|
173
|
-
minEvents?: number;
|
|
174
|
-
/** Require at least one worker_spawned event (default: true) */
|
|
175
|
-
requireWorkerSpawn?: boolean;
|
|
176
|
-
/** Require at least one review_completed event (default: true) */
|
|
177
|
-
requireReview?: boolean;
|
|
178
|
-
/** Override session directory for testing */
|
|
179
|
-
sessionDir?: string;
|
|
180
|
-
}): Promise<
|
|
181
|
-
Array<{ session: import("../../src/eval-capture.js").CoordinatorSession }>
|
|
182
|
-
> {
|
|
183
|
-
const { getSessionDir, readSessionEvents, saveSession } = await import(
|
|
184
|
-
"../../src/eval-capture.js"
|
|
185
|
-
);
|
|
186
|
-
const sessionDir = options?.sessionDir ?? getSessionDir();
|
|
187
|
-
|
|
188
|
-
// Default quality filters
|
|
189
|
-
const qualityCriteria = {
|
|
190
|
-
minEvents: options?.minEvents ?? 3,
|
|
191
|
-
requireWorkerSpawn: options?.requireWorkerSpawn ?? true,
|
|
192
|
-
requireReview: options?.requireReview ?? true,
|
|
193
|
-
};
|
|
194
|
-
|
|
195
|
-
// If session dir doesn't exist, return empty
|
|
196
|
-
if (!fs.existsSync(sessionDir)) {
|
|
197
|
-
return [];
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
// Read all .jsonl files in session directory
|
|
201
|
-
const files = fs
|
|
202
|
-
.readdirSync(sessionDir)
|
|
203
|
-
.filter((f) => f.endsWith(".jsonl"));
|
|
204
|
-
|
|
205
|
-
// Filter by sessionIds if provided
|
|
206
|
-
const targetFiles = options?.sessionIds
|
|
207
|
-
? files.filter((f) => options.sessionIds?.includes(f.replace(".jsonl", "")))
|
|
208
|
-
: files;
|
|
209
|
-
|
|
210
|
-
// Load each session
|
|
211
|
-
const sessions: Array<{
|
|
212
|
-
session: import("../../src/eval-capture.js").CoordinatorSession;
|
|
213
|
-
}> = [];
|
|
214
|
-
let filteredOutCount = 0;
|
|
215
|
-
|
|
216
|
-
for (const file of targetFiles) {
|
|
217
|
-
const sessionId = file.replace(".jsonl", "");
|
|
218
|
-
|
|
219
|
-
try {
|
|
220
|
-
let events: import("../../src/eval-capture.js").CoordinatorEvent[];
|
|
221
|
-
|
|
222
|
-
// If custom sessionDir, read directly; otherwise use eval-capture functions
|
|
223
|
-
if (options?.sessionDir) {
|
|
224
|
-
const sessionPath = `${sessionDir}/${sessionId}.jsonl`;
|
|
225
|
-
if (!fs.existsSync(sessionPath)) continue;
|
|
226
|
-
|
|
227
|
-
const content = fs.readFileSync(sessionPath, "utf-8");
|
|
228
|
-
const lines = content.trim().split("\n").filter(Boolean);
|
|
229
|
-
const { CoordinatorEventSchema } = await import(
|
|
230
|
-
"../../src/eval-capture.js"
|
|
231
|
-
);
|
|
232
|
-
events = lines.map((line) => {
|
|
233
|
-
const parsed = JSON.parse(line);
|
|
234
|
-
return CoordinatorEventSchema.parse(parsed);
|
|
235
|
-
});
|
|
236
|
-
} else {
|
|
237
|
-
events = readSessionEvents(sessionId);
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
if (events.length === 0) continue;
|
|
241
|
-
|
|
242
|
-
// Find epic_id from first event
|
|
243
|
-
const epicId = events[0]?.epic_id;
|
|
244
|
-
if (!epicId) continue;
|
|
245
|
-
|
|
246
|
-
// Build session object
|
|
247
|
-
const session: import("../../src/eval-capture.js").CoordinatorSession = {
|
|
248
|
-
session_id: sessionId,
|
|
249
|
-
epic_id: epicId,
|
|
250
|
-
start_time: events[0]?.timestamp ?? new Date().toISOString(),
|
|
251
|
-
end_time: events[events.length - 1]?.timestamp,
|
|
252
|
-
events,
|
|
253
|
-
};
|
|
254
|
-
if (!session) continue;
|
|
255
|
-
|
|
256
|
-
// Apply quality filters BEFORE limit
|
|
257
|
-
if (meetsQualityCriteria(session, qualityCriteria)) {
|
|
258
|
-
sessions.push({ session });
|
|
259
|
-
} else {
|
|
260
|
-
filteredOutCount++;
|
|
261
|
-
}
|
|
262
|
-
} catch (error) {
|
|
263
|
-
// Skip invalid sessions
|
|
264
|
-
console.warn(`Failed to load session ${sessionId}:`, error);
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
// Apply limit AFTER filtering
|
|
268
|
-
if (options?.limit && sessions.length >= options.limit) {
|
|
269
|
-
break;
|
|
270
|
-
}
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
// Log filtering stats for visibility
|
|
274
|
-
if (filteredOutCount > 0) {
|
|
275
|
-
console.log(
|
|
276
|
-
`Filtered out ${filteredOutCount} sessions (minEvents=${qualityCriteria.minEvents}, requireWorkerSpawn=${qualityCriteria.requireWorkerSpawn}, requireReview=${qualityCriteria.requireReview})`,
|
|
277
|
-
);
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
return sessions;
|
|
281
|
-
}
|
package/evals/lib/llm.ts
DELETED
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* LLM Client for Evalite Evals
|
|
3
|
-
*
|
|
4
|
-
* Uses AI SDK v6 with Vercel AI Gateway.
|
|
5
|
-
* Gateway handles provider routing - just pass "provider/model" string.
|
|
6
|
-
*
|
|
7
|
-
* @module evals/lib/llm
|
|
8
|
-
*/
|
|
9
|
-
import { generateText, gateway } from "ai";
|
|
10
|
-
import type { GatewayModelId } from "ai";
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* Default model for decomposition evals
|
|
14
|
-
* Using Claude Sonnet for good balance of quality and cost
|
|
15
|
-
*/
|
|
16
|
-
export const DEFAULT_MODEL: GatewayModelId = "anthropic/claude-sonnet-4-5";
|
|
17
|
-
|
|
18
|
-
/**
|
|
19
|
-
* Generate a decomposition from a task description
|
|
20
|
-
*
|
|
21
|
-
* @param prompt - The full decomposition prompt
|
|
22
|
-
* @param model - Gateway model ID (e.g., "anthropic/claude-sonnet-4-5")
|
|
23
|
-
* @returns The raw text response from the LLM
|
|
24
|
-
*/
|
|
25
|
-
export async function generateDecomposition(
|
|
26
|
-
prompt: string,
|
|
27
|
-
model: GatewayModelId = DEFAULT_MODEL,
|
|
28
|
-
): Promise<string> {
|
|
29
|
-
const { text } = await generateText({
|
|
30
|
-
model: gateway(model),
|
|
31
|
-
prompt,
|
|
32
|
-
maxOutputTokens: 4096,
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
return text;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* Format a decomposition prompt from task and context
|
|
40
|
-
*
|
|
41
|
-
* Uses the same prompt template as swarm_plan_prompt
|
|
42
|
-
*/
|
|
43
|
-
export function formatDecompositionPrompt(
|
|
44
|
-
task: string,
|
|
45
|
-
context?: string,
|
|
46
|
-
maxSubtasks: number = 6,
|
|
47
|
-
): string {
|
|
48
|
-
const contextSection = context ? `## Context\n${context}` : "";
|
|
49
|
-
|
|
50
|
-
return `You are decomposing a task into parallelizable subtasks for a swarm of agents.
|
|
51
|
-
|
|
52
|
-
## Task
|
|
53
|
-
${task}
|
|
54
|
-
|
|
55
|
-
${contextSection}
|
|
56
|
-
|
|
57
|
-
## Requirements
|
|
58
|
-
|
|
59
|
-
1. **Break into 2-${maxSubtasks} independent subtasks** that can run in parallel
|
|
60
|
-
2. **Assign files** - each subtask must specify which files it will modify
|
|
61
|
-
3. **No file overlap** - files cannot appear in multiple subtasks (they get exclusive locks)
|
|
62
|
-
4. **Order by dependency** - if subtask B needs subtask A's output, A must come first in the array
|
|
63
|
-
5. **Estimate complexity** - 1 (trivial) to 5 (complex)
|
|
64
|
-
|
|
65
|
-
## Response Format
|
|
66
|
-
|
|
67
|
-
Respond with ONLY a JSON object matching this schema (no markdown, no explanation):
|
|
68
|
-
|
|
69
|
-
{
|
|
70
|
-
"epic": {
|
|
71
|
-
"title": "string",
|
|
72
|
-
"description": "string"
|
|
73
|
-
},
|
|
74
|
-
"subtasks": [
|
|
75
|
-
{
|
|
76
|
-
"title": "string",
|
|
77
|
-
"description": "string",
|
|
78
|
-
"files": ["string"],
|
|
79
|
-
"dependencies": [0],
|
|
80
|
-
"estimated_complexity": 1
|
|
81
|
-
}
|
|
82
|
-
]
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
## Guidelines
|
|
86
|
-
|
|
87
|
-
- **Plan aggressively** - when in doubt, split further
|
|
88
|
-
- **Prefer smaller, focused subtasks** over large complex ones
|
|
89
|
-
- **Include test files** in the same subtask as the code they test
|
|
90
|
-
- **Be specific about files** - use actual file paths, not placeholders
|
|
91
|
-
|
|
92
|
-
Now decompose the task. Respond with JSON only:`;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
/**
|
|
96
|
-
* Extract JSON from LLM response
|
|
97
|
-
*
|
|
98
|
-
* Handles responses that may have markdown code blocks or extra text
|
|
99
|
-
*/
|
|
100
|
-
export function extractJson(text: string): string {
|
|
101
|
-
// Try to find JSON in code blocks first
|
|
102
|
-
const codeBlockMatch = text.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
103
|
-
if (codeBlockMatch) {
|
|
104
|
-
return codeBlockMatch[1].trim();
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
// Try to find raw JSON object
|
|
108
|
-
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
|
109
|
-
if (jsonMatch) {
|
|
110
|
-
return jsonMatch[0];
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
// Return as-is if no JSON found
|
|
114
|
-
return text;
|
|
115
|
-
}
|
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Compaction Prompt Quality Scorers - Evalite Wrappers
|
|
3
|
-
*
|
|
4
|
-
* These wrap the pure scoring functions from src/compaction-prompt-scoring.ts
|
|
5
|
-
* for use with evalite's test runner.
|
|
6
|
-
*
|
|
7
|
-
* Weighted scoring:
|
|
8
|
-
* - epicIdSpecificity (0.20) - real IDs not placeholders
|
|
9
|
-
* - actionability (0.20) - swarm_status/inbox with real values
|
|
10
|
-
* - coordinatorIdentity (0.25) - ASCII header + strong mandates
|
|
11
|
-
* - forbiddenToolsPresent (0.15) - lists forbidden tools by name
|
|
12
|
-
* - postCompactionDiscipline (0.20) - first tool correct, no edit/write
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
import { createScorer } from "evalite";
|
|
16
|
-
import type { CompactionPrompt } from "../../src/compaction-prompt-scoring.js";
|
|
17
|
-
import {
|
|
18
|
-
scoreActionability,
|
|
19
|
-
scoreCoordinatorIdentity,
|
|
20
|
-
scoreEpicIdSpecificity,
|
|
21
|
-
scoreForbiddenToolsPresent,
|
|
22
|
-
scorePostCompactionDiscipline,
|
|
23
|
-
} from "../../src/compaction-prompt-scoring.js";
|
|
24
|
-
|
|
25
|
-
// Re-export types for convenience
|
|
26
|
-
export type { CompactionPrompt, ScorerResult } from "../../src/compaction-prompt-scoring.js";
|
|
27
|
-
|
|
28
|
-
// Re-export pure functions for direct use
|
|
29
|
-
export {
|
|
30
|
-
scoreActionability,
|
|
31
|
-
scoreCoordinatorIdentity,
|
|
32
|
-
scoreEpicIdSpecificity,
|
|
33
|
-
scoreForbiddenToolsPresent,
|
|
34
|
-
scorePostCompactionDiscipline,
|
|
35
|
-
} from "../../src/compaction-prompt-scoring.js";
|
|
36
|
-
|
|
37
|
-
/**
|
|
38
|
-
* Epic ID Specificity Scorer
|
|
39
|
-
*
|
|
40
|
-
* Validates that epic IDs are REAL, not placeholders.
|
|
41
|
-
* Score: 1.0 if real IDs, 0.0 if placeholders found
|
|
42
|
-
*/
|
|
43
|
-
export const epicIdSpecificity = createScorer({
|
|
44
|
-
name: "Epic ID Specificity",
|
|
45
|
-
description: "Prompt uses real epic IDs, not placeholders",
|
|
46
|
-
scorer: ({ output }) => {
|
|
47
|
-
try {
|
|
48
|
-
const prompt = JSON.parse(String(output)) as CompactionPrompt;
|
|
49
|
-
return scoreEpicIdSpecificity(prompt);
|
|
50
|
-
} catch (error) {
|
|
51
|
-
return {
|
|
52
|
-
score: 0,
|
|
53
|
-
message: `Failed to parse prompt: ${error}`,
|
|
54
|
-
};
|
|
55
|
-
}
|
|
56
|
-
},
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
/**
|
|
60
|
-
* Actionability Scorer
|
|
61
|
-
*
|
|
62
|
-
* Validates that the prompt includes SPECIFIC actionable tool calls.
|
|
63
|
-
* Score: 1.0 if actionable tool calls with real values, 0.0 otherwise
|
|
64
|
-
*/
|
|
65
|
-
export const actionability = createScorer({
|
|
66
|
-
name: "Actionability",
|
|
67
|
-
description: "Prompt includes specific tool calls with real values",
|
|
68
|
-
scorer: ({ output }) => {
|
|
69
|
-
try {
|
|
70
|
-
const prompt = JSON.parse(String(output)) as CompactionPrompt;
|
|
71
|
-
return scoreActionability(prompt);
|
|
72
|
-
} catch (error) {
|
|
73
|
-
return {
|
|
74
|
-
score: 0,
|
|
75
|
-
message: `Failed to parse prompt: ${error}`,
|
|
76
|
-
};
|
|
77
|
-
}
|
|
78
|
-
},
|
|
79
|
-
});
|
|
80
|
-
|
|
81
|
-
/**
|
|
82
|
-
* Coordinator Identity Scorer
|
|
83
|
-
*
|
|
84
|
-
* Validates that the prompt has STRONG coordinator identity reinforcement.
|
|
85
|
-
* Score: 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
|
|
86
|
-
*/
|
|
87
|
-
export const coordinatorIdentity = createScorer({
|
|
88
|
-
name: "Coordinator Identity",
|
|
89
|
-
description: "Prompt has ASCII header and strong mandates",
|
|
90
|
-
scorer: ({ output }) => {
|
|
91
|
-
try {
|
|
92
|
-
const prompt = JSON.parse(String(output)) as CompactionPrompt;
|
|
93
|
-
return scoreCoordinatorIdentity(prompt);
|
|
94
|
-
} catch (error) {
|
|
95
|
-
return {
|
|
96
|
-
score: 0,
|
|
97
|
-
message: `Failed to parse prompt: ${error}`,
|
|
98
|
-
};
|
|
99
|
-
}
|
|
100
|
-
},
|
|
101
|
-
});
|
|
102
|
-
|
|
103
|
-
/**
|
|
104
|
-
* Forbidden Tools Present Scorer
|
|
105
|
-
*
|
|
106
|
-
* Validates that the prompt LISTS forbidden tools by name.
|
|
107
|
-
* Score: ratio of forbidden tools mentioned (0.0 to 1.0)
|
|
108
|
-
*/
|
|
109
|
-
export const forbiddenToolsPresent = createScorer({
|
|
110
|
-
name: "Forbidden Tools Present",
|
|
111
|
-
description: "Prompt lists forbidden tools by name",
|
|
112
|
-
scorer: ({ output }) => {
|
|
113
|
-
try {
|
|
114
|
-
const prompt = JSON.parse(String(output)) as CompactionPrompt;
|
|
115
|
-
return scoreForbiddenToolsPresent(prompt);
|
|
116
|
-
} catch (error) {
|
|
117
|
-
return {
|
|
118
|
-
score: 0,
|
|
119
|
-
message: `Failed to parse prompt: ${error}`,
|
|
120
|
-
};
|
|
121
|
-
}
|
|
122
|
-
},
|
|
123
|
-
});
|
|
124
|
-
|
|
125
|
-
/**
|
|
126
|
-
* Post-Compaction Discipline Scorer
|
|
127
|
-
*
|
|
128
|
-
* Validates that the FIRST suggested tool is correct.
|
|
129
|
-
* Score: 1.0 if first tool is swarm_status or inbox, 0.0 otherwise
|
|
130
|
-
*/
|
|
131
|
-
export const postCompactionDiscipline = createScorer({
|
|
132
|
-
name: "Post-Compaction Discipline",
|
|
133
|
-
description: "First suggested tool is swarm_status or inbox",
|
|
134
|
-
scorer: ({ output }) => {
|
|
135
|
-
try {
|
|
136
|
-
const prompt = JSON.parse(String(output)) as CompactionPrompt;
|
|
137
|
-
return scorePostCompactionDiscipline(prompt);
|
|
138
|
-
} catch (error) {
|
|
139
|
-
return {
|
|
140
|
-
score: 0,
|
|
141
|
-
message: `Failed to parse prompt: ${error}`,
|
|
142
|
-
};
|
|
143
|
-
}
|
|
144
|
-
},
|
|
145
|
-
});
|