opencode-swarm-plugin 0.42.0 → 0.42.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.hive/issues.jsonl +7 -5
- package/.turbo/turbo-build.log +2 -2
- package/CHANGELOG.md +18 -0
- package/bin/swarm.serve.test.ts +46 -0
- package/bin/swarm.ts +61 -0
- package/evals/scorers/coordinator-discipline.evalite-test.ts +1 -162
- package/evals/scorers/coordinator-discipline.ts +0 -70
- package/package.json +1 -1
- package/src/compaction-prompt-scorers.test.ts +175 -0
package/.hive/issues.jsonl
CHANGED
|
@@ -44,10 +44,6 @@
|
|
|
44
44
|
{"id":"opencode-swarm-plugin--ys7z8-mjlk7jspacf","title":"Audit session data quality and filtering","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T14:50:08.761Z","updated_at":"2025-12-25T14:59:53.284Z","closed_at":"2025-12-25T14:59:53.284Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlk7js9bt1","dependencies":[],"labels":[],"comments":[]}
|
|
45
45
|
{"id":"opencode-swarm-plugin--ys7z8-mjlk7jsrvls","title":"Analyze scorer implementations and scoring patterns","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T14:50:08.763Z","updated_at":"2025-12-25T14:59:54.612Z","closed_at":"2025-12-25T14:59:54.612Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlk7js9bt1","dependencies":[],"labels":[],"comments":[]}
|
|
46
46
|
{"id":"opencode-swarm-plugin--ys7z8-mjlk7jstvch","title":"Synthesize findings and propose improvements","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T14:50:08.765Z","updated_at":"2025-12-25T15:04:46.898Z","closed_at":"2025-12-25T15:04:46.898Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlk7js9bt1","dependencies":[],"labels":[],"comments":[]}
|
|
47
|
-
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","title":"P0 Eval Fixes","description":"Immediate fixes from eval audit. Target: restore eval health (0%→100%, 53%→70%), remove 250 LOC dead code.","status":"open","priority":1,"issue_type":"epic","created_at":"2025-12-25T15:42:19.671Z","updated_at":"2025-12-25T15:42:19.671Z","dependencies":[],"labels":[],"comments":[]}
|
|
48
|
-
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmlu3m","title":"Fix example.eval.ts data/task mismatch","status":"open","priority":0,"issue_type":"task","created_at":"2025-12-25T15:42:19.677Z","updated_at":"2025-12-25T15:42:19.677Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","dependencies":[],"labels":[],"comments":[]}
|
|
49
|
-
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmont1","title":"Fix compaction-prompt case-sensitive regex and missing tools","status":"open","priority":0,"issue_type":"task","created_at":"2025-12-25T15:42:19.680Z","updated_at":"2025-12-25T15:42:19.680Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","dependencies":[],"labels":[],"comments":[]}
|
|
50
|
-
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmt1kq","title":"Remove 4 unused coordinator scorers","status":"open","priority":0,"issue_type":"task","created_at":"2025-12-25T15:42:19.685Z","updated_at":"2025-12-25T15:42:19.685Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","dependencies":[],"labels":[],"comments":[]}
|
|
51
47
|
{"id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","title":"Eval System Improvements: Tool + Event Capture + Scorers","description":"Improve eval system with:\n1. Plugin tool for running evals (eval_run)\n2. Capture decomposition_complete events\n3. Capture VIOLATION events\n4. Improve compaction prompt structure\n5. Add review efficiency scorer\n6. Enforce knowledge gathering validation\n\nTarget: 70% → 85% overall eval score","status":"closed","priority":1,"issue_type":"epic","created_at":"2025-12-25T05:28:16.999Z","updated_at":"2025-12-25T16:06:41.043Z","closed_at":"2025-12-25T16:06:41.043Z","dependencies":[],"labels":[],"comments":[]}
|
|
52
48
|
{"id":"opencode-swarm-plugin--ys7z8-mjl04znlxzw","title":"Improve compaction prompt structure","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T05:28:17.025Z","updated_at":"2025-12-25T16:06:31.435Z","closed_at":"2025-12-25T16:06:31.435Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
|
|
53
49
|
{"id":"opencode-swarm-plugin--ys7z8-mjlnn93f5t1","title":"Eval-Driven Improvement Flywheel","description":"Wire up the complete eval improvement feedback loop:\n1. Connect eval-runner to eval-history (recordEvalRun)\n2. Add checkGate + learnFromEvalFailure integration\n3. Add eval:gate script for CI\n4. Inject failures into coordinator prompts\n5. Add GitHub Actions workflow\n\nGoal: Evals drive real improvement over time via automated regression detection and learning.","status":"closed","priority":1,"issue_type":"epic","created_at":"2025-12-25T16:26:20.235Z","updated_at":"2025-12-25T16:49:21.513Z","closed_at":"2025-12-25T16:49:21.513Z","dependencies":[],"labels":[],"comments":[]}
|
|
@@ -58,7 +54,6 @@
|
|
|
58
54
|
{"id":"opencode-swarm-plugin--ys7z8-mjlnn9412oc","title":"Add GitHub Actions eval workflow","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T16:26:20.257Z","updated_at":"2025-12-25T16:49:12.694Z","closed_at":"2025-12-25T16:49:12.694Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlnn93f5t1","dependencies":[],"labels":[],"comments":[]}
|
|
59
55
|
{"id":"opencode-swarm-plugin--ys7z8-mjljadmw66u","title":"Research: Swarm Coordination (decomposition, orchestration, review, worktree)","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.128Z","updated_at":"2025-12-25T16:50:12.242Z","closed_at":"2025-12-25T16:50:12.242Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
|
|
60
56
|
{"id":"opencode-swarm-plugin--ys7z8-mjljadn7knk","title":"Research: Mandates, Guardrails & Structured Output","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.139Z","updated_at":"2025-12-25T16:50:13.275Z","closed_at":"2025-12-25T16:50:13.275Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
|
|
61
|
-
{"id":"opencode-swarm-plugin--ys7z8-mjltv0ievr0","title":"Swarm O11y & Eval Insights Pipeline","description":"Comprehensive observability and eval insights for swarm coordination. Fixes data capture gaps, adds CLI commands for visibility, injects insights into prompts, and creates a real-time dashboard.\n\nGoals:\n1. All swarm events captured (decomposition, outcomes, reviews, failures)\n2. CLI commands: `swarm stats`, `swarm history` for human visibility\n3. Prompt injection: surface insights to coordinators/workers\n4. TanStack Start dashboard with real-time streaming\n\nDatabase: ~/.config/swarm-tools/swarm.db (libSQL)\nSessions: ~/.config/swarm-tools/sessions/*.jsonl\nExisting analytics: swarm-mail/src/analytics.ts","status":"open","priority":1,"issue_type":"epic","created_at":"2025-12-25T19:20:20.054Z","updated_at":"2025-12-25T19:20:20.054Z","dependencies":[],"labels":[],"comments":[]}
|
|
62
57
|
{"id":"opencode-swarm-plugin--ys7z8-mjltv0j8x4n","title":"Audit & fix data capture gaps","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-25T19:20:20.084Z","updated_at":"2025-12-25T19:30:57.891Z","closed_at":"2025-12-25T19:30:57.891Z","parent_id":"opencode-swarm-plugin--ys7z8-mjltv0ievr0","dependencies":[],"labels":[],"comments":[]}
|
|
63
58
|
{"id":"opencode-swarm-plugin--ys7z8-mjltv0jcpjx","title":"Add swarm stats CLI command","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T19:20:20.088Z","updated_at":"2025-12-25T19:30:59.853Z","closed_at":"2025-12-25T19:30:59.853Z","parent_id":"opencode-swarm-plugin--ys7z8-mjltv0ievr0","dependencies":[],"labels":[],"comments":[]}
|
|
64
59
|
{"id":"opencode-swarm-plugin--ys7z8-mjltv0jhsd6","title":"Add swarm history CLI command","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T19:20:20.093Z","updated_at":"2025-12-25T19:31:01.913Z","closed_at":"2025-12-25T19:31:01.913Z","parent_id":"opencode-swarm-plugin--ys7z8-mjltv0ievr0","dependencies":[],"labels":[],"comments":[]}
|
|
@@ -102,3 +97,10 @@
|
|
|
102
97
|
{"id":"opencode-swarm-plugin--ys7z8-mjlv8hzdnf2","title":"Events pane with live tail","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T19:58:48.841Z","updated_at":"2025-12-25T20:18:33.574Z","closed_at":"2025-12-25T20:18:33.574Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlv8hy1tzf","dependencies":[],"labels":[],"comments":[]}
|
|
103
98
|
{"id":"opencode-swarm-plugin--ys7z8-mjlv8hzi0bv","title":"Cells pane with tree view","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T19:58:48.846Z","updated_at":"2025-12-25T20:18:34.562Z","closed_at":"2025-12-25T20:18:34.562Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlv8hy1tzf","dependencies":[],"labels":[],"comments":[]}
|
|
104
99
|
{"id":"opencode-swarm-plugin--ys7z8-mjlv8hzxyee","title":"Main layout with keyboard navigation","status":"closed","priority":3,"issue_type":"task","created_at":"2025-12-25T19:58:48.861Z","updated_at":"2025-12-25T20:24:43.227Z","closed_at":"2025-12-25T20:24:43.227Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlv8hy1tzf","dependencies":[],"labels":[],"comments":[]}
|
|
100
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","title":"P0 Eval Fixes","description":"Immediate fixes from eval audit. Target: restore eval health (0%→100%, 53%→70%), remove 250 LOC dead code.","status":"closed","priority":1,"issue_type":"epic","created_at":"2025-12-25T15:42:19.671Z","updated_at":"2025-12-25T20:40:14.672Z","closed_at":"2025-12-25T20:40:14.672Z","dependencies":[],"labels":[],"comments":[]}
|
|
101
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmlu3m","title":"Fix example.eval.ts data/task mismatch","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-25T15:42:19.677Z","updated_at":"2025-12-25T20:40:00.437Z","closed_at":"2025-12-25T20:40:00.437Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","dependencies":[],"labels":[],"comments":[]}
|
|
102
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmont1","title":"Fix compaction-prompt case-sensitive regex and missing tools","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-25T15:42:19.680Z","updated_at":"2025-12-25T20:40:02.213Z","closed_at":"2025-12-25T20:40:02.213Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","dependencies":[],"labels":[],"comments":[]}
|
|
103
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjlm2nmt1kq","title":"Remove 4 unused coordinator scorers","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-25T15:42:19.685Z","updated_at":"2025-12-25T20:40:03.974Z","closed_at":"2025-12-25T20:40:03.974Z","parent_id":"opencode-swarm-plugin--ys7z8-mjlm2nmf2hw","dependencies":[],"labels":[],"comments":[]}
|
|
104
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjltv0ievr0","title":"Swarm O11y & Eval Insights Pipeline","description":"Comprehensive observability and eval insights for swarm coordination.\n\nCOMPLETED:\n1. ✅ `swarm serve` command - starts SSE server on configurable port\n2. ✅ Dashboard panes wired to real data (CellsPane, AgentsPane)\n3. ✅ Vite + React dashboard with SSE hooks\n\nREMAINING:\n- GET /cells endpoint on server (dashboard blocked on this)\n- Fix dashboard test fixtures for mock server\n- CLI commands: `swarm stats`, `swarm history`\n- Prompt injection for insights","status":"open","priority":1,"issue_type":"epic","created_at":"2025-12-25T19:20:20.054Z","updated_at":"2025-12-25T20:40:19.433Z","dependencies":[],"labels":[],"comments":[]}
|
|
105
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjlwcoh8ut2","title":"Add `swarm serve` command to start SSE server","description":"Add a `serve` subcommand to bin/swarm.ts that starts the DurableStreamServer on a configurable port (default 3001).\n\nFiles: bin/swarm.ts\n\nImplementation:\n1. Add `serve` case to the switch statement\n2. Import createDurableStreamServer from swarm-mail\n3. Start server with adapter from getSwarmMailLibSQL()\n4. Print URL to console\n5. Keep process alive\n\nExample usage: `swarm serve --port 3001`\n\nThe dashboard at localhost:5173 will connect to this SSE endpoint.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T20:30:03.500Z","updated_at":"2025-12-25T20:40:05.728Z","closed_at":"2025-12-25T20:40:05.728Z","dependencies":[],"labels":[],"comments":[]}
|
|
106
|
+
{"id":"opencode-swarm-plugin--ys7z8-mjlwcslohuv","title":"Wire dashboard panes to real swarm-mail data","description":"Connect CellsPane and AgentsPane to real data from the SSE server.\n\nFiles: \n- packages/swarm-dashboard/src/lib/api.ts\n- packages/swarm-dashboard/src/components/CellsPane.tsx\n- packages/swarm-dashboard/src/components/AgentsPane.tsx\n- packages/swarm-dashboard/src/hooks/useSwarmEvents.ts\n\nImplementation:\n1. Update api.ts to fetch cells from hive (can use REST endpoint or derive from events)\n2. Update CellsPane to use real cell data instead of mock\n3. Update AgentsPane to derive agent list from SSE events (agent_registered events)\n4. Ensure useSwarmEvents properly accumulates agent state\n\nThe SSE server runs at localhost:3001 (from `swarm serve` command).","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T20:30:08.844Z","updated_at":"2025-12-25T20:40:07.548Z","closed_at":"2025-12-25T20:40:07.548Z","dependencies":[],"labels":[],"comments":[]}
|
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
$ bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && tsc
|
|
2
|
-
Bundled 1348 modules in
|
|
2
|
+
Bundled 1348 modules in 198ms
|
|
3
3
|
|
|
4
4
|
index.js 4.33 MB (entry point)
|
|
5
5
|
|
|
6
|
-
Bundled 1349 modules in
|
|
6
|
+
Bundled 1349 modules in 192ms
|
|
7
7
|
|
|
8
8
|
plugin.js 4.30 MB (entry point)
|
|
9
9
|
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# opencode-swarm-plugin
|
|
2
2
|
|
|
3
|
+
## 0.42.2
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- [`9ded2a0`](https://github.com/joelhooks/swarm-tools/commit/9ded2a0929f430a3297e3b62858aa1143179542f) Thanks [@joelhooks](https://github.com/joelhooks)! - ## Tweet Bot Learns to Speak Swarm
|
|
8
|
+
|
|
9
|
+
Release tweets now use a manyshot prompt with examples that match the project's voice: terse, technical, slightly cheeky. Focus on what devs can DO, not what we shipped.
|
|
10
|
+
|
|
11
|
+
## 0.42.1
|
|
12
|
+
|
|
13
|
+
### Patch Changes
|
|
14
|
+
|
|
15
|
+
- [`f6707d5`](https://github.com/joelhooks/swarm-tools/commit/f6707d53eb92021b6976212e903994c98c798483) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🐦 @swarmtoolsai Now Tweets Releases
|
|
16
|
+
|
|
17
|
+
Automated release announcements are live! When packages publish to npm, Claude summarizes the changelog into a tweet and posts from @swarmtoolsai.
|
|
18
|
+
|
|
19
|
+
No more manual "hey we shipped" posts - the bees handle it now.
|
|
20
|
+
|
|
3
21
|
## 0.42.0
|
|
4
22
|
|
|
5
23
|
### Minor Changes
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for `swarm serve` command
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, test, expect } from "bun:test";
|
|
6
|
+
import { spawn } from "bun";
|
|
7
|
+
|
|
8
|
+
describe("swarm serve command", () => {
|
|
9
|
+
test("serve command accepts custom port via --port flag", () => {
|
|
10
|
+
// Verify that CLI parsing works for custom port
|
|
11
|
+
const args = ["serve", "--port", "8080"];
|
|
12
|
+
const port = args.includes("--port")
|
|
13
|
+
? Number.parseInt(args[args.indexOf("--port") + 1])
|
|
14
|
+
: 3001;
|
|
15
|
+
|
|
16
|
+
expect(port).toBe(8080);
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
test("serve command defaults to port 3001", () => {
|
|
20
|
+
const args = ["serve"];
|
|
21
|
+
const port = args.includes("--port")
|
|
22
|
+
? Number.parseInt(args[args.indexOf("--port") + 1])
|
|
23
|
+
: 3001;
|
|
24
|
+
|
|
25
|
+
expect(port).toBe(3001);
|
|
26
|
+
});
|
|
27
|
+
|
|
28
|
+
test("serve command uses project path from CWD", () => {
|
|
29
|
+
const projectPath = process.cwd();
|
|
30
|
+
expect(projectPath).toBeDefined();
|
|
31
|
+
expect(typeof projectPath).toBe("string");
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
test("serve command appears in help text", async () => {
|
|
35
|
+
const proc = spawn(["bun", "run", "bin/swarm.ts", "help"], {
|
|
36
|
+
stdout: "pipe",
|
|
37
|
+
stderr: "pipe",
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
const output = await new Response(proc.stdout).text();
|
|
41
|
+
|
|
42
|
+
expect(output).toContain("swarm serve");
|
|
43
|
+
expect(output).toContain("Start SSE server");
|
|
44
|
+
expect(output).toContain("--port");
|
|
45
|
+
});
|
|
46
|
+
});
|
package/bin/swarm.ts
CHANGED
|
@@ -2518,6 +2518,8 @@ ${cyan("Commands:")}
|
|
|
2518
2518
|
swarm config Show paths to generated config files
|
|
2519
2519
|
swarm agents Update AGENTS.md with skill awareness
|
|
2520
2520
|
swarm migrate Migrate PGlite database to libSQL
|
|
2521
|
+
swarm serve Start SSE server for real-time event streaming
|
|
2522
|
+
--port <n> Port to listen on (default: 3001)
|
|
2521
2523
|
swarm cells List or get cells from database (replaces 'swarm tool hive_query')
|
|
2522
2524
|
swarm log View swarm logs with filtering
|
|
2523
2525
|
swarm stats Show swarm health metrics and success rates
|
|
@@ -4488,6 +4490,62 @@ async function evalRun() {
|
|
|
4488
4490
|
}
|
|
4489
4491
|
}
|
|
4490
4492
|
|
|
4493
|
+
// ============================================================================
|
|
4494
|
+
// Serve Command - Start SSE Server
|
|
4495
|
+
// ============================================================================
|
|
4496
|
+
|
|
4497
|
+
async function serve() {
|
|
4498
|
+
p.intro("swarm serve v" + VERSION);
|
|
4499
|
+
|
|
4500
|
+
// Parse --port flag (default 3001)
|
|
4501
|
+
const portFlagIndex = process.argv.indexOf("--port");
|
|
4502
|
+
const port = portFlagIndex !== -1
|
|
4503
|
+
? Number.parseInt(process.argv[portFlagIndex + 1]) || 3001
|
|
4504
|
+
: 3001;
|
|
4505
|
+
|
|
4506
|
+
const projectPath = process.cwd();
|
|
4507
|
+
|
|
4508
|
+
p.log.step("Starting DurableStreamServer...");
|
|
4509
|
+
p.log.message(dim(` Project: ${projectPath}`));
|
|
4510
|
+
p.log.message(dim(` Port: ${port}`));
|
|
4511
|
+
|
|
4512
|
+
try {
|
|
4513
|
+
// Import dependencies
|
|
4514
|
+
const { getSwarmMailLibSQL } = await import("swarm-mail");
|
|
4515
|
+
const { createDurableStreamAdapter, createDurableStreamServer } = await import("swarm-mail");
|
|
4516
|
+
|
|
4517
|
+
// Get swarm-mail adapter
|
|
4518
|
+
const swarmMail = await getSwarmMailLibSQL(projectPath);
|
|
4519
|
+
|
|
4520
|
+
// Create stream adapter
|
|
4521
|
+
const streamAdapter = createDurableStreamAdapter(swarmMail, projectPath);
|
|
4522
|
+
|
|
4523
|
+
// Create and start server
|
|
4524
|
+
const server = createDurableStreamServer({
|
|
4525
|
+
adapter: streamAdapter,
|
|
4526
|
+
port,
|
|
4527
|
+
projectKey: projectPath,
|
|
4528
|
+
});
|
|
4529
|
+
|
|
4530
|
+
await server.start();
|
|
4531
|
+
|
|
4532
|
+
p.log.success("Server started!");
|
|
4533
|
+
p.log.message("");
|
|
4534
|
+
p.log.message(cyan(" Dashboard: http://localhost:5173"));
|
|
4535
|
+
p.log.message(cyan(` SSE Endpoint: ${server.url}/streams/${encodeURIComponent(projectPath)}`));
|
|
4536
|
+
p.log.message("");
|
|
4537
|
+
p.log.message(dim(" Press Ctrl+C to stop"));
|
|
4538
|
+
|
|
4539
|
+
// Keep process alive
|
|
4540
|
+
await new Promise(() => {});
|
|
4541
|
+
} catch (error) {
|
|
4542
|
+
p.log.error("Failed to start server");
|
|
4543
|
+
p.log.message(error instanceof Error ? error.message : String(error));
|
|
4544
|
+
p.outro("Aborted");
|
|
4545
|
+
process.exit(1);
|
|
4546
|
+
}
|
|
4547
|
+
}
|
|
4548
|
+
|
|
4491
4549
|
// ============================================================================
|
|
4492
4550
|
// Main
|
|
4493
4551
|
// ============================================================================
|
|
@@ -4510,6 +4568,9 @@ switch (command) {
|
|
|
4510
4568
|
case "config":
|
|
4511
4569
|
config();
|
|
4512
4570
|
break;
|
|
4571
|
+
case "serve":
|
|
4572
|
+
await serve();
|
|
4573
|
+
break;
|
|
4513
4574
|
case "update":
|
|
4514
4575
|
await update();
|
|
4515
4576
|
break;
|
|
@@ -5,7 +5,7 @@ import { describe, expect, it } from "bun:test";
|
|
|
5
5
|
import type { CoordinatorSession } from "../../src/eval-capture.js";
|
|
6
6
|
import {
|
|
7
7
|
overallDiscipline,
|
|
8
|
-
|
|
8
|
+
|
|
9
9
|
reviewThoroughness,
|
|
10
10
|
spawnEfficiency,
|
|
11
11
|
timeToFirstSpawn,
|
|
@@ -537,164 +537,3 @@ describe("overallDiscipline", () => {
|
|
|
537
537
|
});
|
|
538
538
|
});
|
|
539
539
|
|
|
540
|
-
describe("reviewEfficiency", () => {
|
|
541
|
-
it("scores 1.0 for ideal 1:1 ratio (one review per spawn)", async () => {
|
|
542
|
-
const session: CoordinatorSession = {
|
|
543
|
-
session_id: "test-session",
|
|
544
|
-
epic_id: "test-epic",
|
|
545
|
-
start_time: "2025-01-01T00:00:00Z",
|
|
546
|
-
events: [
|
|
547
|
-
{
|
|
548
|
-
session_id: "test-session",
|
|
549
|
-
epic_id: "test-epic",
|
|
550
|
-
timestamp: "2025-01-01T00:00:10Z",
|
|
551
|
-
event_type: "DECISION",
|
|
552
|
-
decision_type: "worker_spawned",
|
|
553
|
-
payload: { bead_id: "bd-1" },
|
|
554
|
-
},
|
|
555
|
-
{
|
|
556
|
-
session_id: "test-session",
|
|
557
|
-
epic_id: "test-epic",
|
|
558
|
-
timestamp: "2025-01-01T00:00:20Z",
|
|
559
|
-
event_type: "DECISION",
|
|
560
|
-
decision_type: "worker_spawned",
|
|
561
|
-
payload: { bead_id: "bd-2" },
|
|
562
|
-
},
|
|
563
|
-
{
|
|
564
|
-
session_id: "test-session",
|
|
565
|
-
epic_id: "test-epic",
|
|
566
|
-
timestamp: "2025-01-01T00:10:00Z",
|
|
567
|
-
event_type: "DECISION",
|
|
568
|
-
decision_type: "review_completed",
|
|
569
|
-
payload: { bead_id: "bd-1" },
|
|
570
|
-
},
|
|
571
|
-
{
|
|
572
|
-
session_id: "test-session",
|
|
573
|
-
epic_id: "test-epic",
|
|
574
|
-
timestamp: "2025-01-01T00:10:10Z",
|
|
575
|
-
event_type: "DECISION",
|
|
576
|
-
decision_type: "review_completed",
|
|
577
|
-
payload: { bead_id: "bd-2" },
|
|
578
|
-
},
|
|
579
|
-
],
|
|
580
|
-
};
|
|
581
|
-
|
|
582
|
-
const result = await reviewEfficiency({
|
|
583
|
-
output: JSON.stringify(session),
|
|
584
|
-
expected: {},
|
|
585
|
-
input: undefined,
|
|
586
|
-
});
|
|
587
|
-
|
|
588
|
-
expect(result.score).toBe(1.0);
|
|
589
|
-
expect(result.message).toContain("2 reviews / 2 spawns");
|
|
590
|
-
});
|
|
591
|
-
|
|
592
|
-
it("penalizes over-reviewing (>2:1 ratio)", async () => {
|
|
593
|
-
// 6 reviews for 2 spawns = 3:1 ratio (over-reviewing)
|
|
594
|
-
const session: CoordinatorSession = {
|
|
595
|
-
session_id: "test-session",
|
|
596
|
-
epic_id: "test-epic",
|
|
597
|
-
start_time: "2025-01-01T00:00:00Z",
|
|
598
|
-
events: [
|
|
599
|
-
{
|
|
600
|
-
session_id: "test-session",
|
|
601
|
-
epic_id: "test-epic",
|
|
602
|
-
timestamp: "2025-01-01T00:00:10Z",
|
|
603
|
-
event_type: "DECISION",
|
|
604
|
-
decision_type: "worker_spawned",
|
|
605
|
-
payload: { bead_id: "bd-1" },
|
|
606
|
-
},
|
|
607
|
-
{
|
|
608
|
-
session_id: "test-session",
|
|
609
|
-
epic_id: "test-epic",
|
|
610
|
-
timestamp: "2025-01-01T00:00:20Z",
|
|
611
|
-
event_type: "DECISION",
|
|
612
|
-
decision_type: "worker_spawned",
|
|
613
|
-
payload: { bead_id: "bd-2" },
|
|
614
|
-
},
|
|
615
|
-
...Array.from({ length: 6 }, (_, i) => ({
|
|
616
|
-
session_id: "test-session",
|
|
617
|
-
epic_id: "test-epic",
|
|
618
|
-
timestamp: `2025-01-01T00:10:${String(i * 10).padStart(2, "0")}Z`,
|
|
619
|
-
event_type: "DECISION" as const,
|
|
620
|
-
decision_type: "review_completed" as const,
|
|
621
|
-
payload: { bead_id: `bd-${(i % 2) + 1}` },
|
|
622
|
-
})),
|
|
623
|
-
],
|
|
624
|
-
};
|
|
625
|
-
|
|
626
|
-
const result = await reviewEfficiency({
|
|
627
|
-
output: JSON.stringify(session),
|
|
628
|
-
expected: {},
|
|
629
|
-
input: undefined,
|
|
630
|
-
});
|
|
631
|
-
|
|
632
|
-
// 3:1 ratio should be penalized (score < 0.5)
|
|
633
|
-
expect(result.score).toBeLessThan(0.5);
|
|
634
|
-
expect(result.message).toContain("6 reviews / 2 spawns");
|
|
635
|
-
});
|
|
636
|
-
|
|
637
|
-
it("handles no spawns gracefully", async () => {
|
|
638
|
-
const session: CoordinatorSession = {
|
|
639
|
-
session_id: "test-session",
|
|
640
|
-
epic_id: "test-epic",
|
|
641
|
-
start_time: "2025-01-01T00:00:00Z",
|
|
642
|
-
events: [
|
|
643
|
-
{
|
|
644
|
-
session_id: "test-session",
|
|
645
|
-
epic_id: "test-epic",
|
|
646
|
-
timestamp: "2025-01-01T00:00:00Z",
|
|
647
|
-
event_type: "DECISION",
|
|
648
|
-
decision_type: "strategy_selected",
|
|
649
|
-
payload: { strategy: "file-based" },
|
|
650
|
-
},
|
|
651
|
-
],
|
|
652
|
-
};
|
|
653
|
-
|
|
654
|
-
const result = await reviewEfficiency({
|
|
655
|
-
output: JSON.stringify(session),
|
|
656
|
-
expected: {},
|
|
657
|
-
input: undefined,
|
|
658
|
-
});
|
|
659
|
-
|
|
660
|
-
expect(result.score).toBe(1.0);
|
|
661
|
-
expect(result.message).toContain("No workers spawned");
|
|
662
|
-
});
|
|
663
|
-
|
|
664
|
-
it("handles no reviews gracefully (0:N ratio)", async () => {
|
|
665
|
-
const session: CoordinatorSession = {
|
|
666
|
-
session_id: "test-session",
|
|
667
|
-
epic_id: "test-epic",
|
|
668
|
-
start_time: "2025-01-01T00:00:00Z",
|
|
669
|
-
events: [
|
|
670
|
-
{
|
|
671
|
-
session_id: "test-session",
|
|
672
|
-
epic_id: "test-epic",
|
|
673
|
-
timestamp: "2025-01-01T00:00:10Z",
|
|
674
|
-
event_type: "DECISION",
|
|
675
|
-
decision_type: "worker_spawned",
|
|
676
|
-
payload: { bead_id: "bd-1" },
|
|
677
|
-
},
|
|
678
|
-
{
|
|
679
|
-
session_id: "test-session",
|
|
680
|
-
epic_id: "test-epic",
|
|
681
|
-
timestamp: "2025-01-01T00:00:20Z",
|
|
682
|
-
event_type: "DECISION",
|
|
683
|
-
decision_type: "worker_spawned",
|
|
684
|
-
payload: { bead_id: "bd-2" },
|
|
685
|
-
},
|
|
686
|
-
],
|
|
687
|
-
};
|
|
688
|
-
|
|
689
|
-
const result = await reviewEfficiency({
|
|
690
|
-
output: JSON.stringify(session),
|
|
691
|
-
expected: {},
|
|
692
|
-
input: undefined,
|
|
693
|
-
});
|
|
694
|
-
|
|
695
|
-
// No reviews is bad (should use reviewThoroughness for this)
|
|
696
|
-
// But this scorer focuses on over-reviewing, so no reviews = 1.0 (not over-reviewing)
|
|
697
|
-
expect(result.score).toBe(1.0);
|
|
698
|
-
expect(result.message).toContain("0 reviews / 2 spawns");
|
|
699
|
-
});
|
|
700
|
-
});
|
|
@@ -132,76 +132,6 @@ export const spawnEfficiency = createScorer({
|
|
|
132
132
|
},
|
|
133
133
|
});
|
|
134
134
|
|
|
135
|
-
/**
|
|
136
|
-
* Review Efficiency Scorer
|
|
137
|
-
*
|
|
138
|
-
* Measures review-to-spawn ratio to detect over-reviewing.
|
|
139
|
-
* Ideal ratio is 1:1 (one review per spawned worker).
|
|
140
|
-
* Penalizes >2:1 ratio (over-reviewing wastes context).
|
|
141
|
-
*
|
|
142
|
-
* Scoring:
|
|
143
|
-
* - 0:N or 1:1 ratio = 1.0 (perfect)
|
|
144
|
-
* - 2:1 ratio = 0.5 (threshold)
|
|
145
|
-
* - >2:1 ratio = linear penalty toward 0.0
|
|
146
|
-
*
|
|
147
|
-
* Score: normalized to 0-1 (lower ratio is better)
|
|
148
|
-
*/
|
|
149
|
-
export const reviewEfficiency = createScorer({
|
|
150
|
-
name: "Review Efficiency",
|
|
151
|
-
description: "Review-to-spawn ratio (penalize over-reviewing >2:1)",
|
|
152
|
-
scorer: ({ output }) => {
|
|
153
|
-
try {
|
|
154
|
-
const session = JSON.parse(String(output)) as CoordinatorSession;
|
|
155
|
-
|
|
156
|
-
// Count worker_spawned events
|
|
157
|
-
const spawned = session.events.filter(
|
|
158
|
-
(e) =>
|
|
159
|
-
e.event_type === "DECISION" && e.decision_type === "worker_spawned"
|
|
160
|
-
).length;
|
|
161
|
-
|
|
162
|
-
if (spawned === 0) {
|
|
163
|
-
return {
|
|
164
|
-
score: 1.0,
|
|
165
|
-
message: "No workers spawned",
|
|
166
|
-
};
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
// Count review_completed events
|
|
170
|
-
const reviewed = session.events.filter(
|
|
171
|
-
(e) =>
|
|
172
|
-
e.event_type === "DECISION" && e.decision_type === "review_completed"
|
|
173
|
-
).length;
|
|
174
|
-
|
|
175
|
-
const ratio = reviewed / spawned;
|
|
176
|
-
|
|
177
|
-
// Scoring:
|
|
178
|
-
// - ratio <= 1.0: perfect (1.0)
|
|
179
|
-
// - ratio <= 2.0: linear decay from 1.0 to 0.5
|
|
180
|
-
// - ratio > 2.0: linear penalty from 0.5 toward 0.0
|
|
181
|
-
let score: number;
|
|
182
|
-
if (ratio <= 1.0) {
|
|
183
|
-
score = 1.0;
|
|
184
|
-
} else if (ratio <= 2.0) {
|
|
185
|
-
// Linear decay: 1.0 at ratio=1.0, 0.5 at ratio=2.0
|
|
186
|
-
score = 1.0 - (ratio - 1.0) * 0.5;
|
|
187
|
-
} else {
|
|
188
|
-
// Penalty for extreme over-reviewing: 0.5 at ratio=2.0, 0.0 at ratio=4.0
|
|
189
|
-
score = Math.max(0, 0.5 - (ratio - 2.0) * 0.25);
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
return {
|
|
193
|
-
score,
|
|
194
|
-
message: `${reviewed} reviews / ${spawned} spawns (${ratio.toFixed(1)}:1 ratio)`,
|
|
195
|
-
};
|
|
196
|
-
} catch (error) {
|
|
197
|
-
return {
|
|
198
|
-
score: 0,
|
|
199
|
-
message: `Failed to parse CoordinatorSession: ${error}`,
|
|
200
|
-
};
|
|
201
|
-
}
|
|
202
|
-
},
|
|
203
|
-
});
|
|
204
|
-
|
|
205
135
|
/**
|
|
206
136
|
* Review Thoroughness Scorer
|
|
207
137
|
*
|
package/package.json
CHANGED
|
@@ -3,6 +3,15 @@
|
|
|
3
3
|
*
|
|
4
4
|
* TDD approach - tests written FIRST to define scorer behavior
|
|
5
5
|
* Tests the PURE scoring functions (not evalite wrappers)
|
|
6
|
+
*
|
|
7
|
+
* **Case-Sensitivity Verification**:
|
|
8
|
+
* All tool name regexes MUST be case-insensitive (/i flag) because:
|
|
9
|
+
* - LLMs generate inconsistent casing (Edit vs edit, Read vs read)
|
|
10
|
+
* - Fixtures contain mixed case examples
|
|
11
|
+
* - Scoring must be robust to case variations
|
|
12
|
+
*
|
|
13
|
+
* Fixed in commit adding /i flags to Edit, Write, bash patterns.
|
|
14
|
+
* Tests added to prevent regression.
|
|
6
15
|
*/
|
|
7
16
|
|
|
8
17
|
import { describe, expect, test } from "bun:test";
|
|
@@ -15,6 +24,109 @@ import {
|
|
|
15
24
|
scorePostCompactionDiscipline,
|
|
16
25
|
} from "./compaction-prompt-scoring.js";
|
|
17
26
|
|
|
27
|
+
describe("Case-Insensitive Tool Detection (Regression Prevention)", () => {
|
|
28
|
+
test("all scorers handle mixed-case tool names correctly", () => {
|
|
29
|
+
// Real-world example with mixed casing from LLM output
|
|
30
|
+
const prompt: CompactionPrompt = {
|
|
31
|
+
content: `┌─────────────────────────────────────────┐
|
|
32
|
+
│ YOU ARE THE COORDINATOR │
|
|
33
|
+
└─────────────────────────────────────────┘
|
|
34
|
+
|
|
35
|
+
You are coordinating epic mjkw81rkq4c.
|
|
36
|
+
|
|
37
|
+
## IMMEDIATE ACTIONS
|
|
38
|
+
|
|
39
|
+
1. swarm_status(epic_id='mjkw81rkq4c', project_key='/path')
|
|
40
|
+
2. swarmmail_inbox()
|
|
41
|
+
|
|
42
|
+
## FORBIDDEN TOOLS
|
|
43
|
+
|
|
44
|
+
NEVER use these tools - delegate to workers:
|
|
45
|
+
- edit (file modifications)
|
|
46
|
+
- write (file creation)
|
|
47
|
+
- BASH (shell commands for file mods)
|
|
48
|
+
- swarmmail_reserve (only workers)
|
|
49
|
+
- git commit (workers handle)
|
|
50
|
+
|
|
51
|
+
ALWAYS spawn workers for code changes.`,
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
// Epic ID detection should work
|
|
55
|
+
const epicResult = scoreEpicIdSpecificity(prompt);
|
|
56
|
+
expect(epicResult.score).toBe(1.0);
|
|
57
|
+
|
|
58
|
+
// Actionability should detect swarm_status
|
|
59
|
+
const actionResult = scoreActionability(prompt);
|
|
60
|
+
expect(actionResult.score).toBe(1.0);
|
|
61
|
+
|
|
62
|
+
// Coordinator identity should detect ASCII + NEVER/ALWAYS
|
|
63
|
+
const identityResult = scoreCoordinatorIdentity(prompt);
|
|
64
|
+
expect(identityResult.score).toBe(1.0);
|
|
65
|
+
|
|
66
|
+
// Forbidden tools should detect all 5 despite mixed case
|
|
67
|
+
const forbiddenResult = scoreForbiddenToolsPresent(prompt);
|
|
68
|
+
expect(forbiddenResult.score).toBe(1.0);
|
|
69
|
+
expect(forbiddenResult.message).toContain("All 5");
|
|
70
|
+
|
|
71
|
+
// Post-compaction discipline should detect swarm_status as first tool
|
|
72
|
+
const disciplineResult = scorePostCompactionDiscipline(prompt);
|
|
73
|
+
expect(disciplineResult.score).toBe(1.0);
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
test("forbidden tools scorer detects lowercase tool names", () => {
|
|
77
|
+
// Previously failed before /i flags were added
|
|
78
|
+
const prompt: CompactionPrompt = {
|
|
79
|
+
content: `Don't use: edit, write, bash, swarmmail_reserve, git commit`,
|
|
80
|
+
};
|
|
81
|
+
|
|
82
|
+
const result = scoreForbiddenToolsPresent(prompt);
|
|
83
|
+
|
|
84
|
+
// Should detect all 5 tools regardless of case
|
|
85
|
+
expect(result.score).toBe(1.0);
|
|
86
|
+
expect(result.message).toContain("All 5");
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
test("forbidden tools scorer detects UPPERCASE tool names", () => {
|
|
90
|
+
const prompt: CompactionPrompt = {
|
|
91
|
+
content: `Forbidden: EDIT, WRITE, BASH, swarmmail_reserve, git commit`,
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
const result = scoreForbiddenToolsPresent(prompt);
|
|
95
|
+
|
|
96
|
+
expect(result.score).toBe(1.0);
|
|
97
|
+
expect(result.message).toContain("All 5");
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
test("post-compaction discipline detects mixed-case first tools", () => {
|
|
101
|
+
const testCases = [
|
|
102
|
+
{ tool: "EDIT", shouldPass: false },
|
|
103
|
+
{ tool: "edit", shouldPass: false },
|
|
104
|
+
{ tool: "Edit", shouldPass: false },
|
|
105
|
+
{ tool: "WRITE", shouldPass: false },
|
|
106
|
+
{ tool: "write", shouldPass: false },
|
|
107
|
+
{ tool: "READ", shouldPass: false },
|
|
108
|
+
{ tool: "read", shouldPass: false },
|
|
109
|
+
{ tool: "swarm_status", shouldPass: true },
|
|
110
|
+
{ tool: "SWARM_STATUS", shouldPass: true },
|
|
111
|
+
{ tool: "swarmmail_inbox", shouldPass: true },
|
|
112
|
+
];
|
|
113
|
+
|
|
114
|
+
for (const { tool, shouldPass } of testCases) {
|
|
115
|
+
const prompt: CompactionPrompt = {
|
|
116
|
+
content: `1. ${tool}()`,
|
|
117
|
+
};
|
|
118
|
+
|
|
119
|
+
const result = scorePostCompactionDiscipline(prompt);
|
|
120
|
+
|
|
121
|
+
if (shouldPass) {
|
|
122
|
+
expect(result.score).toBe(1.0);
|
|
123
|
+
} else {
|
|
124
|
+
expect(result.score).toBe(0.0);
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
});
|
|
128
|
+
});
|
|
129
|
+
|
|
18
130
|
describe("epicIdSpecificity scorer", () => {
|
|
19
131
|
test("scores 1.0 for real epic IDs", () => {
|
|
20
132
|
const prompt: CompactionPrompt = {
|
|
@@ -218,6 +330,33 @@ describe("forbiddenToolsPresent scorer", () => {
|
|
|
218
330
|
expect(result.score).toBe(0.0);
|
|
219
331
|
expect(result.message).toContain("0/5");
|
|
220
332
|
});
|
|
333
|
+
|
|
334
|
+
test("scores 1.0 with lowercase forbidden tools (case-insensitive)", () => {
|
|
335
|
+
const prompt: CompactionPrompt = {
|
|
336
|
+
content: `🚫 FORBIDDEN TOOLS - NEVER call these:
|
|
337
|
+
- edit (use swarm_spawn_subtask)
|
|
338
|
+
- write (use swarm_spawn_subtask)
|
|
339
|
+
- swarmmail_reserve (only workers reserve)
|
|
340
|
+
- git commit (workers commit)
|
|
341
|
+
- bash (for file modifications)`,
|
|
342
|
+
};
|
|
343
|
+
|
|
344
|
+
const result = scoreForbiddenToolsPresent(prompt);
|
|
345
|
+
|
|
346
|
+
expect(result.score).toBe(1.0);
|
|
347
|
+
expect(result.message).toContain("All 5 forbidden tools");
|
|
348
|
+
});
|
|
349
|
+
|
|
350
|
+
test("scores correctly with mixed case forbidden tools", () => {
|
|
351
|
+
const prompt: CompactionPrompt = {
|
|
352
|
+
content: `Avoid: edit, Write, BASH`,
|
|
353
|
+
};
|
|
354
|
+
|
|
355
|
+
const result = scoreForbiddenToolsPresent(prompt);
|
|
356
|
+
|
|
357
|
+
expect(result.score).toBe(0.6);
|
|
358
|
+
expect(result.message).toContain("3/5");
|
|
359
|
+
});
|
|
221
360
|
});
|
|
222
361
|
|
|
223
362
|
describe("postCompactionDiscipline scorer", () => {
|
|
@@ -297,4 +436,40 @@ describe("postCompactionDiscipline scorer", () => {
|
|
|
297
436
|
expect(result.score).toBe(0.0);
|
|
298
437
|
expect(result.message).toContain("No tool");
|
|
299
438
|
});
|
|
439
|
+
|
|
440
|
+
test("scores 0.0 when first tool is lowercase 'read' (case-insensitive)", () => {
|
|
441
|
+
const prompt: CompactionPrompt = {
|
|
442
|
+
content: `1. read(file='src/index.ts')
|
|
443
|
+
2. swarm_status()`,
|
|
444
|
+
};
|
|
445
|
+
|
|
446
|
+
const result = scorePostCompactionDiscipline(prompt);
|
|
447
|
+
|
|
448
|
+
expect(result.score).toBe(0.0);
|
|
449
|
+
expect(result.message).toContain("read");
|
|
450
|
+
});
|
|
451
|
+
|
|
452
|
+
test("scores 0.0 when first tool is lowercase 'edit'", () => {
|
|
453
|
+
const prompt: CompactionPrompt = {
|
|
454
|
+
content: `1. edit(file='src/auth.ts', ...)
|
|
455
|
+
2. swarm_status()`,
|
|
456
|
+
};
|
|
457
|
+
|
|
458
|
+
const result = scorePostCompactionDiscipline(prompt);
|
|
459
|
+
|
|
460
|
+
expect(result.score).toBe(0.0);
|
|
461
|
+
expect(result.message).toContain("edit");
|
|
462
|
+
});
|
|
463
|
+
|
|
464
|
+
test("scores 0.0 when first tool is lowercase 'write'", () => {
|
|
465
|
+
const prompt: CompactionPrompt = {
|
|
466
|
+
content: `1. write(file='README.md', content='...')
|
|
467
|
+
2. swarm_status()`,
|
|
468
|
+
};
|
|
469
|
+
|
|
470
|
+
const result = scorePostCompactionDiscipline(prompt);
|
|
471
|
+
|
|
472
|
+
expect(result.score).toBe(0.0);
|
|
473
|
+
expect(result.message).toContain("write");
|
|
474
|
+
});
|
|
300
475
|
});
|