nodebench-mcp 1.4.1 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/NODEBENCH_AGENTS.md +154 -2
  2. package/README.md +214 -215
  3. package/dist/__tests__/comparativeBench.test.d.ts +1 -0
  4. package/dist/__tests__/comparativeBench.test.js +722 -0
  5. package/dist/__tests__/comparativeBench.test.js.map +1 -0
  6. package/dist/__tests__/evalHarness.test.js +24 -2
  7. package/dist/__tests__/evalHarness.test.js.map +1 -1
  8. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +14 -0
  9. package/dist/__tests__/gaiaCapabilityEval.test.js +420 -0
  10. package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -0
  11. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +15 -0
  12. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +303 -0
  13. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -0
  14. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +7 -0
  15. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +279 -0
  16. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +1 -0
  17. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +10 -0
  18. package/dist/__tests__/openDatasetPerfComparison.test.js +318 -0
  19. package/dist/__tests__/openDatasetPerfComparison.test.js.map +1 -0
  20. package/dist/__tests__/tools.test.js +155 -7
  21. package/dist/__tests__/tools.test.js.map +1 -1
  22. package/dist/__tests__/toolsetGatingEval.test.d.ts +1 -0
  23. package/dist/__tests__/toolsetGatingEval.test.js +1031 -0
  24. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -0
  25. package/dist/db.js +56 -0
  26. package/dist/db.js.map +1 -1
  27. package/dist/index.js +462 -28
  28. package/dist/index.js.map +1 -1
  29. package/dist/tools/localFileTools.d.ts +15 -0
  30. package/dist/tools/localFileTools.js +386 -0
  31. package/dist/tools/localFileTools.js.map +1 -0
  32. package/dist/tools/metaTools.js +170 -3
  33. package/dist/tools/metaTools.js.map +1 -1
  34. package/dist/tools/parallelAgentTools.d.ts +18 -0
  35. package/dist/tools/parallelAgentTools.js +1272 -0
  36. package/dist/tools/parallelAgentTools.js.map +1 -0
  37. package/dist/tools/selfEvalTools.js +240 -10
  38. package/dist/tools/selfEvalTools.js.map +1 -1
  39. package/dist/tools/webTools.js +171 -37
  40. package/dist/tools/webTools.js.map +1 -1
  41. package/package.json +26 -8
@@ -0,0 +1,1272 @@
1
+ /**
2
+ * Parallel Agent Coordination Tools
3
+ *
4
+ * Inspired by Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
5
+ * Implements task locking, role specialization, context budget management,
6
+ * and oracle-based testing patterns for multi-agent development workflows.
7
+ *
8
+ * Key patterns from the blog post:
9
+ * - Task locking: Prevent two agents from solving the same problem simultaneously
10
+ * - Agent roles: Specialization (implementer, dedup, perf, docs, critic)
11
+ * - Context window management: Prevent pollution, track budget, pre-compute summaries
12
+ * - Oracle testing: Compare against known-good reference outputs
13
+ * - Progress tracking: Maintain running docs of status for fresh agent sessions
14
+ *
15
+ * Reference: https://www.anthropic.com/engineering/building-c-compiler
16
+ */
17
+ import { getDb, genId } from "../db.js";
18
/**
 * Built-in agent roles, keyed by role name.
 *
 * Each entry has a short human-readable `description` and the behavioral
 * `instructions` text for an agent acting in that role. Within this file the
 * table is read by the `assign_agent_role` tool (instructions are used when no
 * custom instructions are supplied) and by `get_agent_role` (to list the
 * available roles with their descriptions).
 */
const PREDEFINED_ROLES = {
    implementer: {
        description: "Primary feature implementer. Picks failing tests and implements fixes.",
        instructions: "Focus on making failing tests pass. Pick the next most obvious failing test, fix it, run tests, commit. Avoid refactoring unrelated code. Update progress notes after each commit.",
    },
    dedup_reviewer: {
        description: "Code deduplication specialist. Finds and coalesces duplicate implementations.",
        instructions: "Search for duplicated logic across the codebase. Coalesce into shared utilities. Do NOT change external behavior. Run all tests after each consolidation. Log each dedup as a learning.",
    },
    performance_optimizer: {
        description: "Performance specialist. Profiles and optimizes hot paths.",
        instructions: "Profile the system for bottlenecks. Optimize hot paths without changing correctness. Benchmark before and after. Use oracle comparisons to verify output hasn't changed. Document optimizations as learnings.",
    },
    documentation_maintainer: {
        description: "Documentation specialist. Keeps READMEs, progress files, and docs in sync.",
        instructions: "Review all documentation for accuracy against current code. Update READMEs, progress files, and inline docs. Ensure new agents can orient themselves quickly. Use decide_re_update before creating new files.",
    },
    code_quality_critic: {
        description: "Code quality reviewer. Structural improvements and pattern enforcement.",
        instructions: "Review code from the perspective of an expert developer. Identify structural issues, anti-patterns, and opportunities for improvement. Make changes that improve maintainability without breaking tests. Log patterns discovered as learnings.",
    },
    test_writer: {
        description: "Test specialist. Writes and improves test coverage.",
        instructions: "Identify untested code paths. Write targeted tests for edge cases and failure modes. Ensure tests are deterministic and fast. Use oracle comparisons for complex output validation. Log test patterns as learnings.",
    },
    security_auditor: {
        description: "Security specialist. Finds and fixes vulnerabilities.",
        instructions: "Audit code for security vulnerabilities: injection, auth bypass, data exposure, unsafe defaults. Log each finding as a CRITICAL or HIGH gap. Fix vulnerabilities and verify with targeted tests.",
    },
};
48
+ // ============================================================================
49
+ // Portable AGENTS.md Generator
50
+ // ============================================================================
51
/**
 * Build the portable "Parallel Agent Coordination Protocol" markdown section
 * intended to be pasted into a project's AGENTS.md.
 *
 * The tech stack only influences the lint command and the example oracle
 * commands: stacks mentioning "typescript", "node" or "js" get Node commands,
 * stacks mentioning "python" get Python commands, anything else gets generic
 * make / executable commands.
 *
 * Explicit `null` arguments (common in JSON tool input) are treated the same
 * as omitted arguments so they fall back to the defaults instead of throwing
 * or being printed as "null".
 *
 * @param {string} [techStack="general"] - Free-form tech stack description.
 * @param {string} [projectName="this project"] - Name interpolated into the intro.
 * @param {number} [maxAgents=4] - Number of parallel agents the section is written for.
 * @param {boolean} [includeNodebench=true] - Append the NodeBench MCP setup section.
 * @returns {string} The generated markdown.
 */
function generateParallelAgentsMdSection(techStack = "general", projectName = "this project", maxAgents = 4, includeNodebench = true) {
    // Normalize once; String() also tolerates non-string stack values.
    const stack = String(techStack ?? "general").toLowerCase();
    const project = projectName ?? "this project";
    const agentCount = maxAgents ?? 4;
    const withNodebench = includeNodebench ?? true;
    const isTs = stack.includes("typescript") || stack.includes("node") || stack.includes("js");
    const isPython = stack.includes("python");
    const lintCmd = isTs ? "npx tsc --noEmit" : isPython ? "ruff check ." : "make lint";
    // Role recommendations come in three tiers (4+, 2-3, 1 agents); counts
    // between tiers use the lower tier's list.
    const roleRecs = agentCount >= 4
        ? `- Agent 1: **implementer** — Primary feature work
- Agent 2: **test_writer** — Test coverage and edge cases
- Agent 3: **code_quality_critic** — Refactoring and pattern enforcement
- Agent 4: **documentation_maintainer** — Docs, progress files, READMEs`
        : agentCount >= 2
            ? `- Agent 1: **implementer** — Feature work and bug fixes
- Agent 2: **test_writer** — Tests and quality review`
            : `- Agent 1: **implementer** — All work (single agent mode)`;
    let md = `## Parallel Agent Coordination Protocol

> Based on Anthropic's "Building a C Compiler with Parallel Claudes" (Feb 2026).
> Reference: https://www.anthropic.com/engineering/building-c-compiler

This section enables ${agentCount} AI agents to work on ${project} in parallel without conflicts.

### Task Locking Protocol

**Before starting any work**, claim your task to prevent duplicate effort:

1. Check \`.parallel-agents/current_tasks/\` for active claims
2. Create a lock file: \`.parallel-agents/current_tasks/<task_key>.lock\`
- Content: \`{ "agent": "<session_id>", "started": "<ISO timestamp>", "description": "<what you plan to do>" }\`
3. Do your work
4. When done, delete the lock file and update \`.parallel-agents/progress.md\`

**If a lock file already exists for your intended task**: pick a different task. Do NOT delete another agent's lock.

**If a lock file is stale** (older than 2 hours with no progress update): the agent may have crashed. You may reclaim it — but add a note in progress.md.

### Role Specialization

Recommended role assignments for ${agentCount} parallel agents:

${roleRecs}

Each agent should:
- Stay focused on their role's responsibilities
- Avoid making changes outside their scope
- Update progress.md after each significant commit
- Record learnings about patterns discovered in their domain

### Oracle Testing Workflow

Use known-good reference outputs to validate changes:

1. **Capture oracle**: Run the reference implementation and save output
\`\`\`
${isTs ? "node reference-impl.js > .parallel-agents/oracle/test_1.golden" : isPython ? "python reference_impl.py > .parallel-agents/oracle/test_1.golden" : "./reference-impl > .parallel-agents/oracle/test_1.golden"}
\`\`\`
2. **Compare**: After changes, run your implementation and diff against golden file
\`\`\`
${isTs ? "node your-impl.js > /tmp/actual.txt && diff .parallel-agents/oracle/test_1.golden /tmp/actual.txt" : isPython ? "python your_impl.py > /tmp/actual.txt && diff .parallel-agents/oracle/test_1.golden /tmp/actual.txt" : "./your-impl > /tmp/actual.txt && diff .parallel-agents/oracle/test_1.golden /tmp/actual.txt"}
\`\`\`
3. **Triage failures**: Each failing comparison is an independent work item — assign to a different agent
4. **Delta debugging**: If tests pass alone but fail together, split the set in half to isolate the conflict

### Context Budget Rules

LLM agents have finite context windows. Prevent pollution:

- **DO NOT** print thousands of lines of test output — log to file, print summary only
- **DO NOT** read entire large files — use targeted grep/search
- **DO** pre-compute aggregate stats before reporting
- **DO** use \`--fast\` mode (1-10% random sample) for large test suites during development
- **DO** log errors with ERROR prefix on same line for easy grep
- **Budget guideline**: If a single tool output exceeds ~5,000 tokens, summarize it first

### Progress File Protocol

File: \`.parallel-agents/progress.md\`

Every agent MUST read this file at session start and update it after significant work:

- **Current Status**: What's done, what's in progress
- **Active Agents**: Who is working on what (check lock files too)
- **Blocked Items**: What needs help from another agent or human
- **Failed Approaches**: What was tried and didn't work (prevents other agents from repeating mistakes)
- **Key Decisions**: Architectural choices made during parallel work

### Anti-Patterns to Avoid

- **Two agents on same task**: Always check lock files before starting
- **Context dumping**: Never paste >100 lines of raw output into context
- **Stuck loops**: If stuck >30 minutes on one problem, mark as blocked and move on
- **Silent overwrites**: Always pull/rebase before pushing — check for other agents' recent commits
- **No progress updates**: Fresh agents waste time re-orienting without progress.md updates
- **Scope creep**: Stay in your role — an implementer should not refactor unless assigned as critic

### Flywheel Verification (After Bootstrap)

Run this 6-step check to verify parallel agent setup works:

1. **Static Analysis**: \`${lintCmd}\` — zero errors
2. **Happy Path**: One agent claims task → does work → releases → progress.md updated
3. **Conflict Test**: Two agents claim same task → second gets conflict
4. **Oracle Test**: Create golden file → make change → diff catches it
5. **Gap Re-scan**: Re-run detection — all 7 categories should show as present
6. **Document**: Record any new learnings discovered during verification
`;
    if (withNodebench) {
        md += `
### NodeBench MCP Setup (Optional but Recommended)

Install nodebench-mcp for full parallel agent tool support:

\`\`\`bash
# Claude Code CLI
claude mcp add nodebench -- npx -y nodebench-mcp

# Or manual config in .claude.json / settings.json
{
"mcpServers": {
"nodebench": {
"command": "npx",
"args": ["-y", "nodebench-mcp"]
}
}
}
\`\`\`

**Tool mapping** (file-based protocol → MCP tools):

| File-Based | NodeBench MCP Tool | Description |
|------------|-------------------|-------------|
| Lock file in \`current_tasks/\` | \`claim_agent_task\` | Claim a task lock |
| Delete lock file | \`release_agent_task\` | Release with progress note |
| Read \`current_tasks/\` | \`list_agent_tasks\` | See all claims |
| Manual role notes | \`assign_agent_role\` | 7 predefined roles |
| \`diff\` against golden file | \`run_oracle_comparison\` | Oracle testing with history |
| Read progress.md | \`get_parallel_status\` | Full orientation overview |
| Manual token counting | \`log_context_budget\` | Automated budget tracking |
| Run detection manually | \`bootstrap_parallel_agents\` | Auto-detect and scaffold |

**First-time setup with MCP**:
\`\`\`
> Use bootstrap_parallel_agents to scan this project and set up parallel agent infrastructure
> Use getMethodology("parallel_agent_teams") for the full 6-step workflow
\`\`\`
`;
    }
    return md;
}
201
+ export const parallelAgentTools = [
202
+ // ─── Task Locking ───────────────────────────────────────────
203
+ {
204
+ name: "claim_agent_task",
205
+ description: "Claim a task lock so other parallel agents know you're working on it. Prevents duplicate work when multiple agents run simultaneously. Based on Anthropic's parallel Claude task locking pattern. Returns conflict info if another agent already claimed this task.",
206
+ inputSchema: {
207
+ type: "object",
208
+ properties: {
209
+ taskKey: {
210
+ type: "string",
211
+ description: "Unique task identifier (e.g. 'fix_auth_middleware', 'implement_ssr_hydration'). Use snake_case, descriptive names.",
212
+ },
213
+ description: {
214
+ type: "string",
215
+ description: "What you plan to do for this task",
216
+ },
217
+ sessionId: {
218
+ type: "string",
219
+ description: "Your agent session ID. If omitted, uses the MCP connection session.",
220
+ },
221
+ },
222
+ required: ["taskKey"],
223
+ },
224
+ handler: async (args) => {
225
+ const db = getDb();
226
+ const taskKey = args.taskKey;
227
+ const sessionId = args.sessionId || `agent_${Date.now()}`;
228
+ const description = args.description || "";
229
+ // Check if task is already claimed by another active agent
230
+ const existing = db
231
+ .prepare("SELECT * FROM agent_tasks WHERE task_key = ? AND status = 'claimed'")
232
+ .get(taskKey);
233
+ if (existing && existing.session_id !== sessionId) {
234
+ return {
235
+ claimed: false,
236
+ conflict: true,
237
+ existingClaim: {
238
+ sessionId: existing.session_id,
239
+ claimedAt: existing.claimed_at,
240
+ description: existing.description,
241
+ progressNote: existing.progress_note,
242
+ },
243
+ suggestion: "Another agent is already working on this task. Pick a different task or wait for them to release it. Use list_agent_tasks to see all current claims.",
244
+ };
245
+ }
246
+ // Claim or re-claim the task
247
+ const id = genId("task");
248
+ db.prepare("INSERT OR REPLACE INTO agent_tasks (id, task_key, session_id, status, description, claimed_at) VALUES (?, ?, ?, 'claimed', ?, datetime('now'))").run(id, taskKey, sessionId, description);
249
+ // Count total active tasks for this session
250
+ const myTasks = db
251
+ .prepare("SELECT COUNT(*) as c FROM agent_tasks WHERE session_id = ? AND status = 'claimed'")
252
+ .get(sessionId);
253
+ return {
254
+ claimed: true,
255
+ taskId: id,
256
+ taskKey,
257
+ sessionId,
258
+ activeTasks: myTasks.c,
259
+ tip: "Update progress with release_agent_task when done. Other agents can see your claim via list_agent_tasks.",
260
+ };
261
+ },
262
+ },
263
+ {
264
+ name: "release_agent_task",
265
+ description: "Release a task lock after completing work. Updates status and optionally records a progress note for the next agent session. Part of the parallel agent coordination pattern.",
266
+ inputSchema: {
267
+ type: "object",
268
+ properties: {
269
+ taskKey: {
270
+ type: "string",
271
+ description: "The task key to release",
272
+ },
273
+ status: {
274
+ type: "string",
275
+ enum: ["completed", "blocked", "abandoned"],
276
+ description: "Final status: completed (done), blocked (needs help), abandoned (giving up)",
277
+ },
278
+ progressNote: {
279
+ type: "string",
280
+ description: "Note for the next agent picking up this task (e.g. failed approaches, remaining work)",
281
+ },
282
+ sessionId: {
283
+ type: "string",
284
+ description: "Your agent session ID (must match the claim)",
285
+ },
286
+ },
287
+ required: ["taskKey"],
288
+ },
289
+ handler: async (args) => {
290
+ const db = getDb();
291
+ const taskKey = args.taskKey;
292
+ const status = args.status || "completed";
293
+ const progressNote = args.progressNote || "";
294
+ const sessionId = args.sessionId;
295
+ // Find the active claim
296
+ let query = "UPDATE agent_tasks SET status = ?, progress_note = ?, released_at = datetime('now') WHERE task_key = ? AND status = 'claimed'";
297
+ const params = [status, progressNote, taskKey];
298
+ if (sessionId) {
299
+ query += " AND session_id = ?";
300
+ params.push(sessionId);
301
+ }
302
+ const result = db.prepare(query).run(...params);
303
+ if (result.changes === 0) {
304
+ return {
305
+ released: false,
306
+ error: "No active claim found for this task key",
307
+ };
308
+ }
309
+ return {
310
+ released: true,
311
+ taskKey,
312
+ status,
313
+ progressNote: progressNote || "(none)",
314
+ tip: status === "blocked"
315
+ ? "Task marked as blocked. Another agent or human should review the progress note."
316
+ : status === "abandoned"
317
+ ? "Task abandoned. Consider recording a learning about why this failed."
318
+ : "Task completed. Other agents can now pick related tasks.",
319
+ };
320
+ },
321
+ },
322
+ {
323
+ name: "list_agent_tasks",
324
+ description: "List all current task claims across parallel agents. Shows who is working on what, blocked tasks, and recently completed work. Essential for new agent sessions to orient themselves and avoid duplicate work.",
325
+ inputSchema: {
326
+ type: "object",
327
+ properties: {
328
+ status: {
329
+ type: "string",
330
+ enum: ["claimed", "completed", "blocked", "abandoned", "all"],
331
+ description: "Filter by status (default: 'all')",
332
+ },
333
+ limit: {
334
+ type: "number",
335
+ description: "Max results (default: 50)",
336
+ },
337
+ },
338
+ },
339
+ handler: async (args) => {
340
+ const db = getDb();
341
+ const status = args.status || "all";
342
+ const limit = args.limit || 50;
343
+ let query = "SELECT * FROM agent_tasks";
344
+ const params = [];
345
+ if (status !== "all") {
346
+ query += " WHERE status = ?";
347
+ params.push(status);
348
+ }
349
+ query += " ORDER BY claimed_at DESC LIMIT ?";
350
+ params.push(limit);
351
+ const tasks = db.prepare(query).all(...params);
352
+ // Summary stats
353
+ const stats = db
354
+ .prepare("SELECT status, COUNT(*) as count FROM agent_tasks GROUP BY status")
355
+ .all();
356
+ return {
357
+ tasks: tasks.map((t) => ({
358
+ taskKey: t.task_key,
359
+ sessionId: t.session_id,
360
+ status: t.status,
361
+ description: t.description,
362
+ progressNote: t.progress_note,
363
+ claimedAt: t.claimed_at,
364
+ releasedAt: t.released_at,
365
+ })),
366
+ stats: Object.fromEntries(stats.map((s) => [s.status, s.count])),
367
+ total: tasks.length,
368
+ tip: "Claimed tasks are actively being worked on. Pick unclaimed work or blocked tasks that need fresh eyes.",
369
+ };
370
+ },
371
+ },
372
+ // ─── Agent Role Specialization ──────────────────────────────
373
+ {
374
+ name: "assign_agent_role",
375
+ description: 'Assign a specialized role to the current agent session. Roles define focus area and behavioral instructions. Predefined roles: implementer, dedup_reviewer, performance_optimizer, documentation_maintainer, code_quality_critic, test_writer, security_auditor. Based on Anthropic\'s "multiple agent roles" pattern where specialized agents handle dedup, performance, documentation, and code quality.',
376
+ inputSchema: {
377
+ type: "object",
378
+ properties: {
379
+ role: {
380
+ type: "string",
381
+ description: "Role name. Use predefined: implementer, dedup_reviewer, performance_optimizer, documentation_maintainer, code_quality_critic, test_writer, security_auditor. Or define a custom role.",
382
+ },
383
+ sessionId: {
384
+ type: "string",
385
+ description: "Agent session ID to assign the role to",
386
+ },
387
+ customInstructions: {
388
+ type: "string",
389
+ description: "Custom instructions for the role (overrides predefined instructions if set)",
390
+ },
391
+ focusArea: {
392
+ type: "string",
393
+ description: "Specific area to focus on (e.g. 'auth module', 'API routes', 'frontend components')",
394
+ },
395
+ },
396
+ required: ["role"],
397
+ },
398
+ handler: async (args) => {
399
+ const db = getDb();
400
+ const role = args.role;
401
+ const sessionId = args.sessionId || `agent_${Date.now()}`;
402
+ const focusArea = args.focusArea || "";
403
+ const predefined = PREDEFINED_ROLES[role];
404
+ const instructions = args.customInstructions ||
405
+ (predefined ? predefined.instructions : `Custom role: ${role}`);
406
+ const id = genId("role");
407
+ db.prepare("INSERT OR REPLACE INTO agent_roles (id, session_id, role, instructions, focus_area, created_at) VALUES (?, ?, ?, ?, ?, datetime('now'))").run(id, sessionId, role, instructions, focusArea);
408
+ return {
409
+ assigned: true,
410
+ role,
411
+ sessionId,
412
+ description: predefined?.description || `Custom role: ${role}`,
413
+ instructions,
414
+ focusArea: focusArea || "(general)",
415
+ availableRoles: Object.keys(PREDEFINED_ROLES),
416
+ tip: "Your role shapes what tasks you should claim and how you approach work. Use claim_agent_task to pick tasks aligned with your role.",
417
+ };
418
+ },
419
+ },
420
+ {
421
+ name: "get_agent_role",
422
+ description: "Get the current agent's assigned role and instructions. Returns role-specific behavioral guidance. If no role is assigned, suggests one based on current project state.",
423
+ inputSchema: {
424
+ type: "object",
425
+ properties: {
426
+ sessionId: {
427
+ type: "string",
428
+ description: "Agent session ID to look up",
429
+ },
430
+ },
431
+ },
432
+ handler: async (args) => {
433
+ const db = getDb();
434
+ const sessionId = args.sessionId;
435
+ if (sessionId) {
436
+ const role = db
437
+ .prepare("SELECT * FROM agent_roles WHERE session_id = ?")
438
+ .get(sessionId);
439
+ if (role) {
440
+ return {
441
+ hasRole: true,
442
+ role: role.role,
443
+ instructions: role.instructions,
444
+ focusArea: role.focus_area,
445
+ assignedAt: role.created_at,
446
+ };
447
+ }
448
+ }
449
+ // No role assigned — list all active roles
450
+ const activeRoles = db
451
+ .prepare("SELECT * FROM agent_roles ORDER BY created_at DESC LIMIT 20")
452
+ .all();
453
+ return {
454
+ hasRole: false,
455
+ activeRoles: activeRoles.map((r) => ({
456
+ sessionId: r.session_id,
457
+ role: r.role,
458
+ focusArea: r.focus_area,
459
+ assignedAt: r.created_at,
460
+ })),
461
+ availableRoles: Object.entries(PREDEFINED_ROLES).map(([k, v]) => ({
462
+ role: k,
463
+ description: v.description,
464
+ })),
465
+ tip: "No role assigned for this session. Call assign_agent_role to specialize. This helps parallel agents coordinate by role.",
466
+ };
467
+ },
468
+ },
469
+ // ─── Context Window Budget Management ───────────────────────
470
+ {
471
+ name: "log_context_budget",
472
+ description: "Track context window usage to prevent pollution. LLM agents have finite context and, as Anthropic's blog notes, test harnesses should NOT print thousands of useless bytes. Use this to track token usage, flag when approaching limits, and recommend summarization. Implements the 'context window pollution prevention' pattern.",
473
+ inputSchema: {
474
+ type: "object",
475
+ properties: {
476
+ sessionId: {
477
+ type: "string",
478
+ description: "Agent session ID",
479
+ },
480
+ eventType: {
481
+ type: "string",
482
+ enum: [
483
+ "tool_output",
484
+ "file_read",
485
+ "test_output",
486
+ "log_output",
487
+ "search_result",
488
+ "checkpoint",
489
+ ],
490
+ description: "What kind of content consumed context",
491
+ },
492
+ tokensUsed: {
493
+ type: "number",
494
+ description: "Approximate tokens consumed by this event (estimate: chars / 4)",
495
+ },
496
+ tokensLimit: {
497
+ type: "number",
498
+ description: "Total context window limit (default: 200000 for Claude)",
499
+ },
500
+ description: {
501
+ type: "string",
502
+ description: "What generated this context usage",
503
+ },
504
+ },
505
+ required: ["eventType", "tokensUsed"],
506
+ },
507
+ handler: async (args) => {
508
+ const db = getDb();
509
+ const sessionId = args.sessionId || `agent_${Date.now()}`;
510
+ const eventType = args.eventType;
511
+ const tokensUsed = args.tokensUsed;
512
+ const tokensLimit = args.tokensLimit || 200000;
513
+ const description = args.description || "";
514
+ const id = genId("ctx");
515
+ db.prepare("INSERT INTO context_budget_log (id, session_id, event_type, tokens_used, tokens_limit, description, created_at) VALUES (?, ?, ?, ?, ?, ?, datetime('now'))").run(id, sessionId, eventType, tokensUsed, tokensLimit, description);
516
+ // Calculate total usage for this session
517
+ const total = db
518
+ .prepare("SELECT SUM(tokens_used) as total FROM context_budget_log WHERE session_id = ?")
519
+ .get(sessionId);
520
+ const totalUsed = total?.total || 0;
521
+ const percentUsed = Math.round((totalUsed / tokensLimit) * 100);
522
+ // Breakdown by event type
523
+ const breakdown = db
524
+ .prepare("SELECT event_type, SUM(tokens_used) as total, COUNT(*) as count FROM context_budget_log WHERE session_id = ? GROUP BY event_type ORDER BY total DESC")
525
+ .all(sessionId);
526
+ const warnings = [];
527
+ if (percentUsed > 80) {
528
+ warnings.push("CRITICAL: Over 80% context budget used. Summarize findings and start a fresh session.");
529
+ }
530
+ else if (percentUsed > 60) {
531
+ warnings.push("WARNING: Over 60% context budget used. Avoid reading large files. Use targeted grep instead of full file reads.");
532
+ }
533
+ else if (percentUsed > 40) {
534
+ warnings.push("NOTE: Approaching 40% context budget. Consider pre-computing summaries rather than dumping raw output.");
535
+ }
536
+ // Check for the biggest polluter
537
+ if (breakdown.length > 0 && breakdown[0].total > tokensLimit * 0.3) {
538
+ warnings.push(`Biggest context consumer: '${breakdown[0].event_type}' (${breakdown[0].total} tokens, ${breakdown[0].count} events). Consider reducing output from this source.`);
539
+ }
540
+ return {
541
+ logged: true,
542
+ sessionId,
543
+ event: { type: eventType, tokens: tokensUsed, description },
544
+ budget: {
545
+ totalUsed,
546
+ limit: tokensLimit,
547
+ percentUsed,
548
+ remaining: tokensLimit - totalUsed,
549
+ },
550
+ breakdown: breakdown.map((b) => ({
551
+ eventType: b.event_type,
552
+ totalTokens: b.total,
553
+ eventCount: b.count,
554
+ })),
555
+ warnings,
556
+ bestPractices: [
557
+ "Log errors with ERROR prefix on same line for easy grep",
558
+ "Pre-compute aggregate stats instead of dumping raw data",
559
+ "Use --fast mode (random 1-10% sample) for large test suites",
560
+ "Write detailed output to log files, print only summaries to context",
561
+ ],
562
+ };
563
+ },
564
+ },
565
+ // ─── Oracle-Based Testing ───────────────────────────────────
566
+ {
567
+ name: "run_oracle_comparison",
568
+ description: 'Compare actual output against a known-good oracle reference. Based on Anthropic\'s pattern of using GCC as an "online known-good compiler oracle" to identify which specific components are broken. The oracle pattern enables parallel debugging: each agent can work on different failing comparisons independently.',
569
+ inputSchema: {
570
+ type: "object",
571
+ properties: {
572
+ testLabel: {
573
+ type: "string",
574
+ description: "Label for this comparison (e.g. 'auth_middleware_output', 'api_response_format')",
575
+ },
576
+ actualOutput: {
577
+ type: "string",
578
+ description: "The actual output from your implementation",
579
+ },
580
+ expectedOutput: {
581
+ type: "string",
582
+ description: "The known-good reference output (oracle)",
583
+ },
584
+ oracleSource: {
585
+ type: "string",
586
+ description: "Where the oracle output came from (e.g. 'production_v2.1', 'reference_implementation', 'golden_file')",
587
+ },
588
+ sessionId: {
589
+ type: "string",
590
+ description: "Agent session ID for tracking",
591
+ },
592
+ cycleId: {
593
+ type: "string",
594
+ description: "Verification cycle ID to link this comparison to",
595
+ },
596
+ },
597
+ required: ["testLabel", "actualOutput", "expectedOutput", "oracleSource"],
598
+ },
599
+ handler: async (args) => {
600
+ const db = getDb();
601
+ const testLabel = args.testLabel;
602
+ const actualOutput = args.actualOutput;
603
+ const expectedOutput = args.expectedOutput;
604
+ const oracleSource = args.oracleSource;
605
+ const sessionId = args.sessionId || "";
606
+ const cycleId = args.cycleId || "";
607
+ // Compute match and diff
608
+ const exactMatch = actualOutput === expectedOutput;
609
+ // Simple line-level diff
610
+ const actualLines = actualOutput.split("\n");
611
+ const expectedLines = expectedOutput.split("\n");
612
+ const diffLines = [];
613
+ const maxLines = Math.max(actualLines.length, expectedLines.length);
614
+ let matchingLines = 0;
615
+ for (let i = 0; i < maxLines; i++) {
616
+ const a = actualLines[i] ?? "(missing)";
617
+ const e = expectedLines[i] ?? "(missing)";
618
+ if (a === e) {
619
+ matchingLines++;
620
+ }
621
+ else {
622
+ if (diffLines.length < 20) {
623
+ diffLines.push(`Line ${i + 1}: expected "${e.slice(0, 100)}" got "${a.slice(0, 100)}"`);
624
+ }
625
+ }
626
+ }
627
+ const matchPercent = maxLines > 0 ? Math.round((matchingLines / maxLines) * 100) : 100;
628
+ const diffSummary = diffLines.length > 0
629
+ ? diffLines.join("\n")
630
+ : "Exact match — no differences";
631
+ const id = genId("oracle");
632
+ db.prepare("INSERT INTO oracle_comparisons (id, test_label, oracle_source, actual_output, expected_output, match, diff_summary, session_id, cycle_id, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))").run(id, testLabel, oracleSource, actualOutput, expectedOutput, exactMatch ? 1 : 0, diffSummary, sessionId, cycleId);
633
+ // Get recent comparison history for this label
634
+ const history = db
635
+ .prepare("SELECT match, created_at FROM oracle_comparisons WHERE test_label = ? ORDER BY created_at DESC LIMIT 5")
636
+ .all(testLabel);
637
+ return {
638
+ comparisonId: id,
639
+ testLabel,
640
+ oracleSource,
641
+ result: {
642
+ exactMatch,
643
+ matchPercent,
644
+ totalLines: maxLines,
645
+ matchingLines,
646
+ diffCount: maxLines - matchingLines,
647
+ },
648
+ diff: diffSummary,
649
+ history: history.map((h) => ({
650
+ match: h.match === 1,
651
+ at: h.created_at,
652
+ })),
653
+ tip: exactMatch
654
+ ? "Output matches oracle. Safe to proceed."
655
+ : `${maxLines - matchingLines} lines differ. Fix differences before committing. Each differing section can be assigned to a parallel agent.`,
656
+ };
657
+ },
658
+ },
659
+ // ─── Parallel Agent Overview ────────────────────────────────
660
+ {
661
+ name: "get_parallel_status",
662
+ description: "Get a comprehensive overview of all parallel agent activity: active task claims, role assignments, context budget status, and recent oracle comparison results. Essential for new agent sessions to orient themselves (Anthropic pattern: 'agents dropped into a fresh container with no context').",
663
+ inputSchema: {
664
+ type: "object",
665
+ properties: {
666
+ includeHistory: {
667
+ type: "boolean",
668
+ description: "Include completed/abandoned tasks and past comparisons (default: false)",
669
+ },
670
+ },
671
+ },
672
+ handler: async (args) => {
673
+ const db = getDb();
674
+ const includeHistory = args.includeHistory || false;
675
+ // Active tasks
676
+ const activeTasks = db
677
+ .prepare("SELECT * FROM agent_tasks WHERE status = 'claimed' ORDER BY claimed_at DESC")
678
+ .all();
679
+ const blockedTasks = db
680
+ .prepare("SELECT * FROM agent_tasks WHERE status = 'blocked' ORDER BY released_at DESC LIMIT 10")
681
+ .all();
682
+ // Roles
683
+ const roles = db
684
+ .prepare("SELECT * FROM agent_roles ORDER BY created_at DESC LIMIT 20")
685
+ .all();
686
+ // Recent oracle results
687
+ const recentOracle = db
688
+ .prepare("SELECT test_label, match, oracle_source, created_at FROM oracle_comparisons ORDER BY created_at DESC LIMIT 10")
689
+ .all();
690
+ // Context budget summaries
691
+ const budgetSummary = db
692
+ .prepare("SELECT session_id, SUM(tokens_used) as total_tokens, MAX(tokens_limit) as budget, COUNT(*) as events FROM context_budget_log GROUP BY session_id ORDER BY total_tokens DESC LIMIT 10")
693
+ .all();
694
+ // Task stats
695
+ const taskStats = db
696
+ .prepare("SELECT status, COUNT(*) as count FROM agent_tasks GROUP BY status")
697
+ .all();
698
+ // Optional history
699
+ let completedTasks = [];
700
+ if (includeHistory) {
701
+ completedTasks = db
702
+ .prepare("SELECT * FROM agent_tasks WHERE status IN ('completed', 'abandoned') ORDER BY released_at DESC LIMIT 20")
703
+ .all();
704
+ }
705
+ // Failed oracle comparisons (opportunities for parallel work)
706
+ const failedOracle = db
707
+ .prepare("SELECT test_label, diff_summary, oracle_source, created_at FROM oracle_comparisons WHERE match = 0 ORDER BY created_at DESC LIMIT 10")
708
+ .all();
709
+ return {
710
+ activeTasks: activeTasks.map((t) => ({
711
+ taskKey: t.task_key,
712
+ sessionId: t.session_id,
713
+ description: t.description,
714
+ claimedAt: t.claimed_at,
715
+ })),
716
+ blockedTasks: blockedTasks.map((t) => ({
717
+ taskKey: t.task_key,
718
+ progressNote: t.progress_note,
719
+ releasedAt: t.released_at,
720
+ })),
721
+ roles: roles.map((r) => ({
722
+ sessionId: r.session_id,
723
+ role: r.role,
724
+ focusArea: r.focus_area,
725
+ })),
726
+ taskStats: Object.fromEntries(taskStats.map((s) => [s.status, s.count])),
727
+ recentOracleResults: recentOracle.map((o) => ({
728
+ testLabel: o.test_label,
729
+ match: o.match === 1,
730
+ oracleSource: o.oracle_source,
731
+ at: o.created_at,
732
+ })),
733
+ failedOracleTests: failedOracle.map((o) => ({
734
+ testLabel: o.test_label,
735
+ diffSummary: (o.diff_summary || "").slice(0, 200),
736
+ oracleSource: o.oracle_source,
737
+ })),
738
+ contextBudgets: budgetSummary.map((b) => ({
739
+ sessionId: b.session_id,
740
+ totalTokens: b.total_tokens,
741
+ budget: b.budget,
742
+ percentUsed: Math.round((b.total_tokens / b.budget) * 100),
743
+ events: b.events,
744
+ })),
745
+ ...(includeHistory ? { completedTasks: completedTasks.map((t) => ({
746
+ taskKey: t.task_key,
747
+ status: t.status,
748
+ progressNote: t.progress_note,
749
+ releasedAt: t.released_at,
750
+ })) } : {}),
751
+ orientation: {
752
+ summary: `${activeTasks.length} active tasks, ${blockedTasks.length} blocked, ${roles.length} agents with roles, ${failedOracle.length} failing oracle tests`,
753
+ nextSteps: [
754
+ activeTasks.length > 0
755
+ ? "Review active tasks — avoid claiming the same work"
756
+ : "No active tasks — pick the next most impactful work item",
757
+ blockedTasks.length > 0
758
+ ? "Blocked tasks need fresh eyes — review progress notes"
759
+ : null,
760
+ failedOracle.length > 0
761
+ ? `${failedOracle.length} oracle tests failing — each can be assigned to a different agent`
762
+ : null,
763
+ ].filter(Boolean),
764
+ },
765
+ };
766
+ },
767
+ },
768
+ // ─── Bootstrap Parallel Agents for External Repos ──────────
769
+ {
770
+ name: "bootstrap_parallel_agents",
771
+ description: "Detect whether a target project repo has parallel agent infrastructure and, if not, scaffold everything needed. Scans for task coordination, role configs, oracle testing, context budget tracking, progress files, AGENTS.md parallel sections, and git worktrees. Returns a gap report with severity ratings and ready-to-use scaffold commands. Uses the AI Flywheel closed loop: detect → research → implement → test → fix → document. Works on ANY project directory — not just nodebench.",
772
+ inputSchema: {
773
+ type: "object",
774
+ properties: {
775
+ projectRoot: {
776
+ type: "string",
777
+ description: "Root directory of the target project to scan and bootstrap (default: current working directory)",
778
+ },
779
+ dryRun: {
780
+ type: "boolean",
781
+ description: "Preview only — show what would be created without writing files (default: true)",
782
+ },
783
+ includeAgentsMd: {
784
+ type: "boolean",
785
+ description: "Generate and include a portable AGENTS.md parallel section for the target repo (default: true)",
786
+ },
787
+ techStack: {
788
+ type: "string",
789
+ description: "Target project's tech stack hint (e.g. 'TypeScript/Node', 'Python/FastAPI', 'Rust') — helps generate idiomatic scaffolds",
790
+ },
791
+ },
792
+ },
793
+ handler: async (args) => {
794
+ const projectRoot = args.projectRoot || process.cwd();
795
+ const dryRun = args.dryRun !== false;
796
+ const includeAgentsMd = args.includeAgentsMd !== false;
797
+ const techStack = args.techStack || "unknown";
798
+ // ── Phase 1: Detection ──────────────────────────────────
799
+ const fs = await import("fs");
800
+ const path = await import("path");
801
+ const results = [];
802
+ // Helper: check if a path exists
803
+ const exists = (p) => {
804
+ try {
805
+ fs.accessSync(p);
806
+ return true;
807
+ }
808
+ catch {
809
+ return false;
810
+ }
811
+ };
812
+ // Helper: check if a file contains a pattern
813
+ const fileContains = (filePath, patterns) => {
814
+ try {
815
+ const content = fs.readFileSync(filePath, "utf-8").toLowerCase();
816
+ return patterns.filter((p) => content.includes(p.toLowerCase()));
817
+ }
818
+ catch {
819
+ return [];
820
+ }
821
+ };
822
+ // Helper: find files matching patterns in top-level dirs
823
+ const findFiles = (root, namePatterns, maxDepth = 3) => {
824
+ const found = [];
825
+ const scan = (dir, depth) => {
826
+ if (depth > maxDepth)
827
+ return;
828
+ try {
829
+ const entries = fs.readdirSync(dir, { withFileTypes: true });
830
+ for (const entry of entries) {
831
+ if (entry.name.startsWith(".") && entry.name !== ".parallel-agents")
832
+ continue;
833
+ if (entry.name === "node_modules" || entry.name === "dist" || entry.name === "__pycache__")
834
+ continue;
835
+ const full = path.join(dir, entry.name);
836
+ if (entry.isFile()) {
837
+ const lower = entry.name.toLowerCase();
838
+ if (namePatterns.some((p) => lower.includes(p.toLowerCase()))) {
839
+ found.push(full);
840
+ }
841
+ }
842
+ else if (entry.isDirectory()) {
843
+ scan(full, depth + 1);
844
+ }
845
+ }
846
+ }
847
+ catch { /* permission denied, etc */ }
848
+ };
849
+ scan(root, 0);
850
+ return found;
851
+ };
852
+ // 1. Task Coordination
853
+ {
854
+ const evidence = [];
855
+ const taskDirs = ["current_tasks", ".parallel-agents", "tasks", ".tasks"];
856
+ for (const d of taskDirs) {
857
+ if (exists(path.join(projectRoot, d)))
858
+ evidence.push(`Directory found: ${d}/`);
859
+ }
860
+ const taskFiles = findFiles(projectRoot, ["task_lock", "taskLock", "claim_task", "claimTask"]);
861
+ for (const f of taskFiles)
862
+ evidence.push(`Task file: ${path.relative(projectRoot, f)}`);
863
+ // Check AGENTS.md / CLAUDE.md for task coordination mentions
864
+ for (const agentsFile of ["AGENTS.md", "CLAUDE.md", "agents.md"]) {
865
+ const matches = fileContains(path.join(projectRoot, agentsFile), ["task lock", "claim_task", "parallel agent", "worktree"]);
866
+ if (matches.length > 0)
867
+ evidence.push(`${agentsFile} mentions: ${matches.join(", ")}`);
868
+ }
869
+ results.push({
870
+ category: "task_coordination",
871
+ detected: evidence.length > 0,
872
+ confidence: Math.min(evidence.length * 0.3, 1),
873
+ evidence,
874
+ severity: "CRITICAL",
875
+ });
876
+ }
877
+ // 2. Role Configuration
878
+ {
879
+ const evidence = [];
880
+ const roleFiles = findFiles(projectRoot, ["role", "agent_role", "agentRole"]);
881
+ for (const f of roleFiles.slice(0, 5))
882
+ evidence.push(`Role file: ${path.relative(projectRoot, f)}`);
883
+ for (const agentsFile of ["AGENTS.md", "CLAUDE.md"]) {
884
+ const matches = fileContains(path.join(projectRoot, agentsFile), ["agent role", "role specializ", "implementer", "dedup_reviewer"]);
885
+ if (matches.length > 0)
886
+ evidence.push(`${agentsFile} mentions roles: ${matches.join(", ")}`);
887
+ }
888
+ results.push({
889
+ category: "role_specialization",
890
+ detected: evidence.length > 0,
891
+ confidence: Math.min(evidence.length * 0.35, 1),
892
+ evidence,
893
+ severity: "HIGH",
894
+ });
895
+ }
896
+ // 3. Oracle Testing
897
+ {
898
+ const evidence = [];
899
+ const oracleDirs = ["oracle", "golden", "golden_files", "reference_outputs", "snapshots", "__snapshots__"];
900
+ for (const d of oracleDirs) {
901
+ if (exists(path.join(projectRoot, d)))
902
+ evidence.push(`Oracle dir: ${d}/`);
903
+ }
904
+ const oracleFiles = findFiles(projectRoot, ["oracle", "golden", "reference_output", "snapshot"]);
905
+ for (const f of oracleFiles.slice(0, 5))
906
+ evidence.push(`Oracle file: ${path.relative(projectRoot, f)}`);
907
+ results.push({
908
+ category: "oracle_testing",
909
+ detected: evidence.length > 0,
910
+ confidence: Math.min(evidence.length * 0.25, 1),
911
+ evidence,
912
+ severity: "HIGH",
913
+ });
914
+ }
915
+ // 4. Context Budget Tracking
916
+ {
917
+ const evidence = [];
918
+ const budgetFiles = findFiles(projectRoot, ["context_budget", "contextBudget", "token_budget", "tokenBudget"]);
919
+ for (const f of budgetFiles.slice(0, 5))
920
+ evidence.push(`Budget file: ${path.relative(projectRoot, f)}`);
921
+ for (const agentsFile of ["AGENTS.md", "CLAUDE.md"]) {
922
+ const matches = fileContains(path.join(projectRoot, agentsFile), ["context budget", "token budget", "context pollution", "context window"]);
923
+ if (matches.length > 0)
924
+ evidence.push(`${agentsFile} mentions: ${matches.join(", ")}`);
925
+ }
926
+ results.push({
927
+ category: "context_budget",
928
+ detected: evidence.length > 0,
929
+ confidence: Math.min(evidence.length * 0.35, 1),
930
+ evidence,
931
+ severity: "MEDIUM",
932
+ });
933
+ }
934
+ // 5. Progress Files
935
+ {
936
+ const evidence = [];
937
+ const progressFiles = ["PROGRESS.md", "progress.md", "claude-progress.txt", "STATUS.md", "CHANGELOG.md"];
938
+ for (const f of progressFiles) {
939
+ if (exists(path.join(projectRoot, f)))
940
+ evidence.push(`Progress file: ${f}`);
941
+ }
942
+ results.push({
943
+ category: "progress_files",
944
+ detected: evidence.length > 0,
945
+ confidence: Math.min(evidence.length * 0.4, 1),
946
+ evidence,
947
+ severity: "MEDIUM",
948
+ });
949
+ }
950
+ // 6. AGENTS.md Parallel Section
951
+ {
952
+ const evidence = [];
953
+ for (const agentsFile of ["AGENTS.md", "CLAUDE.md", "agents.md", "NODEBENCH_AGENTS.md"]) {
954
+ const fp = path.join(projectRoot, agentsFile);
955
+ if (exists(fp)) {
956
+ evidence.push(`Found: ${agentsFile}`);
957
+ const matches = fileContains(fp, ["parallel agent", "multi-agent", "subagent", "worktree", "task locking"]);
958
+ if (matches.length > 0)
959
+ evidence.push(`${agentsFile} has parallel content: ${matches.join(", ")}`);
960
+ }
961
+ }
962
+ results.push({
963
+ category: "agents_md_parallel",
964
+ detected: evidence.some((e) => e.includes("parallel content")),
965
+ confidence: evidence.some((e) => e.includes("parallel content")) ? 0.9 : 0,
966
+ evidence,
967
+ severity: "CRITICAL",
968
+ });
969
+ }
970
+ // 7. Git Worktrees
971
+ {
972
+ const evidence = [];
973
+ const worktreeDir = path.join(projectRoot, ".git", "worktrees");
974
+ if (exists(worktreeDir)) {
975
+ try {
976
+ const wts = fs.readdirSync(worktreeDir);
977
+ evidence.push(`Git worktrees found: ${wts.length} (${wts.slice(0, 5).join(", ")})`);
978
+ }
979
+ catch { /* no access */ }
980
+ }
981
+ results.push({
982
+ category: "git_worktrees",
983
+ detected: evidence.length > 0,
984
+ confidence: evidence.length > 0 ? 0.9 : 0,
985
+ evidence,
986
+ severity: "LOW",
987
+ });
988
+ }
989
+ // ── Phase 2: Gap Report ─────────────────────────────────
990
+ const missing = results.filter((r) => !r.detected);
991
+ const detected = results.filter((r) => r.detected);
992
+ const hasParallelInfra = missing.filter((m) => m.severity === "CRITICAL").length === 0;
993
+ const scaffoldFiles = [];
994
+ // Determine comment style based on tech stack
995
+ const isTs = techStack.toLowerCase().includes("typescript") || techStack.toLowerCase().includes("node") || techStack.toLowerCase().includes("js");
996
+ const isPython = techStack.toLowerCase().includes("python");
997
+ const isRust = techStack.toLowerCase().includes("rust");
998
+ // Task coordination directory
999
+ if (!results.find((r) => r.category === "task_coordination")?.detected) {
1000
+ scaffoldFiles.push({
1001
+ path: ".parallel-agents/README.md",
1002
+ content: `# Parallel Agent Coordination
1003
+
1004
+ This directory manages parallel agent task coordination.
1005
+
1006
+ ## Structure
1007
+ - \`current_tasks/\` — Active task lock files (one per claimed task)
1008
+ - \`progress.md\` — Running status document for agent orientation
1009
+ - \`roles.json\` — Active role assignments
1010
+ - \`oracle/\` — Golden reference outputs for oracle testing
1011
+
1012
+ ## How it works
1013
+ 1. Before starting work, an agent creates a lock file in \`current_tasks/\`
1014
+ 2. Other agents check this directory to avoid duplicate work
1015
+ 3. When done, the agent removes the lock and updates \`progress.md\`
1016
+
1017
+ ## Using with NodeBench MCP
1018
+ If you have nodebench-mcp installed, these operations are handled by:
1019
+ - \`claim_agent_task\` / \`release_agent_task\` — Task locking
1020
+ - \`assign_agent_role\` — Role specialization
1021
+ - \`run_oracle_comparison\` — Oracle testing
1022
+ - \`get_parallel_status\` — Agent orientation
1023
+
1024
+ Install: \`npx -y nodebench-mcp\` or \`claude mcp add nodebench -- npx -y nodebench-mcp\`
1025
+ `,
1026
+ description: "Parallel agents coordination directory README",
1027
+ });
1028
+ scaffoldFiles.push({
1029
+ path: ".parallel-agents/current_tasks/.gitkeep",
1030
+ content: "",
1031
+ description: "Task lock directory (empty, agents create lock files here)",
1032
+ });
1033
+ scaffoldFiles.push({
1034
+ path: ".parallel-agents/oracle/.gitkeep",
1035
+ content: "",
1036
+ description: "Oracle golden files directory",
1037
+ });
1038
+ }
1039
+ // Progress file
1040
+ if (!results.find((r) => r.category === "progress_files")?.detected) {
1041
+ scaffoldFiles.push({
1042
+ path: ".parallel-agents/progress.md",
1043
+ content: `# Parallel Agent Progress
1044
+
1045
+ > Updated by agents after each work session. Read this FIRST when starting a new session.
1046
+
1047
+ ## Current Status
1048
+ - [ ] No tasks started yet
1049
+
1050
+ ## Active Agents
1051
+ (none)
1052
+
1053
+ ## Completed Work
1054
+ (none yet)
1055
+
1056
+ ## Blocked Items
1057
+ (none)
1058
+
1059
+ ## Failed Approaches
1060
+ (Record what didn't work so other agents don't repeat mistakes)
1061
+
1062
+ ## Key Decisions
1063
+ (Record architectural or design decisions made during parallel work)
1064
+ `,
1065
+ description: "Running progress document for agent orientation",
1066
+ });
1067
+ }
1068
+ // Role configuration
1069
+ if (!results.find((r) => r.category === "role_specialization")?.detected) {
1070
+ scaffoldFiles.push({
1071
+ path: ".parallel-agents/roles.json",
1072
+ content: JSON.stringify({
1073
+ _comment: "Agent role assignments. Updated by assign_agent_role or manually.",
1074
+ predefinedRoles: {
1075
+ implementer: "Primary feature work. Picks failing tests, implements fixes.",
1076
+ test_writer: "Writes targeted tests for edge cases and failure modes.",
1077
+ code_quality_critic: "Structural improvements, pattern enforcement.",
1078
+ documentation_maintainer: "Keeps READMEs and progress files in sync.",
1079
+ dedup_reviewer: "Finds and coalesces duplicate implementations.",
1080
+ performance_optimizer: "Profiles bottlenecks, optimizes hot paths.",
1081
+ security_auditor: "Audits for vulnerabilities, logs CRITICAL gaps.",
1082
+ },
1083
+ activeAssignments: [],
1084
+ }, null, 2),
1085
+ description: "Role definitions and active assignments",
1086
+ });
1087
+ }
1088
+ // AGENTS.md parallel section
1089
+ let agentsMdContent = "";
1090
+ if (!results.find((r) => r.category === "agents_md_parallel")?.detected && includeAgentsMd) {
1091
+ agentsMdContent = generateParallelAgentsMdSection(techStack);
1092
+ const existingAgentsMd = exists(path.join(projectRoot, "AGENTS.md"));
1093
+ scaffoldFiles.push({
1094
+ path: existingAgentsMd ? "AGENTS.md.parallel-append" : "AGENTS.md",
1095
+ content: existingAgentsMd
1096
+ ? `\n\n${agentsMdContent}`
1097
+ : `# Agent Instructions\n\n${agentsMdContent}`,
1098
+ description: existingAgentsMd
1099
+ ? "Append this content to your existing AGENTS.md"
1100
+ : "New AGENTS.md with parallel agent coordination section",
1101
+ });
1102
+ }
1103
+ // ── Phase 4: Write files (if not dry run) ──────────────
1104
+ const created = [];
1105
+ if (!dryRun) {
1106
+ for (const file of scaffoldFiles) {
1107
+ const fullPath = path.join(projectRoot, file.path);
1108
+ const dir = path.dirname(fullPath);
1109
+ try {
1110
+ fs.mkdirSync(dir, { recursive: true });
1111
+ // Don't overwrite existing files (except .gitkeep and append markers)
1112
+ if (!file.path.endsWith(".gitkeep") && !file.path.endsWith("-append") && exists(fullPath)) {
1113
+ continue;
1114
+ }
1115
+ fs.writeFileSync(fullPath, file.content, "utf-8");
1116
+ created.push(file.path);
1117
+ }
1118
+ catch (e) {
1119
+ // Log but don't fail
1120
+ created.push(`FAILED: ${file.path} — ${e.message}`);
1121
+ }
1122
+ }
1123
+ }
1124
+ // ── Phase 5: Flywheel Verification Plan ────────────────
1125
+ const flywheelPlan = [
1126
+ {
1127
+ step: 1,
1128
+ name: "Static Analysis",
1129
+ action: "Verify scaffold files are valid and don't conflict with existing project structure",
1130
+ tool: "run_closed_loop({ steps: [{ step: 'compile', passed: true }] })",
1131
+ },
1132
+ {
1133
+ step: 2,
1134
+ name: "Happy Path Test",
1135
+ action: "Have one agent claim a task, do work, release it. Verify progress.md updates.",
1136
+ tool: "claim_agent_task → release_agent_task → list_agent_tasks",
1137
+ },
1138
+ {
1139
+ step: 3,
1140
+ name: "Conflict Test",
1141
+ action: "Have two agents try to claim the same task. Verify the second gets a conflict response.",
1142
+ tool: "claim_agent_task (agent A) → claim_agent_task (agent B, same key)",
1143
+ },
1144
+ {
1145
+ step: 4,
1146
+ name: "Oracle Validation",
1147
+ action: "Create a golden file, run oracle comparison, verify match detection works.",
1148
+ tool: "run_oracle_comparison({ testLabel: 'smoke', actualOutput: 'hello', expectedOutput: 'hello', oracleSource: 'manual' })",
1149
+ },
1150
+ {
1151
+ step: 5,
1152
+ name: "Gap Analysis",
1153
+ action: "Re-run bootstrap_parallel_agents to verify all gaps are now filled.",
1154
+ tool: "bootstrap_parallel_agents({ projectRoot: '...', dryRun: true })",
1155
+ },
1156
+ {
1157
+ step: 6,
1158
+ name: "Document",
1159
+ action: "Record learnings and update AGENTS.md with any new patterns discovered.",
1160
+ tool: "record_learning + update_agents_md",
1161
+ },
1162
+ ];
1163
+ return {
1164
+ projectRoot,
1165
+ dryRun,
1166
+ detection: {
1167
+ hasParallelInfra,
1168
+ detected: detected.map((r) => ({
1169
+ category: r.category,
1170
+ confidence: r.confidence,
1171
+ evidence: r.evidence,
1172
+ })),
1173
+ missing: missing.map((r) => ({
1174
+ category: r.category,
1175
+ severity: r.severity,
1176
+ description: {
1177
+ task_coordination: "No task locking mechanism — parallel agents may duplicate work",
1178
+ role_specialization: "No role configuration — agents won't specialize effectively",
1179
+ oracle_testing: "No oracle/golden file infrastructure — can't validate against known-good references",
1180
+ context_budget: "No context budget tracking — risk of context window pollution",
1181
+ progress_files: "No progress files — fresh agent sessions can't orient themselves",
1182
+ agents_md_parallel: "AGENTS.md has no parallel agent section — agents won't know the coordination protocol",
1183
+ git_worktrees: "No git worktrees — parallel agents will need separate clones or worktrees",
1184
+ }[r.category] || `Missing ${r.category}`,
1185
+ })),
1186
+ score: `${detected.length}/${results.length} capabilities present`,
1187
+ },
1188
+ scaffold: {
1189
+ files: scaffoldFiles.map((f) => ({
1190
+ path: f.path,
1191
+ description: f.description,
1192
+ sizeBytes: f.content.length,
1193
+ })),
1194
+ totalFiles: scaffoldFiles.length,
1195
+ ...(dryRun ? {} : { created }),
1196
+ },
1197
+ flywheelPlan,
1198
+ nextSteps: [
1199
+ dryRun && scaffoldFiles.length > 0
1200
+ ? "Run with dryRun=false to create scaffold files"
1201
+ : null,
1202
+ scaffoldFiles.some((f) => f.path.endsWith("-append"))
1203
+ ? "Manually append the AGENTS.md.parallel-append content to your existing AGENTS.md"
1204
+ : null,
1205
+ "Run the 6-step flywheel verification plan above to validate the setup",
1206
+ "Install nodebench-mcp for full tool support: claude mcp add nodebench -- npx -y nodebench-mcp",
1207
+ "Set up 3-5 git worktrees for maximum parallel throughput: git worktree add ../project-wt1 -b agent-1",
1208
+ missing.length === 0
1209
+ ? "All parallel agent infrastructure detected! Ready for multi-agent work."
1210
+ : null,
1211
+ ].filter(Boolean),
1212
+ tip: hasParallelInfra
1213
+ ? "This project already has parallel agent infrastructure. Use get_parallel_status to orient and start working."
1214
+ : `This project is missing ${missing.length} parallel agent capabilities. ${dryRun ? "Run with dryRun=false to scaffold them automatically." : `Scaffolded ${created.length} files. Run the flywheel plan to verify.`}`,
1215
+ };
1216
+ },
1217
+ },
1218
+ {
1219
+ name: "generate_parallel_agents_md",
1220
+ description: "Generate a portable, framework-agnostic AGENTS.md section for parallel agent coordination. Designed to be dropped into ANY project repo so that AI agents (Claude, GPT, etc.) automatically know how to coordinate in parallel. Includes task locking protocol, role definitions, oracle testing workflow, context budget rules, and anti-patterns. Output is ready to paste into an existing AGENTS.md or use standalone.",
1221
+ inputSchema: {
1222
+ type: "object",
1223
+ properties: {
1224
+ techStack: {
1225
+ type: "string",
1226
+ description: "Target project tech stack (e.g. 'TypeScript/React', 'Python/Django', 'Rust'). Tailors examples to the stack.",
1227
+ },
1228
+ projectName: {
1229
+ type: "string",
1230
+ description: "Project name for the header (default: 'this project')",
1231
+ },
1232
+ maxAgents: {
1233
+ type: "number",
1234
+ description: "Expected max parallel agents (default: 4). Affects role recommendations.",
1235
+ },
1236
+ includeNodebenchSetup: {
1237
+ type: "boolean",
1238
+ description: "Include nodebench-mcp installation and tool mapping instructions (default: true)",
1239
+ },
1240
+ },
1241
+ },
1242
+ handler: async (args) => {
1243
+ const techStack = args.techStack || "general";
1244
+ const projectName = args.projectName || "this project";
1245
+ const maxAgents = args.maxAgents || 4;
1246
+ const includeNodebench = args.includeNodebenchSetup !== false;
1247
+ const content = generateParallelAgentsMdSection(techStack, projectName, maxAgents, includeNodebench);
1248
+ return {
1249
+ format: "markdown",
1250
+ content,
1251
+ usage: [
1252
+ "Option A: Paste into your existing AGENTS.md (append at the end)",
1253
+ "Option B: Save as a new AGENTS.md in your project root",
1254
+ "Option C: Save as .parallel-agents/PROTOCOL.md for a standalone guide",
1255
+ ],
1256
+ charCount: content.length,
1257
+ sections: [
1258
+ "Parallel Agent Coordination Protocol",
1259
+ "Task Locking Protocol",
1260
+ "Role Specialization",
1261
+ "Oracle Testing Workflow",
1262
+ "Context Budget Rules",
1263
+ "Progress File Protocol",
1264
+ "Anti-Patterns",
1265
+ "Flywheel Verification",
1266
+ includeNodebench ? "NodeBench MCP Setup" : null,
1267
+ ].filter(Boolean),
1268
+ };
1269
+ },
1270
+ },
1271
+ ];
1272
+ //# sourceMappingURL=parallelAgentTools.js.map