nodebench-mcp 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  2. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  3. package/dist/dashboard/operatingServer.js +3 -2
  4. package/dist/dashboard/operatingServer.js.map +1 -1
  5. package/dist/db.js +51 -3
  6. package/dist/db.js.map +1 -1
  7. package/dist/index.js +13 -16
  8. package/dist/index.js.map +1 -1
  9. package/dist/packageInfo.d.ts +3 -0
  10. package/dist/packageInfo.js +32 -0
  11. package/dist/packageInfo.js.map +1 -0
  12. package/dist/sandboxApi.js +2 -1
  13. package/dist/sandboxApi.js.map +1 -1
  14. package/dist/tools/boilerplateTools.js +10 -9
  15. package/dist/tools/boilerplateTools.js.map +1 -1
  16. package/dist/tools/documentationTools.js +2 -1
  17. package/dist/tools/documentationTools.js.map +1 -1
  18. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  19. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  20. package/dist/tools/toolRegistry.js +11 -0
  21. package/dist/tools/toolRegistry.js.map +1 -1
  22. package/dist/toolsetRegistry.js +74 -1
  23. package/dist/toolsetRegistry.js.map +1 -1
  24. package/package.json +4 -3
  25. package/dist/__tests__/analytics.test.d.ts +0 -11
  26. package/dist/__tests__/analytics.test.js +0 -546
  27. package/dist/__tests__/analytics.test.js.map +0 -1
  28. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  29. package/dist/__tests__/architectComplex.test.js +0 -373
  30. package/dist/__tests__/architectComplex.test.js.map +0 -1
  31. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  32. package/dist/__tests__/architectSmoke.test.js +0 -92
  33. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  34. package/dist/__tests__/audit-registry.d.ts +0 -1
  35. package/dist/__tests__/audit-registry.js +0 -60
  36. package/dist/__tests__/audit-registry.js.map +0 -1
  37. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  38. package/dist/__tests__/batchAutopilot.test.js +0 -218
  39. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  40. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  41. package/dist/__tests__/cliSubcommands.test.js +0 -138
  42. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  43. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  44. package/dist/__tests__/comparativeBench.test.js +0 -722
  45. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  46. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  47. package/dist/__tests__/critterCalibrationEval.js +0 -370
  48. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  49. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  50. package/dist/__tests__/dynamicLoading.test.js +0 -280
  51. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  52. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  53. package/dist/__tests__/embeddingProvider.test.js +0 -86
  54. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  55. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  56. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  57. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  58. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  59. package/dist/__tests__/evalHarness.test.js +0 -1107
  60. package/dist/__tests__/evalHarness.test.js.map +0 -1
  61. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  62. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  63. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  64. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  65. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  66. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  67. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  69. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  70. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  72. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  73. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  74. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  75. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  76. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingScoring.test.js +0 -202
  78. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  79. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  80. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  81. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  83. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  84. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  86. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  87. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  90. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  91. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  92. package/dist/__tests__/helpers/answerMatch.js +0 -267
  93. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  94. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  95. package/dist/__tests__/helpers/textLlm.js +0 -214
  96. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  97. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  98. package/dist/__tests__/localDashboard.test.js +0 -226
  99. package/dist/__tests__/localDashboard.test.js.map +0 -1
  100. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  101. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  102. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  103. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  104. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  105. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  108. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  111. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  114. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  116. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  117. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  118. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  119. package/dist/__tests__/openclawDogfood.test.js +0 -535
  120. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  121. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  122. package/dist/__tests__/openclawMessaging.test.js +0 -232
  123. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  124. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  125. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  126. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  127. package/dist/__tests__/tools.test.d.ts +0 -1
  128. package/dist/__tests__/tools.test.js +0 -3201
  129. package/dist/__tests__/tools.test.js.map +0 -1
  130. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  131. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  132. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  133. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  134. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  135. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  136. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  137. package/dist/__tests__/webmcpTools.test.js +0 -195
  138. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  139. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  140. package/dist/benchmarks/testProviderBus.js +0 -272
  141. package/dist/benchmarks/testProviderBus.js.map +0 -1
  142. package/dist/hooks/postCompaction.d.ts +0 -14
  143. package/dist/hooks/postCompaction.js +0 -51
  144. package/dist/hooks/postCompaction.js.map +0 -1
  145. package/dist/security/__tests__/security.test.d.ts +0 -8
  146. package/dist/security/__tests__/security.test.js +0 -295
  147. package/dist/security/__tests__/security.test.js.map +0 -1
  148. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  149. package/dist/sync/hyperloopEval.test.js +0 -60
  150. package/dist/sync/hyperloopEval.test.js.map +0 -1
  151. package/dist/sync/store.test.d.ts +0 -4
  152. package/dist/sync/store.test.js +0 -43
  153. package/dist/sync/store.test.js.map +0 -1
  154. package/dist/tools/documentTools.d.ts +0 -5
  155. package/dist/tools/documentTools.js +0 -524
  156. package/dist/tools/documentTools.js.map +0 -1
  157. package/dist/tools/financialTools.d.ts +0 -10
  158. package/dist/tools/financialTools.js +0 -403
  159. package/dist/tools/financialTools.js.map +0 -1
  160. package/dist/tools/memoryTools.d.ts +0 -5
  161. package/dist/tools/memoryTools.js +0 -137
  162. package/dist/tools/memoryTools.js.map +0 -1
  163. package/dist/tools/planningTools.d.ts +0 -5
  164. package/dist/tools/planningTools.js +0 -147
  165. package/dist/tools/planningTools.js.map +0 -1
  166. package/dist/tools/searchTools.d.ts +0 -5
  167. package/dist/tools/searchTools.js +0 -145
  168. package/dist/tools/searchTools.js.map +0 -1
@@ -1,859 +0,0 @@
1
- /**
2
- * Preset Real-World Benchmark — Impact-Driven Evaluation
3
- *
4
- * Inspired by 8 open-source Claude Code ecosystem repos:
5
- * - obra/superpowers: Mandatory skill-check gate, 4-phase debugging
6
- * - wshobson/agents: Conductor (Context → Spec → Plan → Implement), agent-teams
7
- * - ruvnet/claude-flow: Queen-led swarm, 5-layer memory, 3-tier model routing
8
- * - Yeachan-Heo/oh-my-claudecode: Compaction-resilient notepad, learner skills
9
- * - thedotmack/claude-mem: Session observations, token economics, context config
10
- * - anthropic/planning-with-files: Manus-style markdown planning with checkpoints
11
- * - K-Dense-AI/claude-scientific-skills: 140 domain skills, category-based discovery
12
- * - zebbern/claude-code-guide: Best practices, workflow patterns, agent setup
13
- *
14
- * Fills gaps identified in existing eval suite:
15
- * GAP 1: Cross-domain workflows (domain silos → end-to-end)
16
- * GAP 2: Error recovery & failure paths
17
- * GAP 3: Preset transitions (meta → lite → core escalation)
18
- * GAP 4: Knowledge lifecycle (record → search → synthesize → reuse)
19
- * GAP 5: Research writing workflows (0% coverage → full pipeline)
20
- * GAP 6: Bootstrap cold-start (agent onboarding end-to-end)
21
- * GAP 7: Multi-agent coordination at scale
22
- * GAP 8: Progressive discovery search quality
23
- *
24
- * Architecture:
25
- * 8 real-world scenarios × 4 presets (meta, lite, core, full) = 32 trajectories
26
- * Each preset runs as a parallel "subagent" within each scenario
27
- * Measures: tool calls, phases, knowledge reuse, token overhead, gaps found
28
- *
29
- * Run: npx vitest run src/__tests__/presetRealWorldBench.test.ts
30
- */
31
- import { describe, it, expect, afterAll } from "vitest";
32
- import { verificationTools } from "../tools/verificationTools.js";
33
- import { reconTools } from "../tools/reconTools.js";
34
- import { evalTools } from "../tools/evalTools.js";
35
- import { qualityGateTools } from "../tools/qualityGateTools.js";
36
- import { flywheelTools } from "../tools/flywheelTools.js";
37
- import { learningTools } from "../tools/learningTools.js";
38
- import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
39
- import { selfEvalTools } from "../tools/selfEvalTools.js";
40
- import { parallelAgentTools } from "../tools/parallelAgentTools.js";
41
- import { uiCaptureTools } from "../tools/uiCaptureTools.js";
42
- import { visionTools } from "../tools/visionTools.js";
43
- import { webTools } from "../tools/webTools.js";
44
- import { githubTools } from "../tools/githubTools.js";
45
- import { documentationTools } from "../tools/documentationTools.js";
46
- import { localFileTools, gaiaMediaSolvers } from "../tools/localFileTools.js";
47
- import { llmTools } from "../tools/llmTools.js";
48
- import { securityTools } from "../tools/securityTools.js";
49
- import { platformTools } from "../tools/platformTools.js";
50
- import { researchWritingTools } from "../tools/researchWritingTools.js";
51
- import { flickerDetectionTools } from "../tools/flickerDetectionTools.js";
52
- import { figmaFlowTools } from "../tools/figmaFlowTools.js";
53
- import { boilerplateTools } from "../tools/boilerplateTools.js";
54
- import { cCompilerBenchmarkTools } from "../tools/cCompilerBenchmarkTools.js";
55
- import { sessionMemoryTools } from "../tools/sessionMemoryTools.js";
56
- import { toonTools } from "../tools/toonTools.js";
57
- import { patternTools } from "../tools/patternTools.js";
58
- import { gitWorkflowTools } from "../tools/gitWorkflowTools.js";
59
- import { seoTools } from "../tools/seoTools.js";
60
- import { voiceBridgeTools } from "../tools/voiceBridgeTools.js";
61
- import { critterTools } from "../tools/critterTools.js";
62
- import { emailTools } from "../tools/emailTools.js";
63
- import { rssTools } from "../tools/rssTools.js";
64
- import { architectTools } from "../tools/architectTools.js";
65
- import { createMetaTools } from "../tools/metaTools.js";
66
- import { createProgressiveDiscoveryTools } from "../tools/progressiveDiscoveryTools.js";
67
- // ═══════════════════════════════════════════════════════════════════════════
68
- // PRESET & TOOLSET DEFINITIONS (mirrors index.ts exactly)
69
- // ═══════════════════════════════════════════════════════════════════════════
70
/**
 * Lookup table from toolset key to the tool collection imported from the
 * corresponding `../tools/*` module.
 *
 * Key insertion order is significant: `Object.keys(TOOLSET_MAP)` is used to
 * derive the "full" preset, so entries must stay in this order.
 * NOTE(review): each value is presumably an array of tool definitions
 * (objects with `name` and `handler`) — confirm against the tool modules.
 */
const TOOLSET_MAP = {
    // Verification / evaluation / learning loop
    verification: verificationTools,
    eval: evalTools,
    quality_gate: qualityGateTools,
    learning: learningTools,
    flywheel: flywheelTools,
    recon: reconTools,

    // Capture, files and external sources
    ui_capture: uiCaptureTools,
    vision: visionTools,
    local_file: localFileTools,
    web: webTools,
    github: githubTools,
    docs: documentationTools,

    // Agent setup and coordination
    bootstrap: agentBootstrapTools,
    self_eval: selfEvalTools,
    parallel: parallelAgentTools,
    llm: llmTools,
    security: securityTools,
    platform: platformTools,

    // Specialised domains
    research_writing: researchWritingTools,
    flicker_detection: flickerDetectionTools,
    figma_flow: figmaFlowTools,
    boilerplate: boilerplateTools,
    benchmark: cCompilerBenchmarkTools,
    session_memory: sessionMemoryTools,
    gaia_solvers: gaiaMediaSolvers,
    toon: toonTools,
    pattern: patternTools,
    git_workflow: gitWorkflowTools,
    seo: seoTools,
    voice_bridge: voiceBridgeTools,
    critter: critterTools,
    email: emailTools,
    rss: rssTools,
    architect: architectTools,
};
106
/**
 * Preset name → ordered list of TOOLSET_MAP keys enabled for that preset.
 *
 * - `meta`: no domain toolsets.
 * - `lite`: a small subset of toolsets.
 * - `core`: a larger subset.
 * - `full`: every key present in TOOLSET_MAP, in its declaration order.
 *
 * The element order of each list determines the order in which tools are
 * concatenated by `buildToolset`, so it is kept exactly as declared.
 */
const PRESETS = {
    meta: [],
    lite: [
        "verification",
        "eval",
        "quality_gate",
        "learning",
        "flywheel",
        "recon",
        "security",
        "boilerplate",
    ],
    core: [
        "verification",
        "eval",
        "quality_gate",
        "learning",
        "flywheel",
        "recon",
        "bootstrap",
        "self_eval",
        "llm",
        "security",
        "platform",
        "research_writing",
        "flicker_detection",
        "figma_flow",
        "boilerplate",
        "benchmark",
        "session_memory",
        "toon",
        "pattern",
        "git_workflow",
        "seo",
        "voice_bridge",
        "critter",
        "email",
        "rss",
        "architect",
    ],
    full: Object.keys(TOOLSET_MAP),
};
118
/**
 * Assembles the complete tool list for a preset.
 *
 * The result is the concatenation, in this order, of:
 *   1. the domain tools of every toolset named by `PRESETS[preset]`
 *      (keys with no entry in TOOLSET_MAP contribute nothing),
 *   2. the meta tools created from those domain tools,
 *   3. the progressive-discovery tools created from (1) + (2).
 *
 * @param {string} preset - A key of PRESETS.
 * @returns {Array} Domain tools, then meta tools, then discovery tools.
 */
function buildToolset(preset) {
    const toolsetKeys = PRESETS[preset];
    // One entry per toolset key, flattened a single level into one list.
    const domainTools = toolsetKeys
        .map((toolsetKey) => TOOLSET_MAP[toolsetKey] ?? [])
        .flat();
    const metaTools = createMetaTools(domainTools);
    // Discovery tools are built over both the domain and the meta tools.
    const discoveryInput = [...domainTools, ...metaTools];
    const discoveryTools = createProgressiveDiscoveryTools(discoveryInput);
    return [...domainTools, ...metaTools, ...discoveryTools];
}
126
- // ═══════════════════════════════════════════════════════════════════════════
127
- // HELPER: Execute a scenario against a preset
128
- // ═══════════════════════════════════════════════════════════════════════════
129
/**
 * Runs every phase of a scenario against the toolset built for `preset` and
 * returns a summary of what happened.
 *
 * For each tool attempt in each phase:
 *   - if the preset's toolset has no tool with that name, the attempt is
 *     recorded as missing;
 *   - otherwise the tool's handler is awaited with the attempt's args; a
 *     thrown/rejected handler is recorded as failed, anything else as called.
 *
 * A phase counts as successful when no tool was missing and at least one
 * tool was either called or failed.
 *
 * @param {object} scenario - Scenario spec; reads `id`, `name`, `inspiredBy`,
 *   `gapFilled` and `phases` (each phase has `name` and `tools`, each tool
 *   attempt has `name` and `args`).
 * @param {string} preset - Preset name passed to `buildToolset`.
 * @returns {Promise<object>} Per-phase results plus aggregate counters.
 */
async function executeScenario(scenario, preset) {
    const tools = buildToolset(preset);
    const toolsByName = new Map();
    for (const candidate of tools) {
        toolsByName.set(candidate.name, candidate);
    }

    const startedAt = Date.now();
    const phaseResults = [];
    let callCount = 0;
    let missingCount = 0;
    let errorCount = 0;
    let sawRecord = false;
    let sawReuse = false;
    let sawDiscovery = false;

    for (const phaseSpec of scenario.phases) {
        const phaseStartedAt = Date.now();
        const calledNames = [];
        const missingNames = [];
        const failedNames = [];

        for (const attempt of phaseSpec.tools) {
            const toolName = attempt.name;
            const tool = toolsByName.get(toolName);
            if (!tool) {
                // Tool is not part of this preset's toolset.
                missingNames.push(toolName);
                missingCount += 1;
                continue;
            }

            try {
                await tool.handler(attempt.args);
            }
            catch {
                // Handler failures are tallied, not propagated.
                failedNames.push(toolName);
                errorCount += 1;
                continue;
            }

            calledNames.push(toolName);
            callCount += 1;
            switch (toolName) {
                case "record_learning":
                    sawRecord = true;
                    break;
                case "search_all_knowledge":
                    sawReuse = true;
                    break;
                case "discover_tools":
                case "get_workflow_chain":
                    sawDiscovery = true;
                    break;
                default:
                    break;
            }
        }

        const attemptedCount = calledNames.length + failedNames.length;
        phaseResults.push({
            phase: phaseSpec.name,
            toolsCalled: calledNames,
            toolsMissing: missingNames,
            toolsFailed: failedNames,
            success: missingNames.length === 0 && attemptedCount > 0,
            durationMs: Date.now() - phaseStartedAt,
        });
    }

    const completedCount = phaseResults.filter((result) => result.success).length;
    return {
        preset,
        scenarioId: scenario.id,
        scenarioName: scenario.name,
        inspiredBy: scenario.inspiredBy,
        gapFilled: scenario.gapFilled,
        toolCount: tools.length,
        // Flat estimate of 200 per tool, as in the original.
        estimatedSchemaTokens: tools.length * 200,
        phases: phaseResults,
        phasesCompleted: completedCount,
        phasesSkipped: phaseResults.length - completedCount,
        totalToolCalls: callCount,
        totalToolMissing: missingCount,
        totalToolErrors: errorCount,
        knowledgeRecorded: sawRecord,
        knowledgeReused: sawReuse,
        discoveryUsed: sawDiscovery,
        durationMs: Date.now() - startedAt,
    };
}
197
- // ═══════════════════════════════════════════════════════════════════════════
198
- // 8 REAL-WORLD SCENARIOS — Inspired by open-source ecosystem repos
199
- // ═══════════════════════════════════════════════════════════════════════════
200
- const SCENARIOS = [
201
- // ─── Scenario 1: Cold Start Self-Setup ───────────────────────────────
202
- // Inspired by: superpowers (mandatory skill-check), oh-my-claudecode (zero-learning-curve)
203
- // Gap filled: Bootstrap cold-start (GAP 6)
204
- {
205
- id: "cold-start-self-setup",
206
- name: "Cold Start: Agent Onboarding via Discovery",
207
- inspiredBy: "obra/superpowers + Yeachan-Heo/oh-my-claudecode",
208
- gapFilled: "GAP 6: Bootstrap cold-start",
209
- prompt: "You are a new agent. Discover available tools, find the right methodology, and set up your working environment.",
210
- category: "cold_start",
211
- phases: [
212
- {
213
- name: "discovery",
214
- tools: [
215
- { name: "discover_tools", args: { query: "getting started setup bootstrap" }, domain: "progressive_discovery" },
216
- { name: "get_workflow_chain", args: { workflow: "self_setup" }, domain: "progressive_discovery" },
217
- { name: "findTools", args: { query: "verify" }, domain: "meta" },
218
- ],
219
- },
220
- {
221
- name: "methodology",
222
- tools: [
223
- { name: "getMethodology", args: { topic: "verification" }, domain: "meta" },
224
- { name: "getMethodology", args: { topic: "agent_contract" }, domain: "meta" },
225
- ],
226
- },
227
- {
228
- name: "bootstrap",
229
- optionalForMeta: true,
230
- optionalForLite: true,
231
- tools: [
232
- { name: "discover_infrastructure", args: { targetDir: "." }, domain: "bootstrap" },
233
- { name: "triple_verify", args: { component: "database", claims: ["SQLite exists"] }, domain: "bootstrap" },
234
- { name: "generate_self_instructions", args: { targetDir: ".", existingCapabilities: ["verification", "eval"] }, domain: "bootstrap" },
235
- ],
236
- },
237
- {
238
- name: "knowledge_seed",
239
- optionalForMeta: true,
240
- tools: [
241
- { name: "search_all_knowledge", args: { query: "setup patterns" }, domain: "learning" },
242
- { name: "record_learning", args: { key: "bench-cold-start", content: "Agent bootstrap completed via discovery-first pattern", category: "pattern" }, domain: "learning" },
243
- ],
244
- },
245
- ],
246
- },
247
- // ─── Scenario 2: 4-Phase Bug Fix Pipeline ───────────────────────────
248
- // Inspired by: superpowers (4-phase root cause analysis)
249
- // Gap filled: Cross-domain workflows (GAP 1)
250
- {
251
- id: "four-phase-bug-fix",
252
- name: "4-Phase Bug Fix: Root Cause → Verify → Eval → Learn",
253
- inspiredBy: "obra/superpowers (systematic debugging)",
254
- gapFilled: "GAP 1: Cross-domain workflows",
255
- prompt: "Fix a production bug: the daily cron job silently fails when the API returns 429 rate-limit responses.",
256
- category: "bug_fix",
257
- phases: [
258
- {
259
- name: "investigate",
260
- tools: [
261
- { name: "search_all_knowledge", args: { query: "rate limit 429 cron failure" }, domain: "learning" },
262
- { name: "run_recon", args: { target: "cron-rate-limit-bug", scope: "code", maxFindings: 5 }, domain: "recon" },
263
- { name: "log_recon_finding", args: { sessionId: "bench-recon", category: "bug", summary: "429 not retried: Cron ignores HTTP 429 responses from upstream API" }, domain: "recon" },
264
- ],
265
- },
266
- {
267
- name: "verify",
268
- tools: [
269
- { name: "start_verification_cycle", args: { title: "Fix cron 429 handling" }, domain: "verification" },
270
- { name: "log_phase_findings", args: { cycleId: "bench-cycle", phase: 1, summary: "Root cause: missing retry logic for 429", passed: true }, domain: "verification" },
271
- { name: "log_gap", args: { cycleId: "bench-cycle", description: "No exponential backoff on 429", severity: "critical", phase: 2 }, domain: "verification" },
272
- { name: "resolve_gap", args: { gapId: "bench-gap", resolution: "Added exponential backoff with jitter" }, domain: "verification" },
273
- { name: "log_test_result", args: { cycleId: "bench-cycle", label: "unit-retry-429", layer: "unit", passed: true }, domain: "verification" },
274
- ],
275
- },
276
- {
277
- name: "eval",
278
- tools: [
279
- { name: "start_eval_run", args: { name: "cron-429-fix-eval" }, domain: "eval" },
280
- { name: "record_eval_result", args: { runId: "bench-eval", case: "retry-backoff", passed: true, notes: "429 now triggers 3 retries with exponential backoff" }, domain: "eval" },
281
- { name: "complete_eval_run", args: { runId: "bench-eval" }, domain: "eval" },
282
- ],
283
- },
284
- {
285
- name: "quality_gate",
286
- tools: [
287
- { name: "run_quality_gate", args: { targetId: "cron-fix", rules: [{ name: "test-coverage", threshold: 80 }] }, domain: "quality_gate" },
288
- { name: "run_closed_loop", args: { targetId: "cron-fix", command: "npm test", expectedPattern: "PASS" }, domain: "quality_gate" },
289
- ],
290
- },
291
- {
292
- name: "learn",
293
- tools: [
294
- { name: "record_learning", args: { key: "bench-429-retry", content: "Always add exponential backoff for HTTP 429 in cron jobs", category: "gotcha", tags: ["http", "retry", "cron"] }, domain: "learning" },
295
- ],
296
- },
297
- ],
298
- },
299
- // ─── Scenario 3: Feature Dev (Context → Plan → Implement → Ship) ──────
300
- // Inspired by: wshobson/agents Conductor pattern
301
- // Gap filled: Cross-domain end-to-end (GAP 1)
302
- {
303
- id: "conductor-feature-dev",
304
- name: "Conductor-Style Feature: Context → Spec → Implement → Ship",
305
- inspiredBy: "wshobson/agents (Conductor plugin)",
306
- gapFilled: "GAP 1: Cross-domain workflows",
307
- prompt: "Implement a new dark mode toggle feature following the Conductor workflow: gather context, spec, plan, implement, verify, ship.",
308
- category: "feature_dev",
309
- phases: [
310
- {
311
- name: "context_gathering",
312
- tools: [
313
- { name: "search_all_knowledge", args: { query: "dark mode theme UI toggle" }, domain: "learning" },
314
- { name: "run_recon", args: { target: "dark-mode-feature", scope: "architecture", maxFindings: 5 }, domain: "recon" },
315
- { name: "get_recon_summary", args: { sessionId: "bench-recon" }, domain: "recon" },
316
- ],
317
- },
318
- {
319
- name: "specification",
320
- tools: [
321
- { name: "start_verification_cycle", args: { title: "Dark Mode Toggle Implementation" }, domain: "verification" },
322
- { name: "log_phase_findings", args: { cycleId: "bench-cycle", phase: 1, summary: "Architecture review: component tree supports theme prop injection", passed: true }, domain: "verification" },
323
- ],
324
- },
325
- {
326
- name: "implement_and_test",
327
- tools: [
328
- { name: "log_phase_findings", args: { cycleId: "bench-cycle", phase: 2, summary: "Implementation: ThemeProvider + useTheme hook + toggle component", passed: true }, domain: "verification" },
329
- { name: "log_test_result", args: { cycleId: "bench-cycle", label: "dark-mode-unit", layer: "unit", passed: true }, domain: "verification" },
330
- { name: "log_test_result", args: { cycleId: "bench-cycle", label: "dark-mode-integration", layer: "integration", passed: true }, domain: "verification" },
331
- { name: "run_closed_loop", args: { targetId: "dark-mode", command: "npm test -- --grep theme", expectedPattern: "PASS" }, domain: "quality_gate" },
332
- ],
333
- },
334
- {
335
- name: "flywheel",
336
- optionalForLite: true,
337
- optionalForMeta: true,
338
- tools: [
339
- { name: "get_flywheel_status", args: {}, domain: "flywheel" },
340
- { name: "trigger_investigation", args: { evalRunId: "bench-eval", regressionDescription: "dark-mode-accessibility regression detected" }, domain: "flywheel" },
341
- ],
342
- },
343
- {
344
- name: "ship",
345
- tools: [
346
- { name: "run_quality_gate", args: { targetId: "dark-mode", rules: [{ name: "all-tests-pass", threshold: 100 }] }, domain: "quality_gate" },
347
- { name: "record_learning", args: { key: "bench-dark-mode-pattern", content: "ThemeProvider + useTheme hook pattern works well for dark mode", category: "pattern", tags: ["ui", "theme", "dark-mode"] }, domain: "learning" },
348
- ],
349
- },
350
- ],
351
- },
352
- // ─── Scenario 4: Multi-Agent Coordination ────────────────────────────
353
- // Inspired by: ruvnet/claude-flow (queen-led swarm), wshobson/agents (agent-teams)
354
- // Gap filled: Parallel agents at scale (GAP 7)
355
- {
356
- id: "multi-agent-swarm",
357
- name: "Multi-Agent Swarm: Coordinator + 3 Parallel Workers",
358
- inspiredBy: "ruvnet/claude-flow + wshobson/agents (agent-teams)",
359
- gapFilled: "GAP 7: Multi-agent coordination at scale",
360
- prompt: "Coordinate 3 parallel agents: backend-api, frontend-ui, and testing-agent working on a full-stack feature.",
361
- category: "multi_agent",
362
- phases: [
363
- {
364
- name: "coordinator_setup",
365
- optionalForMeta: true,
366
- optionalForLite: true,
367
- tools: [
368
- { name: "bootstrap_parallel_agents", args: {}, domain: "parallel" },
369
- { name: "assign_agent_role", args: { role: "backend" }, domain: "parallel" },
370
- { name: "assign_agent_role", args: { role: "frontend" }, domain: "parallel" },
371
- { name: "assign_agent_role", args: { role: "testing" }, domain: "parallel" },
372
- ],
373
- },
374
- {
375
- name: "task_assignment",
376
- optionalForMeta: true,
377
- optionalForLite: true,
378
- tools: [
379
- { name: "claim_agent_task", args: { taskKey: "backend-api-endpoints" }, domain: "parallel" },
380
- { name: "claim_agent_task", args: { taskKey: "frontend-dark-mode" }, domain: "parallel" },
381
- { name: "claim_agent_task", args: { taskKey: "e2e-tests" }, domain: "parallel" },
382
- ],
383
- },
384
- {
385
- name: "context_budget",
386
- optionalForMeta: true,
387
- optionalForLite: true,
388
- tools: [
389
- { name: "log_context_budget", args: { eventType: "checkpoint", tokensUsed: 15000 }, domain: "parallel" },
390
- { name: "log_context_budget", args: { eventType: "checkpoint", tokensUsed: 12000 }, domain: "parallel" },
391
- { name: "log_context_budget", args: { eventType: "checkpoint", tokensUsed: 8000 }, domain: "parallel" },
392
- ],
393
- },
394
- {
395
- name: "oracle_comparison",
396
- optionalForMeta: true,
397
- optionalForLite: true,
398
- tools: [
399
- { name: "run_oracle_comparison", args: { testLabel: "fullstack-integration", actualOutput: "API endpoints created + UI renders", expectedOutput: "API endpoints created + UI renders", oracleSource: "manual_review" }, domain: "parallel" },
400
- ],
401
- },
402
- {
403
- name: "knowledge_banking",
404
- optionalForMeta: true,
405
- tools: [
406
- { name: "record_learning", args: { key: "bench-parallel-fullstack", content: "3-agent fullstack pattern: backend+frontend+testing agents with coordinator reduces merge conflicts", category: "pattern", tags: ["parallel", "fullstack", "coordination"] }, domain: "learning" },
407
- ],
408
- },
409
- ],
410
- },
411
- // ─── Scenario 5: Research & Academic Writing Pipeline ─────────────────
412
- // Inspired by: K-Dense-AI/claude-scientific-skills, planning-with-files
413
- // Gap filled: Research writing 0% coverage (GAP 5)
414
- {
415
- id: "research-writing-pipeline",
416
- name: "Research Paper: Outline → Draft → Polish → Review",
417
- inspiredBy: "K-Dense-AI/claude-scientific-skills + planning-with-files",
418
- gapFilled: "GAP 5: Research writing workflows",
419
- prompt: "Write a research paper on 'Multi-Agent Coordination in AI-Assisted Development' with proper citations and peer review simulation.",
420
- category: "research",
421
- phases: [
422
- {
423
- name: "literature_review",
424
- tools: [
425
- { name: "search_all_knowledge", args: { query: "multi-agent coordination research" }, domain: "learning" },
426
- { name: "run_recon", args: { target: "multi-agent-research", scope: "literature", maxFindings: 10 }, domain: "recon" },
427
- ],
428
- },
429
- {
430
- name: "outline_and_draft",
431
- optionalForMeta: true,
432
- optionalForLite: true,
433
- tools: [
434
- { name: "check_paper_logic", args: { text: "Multi-agent coordination enables parallel task execution. Our approach uses a coordinator pattern to assign roles and manage context budgets across agents." }, domain: "research_writing" },
435
- { name: "generate_academic_caption", args: { description: "System architecture showing coordinator agent distributing tasks to 3 worker agents", figureType: "diagram" }, domain: "research_writing" },
436
- ],
437
- },
438
- {
439
- name: "polish_and_review",
440
- optionalForMeta: true,
441
- optionalForLite: true,
442
- tools: [
443
- { name: "polish_academic_text", args: { text: "Multi-agent systems enable parallel task execution. This improves throughput and reduces context window pressure." }, domain: "research_writing" },
444
- { name: "review_paper_as_reviewer", args: { text: "We propose a coordinator pattern for multi-agent AI development. Our approach distributes tasks to specialized agents.", venue: "ICSE" }, domain: "research_writing" },
445
- ],
446
- },
447
- {
448
- name: "record_findings",
449
- optionalForMeta: true,
450
- tools: [
451
- { name: "record_learning", args: { key: "bench-research-pattern", content: "4-phase research pipeline: literature review → outline → draft → polish works well for academic papers", category: "pattern", tags: ["research", "writing", "academic"] }, domain: "learning" },
452
- ],
453
- },
454
- ],
455
- },
456
- // ─── Scenario 6: Cross-Domain Investigation ──────────────────────────
457
- // Inspired by: claude-mem (multi-source observations), oh-my-claudecode (5 modes)
458
- // Gap filled: Cross-domain silos (GAP 1)
459
- {
460
- id: "cross-domain-investigation",
461
- name: "Cross-Domain: Recon → Local Files → Vision → Quality Gate",
462
- inspiredBy: "thedotmack/claude-mem + Yeachan-Heo/oh-my-claudecode",
463
- gapFilled: "GAP 1: Cross-domain workflows (break silos)",
464
- prompt: "Investigate a UI rendering issue: parse local config files, analyze screenshot, search codebase, and verify the fix.",
465
- category: "cross_domain",
466
- phases: [
467
- {
468
- name: "recon",
469
- tools: [
470
- { name: "run_recon", args: { target: "ui-rendering-bug", scope: "code", maxFindings: 5 }, domain: "recon" },
471
- { name: "search_all_knowledge", args: { query: "UI rendering CSS layout issue" }, domain: "learning" },
472
- ],
473
- },
474
- {
475
- name: "local_file_analysis",
476
- tools: [
477
- { name: "read_json_file", args: { filePath: "test_config.json" }, domain: "local_file" },
478
- { name: "extract_structured_data", args: { text: "Error: flex container overflow at line 42 in MainLayout.tsx. Component tree depth: 8. Render time: 340ms.", fields: ["error_type", "file", "line", "render_time"] }, domain: "local_file" },
479
- ],
480
- },
481
- {
482
- name: "verification",
483
- tools: [
484
- { name: "start_verification_cycle", args: { title: "UI rendering fix" }, domain: "verification" },
485
- { name: "log_gap", args: { cycleId: "bench-cycle", description: "CSS flex overflow not handled", severity: "high", phase: 2 }, domain: "verification" },
486
- { name: "resolve_gap", args: { gapId: "bench-gap", resolution: "Added overflow-x: hidden to container" }, domain: "verification" },
487
- ],
488
- },
489
- {
490
- name: "quality_gate",
491
- tools: [
492
- { name: "run_quality_gate", args: { targetId: "ui-fix", rules: [{ name: "visual-regression", threshold: 95 }] }, domain: "quality_gate" },
493
- ],
494
- },
495
- {
496
- name: "learn",
497
- tools: [
498
- { name: "record_learning", args: { key: "bench-flex-overflow", content: "Flex container overflow: always set overflow-x on deeply nested component trees", category: "gotcha", tags: ["css", "flex", "overflow", "ui"] }, domain: "learning" },
499
- ],
500
- },
501
- ],
502
- },
503
- // ─── Scenario 7: Error Recovery & Resilience ──────────────────────────
504
- // Inspired by: claude-flow (Byzantine fault tolerance), oh-my-claudecode (compaction-resilient)
505
- // Gap filled: Error recovery (GAP 2)
506
- {
507
- id: "error-recovery-resilience",
508
- name: "Error Recovery: Failure at Each Phase → Graceful Degradation",
509
- inspiredBy: "ruvnet/claude-flow (fault tolerance) + oh-my-claudecode (resilience)",
510
- gapFilled: "GAP 2: Error recovery & failure paths",
511
- prompt: "Handle a scenario where tools fail mid-workflow: recon times out, eval has stale data, and gate rules are violated.",
512
- category: "error_recovery",
513
- phases: [
514
- {
515
- name: "safe_recon",
516
- tools: [
517
- { name: "run_recon", args: { target: "resilience-test", scope: "code", maxFindings: 3 }, domain: "recon" },
518
- ],
519
- },
520
- {
521
- name: "verification_with_errors",
522
- tools: [
523
- { name: "start_verification_cycle", args: { title: "Resilience test cycle" }, domain: "verification" },
524
- { name: "log_phase_findings", args: { cycleId: "bench-cycle", phase: 1, summary: "Phase 1 passed under degraded conditions", passed: true }, domain: "verification" },
525
- // Intentionally log a gap that stays open (simulates partial recovery)
526
- { name: "log_gap", args: { cycleId: "bench-cycle", description: "Stale cache detected but non-critical", severity: "medium", phase: 2 }, domain: "verification" },
527
- ],
528
- },
529
- {
530
- name: "eval_despite_gaps",
531
- tools: [
532
- { name: "start_eval_run", args: { name: "resilience-eval" }, domain: "eval" },
533
- { name: "record_eval_result", args: { runId: "bench-eval", case: "graceful-degradation", passed: true, notes: "System operates correctly despite stale cache" }, domain: "eval" },
534
- { name: "complete_eval_run", args: { runId: "bench-eval" }, domain: "eval" },
535
- ],
536
- },
537
- {
538
- name: "gate_with_violations",
539
- tools: [
540
- // Gate with a very high threshold that will "fail" (simulates gate violation)
541
- { name: "run_quality_gate", args: { targetId: "resilience-check", rules: [{ name: "zero-open-gaps", threshold: 100 }] }, domain: "quality_gate" },
542
- ],
543
- },
544
- {
545
- name: "learn_from_failure",
546
- tools: [
547
- { name: "record_learning", args: { key: "bench-resilience-pattern", content: "Graceful degradation: continue eval even with open medium-severity gaps. Only block on critical.", category: "pattern", tags: ["resilience", "error-recovery", "degradation"] }, domain: "learning" },
548
- ],
549
- },
550
- ],
551
- },
552
- // ─── Scenario 8: Knowledge Lifecycle ──────────────────────────────────
553
- // Inspired by: thedotmack/claude-mem (session compression + token economics)
554
- // Gap filled: Knowledge lifecycle (GAP 4)
555
- {
556
- id: "knowledge-lifecycle",
557
- name: "Knowledge Lifecycle: Record → Search → Synthesize → Reuse",
558
- inspiredBy: "thedotmack/claude-mem (context compression + observations)",
559
- gapFilled: "GAP 4: Knowledge lifecycle (record → reuse)",
560
- prompt: "Exercise the full knowledge lifecycle: record learnings from past work, search for relevant knowledge, synthesize findings, and verify reuse improves outcomes.",
561
- category: "knowledge_lifecycle",
562
- phases: [
563
- {
564
- name: "seed_knowledge",
565
- tools: [
566
- { name: "record_learning", args: { key: "bench-kl-pattern-1", content: "Always check for null pointers before accessing nested properties", category: "gotcha", tags: ["null", "safety", "typescript"] }, domain: "learning" },
567
- { name: "record_learning", args: { key: "bench-kl-pattern-2", content: "Use zod schemas for API input validation at system boundaries", category: "pattern", tags: ["validation", "zod", "api"] }, domain: "learning" },
568
- { name: "record_learning", args: { key: "bench-kl-edge-1", content: "SQLite FTS5 requires rebuilding index after schema changes", category: "edge_case", tags: ["sqlite", "fts5", "migration"] }, domain: "learning" },
569
- ],
570
- },
571
- {
572
- name: "search_and_retrieve",
573
- tools: [
574
- { name: "search_all_knowledge", args: { query: "typescript null safety" }, domain: "learning" },
575
- { name: "search_all_knowledge", args: { query: "API validation" }, domain: "learning" },
576
- { name: "search_all_knowledge", args: { query: "database migration" }, domain: "learning" },
577
- ],
578
- },
579
- {
580
- name: "apply_knowledge",
581
- tools: [
582
- { name: "start_verification_cycle", args: { title: "Apply prior learnings to new task" }, domain: "verification" },
583
- { name: "log_phase_findings", args: { cycleId: "bench-cycle", phase: 1, summary: "Prior knowledge applied: null checks + zod validation added", passed: true }, domain: "verification" },
584
- ],
585
- },
586
- {
587
- name: "synthesize",
588
- optionalForMeta: true,
589
- optionalForLite: true,
590
- tools: [
591
- { name: "synthesize_recon_to_learnings", args: {}, domain: "self_eval" },
592
- { name: "get_improvement_recommendations", args: {}, domain: "self_eval" },
593
- ],
594
- },
595
- {
596
- name: "verify_reuse",
597
- tools: [
598
- { name: "record_learning", args: { key: "bench-kl-meta-learning", content: "Knowledge reuse reduced verification time by ~30%: prior learnings prevented 3 known gotchas", category: "pattern", tags: ["knowledge", "reuse", "efficiency"] }, domain: "learning" },
599
- ],
600
- },
601
- ],
602
- },
603
- ];
604
- // ═══════════════════════════════════════════════════════════════════════════
605
- // PARALLEL EXECUTION — All 4 presets run concurrently per scenario
606
- // ═══════════════════════════════════════════════════════════════════════════
607
- const allTrajectories = [];
608
- const PRESET_NAMES = ["meta", "lite", "core", "full"];
609
- describe("Preset Real-World Benchmark", () => {
610
- // ─── Per-scenario tests: 4 presets run in parallel ─────────────────
611
- for (const scenario of SCENARIOS) {
612
- describe(`Scenario: ${scenario.name}`, () => {
613
- const scenarioTrajectories = [];
614
- it(`runs all 4 presets in parallel for ${scenario.id}`, { timeout: 15000 }, async () => {
615
- // Execute all 4 presets concurrently (simulates parallel subagents)
616
- const results = await Promise.all(PRESET_NAMES.map((preset) => executeScenario(scenario, preset)));
617
- scenarioTrajectories.push(...results);
618
- allTrajectories.push(...results);
619
- // Basic sanity: every preset produced a trajectory
620
- expect(results.length).toBe(4);
621
- for (const r of results) {
622
- expect(r.scenarioId).toBe(scenario.id);
623
- expect(r.phases.length).toBe(scenario.phases.length);
624
- }
625
- });
626
- it(`full preset has no missing tools for ${scenario.id}`, () => {
627
- const full = scenarioTrajectories.find((t) => t.preset === "full");
628
- if (!full)
629
- return; // depends on previous test
630
- for (const phase of full.phases) {
631
- expect(phase.toolsMissing).toEqual([]);
632
- }
633
- // All phases complete (tools found, even if some errored on stale IDs)
634
- expect(full.phasesCompleted).toBe(scenario.phases.length);
635
- });
636
- it(`meta preset discovers tools but hits domain limits for ${scenario.id}`, () => {
637
- const meta = scenarioTrajectories.find((t) => t.preset === "meta");
638
- if (!meta)
639
- return;
640
- // Meta should always have discovery tools
641
- expect(meta.toolCount).toBe(6); // 3 meta + 3 discovery
642
- // Meta should succeed at discovery/methodology phases
643
- const discoveryPhase = meta.phases.find((p) => p.phase === "discovery" || p.phase === "methodology");
644
- if (discoveryPhase) {
645
- expect(discoveryPhase.toolsCalled.length).toBeGreaterThan(0);
646
- }
647
- });
648
- it(`lite has fewer tools but covers core verification for ${scenario.id}`, () => {
649
- const lite = scenarioTrajectories.find((t) => t.preset === "lite");
650
- const full = scenarioTrajectories.find((t) => t.preset === "full");
651
- if (!lite || !full)
652
- return;
653
- expect(lite.toolCount).toBeLessThan(full.toolCount);
654
- // Lite should always have verification, eval, learning, recon
655
- const verifyPhase = lite.phases.find((p) => p.phase === "verify" || p.phase === "verification");
656
- if (verifyPhase) {
657
- expect(verifyPhase.toolsMissing.length).toBe(0);
658
- }
659
- });
660
- it(`core covers most phases, loses only full-exclusive domains for ${scenario.id}`, () => {
661
- const core = scenarioTrajectories.find((t) => t.preset === "core");
662
- if (!core)
663
- return;
664
- // Core should complete most phases (may miss ui_capture, vision, web, github, parallel, docs)
665
- const coreCompleted = core.phasesCompleted;
666
- expect(coreCompleted).toBeGreaterThanOrEqual(scenario.phases.filter((p) => !p.optionalForLite).length - 1);
667
- });
668
- });
669
- }
670
- // ═══════════════════════════════════════════════════════════════════════
671
- // CROSS-SCENARIO ANALYSIS
672
- // ═══════════════════════════════════════════════════════════════════════
673
- describe("Cross-Scenario Analysis", () => {
674
- it("generated 32 trajectories (8 scenarios × 4 presets)", () => {
675
- expect(allTrajectories.length).toBe(32);
676
- });
677
- it("full preset has most successful tool executions (calls + errors) across all scenarios", () => {
678
- const byPreset = (p) => allTrajectories.filter((t) => t.preset === p).reduce((sum, t) => sum + t.totalToolCalls + t.totalToolErrors, 0);
679
- expect(byPreset("full")).toBeGreaterThanOrEqual(byPreset("core"));
680
- expect(byPreset("core")).toBeGreaterThanOrEqual(byPreset("lite"));
681
- expect(byPreset("lite")).toBeGreaterThan(byPreset("meta"));
682
- });
683
- it("presets are strictly ordered by tool count: meta < lite < core < full", () => {
684
- const counts = PRESET_NAMES.map((p) => {
685
- const t = allTrajectories.find((tr) => tr.preset === p);
686
- return t?.toolCount ?? 0;
687
- });
688
- expect(counts[0]).toBeLessThan(counts[1]); // meta < lite
689
- expect(counts[1]).toBeLessThan(counts[2]); // lite < core
690
- expect(counts[2]).toBeLessThan(counts[3]); // core < full
691
- });
692
- it("meta preset token overhead is <5% of full preset", () => {
693
- const metaTokens = allTrajectories.find((t) => t.preset === "meta")?.estimatedSchemaTokens ?? 0;
694
- const fullTokens = allTrajectories.find((t) => t.preset === "full")?.estimatedSchemaTokens ?? 0;
695
- expect(metaTokens / fullTokens).toBeLessThan(0.05);
696
- });
697
- it("knowledge is recorded in at least 6/8 scenarios for full preset", () => {
698
- const fullTrajectories = allTrajectories.filter((t) => t.preset === "full");
699
- const withKnowledge = fullTrajectories.filter((t) => t.knowledgeRecorded);
700
- expect(withKnowledge.length).toBeGreaterThanOrEqual(6);
701
- });
702
- it("knowledge is reused (searched) in at least 5/8 scenarios for full preset", () => {
703
- const fullTrajectories = allTrajectories.filter((t) => t.preset === "full");
704
- const withReuse = fullTrajectories.filter((t) => t.knowledgeReused);
705
- expect(withReuse.length).toBeGreaterThanOrEqual(5);
706
- });
707
- it("discovery tools are used in cold-start scenario for all presets", () => {
708
- const coldStartTrajectories = allTrajectories.filter((t) => t.scenarioId === "cold-start-self-setup");
709
- for (const t of coldStartTrajectories) {
710
- expect(t.discoveryUsed).toBe(true);
711
- }
712
- });
713
- it("lite catches verification gaps in bug-fix and feature-dev scenarios", () => {
714
- const liteTrajectories = allTrajectories.filter((t) => t.preset === "lite" && (t.scenarioId === "four-phase-bug-fix" || t.scenarioId === "conductor-feature-dev"));
715
- for (const t of liteTrajectories) {
716
- const verifyPhase = t.phases.find((p) => p.phase === "verify" || p.phase === "specification" || p.phase === "implement_and_test");
717
- if (verifyPhase) {
718
- expect(verifyPhase.toolsMissing.length).toBe(0);
719
- }
720
- }
721
- });
722
- it("multi-agent scenario requires full or core preset (lite/meta skip parallel)", () => {
723
- const metaSwarm = allTrajectories.find((t) => t.preset === "meta" && t.scenarioId === "multi-agent-swarm");
724
- const liteSwarm = allTrajectories.find((t) => t.preset === "lite" && t.scenarioId === "multi-agent-swarm");
725
- const fullSwarm = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === "multi-agent-swarm");
726
- expect(metaSwarm.totalToolMissing).toBeGreaterThan(0);
727
- expect(liteSwarm.totalToolMissing).toBeGreaterThan(0);
728
- // Full should have all parallel tools
729
- const coordPhase = fullSwarm.phases.find((p) => p.phase === "coordinator_setup");
730
- expect(coordPhase?.toolsMissing.length).toBe(0);
731
- });
732
- it("research-writing scenario needs core+ (lite/meta missing research_writing tools)", () => {
733
- const liteResearch = allTrajectories.find((t) => t.preset === "lite" && t.scenarioId === "research-writing-pipeline");
734
- const coreResearch = allTrajectories.find((t) => t.preset === "core" && t.scenarioId === "research-writing-pipeline");
735
- expect(liteResearch.totalToolMissing).toBeGreaterThan(0);
736
- // Core should have research_writing
737
- const draftPhase = coreResearch.phases.find((p) => p.phase === "outline_and_draft");
738
- expect(draftPhase?.toolsMissing.length).toBe(0);
739
- });
740
- it("error-recovery scenario completes for all presets with verification", () => {
741
- for (const preset of ["lite", "core", "full"]) {
742
- const t = allTrajectories.find((tr) => tr.preset === preset && tr.scenarioId === "error-recovery-resilience");
743
- expect(t.phasesCompleted).toBeGreaterThanOrEqual(3);
744
- }
745
- });
746
- });
747
- // ═══════════════════════════════════════════════════════════════════════
748
- // GAP COVERAGE REPORT
749
- // ═══════════════════════════════════════════════════════════════════════
750
- describe("Gap Coverage Verification", () => {
751
- it("GAP 1 (cross-domain) is exercised by 3 scenarios", () => {
752
- const crossDomain = allTrajectories.filter((t) => t.preset === "full" &&
753
- ["four-phase-bug-fix", "conductor-feature-dev", "cross-domain-investigation"].includes(t.scenarioId));
754
- expect(crossDomain.length).toBe(3);
755
- for (const t of crossDomain) {
756
- // Each should call tools from 3+ domains
757
- const domainsUsed = new Set(t.phases.flatMap((p) => p.toolsCalled));
758
- expect(domainsUsed.size).toBeGreaterThanOrEqual(3);
759
- }
760
- });
761
- it("GAP 2 (error recovery) is exercised", () => {
762
- const recovery = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === "error-recovery-resilience");
763
- expect(recovery).toBeDefined();
764
- expect(recovery.totalToolCalls).toBeGreaterThan(0);
765
- });
766
- it("GAP 4 (knowledge lifecycle) exercises record→search→synthesize", () => {
767
- const kl = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === "knowledge-lifecycle");
768
- expect(kl).toBeDefined();
769
- expect(kl.knowledgeRecorded).toBe(true);
770
- expect(kl.knowledgeReused).toBe(true);
771
- // Synthesize phase should complete for full
772
- const synthPhase = kl.phases.find((p) => p.phase === "synthesize");
773
- expect(synthPhase?.toolsMissing.length).toBe(0);
774
- });
775
- it("GAP 5 (research writing) exercises outline→draft→polish→review", () => {
776
- const rw = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === "research-writing-pipeline");
777
- expect(rw).toBeDefined();
778
- const phases = rw.phases.map((p) => p.phase);
779
- expect(phases).toContain("outline_and_draft");
780
- expect(phases).toContain("polish_and_review");
781
- });
782
- it("GAP 6 (bootstrap cold-start) exercises discovery→bootstrap→seed", () => {
783
- const cs = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === "cold-start-self-setup");
784
- expect(cs).toBeDefined();
785
- expect(cs.discoveryUsed).toBe(true);
786
- const bootstrapPhase = cs.phases.find((p) => p.phase === "bootstrap");
787
- expect(bootstrapPhase?.toolsMissing.length).toBe(0);
788
- });
789
- it("GAP 7 (multi-agent) exercises coordinator→assign→budget→oracle", () => {
790
- const ma = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === "multi-agent-swarm");
791
- expect(ma).toBeDefined();
792
- // Full preset has all parallel tools, so no missing tools in any phase
793
- expect(ma.totalToolMissing).toBe(0);
794
- expect(ma.phasesCompleted).toBe(5);
795
- });
796
- });
797
- // ═══════════════════════════════════════════════════════════════════════
798
- // FINAL REPORT (printed to console after all tests)
799
- // ═══════════════════════════════════════════════════════════════════════
800
- afterAll(() => {
801
- if (allTrajectories.length === 0)
802
- return;
803
- console.log("\n╔══════════════════════════════════════════════════════════════════════════╗");
804
- console.log("║ PRESET REAL-WORLD BENCHMARK — IMPACT REPORT ║");
805
- console.log("║ 8 scenarios × 4 presets = 32 trajectories ║");
806
- console.log("║ Inspired by: superpowers, agents, claude-flow, oh-my-claudecode, ║");
807
- console.log("║ claude-mem, planning-with-files, scientific-skills, ║");
808
- console.log("║ claude-code-guide ║");
809
- console.log("╠══════════════════════════════════════════════════════════════════════════╣");
810
- // Per-preset summary
811
- for (const preset of PRESET_NAMES) {
812
- const trajectories = allTrajectories.filter((t) => t.preset === preset);
813
- const totalCalls = trajectories.reduce((s, t) => s + t.totalToolCalls, 0);
814
- const totalMissing = trajectories.reduce((s, t) => s + t.totalToolMissing, 0);
815
- const totalErrors = trajectories.reduce((s, t) => s + t.totalToolErrors, 0);
816
- const completedPhases = trajectories.reduce((s, t) => s + t.phasesCompleted, 0);
817
- const totalPhases = trajectories.reduce((s, t) => s + t.phases.length, 0);
818
- const toolCount = trajectories[0]?.toolCount ?? 0;
819
- const tokens = trajectories[0]?.estimatedSchemaTokens ?? 0;
820
- const knowledgeCount = trajectories.filter((t) => t.knowledgeRecorded).length;
821
- const duration = trajectories.reduce((s, t) => s + t.durationMs, 0);
822
- console.log(`║ ║`);
823
- console.log(`║ --preset ${preset.padEnd(6)} (${String(toolCount).padStart(3)} tools, ~${String(tokens).padStart(5)} schema tokens) ║`);
824
- console.log(`║ Phases: ${completedPhases}/${totalPhases} completed ║`);
825
- console.log(`║ Tools: ${totalCalls} called, ${totalMissing} missing, ${totalErrors} errors ║`);
826
- console.log(`║ Knowledge: ${knowledgeCount}/8 scenarios recorded learnings ║`);
827
- console.log(`║ Duration: ${duration}ms total ║`);
828
- }
829
- // Per-scenario breakdown
830
- console.log("║ ║");
831
- console.log("╠══════════════════════════════════════════════════════════════════════════╣");
832
- console.log("║ PER-SCENARIO BREAKDOWN ║");
833
- console.log("╠══════════════════════════════════════════════════════════════════════════╣");
834
- for (const scenario of SCENARIOS) {
835
- console.log(`║ ║`);
836
- console.log(`║ ${scenario.id.padEnd(40)} [${scenario.gapFilled}] ║`);
837
- for (const preset of PRESET_NAMES) {
838
- const t = allTrajectories.find((tr) => tr.preset === preset && tr.scenarioId === scenario.id);
839
- if (t) {
840
- const status = t.phasesCompleted === t.phases.length ? "PASS" : `${t.phasesCompleted}/${t.phases.length}`;
841
- console.log(`║ ${preset.padEnd(6)}: ${status.padEnd(6)} | calls=${String(t.totalToolCalls).padStart(3)} missing=${String(t.totalToolMissing).padStart(2)} | ${t.durationMs}ms ║`);
842
- }
843
- }
844
- }
845
- // Recommendations
846
- console.log("║ ║");
847
- console.log("╠══════════════════════════════════════════════════════════════════════════╣");
848
- console.log("║ RECOMMENDATIONS ║");
849
- console.log("╠══════════════════════════════════════════════════════════════════════════╣");
850
- console.log("║ ║");
851
- console.log("║ Discovery-first / new agents → --preset meta (self-escalate) ║");
852
- console.log("║ Solo dev, bug fixes, features → --preset lite (fast, core coverage) ║");
853
- console.log("║ Research + multi-agent teams → --preset core (full methodology) ║");
854
- console.log("║ Full pipeline + all domains → --preset full (zero blind spots) ║");
855
- console.log("║ ║");
856
- console.log("╚══════════════════════════════════════════════════════════════════════════╝");
857
- });
858
- });
859
- //# sourceMappingURL=presetRealWorldBench.test.js.map