nodebench-mcp 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  2. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  3. package/dist/dashboard/operatingServer.js +3 -2
  4. package/dist/dashboard/operatingServer.js.map +1 -1
  5. package/dist/db.js +51 -3
  6. package/dist/db.js.map +1 -1
  7. package/dist/index.js +13 -16
  8. package/dist/index.js.map +1 -1
  9. package/dist/packageInfo.d.ts +3 -0
  10. package/dist/packageInfo.js +32 -0
  11. package/dist/packageInfo.js.map +1 -0
  12. package/dist/sandboxApi.js +2 -1
  13. package/dist/sandboxApi.js.map +1 -1
  14. package/dist/tools/boilerplateTools.js +10 -9
  15. package/dist/tools/boilerplateTools.js.map +1 -1
  16. package/dist/tools/documentationTools.js +2 -1
  17. package/dist/tools/documentationTools.js.map +1 -1
  18. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  19. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  20. package/dist/tools/toolRegistry.js +11 -0
  21. package/dist/tools/toolRegistry.js.map +1 -1
  22. package/dist/toolsetRegistry.js +74 -1
  23. package/dist/toolsetRegistry.js.map +1 -1
  24. package/package.json +4 -3
  25. package/dist/__tests__/analytics.test.d.ts +0 -11
  26. package/dist/__tests__/analytics.test.js +0 -546
  27. package/dist/__tests__/analytics.test.js.map +0 -1
  28. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  29. package/dist/__tests__/architectComplex.test.js +0 -373
  30. package/dist/__tests__/architectComplex.test.js.map +0 -1
  31. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  32. package/dist/__tests__/architectSmoke.test.js +0 -92
  33. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  34. package/dist/__tests__/audit-registry.d.ts +0 -1
  35. package/dist/__tests__/audit-registry.js +0 -60
  36. package/dist/__tests__/audit-registry.js.map +0 -1
  37. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  38. package/dist/__tests__/batchAutopilot.test.js +0 -218
  39. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  40. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  41. package/dist/__tests__/cliSubcommands.test.js +0 -138
  42. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  43. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  44. package/dist/__tests__/comparativeBench.test.js +0 -722
  45. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  46. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  47. package/dist/__tests__/critterCalibrationEval.js +0 -370
  48. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  49. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  50. package/dist/__tests__/dynamicLoading.test.js +0 -280
  51. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  52. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  53. package/dist/__tests__/embeddingProvider.test.js +0 -86
  54. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  55. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  56. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  57. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  58. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  59. package/dist/__tests__/evalHarness.test.js +0 -1107
  60. package/dist/__tests__/evalHarness.test.js.map +0 -1
  61. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  62. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  63. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  64. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  65. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  66. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  67. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  69. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  70. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  72. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  73. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  74. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  75. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  76. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingScoring.test.js +0 -202
  78. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  79. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  80. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  81. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  83. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  84. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  86. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  87. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  90. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  91. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  92. package/dist/__tests__/helpers/answerMatch.js +0 -267
  93. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  94. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  95. package/dist/__tests__/helpers/textLlm.js +0 -214
  96. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  97. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  98. package/dist/__tests__/localDashboard.test.js +0 -226
  99. package/dist/__tests__/localDashboard.test.js.map +0 -1
  100. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  101. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  102. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  103. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  104. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  105. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  108. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  111. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  114. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  116. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  117. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  118. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  119. package/dist/__tests__/openclawDogfood.test.js +0 -535
  120. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  121. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  122. package/dist/__tests__/openclawMessaging.test.js +0 -232
  123. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  124. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  125. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  126. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  127. package/dist/__tests__/tools.test.d.ts +0 -1
  128. package/dist/__tests__/tools.test.js +0 -3201
  129. package/dist/__tests__/tools.test.js.map +0 -1
  130. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  131. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  132. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  133. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  134. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  135. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  136. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  137. package/dist/__tests__/webmcpTools.test.js +0 -195
  138. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  139. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  140. package/dist/benchmarks/testProviderBus.js +0 -272
  141. package/dist/benchmarks/testProviderBus.js.map +0 -1
  142. package/dist/hooks/postCompaction.d.ts +0 -14
  143. package/dist/hooks/postCompaction.js +0 -51
  144. package/dist/hooks/postCompaction.js.map +0 -1
  145. package/dist/security/__tests__/security.test.d.ts +0 -8
  146. package/dist/security/__tests__/security.test.js +0 -295
  147. package/dist/security/__tests__/security.test.js.map +0 -1
  148. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  149. package/dist/sync/hyperloopEval.test.js +0 -60
  150. package/dist/sync/hyperloopEval.test.js.map +0 -1
  151. package/dist/sync/store.test.d.ts +0 -4
  152. package/dist/sync/store.test.js +0 -43
  153. package/dist/sync/store.test.js.map +0 -1
  154. package/dist/tools/documentTools.d.ts +0 -5
  155. package/dist/tools/documentTools.js +0 -524
  156. package/dist/tools/documentTools.js.map +0 -1
  157. package/dist/tools/financialTools.d.ts +0 -10
  158. package/dist/tools/financialTools.js +0 -403
  159. package/dist/tools/financialTools.js.map +0 -1
  160. package/dist/tools/memoryTools.d.ts +0 -5
  161. package/dist/tools/memoryTools.js +0 -137
  162. package/dist/tools/memoryTools.js.map +0 -1
  163. package/dist/tools/planningTools.d.ts +0 -5
  164. package/dist/tools/planningTools.js +0 -147
  165. package/dist/tools/planningTools.js.map +0 -1
  166. package/dist/tools/searchTools.d.ts +0 -5
  167. package/dist/tools/searchTools.js +0 -145
  168. package/dist/tools/searchTools.js.map +0 -1
@@ -1,1099 +0,0 @@
1
- /**
2
- * Toolset Gating Evaluation — Real Trajectory Comparison
3
- *
4
- * Runs 9 diverse real-world scenarios through lite, core, and full presets.
5
- * Scenario categories inspired by SWE-bench Pro, GAIA, TAU-bench, MCP-AgentBench,
6
- * and real tasks from the nodebench-ai codebase.
7
- *
8
- * Categories:
9
- * - Bug fix (model fallback, cron lifecycle)
10
- * - Feature implementation (governance appeal, OAuth token rotation)
11
- * - Refactoring (cross-branch dedup reconciliation)
12
- * - Multi-agent coordination (parallel pipeline refactor, swarm state isolation)
13
- * - Deployment / canary (model canary rollout)
14
- * - Performance (query optimization)
15
- *
16
- * Measures:
17
- * - Which phases complete vs fail (tool not found)
18
- * - Concrete impact delta between presets
19
- * - Token surface area reduction (tool count × estimated schema tokens)
20
- * - Whether lite/core catch enough per scenario category
21
- *
22
- * This answers: "If I gate to --preset lite, what do I lose per scenario type?"
23
- */
24
- import { describe, it, expect, afterAll } from "vitest";
25
- import { verificationTools } from "../tools/verificationTools.js";
26
- import { reconTools } from "../tools/reconTools.js";
27
- import { evalTools } from "../tools/evalTools.js";
28
- import { qualityGateTools } from "../tools/qualityGateTools.js";
29
- import { flywheelTools } from "../tools/flywheelTools.js";
30
- import { learningTools } from "../tools/learningTools.js";
31
- import { agentBootstrapTools } from "../tools/agentBootstrapTools.js";
32
- import { selfEvalTools } from "../tools/selfEvalTools.js";
33
- import { parallelAgentTools } from "../tools/parallelAgentTools.js";
34
- import { uiCaptureTools } from "../tools/uiCaptureTools.js";
35
- import { visionTools } from "../tools/visionTools.js";
36
- import { webTools } from "../tools/webTools.js";
37
- import { githubTools } from "../tools/githubTools.js";
38
- import { documentationTools } from "../tools/documentationTools.js";
39
- import { localFileTools, gaiaMediaSolvers } from "../tools/localFileTools.js";
40
- import { llmTools } from "../tools/llmTools.js";
41
- import { securityTools } from "../tools/securityTools.js";
42
- import { platformTools } from "../tools/platformTools.js";
43
- import { researchWritingTools } from "../tools/researchWritingTools.js";
44
- import { flickerDetectionTools } from "../tools/flickerDetectionTools.js";
45
- import { figmaFlowTools } from "../tools/figmaFlowTools.js";
46
- import { boilerplateTools } from "../tools/boilerplateTools.js";
47
- import { cCompilerBenchmarkTools } from "../tools/cCompilerBenchmarkTools.js";
48
- import { sessionMemoryTools } from "../tools/sessionMemoryTools.js";
49
- import { toonTools } from "../tools/toonTools.js";
50
- import { patternTools } from "../tools/patternTools.js";
51
- import { gitWorkflowTools } from "../tools/gitWorkflowTools.js";
52
- import { seoTools } from "../tools/seoTools.js";
53
- import { voiceBridgeTools } from "../tools/voiceBridgeTools.js";
54
- import { critterTools } from "../tools/critterTools.js";
55
- import { emailTools } from "../tools/emailTools.js";
56
- import { rssTools } from "../tools/rssTools.js";
57
- import { architectTools } from "../tools/architectTools.js";
58
- import { createMetaTools } from "../tools/metaTools.js";
59
- // ═══════════════════════════════════════════════════════════════════════════
60
- // PRESET DEFINITIONS (mirrors index.ts TOOLSET_MAP + PRESETS exactly)
61
- // ═══════════════════════════════════════════════════════════════════════════
62
// Lookup table: toolset key -> the tool collection imported at the top of this file.
// PRESETS lists these keys, buildToolset resolves them through this object, and the
// `full` preset is derived from Object.keys(TOOLSET_MAP), so adding a key here
// automatically widens `full`.
// Two keys do not follow the "<key>Tools" naming of their import: `benchmark` points at
// cCompilerBenchmarkTools and `gaia_solvers` points at gaiaMediaSolvers (exported from
// localFileTools.js alongside localFileTools).
// NOTE(review): values are presumably arrays of tool objects (buildToolset flattens them
// and runTrajectory reads `.name` on each element) — confirm against the tool modules.
- const TOOLSET_MAP = {
63
- verification: verificationTools,
64
- eval: evalTools,
65
- quality_gate: qualityGateTools,
66
- learning: learningTools,
67
- flywheel: flywheelTools,
68
- recon: reconTools,
69
- ui_capture: uiCaptureTools,
70
- vision: visionTools,
71
- local_file: localFileTools,
72
- web: webTools,
73
- github: githubTools,
74
- docs: documentationTools,
75
- bootstrap: agentBootstrapTools,
76
- self_eval: selfEvalTools,
77
- parallel: parallelAgentTools,
78
- llm: llmTools,
79
- security: securityTools,
80
- platform: platformTools,
81
- research_writing: researchWritingTools,
82
- flicker_detection: flickerDetectionTools,
83
- figma_flow: figmaFlowTools,
84
- boilerplate: boilerplateTools,
85
- benchmark: cCompilerBenchmarkTools,
86
- session_memory: sessionMemoryTools,
87
- gaia_solvers: gaiaMediaSolvers,
88
- toon: toonTools,
89
- pattern: patternTools,
90
- git_workflow: gitWorkflowTools,
91
- seo: seoTools,
92
- voice_bridge: voiceBridgeTools,
93
- critter: critterTools,
94
- email: emailTools,
95
- rss: rssTools,
96
- architect: architectTools,
97
- };
98
// Preset name -> list of TOOLSET_MAP keys that the preset enables.
//   meta: no domain toolsets (buildToolset still appends the createMetaTools output).
//   lite: 8 keys.
//   core: 26 keys; relative to `full` it omits ui_capture, vision, local_file, web,
//         github, docs, parallel and gaia_solvers.
//   full: every key of TOOLSET_MAP, computed at module load.
- const PRESETS = {
99
- meta: [],
100
- lite: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "security", "boilerplate"],
101
- core: ["verification", "eval", "quality_gate", "learning", "flywheel", "recon", "bootstrap", "self_eval", "llm", "security", "platform", "research_writing", "flicker_detection", "figma_flow", "boilerplate", "benchmark", "session_memory", "toon", "pattern", "git_workflow", "seo", "voice_bridge", "critter", "email", "rss", "architect"],
102
- full: Object.keys(TOOLSET_MAP),
103
- };
104
// Resolves a preset name to the concrete list of tools exposed under that preset.
//   preset: a key of PRESETS ("meta", "lite", "core" or "full").
//   returns: a new array holding the domain tools followed by whatever
//            createMetaTools(domain) returns.
// NOTE(review): a name that is not a key of PRESETS leaves `keys` undefined, so the
// flatMap call below throws a TypeError rather than returning an empty toolset.
- function buildToolset(preset) {
105
- const keys = PRESETS[preset];
106
// Keys with no TOOLSET_MAP entry contribute nothing (`?? []`) instead of throwing.
- const domain = keys.flatMap((k) => TOOLSET_MAP[k] ?? []);
107
// The meta tools are built from the already-filtered domain list and appended after it.
- return [...domain, ...createMetaTools(domain)];
108
- }
109
- // ═══════════════════════════════════════════════════════════════════════════
110
- // 9 DIVERSE SCENARIOS — from actual production codebase
111
- // Categories: bug_fix, feature, refactor, operational, security, performance, deployment
112
- // Inspired by SWE-bench Pro, GAIA, TAU-bench, MCP-AgentBench scenario diversity
113
- // ═══════════════════════════════════════════════════════════════════════════
114
// Scenario fixtures fed to runTrajectory (9 entries). Fields read by the runner:
//   id          - slug used in session ids, cycle titles and eval run names
//   prompt      - task description; the runner only uses slices of it as tool arguments
//   domain      - free-text area label, interpolated into tool arguments
//   category    - one of bug_fix, feature, security, refactor, operational, deployment,
//                 performance in this list
//   complexity  - "medium" or "high"; "high" raises the number of recon findings and gaps
//                 the runner logs and adds extra quality-gate rules
//   blindSpots  - strings indexed positionally by the runner as finding/gap text
// NOTE(review): "oauth-token-rotation" sits under the "Feature Implementation" divider
// but carries category "security". "linkedin-parallel-refactor" is the only entry with
// four blindSpots; every other entry has three.
- const SCENARIOS = [
115
- // ─── Bug Fix ───
116
- {
117
- id: "model-fallback-chain",
118
- prompt: "The free model resolver isn't falling back correctly. When glm-4-flash-250414 returns 429, we should try the next model in the chain but instead the agent just errors out. Fix executeWithModelFallback in modelResolver.ts.",
119
- domain: "Model Resolution",
120
- category: "bug_fix",
121
- complexity: "medium",
122
- blindSpots: [
123
- "Fallback chain doesn't skip models that returned 429 in the last 5 minutes",
124
- "No exponential backoff — retries slam the rate-limited endpoint immediately",
125
- "Missing telemetry: which model actually served the response is never logged",
126
- ],
127
- },
128
- {
129
- id: "digest-cron-silent-fail",
130
- prompt: "The daily digest agent hasn't produced output in 4 days. No errors in logs. Is the cron firing? Is the heartbeat blocking? Trace the full lifecycle from crons.ts through digestAgent.ts.",
131
- domain: "Agent Loop",
132
- category: "bug_fix",
133
- complexity: "high",
134
- blindSpots: [
135
- "Heartbeat rate limiting silently returns success but blocks execution",
136
- "listAgents returns empty if no agents have 'active' status in DB",
137
- "No timeout on executeAgentWorkCycle — hung LLM call stalls entire cron tick",
138
- ],
139
- },
140
- // ─── Feature Implementation ───
141
- {
142
- id: "governance-appeal-workflow",
143
- prompt: "We have quarantine.ts to pause misbehaving agents, but no way for them to appeal or auto-remediate. Build a system where agents can request trust tier upgrades after 7 days without incidents, with human-in-the-loop appeal review.",
144
- domain: "Governance",
145
- category: "feature",
146
- complexity: "high",
147
- blindSpots: [
148
- "Appeal versioning & history — no table for tracking appeal requests, success rates, or preventing appeal spam",
149
- "Trust score decay logic — static TRUST_TIER_SCORES with no time-weighted rebuild from incident-free periods",
150
- "Cross-domain impact — lifting quarantine for post_to_linkedin should sync across allowedTools and allowedChannels without manual intervention",
151
- ],
152
- },
153
- {
154
- id: "oauth-token-rotation",
155
- prompt: "LinkedIn tokens expire in 60 days. We refresh proactively 7 days before, but if the refresh fails and we have no refresh_token, posting just silently fails. Build a proper fallback: system token → user token → expired-but-retry, with alerting.",
156
- domain: "LinkedIn Pipeline",
157
- category: "security",
158
- complexity: "medium",
159
- blindSpots: [
160
- "Token state machine missing — code checks boolean 'is expired', should model: valid → expiring_soon → expired_can_refresh → expired_final → requires_reauth",
161
- "Retry budget exhaustion — if refresh fails 5x, should escalate alert severity, not just log",
162
- "Scope reduction fallback — if full refresh fails, fall back to posting-only scope (LinkedIn API supports it), not all-or-nothing failure",
163
- ],
164
- },
165
- // ─── Refactoring ───
166
- {
167
- id: "dd-cross-branch-dedup",
168
- prompt: "Due diligence spawns 5 parallel branches (company, team, market, technical, regulatory), but results are full of contradictions: Team branch says founder left, Market branch says he's still there. Build cross-branch verification that detects and auto-resolves contradictions by source reliability.",
169
- domain: "Due Diligence",
170
- category: "refactor",
171
- complexity: "high",
172
- blindSpots: [
173
- "Entity linking across branches — Team extracts 'founder: John Smith', Market extracts 'CEO: John Smith Jr.' — needs fuzzy matching not naive string dedup",
174
- "Source reliability weighting — contradiction between LinkedIn (primary) and archived tweet (secondary) should favor LinkedIn; SourceReliability enum exists but not used in conflict resolution",
175
- "Partial confidence updates — resolving contradiction should update original branch confidence score, not return flat contradiction list",
176
- ],
177
- },
178
- // ─── Multi-Agent Coordination ───
179
- {
180
- id: "linkedin-parallel-refactor",
181
- prompt: "I need to refactor the LinkedIn posting pipeline so 3 Claude Code subagents can work on it in parallel: one on posting, one on archive dedup, one on scheduling. They keep overwriting each other's changes. Set up coordination.",
182
- domain: "LinkedIn Pipeline",
183
- category: "operational",
184
- complexity: "high",
185
- blindSpots: [
186
- "No task claiming — both agents see the same dedup bug and both fix it",
187
- "No progress file — third agent re-investigates what agent 1 already solved",
188
- "No context budget tracking — agent 2 hits context limit mid-fix, loses work",
189
- "No oracle comparison — merged output has conflict markers nobody catches",
190
- ],
191
- },
192
- {
193
- id: "swarm-state-isolation",
194
- prompt: "We spawn parallel subagents (DocumentAgent, MediaAgent, OpenBBAgent) in swarmOrchestrator. Sometimes they step on each other's messages in the same thread — both write to threadId=X simultaneously. Build proper message locking so agents don't clobber each other's outputs.",
195
- domain: "Agent Loop",
196
- category: "operational",
197
- complexity: "high",
198
- blindSpots: [
199
- "Message ordering guarantees — parallel agents write to same thread; if DocumentAgent finishes before MediaAgent, message order is wrong in UI",
200
- "Checkpoint contention — CheckpointManager.start() may lose concurrent updates from multiple agents despite Convex OCC",
201
- "Partial failure recovery — if one agent crashes after checkpoint but before writing final message, next agent doesn't know to re-read context",
202
- ],
203
- },
204
- // ─── Deployment / Canary ───
205
- {
206
- id: "model-canary-rollout",
207
- prompt: "We hardcoded model selection in autonomousConfig.ts (SYNTHESIS_MODEL = 'qwen3-coder-free'). Implement canary rollout: test new models on 10% of jobs, track quality, auto-promote to 100% if success rate > 95%, auto-rollback if < 80%.",
208
- domain: "Model Resolution",
209
- category: "deployment",
210
- complexity: "medium",
211
- blindSpots: [
212
- "Canary slot assignment — need deterministic hash of job ID (hash(jobId) % 100 < canaryPercent), not random, so same job never switches models mid-retry",
213
- "Success metric definition — 'success rate' is ambiguous: tool error rate? output quality? latency? Need multi-factor gate with independent thresholds",
214
- "Model state drift — rolling back from Model-B to Model-A but old jobs cached with Model-B responses; resuming from checkpoint confuses model_id",
215
- ],
216
- },
217
- // ─── Performance ───
218
- {
219
- id: "archive-query-optimization",
220
- prompt: "The LinkedIn archive page takes 8 seconds to load for companies with 500+ posts. The query does a full table scan with JS-side filtering and .take(500) pagination. Optimize with proper indexes, cursor-based pagination, and server-side filtering.",
221
- domain: "LinkedIn Pipeline",
222
- category: "performance",
223
- complexity: "medium",
224
- blindSpots: [
225
- "Archive lookback uses .take(500) with no cursor — page 2 re-scans rows from page 1, O(n^2) total reads",
226
- "JS-side filtering of personaType and contentSource happens after fetching all rows — should be index-based",
227
- "Dedup hash (cyrb53) is computed on every query, not stored as indexed column — can't deduplicate at DB level",
228
- ],
229
- },
230
- ];
231
- // ═══════════════════════════════════════════════════════════════════════════
232
- // TRAJECTORY RUNNER
233
- // ═══════════════════════════════════════════════════════════════════════════
234
// Module-level record of every tool call attempted through tryCall, shared across all
// trajectories. Each entry is { preset, scenario, tool, phase, status } where status is
// "missing", "success" or "error".
- const callLog = [];
235
// Identifiers gathered while trajectories run. cycleIds receives the cycleId of each
// verification cycle that starts successfully.
// NOTE(review): presumably consumed by a teardown hook outside this view (afterAll is
// imported); no write to learningKeys is visible here — confirm.
- const cleanup = { cycleIds: [], learningKeys: [] };
236
- async function runTrajectory(preset, scenario) {
237
- const tools = buildToolset(preset);
238
- const toolMap = new Map(tools.map((t) => [t.name, t]));
239
- const sid = `gating-${preset}-${scenario.id}`;
240
- const trajectory = {
241
- preset,
242
- scenarioId: scenario.id,
243
- toolCount: tools.length,
244
- estimatedSchemaTokens: tools.length * 200, // ~200 tokens per tool schema avg
245
- phases: [],
246
- phasesCompleted: 0,
247
- phasesSkipped: 0,
248
- totalToolCalls: 0,
249
- issuesDetected: 0,
250
- reconFindings: 0,
251
- evalCases: 0,
252
- gateRules: 0,
253
- learningRecorded: false,
254
- flywheelComplete: false,
255
- riskAssessed: false,
256
- };
257
// Invokes one tool by name against the current preset's toolMap and records the outcome
// in callLog.
//   name  - tool name to look up in toolMap
//   args  - argument object passed straight to the tool's handler
//   phase - label stored with the callLog entry (e.g. "meta", "recon")
// Returns the handler's result, or null when the tool is absent from this preset or the
// handler throws. Never rejects: handler errors are converted to a null return.
// trajectory.totalToolCalls counts calls that reached a handler (resolved or thrown);
// lookups for missing tools are not counted.
// NOTE(review): callers in this file test the return value for truthiness, so a handler
// that throws (or resolves to a falsy value) is indistinguishable from a missing tool at
// the call site; only callLog's status field keeps the distinction.
- async function tryCall(name, args, phase) {
258
- const tool = toolMap.get(name);
259
- if (!tool) {
260
- callLog.push({ preset, scenario: scenario.id, tool: name, phase, status: "missing" });
261
- return null; // tool not available in this preset
262
- }
263
- try {
264
- const result = await tool.handler(args);
265
- callLog.push({ preset, scenario: scenario.id, tool: name, phase, status: "success" });
266
- trajectory.totalToolCalls++;
267
- return result;
268
- }
269
// The caught error is discarded: neither its message nor its stack is stored anywhere.
- catch (err) {
270
- callLog.push({ preset, scenario: scenario.id, tool: name, phase, status: "error" });
271
- trajectory.totalToolCalls++;
272
- return null;
273
- }
274
- }
275
- // ─── Phase 1: META — tool discovery ───
276
- {
277
- const called = [];
278
- const missing = [];
279
- const ft = await tryCall("findTools", { query: `${scenario.domain} ${scenario.category}` }, "meta");
280
- ft ? called.push("findTools") : missing.push("findTools");
281
- const gm = await tryCall("getMethodology", { topic: "verification" }, "meta");
282
- gm ? called.push("getMethodology") : missing.push("getMethodology");
283
- const success = called.length > 0;
284
- trajectory.phases.push({ phase: "meta", toolsCalled: called, toolsMissing: missing, success });
285
- if (success)
286
- trajectory.phasesCompleted++;
287
- else
288
- trajectory.phasesSkipped++;
289
- }
290
- // ─── Phase 2: RECON — structured research ───
291
- {
292
- const called = [];
293
- const missing = [];
294
- const recon = await tryCall("run_recon", {
295
- target: `${scenario.domain}: ${scenario.prompt.slice(0, 80)}`,
296
- description: `Gating eval: ${scenario.prompt.slice(0, 120)}`,
297
- }, "recon");
298
- if (recon) {
299
- called.push("run_recon");
300
- const findingCount = scenario.complexity === "high" ? 3 : 2;
301
- for (let f = 0; f < findingCount; f++) {
302
- const r = await tryCall("log_recon_finding", {
303
- sessionId: recon.sessionId,
304
- category: f === 0 ? "codebase_pattern" : "existing_implementation",
305
- summary: scenario.blindSpots[f] || `Pattern in ${scenario.domain}`,
306
- relevance: `Impacts: ${scenario.prompt.slice(0, 60)}`,
307
- }, "recon");
308
- if (r) {
309
- called.push("log_recon_finding");
310
- trajectory.reconFindings++;
311
- }
312
- }
313
- await tryCall("get_recon_summary", { sessionId: recon.sessionId }, "recon");
314
- called.push("get_recon_summary");
315
- }
316
- else {
317
- missing.push("run_recon", "log_recon_finding", "get_recon_summary");
318
- }
319
- // Additional recon tools
320
- const fwCheck = await tryCall("check_framework_updates", { ecosystem: "mcp" }, "recon");
321
- fwCheck ? called.push("check_framework_updates") : missing.push("check_framework_updates");
322
- const projCtx = await tryCall("get_project_context", {}, "recon");
323
- projCtx ? called.push("get_project_context") : missing.push("get_project_context");
324
- const success = called.length > 0;
325
- trajectory.phases.push({ phase: "recon", toolsCalled: called, toolsMissing: missing, success });
326
- if (success)
327
- trajectory.phasesCompleted++;
328
- else
329
- trajectory.phasesSkipped++;
330
- }
331
- // ─── Phase 3: RISK — assessment ───
332
- {
333
- const called = [];
334
- const missing = [];
335
- const risk = await tryCall("assess_risk", {
336
- action: "fix_implementation",
337
- context: `${scenario.domain} — ${scenario.complexity} — ${scenario.prompt.slice(0, 80)}`,
338
- }, "risk");
339
- if (risk) {
340
- called.push("assess_risk");
341
- trajectory.riskAssessed = true;
342
- }
343
- else {
344
- missing.push("assess_risk");
345
- }
346
- const success = called.length > 0;
347
- trajectory.phases.push({ phase: "risk", toolsCalled: called, toolsMissing: missing, success });
348
- if (success)
349
- trajectory.phasesCompleted++;
350
- else
351
- trajectory.phasesSkipped++;
352
- }
353
- // ─── Phase 4: VERIFICATION — tracked implementation cycle ───
354
- {
355
- const called = [];
356
- const missing = [];
357
- const cycle = await tryCall("start_verification_cycle", {
358
- title: `gating-eval-${preset}-${scenario.id}`,
359
- description: scenario.prompt.slice(0, 200),
360
- }, "verification");
361
- if (cycle) {
362
- called.push("start_verification_cycle");
363
- cleanup.cycleIds.push(cycle.cycleId);
364
- // Phase findings
365
- await tryCall("log_phase_findings", {
366
- cycleId: cycle.cycleId, phaseNumber: 1, status: "passed",
367
- findings: { domain: scenario.domain, reconFindings: trajectory.reconFindings },
368
- }, "verification");
369
- called.push("log_phase_findings");
370
- await tryCall("log_phase_findings", {
371
- cycleId: cycle.cycleId, phaseNumber: 2, status: "passed",
372
- findings: { fixApplied: true },
373
- }, "verification");
374
- // Log gaps from blind spots
375
- const gapCount = scenario.complexity === "high" ? 2 : 1;
376
- const gapIds = [];
377
- for (let g = 0; g < gapCount; g++) {
378
- const gap = await tryCall("log_gap", {
379
- cycleId: cycle.cycleId,
380
- severity: g === 0 ? (scenario.complexity === "high" ? "HIGH" : "MEDIUM") : "MEDIUM",
381
- title: `gating-eval-${scenario.blindSpots[g]?.slice(0, 50) || scenario.id}`,
382
- description: scenario.blindSpots[g] || `Issue in ${scenario.domain}`,
383
- rootCause: "Discovered via structured recon",
384
- fixStrategy: `Fix ${scenario.category} in ${scenario.domain}`,
385
- }, "verification");
386
- if (gap) {
387
- called.push("log_gap");
388
- gapIds.push(gap.gapId);
389
- trajectory.issuesDetected++;
390
- }
391
- }
392
- // Resolve gaps
393
- for (const gapId of gapIds) {
394
- await tryCall("resolve_gap", { gapId }, "verification");
395
- called.push("resolve_gap");
396
- }
397
- // 3-layer testing
398
- for (const layer of ["static", "unit", "integration"]) {
399
- await tryCall("log_test_result", {
400
- cycleId: cycle.cycleId, layer,
401
- label: `gating-eval-${preset}-${scenario.id}-${layer}`,
402
- passed: true, output: `${layer} pass`,
403
- }, "verification");
404
- called.push("log_test_result");
405
- }
406
- // Check status and list cycles
407
- const status = await tryCall("get_verification_status", { cycleId: cycle.cycleId }, "verification");
408
- if (status)
409
- called.push("get_verification_status");
410
- const cycleList = await tryCall("list_verification_cycles", { limit: 5 }, "verification");
411
- if (cycleList)
412
- called.push("list_verification_cycles");
413
- }
414
- else {
415
- missing.push("start_verification_cycle", "log_gap", "log_test_result", "get_verification_status", "list_verification_cycles");
416
- }
417
- const success = called.length > 0;
418
- trajectory.phases.push({ phase: "verification", toolsCalled: called, toolsMissing: missing, success });
419
- if (success)
420
- trajectory.phasesCompleted++;
421
- else
422
- trajectory.phasesSkipped++;
423
- }
424
- // ─── Phase 5: EVAL — regression cases ───
425
- {
426
- const called = [];
427
- const missing = [];
428
- const evalRun = await tryCall("start_eval_run", {
429
- name: `gating-eval-${preset}-${scenario.id}`,
430
- description: `Regression eval for ${scenario.domain}`,
431
- cases: [
432
- { input: scenario.prompt.slice(0, 100), intent: `Verify ${scenario.category} fix` },
433
- { input: `Regression guard for ${scenario.id}`, intent: `Prevent regression` },
434
- ],
435
- }, "eval");
436
- if (evalRun) {
437
- called.push("start_eval_run");
438
- for (const caseId of evalRun.caseIds) {
439
- await tryCall("record_eval_result", {
440
- caseId, actual: "Fix verified", verdict: "pass", score: 0.92,
441
- }, "eval");
442
- called.push("record_eval_result");
443
- trajectory.evalCases++;
444
- }
445
- await tryCall("complete_eval_run", { runId: evalRun.runId }, "eval");
446
- called.push("complete_eval_run");
447
- // List and compare runs
448
- const runList = await tryCall("list_eval_runs", { limit: 5 }, "eval");
449
- if (runList)
450
- called.push("list_eval_runs");
451
- // Compare with self (validates the tool works even if baseline === candidate)
452
- const cmp = await tryCall("compare_eval_runs", {
453
- baselineRunId: evalRun.runId,
454
- candidateRunId: evalRun.runId,
455
- }, "eval");
456
- if (cmp)
457
- called.push("compare_eval_runs");
458
- }
459
- else {
460
- missing.push("start_eval_run", "record_eval_result", "complete_eval_run", "list_eval_runs", "compare_eval_runs");
461
- }
462
- const success = called.length > 0;
463
- trajectory.phases.push({ phase: "eval", toolsCalled: called, toolsMissing: missing, success });
464
- if (success)
465
- trajectory.phasesCompleted++;
466
- else
467
- trajectory.phasesSkipped++;
468
- }
469
- // ─── Phase 6: QUALITY GATE ───
470
- {
471
- const called = [];
472
- const missing = [];
473
- const rules = [
474
- { name: "all_tests_pass", passed: true },
475
- { name: "no_type_errors", passed: true },
476
- { name: "no_lint_violations", passed: true },
477
- { name: "coverage_threshold", passed: scenario.complexity !== "high" },
478
- ];
479
- if (scenario.complexity === "high") {
480
- rules.push({ name: "regression_cases_exist", passed: true });
481
- rules.push({ name: "edge_cases_covered", passed: true });
482
- }
483
- const gate = await tryCall("run_quality_gate", {
484
- gateName: "deploy_readiness",
485
- target: `gating-eval-${preset}-${scenario.id}`,
486
- rules,
487
- }, "quality-gate");
488
- if (gate) {
489
- called.push("run_quality_gate");
490
- trajectory.gateRules = rules.length;
491
- }
492
- else {
493
- missing.push("run_quality_gate");
494
- }
495
- const cl = await tryCall("run_closed_loop", {
496
- steps: [{ step: "compile", passed: true }, { step: "lint", passed: true }, { step: "test", passed: true }],
497
- }, "quality-gate");
498
- if (cl)
499
- called.push("run_closed_loop");
500
- else
501
- missing.push("run_closed_loop");
502
- // Gate preset and history
503
- const gp = await tryCall("get_gate_preset", { preset: "deploy_readiness" }, "quality-gate");
504
- gp ? called.push("get_gate_preset") : missing.push("get_gate_preset");
505
- const gh = await tryCall("get_gate_history", {
506
- gateName: "deploy_readiness",
507
- limit: 5,
508
- }, "quality-gate");
509
- gh ? called.push("get_gate_history") : missing.push("get_gate_history");
510
- const success = called.length > 0;
511
- trajectory.phases.push({ phase: "quality-gate", toolsCalled: called, toolsMissing: missing, success });
512
- if (success)
513
- trajectory.phasesCompleted++;
514
- else
515
- trajectory.phasesSkipped++;
516
- }
517
- // ─── Phase 7: KNOWLEDGE — search + record ───
518
- {
519
- const called = [];
520
- const missing = [];
521
- const prior = await tryCall("search_all_knowledge", {
522
- query: `gating ${scenario.domain}`,
523
- }, "knowledge");
524
- if (prior)
525
- called.push("search_all_knowledge");
526
- else
527
- missing.push("search_all_knowledge");
528
- const lkey = `gating-eval-${preset}-${scenario.id}-${Date.now()}`;
529
- cleanup.learningKeys.push(lkey);
530
- const lr = await tryCall("record_learning", {
531
- key: lkey,
532
- category: "pattern",
533
- content: `[gating-eval] ${scenario.domain}: ${scenario.blindSpots[0]?.slice(0, 80)}`,
534
- tags: ["gating-eval", preset, scenario.domain.toLowerCase().replace(/\s+/g, "-")],
535
- }, "knowledge");
536
- if (lr) {
537
- called.push("record_learning");
538
- trajectory.learningRecorded = true;
539
- }
540
- else
541
- missing.push("record_learning");
542
- const success = called.length > 0;
543
- trajectory.phases.push({ phase: "knowledge", toolsCalled: called, toolsMissing: missing, success });
544
- if (success)
545
- trajectory.phasesCompleted++;
546
- else
547
- trajectory.phasesSkipped++;
548
- }
549
- // ─── Phase 8: FLYWHEEL — mandatory 6-step ───
550
- {
551
- const called = [];
552
- const missing = [];
553
- const fw = await tryCall("run_mandatory_flywheel", {
554
- target: `gating-eval-${preset}-${scenario.id}`,
555
- steps: [
556
- { stepName: "static_analysis", passed: true },
557
- { stepName: "happy_path_test", passed: true },
558
- { stepName: "failure_path_test", passed: true },
559
- { stepName: "gap_analysis", passed: true },
560
- { stepName: "fix_and_reverify", passed: true },
561
- { stepName: "deploy_and_document", passed: true },
562
- ],
563
- }, "flywheel");
564
- if (fw) {
565
- called.push("run_mandatory_flywheel");
566
- trajectory.flywheelComplete = fw.passed === true;
567
- }
568
- else {
569
- missing.push("run_mandatory_flywheel");
570
- }
571
- // Flywheel status check
572
- const fwStatus = await tryCall("get_flywheel_status", { includeHistory: false }, "flywheel");
573
- fwStatus ? called.push("get_flywheel_status") : missing.push("get_flywheel_status");
574
- // Promote to eval (needs a real cycleId from phase 4)
575
- const cycleId = cleanup.cycleIds[cleanup.cycleIds.length - 1];
576
- if (cycleId) {
577
- const promo = await tryCall("promote_to_eval", {
578
- cycleId,
579
- evalRunName: `gating-promoted-${preset}-${scenario.id}`,
580
- cases: [{ input: scenario.prompt.slice(0, 80), intent: `Regression guard for ${scenario.domain}` }],
581
- }, "flywheel");
582
- promo ? called.push("promote_to_eval") : missing.push("promote_to_eval");
583
- // Trigger investigation (needs evalRunId from promotion)
584
- if (promo?.evalRunId) {
585
- const inv = await tryCall("trigger_investigation", {
586
- evalRunId: promo.evalRunId,
587
- regressionDescription: `Potential regression in ${scenario.domain}: ${scenario.blindSpots[0]?.slice(0, 60)}`,
588
- }, "flywheel");
589
- inv ? called.push("trigger_investigation") : missing.push("trigger_investigation");
590
- }
591
- }
592
- const success = called.length > 0;
593
- trajectory.phases.push({ phase: "flywheel", toolsCalled: called, toolsMissing: missing, success });
594
- if (success)
595
- trajectory.phasesCompleted++;
596
- else
597
- trajectory.phasesSkipped++;
598
- }
599
- // ─── Phase 9 (operational scenarios): PARALLEL AGENT TOOLS ───
600
- if (scenario.category === "operational") {
601
- const called = [];
602
- const missing = [];
603
- // Bootstrap parallel session
604
- const bootstrap = await tryCall("bootstrap_parallel_agents", {
605
- dryRun: true,
606
- }, "parallel");
607
- bootstrap ? called.push("bootstrap_parallel_agents") : missing.push("bootstrap_parallel_agents");
608
- const claim = await tryCall("claim_agent_task", {
609
- taskKey: `gating-eval-${preset}-${scenario.id}-posting`,
610
- description: "Refactor LinkedIn posting module",
611
- }, "parallel");
612
- if (claim) {
613
- called.push("claim_agent_task");
614
- await tryCall("assign_agent_role", {
615
- role: "implementer", focusArea: "posting",
616
- }, "parallel");
617
- called.push("assign_agent_role");
618
- // Verify role assignment
619
- const role = await tryCall("get_agent_role", {}, "parallel");
620
- role ? called.push("get_agent_role") : missing.push("get_agent_role");
621
- // Log context budget during work
622
- const budget = await tryCall("log_context_budget", {
623
- eventType: "tool_output",
624
- tokensUsed: 3500,
625
- description: `Phase output for ${scenario.id}`,
626
- }, "parallel");
627
- budget ? called.push("log_context_budget") : missing.push("log_context_budget");
628
- // List tasks to verify claim
629
- const taskList = await tryCall("list_agent_tasks", { status: "claimed" }, "parallel");
630
- taskList ? called.push("list_agent_tasks") : missing.push("list_agent_tasks");
631
- await tryCall("get_parallel_status", { includeHistory: false }, "parallel");
632
- called.push("get_parallel_status");
633
- // Oracle comparison — validate merged output
634
- const oracle = await tryCall("run_oracle_comparison", {
635
- testLabel: `gating-eval-${preset}-${scenario.id}-merge`,
636
- actualOutput: `Fixed ${scenario.domain} posting module`,
637
- expectedOutput: `Fixed ${scenario.domain} posting module`,
638
- oracleSource: "gating-eval-reference",
639
- }, "parallel");
640
- oracle ? called.push("run_oracle_comparison") : missing.push("run_oracle_comparison");
641
- await tryCall("release_agent_task", {
642
- taskKey: `gating-eval-${preset}-${scenario.id}-posting`,
643
- status: "completed",
644
- progressNote: "Posting module refactored",
645
- }, "parallel");
646
- called.push("release_agent_task");
647
- // Generate coordination doc
648
- const agentsMd = await tryCall("generate_parallel_agents_md", {
649
- projectName: `gating-eval-${scenario.id}`,
650
- maxAgents: 3,
651
- }, "parallel");
652
- agentsMd ? called.push("generate_parallel_agents_md") : missing.push("generate_parallel_agents_md");
653
- }
654
- else {
655
- missing.push("claim_agent_task", "assign_agent_role", "get_agent_role", "log_context_budget", "list_agent_tasks", "get_parallel_status", "run_oracle_comparison", "release_agent_task", "generate_parallel_agents_md");
656
- }
657
- trajectory.phases.push({
658
- phase: "parallel",
659
- toolsCalled: called,
660
- toolsMissing: missing,
661
- success: called.length > 0,
662
- });
663
- if (called.length > 0)
664
- trajectory.phasesCompleted++;
665
- else
666
- trajectory.phasesSkipped++;
667
- }
668
- // ─── Phase 10: SELF-EVAL (all 6 tools) ───
669
- {
670
- const called = [];
671
- const missing = [];
672
- // Log a tool call for this trajectory
673
- const logCall = await tryCall("log_tool_call", {
674
- sessionId: sid,
675
- toolName: "run_recon",
676
- durationMs: 42,
677
- resultStatus: "success",
678
- phase: "recon",
679
- }, "self-eval");
680
- logCall ? called.push("log_tool_call") : missing.push("log_tool_call");
681
- // Get trajectory analysis
682
- const trajAnalysis = await tryCall("get_trajectory_analysis", {
683
- sessionId: sid,
684
- }, "self-eval");
685
- trajAnalysis ? called.push("get_trajectory_analysis") : missing.push("get_trajectory_analysis");
686
- // Get self-eval report
687
- const report = await tryCall("get_self_eval_report", {
688
- excludeTestSessions: true,
689
- }, "self-eval");
690
- report ? called.push("get_self_eval_report") : missing.push("get_self_eval_report");
691
- // Get improvement recommendations
692
- const recs = await tryCall("get_improvement_recommendations", {
693
- focus: "all",
694
- }, "self-eval");
695
- recs ? called.push("get_improvement_recommendations") : missing.push("get_improvement_recommendations");
696
- // Cleanup stale runs (dry run)
697
- const staleCleanup = await tryCall("cleanup_stale_runs", {
698
- dryRun: true,
699
- }, "self-eval");
700
- staleCleanup ? called.push("cleanup_stale_runs") : missing.push("cleanup_stale_runs");
701
- // Synthesize recon to learnings (dry run)
702
- const synth = await tryCall("synthesize_recon_to_learnings", {
703
- dryRun: true,
704
- }, "self-eval");
705
- synth ? called.push("synthesize_recon_to_learnings") : missing.push("synthesize_recon_to_learnings");
706
- if (called.length > 0 || missing.length > 0) {
707
- trajectory.phases.push({
708
- phase: "self-eval",
709
- toolsCalled: called,
710
- toolsMissing: missing,
711
- success: called.length > 0,
712
- });
713
- if (called.length > 0)
714
- trajectory.phasesCompleted++;
715
- else
716
- trajectory.phasesSkipped++;
717
- }
718
- }
719
- return trajectory;
720
- }
721
- // ═══════════════════════════════════════════════════════════════════════════
722
- // CLEANUP
723
- // ═══════════════════════════════════════════════════════════════════════════
724
- async function cleanupAll() {
725
- const fullTools = buildToolset("full");
726
- const findTool = (name) => fullTools.find((t) => t.name === name);
727
- for (const cycleId of cleanup.cycleIds) {
728
- try {
729
- await findTool("abandon_cycle")?.handler({ cycleId, reason: "gating eval cleanup" });
730
- }
731
- catch { /* ok */ }
732
- }
733
- for (const key of cleanup.learningKeys) {
734
- try {
735
- await findTool("delete_learning")?.handler({ key });
736
- }
737
- catch { /* ok */ }
738
- }
739
- }
740
- // ═══════════════════════════════════════════════════════════════════════════
741
- // TESTS
742
- // ═══════════════════════════════════════════════════════════════════════════
743
- const allTrajectories = [];
744
- describe("Toolset Gating Eval", () => {
745
- afterAll(async () => { await cleanupAll(); });
746
- for (const preset of ["meta", "lite", "core", "full"]) {
747
- describe(`Preset: ${preset}`, () => {
748
- for (const scenario of SCENARIOS) {
749
- it(`${preset}/${scenario.id}: runs 8-phase pipeline`, async () => {
750
- const t = await runTrajectory(preset, scenario);
751
- allTrajectories.push(t);
752
- // Meta phase always succeeds (findTools + getMethodology always present)
753
- const metaPhase = t.phases.find((p) => p.phase === "meta");
754
- expect(metaPhase?.success).toBe(true);
755
- if (preset === "meta") {
756
- // meta preset: only meta tools available — all other phases skipped
757
- expect(t.phasesCompleted).toBe(1); // only meta phase
758
- expect(t.toolCount).toBe(3); // findTools + getMethodology + check_mcp_setup
759
- }
760
- else {
761
- // lite, core, full: domain tools available
762
- const reconPhase = t.phases.find((p) => p.phase === "recon");
763
- expect(reconPhase?.success).toBe(true);
764
- const verifyPhase = t.phases.find((p) => p.phase === "verification");
765
- expect(verifyPhase?.success).toBe(true);
766
- const evalPhase = t.phases.find((p) => p.phase === "eval");
767
- expect(evalPhase?.success).toBe(true);
768
- const gatePhase = t.phases.find((p) => p.phase === "quality-gate");
769
- expect(gatePhase?.success).toBe(true);
770
- // Knowledge phase depends on preset (learning tools in lite + core + full)
771
- const knowledgePhase = t.phases.find((p) => p.phase === "knowledge");
772
- expect(knowledgePhase?.success).toBe(true);
773
- }
774
- }, 30_000);
775
- }
776
- });
777
- }
778
- describe("Flywheel availability", () => {
779
- it("meta preset does NOT have flywheel tools", () => {
780
- const noFlywheel = allTrajectories.filter((t) => t.preset === "meta");
781
- for (const t of noFlywheel) {
782
- const fw = t.phases.find((p) => p.phase === "flywheel");
783
- expect(fw?.success).toBe(false);
784
- }
785
- });
786
- it("lite, core, and full presets HAVE flywheel tools", () => {
787
- const withFlywheel = allTrajectories.filter((t) => t.preset === "lite" || t.preset === "core" || t.preset === "full");
788
- for (const t of withFlywheel) {
789
- expect(t.flywheelComplete).toBe(true);
790
- }
791
- });
792
- });
793
- describe("Parallel agent tools availability", () => {
794
- it("lite and core do NOT have parallel tools", () => {
795
- const parallelScenarios = allTrajectories.filter((t) => (t.scenarioId === "linkedin-parallel-refactor" || t.scenarioId === "swarm-state-isolation") && t.preset !== "full");
796
- for (const t of parallelScenarios) {
797
- const pp = t.phases.find((p) => p.phase === "parallel");
798
- if (pp) {
799
- expect(pp.success).toBe(false);
800
- expect(pp.toolsMissing).toContain("claim_agent_task");
801
- }
802
- }
803
- });
804
- it("full preset HAS parallel tools for parallel scenarios", () => {
805
- const fullParallel = allTrajectories.filter((t) => (t.scenarioId === "linkedin-parallel-refactor" || t.scenarioId === "swarm-state-isolation") && t.preset === "full");
806
- for (const t of fullParallel) {
807
- const pp = t.phases.find((p) => p.phase === "parallel");
808
- if (pp) {
809
- expect(pp.success).toBe(true);
810
- expect(pp.toolsCalled).toContain("claim_agent_task");
811
- }
812
- }
813
- });
814
- });
815
- describe("Self-eval availability", () => {
816
- it("meta and lite do NOT have self-eval tools", () => {
817
- const noSelfEval = allTrajectories.filter((t) => t.preset === "meta" || t.preset === "lite");
818
- for (const t of noSelfEval) {
819
- const se = t.phases.find((p) => p.phase === "self-eval");
820
- if (se)
821
- expect(se.success).toBe(false);
822
- }
823
- });
824
- it("core and full HAVE self-eval tools", () => {
825
- const coreFullTrajectories = allTrajectories.filter((t) => t.preset === "core" || t.preset === "full");
826
- for (const t of coreFullTrajectories) {
827
- const se = t.phases.find((p) => p.phase === "self-eval");
828
- expect(se?.success).toBe(true);
829
- }
830
- });
831
- });
832
- describe("Token surface area reduction", () => {
833
- it("meta has the fewest tools (only meta tools)", () => {
834
- const metaT = allTrajectories.find((t) => t.preset === "meta");
835
- const liteT = allTrajectories.find((t) => t.preset === "lite");
836
- expect(metaT.toolCount).toBe(3); // findTools + getMethodology + check_mcp_setup
837
- expect(metaT.toolCount).toBeLessThan(liteT.toolCount);
838
- const reduction = 1 - metaT.toolCount / liteT.toolCount;
839
- expect(reduction).toBeGreaterThan(0.9); // meta is 90%+ fewer tools than lite
840
- });
841
- it("lite reduces tool count and estimated token overhead vs full", () => {
842
- const liteT = allTrajectories.find((t) => t.preset === "lite");
843
- const fullT = allTrajectories.find((t) => t.preset === "full");
844
- expect(liteT.toolCount).toBeLessThan(fullT.toolCount);
845
- expect(liteT.estimatedSchemaTokens).toBeLessThan(fullT.estimatedSchemaTokens);
846
- const reduction = 1 - liteT.toolCount / fullT.toolCount;
847
- expect(reduction).toBeGreaterThan(0.5); // lite is at least 50% fewer tools
848
- });
849
- it("presets are ordered: meta < lite < core < full", () => {
850
- const metaT = allTrajectories.find((t) => t.preset === "meta");
851
- const liteT = allTrajectories.find((t) => t.preset === "lite");
852
- const coreT = allTrajectories.find((t) => t.preset === "core");
853
- const fullT = allTrajectories.find((t) => t.preset === "full");
854
- expect(metaT.toolCount).toBeLessThan(liteT.toolCount);
855
- expect(liteT.toolCount).toBeLessThan(coreT.toolCount);
856
- expect(coreT.toolCount).toBeLessThan(fullT.toolCount);
857
- });
858
- });
859
- });
860
- // ═══════════════════════════════════════════════════════════════════════════
861
- // TRAJECTORY COMPARISON REPORT
862
- // ═══════════════════════════════════════════════════════════════════════════
863
- describe("Toolset Gating Report", () => {
864
- it("generates trajectory comparison across presets", () => {
865
- expect(allTrajectories.length).toBe(36); // 4 presets × 9 scenarios
866
- console.log("\n");
867
- console.log("╔══════════════════════════════════════════════════════════════════════════════╗");
868
- console.log("║ TOOLSET GATING EVAL — Trajectory Comparison ║");
869
- console.log("║ 3 presets × 9 diverse scenarios = 27 trajectories ║");
870
- console.log("╚══════════════════════════════════════════════════════════════════════════════╝");
871
- console.log("");
872
- // ─── SECTION 1: TOOL COUNT & TOKEN OVERHEAD ───
873
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
874
- console.log("│ 1. TOOL COUNT & ESTIMATED TOKEN OVERHEAD │");
875
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
876
- for (const preset of ["meta", "lite", "core", "full"]) {
877
- const t = allTrajectories.find((tr) => tr.preset === preset);
878
- const bar = "█".repeat(Math.round(t.toolCount / 3));
879
- console.log(`│ ${preset.padEnd(6)} ${String(t.toolCount).padStart(3)} tools ~${String(t.estimatedSchemaTokens).padStart(5)} tokens ${bar}`.padEnd(79) + "│");
880
- }
881
- const liteT = allTrajectories.find((t) => t.preset === "lite");
882
- const fullT = allTrajectories.find((t) => t.preset === "full");
883
- const savings = Math.round((1 - liteT.estimatedSchemaTokens / fullT.estimatedSchemaTokens) * 100);
884
- console.log("│ │");
885
- console.log(`│ lite saves ~${savings}% token overhead vs full (${fullT.estimatedSchemaTokens - liteT.estimatedSchemaTokens} fewer tokens/call)`.padEnd(79) + "│");
886
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
887
- console.log("");
888
- // ─── SECTION 2: PHASE COMPLETION MATRIX ───
889
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
890
- console.log("│ 2. PHASE COMPLETION MATRIX │");
891
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
892
- console.log("│ Phase lite core full │");
893
- console.log("│ ───────────────── ────── ────── ────── │");
894
- const allPhaseNames = ["meta", "recon", "risk", "verification", "eval", "quality-gate", "knowledge", "flywheel", "parallel", "self-eval"];
895
- for (const phase of allPhaseNames) {
896
- const cols = [];
897
- for (const preset of ["meta", "lite", "core", "full"]) {
898
- const trajectories = allTrajectories.filter((t) => t.preset === preset);
899
- const phaseResults = trajectories.map((t) => t.phases.find((p) => p.phase === phase));
900
- const present = phaseResults.some((p) => p);
901
- if (!present) {
902
- cols.push(" -- ");
903
- }
904
- else {
905
- const allSuccess = phaseResults.every((p) => p?.success);
906
- const anySuccess = phaseResults.some((p) => p?.success);
907
- cols.push(allSuccess ? " OK " : anySuccess ? " PART " : " MISS ");
908
- }
909
- }
910
- console.log(`│ ${phase.padEnd(19)}${cols.join(" ")}`.padEnd(79) + "│");
911
- }
912
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
913
- console.log("");
914
- // ─── SECTION 3: IMPACT COMPARISON ───
915
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
916
- console.log("│ 3. CONCRETE IMPACT PER PRESET (aggregated across 9 scenarios) │");
917
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
918
- console.log("│ Metric lite core full │");
919
- console.log("│ ───────────────────────────── ────── ────── ────── │");
920
- for (const metric of [
921
- { label: "Issues detected", key: "issuesDetected" },
922
- { label: "Recon findings", key: "reconFindings" },
923
- { label: "Eval cases created", key: "evalCases" },
924
- { label: "Gate rules enforced", key: "gateRules" },
925
- { label: "Total tool calls", key: "totalToolCalls" },
926
- ]) {
927
- const cols = [];
928
- for (const preset of ["meta", "lite", "core", "full"]) {
929
- const sum = allTrajectories
930
- .filter((t) => t.preset === preset)
931
- .reduce((s, t) => s + t[metric.key], 0);
932
- cols.push(String(sum).padStart(4));
933
- }
934
- console.log(`│ ${metric.label.padEnd(30)}${cols.map((c) => c.padEnd(8)).join("")}`.padEnd(79) + "│");
935
- }
936
- // Boolean metrics
937
- for (const metric of [
938
- { label: "Risk assessed", fn: (t) => t.riskAssessed },
939
- { label: "Learning recorded", fn: (t) => t.learningRecorded },
940
- { label: "Flywheel complete", fn: (t) => t.flywheelComplete },
941
- ]) {
942
- const cols = [];
943
- for (const preset of ["meta", "lite", "core", "full"]) {
944
- const count = allTrajectories
945
- .filter((t) => t.preset === preset)
946
- .filter(metric.fn).length;
947
- cols.push(`${count}/${SCENARIOS.length}`);
948
- }
949
- console.log(`│ ${metric.label.padEnd(30)}${cols.map((c) => c.padStart(4).padEnd(8)).join("")}`.padEnd(79) + "│");
950
- }
951
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
952
- console.log("");
953
- // ─── SECTION 4: MISSING TOOLS LOG ───
954
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
955
- console.log("│ 4. TOOLS MISSING BY PRESET (what you lose with gating) │");
956
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
957
- for (const preset of ["meta", "lite", "core"]) {
958
- const missingCalls = callLog.filter((c) => c.preset === preset && c.status === "missing");
959
- const uniqueMissing = [...new Set(missingCalls.map((c) => c.tool))];
960
- if (uniqueMissing.length > 0) {
961
- console.log(`│ ${preset.toUpperCase()}: missing ${uniqueMissing.length} tools`.padEnd(79) + "│");
962
- for (const tool of uniqueMissing) {
963
- const phases = [...new Set(missingCalls.filter((c) => c.tool === tool).map((c) => c.phase))];
964
- console.log(`│ ${tool.padEnd(28)} (needed in: ${phases.join(", ")})`.padEnd(79) + "│");
965
- }
966
- console.log("│ │");
967
- }
968
- }
969
- console.log(`│ FULL: 0 missing tools (all ${fullT.toolCount} available)`.padEnd(79) + "│");
970
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
971
- console.log("");
972
- // ─── SECTION 5: CATEGORY BREAKDOWN ───
973
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
974
- console.log("│ 5. IMPACT BY SCENARIO CATEGORY │");
975
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
976
- console.log("│ Category Scenarios lite% core% full% Key delta │");
977
- console.log("│ ────────────── ───────── ───── ───── ───── ────────────────────── │");
978
- const categories = [...new Set(SCENARIOS.map((s) => s.category))];
979
- for (const cat of categories) {
980
- const catScenarios = SCENARIOS.filter((s) => s.category === cat);
981
- const catIds = new Set(catScenarios.map((s) => s.id));
982
- const count = catScenarios.length;
983
- const pctFor = (preset) => {
984
- const ts = allTrajectories.filter((t) => t.preset === preset && catIds.has(t.scenarioId));
985
- const completed = ts.reduce((s, t) => s + t.phasesCompleted, 0);
986
- const total = ts.reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
987
- return total > 0 ? Math.round(completed / total * 100) : 0;
988
- };
989
- const litePct = pctFor("lite");
990
- const corePct = pctFor("core");
991
- const fullPct = pctFor("full");
992
- let delta = "";
993
- if (litePct === corePct && corePct === fullPct)
994
- delta = "no difference";
995
- else if (litePct === corePct)
996
- delta = "parallel only";
997
- else if (corePct === fullPct)
998
- delta = "lite loses risk+flywheel";
999
- else
1000
- delta = `lite ${litePct}% → full ${fullPct}%`;
1001
- console.log(`│ ${cat.padEnd(15)}${String(count).padStart(5)} ${String(litePct).padStart(3)}% ${String(corePct).padStart(3)}% ${String(fullPct).padStart(3)}% ${delta}`.padEnd(79) + "│");
1002
- }
1003
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
1004
- console.log("");
1005
- // ─── SECTION 6: PER-SCENARIO DETAIL ───
1006
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
1007
- console.log("│ 6. PER-SCENARIO TRAJECTORY DETAIL │");
1008
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
1009
- console.log("│ Scenario Cat Cplx lite core full Issues Calls │");
1010
- console.log("│ ────────────────────────── ───────── ──── ──── ──── ──── ────── ───── │");
1011
- for (const s of SCENARIOS) {
1012
- const liteTr = allTrajectories.find((t) => t.preset === "lite" && t.scenarioId === s.id);
1013
- const coreTr = allTrajectories.find((t) => t.preset === "core" && t.scenarioId === s.id);
1014
- const fullTr = allTrajectories.find((t) => t.preset === "full" && t.scenarioId === s.id);
1015
- const lp = `${liteTr.phasesCompleted}/${liteTr.phasesCompleted + liteTr.phasesSkipped}`;
1016
- const cp = `${coreTr.phasesCompleted}/${coreTr.phasesCompleted + coreTr.phasesSkipped}`;
1017
- const fp = `${fullTr.phasesCompleted}/${fullTr.phasesCompleted + fullTr.phasesSkipped}`;
1018
- console.log(`│ ${s.id.slice(0, 26).padEnd(27)}${s.category.slice(0, 9).padEnd(10)}${s.complexity.slice(0, 3).toUpperCase().padEnd(5)}${lp.padEnd(5)}${cp.padEnd(5)}${fp.padEnd(6)}${String(fullTr.issuesDetected).padStart(4)} ${String(fullTr.totalToolCalls).padStart(5)}`.padEnd(79) + "│");
1019
- }
1020
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
1021
- console.log("");
1022
- // ─── SECTION 7: TOOL COVERAGE ───
1023
- console.log("┌──────────────────────────────────────────────────────────────────────────────┐");
1024
- console.log("│ 7. UNIQUE TOOLS EXERCISED PER PRESET │");
1025
- console.log("├──────────────────────────────────────────────────────────────────────────────┤");
1026
- for (const preset of ["meta", "lite", "core", "full"]) {
1027
- const successCalls = callLog.filter((c) => c.preset === preset && c.status === "success");
1028
- const uniqueTools = [...new Set(successCalls.map((c) => c.tool))];
1029
- const availableTools = buildToolset(preset).length;
1030
- const pct = Math.round(uniqueTools.length / availableTools * 100);
1031
- console.log(`│ ${preset.padEnd(6)} ${String(uniqueTools.length).padStart(3)} / ${String(availableTools).padStart(3)} tools exercised (${pct}%)`.padEnd(79) + "│");
1032
- }
1033
- const allSuccessCalls = callLog.filter((c) => c.status === "success");
1034
- const totalUnique = [...new Set(allSuccessCalls.map((c) => c.tool))];
1035
- console.log("│ │");
1036
- console.log(`│ Total unique tools exercised across all presets: ${totalUnique.length}`.padEnd(79) + "│");
1037
- console.log("└──────────────────────────────────────────────────────────────────────────────┘");
1038
- console.log("");
1039
- // ─── VERDICT ───
1040
- console.log("╔══════════════════════════════════════════════════════════════════════════════╗");
1041
- console.log("║ VERDICT ║");
1042
- console.log("╠══════════════════════════════════════════════════════════════════════════════╣");
1043
- console.log("║ ║");
1044
- const metaCompleted = allTrajectories.filter((t) => t.preset === "meta").reduce((s, t) => s + t.phasesCompleted, 0);
1045
- const metaTotal = allTrajectories.filter((t) => t.preset === "meta").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
1046
- const liteCompleted = allTrajectories.filter((t) => t.preset === "lite").reduce((s, t) => s + t.phasesCompleted, 0);
1047
- const liteTotal = allTrajectories.filter((t) => t.preset === "lite").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
1048
- const coreCompleted = allTrajectories.filter((t) => t.preset === "core").reduce((s, t) => s + t.phasesCompleted, 0);
1049
- const coreTotal = allTrajectories.filter((t) => t.preset === "core").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
1050
- const fullCompleted = allTrajectories.filter((t) => t.preset === "full").reduce((s, t) => s + t.phasesCompleted, 0);
1051
- const fullTotal = allTrajectories.filter((t) => t.preset === "full").reduce((s, t) => s + t.phasesCompleted + t.phasesSkipped, 0);
1052
- console.log(`║ meta: ${metaCompleted}/${metaTotal} phases (${Math.round(metaCompleted / metaTotal * 100)}%) — discovery only, 5 tools, minimal context`.padEnd(79) + "║");
1053
- console.log(`║ lite: ${liteCompleted}/${liteTotal} phases (${Math.round(liteCompleted / liteTotal * 100)}%) — ${savings}% fewer tokens, loses flywheel + parallel`.padEnd(79) + "║");
1054
- console.log(`║ core: ${coreCompleted}/${coreTotal} phases (${Math.round(coreCompleted / coreTotal * 100)}%) — full methodology loop, no parallel/vision/web`.padEnd(79) + "║");
1055
- console.log(`║ full: ${fullCompleted}/${fullTotal} phases (${Math.round(fullCompleted / fullTotal * 100)}%) — everything`.padEnd(79) + "║");
1056
- console.log("║ ║");
1057
- console.log("║ Recommendation: ║");
1058
- console.log("║ Discovery-first / front door → --preset meta (5 tools, self-escalate) ║");
1059
- console.log("║ Solo dev, standard tasks → --preset lite (fast, low token overhead) ║");
1060
- console.log("║ Team with methodology needs → --preset core (full flywheel loop) ║");
1061
- console.log("║ Multi-agent / full pipeline → --preset full (parallel + self-eval) ║");
1062
- console.log("║ ║");
1063
- console.log("╚══════════════════════════════════════════════════════════════════════════════╝");
1064
- // ─── ASSERTIONS ───
1065
- // meta preset: only meta phase succeeds (discovery-only gate)
1066
- {
1067
- const metaTrajectories = allTrajectories.filter((t) => t.preset === "meta");
1068
- for (const t of metaTrajectories) {
1069
- expect(t.phases.find((p) => p.phase === "meta")?.success).toBe(true);
1070
- expect(t.phasesCompleted).toBe(1);
1071
- expect(t.toolCount).toBe(3);
1072
- }
1073
- }
1074
- // lite, core, full: complete the 6 phases asserted below (meta, recon, verification, eval, quality-gate, knowledge)
1075
- for (const preset of ["lite", "core", "full"]) {
1076
- const trajectories = allTrajectories.filter((t) => t.preset === preset);
1077
- for (const t of trajectories) {
1078
- expect(t.phases.find((p) => p.phase === "meta")?.success).toBe(true);
1079
- expect(t.phases.find((p) => p.phase === "recon")?.success).toBe(true);
1080
- expect(t.phases.find((p) => p.phase === "verification")?.success).toBe(true);
1081
- expect(t.phases.find((p) => p.phase === "eval")?.success).toBe(true);
1082
- expect(t.phases.find((p) => p.phase === "quality-gate")?.success).toBe(true);
1083
- expect(t.phases.find((p) => p.phase === "knowledge")?.success).toBe(true);
1084
- }
1085
- }
1086
- // lite, core, full detect issues (core methodology is intact)
1087
- for (const preset of ["lite", "core", "full"]) {
1088
- const totalIssues = allTrajectories
1089
- .filter((t) => t.preset === preset)
1090
- .reduce((s, t) => s + t.issuesDetected, 0);
1091
- expect(totalIssues).toBeGreaterThanOrEqual(10); // at least 10 across 9 scenarios
1092
- }
1093
- // Full preset has more tool calls (parallel + self-eval phases add calls)
1094
- const fullCalls = allTrajectories.filter((t) => t.preset === "full").reduce((s, t) => s + t.totalToolCalls, 0);
1095
- const liteCalls = allTrajectories.filter((t) => t.preset === "lite").reduce((s, t) => s + t.totalToolCalls, 0);
1096
- expect(fullCalls).toBeGreaterThan(liteCalls);
1097
- });
1098
- });
1099
- //# sourceMappingURL=toolsetGatingEval.test.js.map