nodebench-mcp 2.70.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361) hide show
  1. package/README.md +95 -41
  2. package/dist/agents/alertRouter.d.ts +38 -0
  3. package/dist/agents/alertRouter.js +151 -0
  4. package/dist/agents/alertRouter.js.map +1 -0
  5. package/dist/agents/entityMemory.d.ts +40 -0
  6. package/dist/agents/entityMemory.js +64 -0
  7. package/dist/agents/entityMemory.js.map +1 -0
  8. package/dist/agents/subAgents.d.ts +35 -0
  9. package/dist/agents/subAgents.js +62 -0
  10. package/dist/agents/subAgents.js.map +1 -0
  11. package/dist/benchmarks/benchmarkRunner.js +14 -0
  12. package/dist/benchmarks/benchmarkRunner.js.map +1 -1
  13. package/dist/benchmarks/chainEval.js +107 -0
  14. package/dist/benchmarks/chainEval.js.map +1 -1
  15. package/dist/benchmarks/llmJudgeEval.js +85 -0
  16. package/dist/benchmarks/llmJudgeEval.js.map +1 -1
  17. package/dist/benchmarks/searchQualityEval.js +118 -5
  18. package/dist/benchmarks/searchQualityEval.js.map +1 -1
  19. package/dist/cli/search.d.ts +13 -0
  20. package/dist/cli/search.js +130 -0
  21. package/dist/cli/search.js.map +1 -0
  22. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  23. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  24. package/dist/dashboard/operatingServer.js +3 -2
  25. package/dist/dashboard/operatingServer.js.map +1 -1
  26. package/dist/db.d.ts +6 -2
  27. package/dist/db.js +521 -6
  28. package/dist/db.js.map +1 -1
  29. package/dist/index.js +349 -67
  30. package/dist/index.js.map +1 -1
  31. package/dist/packageInfo.d.ts +3 -0
  32. package/dist/packageInfo.js +32 -0
  33. package/dist/packageInfo.js.map +1 -0
  34. package/dist/profiler/behaviorStore.d.ts +97 -0
  35. package/dist/profiler/behaviorStore.js +276 -0
  36. package/dist/profiler/behaviorStore.js.map +1 -0
  37. package/dist/profiler/eventCollector.d.ts +119 -0
  38. package/dist/profiler/eventCollector.js +267 -0
  39. package/dist/profiler/eventCollector.js.map +1 -0
  40. package/dist/profiler/index.d.ts +15 -0
  41. package/dist/profiler/index.js +16 -0
  42. package/dist/profiler/index.js.map +1 -0
  43. package/dist/profiler/mcpProxy.d.ts +49 -0
  44. package/dist/profiler/mcpProxy.js +123 -0
  45. package/dist/profiler/mcpProxy.js.map +1 -0
  46. package/dist/profiler/modelRouter.d.ts +30 -0
  47. package/dist/profiler/modelRouter.js +99 -0
  48. package/dist/profiler/modelRouter.js.map +1 -0
  49. package/dist/profiler/otelReceiver.d.ts +17 -0
  50. package/dist/profiler/otelReceiver.js +62 -0
  51. package/dist/profiler/otelReceiver.js.map +1 -0
  52. package/dist/profiler/proofEngine.d.ts +41 -0
  53. package/dist/profiler/proofEngine.js +93 -0
  54. package/dist/profiler/proofEngine.js.map +1 -0
  55. package/dist/profiler/workflowTemplates.d.ts +41 -0
  56. package/dist/profiler/workflowTemplates.js +95 -0
  57. package/dist/profiler/workflowTemplates.js.map +1 -0
  58. package/dist/providers/localMemoryProvider.js +3 -2
  59. package/dist/providers/localMemoryProvider.js.map +1 -1
  60. package/dist/runtimeConfig.d.ts +11 -0
  61. package/dist/runtimeConfig.js +27 -0
  62. package/dist/runtimeConfig.js.map +1 -0
  63. package/dist/sandboxApi.js +2 -1
  64. package/dist/sandboxApi.js.map +1 -1
  65. package/dist/security/auditLog.js +8 -3
  66. package/dist/security/auditLog.js.map +1 -1
  67. package/dist/subconscious/blocks.d.ts +43 -0
  68. package/dist/subconscious/blocks.js +158 -0
  69. package/dist/subconscious/blocks.js.map +1 -0
  70. package/dist/subconscious/classifier.d.ts +22 -0
  71. package/dist/subconscious/classifier.js +118 -0
  72. package/dist/subconscious/classifier.js.map +1 -0
  73. package/dist/subconscious/graphEngine.d.ts +65 -0
  74. package/dist/subconscious/graphEngine.js +234 -0
  75. package/dist/subconscious/graphEngine.js.map +1 -0
  76. package/dist/subconscious/index.d.ts +19 -0
  77. package/dist/subconscious/index.js +20 -0
  78. package/dist/subconscious/index.js.map +1 -0
  79. package/dist/subconscious/tools.d.ts +5 -0
  80. package/dist/subconscious/tools.js +255 -0
  81. package/dist/subconscious/tools.js.map +1 -0
  82. package/dist/subconscious/whisperPolicy.d.ts +20 -0
  83. package/dist/subconscious/whisperPolicy.js +171 -0
  84. package/dist/subconscious/whisperPolicy.js.map +1 -0
  85. package/dist/sweep/engine.d.ts +27 -0
  86. package/dist/sweep/engine.js +244 -0
  87. package/dist/sweep/engine.js.map +1 -0
  88. package/dist/sweep/index.d.ts +9 -0
  89. package/dist/sweep/index.js +8 -0
  90. package/dist/sweep/index.js.map +1 -0
  91. package/dist/sweep/sources/github_trending.d.ts +6 -0
  92. package/dist/sweep/sources/github_trending.js +37 -0
  93. package/dist/sweep/sources/github_trending.js.map +1 -0
  94. package/dist/sweep/sources/hackernews.d.ts +7 -0
  95. package/dist/sweep/sources/hackernews.js +57 -0
  96. package/dist/sweep/sources/hackernews.js.map +1 -0
  97. package/dist/sweep/sources/openbb_finance.d.ts +9 -0
  98. package/dist/sweep/sources/openbb_finance.js +46 -0
  99. package/dist/sweep/sources/openbb_finance.js.map +1 -0
  100. package/dist/sweep/sources/producthunt.d.ts +6 -0
  101. package/dist/sweep/sources/producthunt.js +41 -0
  102. package/dist/sweep/sources/producthunt.js.map +1 -0
  103. package/dist/sweep/sources/web_signals.d.ts +7 -0
  104. package/dist/sweep/sources/web_signals.js +63 -0
  105. package/dist/sweep/sources/web_signals.js.map +1 -0
  106. package/dist/sweep/sources/yahoo_finance.d.ts +6 -0
  107. package/dist/sweep/sources/yahoo_finance.js +47 -0
  108. package/dist/sweep/sources/yahoo_finance.js.map +1 -0
  109. package/dist/sweep/types.d.ts +50 -0
  110. package/dist/sweep/types.js +9 -0
  111. package/dist/sweep/types.js.map +1 -0
  112. package/dist/sync/founderEpisodeStore.d.ts +98 -0
  113. package/dist/sync/founderEpisodeStore.js +230 -0
  114. package/dist/sync/founderEpisodeStore.js.map +1 -0
  115. package/dist/sync/hyperloopArchive.d.ts +51 -0
  116. package/dist/sync/hyperloopArchive.js +153 -0
  117. package/dist/sync/hyperloopArchive.js.map +1 -0
  118. package/dist/sync/hyperloopEval.d.ts +123 -0
  119. package/dist/sync/hyperloopEval.js +389 -0
  120. package/dist/sync/hyperloopEval.js.map +1 -0
  121. package/dist/sync/protocol.d.ts +172 -0
  122. package/dist/sync/protocol.js +9 -0
  123. package/dist/sync/protocol.js.map +1 -0
  124. package/dist/sync/sessionMemory.d.ts +47 -0
  125. package/dist/sync/sessionMemory.js +138 -0
  126. package/dist/sync/sessionMemory.js.map +1 -0
  127. package/dist/sync/store.d.ts +384 -0
  128. package/dist/sync/store.js +1435 -0
  129. package/dist/sync/store.js.map +1 -0
  130. package/dist/sync/syncBridgeClient.d.ts +30 -0
  131. package/dist/sync/syncBridgeClient.js +172 -0
  132. package/dist/sync/syncBridgeClient.js.map +1 -0
  133. package/dist/tools/autonomousDeliveryTools.d.ts +2 -0
  134. package/dist/tools/autonomousDeliveryTools.js +1104 -0
  135. package/dist/tools/autonomousDeliveryTools.js.map +1 -0
  136. package/dist/tools/boilerplateTools.js +10 -9
  137. package/dist/tools/boilerplateTools.js.map +1 -1
  138. package/dist/tools/claudeCodeIngestTools.d.ts +10 -0
  139. package/dist/tools/claudeCodeIngestTools.js +347 -0
  140. package/dist/tools/claudeCodeIngestTools.js.map +1 -0
  141. package/dist/tools/coreWorkflowTools.d.ts +2 -0
  142. package/dist/tools/coreWorkflowTools.js +488 -0
  143. package/dist/tools/coreWorkflowTools.js.map +1 -0
  144. package/dist/tools/deltaTools.d.ts +15 -0
  145. package/dist/tools/deltaTools.js +1522 -0
  146. package/dist/tools/deltaTools.js.map +1 -0
  147. package/dist/tools/documentationTools.js +2 -1
  148. package/dist/tools/documentationTools.js.map +1 -1
  149. package/dist/tools/entityLookupTools.d.ts +14 -0
  150. package/dist/tools/entityLookupTools.js +159 -0
  151. package/dist/tools/entityLookupTools.js.map +1 -0
  152. package/dist/tools/entityTemporalTools.d.ts +12 -0
  153. package/dist/tools/entityTemporalTools.js +330 -0
  154. package/dist/tools/entityTemporalTools.js.map +1 -0
  155. package/dist/tools/founderLocalPipeline.d.ts +215 -0
  156. package/dist/tools/founderLocalPipeline.js +1516 -2
  157. package/dist/tools/founderLocalPipeline.js.map +1 -1
  158. package/dist/tools/founderOperatingModel.d.ts +120 -0
  159. package/dist/tools/founderOperatingModel.js +469 -0
  160. package/dist/tools/founderOperatingModel.js.map +1 -0
  161. package/dist/tools/founderOperatingModelTools.d.ts +2 -0
  162. package/dist/tools/founderOperatingModelTools.js +169 -0
  163. package/dist/tools/founderOperatingModelTools.js.map +1 -0
  164. package/dist/tools/founderStrategicOpsTools.d.ts +2 -0
  165. package/dist/tools/founderStrategicOpsTools.js +1310 -0
  166. package/dist/tools/founderStrategicOpsTools.js.map +1 -0
  167. package/dist/tools/graphifyTools.d.ts +19 -0
  168. package/dist/tools/graphifyTools.js +375 -0
  169. package/dist/tools/graphifyTools.js.map +1 -0
  170. package/dist/tools/index.d.ts +3 -0
  171. package/dist/tools/index.js +4 -0
  172. package/dist/tools/index.js.map +1 -1
  173. package/dist/tools/monteCarloTools.d.ts +16 -0
  174. package/dist/tools/monteCarloTools.js +225 -0
  175. package/dist/tools/monteCarloTools.js.map +1 -0
  176. package/dist/tools/packetCompilerTools.d.ts +12 -0
  177. package/dist/tools/packetCompilerTools.js +322 -0
  178. package/dist/tools/packetCompilerTools.js.map +1 -0
  179. package/dist/tools/planSynthesisTools.d.ts +15 -0
  180. package/dist/tools/planSynthesisTools.js +455 -0
  181. package/dist/tools/planSynthesisTools.js.map +1 -0
  182. package/dist/tools/profilerTools.d.ts +20 -0
  183. package/dist/tools/profilerTools.js +364 -0
  184. package/dist/tools/profilerTools.js.map +1 -0
  185. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  186. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  187. package/dist/tools/savingsTools.d.ts +11 -0
  188. package/dist/tools/savingsTools.js +155 -0
  189. package/dist/tools/savingsTools.js.map +1 -0
  190. package/dist/tools/scenarioCompilerTools.d.ts +14 -0
  191. package/dist/tools/scenarioCompilerTools.js +290 -0
  192. package/dist/tools/scenarioCompilerTools.js.map +1 -0
  193. package/dist/tools/sharedContextTools.d.ts +2 -0
  194. package/dist/tools/sharedContextTools.js +423 -0
  195. package/dist/tools/sharedContextTools.js.map +1 -0
  196. package/dist/tools/sitemapTools.d.ts +15 -0
  197. package/dist/tools/sitemapTools.js +560 -0
  198. package/dist/tools/sitemapTools.js.map +1 -0
  199. package/dist/tools/sweepTools.d.ts +9 -0
  200. package/dist/tools/sweepTools.js +112 -0
  201. package/dist/tools/sweepTools.js.map +1 -0
  202. package/dist/tools/syncBridgeTools.d.ts +2 -0
  203. package/dist/tools/syncBridgeTools.js +258 -0
  204. package/dist/tools/syncBridgeTools.js.map +1 -0
  205. package/dist/tools/toolRegistry.js +1223 -45
  206. package/dist/tools/toolRegistry.js.map +1 -1
  207. package/dist/tools/workspaceTools.d.ts +19 -0
  208. package/dist/tools/workspaceTools.js +762 -0
  209. package/dist/tools/workspaceTools.js.map +1 -0
  210. package/dist/toolsetRegistry.js +162 -3
  211. package/dist/toolsetRegistry.js.map +1 -1
  212. package/package.json +39 -38
  213. package/rules/nodebench-agentic-reliability.md +32 -0
  214. package/rules/nodebench-analyst-diagnostic.md +25 -0
  215. package/rules/nodebench-auto-qa.md +31 -0
  216. package/rules/nodebench-completion-traceability.md +22 -0
  217. package/rules/nodebench-flywheel-continuous.md +25 -0
  218. package/rules/nodebench-pre-release-review.md +24 -0
  219. package/rules/nodebench-qa-dogfood.md +26 -0
  220. package/rules/nodebench-scenario-testing.md +30 -0
  221. package/rules/nodebench-self-direction.md +23 -0
  222. package/rules/nodebench-self-judge-loop.md +24 -0
  223. package/scripts/install.sh +215 -0
  224. package/dist/__tests__/analytics.test.d.ts +0 -11
  225. package/dist/__tests__/analytics.test.js +0 -546
  226. package/dist/__tests__/analytics.test.js.map +0 -1
  227. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  228. package/dist/__tests__/architectComplex.test.js +0 -373
  229. package/dist/__tests__/architectComplex.test.js.map +0 -1
  230. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  231. package/dist/__tests__/architectSmoke.test.js +0 -92
  232. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  233. package/dist/__tests__/audit-registry.d.ts +0 -1
  234. package/dist/__tests__/audit-registry.js +0 -60
  235. package/dist/__tests__/audit-registry.js.map +0 -1
  236. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  237. package/dist/__tests__/batchAutopilot.test.js +0 -218
  238. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  239. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  240. package/dist/__tests__/cliSubcommands.test.js +0 -138
  241. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  242. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  243. package/dist/__tests__/comparativeBench.test.js +0 -722
  244. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  245. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  246. package/dist/__tests__/critterCalibrationEval.js +0 -370
  247. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  248. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  249. package/dist/__tests__/dynamicLoading.test.js +0 -280
  250. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  251. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  252. package/dist/__tests__/embeddingProvider.test.js +0 -86
  253. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  254. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  255. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  256. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  257. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  258. package/dist/__tests__/evalHarness.test.js +0 -1107
  259. package/dist/__tests__/evalHarness.test.js.map +0 -1
  260. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  261. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  262. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  263. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  264. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  265. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  266. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  267. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  268. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  269. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  270. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  271. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  272. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  273. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  274. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  275. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  276. package/dist/__tests__/forecastingScoring.test.js +0 -202
  277. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  278. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  279. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  280. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  281. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  282. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  283. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  284. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  285. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  286. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  287. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  288. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  289. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  290. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  291. package/dist/__tests__/helpers/answerMatch.js +0 -267
  292. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  293. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  294. package/dist/__tests__/helpers/textLlm.js +0 -214
  295. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  296. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  297. package/dist/__tests__/localDashboard.test.js +0 -226
  298. package/dist/__tests__/localDashboard.test.js.map +0 -1
  299. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  300. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  301. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  302. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  303. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  304. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  305. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  306. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  307. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  308. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  309. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  310. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  311. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  312. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  313. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  314. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  315. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  316. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  317. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  318. package/dist/__tests__/openclawDogfood.test.js +0 -535
  319. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  320. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  321. package/dist/__tests__/openclawMessaging.test.js +0 -232
  322. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  323. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  324. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  325. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  326. package/dist/__tests__/tools.test.d.ts +0 -1
  327. package/dist/__tests__/tools.test.js +0 -3201
  328. package/dist/__tests__/tools.test.js.map +0 -1
  329. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  330. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  331. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  332. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  333. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  334. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  335. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  336. package/dist/__tests__/webmcpTools.test.js +0 -195
  337. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  338. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  339. package/dist/benchmarks/testProviderBus.js +0 -272
  340. package/dist/benchmarks/testProviderBus.js.map +0 -1
  341. package/dist/hooks/postCompaction.d.ts +0 -14
  342. package/dist/hooks/postCompaction.js +0 -51
  343. package/dist/hooks/postCompaction.js.map +0 -1
  344. package/dist/security/__tests__/security.test.d.ts +0 -8
  345. package/dist/security/__tests__/security.test.js +0 -295
  346. package/dist/security/__tests__/security.test.js.map +0 -1
  347. package/dist/tools/documentTools.d.ts +0 -5
  348. package/dist/tools/documentTools.js +0 -524
  349. package/dist/tools/documentTools.js.map +0 -1
  350. package/dist/tools/financialTools.d.ts +0 -10
  351. package/dist/tools/financialTools.js +0 -403
  352. package/dist/tools/financialTools.js.map +0 -1
  353. package/dist/tools/memoryTools.d.ts +0 -5
  354. package/dist/tools/memoryTools.js +0 -137
  355. package/dist/tools/memoryTools.js.map +0 -1
  356. package/dist/tools/planningTools.d.ts +0 -5
  357. package/dist/tools/planningTools.js +0 -147
  358. package/dist/tools/planningTools.js.map +0 -1
  359. package/dist/tools/searchTools.d.ts +0 -5
  360. package/dist/tools/searchTools.js +0 -145
  361. package/dist/tools/searchTools.js.map +0 -1
@@ -1,914 +0,0 @@
1
- /**
2
- * GAIA file-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local file tools.
3
- *
4
- * This lane targets GAIA tasks that include attachments (PDF / XLSX / CSV).
5
- * We provide deterministic local parsing via NodeBench MCP tools and score answers against
6
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
7
- *
8
- * Safety:
9
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
10
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
11
- *
12
- * Disabled by default (cost + rate limits). Run with:
13
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
14
- */
15
- import { describe, expect, it } from "vitest";
16
- import { existsSync, readFileSync } from "node:fs";
17
- import { mkdir, readFile, writeFile } from "node:fs/promises";
18
- import path from "node:path";
19
- import { fileURLToPath } from "node:url";
20
- import { performance } from "node:perf_hooks";
21
- import { localFileTools } from "../tools/localFileTools.js";
22
- import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
23
- import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
24
// Opt-in switch for this benchmark lane: true only when the environment
// variable is exactly the string "1".
const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
// Second opt-in switch, also true only for the exact string "1".
// NOTE(review): presumably gates writing a JSON report; the usage is outside this view.
const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
26
/**
 * Best-effort JSON writer.
 *
 * Creates the parent directory of `filePath` (recursively), then writes
 * `payload` as 2-space-indented JSON followed by a newline. Any failure is
 * reported with `console.warn` and is never rethrown.
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} payload - Value to serialize with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function safeWriteJson(filePath, payload) {
  try {
    const parentDir = path.dirname(filePath);
    await mkdir(parentDir, { recursive: true });
    const body = `${JSON.stringify(payload, null, 2)}\n`;
    await writeFile(filePath, body, "utf8");
  } catch (error) {
    // Deliberately swallowed after logging: a failed write is only warned about.
    const reason = error?.message ?? String(error);
    console.warn(`[gaia-capability-files] report write failed: ${reason}`);
  }
}
35
/**
 * Returns the absolute path four directory levels above the directory that
 * contains this module file.
 *
 * @returns {string} Absolute path used as the repository root by the other helpers.
 */
function resolveRepoRoot() {
  const thisFile = fileURLToPath(import.meta.url);
  const thisDir = path.dirname(thisFile);
  return path.resolve(thisDir, "..", "..", "..", "..");
}
39
/**
 * Resolves the path of the GAIA capability-files fixture.
 *
 * Resolution order:
 *  1. `NODEBENCH_GAIA_CAPABILITY_FILES_FIXTURE_PATH`, when non-empty: returned
 *     as-is if absolute, otherwise resolved against the repo root.
 *  2. `<repoRoot>/.cache/gaia/gaia_capability_files_<config>_<split>.sample.json`,
 *     where `<config>` comes from `NODEBENCH_GAIA_CAPABILITY_CONFIG`
 *     (default "2023_all") and `<split>` from `NODEBENCH_GAIA_CAPABILITY_SPLIT`
 *     (default "validation").
 *
 * @returns {string} Fixture file path.
 */
function resolveCapabilityFilesFixturePath() {
  const explicitPath = process.env.NODEBENCH_GAIA_CAPABILITY_FILES_FIXTURE_PATH;
  if (explicitPath) {
    return path.isAbsolute(explicitPath)
      ? explicitPath
      : path.resolve(resolveRepoRoot(), explicitPath);
  }
  const configName = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
  const splitName = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
  const fixtureName = `gaia_capability_files_${configName}_${splitName}.sample.json`;
  return path.join(resolveRepoRoot(), ".cache", "gaia", fixtureName);
}
52
/**
 * Loads `KEY=VALUE` pairs from `<repoRoot>/.env.local` into `process.env`.
 *
 * - Does nothing when the file does not exist.
 * - Blank lines, lines starting with `#`, and lines without a key before `=`
 *   are skipped.
 * - A value wrapped in a matching pair of double or single quotes has the
 *   quotes removed.
 * - Variables that already hold a non-empty value in `process.env` are kept;
 *   unset or empty ones are filled in.
 *
 * @returns {void}
 */
function loadDotEnvLocalIfPresent() {
  const envPath = path.join(resolveRepoRoot(), ".env.local");
  if (!existsSync(envPath)) {
    return;
  }
  const text = readFileSync(envPath, "utf8");
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) {
      continue;
    }
    const idx = line.indexOf("=");
    if (idx <= 0) {
      continue;
    }
    const key = line.slice(0, idx).trim();
    let value = line.slice(idx + 1).trim();
    // Only strip when there is a real opening AND closing quote. A value that
    // is a single quote character starts and ends with the same character, so
    // it must not be treated as a quoted (empty) string.
    const quote = value[0];
    const isQuoted =
      value.length >= 2 && (quote === "\"" || quote === "'") && value.endsWith(quote);
    if (isQuoted) {
      value = value.slice(1, -1);
    }
    if (!process.env[key]) {
      process.env[key] = value;
    }
  }
}
75
/**
 * Leniently converts a value to an integer.
 *
 * - `null` / `undefined` -> `null`
 * - finite numbers -> truncated toward zero
 * - anything else -> stringified, and the first run of digits (with an
 *   optional leading `-`) is parsed in base 10; `null` when there is none.
 *
 * @param {unknown} value - Value to convert.
 * @returns {number | null} The integer, or `null` when none could be found.
 */
function toIntegerOrNullLoose(value) {
  if (value == null) {
    return null;
  }
  if (typeof value === "number" && Number.isFinite(value)) {
    return Math.trunc(value);
  }
  const firstInteger = /-?\d+/.exec(String(value).trim());
  if (firstInteger === null) {
    return null;
  }
  const parsed = Number.parseInt(firstInteger[0], 10);
  return Number.isFinite(parsed) ? parsed : null;
}
89
/**
 * Counts even and odd integers in the "address" column of a tabular extract,
 * but only when the prompt mentions odd/even/parity.
 *
 * @param {unknown} taskPrompt - Task prompt; stringified before matching.
 * @param {{ headers?: unknown[], rows?: unknown[] } | null | undefined} extract -
 *   Extract with a `headers` array and a `rows` array; only rows that are
 *   themselves arrays contribute a value.
 * @returns {{ column: string, columnIndex: number, integerCount: number,
 *   evenCount: number, oddCount: number } | null} Counts for the first header
 *   matching /address/i, or `null` when the prompt is not about parity, the
 *   extract has no headers or rows, or no such header exists.
 */
function deriveAddressParityIfRelevant(taskPrompt, extract) {
  const promptText = String(taskPrompt ?? "");
  const parityPattern = /\bodd\b|\beven\b|odd-?numbered|even-?numbered|parity/i;
  if (!parityPattern.test(promptText)) {
    return null;
  }

  const headerNames = Array.isArray(extract?.headers)
    ? extract.headers.map((header) => String(header ?? ""))
    : [];
  const tableRows = Array.isArray(extract?.rows) ? extract.rows : [];
  if (headerNames.length === 0 || tableRows.length === 0) {
    return null;
  }

  const columnIndex = headerNames.findIndex((name) => /address/i.test(name));
  if (columnIndex < 0) {
    return null;
  }

  // Cells that yield no integer (and rows that are not arrays) are left out.
  const numbers = tableRows
    .map((row) => toIntegerOrNullLoose(Array.isArray(row) ? row[columnIndex] : null))
    .filter((candidate) => candidate !== null);
  const evenCount = numbers.filter((candidate) => Math.abs(candidate) % 2 === 0).length;

  return {
    column: headerNames[columnIndex],
    columnIndex,
    integerCount: numbers.length,
    evenCount,
    oddCount: numbers.length - evenCount,
  };
}
121
/**
 * Tries to answer a "which houses see the sunrise/sunset" style question
 * directly from precomputed even/odd address counts.
 *
 * Heuristic, in order:
 *  1. The wanted direction is "east" when "sunrise" is the last sun event
 *     mentioned in the prompt and "west" when "sunset" is.
 *  2. If the prompt mentions both "back" and "house", the direction is
 *     inverted.
 *  3. Each parity word ("odd" / "even") is mapped to the FIRST direction
 *     ("east" / "west") that follows it within the same sentence.
 *  4. The count of the parity facing the wanted direction is returned.
 *
 * @param {unknown} taskPrompt - Task prompt; stringified and lower-cased.
 * @param {{ evenCount: number, oddCount: number }} parity - Even/odd counts.
 * @returns {string | null} The matching count as a string, or `null` when the
 *   prompt does not provide enough information to decide.
 */
function inferAnswerFromAddressParityIfPossible(taskPrompt, parity) {
  const p = String(taskPrompt ?? "").toLowerCase();
  if (!p) {
    return null;
  }

  const lastSunrise = p.lastIndexOf("sunrise");
  const lastSunset = p.lastIndexOf("sunset");
  // Two different words cannot share an index, so equality means both are -1.
  if (lastSunrise === lastSunset) {
    return null;
  }
  // If both appear, assume the one mentioned last is what the question asks for.
  let desiredDirection = lastSunrise > lastSunset ? "east" : "west";

  // Some tasks specify the awning is for the *back* of the house, while the
  // prompt gives the facing direction for the street address (front). In that
  // case, invert the facing direction.
  const mentionsBackOfHouse = /\bback\b/.test(p) && /\bhouse\b/.test(p);
  if (mentionsBackOfHouse) {
    desiredDirection = desiredDirection === "east" ? "west" : "east";
  }

  // Map parity -> facing direction. The lazy quantifier picks the nearest
  // direction after the parity word; a greedy match would let "even ... west
  // and odd ... east" resolve "even" to "east".
  const oddFaces = /odd[^.]*?(east|west)/.exec(p)?.[1] ?? null;
  const evenFaces = /even[^.]*?(east|west)/.exec(p)?.[1] ?? null;
  if (!oddFaces || !evenFaces) {
    return null;
  }
  // If both parities resolve to the same direction, parity cannot decide.
  if (oddFaces === evenFaces) {
    return null;
  }

  return evenFaces === desiredDirection
    ? String(parity.evenCount)
    : String(parity.oddCount);
}
157
/**
 * Calls `generateTextFromHistory` with this benchmark's generation settings.
 *
 * The temperature is parsed from `NODEBENCH_GAIA_CAPABILITY_TEMPERATURE`
 * (default "0"; a value that does not parse to a finite number becomes 0),
 * and the output is capped at 1024 tokens.
 *
 * @param {unknown} llm - Client passed through to `generateTextFromHistory`.
 * @param {unknown} history - Conversation contents, passed through unchanged.
 * @returns {Promise<unknown>} Whatever `generateTextFromHistory` resolves to.
 */
async function llmGenerateText(llm, history) {
  const rawTemperature = process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0";
  const parsedTemperature = Number.parseFloat(rawTemperature);
  const options = {
    temperature: Number.isFinite(parsedTemperature) ? parsedTemperature : 0,
    maxOutputTokens: 1024,
  };
  return generateTextFromHistory(llm, history, options);
}
164
/**
 * Asks the model to answer a task with no file extract in the prompt.
 *
 * Builds a single user turn that tells the model to rely on existing
 * knowledge, not browse, and return only the final answer, followed by
 * `task.prompt`.
 *
 * @param {unknown} llm - Client passed through to `llmGenerateText`.
 * @param {{ prompt: unknown }} task - Task whose `prompt` is the question.
 * @returns {Promise<unknown>} Whatever `llmGenerateText` resolves to.
 */
async function baselineAnswer(llm, task) {
  const promptText = [
    "Answer the question using your existing knowledge only. Do not browse the web.",
    "Return ONLY the final answer, no explanation.",
    `Question:\n${task.prompt}`,
  ].join("\n\n");
  const userTurn = { role: "user", parts: [{ text: promptText }] };
  return llmGenerateText(llm, [userTurn]);
}
177
/**
 * Builds a name -> tool lookup over `localFileTools`.
 *
 * When two tools share a name, the later one wins.
 *
 * @returns {Map<unknown, unknown>} Map keyed by each tool's `name`.
 */
function buildToolIndex() {
  return new Map(Array.from(localFileTools, (tool) => [tool.name, tool]));
}
183
/**
 * Pulls a JSON object out of free-form model output.
 *
 * If the text contains a ```json fenced block, only the fence content is
 * considered; otherwise the whole trimmed text is. The span from the first
 * `{` to the last `}` is then parsed.
 *
 * @param {string} text - Raw text that may contain a JSON object.
 * @returns {any | null} The parsed value, or `null` when there is no
 *   brace-delimited span or it is not valid JSON.
 */
function extractJsonObject(text) {
  const trimmed = text.trim();
  const fenced = /```json\s*([\s\S]*?)\s*```/i.exec(trimmed);
  const candidate = fenced ? fenced[1] : trimmed;

  const open = candidate.indexOf("{");
  const close = candidate.lastIndexOf("}");
  // Covers a missing "{", a missing "}", and a "}" that precedes the "{".
  if (open === -1 || close <= open) {
    return null;
  }

  try {
    return JSON.parse(candidate.slice(open, close + 1));
  } catch {
    return null;
  }
}
199
/**
 * Resolves where a task's attachment lives on disk.
 *
 * `task.localFilePath`, when non-blank, is resolved against the repo root.
 * Otherwise `task.filePath` is placed under `<repoRoot>/.cache/gaia/data`.
 *
 * @param {{ localFilePath?: unknown, filePath?: unknown }} task - Task record.
 * @returns {string} Path to the attachment.
 * @throws {Error} When both `localFilePath` and `filePath` are missing or blank.
 */
function resolveTaskLocalFilePath(task) {
  const root = resolveRepoRoot();

  const explicitPath = String(task.localFilePath ?? "").trim();
  if (explicitPath) {
    return path.resolve(root, explicitPath);
  }

  // Fallback to the standard cache layout used by the fixture generator.
  const cachedName = String(task.filePath ?? "").trim();
  if (!cachedName) {
    throw new Error("Task missing filePath/localFilePath");
  }
  return path.join(root, ".cache", "gaia", "data", cachedName);
}
210
- async function toolAugmentedAnswerFromFile(llm, task, opts) {
211
- const toolIndex = buildToolIndex();
212
- const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
213
- const localPath = resolveTaskLocalFilePath(task);
214
- if (!existsSync(localPath)) {
215
- throw new Error(`Missing attachment on disk. Expected at ${localPath}. Refresh with dataset:gaia:capability:files:refresh`);
216
- }
217
- const ext = String(task.fileExt ?? "").trim().toLowerCase() ||
218
- path.extname(task.fileName || task.filePath || "").toLowerCase().replace(/^\./, "");
219
- // "rag" mode: single deterministic file extract -> answer (more stable than agent loops).
220
- if (toolsMode === "rag") {
221
- let extract;
222
- if (ext === "csv") {
223
- const tool = toolIndex.get("read_csv_file");
224
- if (!tool)
225
- throw new Error("Missing tool: read_csv_file");
226
- extract = await tool.handler({
227
- path: localPath,
228
- hasHeader: true,
229
- maxRows: 500,
230
- maxCols: 80,
231
- maxCellChars: 2000,
232
- });
233
- }
234
- else if (ext === "xlsx") {
235
- const tool = toolIndex.get("read_xlsx_file");
236
- if (!tool)
237
- throw new Error("Missing tool: read_xlsx_file");
238
- extract = await tool.handler({
239
- path: localPath,
240
- headerRow: 1,
241
- maxRows: 500,
242
- maxCols: 80,
243
- maxCellChars: 2000,
244
- });
245
- }
246
- else if (ext === "pdf") {
247
- const tool = toolIndex.get("read_pdf_text");
248
- if (!tool)
249
- throw new Error("Missing tool: read_pdf_text");
250
- extract = await tool.handler({
251
- path: localPath,
252
- pageStart: 1,
253
- pageEnd: 12,
254
- maxChars: 40000,
255
- });
256
- }
257
- else if (ext === "docx") {
258
- const tool = toolIndex.get("read_docx_text");
259
- if (!tool)
260
- throw new Error("Missing tool: read_docx_text");
261
- extract = await tool.handler({
262
- path: localPath,
263
- maxChars: 40000,
264
- });
265
- }
266
- else if (ext === "pptx") {
267
- const tool = toolIndex.get("read_pptx_text");
268
- if (!tool)
269
- throw new Error("Missing tool: read_pptx_text");
270
- extract = await tool.handler({
271
- path: localPath,
272
- maxChars: 40000,
273
- });
274
- }
275
- else if (ext === "json") {
276
- const tool = toolIndex.get("read_json_file");
277
- if (!tool)
278
- throw new Error("Missing tool: read_json_file");
279
- extract = await tool.handler({
280
- path: localPath,
281
- maxDepth: 10,
282
- maxItems: 300,
283
- maxStringChars: 2000,
284
- });
285
- }
286
- else if (ext === "jsonl") {
287
- const tool = toolIndex.get("read_jsonl_file");
288
- if (!tool)
289
- throw new Error("Missing tool: read_jsonl_file");
290
- extract = await tool.handler({
291
- path: localPath,
292
- offsetLines: 0,
293
- limitLines: 200,
294
- parseJson: true,
295
- maxDepth: 8,
296
- maxItems: 200,
297
- maxStringChars: 1000,
298
- });
299
- }
300
- else if (ext === "txt" || ext === "md" || ext === "xml") {
301
- const tool = toolIndex.get("read_text_file");
302
- if (!tool)
303
- throw new Error("Missing tool: read_text_file");
304
- extract = await tool.handler({
305
- path: localPath,
306
- startChar: 0,
307
- maxChars: 40000,
308
- });
309
- }
310
- else if (ext === "zip") {
311
- throw new Error('ZIP attachments are only supported in toolsMode="agent" (requires multi-step extraction).');
312
- }
313
- else {
314
- throw new Error(`Unsupported attachment type: ${ext || "(unknown)"}`);
315
- }
316
- // Keep the model input bounded. The tools already return bounded previews,
317
- // but JSON stringification can still be large on wide tables.
318
- const derivedParity = deriveAddressParityIfRelevant(task.prompt, extract);
319
- const inferredFromParity = derivedParity && (ext === "csv" || ext === "xlsx")
320
- ? inferAnswerFromAddressParityIfPossible(task.prompt, derivedParity)
321
- : null;
322
- if (inferredFromParity) {
323
- return { answer: inferredFromParity, toolCalls: 1 };
324
- }
325
- const enrichedExtract = derivedParity && (ext === "csv" || ext === "xlsx")
326
- ? { ...extract, derivedParity: { address: derivedParity } }
327
- : extract;
328
- const extractText = JSON.stringify(enrichedExtract).slice(0, 40000);
329
- const contents = [
330
- {
331
- role: "user",
332
- parts: [
333
- {
334
- text: "Answer the question using the provided file extract plus general reasoning. " +
335
- "Do not browse the web and do not read any other files. " +
336
- "Reminder: the sun rises in the east and sets in the west. " +
337
- "If FILE_EXTRACT_JSON contains derivedParity, prefer it over recounting. " +
338
- "If the extract is insufficient, make the best supported guess.\n\n" +
339
- "Return ONLY the final answer, no explanation.\n\n" +
340
- `TASK_ID: ${task.id}\n` +
341
- `FILE_TYPE: ${ext}\n` +
342
- `LOCAL_FILE_PATH: ${localPath}\n` +
343
- `QUESTION:\n${task.prompt}\n\n` +
344
- `FILE_EXTRACT_JSON:\n${extractText}`,
345
- },
346
- ],
347
- },
348
- ];
349
- const answer = await llmGenerateText(llm, contents);
350
- return { answer, toolCalls: 1 };
351
- }
352
- // "agent" mode: small tool loop. This is more realistic but higher variance.
353
- const toolUsageSummary = [
354
- "You have access to deterministic local file tools:",
355
- "- where ops: eq, ne, contains, starts_with, ends_with, matches_regex, gt, gte, lt, lte, is_empty, not_empty, is_even, is_odd",
356
- "- Prefer deterministic aggregations (csv_aggregate/xlsx_aggregate) over mental math. For parity rules, use where op is_even/is_odd.",
357
- "- read_csv_file({path,hasHeader,delimiter,encoding,maxRows,maxCols,maxCellChars})",
358
- "- csv_select_rows({path,hasHeader,delimiter,encoding,where,returnColumns,offset,limit,maxScanRows,maxCols,maxCellChars})",
359
- "- csv_aggregate({path,hasHeader,delimiter,encoding,where,operation,value,ignoreNonNumeric,returnRow,returnColumns,maxScanRows,maxCols,maxCellChars})",
360
- "- read_xlsx_file({path,sheetName,headerRow,rangeA1,maxRows,maxCols,maxCellChars})",
361
- "- xlsx_select_rows({path,sheetName,headerRow,rangeA1,where,returnColumns,offset,limit,maxScanRows,maxCols,maxCellChars})",
362
- "- xlsx_aggregate({path,sheetName,headerRow,rangeA1,where,operation,value,ignoreNonNumeric,returnRow,returnColumns,maxScanRows,maxCols,maxCellChars})",
363
- "- read_pdf_text({path,pageStart,pageEnd,pageNumbers,maxChars})",
364
- "- pdf_search_text({path,query,caseSensitive,pageStart,pageEnd,pageNumbers,maxMatches,snippetChars})",
365
- "- read_text_file({path,encoding,startChar,maxChars})",
366
- "- read_json_file({path,maxDepth,maxItems,maxStringChars})",
367
- "- json_select({path,pointer,maxDepth,maxItems,maxStringChars})",
368
- "- read_jsonl_file({path,encoding,offsetLines,limitLines,parseJson,maxLineChars,maxDepth,maxItems,maxStringChars})",
369
- "- zip_list_files({path,maxEntries})",
370
- "- zip_read_text_file({path,innerPath,caseSensitive,encoding,maxChars,maxBytes})",
371
- "- zip_extract_file({path,innerPath,caseSensitive,outputDir,overwrite,maxBytes})",
372
- "- read_docx_text({path,maxChars,maxBytes})",
373
- "- read_pptx_text({path,maxChars,maxSlides,maxBytesPerSlide})",
374
- "",
375
- "When using tools, respond with a single JSON object only:",
376
- "{\"action\":\"tool\",\"name\":\"read_pdf_text\",\"arguments\":{\"pageStart\":1,\"pageEnd\":5}}",
377
- "When done, respond with:",
378
- "{\"action\":\"final\",\"answer\":\"...\"}",
379
- "",
380
- "Rules:",
381
- "- Do NOT use any external knowledge or web browsing.",
382
- "- Always use the provided LOCAL_FILE_PATH; you may not read any other files.",
383
- "- Keep tool results bounded (limit<=200, maxRows<=500, maxCols<=80, maxCellChars<=2000, maxChars<=40000, maxMatches<=50).",
384
- "- Do NOT include any explanation. Final answer must match the requested formatting.",
385
- ].join("\n");
386
- const contents = [
387
- {
388
- role: "user",
389
- parts: [
390
- {
391
- text: `${toolUsageSummary}\n\nTASK_ID: ${task.id}\nFILE_TYPE: ${ext}\nLOCAL_FILE_PATH: ${localPath}\nQUESTION:\n${task.prompt}`,
392
- },
393
- ],
394
- },
395
- ];
396
- let toolCalls = 0;
397
- for (let step = 0; step < opts.maxSteps; step++) {
398
- const out = await llmGenerateText(llm, contents);
399
- contents.push({ role: "model", parts: [{ text: out }] });
400
- const parsed = extractJsonObject(out);
401
- if (!parsed || typeof parsed !== "object") {
402
- contents.push({
403
- role: "user",
404
- parts: [{ text: "Invalid format. Return JSON only with action tool|final." }],
405
- });
406
- continue;
407
- }
408
- if (parsed.action === "final") {
409
- const answer = String(parsed.answer ?? "").trim();
410
- return { answer, toolCalls };
411
- }
412
- if (parsed.action !== "tool") {
413
- contents.push({
414
- role: "user",
415
- parts: [{ text: "Invalid action. Return JSON only with action tool|final." }],
416
- });
417
- continue;
418
- }
419
- if (toolCalls >= opts.maxToolCalls) {
420
- contents.push({
421
- role: "user",
422
- parts: [{ text: "Tool call budget exceeded. Return final answer now." }],
423
- });
424
- continue;
425
- }
426
- const name = String(parsed.name ?? "");
427
- const tool = toolIndex.get(name);
428
- if (!tool) {
429
- contents.push({
430
- role: "user",
431
- parts: [
432
- {
433
- text: `Unknown tool "${name}". Use only read_csv_file, csv_select_rows, csv_aggregate, read_xlsx_file, xlsx_select_rows, xlsx_aggregate, read_pdf_text, or pdf_search_text.`,
434
- },
435
- ],
436
- });
437
- continue;
438
- }
439
- const args = (parsed.arguments ?? {});
440
- // Security: enforce file access restrictions.
441
- // Default: force the path to the known GAIA attachment.
442
- // ZIP: allow tools to operate on extracted children under a per-task extracted dir.
443
- const extractedRoot = path.join(resolveRepoRoot(), ".cache", "gaia", "extracted", task.id);
444
- const isZip = ext === "zip";
445
- const isZipTool = ["zip_list_files", "zip_read_text_file", "zip_extract_file"].includes(name);
446
- if (!isZip) {
447
- args.path = localPath;
448
- }
449
- else if (isZipTool) {
450
- args.path = localPath;
451
- // Force deterministic extracted root to keep gated data under .cache/gaia (gitignored).
452
- if (name === "zip_extract_file") {
453
- args.outputDir = extractedRoot;
454
- }
455
- }
456
- else {
457
- const requested = String(args.path ?? "").trim();
458
- if (!requested) {
459
- contents.push({
460
- role: "user",
461
- parts: [
462
- {
463
- text: "ZIP workflow: first call zip_list_files, then zip_extract_file(innerPath=...), " +
464
- "then call a reader tool on the extractedPath returned by zip_extract_file.",
465
- },
466
- ],
467
- });
468
- continue;
469
- }
470
- const requestedAbs = path.isAbsolute(requested)
471
- ? requested
472
- : path.resolve(path.dirname(localPath), requested);
473
- const extractedAbs = path.resolve(extractedRoot);
474
- const reqResolved = path.resolve(requestedAbs);
475
- if (!reqResolved.startsWith(extractedAbs + path.sep) && reqResolved !== extractedAbs) {
476
- contents.push({
477
- role: "user",
478
- parts: [{ text: `Refusing to read path outside extractedRoot: ${reqResolved}` }],
479
- });
480
- continue;
481
- }
482
- args.path = reqResolved;
483
- }
484
- // Hard limits for safety and stable prompts.
485
- if (name === "read_csv_file") {
486
- if (args.hasHeader === undefined)
487
- args.hasHeader = true;
488
- if (typeof args.maxRows !== "number")
489
- args.maxRows = 200;
490
- if (typeof args.maxCols !== "number")
491
- args.maxCols = 50;
492
- if (typeof args.maxCellChars !== "number")
493
- args.maxCellChars = 2000;
494
- args.maxRows = Math.min(Number(args.maxRows) || 200, 500);
495
- args.maxCols = Math.min(Number(args.maxCols) || 50, 80);
496
- args.maxCellChars = Math.min(Number(args.maxCellChars) || 2000, 2000);
497
- }
498
- else if (name === "csv_select_rows") {
499
- if (args.hasHeader === undefined)
500
- args.hasHeader = true;
501
- if (typeof args.offset !== "number")
502
- args.offset = 0;
503
- if (typeof args.limit !== "number")
504
- args.limit = 50;
505
- if (typeof args.maxScanRows !== "number")
506
- args.maxScanRows = 50000;
507
- if (typeof args.maxCols !== "number")
508
- args.maxCols = 80;
509
- if (typeof args.maxCellChars !== "number")
510
- args.maxCellChars = 2000;
511
- args.offset = Math.max(0, Number(args.offset) || 0);
512
- args.limit = Math.min(Math.max(1, Number(args.limit) || 50), 200);
513
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
514
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 80), 80);
515
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
516
- if (Array.isArray(args.where))
517
- args.where = args.where.slice(0, 10);
518
- if (Array.isArray(args.returnColumns))
519
- args.returnColumns = args.returnColumns.slice(0, 30);
520
- }
521
- else if (name === "csv_aggregate") {
522
- if (args.hasHeader === undefined)
523
- args.hasHeader = true;
524
- if (typeof args.maxScanRows !== "number")
525
- args.maxScanRows = 50000;
526
- if (typeof args.maxCols !== "number")
527
- args.maxCols = 200;
528
- if (typeof args.maxCellChars !== "number")
529
- args.maxCellChars = 2000;
530
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
531
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 200), 200);
532
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
533
- if (Array.isArray(args.where))
534
- args.where = args.where.slice(0, 10);
535
- if (Array.isArray(args.returnColumns))
536
- args.returnColumns = args.returnColumns.slice(0, 30);
537
- }
538
- else if (name === "read_xlsx_file") {
539
- if (typeof args.headerRow !== "number")
540
- args.headerRow = 1;
541
- if (typeof args.maxRows !== "number")
542
- args.maxRows = 200;
543
- if (typeof args.maxCols !== "number")
544
- args.maxCols = 50;
545
- if (typeof args.maxCellChars !== "number")
546
- args.maxCellChars = 2000;
547
- args.maxRows = Math.min(Number(args.maxRows) || 200, 500);
548
- args.maxCols = Math.min(Number(args.maxCols) || 50, 80);
549
- args.maxCellChars = Math.min(Number(args.maxCellChars) || 2000, 2000);
550
- }
551
- else if (name === "xlsx_select_rows") {
552
- if (typeof args.headerRow !== "number")
553
- args.headerRow = 1;
554
- if (typeof args.offset !== "number")
555
- args.offset = 0;
556
- if (typeof args.limit !== "number")
557
- args.limit = 50;
558
- if (typeof args.maxScanRows !== "number")
559
- args.maxScanRows = 50000;
560
- if (typeof args.maxCols !== "number")
561
- args.maxCols = 80;
562
- if (typeof args.maxCellChars !== "number")
563
- args.maxCellChars = 2000;
564
- args.headerRow = Math.max(0, Math.min(Number(args.headerRow) || 1, 1000));
565
- args.offset = Math.max(0, Number(args.offset) || 0);
566
- args.limit = Math.min(Math.max(1, Number(args.limit) || 50), 200);
567
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
568
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 80), 80);
569
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
570
- if (Array.isArray(args.where))
571
- args.where = args.where.slice(0, 10);
572
- if (Array.isArray(args.returnColumns))
573
- args.returnColumns = args.returnColumns.slice(0, 30);
574
- }
575
- else if (name === "xlsx_aggregate") {
576
- if (typeof args.headerRow !== "number")
577
- args.headerRow = 1;
578
- if (typeof args.maxScanRows !== "number")
579
- args.maxScanRows = 50000;
580
- if (typeof args.maxCols !== "number")
581
- args.maxCols = 200;
582
- if (typeof args.maxCellChars !== "number")
583
- args.maxCellChars = 2000;
584
- args.headerRow = Math.max(0, Math.min(Number(args.headerRow) || 1, 1000));
585
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
586
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 200), 200);
587
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
588
- if (Array.isArray(args.where))
589
- args.where = args.where.slice(0, 10);
590
- if (Array.isArray(args.returnColumns))
591
- args.returnColumns = args.returnColumns.slice(0, 30);
592
- }
593
- else if (name === "read_pdf_text") {
594
- if (typeof args.pageStart !== "number")
595
- args.pageStart = 1;
596
- if (typeof args.pageEnd !== "number")
597
- args.pageEnd = 3;
598
- if (typeof args.maxChars !== "number")
599
- args.maxChars = 12000;
600
- args.pageStart = Math.max(1, Math.min(Number(args.pageStart) || 1, 500));
601
- args.pageEnd = Math.max(1, Math.min(Number(args.pageEnd) || 3, 500));
602
- args.maxChars = Math.min(Number(args.maxChars) || 12000, 40000);
603
- if (Array.isArray(args.pageNumbers)) {
604
- // Keep explicit page lists short to avoid huge extracts.
605
- args.pageNumbers = args.pageNumbers
606
- .map((n) => Number(n))
607
- .filter((n) => Number.isFinite(n) && n > 0)
608
- .slice(0, 20);
609
- }
610
- }
611
- else if (name === "pdf_search_text") {
612
- if (typeof args.query !== "string")
613
- args.query = "";
614
- if (typeof args.pageStart !== "number")
615
- args.pageStart = 1;
616
- if (typeof args.pageEnd !== "number")
617
- args.pageEnd = 25;
618
- if (typeof args.maxMatches !== "number")
619
- args.maxMatches = 25;
620
- if (typeof args.snippetChars !== "number")
621
- args.snippetChars = 180;
622
- args.pageStart = Math.max(1, Math.min(Number(args.pageStart) || 1, 500));
623
- args.pageEnd = Math.max(1, Math.min(Number(args.pageEnd) || 25, 500));
624
- args.maxMatches = Math.min(Math.max(1, Number(args.maxMatches) || 25), 50);
625
- args.snippetChars = Math.min(Math.max(40, Number(args.snippetChars) || 180), 400);
626
- if (Array.isArray(args.pageNumbers)) {
627
- args.pageNumbers = args.pageNumbers
628
- .map((n) => Number(n))
629
- .filter((n) => Number.isFinite(n) && n > 0)
630
- .slice(0, 20);
631
- }
632
- }
633
- else if (name === "read_text_file") {
634
- if (typeof args.startChar !== "number")
635
- args.startChar = 0;
636
- if (typeof args.maxChars !== "number")
637
- args.maxChars = 12000;
638
- args.startChar = Math.max(0, Number(args.startChar) || 0);
639
- args.maxChars = Math.min(Math.max(1, Number(args.maxChars) || 12000), 40000);
640
- }
641
- else if (name === "read_json_file" || name === "json_select") {
642
- if (typeof args.maxDepth !== "number")
643
- args.maxDepth = 8;
644
- if (typeof args.maxItems !== "number")
645
- args.maxItems = 200;
646
- if (typeof args.maxStringChars !== "number")
647
- args.maxStringChars = 2000;
648
- args.maxDepth = Math.min(Math.max(1, Number(args.maxDepth) || 8), 12);
649
- args.maxItems = Math.min(Math.max(1, Number(args.maxItems) || 200), 500);
650
- args.maxStringChars = Math.min(Math.max(20, Number(args.maxStringChars) || 2000), 2000);
651
- if (name === "json_select" && typeof args.pointer !== "string")
652
- args.pointer = "";
653
- }
654
- else if (name === "read_jsonl_file") {
655
- if (typeof args.offsetLines !== "number")
656
- args.offsetLines = 0;
657
- if (typeof args.limitLines !== "number")
658
- args.limitLines = 200;
659
- if (typeof args.maxLineChars !== "number")
660
- args.maxLineChars = 4000;
661
- if (typeof args.maxDepth !== "number")
662
- args.maxDepth = 6;
663
- if (typeof args.maxItems !== "number")
664
- args.maxItems = 100;
665
- if (typeof args.maxStringChars !== "number")
666
- args.maxStringChars = 1000;
667
- args.offsetLines = Math.max(0, Number(args.offsetLines) || 0);
668
- args.limitLines = Math.min(Math.max(1, Number(args.limitLines) || 200), 500);
669
- args.maxLineChars = Math.min(Math.max(200, Number(args.maxLineChars) || 4000), 10000);
670
- args.maxDepth = Math.min(Math.max(1, Number(args.maxDepth) || 6), 10);
671
- args.maxItems = Math.min(Math.max(1, Number(args.maxItems) || 100), 300);
672
- args.maxStringChars = Math.min(Math.max(20, Number(args.maxStringChars) || 1000), 2000);
673
- }
674
- else if (name === "zip_list_files") {
675
- if (typeof args.maxEntries !== "number")
676
- args.maxEntries = 200;
677
- args.maxEntries = Math.min(Math.max(1, Number(args.maxEntries) || 200), 500);
678
- }
679
- else if (name === "zip_read_text_file") {
680
- if (typeof args.innerPath !== "string")
681
- args.innerPath = "";
682
- if (typeof args.maxChars !== "number")
683
- args.maxChars = 12000;
684
- if (typeof args.maxBytes !== "number")
685
- args.maxBytes = 5000000;
686
- args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 20000);
687
- args.maxBytes = Math.min(Math.max(1000, Number(args.maxBytes) || 5000000), 20000000);
688
- }
689
- else if (name === "zip_extract_file") {
690
- if (typeof args.innerPath !== "string")
691
- args.innerPath = "";
692
- if (typeof args.maxBytes !== "number")
693
- args.maxBytes = 25000000;
694
- args.maxBytes = Math.min(Math.max(1000, Number(args.maxBytes) || 25000000), 50000000);
695
- args.overwrite = false;
696
- }
697
- else if (name === "read_docx_text") {
698
- if (typeof args.maxChars !== "number")
699
- args.maxChars = 12000;
700
- args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 40000);
701
- }
702
- else if (name === "read_pptx_text") {
703
- if (typeof args.maxChars !== "number")
704
- args.maxChars = 12000;
705
- if (typeof args.maxSlides !== "number")
706
- args.maxSlides = 60;
707
- args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 40000);
708
- args.maxSlides = Math.min(Math.max(1, Number(args.maxSlides) || 60), 120);
709
- }
710
- // Reduce model confusion: enforce tool matches the attachment type.
711
- const allowedByExt = (ext === "csv" && ["read_csv_file", "csv_select_rows", "csv_aggregate"].includes(name)) ||
712
- (ext === "xlsx" && ["read_xlsx_file", "xlsx_select_rows", "xlsx_aggregate"].includes(name)) ||
713
- (ext === "pdf" && ["read_pdf_text", "pdf_search_text"].includes(name)) ||
714
- (ext === "docx" && ["read_docx_text"].includes(name)) ||
715
- (ext === "pptx" && ["read_pptx_text"].includes(name)) ||
716
- ((ext === "txt" || ext === "md" || ext === "xml") && ["read_text_file"].includes(name)) ||
717
- (ext === "json" && ["read_json_file", "json_select", "read_text_file"].includes(name)) ||
718
- (ext === "jsonl" && ["read_jsonl_file", "read_text_file"].includes(name)) ||
719
- (ext === "zip" &&
720
- [
721
- "zip_list_files",
722
- "zip_read_text_file",
723
- "zip_extract_file",
724
- "read_csv_file",
725
- "csv_select_rows",
726
- "csv_aggregate",
727
- "read_xlsx_file",
728
- "xlsx_select_rows",
729
- "xlsx_aggregate",
730
- "read_pdf_text",
731
- "pdf_search_text",
732
- "read_text_file",
733
- "read_json_file",
734
- "json_select",
735
- "read_jsonl_file",
736
- "read_docx_text",
737
- "read_pptx_text",
738
- ].includes(name));
739
- if (!allowedByExt) {
740
- contents.push({
741
- role: "user",
742
- parts: [{ text: `Wrong tool for FILE_TYPE=${ext}. Use a tool that matches the file type.` }],
743
- });
744
- continue;
745
- }
746
- toolCalls++;
747
- const toolResult = await tool.handler(args);
748
- // Provide a bounded JSON summary to the model. Avoid dumping large content.
749
- const toolResultText = JSON.stringify(toolResult).slice(0, 12000);
750
- contents.push({
751
- role: "user",
752
- parts: [{ text: `TOOL_RESULT ${name}:\n${toolResultText}\n\nContinue. Return JSON only.` }],
753
- });
754
- }
755
- // If we ran out of steps, force a final answer.
756
- contents.push({
757
- role: "user",
758
- parts: [{ text: "Out of steps. Return final answer now as JSON." }],
759
- });
760
- const out = await llmGenerateText(llm, contents);
761
- const parsed = extractJsonObject(out);
762
- if (parsed?.action === "final") {
763
- return { answer: String(parsed.answer ?? "").trim(), toolCalls };
764
- }
765
- return { answer: String(out ?? "").trim(), toolCalls };
766
- }
767
- async function loadFixture(fixturePath) {
768
- const raw = await readFile(fixturePath, "utf8");
769
- const parsed = JSON.parse(raw);
770
- if (!parsed || !Array.isArray(parsed.tasks))
771
- throw new Error("Invalid GAIA capability files fixture");
772
- return parsed;
773
- }
774
- describe("Capability: GAIA accuracy (file-backed) (LLM-only vs LLM+local tools)", () => {
775
- const testFn = shouldRun ? it : it.skip;
776
- testFn("should measure accuracy delta on a small GAIA file-backed subset", async () => {
777
- loadDotEnvLocalIfPresent();
778
- const fixturePath = resolveCapabilityFilesFixturePath();
779
- if (!existsSync(fixturePath)) {
780
- throw new Error(`Missing GAIA capability files fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityFilesFixture.py`);
781
- }
782
- const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
783
- const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
784
- const baselineLlm = await createTextLlmClient({ model: baselineModel });
785
- const toolsLlm = await createTextLlmClient({ model: toolsModel });
786
- const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
787
- const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
788
- const fixture = await loadFixture(fixturePath);
789
- expect(Array.isArray(fixture.tasks)).toBe(true);
790
- expect(fixture.tasks.length).toBeGreaterThan(0);
791
- const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "6", 10);
792
- const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 6));
793
- const tasks = fixture.tasks.slice(0, taskLimit);
794
- const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
795
- const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
796
- const maxSteps = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_STEPS ?? "7", 10);
797
- const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "5", 10);
798
- // Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
799
- const judge = await autoDiscoverJudge(toolsLlm);
800
- const results = new Array(tasks.length);
801
- let nextIndex = 0;
802
- const workers = Array.from({ length: concurrency }, () => (async () => {
803
- while (true) {
804
- const idx = nextIndex++;
805
- if (idx >= tasks.length)
806
- return;
807
- const task = tasks[idx];
808
- try {
809
- const baseStart = performance.now();
810
- const base = await baselineAnswer(baselineLlm, task);
811
- const baseMs = performance.now() - baseStart;
812
- const toolsStart = performance.now();
813
- const tools = await toolAugmentedAnswerFromFile(toolsLlm, task, { maxSteps, maxToolCalls });
814
- const toolsMs = performance.now() - toolsStart;
815
- const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
816
- const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
817
- results[idx] = {
818
- taskId: task.id,
819
- baselineCorrect: baseJudge.match,
820
- toolsCorrect: toolsJudge.match,
821
- baselineMs: baseMs,
822
- toolsMs,
823
- toolCalls: tools.toolCalls,
824
- judgeProvider: toolsJudge.judgeProvider,
825
- judgeInvoked: toolsJudge.judgeInvoked,
826
- };
827
- }
828
- catch (err) {
829
- results[idx] = {
830
- taskId: task.id,
831
- baselineCorrect: false,
832
- toolsCorrect: false,
833
- baselineMs: 0,
834
- toolsMs: 0,
835
- toolCalls: 0,
836
- error: err?.message ?? String(err),
837
- };
838
- }
839
- }
840
- })());
841
- await Promise.all(workers);
842
- const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
843
- const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
844
- const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
845
- const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
846
- const avg = (values) => values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length;
847
- const avgBaseMs = avg(results.map((r) => r.baselineMs).filter((n) => n > 0));
848
- const avgToolsMs = avg(results.map((r) => r.toolsMs).filter((n) => n > 0));
849
- const avgToolCalls = avg(results.map((r) => r.toolCalls));
850
- console.log(`[gaia-capability-files] config=${fixture.config} split=${fixture.split} tasks=${tasks.length} concurrency=${concurrency} baseline=${baselineCorrect}/${tasks.length} tools=${toolsCorrect}/${tasks.length} improved=${improved} regressions=${regressions} avgBaselineMs=${avgBaseMs.toFixed(0)} avgToolsMs=${avgToolsMs.toFixed(0)} avgToolCalls=${avgToolCalls.toFixed(2)}`);
851
- console.log(`[gaia-capability-files] perTask: ${results
852
- .map((r) => `${r.taskId}:B${r.baselineCorrect ? "1" : "0"}T${r.toolsCorrect ? "1" : "0"}${r.error ? "E" : ""}`)
853
- .join(" ")}`);
854
- if (shouldWriteReport) {
855
- const repoRoot = resolveRepoRoot();
856
- const generatedAtIso = new Date().toISOString();
857
- const stamp = generatedAtIso.replace(/[:.]/g, "-");
858
- const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
859
- const publicSummary = {
860
- suiteId: "gaia_capability_files",
861
- lane: "files",
862
- generatedAtIso,
863
- config: fixture.config,
864
- split: fixture.split,
865
- taskCount: tasks.length,
866
- concurrency,
867
- baseline: {
868
- model: baselineModelLabel,
869
- correct: baselineCorrect,
870
- passRatePct: tasks.length === 0 ? 0 : (baselineCorrect / tasks.length) * 100,
871
- avgMs: avgBaseMs,
872
- },
873
- tools: {
874
- model: toolsModelLabel,
875
- mode: toolsMode,
876
- correct: toolsCorrect,
877
- passRatePct: tasks.length === 0 ? 0 : (toolsCorrect / tasks.length) * 100,
878
- avgMs: avgToolsMs,
879
- avgToolCalls: avgToolCalls,
880
- },
881
- improved,
882
- regressions,
883
- notes: "GAIA is gated. This file contains only aggregate metrics (no prompt/answer text). Detailed per-task report is written under .cache/gaia/reports (gitignored).",
884
- };
885
- await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_files_latest.json"), publicSummary);
886
- await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_files_${fixture.config}_${fixture.split}_${stamp}.json`), {
887
- ...publicSummary,
888
- perTask: results.map((r) => ({
889
- taskId: r.taskId,
890
- baselineCorrect: r.baselineCorrect,
891
- toolsCorrect: r.toolsCorrect,
892
- baselineMs: r.baselineMs,
893
- toolsMs: r.toolsMs,
894
- toolCalls: r.toolCalls,
895
- error: r.error ?? null,
896
- })),
897
- });
898
- }
899
- const enforce = process.env.NODEBENCH_GAIA_CAPABILITY_ENFORCE === "1";
900
- if (enforce) {
901
- // For file-backed tasks, the baseline is expected to be low. We still enforce that
902
- // tool-augmented performance is not worse than baseline and has at least one improvement.
903
- const allowedRegression = Math.max(1, Math.floor(tasks.length * 0.2));
904
- expect(improved).toBeGreaterThanOrEqual(1);
905
- expect(toolsCorrect).toBeGreaterThanOrEqual(baselineCorrect - allowedRegression);
906
- expect(toolsCorrect).toBeGreaterThanOrEqual(1);
907
- }
908
- else {
909
- expect(results.length).toBe(tasks.length);
910
- expect(results.some((r) => r.error)).toBe(false);
911
- }
912
- }, 300000);
913
- });
914
- //# sourceMappingURL=gaiaCapabilityFilesEval.test.js.map