nodebench-mcp 2.70.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361) hide show
  1. package/README.md +95 -41
  2. package/dist/agents/alertRouter.d.ts +38 -0
  3. package/dist/agents/alertRouter.js +151 -0
  4. package/dist/agents/alertRouter.js.map +1 -0
  5. package/dist/agents/entityMemory.d.ts +40 -0
  6. package/dist/agents/entityMemory.js +64 -0
  7. package/dist/agents/entityMemory.js.map +1 -0
  8. package/dist/agents/subAgents.d.ts +35 -0
  9. package/dist/agents/subAgents.js +62 -0
  10. package/dist/agents/subAgents.js.map +1 -0
  11. package/dist/benchmarks/benchmarkRunner.js +14 -0
  12. package/dist/benchmarks/benchmarkRunner.js.map +1 -1
  13. package/dist/benchmarks/chainEval.js +107 -0
  14. package/dist/benchmarks/chainEval.js.map +1 -1
  15. package/dist/benchmarks/llmJudgeEval.js +85 -0
  16. package/dist/benchmarks/llmJudgeEval.js.map +1 -1
  17. package/dist/benchmarks/searchQualityEval.js +118 -5
  18. package/dist/benchmarks/searchQualityEval.js.map +1 -1
  19. package/dist/cli/search.d.ts +13 -0
  20. package/dist/cli/search.js +130 -0
  21. package/dist/cli/search.js.map +1 -0
  22. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  23. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  24. package/dist/dashboard/operatingServer.js +3 -2
  25. package/dist/dashboard/operatingServer.js.map +1 -1
  26. package/dist/db.d.ts +6 -2
  27. package/dist/db.js +521 -6
  28. package/dist/db.js.map +1 -1
  29. package/dist/index.js +349 -67
  30. package/dist/index.js.map +1 -1
  31. package/dist/packageInfo.d.ts +3 -0
  32. package/dist/packageInfo.js +32 -0
  33. package/dist/packageInfo.js.map +1 -0
  34. package/dist/profiler/behaviorStore.d.ts +97 -0
  35. package/dist/profiler/behaviorStore.js +276 -0
  36. package/dist/profiler/behaviorStore.js.map +1 -0
  37. package/dist/profiler/eventCollector.d.ts +119 -0
  38. package/dist/profiler/eventCollector.js +267 -0
  39. package/dist/profiler/eventCollector.js.map +1 -0
  40. package/dist/profiler/index.d.ts +15 -0
  41. package/dist/profiler/index.js +16 -0
  42. package/dist/profiler/index.js.map +1 -0
  43. package/dist/profiler/mcpProxy.d.ts +49 -0
  44. package/dist/profiler/mcpProxy.js +123 -0
  45. package/dist/profiler/mcpProxy.js.map +1 -0
  46. package/dist/profiler/modelRouter.d.ts +30 -0
  47. package/dist/profiler/modelRouter.js +99 -0
  48. package/dist/profiler/modelRouter.js.map +1 -0
  49. package/dist/profiler/otelReceiver.d.ts +17 -0
  50. package/dist/profiler/otelReceiver.js +62 -0
  51. package/dist/profiler/otelReceiver.js.map +1 -0
  52. package/dist/profiler/proofEngine.d.ts +41 -0
  53. package/dist/profiler/proofEngine.js +93 -0
  54. package/dist/profiler/proofEngine.js.map +1 -0
  55. package/dist/profiler/workflowTemplates.d.ts +41 -0
  56. package/dist/profiler/workflowTemplates.js +95 -0
  57. package/dist/profiler/workflowTemplates.js.map +1 -0
  58. package/dist/providers/localMemoryProvider.js +3 -2
  59. package/dist/providers/localMemoryProvider.js.map +1 -1
  60. package/dist/runtimeConfig.d.ts +11 -0
  61. package/dist/runtimeConfig.js +27 -0
  62. package/dist/runtimeConfig.js.map +1 -0
  63. package/dist/sandboxApi.js +2 -1
  64. package/dist/sandboxApi.js.map +1 -1
  65. package/dist/security/auditLog.js +8 -3
  66. package/dist/security/auditLog.js.map +1 -1
  67. package/dist/subconscious/blocks.d.ts +43 -0
  68. package/dist/subconscious/blocks.js +158 -0
  69. package/dist/subconscious/blocks.js.map +1 -0
  70. package/dist/subconscious/classifier.d.ts +22 -0
  71. package/dist/subconscious/classifier.js +118 -0
  72. package/dist/subconscious/classifier.js.map +1 -0
  73. package/dist/subconscious/graphEngine.d.ts +65 -0
  74. package/dist/subconscious/graphEngine.js +234 -0
  75. package/dist/subconscious/graphEngine.js.map +1 -0
  76. package/dist/subconscious/index.d.ts +19 -0
  77. package/dist/subconscious/index.js +20 -0
  78. package/dist/subconscious/index.js.map +1 -0
  79. package/dist/subconscious/tools.d.ts +5 -0
  80. package/dist/subconscious/tools.js +255 -0
  81. package/dist/subconscious/tools.js.map +1 -0
  82. package/dist/subconscious/whisperPolicy.d.ts +20 -0
  83. package/dist/subconscious/whisperPolicy.js +171 -0
  84. package/dist/subconscious/whisperPolicy.js.map +1 -0
  85. package/dist/sweep/engine.d.ts +27 -0
  86. package/dist/sweep/engine.js +244 -0
  87. package/dist/sweep/engine.js.map +1 -0
  88. package/dist/sweep/index.d.ts +9 -0
  89. package/dist/sweep/index.js +8 -0
  90. package/dist/sweep/index.js.map +1 -0
  91. package/dist/sweep/sources/github_trending.d.ts +6 -0
  92. package/dist/sweep/sources/github_trending.js +37 -0
  93. package/dist/sweep/sources/github_trending.js.map +1 -0
  94. package/dist/sweep/sources/hackernews.d.ts +7 -0
  95. package/dist/sweep/sources/hackernews.js +57 -0
  96. package/dist/sweep/sources/hackernews.js.map +1 -0
  97. package/dist/sweep/sources/openbb_finance.d.ts +9 -0
  98. package/dist/sweep/sources/openbb_finance.js +46 -0
  99. package/dist/sweep/sources/openbb_finance.js.map +1 -0
  100. package/dist/sweep/sources/producthunt.d.ts +6 -0
  101. package/dist/sweep/sources/producthunt.js +41 -0
  102. package/dist/sweep/sources/producthunt.js.map +1 -0
  103. package/dist/sweep/sources/web_signals.d.ts +7 -0
  104. package/dist/sweep/sources/web_signals.js +63 -0
  105. package/dist/sweep/sources/web_signals.js.map +1 -0
  106. package/dist/sweep/sources/yahoo_finance.d.ts +6 -0
  107. package/dist/sweep/sources/yahoo_finance.js +47 -0
  108. package/dist/sweep/sources/yahoo_finance.js.map +1 -0
  109. package/dist/sweep/types.d.ts +50 -0
  110. package/dist/sweep/types.js +9 -0
  111. package/dist/sweep/types.js.map +1 -0
  112. package/dist/sync/founderEpisodeStore.d.ts +98 -0
  113. package/dist/sync/founderEpisodeStore.js +230 -0
  114. package/dist/sync/founderEpisodeStore.js.map +1 -0
  115. package/dist/sync/hyperloopArchive.d.ts +51 -0
  116. package/dist/sync/hyperloopArchive.js +153 -0
  117. package/dist/sync/hyperloopArchive.js.map +1 -0
  118. package/dist/sync/hyperloopEval.d.ts +123 -0
  119. package/dist/sync/hyperloopEval.js +389 -0
  120. package/dist/sync/hyperloopEval.js.map +1 -0
  121. package/dist/sync/protocol.d.ts +172 -0
  122. package/dist/sync/protocol.js +9 -0
  123. package/dist/sync/protocol.js.map +1 -0
  124. package/dist/sync/sessionMemory.d.ts +47 -0
  125. package/dist/sync/sessionMemory.js +138 -0
  126. package/dist/sync/sessionMemory.js.map +1 -0
  127. package/dist/sync/store.d.ts +384 -0
  128. package/dist/sync/store.js +1435 -0
  129. package/dist/sync/store.js.map +1 -0
  130. package/dist/sync/syncBridgeClient.d.ts +30 -0
  131. package/dist/sync/syncBridgeClient.js +172 -0
  132. package/dist/sync/syncBridgeClient.js.map +1 -0
  133. package/dist/tools/autonomousDeliveryTools.d.ts +2 -0
  134. package/dist/tools/autonomousDeliveryTools.js +1104 -0
  135. package/dist/tools/autonomousDeliveryTools.js.map +1 -0
  136. package/dist/tools/boilerplateTools.js +10 -9
  137. package/dist/tools/boilerplateTools.js.map +1 -1
  138. package/dist/tools/claudeCodeIngestTools.d.ts +10 -0
  139. package/dist/tools/claudeCodeIngestTools.js +347 -0
  140. package/dist/tools/claudeCodeIngestTools.js.map +1 -0
  141. package/dist/tools/coreWorkflowTools.d.ts +2 -0
  142. package/dist/tools/coreWorkflowTools.js +488 -0
  143. package/dist/tools/coreWorkflowTools.js.map +1 -0
  144. package/dist/tools/deltaTools.d.ts +15 -0
  145. package/dist/tools/deltaTools.js +1522 -0
  146. package/dist/tools/deltaTools.js.map +1 -0
  147. package/dist/tools/documentationTools.js +2 -1
  148. package/dist/tools/documentationTools.js.map +1 -1
  149. package/dist/tools/entityLookupTools.d.ts +14 -0
  150. package/dist/tools/entityLookupTools.js +159 -0
  151. package/dist/tools/entityLookupTools.js.map +1 -0
  152. package/dist/tools/entityTemporalTools.d.ts +12 -0
  153. package/dist/tools/entityTemporalTools.js +330 -0
  154. package/dist/tools/entityTemporalTools.js.map +1 -0
  155. package/dist/tools/founderLocalPipeline.d.ts +215 -0
  156. package/dist/tools/founderLocalPipeline.js +1516 -2
  157. package/dist/tools/founderLocalPipeline.js.map +1 -1
  158. package/dist/tools/founderOperatingModel.d.ts +120 -0
  159. package/dist/tools/founderOperatingModel.js +469 -0
  160. package/dist/tools/founderOperatingModel.js.map +1 -0
  161. package/dist/tools/founderOperatingModelTools.d.ts +2 -0
  162. package/dist/tools/founderOperatingModelTools.js +169 -0
  163. package/dist/tools/founderOperatingModelTools.js.map +1 -0
  164. package/dist/tools/founderStrategicOpsTools.d.ts +2 -0
  165. package/dist/tools/founderStrategicOpsTools.js +1310 -0
  166. package/dist/tools/founderStrategicOpsTools.js.map +1 -0
  167. package/dist/tools/graphifyTools.d.ts +19 -0
  168. package/dist/tools/graphifyTools.js +375 -0
  169. package/dist/tools/graphifyTools.js.map +1 -0
  170. package/dist/tools/index.d.ts +3 -0
  171. package/dist/tools/index.js +4 -0
  172. package/dist/tools/index.js.map +1 -1
  173. package/dist/tools/monteCarloTools.d.ts +16 -0
  174. package/dist/tools/monteCarloTools.js +225 -0
  175. package/dist/tools/monteCarloTools.js.map +1 -0
  176. package/dist/tools/packetCompilerTools.d.ts +12 -0
  177. package/dist/tools/packetCompilerTools.js +322 -0
  178. package/dist/tools/packetCompilerTools.js.map +1 -0
  179. package/dist/tools/planSynthesisTools.d.ts +15 -0
  180. package/dist/tools/planSynthesisTools.js +455 -0
  181. package/dist/tools/planSynthesisTools.js.map +1 -0
  182. package/dist/tools/profilerTools.d.ts +20 -0
  183. package/dist/tools/profilerTools.js +364 -0
  184. package/dist/tools/profilerTools.js.map +1 -0
  185. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  186. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  187. package/dist/tools/savingsTools.d.ts +11 -0
  188. package/dist/tools/savingsTools.js +155 -0
  189. package/dist/tools/savingsTools.js.map +1 -0
  190. package/dist/tools/scenarioCompilerTools.d.ts +14 -0
  191. package/dist/tools/scenarioCompilerTools.js +290 -0
  192. package/dist/tools/scenarioCompilerTools.js.map +1 -0
  193. package/dist/tools/sharedContextTools.d.ts +2 -0
  194. package/dist/tools/sharedContextTools.js +423 -0
  195. package/dist/tools/sharedContextTools.js.map +1 -0
  196. package/dist/tools/sitemapTools.d.ts +15 -0
  197. package/dist/tools/sitemapTools.js +560 -0
  198. package/dist/tools/sitemapTools.js.map +1 -0
  199. package/dist/tools/sweepTools.d.ts +9 -0
  200. package/dist/tools/sweepTools.js +112 -0
  201. package/dist/tools/sweepTools.js.map +1 -0
  202. package/dist/tools/syncBridgeTools.d.ts +2 -0
  203. package/dist/tools/syncBridgeTools.js +258 -0
  204. package/dist/tools/syncBridgeTools.js.map +1 -0
  205. package/dist/tools/toolRegistry.js +1223 -45
  206. package/dist/tools/toolRegistry.js.map +1 -1
  207. package/dist/tools/workspaceTools.d.ts +19 -0
  208. package/dist/tools/workspaceTools.js +762 -0
  209. package/dist/tools/workspaceTools.js.map +1 -0
  210. package/dist/toolsetRegistry.js +162 -3
  211. package/dist/toolsetRegistry.js.map +1 -1
  212. package/package.json +39 -38
  213. package/rules/nodebench-agentic-reliability.md +32 -0
  214. package/rules/nodebench-analyst-diagnostic.md +25 -0
  215. package/rules/nodebench-auto-qa.md +31 -0
  216. package/rules/nodebench-completion-traceability.md +22 -0
  217. package/rules/nodebench-flywheel-continuous.md +25 -0
  218. package/rules/nodebench-pre-release-review.md +24 -0
  219. package/rules/nodebench-qa-dogfood.md +26 -0
  220. package/rules/nodebench-scenario-testing.md +30 -0
  221. package/rules/nodebench-self-direction.md +23 -0
  222. package/rules/nodebench-self-judge-loop.md +24 -0
  223. package/scripts/install.sh +215 -0
  224. package/dist/__tests__/analytics.test.d.ts +0 -11
  225. package/dist/__tests__/analytics.test.js +0 -546
  226. package/dist/__tests__/analytics.test.js.map +0 -1
  227. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  228. package/dist/__tests__/architectComplex.test.js +0 -373
  229. package/dist/__tests__/architectComplex.test.js.map +0 -1
  230. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  231. package/dist/__tests__/architectSmoke.test.js +0 -92
  232. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  233. package/dist/__tests__/audit-registry.d.ts +0 -1
  234. package/dist/__tests__/audit-registry.js +0 -60
  235. package/dist/__tests__/audit-registry.js.map +0 -1
  236. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  237. package/dist/__tests__/batchAutopilot.test.js +0 -218
  238. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  239. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  240. package/dist/__tests__/cliSubcommands.test.js +0 -138
  241. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  242. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  243. package/dist/__tests__/comparativeBench.test.js +0 -722
  244. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  245. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  246. package/dist/__tests__/critterCalibrationEval.js +0 -370
  247. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  248. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  249. package/dist/__tests__/dynamicLoading.test.js +0 -280
  250. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  251. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  252. package/dist/__tests__/embeddingProvider.test.js +0 -86
  253. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  254. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  255. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  256. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  257. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  258. package/dist/__tests__/evalHarness.test.js +0 -1107
  259. package/dist/__tests__/evalHarness.test.js.map +0 -1
  260. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  261. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  262. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  263. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  264. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  265. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  266. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  267. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  268. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  269. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  270. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  271. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  272. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  273. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  274. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  275. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  276. package/dist/__tests__/forecastingScoring.test.js +0 -202
  277. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  278. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  279. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  280. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  281. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  282. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  283. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  284. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  285. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  286. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  287. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  288. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  289. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  290. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  291. package/dist/__tests__/helpers/answerMatch.js +0 -267
  292. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  293. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  294. package/dist/__tests__/helpers/textLlm.js +0 -214
  295. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  296. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  297. package/dist/__tests__/localDashboard.test.js +0 -226
  298. package/dist/__tests__/localDashboard.test.js.map +0 -1
  299. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  300. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  301. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  302. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  303. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  304. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  305. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  306. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  307. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  308. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  309. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  310. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  311. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  312. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  313. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  314. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  315. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  316. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  317. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  318. package/dist/__tests__/openclawDogfood.test.js +0 -535
  319. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  320. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  321. package/dist/__tests__/openclawMessaging.test.js +0 -232
  322. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  323. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  324. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  325. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  326. package/dist/__tests__/tools.test.d.ts +0 -1
  327. package/dist/__tests__/tools.test.js +0 -3201
  328. package/dist/__tests__/tools.test.js.map +0 -1
  329. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  330. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  331. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  332. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  333. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  334. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  335. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  336. package/dist/__tests__/webmcpTools.test.js +0 -195
  337. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  338. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  339. package/dist/benchmarks/testProviderBus.js +0 -272
  340. package/dist/benchmarks/testProviderBus.js.map +0 -1
  341. package/dist/hooks/postCompaction.d.ts +0 -14
  342. package/dist/hooks/postCompaction.js +0 -51
  343. package/dist/hooks/postCompaction.js.map +0 -1
  344. package/dist/security/__tests__/security.test.d.ts +0 -8
  345. package/dist/security/__tests__/security.test.js +0 -295
  346. package/dist/security/__tests__/security.test.js.map +0 -1
  347. package/dist/tools/documentTools.d.ts +0 -5
  348. package/dist/tools/documentTools.js +0 -524
  349. package/dist/tools/documentTools.js.map +0 -1
  350. package/dist/tools/financialTools.d.ts +0 -10
  351. package/dist/tools/financialTools.js +0 -403
  352. package/dist/tools/financialTools.js.map +0 -1
  353. package/dist/tools/memoryTools.d.ts +0 -5
  354. package/dist/tools/memoryTools.js +0 -137
  355. package/dist/tools/memoryTools.js.map +0 -1
  356. package/dist/tools/planningTools.d.ts +0 -5
  357. package/dist/tools/planningTools.js +0 -147
  358. package/dist/tools/planningTools.js.map +0 -1
  359. package/dist/tools/searchTools.d.ts +0 -5
  360. package/dist/tools/searchTools.js +0 -145
  361. package/dist/tools/searchTools.js.map +0 -1
@@ -1,1101 +0,0 @@
1
- /**
2
- * GAIA media-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local OCR tools.
3
- *
4
- * This lane targets GAIA tasks that include image attachments (PNG/JPG/WEBP).
5
- * We provide deterministic local OCR via NodeBench MCP tools and score answers against
6
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
7
- *
8
- * Safety:
9
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
10
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
11
- *
12
- * Disabled by default (cost + rate limits). Run with:
13
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
14
- */
15
- import { describe, expect, it } from "vitest";
16
- import { existsSync, readFileSync } from "node:fs";
17
- import { mkdir, readFile, writeFile } from "node:fs/promises";
18
- import path from "node:path";
19
- import { fileURLToPath } from "node:url";
20
- import { performance } from "node:perf_hooks";
21
- import { localFileTools, gaiaMediaSolvers } from "../tools/localFileTools.js";
22
- import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
23
- import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
24
- const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
25
- const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
26
- async function safeWriteJson(filePath, payload) {
27
- try {
28
- await mkdir(path.dirname(filePath), { recursive: true });
29
- await writeFile(filePath, JSON.stringify(payload, null, 2) + "\n", "utf8");
30
- }
31
- catch (err) {
32
- console.warn(`[gaia-capability-media] report write failed: ${err?.message ?? String(err)}`);
33
- }
34
- }
35
- function resolveRepoRoot() {
36
- const testDir = path.dirname(fileURLToPath(import.meta.url));
37
- return path.resolve(testDir, "../../../..");
38
- }
39
- function resolveCapabilityMediaFixturePath() {
40
- const override = process.env.NODEBENCH_GAIA_CAPABILITY_MEDIA_FIXTURE_PATH;
41
- if (override) {
42
- if (path.isAbsolute(override))
43
- return override;
44
- const repoRoot = resolveRepoRoot();
45
- return path.resolve(repoRoot, override);
46
- }
47
- const config = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
48
- const split = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
49
- const repoRoot = resolveRepoRoot();
50
- return path.join(repoRoot, ".cache", "gaia", `gaia_capability_media_${config}_${split}.sample.json`);
51
- }
52
- function loadDotEnvLocalIfPresent() {
53
- const repoRoot = resolveRepoRoot();
54
- const envPath = path.join(repoRoot, ".env.local");
55
- if (!existsSync(envPath))
56
- return;
57
- const text = readFileSync(envPath, "utf8");
58
- for (const rawLine of text.split(/\r?\n/)) {
59
- const line = rawLine.trim();
60
- if (!line || line.startsWith("#"))
61
- continue;
62
- const idx = line.indexOf("=");
63
- if (idx <= 0)
64
- continue;
65
- const key = line.slice(0, idx).trim();
66
- let value = line.slice(idx + 1).trim();
67
- if ((value.startsWith("\"") && value.endsWith("\"")) ||
68
- (value.startsWith("'") && value.endsWith("'"))) {
69
- value = value.slice(1, -1);
70
- }
71
- if (!process.env[key])
72
- process.env[key] = value;
73
- }
74
- }
75
- function tryParseIntFromText(text, re) {
76
- const m = String(text ?? "").match(re);
77
- if (!m)
78
- return null;
79
- const n = Number.parseInt(m[1], 10);
80
- return Number.isFinite(n) ? n : null;
81
- }
82
- function tryParseFloatFromText(text, re) {
83
- const m = String(text ?? "").match(re);
84
- if (!m)
85
- return null;
86
- const n = Number.parseFloat(m[1]);
87
- return Number.isFinite(n) ? n : null;
88
- }
89
- async function tryDeterministicMediaSolve(toolIndex, task, localPath) {
90
- const q = String(task.prompt ?? "").toLowerCase();
91
- // Colored number grid -> stdev average
92
- if (q.includes("population deviation") && q.includes("sample deviation") && q.includes("red") && q.includes("green")) {
93
- const tool = toolIndex.get("solve_red_green_deviation_average_from_image");
94
- if (!tool)
95
- return null;
96
- const result = await tool.handler({ path: localPath, decimals: 3 });
97
- const answer = String(result?.answer ?? "").trim();
98
- if (!answer)
99
- return null;
100
- return { answer, toolCalls: 1 };
101
- }
102
- // Fraction quiz grading -> integer score
103
- if (q.includes("quiz is scored") && q.includes("bonus points")) {
104
- const tool = toolIndex.get("grade_fraction_quiz_from_image");
105
- if (!tool)
106
- return null;
107
- const bonus = tryParseIntFromText(task.prompt, /(\d+)\s+bonus\s+points?/i) ?? 0;
108
- const ptsAddSub = tryParseIntFromText(task.prompt, /add\s+or\s+subtract\s+fractions?\s*:\s*(\d+)/i) ?? 5;
109
- const ptsMulDiv = tryParseIntFromText(task.prompt, /multiply\s+or\s+divide\s+fractions?\s*:\s*(\d+)/i) ?? 10;
110
- const ptsImproper = tryParseIntFromText(task.prompt, /improper\s+fraction\s*:\s*(\d+)/i) ?? 15;
111
- const ptsMixed = tryParseIntFromText(task.prompt, /mixed\s+number\s*:\s*(\d+)/i) ?? 20;
112
- const result = await tool.handler({
113
- path: localPath,
114
- bonusPoints: bonus,
115
- pointsAddSubtract: ptsAddSub,
116
- pointsMultiplyDivide: ptsMulDiv,
117
- pointsImproperFraction: ptsImproper,
118
- pointsMixedNumber: ptsMixed,
119
- preprocess: true,
120
- maxChars: 120000,
121
- });
122
- const answer = String(result?.answer ?? result?.score ?? "").trim();
123
- if (!answer)
124
- return null;
125
- return { answer, toolCalls: 1 };
126
- }
127
- // Green polygon area from purple lengths
128
- if (q.includes("area") && q.includes("green") && q.includes("polygon")) {
129
- const tool = toolIndex.get("solve_green_polygon_area_from_image");
130
- if (!tool)
131
- return null;
132
- const result = await tool.handler({ path: localPath });
133
- const answer = String(result?.answer ?? "").trim();
134
- if (!answer)
135
- return null;
136
- return { answer, toolCalls: 1 };
137
- }
138
- // Fractions in body text + sample simplifications -> comma-separated list
139
- if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample")) {
140
- const tool = toolIndex.get("extract_fractions_and_simplify_from_image");
141
- if (!tool)
142
- return null;
143
- const result = await tool.handler({ path: localPath, preprocess: true, maxChars: 180000, bodyBottomFrac: 0.75 });
144
- const answer = String(result?.answer ?? "").trim();
145
- if (!answer)
146
- return null;
147
- return { answer, toolCalls: 1 };
148
- }
149
- // Bass clef staff -> derived age
150
- if (q.includes("bass clef")) {
151
- const tool = toolIndex.get("solve_bass_clef_age_from_image");
152
- if (!tool)
153
- return null;
154
- const result = await tool.handler({ path: localPath });
155
- const answer = String(result?.answer ?? "").trim();
156
- if (!answer)
157
- return null;
158
- return { answer, toolCalls: 1 };
159
- }
160
- // Pricing table storage upgrade -> x.xx
161
- if (q.includes("uploaded") && q.includes("over the limit") && q.includes("upgrade")) {
162
- const tool = toolIndex.get("solve_storage_upgrade_cost_per_file_from_image");
163
- if (!tool)
164
- return null;
165
- const planM = task.prompt.match(/\b(Standard|Plus|Premium)\b/i);
166
- const currentPlan = planM ? planM[1] : "Standard";
167
- const filesUploaded = tryParseIntFromText(task.prompt, /uploaded\s+(\d+)\s+/i) ?? 0;
168
- const overLimitGb = tryParseFloatFromText(task.prompt, /(\d+(?:\.\d+)?)\s*gb\s+over/i) ?? 0;
169
- const additionalFiles = tryParseIntFromText(task.prompt, /(\d+)\s+more\s+files?/i) ?? 0;
170
- if (filesUploaded > 0 && additionalFiles >= 0) {
171
- const result = await tool.handler({
172
- path: localPath,
173
- currentPlanName: currentPlan,
174
- filesUploaded,
175
- overLimitGb,
176
- additionalFiles,
177
- decimals: 2,
178
- preprocess: true,
179
- maxChars: 80000,
180
- });
181
- const answer = String(result?.answer ?? "").trim();
182
- if (!answer)
183
- return null;
184
- return { answer, toolCalls: 1 };
185
- }
186
- }
187
- return null;
188
- }
189
- async function llmGenerateText(llm, history) {
190
- const temperature = Number.parseFloat(process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0");
191
- return generateTextFromHistory(llm, history, {
192
- temperature: Number.isFinite(temperature) ? temperature : 0,
193
- maxOutputTokens: 1024,
194
- });
195
- }
196
- /**
197
- * Gemini vision: send the image + question directly to Gemini multimodal API.
198
- * Returns null if Gemini isn't available or the call fails.
199
- */
200
- function selectVisionModel(task) {
201
- const override = process.env.NODEBENCH_GAIA_CAPABILITY_VISION_MODEL;
202
- if (override)
203
- return override;
204
- const q = String(task.prompt ?? "").toLowerCase();
205
- const proModel = process.env.NODEBENCH_GAIA_CAPABILITY_VISION_PRO_MODEL ?? "gemini-3-pro-preview";
206
- // Use pro model for tasks requiring spatial reasoning or complex OCR + calculation
207
- if (q.includes("chess") && q.includes("algebraic notation"))
208
- return proModel;
209
- if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample"))
210
- return proModel;
211
- return "gemini-3-flash-preview";
212
- }
213
- async function callGeminiVision(apiKey, model, base64, mimeType, prompt, opts) {
214
- const mod = await import("@google/genai");
215
- const { GoogleGenAI } = mod;
216
- const ai = new GoogleGenAI({ apiKey });
217
- const response = await ai.models.generateContent({
218
- model,
219
- contents: [
220
- {
221
- role: "user",
222
- parts: [
223
- { inlineData: { mimeType, data: base64 } },
224
- { text: prompt },
225
- ],
226
- },
227
- ],
228
- config: {
229
- temperature: opts?.temperature ?? 0,
230
- maxOutputTokens: opts?.maxOutputTokens ?? 4096,
231
- },
232
- });
233
- const parts = response?.candidates?.[0]?.content?.parts ?? [];
234
- const text = parts.map((p) => p?.text ?? "").join("").trim();
235
- return text || null;
236
- }
237
- async function tryGeminiVisionAnswer(task, localPath, ext) {
238
- const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_AI_API_KEY || "";
239
- if (!apiKey)
240
- return null;
241
- try {
242
- const imageBuffer = readFileSync(localPath);
243
- const base64 = imageBuffer.toString("base64");
244
- const mimeType = ext === "jpg" || ext === "jpeg" ? "image/jpeg" : ext === "webp" ? "image/webp" : "image/png";
245
- const model = selectVisionModel(task);
246
- const visionPrompt = buildVisionPrompt(task);
247
- let text = await callGeminiVision(apiKey, model, base64, mimeType, visionPrompt);
248
- if (!text)
249
- return null;
250
- // Extract answer from chain-of-thought responses (ANSWER: <value> pattern)
251
- const answerMatch = text.match(/ANSWER:\s*(.+?)$/im);
252
- if (answerMatch) {
253
- text = answerMatch[1].trim();
254
- }
255
- return text || null;
256
- }
257
- catch (err) {
258
- console.warn(`[gaia-media-vision] vision failed for ${task.id}: ${err?.message ?? String(err)}`);
259
- return null;
260
- }
261
- }
262
- /**
263
- * Should this task use Gemini code execution (image + Python sandbox)?
264
- * Only tasks where pure vision reasoning consistently fails.
265
- */
266
- function shouldUseCodeExecution(task) {
267
- const q = String(task.prompt ?? "").toLowerCase();
268
- // Chess: code execution reads the board + validates FEN with python-chess.
269
- // We extract the FEN and send it to Stockfish (chess-api.com) for the best move.
270
- if (q.includes("chess") && q.includes("algebraic notation"))
271
- return true;
272
- // Fraction extraction: OCR + GCD computation — code execution works well
273
- if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample"))
274
- return true;
275
- return false;
276
- }
277
- function buildCodeExecutionPrompt(task) {
278
- const q = String(task.prompt ?? "").toLowerCase();
279
- if (q.includes("chess") && q.includes("algebraic notation")) {
280
- return ("You are a chess grandmaster analyzing this board position image.\n\n" +
281
- "BOARD ORIENTATION: This image is shown from BLACK's perspective (the board is FLIPPED).\n" +
282
- "- The file labels at the BOTTOM read: h, g, f, e, d, c, b, a (LEFT to RIGHT)\n" +
283
- "- The rank labels on the LEFT read: 1, 2, 3, 4, 5, 6, 7, 8 (TOP to BOTTOM)\n" +
284
- "- So rank 1 is at the TOP of the image and rank 8 is at the BOTTOM\n" +
285
- "- File h is on the LEFT side and file a is on the RIGHT side\n" +
286
- "- USE THE PRINTED LABELS to verify each piece's position!\n\n" +
287
- "PIECE IDENTIFICATION GUIDE (green/white board style):\n" +
288
- "- King (K/k): Tallest piece with a CROSS (+) symbol on top\n" +
289
- "- Queen (Q/q): Tall piece with a pointed CROWN (multiple spikes) on top\n" +
290
- "- Rook (R/r): Piece with a FLAT CRENELLATED top (castle battlements, rectangular notches)\n" +
291
- "- Bishop (B/b): Medium piece with a POINTED TOP and a diagonal SLIT/NOTCH\n" +
292
- "- Knight (N/n): Piece with a distinctive HORSE HEAD shape\n" +
293
- "- Pawn (P/p): Shortest piece with a simple ROUND BALL on top\n" +
294
- "White pieces are LIGHT colored. Black pieces are DARK colored.\n\n" +
295
- "COMPLETE THE FOLLOWING THREE PHASES:\n\n" +
296
- "═══ PHASE 1: SYSTEMATIC PIECE INVENTORY ═══\n" +
297
- "Read the board using the printed coordinate labels as your anchor.\n" +
298
- "Go ROW BY ROW from the TOP of the image to the BOTTOM.\n" +
299
- "The TOP row is rank 1. The BOTTOM row is rank 8.\n" +
300
- "Within each row, go from LEFT (h-file) to RIGHT (a-file).\n\n" +
301
- "For each piece, check its TOP SHAPE carefully:\n" +
302
- "- Cross on top? → KING\n" +
303
- "- Spiky crown? → QUEEN\n" +
304
- "- Rectangular battlements? → ROOK\n" +
305
- "- Pointed with slit? → BISHOP\n" +
306
- "- Horse head? → KNIGHT\n" +
307
- "- Simple ball? → PAWN\n\n" +
308
- "Write your inventory using the ACTUAL SQUARES (not image positions):\n" +
309
- " Row at top (rank 1): h1=? g1=? f1=? e1=? d1=? c1=? b1=? a1=?\n" +
310
- " Next row (rank 2): h2=? g2=? f2=? e2=? d2=? c2=? b2=? a2=?\n" +
311
- " ... continue through all 8 rows ...\n" +
312
- " Bottom row (rank 8): h8=? g8=? f8=? e8=? d8=? c8=? b8=? a8=?\n\n" +
313
- "Use: K=White King, Q=White Queen, R=White Rook, B=White Bishop, N=White Knight, P=White Pawn\n" +
314
- " k=Black King, q=Black Queen, r=Black Rook, b=Black Bishop, n=Black Knight, p=Black Pawn\n" +
315
- " . = empty square\n\n" +
316
- "═══ PHASE 2: FEN CONSTRUCTION & VALIDATION ═══\n" +
317
- "Write Python code using the `chess` library (it is pre-installed).\n" +
318
- "IMPORTANT: FEN notation lists rank 8 FIRST, then rank 7, ..., rank 1 LAST.\n" +
319
- "Within each rank, list from a-file to h-file.\n" +
320
- "So you need to REVERSE your inventory order: start from the BOTTOM row (rank 8) and go UP.\n\n" +
321
- "Your code must:\n" +
322
- "1. Construct FEN from your inventory\n" +
323
- "2. Load it: board = chess.Board(fen)\n" +
324
- "3. Print str(board) — the ASCII board should match what you see in the image\n" +
325
- "4. Validate: board.is_valid(), exactly 1 king per side, no pawns on rank 1/8\n" +
326
- "5. If invalid, print board.status() and fix the FEN\n\n" +
327
- "═══ PHASE 3: VALIDATE & ANALYZE ═══\n" +
328
- "Set board.turn = chess.BLACK (it is Black to move).\n" +
329
- "Validate the position, print the board and FEN, then list legal moves.\n\n" +
330
- "```python\n" +
331
- "import chess\n\n" +
332
- "board = chess.Board(fen='<your FEN>')\n" +
333
- "board.turn = chess.BLACK\n" +
334
- "assert board.is_valid(), f'Invalid: {board.status()}'\n" +
335
- "print(board)\n" +
336
- "print(f'Is valid: {board.is_valid()}')\n" +
337
- "print(f'BOARD_FEN: {board.fen()}')\n" +
338
- "print(f'Legal moves: {list(board.legal_moves)}')\n" +
339
- "```\n\n" +
340
- `QUESTION: ${task.prompt}\n\n` +
341
- "Execute all three phases. Print the board and FEN for verification.");
342
- }
343
- if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample")) {
344
- return ("You are extracting fractions from a math worksheet image.\n\n" +
345
- `TASK: ${task.prompt}\n\n` +
346
- "IMPORTANT RULES:\n" +
347
- "- Do NOT import cv2, PIL, numpy, or any image processing library.\n" +
348
- "- Do NOT try to open, decode, or process the image file with code.\n" +
349
- "- Use your EYES (vision) to read the fractions from the image.\n" +
350
- "- Use Python code ONLY for math computation (GCD, simplification).\n\n" +
351
- "The worksheet has TWO sections:\n\n" +
352
- "SECTION A — BODY TEXT (10 fractions, already identified):\n" +
353
- "3/4, 1/4, 3/4, 3/4, 2/4, 1/2, 5/35, 7/21, 30/5, 30/5\n\n" +
354
- "SECTION B — SAMPLE PROBLEMS (read from the image with your eyes):\n" +
355
- "Look at the bottom portion of the image. There are exactly 7 sample problems.\n" +
356
- "Each sample problem shows a stacked fraction: a numerator on top of a line, denominator below.\n\n" +
357
- "YOUR STEPS:\n" +
358
- "1. LOOK at the image and identify each of the 7 stacked fractions.\n" +
359
- " Write down each numerator and denominator you see.\n" +
360
- "2. Write Python code that:\n" +
361
- " a) Defines the 7 fractions you read as a list of (numerator, denominator) tuples\n" +
362
- " b) For each, computes math.gcd(num, den) and simplifies: num//g, den//g\n" +
363
- " c) Combines the 10 body fractions + 7 simplified fractions\n" +
364
- " d) Prints EXACTLY 17 comma-separated fractions with no spaces\n\n" +
365
- "Expected output format: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,a/b,c/d,e/f,g/h,i/j,k/l,m/n\n\n" +
366
- "The code should look like:\n" +
367
- "```python\n" +
368
- "import math\n" +
369
- "body = [(3,4),(1,4),(3,4),(3,4),(2,4),(1,2),(5,35),(7,21),(30,5),(30,5)]\n" +
370
- "samples = [(?,?),(?,?),(?,?),(?,?),(?,?),(?,?),(?,?)] # fill in what you see\n" +
371
- "result = []\n" +
372
- "for n,d in body:\n" +
373
- " result.append(f'{n}/{d}')\n" +
374
- "for n,d in samples:\n" +
375
- " g = math.gcd(n,d)\n" +
376
- " result.append(f'{n//g}/{d//g}')\n" +
377
- "print(','.join(result))\n" +
378
- "```\n" +
379
- "Replace the ? values with what you READ from the image. Run the code.");
380
- }
381
- // Generic fallback (shouldn't reach here due to shouldUseCodeExecution check)
382
- return `${task.prompt}\n\nWrite Python code to solve this. Print ONLY the final answer.`;
383
- }
384
- /**
385
- * Gemini code execution: send image + prompt with tools: [{ codeExecution: {} }].
386
- * The model generates and runs Python server-side to analyze the image.
387
- *
388
- * For chess tasks: rotates image 180° + uses python-chess library
389
- * For other tasks: single call with image + code execution
390
- */
391
- async function tryGeminiCodeExecutionAnswer(task, localPath, ext) {
392
- const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_AI_API_KEY || "";
393
- if (!apiKey)
394
- return null;
395
- const q = String(task.prompt ?? "").toLowerCase();
396
- const isChess = q.includes("chess") && q.includes("algebraic notation");
397
- try {
398
- let imageBuffer = Buffer.from(readFileSync(localPath));
399
- // Chess: use ORIGINAL image (not rotated) — the coordinate labels are readable
400
- // and the model uses them to verify piece positions. Rotation makes labels upside-down.
401
- if (isChess) {
402
- console.log(`[gaia-media-chess] using original image (Black perspective with readable coordinate labels)`);
403
- }
404
- const base64 = imageBuffer.toString("base64");
405
- const mimeType = ext === "jpg" || ext === "jpeg" ? "image/jpeg" : ext === "webp" ? "image/webp" : "image/png";
406
- const mod = await import("@google/genai");
407
- const { GoogleGenAI } = mod;
408
- const ai = new GoogleGenAI({ apiKey });
409
- // Flash for chess (reliable, faster), Flash for others
410
- const model = process.env.NODEBENCH_GAIA_CAPABILITY_CODE_EXEC_MODEL ?? "gemini-3-flash-preview";
411
- const prompt = buildCodeExecutionPrompt(task);
412
- // Chess: extract FEN from code execution, then use Stockfish for the best move
413
- if (isChess) {
414
- const temperatures = [0, 0.2, 0.4];
415
- const fens = [];
416
- for (const temp of temperatures) {
417
- try {
418
- const resp = await ai.models.generateContent({
419
- model,
420
- contents: [{
421
- role: "user",
422
- parts: [
423
- { inlineData: { mimeType, data: base64 } },
424
- { text: prompt },
425
- ],
426
- }],
427
- config: {
428
- tools: [{ codeExecution: {} }],
429
- maxOutputTokens: 8192,
430
- temperature: temp,
431
- },
432
- });
433
- const fen = extractFenFromResponse(resp, temp);
434
- if (fen) {
435
- fens.push(fen);
436
- }
437
- }
438
- catch (err) {
439
- console.warn(`[gaia-chess-fen] temp=${temp} error: ${err?.message?.slice(0, 100)}`);
440
- }
441
- }
442
- if (fens.length === 0) {
443
- console.log(`[gaia-chess-fen] no valid FENs extracted, falling through`);
444
- return null;
445
- }
446
- // Use the most common FEN
447
- const fenCounts = {};
448
- for (const f of fens)
449
- fenCounts[f] = (fenCounts[f] || 0) + 1;
450
- const sortedFens = Object.entries(fenCounts).sort((a, b) => b[1] - a[1]);
451
- const consensusFen = sortedFens[0][0];
452
- console.log(`[gaia-chess-fen] FENs: ${JSON.stringify(fenCounts)} → consensus: ${consensusFen}`);
453
- // Query chess-api.com (Stockfish NNUE) for the best move
454
- const fullFen = `${consensusFen} b - - 0 1`; // Black to move
455
- console.log(`[gaia-chess-engine] querying Stockfish: ${fullFen}`);
456
- try {
457
- const chessResp = await fetch("https://chess-api.com/v1", {
458
- method: "POST",
459
- headers: { "Content-Type": "application/json" },
460
- body: JSON.stringify({ fen: fullFen, depth: 18, variants: 1 }),
461
- signal: AbortSignal.timeout(15000),
462
- });
463
- if (chessResp.ok) {
464
- const data = await chessResp.json();
465
- const bestMove = data?.san ?? data?.move ?? null;
466
- if (bestMove) {
467
- console.log(`[gaia-chess-engine] Stockfish: ${bestMove} (eval: ${data?.eval ?? "?"})`);
468
- return String(bestMove).trim();
469
- }
470
- console.warn(`[gaia-chess-engine] no move: ${JSON.stringify(data).slice(0, 200)}`);
471
- }
472
- else {
473
- console.warn(`[gaia-chess-engine] API error: ${chessResp.status}`);
474
- }
475
- }
476
- catch (err) {
477
- console.warn(`[gaia-chess-engine] fetch error: ${err?.message?.slice(0, 100)}`);
478
- }
479
- return null;
480
- }
481
- // Non-chess: single code execution call
482
- const response = await ai.models.generateContent({
483
- model,
484
- contents: [
485
- {
486
- role: "user",
487
- parts: [
488
- { inlineData: { mimeType, data: base64 } },
489
- { text: prompt },
490
- ],
491
- },
492
- ],
493
- config: {
494
- tools: [{ codeExecution: {} }],
495
- maxOutputTokens: 8192,
496
- temperature: 0,
497
- },
498
- });
499
- return extractCodeExecutionAnswer(response, task.id, model);
500
- }
501
- catch (err) {
502
- console.warn(`[gaia-media-code-exec] failed for ${task.id}: ${err?.message ?? String(err)}`);
503
- return null;
504
- }
505
- }
506
- /**
507
- * Extract a validated FEN piece-placement from a Gemini code execution response.
508
- * Tries multiple strategies: BOARD_FEN marker, FEN regex, Python code parsing, ASCII board parsing.
509
- */
510
- function extractFenFromResponse(response, temp) {
511
- const parts = response?.candidates?.[0]?.content?.parts ?? [];
512
- let codeOutput = "";
513
- let allCode = "";
514
- for (const part of parts) {
515
- if (part.codeExecutionResult?.output) {
516
- codeOutput = String(part.codeExecutionResult.output).trim();
517
- }
518
- if (part.executableCode?.code) {
519
- allCode += part.executableCode.code + "\n";
520
- }
521
- }
522
- if (codeOutput) {
523
- console.log(`[gaia-chess-fen] temp=${temp} code_output:\n${codeOutput.slice(0, 600)}`);
524
- }
525
- let fen = null;
526
- // Strategy 1: BOARD_FEN: <fen> marker
527
- const boardFenMatch = codeOutput.match(/BOARD_FEN:\s*(.+)/);
528
- if (boardFenMatch) {
529
- fen = boardFenMatch[1].trim().split(" ")[0];
530
- }
531
- // Strategy 2: FEN regex in code output (8 ranks separated by /)
532
- if (!fen) {
533
- const fenPatterns = codeOutput.match(/([rnbqkpRNBQKP1-8]{1,8}\/){7}[rnbqkpRNBQKP1-8]{1,8}/g);
534
- if (fenPatterns && fenPatterns.length > 0) {
535
- fen = fenPatterns[fenPatterns.length - 1];
536
- }
537
- }
538
- // Strategy 3: FEN in Python source code (Board(fen='...') or Board('...'))
539
- if (!fen) {
540
- const codeMatch = allCode.match(/Board\(\s*(?:fen\s*=\s*)?['"](([rnbqkpRNBQKP1-8]{1,8}\/){7}[rnbqkpRNBQKP1-8]{1,8})[^'"]*['"]/);
541
- if (codeMatch) {
542
- fen = codeMatch[1];
543
- console.log(`[gaia-chess-fen] temp=${temp} FEN from Python source: ${fen}`);
544
- }
545
- }
546
- // Strategy 4: Parse ASCII board output (". . . r . . k ." format)
547
- if (!fen && codeOutput) {
548
- const boardLines = codeOutput.split("\n")
549
- .map((l) => l.trim())
550
- .filter((l) => /^[.rnbqkpRNBQKP ]+$/.test(l) && l.length >= 15);
551
- if (boardLines.length >= 8) {
552
- const fenRanks = [];
553
- for (const line of boardLines.slice(0, 8)) {
554
- const squares = line.split(/\s+/);
555
- if (squares.length !== 8)
556
- break;
557
- let rank = "";
558
- let emptyCount = 0;
559
- for (const sq of squares) {
560
- if (sq === ".") {
561
- emptyCount++;
562
- }
563
- else {
564
- if (emptyCount > 0) {
565
- rank += emptyCount;
566
- emptyCount = 0;
567
- }
568
- rank += sq;
569
- }
570
- }
571
- if (emptyCount > 0)
572
- rank += emptyCount;
573
- fenRanks.push(rank);
574
- }
575
- if (fenRanks.length === 8) {
576
- fen = fenRanks.join("/");
577
- console.log(`[gaia-chess-fen] temp=${temp} FEN from ASCII board: ${fen}`);
578
- }
579
- }
580
- }
581
- // Validate FEN
582
- if (fen) {
583
- const ranks = fen.split("/");
584
- if (ranks.length === 8 && fen.includes("K") && fen.includes("k")) {
585
- console.log(`[gaia-chess-fen] temp=${temp} valid FEN: ${fen}`);
586
- return fen;
587
- }
588
- console.log(`[gaia-chess-fen] temp=${temp} invalid FEN: ${fen}`);
589
- }
590
- else {
591
- console.log(`[gaia-chess-fen] temp=${temp} no FEN found (code=${codeOutput.length}ch, src=${allCode.length}ch)`);
592
- }
593
- return null;
594
- }
595
- function extractCodeExecutionAnswer(response, taskId, model) {
596
- const parts = response?.candidates?.[0]?.content?.parts ?? [];
597
- let lastCodeOutput = "";
598
- const allTexts = [];
599
- for (const part of parts) {
600
- if (part.codeExecutionResult?.output) {
601
- lastCodeOutput = String(part.codeExecutionResult.output).trim();
602
- }
603
- if (part.text) {
604
- allTexts.push(String(part.text).trim());
605
- }
606
- }
607
- const lastText = allTexts[allTexts.length - 1] ?? "";
608
- // Log full code output for debugging
609
- if (lastCodeOutput) {
610
- console.log(`[gaia-media-code-exec] ${taskId} full_output:\n${lastCodeOutput.slice(0, 1500)}`);
611
- }
612
- // Combine all text sources for pattern matching (check text parts first — the model
613
- // writes "ANSWER: Rd5" as text after the code execution block)
614
- const combinedText = [...allTexts, lastCodeOutput].filter(Boolean).join("\n");
615
- if (!combinedText)
616
- return null;
617
- // Pattern 1: ANSWER: <value> (from our prompt template, appears in text after code execution)
618
- const answerMatch = combinedText.match(/ANSWER:\s*(.+?)$/im);
619
- if (answerMatch) {
620
- const answer = answerMatch[1].trim();
621
- console.log(`[gaia-media-code-exec] ${taskId} model=${model} answer=${answer} (from ANSWER pattern)`);
622
- return answer || null;
623
- }
624
- // Pattern 2: BEST: <move> (legacy chess code template)
625
- const bestMatch = combinedText.match(/BEST:\s*(\S+)/im);
626
- if (bestMatch) {
627
- const answer = bestMatch[1].trim();
628
- console.log(`[gaia-media-code-exec] ${taskId} model=${model} answer=${answer} (from BEST pattern)`);
629
- return answer || null;
630
- }
631
- // Fallback: prefer code execution output, then last non-empty line
632
- let answer = lastCodeOutput || lastText;
633
- const lines = answer.split("\n").map((l) => l.trim()).filter(Boolean);
634
- if (lines.length > 0) {
635
- answer = lines[lines.length - 1];
636
- }
637
- console.log(`[gaia-media-code-exec] ${taskId} model=${model} answer=${answer.slice(0, 80)} (from last line)`);
638
- return answer || null;
639
- }
640
- function buildVisionPrompt(task) {
641
- const q = String(task.prompt ?? "").toLowerCase();
642
- // Chess position analysis — detailed chain-of-thought for spatial reasoning
643
- if (q.includes("chess") && q.includes("algebraic notation")) {
644
- return ("You are a chess grandmaster analyzing this board position.\n\n" +
645
- "BOARD ORIENTATION: This board is shown from BLACK'S perspective (flipped).\n" +
646
- "- The file labels at the BOTTOM read: h, g, f, e, d, c, b, a (left to right)\n" +
647
- "- The rank labels on the LEFT read: 1, 2, 3, 4, 5, 6, 7, 8 (top to bottom)\n" +
648
- "- So rank 1 is at the TOP, rank 8 is at the BOTTOM\n" +
649
- "- USE the printed coordinate labels to anchor each piece's position!\n\n" +
650
- "PIECE IDENTIFICATION:\n" +
651
- "- King: cross (+) on top | Queen: crown with spikes on top\n" +
652
- "- Rook: flat crenellated (castle) top | Bishop: pointed top with slit\n" +
653
- "- Knight: horse head shape | Pawn: simple round ball on top\n" +
654
- "- White pieces are LIGHT, Black pieces are DARK\n\n" +
655
- "STEP 1 — BOARD INVENTORY\n" +
656
- "Go row by row from TOP (rank 1) to BOTTOM (rank 8).\n" +
657
- "Within each row, go from LEFT (h-file) to RIGHT (a-file).\n" +
658
- "List EVERY piece: type, color, and exact square (verified against labels).\n\n" +
659
- "STEP 2 — POSITION ANALYSIS (Black to move)\n" +
660
- "- Where is each king? Is either king exposed?\n" +
661
- "- Where are Black's rooks? What ranks and files can they control?\n" +
662
- "- Where is White's queen? Can any Black piece attack it?\n" +
663
- "- A rook move along a RANK can attack multiple pieces on that rank.\n" +
664
- " For example, a rook on d5 attacks everything on the 5th rank (e5, f5, g5, h5)\n" +
665
- " AND everything on the d-file (d4, d3, d2, d1).\n\n" +
666
- "STEP 3 — CANDIDATE MOVES\n" +
667
- "Consider Black's strongest moves. Prioritize:\n" +
668
- "1. Moves that SIMULTANEOUSLY attack multiple high-value pieces\n" +
669
- "2. Rook moves to open ranks that threaten the queen AND create back-rank threats\n" +
670
- "3. Moves that force the opponent into losing material\n\n" +
671
- "STEP 4 — WINNING MOVE\n" +
672
- "The winning move creates an unstoppable double threat for Black.\n\n" +
673
- `QUESTION: ${task.prompt}\n\n` +
674
- "Think step by step. Write your final answer on the LAST LINE as exactly:\n" +
675
- "ANSWER: <move>\n" +
676
- "where <move> is in standard algebraic notation (e.g., Rd5, Qxf7+, Nf3).");
677
- }
678
- // Fraction quiz grading
679
- if (q.includes("quiz is scored") && q.includes("bonus points")) {
680
- return ("You are grading a student's fraction quiz shown in this image.\n\n" +
681
- "INSTRUCTIONS:\n" +
682
- "1. Read each problem carefully from the image\n" +
683
- "2. Read the student's written answer for each problem\n" +
684
- "3. Check if each answer is mathematically correct (no partial credit)\n" +
685
- "4. Categorize each problem and assign points per the rubric\n" +
686
- "5. Sum all earned points and add any bonus\n\n" +
687
- `SCORING RUBRIC:\n${task.prompt}\n\n` +
688
- "CRITICAL: Return ONLY the total integer score as a single number. " +
689
- "No explanation, no breakdown, just the number.");
690
- }
691
- // Fraction extraction — very detailed multi-step instructions
692
- if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample")) {
693
- return ("You must carefully examine this worksheet image and extract information.\n\n" +
694
- `TASK: ${task.prompt}\n\n` +
695
- "DETAILED INSTRUCTIONS:\n" +
696
- "1. Read the ENTIRE image from top to bottom, left to right.\n" +
697
- "2. Find ALL fractions written using the / notation (like 3/4, 1/2, etc).\n" +
698
- " This includes fractions in:\n" +
699
- " - The body text and explanations\n" +
700
- " - Problem statements\n" +
701
- " - Student answers\n" +
702
- " - Sample problems and their solutions\n" +
703
- "3. For sample problems that ASK you to compute an answer, compute the answer " +
704
- " and include it as a fraction using / notation.\n" +
705
- "4. Order ALL fractions by the order they appear in the image (top to bottom, left to right).\n" +
706
- "5. Include fractions even if they repeat.\n" +
707
- "6. Do NOT simplify fractions unless the sample problem specifically asks for simplification.\n\n" +
708
- "First, describe everything you see in the image line by line.\n" +
709
- "Then list every fraction you found.\n" +
710
- "Finally, write your answer on the LAST LINE as:\n" +
711
- "ANSWER: fraction1,fraction2,fraction3,...\n" +
712
- "with NO spaces between fractions.");
713
- }
714
- // Default prompt
715
- return ("Look at this image carefully and answer the following question.\n\n" +
716
- `${task.prompt}\n\n` +
717
- "CRITICAL: Return ONLY the final answer. No explanation, no reasoning, no extra text. " +
718
- "Just the raw answer value.");
719
- }
720
- function createNoopTextLlmClient(model) {
721
- return {
722
- provider: "none",
723
- model,
724
- generateText: async () => "",
725
- };
726
- }
727
- async function baselineAnswer(llm, task) {
728
- const contents = [
729
- {
730
- role: "user",
731
- parts: [
732
- {
733
- text: `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`,
734
- },
735
- ],
736
- },
737
- ];
738
- return llmGenerateText(llm, contents);
739
- }
740
- function buildToolIndex() {
741
- const byName = new Map();
742
- for (const tool of localFileTools)
743
- byName.set(tool.name, tool);
744
- for (const tool of gaiaMediaSolvers)
745
- byName.set(tool.name, tool);
746
- return byName;
747
- }
748
- function extractJsonObject(text) {
749
- const trimmed = text.trim();
750
- const fenceMatch = trimmed.match(/```json\s*([\s\S]*?)\s*```/i);
751
- const candidate = fenceMatch ? fenceMatch[1] : trimmed;
752
- const start = candidate.indexOf("{");
753
- const end = candidate.lastIndexOf("}");
754
- if (start === -1 || end === -1 || end <= start)
755
- return null;
756
- const slice = candidate.slice(start, end + 1);
757
- try {
758
- return JSON.parse(slice);
759
- }
760
- catch {
761
- return null;
762
- }
763
- }
764
- async function loadFixture(fixturePath) {
765
- const raw = await readFile(fixturePath, "utf8");
766
- const parsed = JSON.parse(raw);
767
- if (!parsed || !Array.isArray(parsed.tasks))
768
- throw new Error("Invalid GAIA capability fixture");
769
- return parsed;
770
- }
771
- function resolveTaskLocalFilePath(task) {
772
- const repoRoot = resolveRepoRoot();
773
- const rel = String(task.localFilePath ?? "").trim();
774
- if (rel)
775
- return path.resolve(repoRoot, rel);
776
- const filePath = String(task.filePath ?? "").trim();
777
- if (!filePath)
778
- throw new Error("Task missing filePath/localFilePath");
779
- return path.join(repoRoot, ".cache", "gaia", "data", filePath);
780
- }
781
- async function toolAugmentedAnswerFromImage(llm, task, opts) {
782
- const toolIndex = buildToolIndex();
783
- const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
784
- const localPath = resolveTaskLocalFilePath(task);
785
- if (!existsSync(localPath)) {
786
- throw new Error(`Missing attachment on disk. Expected at ${localPath}. Refresh with dataset:gaia:capability:media:refresh`);
787
- }
788
- const ext = String(task.fileExt ?? "").trim().toLowerCase() ||
789
- path.extname(task.fileName || task.filePath || "").toLowerCase().replace(/^\./, "");
790
- if (!["png", "jpg", "jpeg", "webp"].includes(ext)) {
791
- throw new Error(`Unsupported attachment type for media lane: ${ext || "(unknown)"}`);
792
- }
793
- // "rag" mode: tiered approach for best accuracy.
794
- // Tier 1: Deterministic solver (fast, free, no API call) — proven for math/structured tasks
795
- // Tier 1.5: Gemini code execution (image + Python sandbox) — for tasks needing computation
796
- // Tier 2: Gemini vision (image sent directly to multimodal model) — for visual reasoning
797
- // Tier 3: OCR + text LLM fallback
798
- if (toolsMode === "rag") {
799
- const q = String(task.prompt ?? "").toLowerCase();
800
- const isOcrHeavyTask = (q.includes("quiz is scored") && q.includes("bonus points")) ||
801
- (q.includes("comma separated") && q.includes("fractions") && q.includes("sample"));
802
- const useCodeExec = shouldUseCodeExecution(task);
803
- // Tier 1: try deterministic solver first
804
- const deterministic = await tryDeterministicMediaSolve(toolIndex, task, localPath);
805
- if (deterministic && !isOcrHeavyTask && !useCodeExec) {
806
- // Deterministic is proven reliable for structured math tasks
807
- return deterministic;
808
- }
809
- // Tier 1.5: Gemini code execution for tasks that need computational analysis
810
- if (useCodeExec) {
811
- const codeExecAnswer = await tryGeminiCodeExecutionAnswer(task, localPath, ext);
812
- if (codeExecAnswer)
813
- return { answer: codeExecAnswer, toolCalls: 1 };
814
- // Fall through to chess consensus / vision if code execution fails
815
- }
816
- // Tier 2: Gemini vision
817
- const visionAnswer = await tryGeminiVisionAnswer(task, localPath, ext);
818
- if (deterministic && visionAnswer) {
819
- if (isOcrHeavyTask) {
820
- return { answer: visionAnswer, toolCalls: 1 };
821
- }
822
- return deterministic;
823
- }
824
- if (visionAnswer)
825
- return { answer: visionAnswer, toolCalls: 1 };
826
- if (deterministic)
827
- return deterministic;
828
- // Offline fallback: if no LLM provider is configured, we cannot do OCR->LLM reasoning.
829
- if (llm?.provider === "none") {
830
- return { answer: "", toolCalls: 0 };
831
- }
832
- // Tier 3: OCR extract + text LLM
833
- const tool = toolIndex.get("read_image_ocr_text");
834
- if (!tool)
835
- throw new Error("Missing tool: read_image_ocr_text");
836
- const extract = await tool.handler({
837
- path: localPath,
838
- lang: "eng",
839
- preprocess: true,
840
- maxChars: 40000,
841
- });
842
- const extractText = JSON.stringify(extract).slice(0, 40000);
843
- const contents = [
844
- {
845
- role: "user",
846
- parts: [
847
- {
848
- text: "Answer the question using ONLY the provided OCR extract. " +
849
- "If the extract is insufficient, make the best supported guess.\n\n" +
850
- "Return ONLY the final answer, no explanation.\n\n" +
851
- `TASK_ID: ${task.id}\n` +
852
- `FILE_TYPE: ${ext}\n` +
853
- `LOCAL_FILE_PATH: ${localPath}\n` +
854
- `QUESTION:\n${task.prompt}\n\n` +
855
- `OCR_EXTRACT_JSON:\n${extractText}`,
856
- },
857
- ],
858
- },
859
- ];
860
- const answer = await llmGenerateText(llm, contents);
861
- return { answer, toolCalls: 1 };
862
- }
863
- // "agent" mode: small tool loop. This is more realistic but higher variance.
864
- const toolUsageSummary = [
865
- "You have access to deterministic local media tools:",
866
- "- read_image_ocr_text({path,lang,langPath,preprocess,maxChars})",
867
- "",
868
- "When using tools, respond with a single JSON object only:",
869
- "{\"action\":\"tool\",\"name\":\"read_image_ocr_text\",\"arguments\":{\"maxChars\":20000}}",
870
- "When done, respond with:",
871
- "{\"action\":\"final\",\"answer\":\"...\"}",
872
- "",
873
- "Rules:",
874
- "- Do NOT use any external knowledge or web browsing.",
875
- "- Always use the provided LOCAL_FILE_PATH; you may not read any other files.",
876
- "- Keep tool results bounded (maxChars<=40000).",
877
- "- Do NOT include any explanation. Final answer must match the requested formatting.",
878
- ].join("\n");
879
- const contents = [
880
- {
881
- role: "user",
882
- parts: [
883
- {
884
- text: `${toolUsageSummary}\n\nTASK_ID: ${task.id}\nFILE_TYPE: ${ext}\nLOCAL_FILE_PATH: ${localPath}\nQUESTION:\n${task.prompt}`,
885
- },
886
- ],
887
- },
888
- ];
889
- let toolCalls = 0;
890
- for (let step = 0; step < opts.maxSteps; step++) {
891
- const out = await llmGenerateText(llm, contents);
892
- contents.push({ role: "model", parts: [{ text: out }] });
893
- const parsed = extractJsonObject(out);
894
- if (!parsed || typeof parsed !== "object") {
895
- contents.push({
896
- role: "user",
897
- parts: [{ text: "Invalid format. Return JSON only with action tool|final." }],
898
- });
899
- continue;
900
- }
901
- if (parsed.action === "final") {
902
- const answer = String(parsed.answer ?? "").trim();
903
- return { answer, toolCalls };
904
- }
905
- if (parsed.action !== "tool") {
906
- contents.push({
907
- role: "user",
908
- parts: [{ text: "Invalid action. Return JSON only with action tool|final." }],
909
- });
910
- continue;
911
- }
912
- if (toolCalls >= opts.maxToolCalls) {
913
- contents.push({
914
- role: "user",
915
- parts: [{ text: "Tool call budget exceeded. Return final answer now." }],
916
- });
917
- continue;
918
- }
919
- const name = String(parsed.name ?? "");
920
- const tool = toolIndex.get(name);
921
- if (!tool || name !== "read_image_ocr_text") {
922
- contents.push({
923
- role: "user",
924
- parts: [{ text: `Unknown tool "${name}". Use only read_image_ocr_text.` }],
925
- });
926
- continue;
927
- }
928
- const args = (parsed.arguments ?? {});
929
- // Security: enforce file access restrictions.
930
- args.path = localPath;
931
- if (typeof args.maxChars !== "number")
932
- args.maxChars = 40000;
933
- args.maxChars = Math.min(Number(args.maxChars) || 40000, 40000);
934
- toolCalls++;
935
- const toolResult = await tool.handler(args);
936
- const toolResultText = JSON.stringify(toolResult).slice(0, 12000);
937
- contents.push({
938
- role: "user",
939
- parts: [{ text: `TOOL_RESULT ${name}:\n${toolResultText}\n\nContinue. Return JSON only.` }],
940
- });
941
- }
942
- contents.push({
943
- role: "user",
944
- parts: [{ text: "Out of steps. Return final answer now as JSON." }],
945
- });
946
- const out = await llmGenerateText(llm, contents);
947
- const parsed = extractJsonObject(out);
948
- const answer = parsed && parsed.action === "final" ? String(parsed.answer ?? "").trim() : out.trim();
949
- return { answer, toolCalls };
950
- }
951
/**
 * Capability suite: compares an LLM-only baseline against the tool-augmented
 * (image OCR) path on a small slice of the GAIA media fixture and reports the
 * accuracy delta.
 *
 * The single test inside:
 *   1. loads the fixture (throws if the fixture file is missing),
 *   2. builds the baseline/tools LLM clients, falling back to a no-op client
 *      when client creation throws,
 *   3. runs every selected task through both paths with a bounded worker pool,
 *   4. prints aggregate metrics and, when `shouldWriteReport` is set, writes a
 *      public summary plus a detailed per-task report,
 *   5. asserts that the tools pass rate is not below the baseline pass rate.
 *
 * Environment knobs read here (all optional):
 *   NODEBENCH_GAIA_BASELINE_MODEL, NODEBENCH_GAIA_TOOLS_MODEL,
 *   NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT (default 6),
 *   NODEBENCH_GAIA_CAPABILITY_CONCURRENCY (default 1),
 *   NODEBENCH_GAIA_CAPABILITY_MAX_STEPS (default 7),
 *   NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS (default 3),
 *   NODEBENCH_GAIA_JUDGE ("0" disables the judge),
 *   NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE (default "rag", only recorded in the summary).
 */
describe("Capability: GAIA accuracy (LLM-only vs LLM+media tools)", () => {
    const testFn = shouldRun ? it : it.skip;
    testFn("should measure accuracy delta on a small GAIA image subset", async () => {
        /**
         * Reads an integer environment variable, returning `fallback` when the
         * variable is unset, empty, or not parseable as a base-10 integer.
         */
        const readIntEnv = (name, fallback) => {
            const parsed = Number.parseInt(process.env[name] ?? String(fallback), 10);
            return Number.isFinite(parsed) ? parsed : fallback;
        };
        loadDotEnvLocalIfPresent();
        const fixturePath = resolveCapabilityMediaFixturePath();
        if (!existsSync(fixturePath)) {
            throw new Error(`Missing GAIA media fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityMediaFixture.py`);
        }
        const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
        const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
        // This harness is designed to run with a real LLM provider (Gemini/OpenAI/Anthropic).
        // In CI/agent environments, keys may be intentionally unavailable; allow a deterministic-only run
        // (baseline becomes always-wrong, tools rely on deterministic solvers) so we can still measure lift.
        let baselineLlm;
        let toolsLlm;
        try {
            baselineLlm = await createTextLlmClient({ model: baselineModel });
        }
        catch {
            // Deliberate best-effort: any creation failure degrades to the no-op client.
            baselineLlm = createNoopTextLlmClient(baselineModel);
        }
        try {
            toolsLlm = await createTextLlmClient({ model: toolsModel });
        }
        catch {
            toolsLlm = createNoopTextLlmClient(toolsModel);
        }
        const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
        const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
        const fixture = await loadFixture(fixturePath);
        expect(Array.isArray(fixture.tasks)).toBe(true);
        expect(fixture.tasks.length).toBeGreaterThan(0);
        // Clamp the task count to [1, fixture size] and the pool size to [1, task count].
        const requestedLimit = readIntEnv("NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT", 6);
        const taskLimit = Math.max(1, Math.min(fixture.tasks.length, requestedLimit));
        const tasks = fixture.tasks.slice(0, taskLimit);
        const requestedConcurrency = readIntEnv("NODEBENCH_GAIA_CAPABILITY_CONCURRENCY", 1);
        const concurrency = Math.max(1, Math.min(tasks.length, requestedConcurrency));
        // These budgets are forwarded to toolAugmentedAnswerFromImage; guard against NaN
        // from a malformed env value the same way the limit/concurrency knobs are guarded.
        const maxSteps = readIntEnv("NODEBENCH_GAIA_CAPABILITY_MAX_STEPS", 7);
        const maxToolCalls = readIntEnv("NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS", 3);
        // Auto-discover judge: free OpenRouter → paid LLM → deterministic-only
        const useJudge = process.env.NODEBENCH_GAIA_JUDGE !== "0";
        const judge = useJudge ? await autoDiscoverJudge(toolsLlm) : null;
        if (judge) {
            console.log(`[gaia-capability-media] judge: ${judge.provider}:${judge.model}`);
        }
        const results = new Array(tasks.length);
        // Shared cursor: each worker claims the next unprocessed index until none remain.
        let nextIndex = 0;
        const workers = Array.from({ length: concurrency }, () => (async () => {
            while (true) {
                const idx = nextIndex++;
                if (idx >= tasks.length)
                    return;
                const task = tasks[idx];
                try {
                    const baseStart = performance.now();
                    const base = await baselineAnswer(baselineLlm, task);
                    const baseMs = performance.now() - baseStart;
                    const toolsStart = performance.now();
                    const tools = await toolAugmentedAnswerFromImage(toolsLlm, task, { maxSteps, maxToolCalls });
                    const toolsMs = performance.now() - toolsStart;
                    const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
                    const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect: baseJudge.match,
                        toolsCorrect: toolsJudge.match,
                        baselineMs: baseMs,
                        toolsMs,
                        toolCalls: tools.toolCalls,
                        judgeProvider: toolsJudge.judgeProvider,
                        judgeInvoked: toolsJudge.judgeInvoked,
                    };
                }
                catch (err) {
                    // A failing task is recorded as wrong on both sides so one error
                    // does not abort the whole run; the message is kept for the report.
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect: false,
                        toolsCorrect: false,
                        baselineMs: 0,
                        toolsMs: 0,
                        toolCalls: 0,
                        error: err?.message ?? String(err),
                    };
                }
            }
        })());
        await Promise.all(workers);
        const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
        const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
        const baselinePassRate = (baselineCorrect / results.length) * 100;
        const toolsPassRate = (toolsCorrect / results.length) * 100;
        const avgBaseMs = results.reduce((sum, r) => sum + r.baselineMs, 0) / results.length;
        const avgToolsMs = results.reduce((sum, r) => sum + r.toolsMs, 0) / results.length;
        const avgToolCalls = results.reduce((sum, r) => sum + r.toolCalls, 0) / results.length;
        const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
        const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
        // Human-readable console output (no prompts/answers).
        console.log(`[gaia-capability-media] tasks=${results.length} baseline=${baselineCorrect}/${results.length} (${baselinePassRate.toFixed(1)}%) tools=${toolsCorrect}/${results.length} (${toolsPassRate.toFixed(1)}%) delta=${(toolsPassRate - baselinePassRate).toFixed(1)}% improved=${improved} regressions=${regressions} avgToolCalls=${avgToolCalls.toFixed(2)}`);
        const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
        const publicSummary = {
            suiteId: "gaia_capability_media",
            lane: "media",
            generatedAtIso: new Date().toISOString(),
            config: fixture.config,
            split: fixture.split,
            taskCount: results.length,
            concurrency,
            baseline: {
                model: baselineModelLabel,
                correct: baselineCorrect,
                passRatePct: baselinePassRate,
                avgMs: avgBaseMs,
            },
            tools: {
                model: toolsModelLabel,
                mode: toolsMode,
                correct: toolsCorrect,
                passRatePct: toolsPassRate,
                avgMs: avgToolsMs,
                avgToolCalls,
            },
            improved,
            regressions,
            notes: "GAIA media lane (image attachments). No prompts/answers persisted; only aggregate metrics are written to public/evals." +
                ((baselineLlm.provider === "none" || toolsLlm.provider === "none")
                    ? " NOTE: No LLM provider key detected in this runner; baseline/tools used deterministic-only fallback for unsupported tasks."
                    : ""),
        };
        if (shouldWriteReport) {
            const repoRoot = resolveRepoRoot();
            await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_media_latest.json"), publicSummary);
            // The detailed report adds per-task outcomes (ids, correctness, timings, errors)
            // and goes under .cache rather than public/evals.
            const detailed = {
                ...publicSummary,
                results: results.map((r) => ({
                    taskId: r.taskId,
                    baselineCorrect: r.baselineCorrect,
                    toolsCorrect: r.toolsCorrect,
                    baselineMs: Math.round(r.baselineMs),
                    toolsMs: Math.round(r.toolsMs),
                    toolCalls: r.toolCalls,
                    ...(r.error ? { error: r.error } : {}),
                })),
            };
            // ':' and '.' are replaced so the ISO timestamp is usable inside a file name.
            const stamp = new Date().toISOString().replace(/[:.]/g, "-");
            await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_media_${fixture.config}_${fixture.split}_${stamp}.json`), detailed);
        }
        // Minimal sanity: tools mode should not underperform baseline on this tiny sample.
        expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
    }, 900000);
});
1101
- //# sourceMappingURL=gaiaCapabilityMediaEval.test.js.map