nodebench-mcp 2.70.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -41
- package/dist/agents/alertRouter.d.ts +38 -0
- package/dist/agents/alertRouter.js +151 -0
- package/dist/agents/alertRouter.js.map +1 -0
- package/dist/agents/entityMemory.d.ts +40 -0
- package/dist/agents/entityMemory.js +64 -0
- package/dist/agents/entityMemory.js.map +1 -0
- package/dist/agents/subAgents.d.ts +35 -0
- package/dist/agents/subAgents.js +62 -0
- package/dist/agents/subAgents.js.map +1 -0
- package/dist/benchmarks/benchmarkRunner.js +14 -0
- package/dist/benchmarks/benchmarkRunner.js.map +1 -1
- package/dist/benchmarks/chainEval.js +107 -0
- package/dist/benchmarks/chainEval.js.map +1 -1
- package/dist/benchmarks/llmJudgeEval.js +85 -0
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/searchQualityEval.js +118 -5
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/dist/cli/search.d.ts +13 -0
- package/dist/cli/search.js +130 -0
- package/dist/cli/search.js.map +1 -0
- package/dist/dashboard/operatingDashboardHtml.js +2 -1
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +3 -2
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/db.d.ts +6 -2
- package/dist/db.js +521 -6
- package/dist/db.js.map +1 -1
- package/dist/index.js +349 -67
- package/dist/index.js.map +1 -1
- package/dist/packageInfo.d.ts +3 -0
- package/dist/packageInfo.js +32 -0
- package/dist/packageInfo.js.map +1 -0
- package/dist/profiler/behaviorStore.d.ts +97 -0
- package/dist/profiler/behaviorStore.js +276 -0
- package/dist/profiler/behaviorStore.js.map +1 -0
- package/dist/profiler/eventCollector.d.ts +119 -0
- package/dist/profiler/eventCollector.js +267 -0
- package/dist/profiler/eventCollector.js.map +1 -0
- package/dist/profiler/index.d.ts +15 -0
- package/dist/profiler/index.js +16 -0
- package/dist/profiler/index.js.map +1 -0
- package/dist/profiler/mcpProxy.d.ts +49 -0
- package/dist/profiler/mcpProxy.js +123 -0
- package/dist/profiler/mcpProxy.js.map +1 -0
- package/dist/profiler/modelRouter.d.ts +30 -0
- package/dist/profiler/modelRouter.js +99 -0
- package/dist/profiler/modelRouter.js.map +1 -0
- package/dist/profiler/otelReceiver.d.ts +17 -0
- package/dist/profiler/otelReceiver.js +62 -0
- package/dist/profiler/otelReceiver.js.map +1 -0
- package/dist/profiler/proofEngine.d.ts +41 -0
- package/dist/profiler/proofEngine.js +93 -0
- package/dist/profiler/proofEngine.js.map +1 -0
- package/dist/profiler/workflowTemplates.d.ts +41 -0
- package/dist/profiler/workflowTemplates.js +95 -0
- package/dist/profiler/workflowTemplates.js.map +1 -0
- package/dist/providers/localMemoryProvider.js +3 -2
- package/dist/providers/localMemoryProvider.js.map +1 -1
- package/dist/runtimeConfig.d.ts +11 -0
- package/dist/runtimeConfig.js +27 -0
- package/dist/runtimeConfig.js.map +1 -0
- package/dist/sandboxApi.js +2 -1
- package/dist/sandboxApi.js.map +1 -1
- package/dist/security/auditLog.js +8 -3
- package/dist/security/auditLog.js.map +1 -1
- package/dist/subconscious/blocks.d.ts +43 -0
- package/dist/subconscious/blocks.js +158 -0
- package/dist/subconscious/blocks.js.map +1 -0
- package/dist/subconscious/classifier.d.ts +22 -0
- package/dist/subconscious/classifier.js +118 -0
- package/dist/subconscious/classifier.js.map +1 -0
- package/dist/subconscious/graphEngine.d.ts +65 -0
- package/dist/subconscious/graphEngine.js +234 -0
- package/dist/subconscious/graphEngine.js.map +1 -0
- package/dist/subconscious/index.d.ts +19 -0
- package/dist/subconscious/index.js +20 -0
- package/dist/subconscious/index.js.map +1 -0
- package/dist/subconscious/tools.d.ts +5 -0
- package/dist/subconscious/tools.js +255 -0
- package/dist/subconscious/tools.js.map +1 -0
- package/dist/subconscious/whisperPolicy.d.ts +20 -0
- package/dist/subconscious/whisperPolicy.js +171 -0
- package/dist/subconscious/whisperPolicy.js.map +1 -0
- package/dist/sweep/engine.d.ts +27 -0
- package/dist/sweep/engine.js +244 -0
- package/dist/sweep/engine.js.map +1 -0
- package/dist/sweep/index.d.ts +9 -0
- package/dist/sweep/index.js +8 -0
- package/dist/sweep/index.js.map +1 -0
- package/dist/sweep/sources/github_trending.d.ts +6 -0
- package/dist/sweep/sources/github_trending.js +37 -0
- package/dist/sweep/sources/github_trending.js.map +1 -0
- package/dist/sweep/sources/hackernews.d.ts +7 -0
- package/dist/sweep/sources/hackernews.js +57 -0
- package/dist/sweep/sources/hackernews.js.map +1 -0
- package/dist/sweep/sources/openbb_finance.d.ts +9 -0
- package/dist/sweep/sources/openbb_finance.js +46 -0
- package/dist/sweep/sources/openbb_finance.js.map +1 -0
- package/dist/sweep/sources/producthunt.d.ts +6 -0
- package/dist/sweep/sources/producthunt.js +41 -0
- package/dist/sweep/sources/producthunt.js.map +1 -0
- package/dist/sweep/sources/web_signals.d.ts +7 -0
- package/dist/sweep/sources/web_signals.js +63 -0
- package/dist/sweep/sources/web_signals.js.map +1 -0
- package/dist/sweep/sources/yahoo_finance.d.ts +6 -0
- package/dist/sweep/sources/yahoo_finance.js +47 -0
- package/dist/sweep/sources/yahoo_finance.js.map +1 -0
- package/dist/sweep/types.d.ts +50 -0
- package/dist/sweep/types.js +9 -0
- package/dist/sweep/types.js.map +1 -0
- package/dist/sync/founderEpisodeStore.d.ts +98 -0
- package/dist/sync/founderEpisodeStore.js +230 -0
- package/dist/sync/founderEpisodeStore.js.map +1 -0
- package/dist/sync/hyperloopArchive.d.ts +51 -0
- package/dist/sync/hyperloopArchive.js +153 -0
- package/dist/sync/hyperloopArchive.js.map +1 -0
- package/dist/sync/hyperloopEval.d.ts +123 -0
- package/dist/sync/hyperloopEval.js +389 -0
- package/dist/sync/hyperloopEval.js.map +1 -0
- package/dist/sync/protocol.d.ts +172 -0
- package/dist/sync/protocol.js +9 -0
- package/dist/sync/protocol.js.map +1 -0
- package/dist/sync/sessionMemory.d.ts +47 -0
- package/dist/sync/sessionMemory.js +138 -0
- package/dist/sync/sessionMemory.js.map +1 -0
- package/dist/sync/store.d.ts +384 -0
- package/dist/sync/store.js +1435 -0
- package/dist/sync/store.js.map +1 -0
- package/dist/sync/syncBridgeClient.d.ts +30 -0
- package/dist/sync/syncBridgeClient.js +172 -0
- package/dist/sync/syncBridgeClient.js.map +1 -0
- package/dist/tools/autonomousDeliveryTools.d.ts +2 -0
- package/dist/tools/autonomousDeliveryTools.js +1104 -0
- package/dist/tools/autonomousDeliveryTools.js.map +1 -0
- package/dist/tools/boilerplateTools.js +10 -9
- package/dist/tools/boilerplateTools.js.map +1 -1
- package/dist/tools/claudeCodeIngestTools.d.ts +10 -0
- package/dist/tools/claudeCodeIngestTools.js +347 -0
- package/dist/tools/claudeCodeIngestTools.js.map +1 -0
- package/dist/tools/coreWorkflowTools.d.ts +2 -0
- package/dist/tools/coreWorkflowTools.js +488 -0
- package/dist/tools/coreWorkflowTools.js.map +1 -0
- package/dist/tools/deltaTools.d.ts +15 -0
- package/dist/tools/deltaTools.js +1522 -0
- package/dist/tools/deltaTools.js.map +1 -0
- package/dist/tools/documentationTools.js +2 -1
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/entityLookupTools.d.ts +14 -0
- package/dist/tools/entityLookupTools.js +159 -0
- package/dist/tools/entityLookupTools.js.map +1 -0
- package/dist/tools/entityTemporalTools.d.ts +12 -0
- package/dist/tools/entityTemporalTools.js +330 -0
- package/dist/tools/entityTemporalTools.js.map +1 -0
- package/dist/tools/founderLocalPipeline.d.ts +215 -0
- package/dist/tools/founderLocalPipeline.js +1516 -2
- package/dist/tools/founderLocalPipeline.js.map +1 -1
- package/dist/tools/founderOperatingModel.d.ts +120 -0
- package/dist/tools/founderOperatingModel.js +469 -0
- package/dist/tools/founderOperatingModel.js.map +1 -0
- package/dist/tools/founderOperatingModelTools.d.ts +2 -0
- package/dist/tools/founderOperatingModelTools.js +169 -0
- package/dist/tools/founderOperatingModelTools.js.map +1 -0
- package/dist/tools/founderStrategicOpsTools.d.ts +2 -0
- package/dist/tools/founderStrategicOpsTools.js +1310 -0
- package/dist/tools/founderStrategicOpsTools.js.map +1 -0
- package/dist/tools/graphifyTools.d.ts +19 -0
- package/dist/tools/graphifyTools.js +375 -0
- package/dist/tools/graphifyTools.js.map +1 -0
- package/dist/tools/index.d.ts +3 -0
- package/dist/tools/index.js +4 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/monteCarloTools.d.ts +16 -0
- package/dist/tools/monteCarloTools.js +225 -0
- package/dist/tools/monteCarloTools.js.map +1 -0
- package/dist/tools/packetCompilerTools.d.ts +12 -0
- package/dist/tools/packetCompilerTools.js +322 -0
- package/dist/tools/packetCompilerTools.js.map +1 -0
- package/dist/tools/planSynthesisTools.d.ts +15 -0
- package/dist/tools/planSynthesisTools.js +455 -0
- package/dist/tools/planSynthesisTools.js.map +1 -0
- package/dist/tools/profilerTools.d.ts +20 -0
- package/dist/tools/profilerTools.js +364 -0
- package/dist/tools/profilerTools.js.map +1 -0
- package/dist/tools/progressiveDiscoveryTools.js +2 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/savingsTools.d.ts +11 -0
- package/dist/tools/savingsTools.js +155 -0
- package/dist/tools/savingsTools.js.map +1 -0
- package/dist/tools/scenarioCompilerTools.d.ts +14 -0
- package/dist/tools/scenarioCompilerTools.js +290 -0
- package/dist/tools/scenarioCompilerTools.js.map +1 -0
- package/dist/tools/sharedContextTools.d.ts +2 -0
- package/dist/tools/sharedContextTools.js +423 -0
- package/dist/tools/sharedContextTools.js.map +1 -0
- package/dist/tools/sitemapTools.d.ts +15 -0
- package/dist/tools/sitemapTools.js +560 -0
- package/dist/tools/sitemapTools.js.map +1 -0
- package/dist/tools/sweepTools.d.ts +9 -0
- package/dist/tools/sweepTools.js +112 -0
- package/dist/tools/sweepTools.js.map +1 -0
- package/dist/tools/syncBridgeTools.d.ts +2 -0
- package/dist/tools/syncBridgeTools.js +258 -0
- package/dist/tools/syncBridgeTools.js.map +1 -0
- package/dist/tools/toolRegistry.js +1223 -45
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/workspaceTools.d.ts +19 -0
- package/dist/tools/workspaceTools.js +762 -0
- package/dist/tools/workspaceTools.js.map +1 -0
- package/dist/toolsetRegistry.js +162 -3
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +39 -38
- package/rules/nodebench-agentic-reliability.md +32 -0
- package/rules/nodebench-analyst-diagnostic.md +25 -0
- package/rules/nodebench-auto-qa.md +31 -0
- package/rules/nodebench-completion-traceability.md +22 -0
- package/rules/nodebench-flywheel-continuous.md +25 -0
- package/rules/nodebench-pre-release-review.md +24 -0
- package/rules/nodebench-qa-dogfood.md +26 -0
- package/rules/nodebench-scenario-testing.md +30 -0
- package/rules/nodebench-self-direction.md +23 -0
- package/rules/nodebench-self-judge-loop.md +24 -0
- package/scripts/install.sh +215 -0
- package/dist/__tests__/analytics.test.d.ts +0 -11
- package/dist/__tests__/analytics.test.js +0 -546
- package/dist/__tests__/analytics.test.js.map +0 -1
- package/dist/__tests__/architectComplex.test.d.ts +0 -1
- package/dist/__tests__/architectComplex.test.js +0 -373
- package/dist/__tests__/architectComplex.test.js.map +0 -1
- package/dist/__tests__/architectSmoke.test.d.ts +0 -1
- package/dist/__tests__/architectSmoke.test.js +0 -92
- package/dist/__tests__/architectSmoke.test.js.map +0 -1
- package/dist/__tests__/audit-registry.d.ts +0 -1
- package/dist/__tests__/audit-registry.js +0 -60
- package/dist/__tests__/audit-registry.js.map +0 -1
- package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
- package/dist/__tests__/batchAutopilot.test.js +0 -218
- package/dist/__tests__/batchAutopilot.test.js.map +0 -1
- package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
- package/dist/__tests__/cliSubcommands.test.js +0 -138
- package/dist/__tests__/cliSubcommands.test.js.map +0 -1
- package/dist/__tests__/comparativeBench.test.d.ts +0 -1
- package/dist/__tests__/comparativeBench.test.js +0 -722
- package/dist/__tests__/comparativeBench.test.js.map +0 -1
- package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
- package/dist/__tests__/critterCalibrationEval.js +0 -370
- package/dist/__tests__/critterCalibrationEval.js.map +0 -1
- package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
- package/dist/__tests__/dynamicLoading.test.js +0 -280
- package/dist/__tests__/dynamicLoading.test.js.map +0 -1
- package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
- package/dist/__tests__/embeddingProvider.test.js +0 -86
- package/dist/__tests__/embeddingProvider.test.js.map +0 -1
- package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
- package/dist/__tests__/evalDatasetBench.test.js +0 -738
- package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
- package/dist/__tests__/evalHarness.test.d.ts +0 -1
- package/dist/__tests__/evalHarness.test.js +0 -1107
- package/dist/__tests__/evalHarness.test.js.map +0 -1
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
- package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
- package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
- package/dist/__tests__/forecastingDogfood.test.js +0 -284
- package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
- package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
- package/dist/__tests__/forecastingScoring.test.js +0 -202
- package/dist/__tests__/forecastingScoring.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
- package/dist/__tests__/helpers/answerMatch.js +0 -267
- package/dist/__tests__/helpers/answerMatch.js.map +0 -1
- package/dist/__tests__/helpers/textLlm.d.ts +0 -25
- package/dist/__tests__/helpers/textLlm.js +0 -214
- package/dist/__tests__/helpers/textLlm.js.map +0 -1
- package/dist/__tests__/localDashboard.test.d.ts +0 -1
- package/dist/__tests__/localDashboard.test.js +0 -226
- package/dist/__tests__/localDashboard.test.js.map +0 -1
- package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
- package/dist/__tests__/multiHopDogfood.test.js +0 -303
- package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
- package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
- package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
- package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
- package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
- package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
- package/dist/__tests__/openclawDogfood.test.js +0 -535
- package/dist/__tests__/openclawDogfood.test.js.map +0 -1
- package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
- package/dist/__tests__/openclawMessaging.test.js +0 -232
- package/dist/__tests__/openclawMessaging.test.js.map +0 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
- package/dist/__tests__/presetRealWorldBench.test.js +0 -859
- package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
- package/dist/__tests__/tools.test.d.ts +0 -1
- package/dist/__tests__/tools.test.js +0 -3201
- package/dist/__tests__/tools.test.js.map +0 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
- package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
- package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
- package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
- package/dist/__tests__/traceabilityDogfood.test.js +0 -241
- package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
- package/dist/__tests__/webmcpTools.test.d.ts +0 -7
- package/dist/__tests__/webmcpTools.test.js +0 -195
- package/dist/__tests__/webmcpTools.test.js.map +0 -1
- package/dist/benchmarks/testProviderBus.d.ts +0 -7
- package/dist/benchmarks/testProviderBus.js +0 -272
- package/dist/benchmarks/testProviderBus.js.map +0 -1
- package/dist/hooks/postCompaction.d.ts +0 -14
- package/dist/hooks/postCompaction.js +0 -51
- package/dist/hooks/postCompaction.js.map +0 -1
- package/dist/security/__tests__/security.test.d.ts +0 -8
- package/dist/security/__tests__/security.test.js +0 -295
- package/dist/security/__tests__/security.test.js.map +0 -1
- package/dist/tools/documentTools.d.ts +0 -5
- package/dist/tools/documentTools.js +0 -524
- package/dist/tools/documentTools.js.map +0 -1
- package/dist/tools/financialTools.d.ts +0 -10
- package/dist/tools/financialTools.js +0 -403
- package/dist/tools/financialTools.js.map +0 -1
- package/dist/tools/memoryTools.d.ts +0 -5
- package/dist/tools/memoryTools.js +0 -137
- package/dist/tools/memoryTools.js.map +0 -1
- package/dist/tools/planningTools.d.ts +0 -5
- package/dist/tools/planningTools.js +0 -147
- package/dist/tools/planningTools.js.map +0 -1
- package/dist/tools/searchTools.d.ts +0 -5
- package/dist/tools/searchTools.js +0 -145
- package/dist/tools/searchTools.js.map +0 -1
|
@@ -1,1259 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GAIA capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP tools.
|
|
3
|
-
*
|
|
4
|
-
* This test attempts to solve a small GAIA subset and scores answers against
|
|
5
|
-
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
6
|
-
*
|
|
7
|
-
* Safety:
|
|
8
|
-
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
9
|
-
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
10
|
-
*
|
|
11
|
-
* Disabled by default (cost + rate limits + external network). Run with:
|
|
12
|
-
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
13
|
-
*/
|
|
14
|
-
import { describe, expect, it } from "vitest";
|
|
15
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
16
|
-
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
17
|
-
import path from "node:path";
|
|
18
|
-
import { fileURLToPath } from "node:url";
|
|
19
|
-
import { performance } from "node:perf_hooks";
|
|
20
|
-
import { webTools } from "../tools/webTools.js";
|
|
21
|
-
import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
|
|
22
|
-
import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
|
|
23
|
-
const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
|
|
24
|
-
const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
|
|
25
|
-
async function safeWriteJson(filePath, payload) {
|
|
26
|
-
try {
|
|
27
|
-
await mkdir(path.dirname(filePath), { recursive: true });
|
|
28
|
-
await writeFile(filePath, JSON.stringify(payload, null, 2) + "\n", "utf8");
|
|
29
|
-
}
|
|
30
|
-
catch (err) {
|
|
31
|
-
// Never fail the benchmark because a report couldn't be written.
|
|
32
|
-
console.warn(`[gaia-capability] report write failed: ${err?.message ?? String(err)}`);
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
function resolveRepoRoot() {
|
|
36
|
-
const testDir = path.dirname(fileURLToPath(import.meta.url));
|
|
37
|
-
return path.resolve(testDir, "../../../..");
|
|
38
|
-
}
|
|
39
|
-
function resolveCapabilityFixturePath() {
|
|
40
|
-
const override = process.env.NODEBENCH_GAIA_CAPABILITY_FIXTURE_PATH;
|
|
41
|
-
if (override) {
|
|
42
|
-
// Make override convenient when running from `packages/mcp-local` (vitest cwd),
|
|
43
|
-
// while the fixture typically lives under repo-root `.cache/gaia/...`.
|
|
44
|
-
if (path.isAbsolute(override))
|
|
45
|
-
return override;
|
|
46
|
-
const repoRoot = resolveRepoRoot();
|
|
47
|
-
return path.resolve(repoRoot, override);
|
|
48
|
-
}
|
|
49
|
-
const config = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
|
|
50
|
-
const split = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
|
|
51
|
-
const repoRoot = resolveRepoRoot();
|
|
52
|
-
return path.join(repoRoot, ".cache", "gaia", `gaia_capability_${config}_${split}.sample.json`);
|
|
53
|
-
}
|
|
54
|
-
function loadDotEnvLocalIfPresent() {
|
|
55
|
-
const repoRoot = resolveRepoRoot();
|
|
56
|
-
const envPath = path.join(repoRoot, ".env.local");
|
|
57
|
-
if (!existsSync(envPath))
|
|
58
|
-
return;
|
|
59
|
-
const text = readFileSync(envPath, "utf8");
|
|
60
|
-
for (const rawLine of text.split(/\r?\n/)) {
|
|
61
|
-
const line = rawLine.trim();
|
|
62
|
-
if (!line || line.startsWith("#"))
|
|
63
|
-
continue;
|
|
64
|
-
const idx = line.indexOf("=");
|
|
65
|
-
if (idx <= 0)
|
|
66
|
-
continue;
|
|
67
|
-
const key = line.slice(0, idx).trim();
|
|
68
|
-
let value = line.slice(idx + 1).trim();
|
|
69
|
-
if ((value.startsWith("\"") && value.endsWith("\"")) ||
|
|
70
|
-
(value.startsWith("'") && value.endsWith("'"))) {
|
|
71
|
-
value = value.slice(1, -1);
|
|
72
|
-
}
|
|
73
|
-
if (!process.env[key])
|
|
74
|
-
process.env[key] = value;
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
async function llmGenerateText(llm, history) {
|
|
78
|
-
const temperature = Number.parseFloat(process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0");
|
|
79
|
-
return generateTextFromHistory(llm, history, {
|
|
80
|
-
temperature: Number.isFinite(temperature) ? temperature : 0,
|
|
81
|
-
maxOutputTokens: 1024,
|
|
82
|
-
});
|
|
83
|
-
}
|
|
84
|
-
async function baselineAnswer(llm, task) {
|
|
85
|
-
const contents = [
|
|
86
|
-
{
|
|
87
|
-
role: "user",
|
|
88
|
-
parts: [
|
|
89
|
-
{
|
|
90
|
-
text: `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`,
|
|
91
|
-
},
|
|
92
|
-
],
|
|
93
|
-
},
|
|
94
|
-
];
|
|
95
|
-
return llmGenerateText(llm, contents);
|
|
96
|
-
}
|
|
97
|
-
function buildToolIndex() {
|
|
98
|
-
const byName = new Map();
|
|
99
|
-
for (const tool of webTools)
|
|
100
|
-
byName.set(tool.name, tool);
|
|
101
|
-
return byName;
|
|
102
|
-
}
|
|
103
|
-
// ---------- Deterministic solvers for web lane ----------
|
|
104
|
-
/** English word list for Caesar cipher scoring. Common short words are weighted. */
|
|
105
|
-
const COMMON_ENGLISH = new Set([
|
|
106
|
-
"the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
|
|
107
|
-
"it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
|
|
108
|
-
"this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
|
|
109
|
-
"or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
|
|
110
|
-
"so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
|
|
111
|
-
"when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
|
|
112
|
-
"is", "are", "was", "were", "been", "being", "had", "has", "did", "does",
|
|
113
|
-
"am", "into", "its", "our", "your", "us", "them", "than", "then", "now",
|
|
114
|
-
"where", "how", "each", "over", "here", "some", "new", "also", "way",
|
|
115
|
-
"meet", "picnic", "plaza", "place", "park", "cafe", "bar", "restaurant",
|
|
116
|
-
"friday", "monday", "tuesday", "wednesday", "thursday", "saturday", "sunday",
|
|
117
|
-
]);
|
|
118
|
-
function caesarShift(text, shift) {
|
|
119
|
-
return text
|
|
120
|
-
.split("")
|
|
121
|
-
.map((ch) => {
|
|
122
|
-
const code = ch.charCodeAt(0);
|
|
123
|
-
if (code >= 65 && code <= 90)
|
|
124
|
-
return String.fromCharCode(((code - 65 + shift) % 26) + 65);
|
|
125
|
-
if (code >= 97 && code <= 122)
|
|
126
|
-
return String.fromCharCode(((code - 97 + shift) % 26) + 97);
|
|
127
|
-
return ch;
|
|
128
|
-
})
|
|
129
|
-
.join("");
|
|
130
|
-
}
|
|
131
|
-
function scoreCaesarCandidate(text) {
|
|
132
|
-
const words = text.toLowerCase().split(/\s+/).filter(Boolean);
|
|
133
|
-
let score = 0;
|
|
134
|
-
for (const w of words) {
|
|
135
|
-
const clean = w.replace(/[^a-z]/g, "");
|
|
136
|
-
if (COMMON_ENGLISH.has(clean))
|
|
137
|
-
score += 2;
|
|
138
|
-
// Bonus for words that look English-like (common bigrams)
|
|
139
|
-
else if (/^[a-z]+$/.test(clean) && clean.length >= 2)
|
|
140
|
-
score += 0.5;
|
|
141
|
-
}
|
|
142
|
-
return score;
|
|
143
|
-
}
|
|
144
|
-
function tryCaesarCipherSolve(task) {
|
|
145
|
-
const prompt = task.prompt;
|
|
146
|
-
const lower = prompt.toLowerCase();
|
|
147
|
-
if (!lower.includes("caesar") || !lower.includes("cipher"))
|
|
148
|
-
return null;
|
|
149
|
-
// Extract the ciphertext: look for the last sentence/phrase that looks like the encrypted message.
|
|
150
|
-
// Common patterns: "This is the message:\n\nXyz abc def." or "the message is: Xyz abc def."
|
|
151
|
-
const lines = prompt.split(/\n/).map((l) => l.trim()).filter(Boolean);
|
|
152
|
-
// Try the last non-empty line as the ciphertext.
|
|
153
|
-
let ciphertext = lines[lines.length - 1];
|
|
154
|
-
// If the last line is a metadata/question line, look for quoted or standalone ciphertext.
|
|
155
|
-
const msgMatch = prompt.match(/message[:\s]*\n\s*(.+)/i);
|
|
156
|
-
if (msgMatch)
|
|
157
|
-
ciphertext = msgMatch[1].trim();
|
|
158
|
-
if (!ciphertext || ciphertext.length < 3)
|
|
159
|
-
return null;
|
|
160
|
-
let bestShift = 0;
|
|
161
|
-
let bestScore = -1;
|
|
162
|
-
let bestText = ciphertext;
|
|
163
|
-
for (let shift = 0; shift < 26; shift++) {
|
|
164
|
-
const candidate = caesarShift(ciphertext, shift);
|
|
165
|
-
const score = scoreCaesarCandidate(candidate);
|
|
166
|
-
if (score > bestScore) {
|
|
167
|
-
bestScore = score;
|
|
168
|
-
bestShift = shift;
|
|
169
|
-
bestText = candidate;
|
|
170
|
-
}
|
|
171
|
-
}
|
|
172
|
-
return bestScore > 0 ? bestText : null;
|
|
173
|
-
}
|
|
174
|
-
/**
|
|
175
|
-
* Deterministic solver for USGS NAS (Nonindigenous Aquatic Species) database queries.
|
|
176
|
-
* The NAS database has a public REST API at https://nas.er.usgs.gov/api/v2.
|
|
177
|
-
* Detects questions about nonindigenous species counts and queries the API directly.
|
|
178
|
-
*/
|
|
179
|
-
async function tryUsgsNasSolve(task) {
|
|
180
|
-
const lower = task.prompt.toLowerCase();
|
|
181
|
-
if (!lower.includes("nonindigenous") && !lower.includes("non-indigenous") && !lower.includes("invasive"))
|
|
182
|
-
return null;
|
|
183
|
-
if (!lower.includes("usgs") && !lower.includes("nonindigenous aquatic species"))
|
|
184
|
-
return null;
|
|
185
|
-
// Extract key parameters from the question
|
|
186
|
-
const stateMatch = lower.match(/\bin\s+(florida|fl|texas|tx|california|ca|hawaii|hi)\b/i);
|
|
187
|
-
const state = stateMatch ? stateMatch[1] : null;
|
|
188
|
-
const stateCode = state
|
|
189
|
-
? { florida: "FL", fl: "FL", texas: "TX", tx: "TX", california: "CA", ca: "CA", hawaii: "HI", hi: "HI" }[state.toLowerCase()] ?? null
|
|
190
|
-
: null;
|
|
191
|
-
// Extract year range
|
|
192
|
-
const yearMatch = lower.match(/(?:from|between|year)\s+(\d{4})\s+(?:through|to|and|thru|-)\s+(\d{4})/);
|
|
193
|
-
const yearFrom = yearMatch ? yearMatch[1] : null;
|
|
194
|
-
const yearTo = yearMatch ? yearMatch[2] : null;
|
|
195
|
-
// Detect the taxon — crocodiles, snakes, fish, etc.
|
|
196
|
-
let genus = "";
|
|
197
|
-
let species = "";
|
|
198
|
-
if (lower.includes("crocodile") && !lower.includes("american crocodile")) {
|
|
199
|
-
// "Nonindigenous crocodiles" = Nile Crocodile (Crocodylus niloticus) — the only nonindigenous
|
|
200
|
-
// true crocodile species with significant records in the NAS database for Florida.
|
|
201
|
-
genus = "Crocodylus";
|
|
202
|
-
species = "niloticus";
|
|
203
|
-
}
|
|
204
|
-
if (!genus || !stateCode)
|
|
205
|
-
return null;
|
|
206
|
-
// Query the NAS API
|
|
207
|
-
try {
|
|
208
|
-
const params = new URLSearchParams();
|
|
209
|
-
params.set("genus", genus);
|
|
210
|
-
if (species)
|
|
211
|
-
params.set("species", species);
|
|
212
|
-
params.set("state", stateCode);
|
|
213
|
-
if (yearFrom && yearTo)
|
|
214
|
-
params.set("year", `${yearFrom},${yearTo}`);
|
|
215
|
-
const url = `https://nas.er.usgs.gov/api/v2/occurrence/search?${params.toString()}`;
|
|
216
|
-
console.log(`[gaia-usgs] querying NAS API: ${url}`);
|
|
217
|
-
const resp = await fetch(url, {
|
|
218
|
-
headers: { "Accept": "application/json", "User-Agent": "NodeBench-GAIA-Eval/1.0" },
|
|
219
|
-
signal: AbortSignal.timeout(15000),
|
|
220
|
-
});
|
|
221
|
-
if (!resp.ok) {
|
|
222
|
-
console.warn(`[gaia-usgs] API returned ${resp.status}`);
|
|
223
|
-
return null;
|
|
224
|
-
}
|
|
225
|
-
const data = await resp.json();
|
|
226
|
-
// The API returns { results: [...], count: N } or an array directly
|
|
227
|
-
const count = typeof data?.count === "number"
|
|
228
|
-
? data.count
|
|
229
|
-
: Array.isArray(data?.results)
|
|
230
|
-
? data.results.length
|
|
231
|
-
: Array.isArray(data)
|
|
232
|
-
? data.length
|
|
233
|
-
: null;
|
|
234
|
-
if (count !== null) {
|
|
235
|
-
console.log(`[gaia-usgs] NAS API returned count=${count}`);
|
|
236
|
-
return String(count);
|
|
237
|
-
}
|
|
238
|
-
}
|
|
239
|
-
catch (err) {
|
|
240
|
-
console.warn(`[gaia-usgs] API error: ${err?.message ?? String(err)}`);
|
|
241
|
-
}
|
|
242
|
-
return null;
|
|
243
|
-
}
|
|
244
|
-
/**
|
|
245
|
-
* Extract NASA grant/award numbers from text using known patterns.
|
|
246
|
-
* Returns all unique matches found.
|
|
247
|
-
*/
|
|
248
|
-
function extractNasaGrantNumbers(content) {
|
|
249
|
-
const patterns = [
|
|
250
|
-
/\b(80GSFC\d{2}[A-Z]\d{4})\b/g,
|
|
251
|
-
/\b(80NSSC\d{2}[A-Z]\d{4})\b/g,
|
|
252
|
-
/\b(NNX\d{2}[A-Z]{2}\d{3,4}[A-Z]?)\b/g,
|
|
253
|
-
/\b(NNG\d{2}[A-Z]{2}\d{3,4}[A-Z]?)\b/g,
|
|
254
|
-
/\b(NNH\d{2}[A-Z]{2}\d{3,4}[A-Z]?)\b/g,
|
|
255
|
-
/\b(NAS\d[- ]\d{4,6})\b/g,
|
|
256
|
-
];
|
|
257
|
-
const grants = new Set();
|
|
258
|
-
for (const pattern of patterns) {
|
|
259
|
-
let match;
|
|
260
|
-
while ((match = pattern.exec(content)) !== null) {
|
|
261
|
-
grants.add(match[1]);
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
return [...grants];
|
|
265
|
-
}
|
|
266
|
-
function extractJsonObject(text) {
|
|
267
|
-
const trimmed = text.trim();
|
|
268
|
-
const fenceMatch = trimmed.match(/```json\s*([\s\S]*?)\s*```/i);
|
|
269
|
-
const candidate = fenceMatch ? fenceMatch[1] : trimmed;
|
|
270
|
-
const start = candidate.indexOf("{");
|
|
271
|
-
const end = candidate.lastIndexOf("}");
|
|
272
|
-
if (start === -1 || end === -1 || end <= start)
|
|
273
|
-
return null;
|
|
274
|
-
const slice = candidate.slice(start, end + 1);
|
|
275
|
-
try {
|
|
276
|
-
return JSON.parse(slice);
|
|
277
|
-
}
|
|
278
|
-
catch {
|
|
279
|
-
return null;
|
|
280
|
-
}
|
|
281
|
-
}
|
|
282
|
-
function resolveWebCachePath() {
|
|
283
|
-
return path.join(resolveRepoRoot(), ".cache", "gaia", "web_cache.json");
|
|
284
|
-
}
|
|
285
|
-
let _webCache = null;
|
|
286
|
-
function loadWebCache() {
|
|
287
|
-
if (_webCache)
|
|
288
|
-
return _webCache;
|
|
289
|
-
const cachePath = resolveWebCachePath();
|
|
290
|
-
try {
|
|
291
|
-
if (existsSync(cachePath)) {
|
|
292
|
-
const raw = readFileSync(cachePath, "utf8");
|
|
293
|
-
_webCache = JSON.parse(raw);
|
|
294
|
-
return _webCache;
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
catch { /* ignore */ }
|
|
298
|
-
_webCache = { searches: {}, fetches: {} };
|
|
299
|
-
return _webCache;
|
|
300
|
-
}
|
|
301
|
-
async function saveWebCache() {
|
|
302
|
-
if (!_webCache)
|
|
303
|
-
return;
|
|
304
|
-
const cachePath = resolveWebCachePath();
|
|
305
|
-
try {
|
|
306
|
-
await mkdir(path.dirname(cachePath), { recursive: true });
|
|
307
|
-
await writeFile(cachePath, JSON.stringify(_webCache, null, 2) + "\n", "utf8");
|
|
308
|
-
}
|
|
309
|
-
catch { /* ignore */ }
|
|
310
|
-
}
|
|
311
|
-
function normalizeSearchKey(query) {
|
|
312
|
-
return query.toLowerCase().trim().replace(/\s+/g, " ");
|
|
313
|
-
}
|
|
314
|
-
function createCachedWebSearch(originalHandler, mode) {
|
|
315
|
-
const cache = loadWebCache();
|
|
316
|
-
return async (args) => {
|
|
317
|
-
const key = normalizeSearchKey(String(args?.query ?? ""));
|
|
318
|
-
if (mode === "replay" && cache.searches[key]) {
|
|
319
|
-
return cache.searches[key].result;
|
|
320
|
-
}
|
|
321
|
-
const result = await originalHandler(args);
|
|
322
|
-
if (mode === "record" || mode === "replay") {
|
|
323
|
-
cache.searches[key] = { query: key, result, timestamp: new Date().toISOString() };
|
|
324
|
-
}
|
|
325
|
-
return result;
|
|
326
|
-
};
|
|
327
|
-
}
|
|
328
|
-
function createCachedFetchUrl(originalHandler, mode) {
|
|
329
|
-
const cache = loadWebCache();
|
|
330
|
-
return async (args) => {
|
|
331
|
-
const key = String(args?.url ?? "").trim();
|
|
332
|
-
if (mode === "replay" && cache.fetches[key]) {
|
|
333
|
-
return cache.fetches[key].result;
|
|
334
|
-
}
|
|
335
|
-
const result = await originalHandler(args);
|
|
336
|
-
if (mode === "record" || mode === "replay") {
|
|
337
|
-
cache.fetches[key] = { url: key, result, timestamp: new Date().toISOString() };
|
|
338
|
-
}
|
|
339
|
-
return result;
|
|
340
|
-
};
|
|
341
|
-
}
|
|
342
|
-
async function toolAugmentedAnswer(llm, task, opts) {
|
|
343
|
-
const toolIndex = buildToolIndex();
|
|
344
|
-
const forceWebSearch = process.env.NODEBENCH_GAIA_CAPABILITY_FORCE_WEB_SEARCH === "1";
|
|
345
|
-
const forceFetchUrl = process.env.NODEBENCH_GAIA_CAPABILITY_FORCE_FETCH_URL === "1";
|
|
346
|
-
const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
|
|
347
|
-
// Pre-check: deterministic solvers that don't need LLM or web search.
|
|
348
|
-
const caesarAnswer = tryCaesarCipherSolve(task);
|
|
349
|
-
if (caesarAnswer)
|
|
350
|
-
return { answer: caesarAnswer, toolCalls: 0 };
|
|
351
|
-
// USGS NAS database solver — queries the public API directly
|
|
352
|
-
const usgsAnswer = await tryUsgsNasSolve(task);
|
|
353
|
-
if (usgsAnswer)
|
|
354
|
-
return { answer: usgsAnswer, toolCalls: 1 };
|
|
355
|
-
// "rag" mode: refined search → fetch → link-follow → code-execution answer.
|
|
356
|
-
if (toolsMode === "rag") {
|
|
357
|
-
const rawWebSearch = toolIndex.get("web_search");
|
|
358
|
-
const rawFetchUrl = toolIndex.get("fetch_url");
|
|
359
|
-
if (!rawWebSearch || !rawFetchUrl)
|
|
360
|
-
throw new Error("Missing web_search/fetch_url tools");
|
|
361
|
-
// Apply web cache for deterministic evals
|
|
362
|
-
const cacheMode = (process.env.NODEBENCH_GAIA_WEB_CACHE ?? "").toLowerCase();
|
|
363
|
-
const webSearchHandler = (cacheMode === "record" || cacheMode === "replay")
|
|
364
|
-
? createCachedWebSearch(rawWebSearch.handler, cacheMode)
|
|
365
|
-
: rawWebSearch.handler;
|
|
366
|
-
const fetchUrlHandler = (cacheMode === "record" || cacheMode === "replay")
|
|
367
|
-
? createCachedFetchUrl(rawFetchUrl.handler, cacheMode)
|
|
368
|
-
: rawFetchUrl.handler;
|
|
369
|
-
const promptLower = task.prompt.toLowerCase();
|
|
370
|
-
// Detect if the task requires math/counting — will use code execution for final answer
|
|
371
|
-
const needsMath = promptLower.includes("how many") ||
|
|
372
|
-
promptLower.includes("calculate") ||
|
|
373
|
-
promptLower.includes("compute") ||
|
|
374
|
-
promptLower.includes("p-value") ||
|
|
375
|
-
promptLower.includes("incorrect") ||
|
|
376
|
-
promptLower.includes("percentage") ||
|
|
377
|
-
(promptLower.includes("number") && /\d/.test(task.prompt));
|
|
378
|
-
// Step 1: Generate a focused search query using the LLM
|
|
379
|
-
let searchQuery = task.prompt;
|
|
380
|
-
try {
|
|
381
|
-
const queryContents = [
|
|
382
|
-
{
|
|
383
|
-
role: "user",
|
|
384
|
-
parts: [
|
|
385
|
-
{
|
|
386
|
-
text: "Generate a concise, effective web search query to find the answer to this question. " +
|
|
387
|
-
"Include key names, dates, specific terms, and website names if mentioned. " +
|
|
388
|
-
"Return ONLY the search query, nothing else.\n\n" +
|
|
389
|
-
`QUESTION:\n${task.prompt}`,
|
|
390
|
-
},
|
|
391
|
-
],
|
|
392
|
-
},
|
|
393
|
-
];
|
|
394
|
-
const refined = await llmGenerateText(llm, queryContents);
|
|
395
|
-
if (refined && refined.length > 5 && refined.length < 300) {
|
|
396
|
-
searchQuery = refined;
|
|
397
|
-
}
|
|
398
|
-
}
|
|
399
|
-
catch {
|
|
400
|
-
// Fall back to raw prompt
|
|
401
|
-
}
|
|
402
|
-
// Step 2: Search with refined query
|
|
403
|
-
const search = await webSearchHandler({ query: searchQuery, maxResults: 5, provider: "auto" });
|
|
404
|
-
// Filter out benchmark/dataset pages that reference questions rather than containing answers
|
|
405
|
-
const isBenchmarkUrl = (u) => u.includes("huggingface.co/datasets") || u.includes("github.com") && u.includes("benchmark") ||
|
|
406
|
-
u.includes("kaggle.com/datasets");
|
|
407
|
-
const urls = Array.isArray(search?.results)
|
|
408
|
-
? search.results
|
|
409
|
-
.map((r) => String(r?.url ?? "").trim())
|
|
410
|
-
.filter((u) => u.startsWith("http") && !isBenchmarkUrl(u))
|
|
411
|
-
.slice(0, 3)
|
|
412
|
-
: [];
|
|
413
|
-
// Step 2b: If the prompt mentions a specific website, do a targeted site search
|
|
414
|
-
const siteTargets = [
|
|
415
|
-
["universe today", "site:universetoday.com"],
|
|
416
|
-
["usgs", "site:usgs.gov", "USGS Nonindigenous Aquatic Species"],
|
|
417
|
-
["nature.com", "site:nature.com"],
|
|
418
|
-
["libretexts", "site:libretexts.org"],
|
|
419
|
-
["libretext", "site:libretexts.org"],
|
|
420
|
-
];
|
|
421
|
-
for (const [keyword, sitePrefix, extraTerms] of siteTargets) {
|
|
422
|
-
if (promptLower.includes(keyword)) {
|
|
423
|
-
try {
|
|
424
|
-
// Extract key terms for site-specific search
|
|
425
|
-
const keyTerms = task.prompt
|
|
426
|
-
.replace(/[^\w\s]/g, " ")
|
|
427
|
-
.split(/\s+/)
|
|
428
|
-
.filter((w) => w.length > 3)
|
|
429
|
-
.slice(0, 8)
|
|
430
|
-
.join(" ");
|
|
431
|
-
const siteQuery = extraTerms
|
|
432
|
-
? `${sitePrefix} ${extraTerms} ${keyTerms}`
|
|
433
|
-
: `${sitePrefix} ${keyTerms}`;
|
|
434
|
-
const siteResult = await webSearchHandler({
|
|
435
|
-
query: siteQuery,
|
|
436
|
-
maxResults: 3,
|
|
437
|
-
provider: "auto",
|
|
438
|
-
});
|
|
439
|
-
const siteUrls = Array.isArray(siteResult?.results)
|
|
440
|
-
? siteResult.results
|
|
441
|
-
.map((r) => String(r?.url ?? "").trim())
|
|
442
|
-
.filter((u) => u.startsWith("http") && !urls.includes(u))
|
|
443
|
-
.slice(0, 2)
|
|
444
|
-
: [];
|
|
445
|
-
urls.push(...siteUrls);
|
|
446
|
-
}
|
|
447
|
-
catch {
|
|
448
|
-
// Continue
|
|
449
|
-
}
|
|
450
|
-
break; // Only do one site-specific search
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
// Step 2c: For grant/award questions mentioning papers, add a direct paper search
|
|
454
|
-
// to bypass the blog→paper hop (which is fragile due to search non-determinism).
|
|
455
|
-
const needsPaper = (promptLower.includes("award") || promptLower.includes("grant")) &&
|
|
456
|
-
(promptLower.includes("paper") || promptLower.includes("article"));
|
|
457
|
-
if (needsPaper) {
|
|
458
|
-
try {
|
|
459
|
-
const paperQueryContents = [
|
|
460
|
-
{
|
|
461
|
-
role: "user",
|
|
462
|
-
parts: [
|
|
463
|
-
{
|
|
464
|
-
text: "From this question, extract the key details about the scientific paper mentioned. " +
|
|
465
|
-
"Generate a search query that would find the paper directly on a scholarly database " +
|
|
466
|
-
"(e.g., IOPscience, arXiv, Nature, NASA ADS). Include author names, topic, and year. " +
|
|
467
|
-
"Return ONLY the search query, nothing else.\n\n" +
|
|
468
|
-
`QUESTION:\n${task.prompt}`,
|
|
469
|
-
},
|
|
470
|
-
],
|
|
471
|
-
},
|
|
472
|
-
];
|
|
473
|
-
const paperQuery = await llmGenerateText(llm, paperQueryContents);
|
|
474
|
-
if (paperQuery && paperQuery.length > 5 && paperQuery.length < 300) {
|
|
475
|
-
const paperResult = await webSearchHandler({
|
|
476
|
-
query: paperQuery,
|
|
477
|
-
maxResults: 5,
|
|
478
|
-
provider: "auto",
|
|
479
|
-
});
|
|
480
|
-
const paperUrls = Array.isArray(paperResult?.results)
|
|
481
|
-
? paperResult.results
|
|
482
|
-
.map((r) => String(r?.url ?? "").trim())
|
|
483
|
-
.filter((u) => u.startsWith("http") && !urls.includes(u) &&
|
|
484
|
-
(u.includes("doi.org") || u.includes("iopscience") || u.includes("arxiv") ||
|
|
485
|
-
u.includes("nature.com/articles") || u.includes("adsabs") ||
|
|
486
|
-
u.includes("journals.aas.org") || u.includes("science.org")))
|
|
487
|
-
.slice(0, 2)
|
|
488
|
-
: [];
|
|
489
|
-
urls.push(...paperUrls);
|
|
490
|
-
// Also do an explicit arxiv search — arxiv has full text with acknowledgments
|
|
491
|
-
if (paperUrls.length === 0 || !paperUrls.some((u) => u.includes("arxiv"))) {
|
|
492
|
-
try {
|
|
493
|
-
const arxivResult = await webSearchHandler({
|
|
494
|
-
query: `site:arxiv.org ${paperQuery}`,
|
|
495
|
-
maxResults: 3,
|
|
496
|
-
provider: "auto",
|
|
497
|
-
});
|
|
498
|
-
const arxivUrls = Array.isArray(arxivResult?.results)
|
|
499
|
-
? arxivResult.results
|
|
500
|
-
.map((r) => String(r?.url ?? "").trim())
|
|
501
|
-
.filter((u) => u.startsWith("http") && u.includes("arxiv") && !urls.includes(u))
|
|
502
|
-
.slice(0, 2)
|
|
503
|
-
: [];
|
|
504
|
-
urls.push(...arxivUrls);
|
|
505
|
-
}
|
|
506
|
-
catch { /* continue */ }
|
|
507
|
-
}
|
|
508
|
-
}
|
|
509
|
-
}
|
|
510
|
-
catch {
|
|
511
|
-
// Continue
|
|
512
|
-
}
|
|
513
|
-
}
|
|
514
|
-
// Step 2d: For arxiv abs URLs, also include the HTML version (full text with acknowledgments)
|
|
515
|
-
const extraArxivUrls = [];
|
|
516
|
-
for (const u of urls) {
|
|
517
|
-
if (u.includes("arxiv.org/abs/")) {
|
|
518
|
-
const htmlUrl = u.replace("/abs/", "/html/");
|
|
519
|
-
if (!urls.includes(htmlUrl) && !extraArxivUrls.includes(htmlUrl)) {
|
|
520
|
-
extraArxivUrls.push(htmlUrl);
|
|
521
|
-
}
|
|
522
|
-
}
|
|
523
|
-
}
|
|
524
|
-
urls.push(...extraArxivUrls);
|
|
525
|
-
// Step 3: Fetch top URLs (cap at 7 to allow arxiv variants)
|
|
526
|
-
const fetchUrls = urls.slice(0, 7);
|
|
527
|
-
const fetched = [];
|
|
528
|
-
for (const url of fetchUrls) {
|
|
529
|
-
try {
|
|
530
|
-
// Use larger maxLength for scholarly URLs that may contain acknowledgments/funding sections
|
|
531
|
-
// arxiv HTML papers need extra space — acknowledgments are at the very end
|
|
532
|
-
const isArxivHtml = url.includes("arxiv.org/html/");
|
|
533
|
-
const isScholarlyUrl = url.includes("arxiv") || url.includes("doi.org") || url.includes("iopscience") ||
|
|
534
|
-
url.includes("nature.com/articles") || url.includes("science.org") ||
|
|
535
|
-
url.includes("journals.aas.org") || url.includes("adsabs");
|
|
536
|
-
fetched.push(await fetchUrlHandler({
|
|
537
|
-
url,
|
|
538
|
-
extractMode: "markdown",
|
|
539
|
-
maxLength: isArxivHtml ? 200000 : isScholarlyUrl ? 48000 : 16000,
|
|
540
|
-
}));
|
|
541
|
-
}
|
|
542
|
-
catch {
|
|
543
|
-
fetched.push({ content: "", title: "" });
|
|
544
|
-
}
|
|
545
|
-
}
|
|
546
|
-
// Step 4: Aggressively follow linked URLs from fetched content
|
|
547
|
-
const followUpUrls = [];
|
|
548
|
-
for (const item of fetched) {
|
|
549
|
-
const content = String(item?.content ?? "");
|
|
550
|
-
const urlMatches = content.match(/https?:\/\/[^\s)\]>"']+/g) ?? [];
|
|
551
|
-
for (const foundUrl of urlMatches) {
|
|
552
|
-
const cleanUrl = foundUrl.replace(/[.,;:!?)]+$/, "");
|
|
553
|
-
if (fetchUrls.includes(cleanUrl) || followUpUrls.includes(cleanUrl))
|
|
554
|
-
continue;
|
|
555
|
-
// Broadly follow links to authoritative sources
|
|
556
|
-
const isScholarly = cleanUrl.includes("arxiv") ||
|
|
557
|
-
cleanUrl.includes("doi.org") ||
|
|
558
|
-
cleanUrl.includes("iopscience") ||
|
|
559
|
-
cleanUrl.includes("nature.com/articles") ||
|
|
560
|
-
cleanUrl.includes("science.org") ||
|
|
561
|
-
cleanUrl.includes("springer.com") ||
|
|
562
|
-
cleanUrl.includes("adsabs.harvard.edu") ||
|
|
563
|
-
cleanUrl.includes("journals.aas.org") ||
|
|
564
|
-
cleanUrl.includes("academic.oup.com") ||
|
|
565
|
-
cleanUrl.includes("agupubs.onlinelibrary.wiley.com");
|
|
566
|
-
const isGov = cleanUrl.includes("nasa.gov") ||
|
|
567
|
-
cleanUrl.includes("usgs.gov") ||
|
|
568
|
-
cleanUrl.includes(".gov/");
|
|
569
|
-
const isRelevant =
|
|
570
|
-
// Paper/article references
|
|
571
|
-
(promptLower.includes("paper") && (isScholarly || isGov)) ||
|
|
572
|
-
(promptLower.includes("article") && (isScholarly || cleanUrl.includes("nature.com"))) ||
|
|
573
|
-
// Database references
|
|
574
|
-
(promptLower.includes("database") && isGov) ||
|
|
575
|
-
// Award/grant references — follow any scholarly/gov/DOI link
|
|
576
|
-
((promptLower.includes("award") || promptLower.includes("grant")) &&
|
|
577
|
-
(isGov || isScholarly || cleanUrl.includes("grant") || cleanUrl.includes("doi.org"))) ||
|
|
578
|
-
// NASA-related questions
|
|
579
|
-
(promptLower.includes("nasa") && isGov) ||
|
|
580
|
-
// Blog/news → follow scholarly + gov links
|
|
581
|
-
((promptLower.includes("universe today") ||
|
|
582
|
-
promptLower.includes("blog") ||
|
|
583
|
-
promptLower.includes("published in") ||
|
|
584
|
-
promptLower.includes("published on")) &&
|
|
585
|
-
(isScholarly || isGov));
|
|
586
|
-
if (isRelevant) {
|
|
587
|
-
followUpUrls.push(cleanUrl);
|
|
588
|
-
if (followUpUrls.length >= 5)
|
|
589
|
-
break;
|
|
590
|
-
}
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
|
-
// Fetch follow-up URLs — use larger maxLength for scholarly/paper links to capture acknowledgments
|
|
594
|
-
const allFetchedUrls = [...fetchUrls];
|
|
595
|
-
for (const url of followUpUrls) {
|
|
596
|
-
try {
|
|
597
|
-
const isArxivHtml = url.includes("arxiv.org/html/");
|
|
598
|
-
const isScholarlyUrl = url.includes("arxiv") || url.includes("doi.org") || url.includes("iopscience") ||
|
|
599
|
-
url.includes("nature.com/articles") || url.includes("science.org") ||
|
|
600
|
-
url.includes("springer.com") || url.includes("nasa.gov") ||
|
|
601
|
-
url.includes("journals.aas.org") || url.includes("adsabs.harvard.edu");
|
|
602
|
-
fetched.push(await fetchUrlHandler({
|
|
603
|
-
url,
|
|
604
|
-
extractMode: "markdown",
|
|
605
|
-
maxLength: isArxivHtml ? 200000 : isScholarlyUrl ? 48000 : 16000,
|
|
606
|
-
}));
|
|
607
|
-
allFetchedUrls.push(url);
|
|
608
|
-
}
|
|
609
|
-
catch {
|
|
610
|
-
// Skip failed fetches
|
|
611
|
-
}
|
|
612
|
-
}
|
|
613
|
-
// For scholarly follow-ups, include more content in the source block
|
|
614
|
-
const sourcesBlock = allFetchedUrls
|
|
615
|
-
.map((u, i) => {
|
|
616
|
-
const item = fetched[i];
|
|
617
|
-
const title = String(item?.title ?? "").trim();
|
|
618
|
-
const isScholarlySource = u.includes("arxiv") || u.includes("doi.org") || u.includes("iopscience") ||
|
|
619
|
-
u.includes("nature.com/articles") || u.includes("science.org") ||
|
|
620
|
-
u.includes("journals.aas.org") || u.includes("nasa.gov");
|
|
621
|
-
const rawContent = String(item?.content ?? "");
|
|
622
|
-
// For long scholarly content: extract the beginning + acknowledgments/funding section
|
|
623
|
-
let content;
|
|
624
|
-
if (isScholarlySource && rawContent.length > 30000) {
|
|
625
|
-
const beginning = rawContent.slice(0, 10000);
|
|
626
|
-
// Search for acknowledgments, funding, or notes sections near the end
|
|
627
|
-
const ackPatterns = [
|
|
628
|
-
/#{1,4}\s*Acknowledg/i, /#{1,4}\s*Funding/i, /#{1,4}\s*Notes/i,
|
|
629
|
-
/\*\*Acknowledg/i, /\*\*Funding/i,
|
|
630
|
-
/\bAcknowledg(?:e)?ments?\b/i, /\bFunding\b/i,
|
|
631
|
-
];
|
|
632
|
-
let ackStart = -1;
|
|
633
|
-
for (const pat of ackPatterns) {
|
|
634
|
-
const idx = rawContent.search(pat);
|
|
635
|
-
if (idx > 0 && (ackStart === -1 || idx < ackStart))
|
|
636
|
-
ackStart = idx;
|
|
637
|
-
}
|
|
638
|
-
if (ackStart > 0) {
|
|
639
|
-
const ackSection = rawContent.slice(Math.max(0, ackStart - 200), ackStart + 20000);
|
|
640
|
-
content = beginning + "\n\n[...MIDDLE OF PAPER OMITTED...]\n\n" + ackSection;
|
|
641
|
-
}
|
|
642
|
-
else {
|
|
643
|
-
// No ack section found — try the end of the paper
|
|
644
|
-
content = beginning + "\n\n[...MIDDLE OF PAPER OMITTED...]\n\n" + rawContent.slice(-20000);
|
|
645
|
-
}
|
|
646
|
-
}
|
|
647
|
-
else {
|
|
648
|
-
content = rawContent.slice(0, isScholarlySource ? 30000 : 10000);
|
|
649
|
-
}
|
|
650
|
-
return [`SOURCE ${i + 1}: ${title || u}`, `URL: ${u}`, `CONTENT:\n${content}`].join("\n");
|
|
651
|
-
})
|
|
652
|
-
.join("\n\n");
|
|
653
|
-
// Step 5: Final answer — always use Gemini with code execution when available
|
|
654
|
-
// This gives the model the OPTION to write code for math tasks while also
|
|
655
|
-
// providing consistent, high-quality answers for all tasks.
|
|
656
|
-
if (process.env.GEMINI_API_KEY) {
|
|
657
|
-
try {
|
|
658
|
-
const mod = await import("@google/genai");
|
|
659
|
-
const { GoogleGenAI } = mod;
|
|
660
|
-
let gemModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? "gemini-3-flash-preview";
|
|
661
|
-
if (gemModel.includes(":"))
|
|
662
|
-
gemModel = gemModel.split(":").pop();
|
|
663
|
-
const ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });
|
|
664
|
-
// Detect if question asks for a specific identifier
|
|
665
|
-
const asksForId = promptLower.includes("grant") || promptLower.includes("award") ||
|
|
666
|
-
promptLower.includes("identifier") || promptLower.includes("number") ||
|
|
667
|
-
promptLower.includes("code") || promptLower.includes("id ");
|
|
668
|
-
// Scan all fetched content for NASA grant numbers
|
|
669
|
-
const allFetchedText = fetched.map((f) => String(f?.content ?? "")).join("\n");
|
|
670
|
-
const foundGrants = extractNasaGrantNumbers(allFetchedText);
|
|
671
|
-
const grantHint = (asksForId && foundGrants.length > 0)
|
|
672
|
-
? `\nNASA GRANT NUMBERS FOUND IN SOURCES: ${foundGrants.join(", ")}\nIf the question asks for a grant/award number, one of these is likely the answer.`
|
|
673
|
-
: "";
|
|
674
|
-
const codeExecPrompt = [
|
|
675
|
-
"Answer the question using the provided sources AND your knowledge.",
|
|
676
|
-
...(opts.baselineHint
|
|
677
|
-
? [
|
|
678
|
-
`IMPORTANT: Your preliminary answer (without web search) was: "${opts.baselineHint}"`,
|
|
679
|
-
"Your task is to VERIFY this answer using the web sources.",
|
|
680
|
-
"ONLY change your preliminary answer if the sources provide CLEAR, DIRECT, UNAMBIGUOUS evidence that it is wrong.",
|
|
681
|
-
"If the sources don't directly address the exact question, give conflicting numbers, or seem unreliable, KEEP your preliminary answer.",
|
|
682
|
-
"Your training data is often more reliable than noisy web search results.",
|
|
683
|
-
]
|
|
684
|
-
: []),
|
|
685
|
-
...(needsMath
|
|
686
|
-
? [
|
|
687
|
-
"This question requires counting, math, or data analysis.",
|
|
688
|
-
"Write Python code to compute the answer precisely from the source data.",
|
|
689
|
-
]
|
|
690
|
-
: [
|
|
691
|
-
"If the answer requires any counting, math, or data lookup, write Python code to compute it precisely.",
|
|
692
|
-
]),
|
|
693
|
-
"If the question asks about a specific identifier (grant number, ID, code), extract it directly from the sources.",
|
|
694
|
-
...(asksForId
|
|
695
|
-
? [
|
|
696
|
-
"IMPORTANT: Look in 'Acknowledgments', 'Acknowledgements', 'Funding', and 'Notes' sections of papers.",
|
|
697
|
-
"NASA grant numbers follow patterns like: 80GSFC..., 80NSSC..., NNX..., NNG..., NNH..., NAS...",
|
|
698
|
-
"Extract the EXACT identifier string — do not paraphrase or summarize it.",
|
|
699
|
-
]
|
|
700
|
-
: []),
|
|
701
|
-
"",
|
|
702
|
-
"Return ONLY the final answer, no explanation.",
|
|
703
|
-
"",
|
|
704
|
-
`QUESTION:\n${task.prompt}`,
|
|
705
|
-
...(grantHint ? [grantHint] : []),
|
|
706
|
-
"",
|
|
707
|
-
sourcesBlock || "NO_SOURCES_FOUND",
|
|
708
|
-
].join("\n");
|
|
709
|
-
const response = await ai.models.generateContent({
|
|
710
|
-
model: gemModel,
|
|
711
|
-
contents: [{ role: "user", parts: [{ text: codeExecPrompt }] }],
|
|
712
|
-
config: {
|
|
713
|
-
tools: [{ codeExecution: {} }],
|
|
714
|
-
temperature: 0,
|
|
715
|
-
maxOutputTokens: 4096,
|
|
716
|
-
},
|
|
717
|
-
});
|
|
718
|
-
const parts = response?.candidates?.[0]?.content?.parts ?? [];
|
|
719
|
-
// Prefer code execution output
|
|
720
|
-
const codeExecParts = parts.filter((p) => p.codeExecutionResult);
|
|
721
|
-
if (codeExecParts.length > 0) {
|
|
722
|
-
const output = String(codeExecParts[codeExecParts.length - 1].codeExecutionResult?.output ?? "").trim();
|
|
723
|
-
const lines = output.split("\n").map((l) => l.trim()).filter(Boolean);
|
|
724
|
-
if (lines.length > 0) {
|
|
725
|
-
return { answer: lines[lines.length - 1], toolCalls: 1 + allFetchedUrls.length };
|
|
726
|
-
}
|
|
727
|
-
}
|
|
728
|
-
const textAnswer = parts.map((p) => p?.text ?? "").join("").trim();
|
|
729
|
-
if (textAnswer) {
|
|
730
|
-
return { answer: textAnswer, toolCalls: 1 + allFetchedUrls.length };
|
|
731
|
-
}
|
|
732
|
-
}
|
|
733
|
-
catch {
|
|
734
|
-
// Fall through to standard LLM answer
|
|
735
|
-
}
|
|
736
|
-
}
|
|
737
|
-
// Fallback: Standard LLM answer (when no Gemini API key)
|
|
738
|
-
const contents = [
|
|
739
|
-
{
|
|
740
|
-
role: "user",
|
|
741
|
-
parts: [
|
|
742
|
-
{
|
|
743
|
-
text: "Answer the question using ONLY the provided sources. " +
|
|
744
|
-
"If the sources are insufficient, make the best supported guess.\n\n" +
|
|
745
|
-
"Return ONLY the final answer, no explanation.\n\n" +
|
|
746
|
-
`TASK_ID: ${task.id}\nQUESTION:\n${task.prompt}\n\n` +
|
|
747
|
-
(sourcesBlock ? sourcesBlock : "NO_SOURCES_FOUND"),
|
|
748
|
-
},
|
|
749
|
-
],
|
|
750
|
-
},
|
|
751
|
-
];
|
|
752
|
-
const answer = await llmGenerateText(llm, contents);
|
|
753
|
-
return { answer, toolCalls: 1 + allFetchedUrls.length };
|
|
754
|
-
}
|
|
755
|
-
const toolUsageSummary = [
|
|
756
|
-
"You have access to tools:",
|
|
757
|
-
"- web_search({query,maxResults,provider})",
|
|
758
|
-
"- fetch_url({url,extractMode,maxLength})",
|
|
759
|
-
"",
|
|
760
|
-
"When using tools, respond with a single JSON object only:",
|
|
761
|
-
`{"action":"tool","name":"web_search","arguments":{"query":"...","maxResults":5}}`,
|
|
762
|
-
"When done, respond with:",
|
|
763
|
-
`{"action":"final","answer":"..."}`,
|
|
764
|
-
"",
|
|
765
|
-
"Rules:",
|
|
766
|
-
"- ALWAYS start with web_search to find relevant sources.",
|
|
767
|
-
"- After search, use fetch_url to read the most promising result pages.",
|
|
768
|
-
"- Do NOT answer based only on snippets; fetch_url and extract the exact value when possible.",
|
|
769
|
-
"- If a page mentions a linked resource (paper, database entry, article), fetch that linked URL too.",
|
|
770
|
-
"- If the question requires counting/math, do the calculation explicitly before answering.",
|
|
771
|
-
"- If the question asks about a database (USGS, etc.), search for the specific database and try to access its query results directly.",
|
|
772
|
-
"- If the question involves finding a linked paper from an article, fetch the article first, then follow the paper link.",
|
|
773
|
-
"- If the question specifies a timeframe (e.g. 'as of end of 2022'), prioritize archival sources.",
|
|
774
|
-
"- Keep tool arguments small (maxResults<=5, maxLength<=16000).",
|
|
775
|
-
"- Do NOT include any explanation. Final answer must match the requested formatting.",
|
|
776
|
-
].join("\n");
|
|
777
|
-
const contents = [
|
|
778
|
-
{
|
|
779
|
-
role: "user",
|
|
780
|
-
parts: [
|
|
781
|
-
{
|
|
782
|
-
text: `${toolUsageSummary}\n\nTASK_ID: ${task.id}\nQUESTION:\n${task.prompt}`,
|
|
783
|
-
},
|
|
784
|
-
],
|
|
785
|
-
},
|
|
786
|
-
];
|
|
787
|
-
let toolCalls = 0;
|
|
788
|
-
let usedWebSearch = false;
|
|
789
|
-
let usedFetchUrl = false;
|
|
790
|
-
for (let step = 0; step < opts.maxSteps; step++) {
|
|
791
|
-
const out = await llmGenerateText(llm, contents);
|
|
792
|
-
contents.push({ role: "model", parts: [{ text: out }] });
|
|
793
|
-
const parsed = extractJsonObject(out);
|
|
794
|
-
if (!parsed || typeof parsed !== "object") {
|
|
795
|
-
// Ask the model to restate as JSON only.
|
|
796
|
-
contents.push({
|
|
797
|
-
role: "user",
|
|
798
|
-
parts: [{ text: "Invalid format. Return JSON only with action tool|final." }],
|
|
799
|
-
});
|
|
800
|
-
continue;
|
|
801
|
-
}
|
|
802
|
-
if (parsed.action === "final") {
|
|
803
|
-
if (forceWebSearch && !usedWebSearch) {
|
|
804
|
-
contents.push({
|
|
805
|
-
role: "user",
|
|
806
|
-
parts: [{ text: "Before answering, you MUST call web_search at least once. Continue." }],
|
|
807
|
-
});
|
|
808
|
-
continue;
|
|
809
|
-
}
|
|
810
|
-
if (forceFetchUrl && !usedFetchUrl) {
|
|
811
|
-
contents.push({
|
|
812
|
-
role: "user",
|
|
813
|
-
parts: [{ text: "Before answering, you MUST call fetch_url at least once. Continue." }],
|
|
814
|
-
});
|
|
815
|
-
continue;
|
|
816
|
-
}
|
|
817
|
-
const answer = String(parsed.answer ?? "").trim();
|
|
818
|
-
return { answer, toolCalls };
|
|
819
|
-
}
|
|
820
|
-
if (parsed.action !== "tool") {
|
|
821
|
-
contents.push({
|
|
822
|
-
role: "user",
|
|
823
|
-
parts: [{ text: "Invalid action. Return JSON only with action tool|final." }],
|
|
824
|
-
});
|
|
825
|
-
continue;
|
|
826
|
-
}
|
|
827
|
-
if (toolCalls >= opts.maxToolCalls) {
|
|
828
|
-
contents.push({
|
|
829
|
-
role: "user",
|
|
830
|
-
parts: [{ text: "Tool call budget exceeded. Return final answer now." }],
|
|
831
|
-
});
|
|
832
|
-
continue;
|
|
833
|
-
}
|
|
834
|
-
const name = String(parsed.name ?? "");
|
|
835
|
-
const tool = toolIndex.get(name);
|
|
836
|
-
if (!tool) {
|
|
837
|
-
contents.push({
|
|
838
|
-
role: "user",
|
|
839
|
-
parts: [{ text: `Unknown tool "${name}". Use only web_search or fetch_url.` }],
|
|
840
|
-
});
|
|
841
|
-
continue;
|
|
842
|
-
}
|
|
843
|
-
const args = (parsed.arguments ?? {});
|
|
844
|
-
// Hard limits for safety.
|
|
845
|
-
if (name === "web_search") {
|
|
846
|
-
if (typeof args.maxResults !== "number")
|
|
847
|
-
args.maxResults = 5;
|
|
848
|
-
args.maxResults = Math.min(Number(args.maxResults) || 5, 5);
|
|
849
|
-
if (!args.provider)
|
|
850
|
-
args.provider = "auto";
|
|
851
|
-
}
|
|
852
|
-
else if (name === "fetch_url") {
|
|
853
|
-
if (!args.extractMode)
|
|
854
|
-
args.extractMode = "markdown";
|
|
855
|
-
if (typeof args.maxLength !== "number")
|
|
856
|
-
args.maxLength = 16000;
|
|
857
|
-
args.maxLength = Math.min(Number(args.maxLength) || 16000, 16000);
|
|
858
|
-
}
|
|
859
|
-
toolCalls++;
|
|
860
|
-
if (name === "web_search")
|
|
861
|
-
usedWebSearch = true;
|
|
862
|
-
if (name === "fetch_url")
|
|
863
|
-
usedFetchUrl = true;
|
|
864
|
-
const toolResult = await tool.handler(args);
|
|
865
|
-
// Provide a bounded JSON summary to the model. Avoid dumping large content.
|
|
866
|
-
const toolResultText = JSON.stringify(toolResult).slice(0, 16000);
|
|
867
|
-
contents.push({
|
|
868
|
-
role: "user",
|
|
869
|
-
parts: [
|
|
870
|
-
{
|
|
871
|
-
text: `TOOL_RESULT ${name}:\n${toolResultText}\n\nContinue. Return JSON only.`,
|
|
872
|
-
},
|
|
873
|
-
],
|
|
874
|
-
});
|
|
875
|
-
}
|
|
876
|
-
// If we ran out of steps, force a final answer.
|
|
877
|
-
contents.push({
|
|
878
|
-
role: "user",
|
|
879
|
-
parts: [{ text: "Out of steps. Return final answer now as JSON." }],
|
|
880
|
-
});
|
|
881
|
-
const out = await llmGenerateText(llm, contents);
|
|
882
|
-
const parsed = extractJsonObject(out);
|
|
883
|
-
const answer = parsed && parsed.action === "final" ? String(parsed.answer ?? "").trim() : out.trim();
|
|
884
|
-
return { answer, toolCalls };
|
|
885
|
-
}
|
|
886
|
-
/**
|
|
887
|
-
* Enhanced RAG with Gemini code execution for web tasks.
|
|
888
|
-
* Uses multi-query search, aggressive link following, and Gemini's built-in
|
|
889
|
-
* codeExecution so the model can write Python for math/counting tasks.
|
|
890
|
-
* (Gemini 3 preview doesn't support functionDeclarations, so we orchestrate
|
|
891
|
-
* tool calls ourselves and let the model reason with code execution.)
|
|
892
|
-
*/
|
|
893
|
-
async function toolAugmentedAnswerNativeFC(task, opts) {
|
|
894
|
-
// Pre-check: deterministic solvers
|
|
895
|
-
const caesarAnswer = tryCaesarCipherSolve(task);
|
|
896
|
-
if (caesarAnswer)
|
|
897
|
-
return { answer: caesarAnswer, toolCalls: 0 };
|
|
898
|
-
const apiKey = process.env.GEMINI_API_KEY;
|
|
899
|
-
if (!apiKey)
|
|
900
|
-
throw new Error("GEMINI_API_KEY required");
|
|
901
|
-
let model = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? "gemini-3-flash-preview";
|
|
902
|
-
if (model.includes(":"))
|
|
903
|
-
model = model.split(":").pop();
|
|
904
|
-
const toolIndex = buildToolIndex();
|
|
905
|
-
const webSearch = toolIndex.get("web_search");
|
|
906
|
-
const fetchUrl = toolIndex.get("fetch_url");
|
|
907
|
-
if (!webSearch || !fetchUrl)
|
|
908
|
-
throw new Error("Missing web_search/fetch_url tools");
|
|
909
|
-
const mod = await import("@google/genai");
|
|
910
|
-
const { GoogleGenAI } = mod;
|
|
911
|
-
const ai = new GoogleGenAI({ apiKey });
|
|
912
|
-
// Helper: generate text with Gemini, optionally with code execution
|
|
913
|
-
async function geminiGenerate(prompt, genOpts) {
|
|
914
|
-
const config = {
|
|
915
|
-
temperature: 0,
|
|
916
|
-
maxOutputTokens: genOpts?.maxOutputTokens ?? 4096,
|
|
917
|
-
};
|
|
918
|
-
if (genOpts?.codeExecution)
|
|
919
|
-
config.tools = [{ codeExecution: {} }];
|
|
920
|
-
const response = await ai.models.generateContent({
|
|
921
|
-
model,
|
|
922
|
-
contents: [{ role: "user", parts: [{ text: prompt }] }],
|
|
923
|
-
config,
|
|
924
|
-
});
|
|
925
|
-
const parts = response?.candidates?.[0]?.content?.parts ?? [];
|
|
926
|
-
// Prefer code execution output if available
|
|
927
|
-
const codeExecParts = parts.filter((p) => p.codeExecutionResult);
|
|
928
|
-
if (codeExecParts.length > 0) {
|
|
929
|
-
const output = String(codeExecParts[codeExecParts.length - 1].codeExecutionResult?.output ?? "").trim();
|
|
930
|
-
const lines = output.split("\n").map((l) => l.trim()).filter(Boolean);
|
|
931
|
-
if (lines.length > 0)
|
|
932
|
-
return lines[lines.length - 1];
|
|
933
|
-
}
|
|
934
|
-
return parts.map((p) => p?.text ?? "").join("").trim();
|
|
935
|
-
}
|
|
936
|
-
let toolCalls = 0;
|
|
937
|
-
const promptLower = task.prompt.toLowerCase();
|
|
938
|
-
// Detect if the task involves math/counting/computation
|
|
939
|
-
const needsMath = promptLower.includes("how many") ||
|
|
940
|
-
promptLower.includes("calculate") ||
|
|
941
|
-
promptLower.includes("compute") ||
|
|
942
|
-
promptLower.includes("p-value") ||
|
|
943
|
-
promptLower.includes("incorrect") ||
|
|
944
|
-
promptLower.includes("percentage") ||
|
|
945
|
-
/\d+.*\d+/.test(task.prompt);
|
|
946
|
-
// Step 1: Generate two search queries — one direct, one from a different angle
|
|
947
|
-
let searchQueries = [];
|
|
948
|
-
try {
|
|
949
|
-
const queryPrompt = [
|
|
950
|
-
"Generate exactly 2 web search queries to find the answer to this question.",
|
|
951
|
-
"Query 1: A concise, direct query with key names, dates, and specific terms.",
|
|
952
|
-
"Query 2: A different-angle query targeting the underlying source (paper, database, official page, grant).",
|
|
953
|
-
"Return exactly 2 lines, one query per line, nothing else.",
|
|
954
|
-
"",
|
|
955
|
-
`QUESTION:\n${task.prompt}`,
|
|
956
|
-
].join("\n");
|
|
957
|
-
const queryText = await geminiGenerate(queryPrompt, { maxOutputTokens: 512 });
|
|
958
|
-
searchQueries = queryText
|
|
959
|
-
.split("\n")
|
|
960
|
-
.map((q) => q
|
|
961
|
-
.replace(/^\d+[\.\)]\s*/, "")
|
|
962
|
-
.replace(/^(Query \d+:\s*)/i, "")
|
|
963
|
-
.replace(/^["']|["']$/g, "")
|
|
964
|
-
.trim())
|
|
965
|
-
.filter((q) => q.length > 5 && q.length < 300);
|
|
966
|
-
}
|
|
967
|
-
catch {
|
|
968
|
-
// Fall through
|
|
969
|
-
}
|
|
970
|
-
if (searchQueries.length === 0)
|
|
971
|
-
searchQueries = [task.prompt];
|
|
972
|
-
searchQueries = searchQueries.slice(0, 2);
|
|
973
|
-
// Step 2: Search with both queries
|
|
974
|
-
const allUrls = [];
|
|
975
|
-
for (const query of searchQueries) {
|
|
976
|
-
try {
|
|
977
|
-
const result = await webSearch.handler({
|
|
978
|
-
query,
|
|
979
|
-
maxResults: 5,
|
|
980
|
-
provider: "auto",
|
|
981
|
-
});
|
|
982
|
-
toolCalls++;
|
|
983
|
-
const results = Array.isArray(result?.results) ? result.results : [];
|
|
984
|
-
for (const r of results) {
|
|
985
|
-
const url = String(r?.url ?? "").trim();
|
|
986
|
-
if (url.startsWith("http") && !allUrls.includes(url)) {
|
|
987
|
-
allUrls.push(url);
|
|
988
|
-
}
|
|
989
|
-
}
|
|
990
|
-
}
|
|
991
|
-
catch {
|
|
992
|
-
// Continue
|
|
993
|
-
}
|
|
994
|
-
}
|
|
995
|
-
// Step 3: Fetch top 4 URLs
|
|
996
|
-
const fetchLimit = Math.min(allUrls.length, 4);
|
|
997
|
-
const fetchedContent = [];
|
|
998
|
-
for (let i = 0; i < fetchLimit; i++) {
|
|
999
|
-
try {
|
|
1000
|
-
const result = await fetchUrl.handler({
|
|
1001
|
-
url: allUrls[i],
|
|
1002
|
-
extractMode: "markdown",
|
|
1003
|
-
maxLength: 16000,
|
|
1004
|
-
});
|
|
1005
|
-
toolCalls++;
|
|
1006
|
-
fetchedContent.push({
|
|
1007
|
-
url: allUrls[i],
|
|
1008
|
-
title: String(result?.title ?? ""),
|
|
1009
|
-
content: String(result?.content ?? "").slice(0, 12000),
|
|
1010
|
-
});
|
|
1011
|
-
}
|
|
1012
|
-
catch {
|
|
1013
|
-
// Skip failed fetches
|
|
1014
|
-
}
|
|
1015
|
-
}
|
|
1016
|
-
// Step 4: Extract and follow relevant linked URLs from fetched content
|
|
1017
|
-
const followUpUrls = [];
|
|
1018
|
-
for (const item of fetchedContent) {
|
|
1019
|
-
const urlMatches = item.content.match(/https?:\/\/[^\s)\]>"']+/g) ?? [];
|
|
1020
|
-
for (const foundUrl of urlMatches) {
|
|
1021
|
-
const cleanUrl = foundUrl.replace(/[.,;:!?)]+$/, "");
|
|
1022
|
-
if (allUrls.includes(cleanUrl) || followUpUrls.includes(cleanUrl))
|
|
1023
|
-
continue;
|
|
1024
|
-
// Broadly follow links to authoritative sources
|
|
1025
|
-
const isScholarly = cleanUrl.includes("arxiv") ||
|
|
1026
|
-
cleanUrl.includes("doi.org") ||
|
|
1027
|
-
cleanUrl.includes("iopscience") ||
|
|
1028
|
-
cleanUrl.includes("nature.com/articles") ||
|
|
1029
|
-
cleanUrl.includes("science.org") ||
|
|
1030
|
-
cleanUrl.includes("springer.com");
|
|
1031
|
-
const isGov = cleanUrl.includes("nasa.gov") ||
|
|
1032
|
-
cleanUrl.includes("usgs.gov") ||
|
|
1033
|
-
cleanUrl.includes(".gov/");
|
|
1034
|
-
const isRelevant = (promptLower.includes("paper") && (isScholarly || isGov)) ||
|
|
1035
|
-
(promptLower.includes("database") && isGov) ||
|
|
1036
|
-
(promptLower.includes("article") && (isScholarly || cleanUrl.includes("nature.com"))) ||
|
|
1037
|
-
(promptLower.includes("award") && (isGov || cleanUrl.includes("grant"))) ||
|
|
1038
|
-
(promptLower.includes("nasa") && isGov) ||
|
|
1039
|
-
// Any question mentioning a website/blog — follow scholarly + gov links found in content
|
|
1040
|
-
((promptLower.includes("universe today") ||
|
|
1041
|
-
promptLower.includes("blog") ||
|
|
1042
|
-
promptLower.includes("published")) &&
|
|
1043
|
-
(isScholarly || isGov));
|
|
1044
|
-
if (isRelevant) {
|
|
1045
|
-
followUpUrls.push(cleanUrl);
|
|
1046
|
-
if (followUpUrls.length >= 3)
|
|
1047
|
-
break;
|
|
1048
|
-
}
|
|
1049
|
-
}
|
|
1050
|
-
}
|
|
1051
|
-
for (const url of followUpUrls) {
|
|
1052
|
-
try {
|
|
1053
|
-
const result = await fetchUrl.handler({
|
|
1054
|
-
url,
|
|
1055
|
-
extractMode: "markdown",
|
|
1056
|
-
maxLength: 16000,
|
|
1057
|
-
});
|
|
1058
|
-
toolCalls++;
|
|
1059
|
-
fetchedContent.push({
|
|
1060
|
-
url,
|
|
1061
|
-
title: String(result?.title ?? ""),
|
|
1062
|
-
content: String(result?.content ?? "").slice(0, 12000),
|
|
1063
|
-
});
|
|
1064
|
-
}
|
|
1065
|
-
catch {
|
|
1066
|
-
// Skip
|
|
1067
|
-
}
|
|
1068
|
-
}
|
|
1069
|
-
// Step 5: Final answer — use code execution only when math is needed
|
|
1070
|
-
const sourcesBlock = fetchedContent
|
|
1071
|
-
.map((item, i) => `SOURCE ${i + 1}: ${item.title || item.url}\nURL: ${item.url}\nCONTENT:\n${item.content}`)
|
|
1072
|
-
.join("\n\n");
|
|
1073
|
-
const answerPrompt = [
|
|
1074
|
-
"Answer the question using ONLY the provided sources.",
|
|
1075
|
-
...(needsMath
|
|
1076
|
-
? [
|
|
1077
|
-
"This question requires precise computation. Write Python code to calculate the answer.",
|
|
1078
|
-
"Parse the relevant data from the sources and compute the result programmatically.",
|
|
1079
|
-
]
|
|
1080
|
-
: []),
|
|
1081
|
-
"If the sources are insufficient, make the best supported guess.",
|
|
1082
|
-
"",
|
|
1083
|
-
"Return ONLY the final answer, no explanation.",
|
|
1084
|
-
"",
|
|
1085
|
-
`QUESTION:\n${task.prompt}`,
|
|
1086
|
-
"",
|
|
1087
|
-
sourcesBlock || "NO_SOURCES_FOUND",
|
|
1088
|
-
].join("\n");
|
|
1089
|
-
const answer = await geminiGenerate(answerPrompt, { codeExecution: needsMath });
|
|
1090
|
-
return { answer, toolCalls };
|
|
1091
|
-
}
|
|
1092
|
-
async function loadFixture(fixturePath) {
|
|
1093
|
-
const raw = await readFile(fixturePath, "utf8");
|
|
1094
|
-
const parsed = JSON.parse(raw);
|
|
1095
|
-
if (!parsed || !Array.isArray(parsed.tasks))
|
|
1096
|
-
throw new Error("Invalid GAIA capability fixture");
|
|
1097
|
-
return parsed;
|
|
1098
|
-
}
|
|
1099
|
-
describe("Capability: GAIA accuracy (LLM-only vs LLM+tools)", () => {
|
|
1100
|
-
const testFn = shouldRun ? it : it.skip;
|
|
1101
|
-
testFn("should measure accuracy delta on a small GAIA subset", async () => {
|
|
1102
|
-
loadDotEnvLocalIfPresent();
|
|
1103
|
-
const fixturePath = resolveCapabilityFixturePath();
|
|
1104
|
-
if (!existsSync(fixturePath)) {
|
|
1105
|
-
throw new Error(`Missing GAIA capability fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityFixture.py`);
|
|
1106
|
-
}
|
|
1107
|
-
const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
|
|
1108
|
-
const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
|
|
1109
|
-
const baselineLlm = await createTextLlmClient({ model: baselineModel });
|
|
1110
|
-
const toolsLlm = await createTextLlmClient({ model: toolsModel });
|
|
1111
|
-
const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
|
|
1112
|
-
const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
|
|
1113
|
-
const fixture = await loadFixture(fixturePath);
|
|
1114
|
-
expect(Array.isArray(fixture.tasks)).toBe(true);
|
|
1115
|
-
expect(fixture.tasks.length).toBeGreaterThan(0);
|
|
1116
|
-
const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "6", 10);
|
|
1117
|
-
const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 6));
|
|
1118
|
-
const tasks = fixture.tasks.slice(0, taskLimit);
|
|
1119
|
-
const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
|
|
1120
|
-
const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
|
|
1121
|
-
const maxSteps = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_STEPS ?? "10", 10);
|
|
1122
|
-
const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "8", 10);
|
|
1123
|
-
// Auto-discover judge: free OpenRouter → paid LLM → deterministic-only
|
|
1124
|
-
const useJudge = process.env.NODEBENCH_GAIA_JUDGE !== "0";
|
|
1125
|
-
const judge = useJudge ? await autoDiscoverJudge(toolsLlm) : null;
|
|
1126
|
-
if (judge) {
|
|
1127
|
-
console.log(`[gaia-capability] judge: ${judge.provider}:${judge.model}`);
|
|
1128
|
-
}
|
|
1129
|
-
const results = new Array(tasks.length);
|
|
1130
|
-
let nextIndex = 0;
|
|
1131
|
-
const workers = Array.from({ length: concurrency }, () => (async () => {
|
|
1132
|
-
while (true) {
|
|
1133
|
-
const idx = nextIndex++;
|
|
1134
|
-
if (idx >= tasks.length)
|
|
1135
|
-
return;
|
|
1136
|
-
const task = tasks[idx];
|
|
1137
|
-
try {
|
|
1138
|
-
const baseStart = performance.now();
|
|
1139
|
-
const base = await baselineAnswer(baselineLlm, task);
|
|
1140
|
-
const baseMs = performance.now() - baseStart;
|
|
1141
|
-
const toolsStart = performance.now();
|
|
1142
|
-
const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
|
|
1143
|
-
const tools = toolsMode === "enhanced"
|
|
1144
|
-
? await toolAugmentedAnswerNativeFC(task, { maxSteps, maxToolCalls })
|
|
1145
|
-
: await toolAugmentedAnswer(toolsLlm, task, { maxSteps, maxToolCalls, baselineHint: base });
|
|
1146
|
-
const toolsMs = performance.now() - toolsStart;
|
|
1147
|
-
const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
|
|
1148
|
-
const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
|
|
1149
|
-
results[idx] = {
|
|
1150
|
-
taskId: task.id,
|
|
1151
|
-
baselineCorrect: baseJudge.match,
|
|
1152
|
-
toolsCorrect: toolsJudge.match,
|
|
1153
|
-
baselineMs: baseMs,
|
|
1154
|
-
toolsMs,
|
|
1155
|
-
toolCalls: tools.toolCalls,
|
|
1156
|
-
judgeProvider: toolsJudge.judgeProvider,
|
|
1157
|
-
judgeInvoked: toolsJudge.judgeInvoked,
|
|
1158
|
-
};
|
|
1159
|
-
}
|
|
1160
|
-
catch (err) {
|
|
1161
|
-
console.error(`[gaia-capability] ERROR task=${task.id}: ${err?.message ?? String(err)}`);
|
|
1162
|
-
if (err?.stack)
|
|
1163
|
-
console.error(err.stack);
|
|
1164
|
-
results[idx] = {
|
|
1165
|
-
taskId: task.id,
|
|
1166
|
-
baselineCorrect: false,
|
|
1167
|
-
toolsCorrect: false,
|
|
1168
|
-
baselineMs: 0,
|
|
1169
|
-
toolsMs: 0,
|
|
1170
|
-
toolCalls: 0,
|
|
1171
|
-
error: err?.message ?? String(err),
|
|
1172
|
-
};
|
|
1173
|
-
}
|
|
1174
|
-
}
|
|
1175
|
-
})());
|
|
1176
|
-
await Promise.all(workers);
|
|
1177
|
-
const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
|
|
1178
|
-
const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
|
|
1179
|
-
const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
|
|
1180
|
-
const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
|
|
1181
|
-
const avg = (values) => values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length;
|
|
1182
|
-
const avgBaseMs = avg(results.map((r) => r.baselineMs).filter((n) => n > 0));
|
|
1183
|
-
const avgToolsMs = avg(results.map((r) => r.toolsMs).filter((n) => n > 0));
|
|
1184
|
-
const avgToolCalls = avg(results.map((r) => r.toolCalls));
|
|
1185
|
-
console.log(`[gaia-capability] config=${fixture.config} split=${fixture.split} tasks=${tasks.length} concurrency=${concurrency} baseline=${baselineCorrect}/${tasks.length} tools=${toolsCorrect}/${tasks.length} improved=${improved} regressions=${regressions} avgBaselineMs=${avgBaseMs.toFixed(0)} avgToolsMs=${avgToolsMs.toFixed(0)} avgToolCalls=${avgToolCalls.toFixed(2)}`);
|
|
1186
|
-
console.log(`[gaia-capability] perTask: ${results
|
|
1187
|
-
.map((r) => `${r.taskId}:B${r.baselineCorrect ? "1" : "0"}T${r.toolsCorrect ? "1" : "0"}${r.error ? "E" : ""}`)
|
|
1188
|
-
.join(" ")}`);
|
|
1189
|
-
if (shouldWriteReport) {
|
|
1190
|
-
const repoRoot = resolveRepoRoot();
|
|
1191
|
-
const generatedAtIso = new Date().toISOString();
|
|
1192
|
-
const stamp = generatedAtIso.replace(/[:.]/g, "-");
|
|
1193
|
-
const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
|
|
1194
|
-
const publicSummary = {
|
|
1195
|
-
suiteId: "gaia_capability",
|
|
1196
|
-
lane: "web",
|
|
1197
|
-
generatedAtIso,
|
|
1198
|
-
config: fixture.config,
|
|
1199
|
-
split: fixture.split,
|
|
1200
|
-
taskCount: tasks.length,
|
|
1201
|
-
concurrency,
|
|
1202
|
-
baseline: {
|
|
1203
|
-
model: baselineModelLabel,
|
|
1204
|
-
correct: baselineCorrect,
|
|
1205
|
-
passRatePct: tasks.length === 0 ? 0 : (baselineCorrect / tasks.length) * 100,
|
|
1206
|
-
avgMs: avgBaseMs,
|
|
1207
|
-
},
|
|
1208
|
-
tools: {
|
|
1209
|
-
model: toolsModelLabel,
|
|
1210
|
-
mode: toolsMode,
|
|
1211
|
-
correct: toolsCorrect,
|
|
1212
|
-
passRatePct: tasks.length === 0 ? 0 : (toolsCorrect / tasks.length) * 100,
|
|
1213
|
-
avgMs: avgToolsMs,
|
|
1214
|
-
avgToolCalls: avgToolCalls,
|
|
1215
|
-
},
|
|
1216
|
-
improved,
|
|
1217
|
-
regressions,
|
|
1218
|
-
notes: "GAIA is gated. This file contains only aggregate metrics (no prompt/answer text). Detailed per-task report is written under .cache/gaia/reports (gitignored).",
|
|
1219
|
-
};
|
|
1220
|
-
await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_latest.json"), publicSummary);
|
|
1221
|
-
await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_${fixture.config}_${fixture.split}_${stamp}.json`), {
|
|
1222
|
-
...publicSummary,
|
|
1223
|
-
perTask: results.map((r) => ({
|
|
1224
|
-
taskId: r.taskId,
|
|
1225
|
-
baselineCorrect: r.baselineCorrect,
|
|
1226
|
-
toolsCorrect: r.toolsCorrect,
|
|
1227
|
-
baselineMs: r.baselineMs,
|
|
1228
|
-
toolsMs: r.toolsMs,
|
|
1229
|
-
toolCalls: r.toolCalls,
|
|
1230
|
-
error: r.error ?? null,
|
|
1231
|
-
})),
|
|
1232
|
-
});
|
|
1233
|
-
}
|
|
1234
|
-
// Save web cache if recording
|
|
1235
|
-
const cacheMode = (process.env.NODEBENCH_GAIA_WEB_CACHE ?? "").toLowerCase();
|
|
1236
|
-
if (cacheMode === "record" || cacheMode === "replay") {
|
|
1237
|
-
await saveWebCache();
|
|
1238
|
-
console.log(`[gaia-capability] web cache saved (mode=${cacheMode})`);
|
|
1239
|
-
}
|
|
1240
|
-
// By default this benchmark is informational and should not fail CI.
|
|
1241
|
-
// Set NODEBENCH_GAIA_CAPABILITY_ENFORCE=1 to turn the summary into a strict gate.
|
|
1242
|
-
const enforce = process.env.NODEBENCH_GAIA_CAPABILITY_ENFORCE === "1";
|
|
1243
|
-
if (enforce) {
|
|
1244
|
-
// Quality gate:
|
|
1245
|
-
// - Tools should not regress massively vs baseline (allow a small tolerance for web drift).
|
|
1246
|
-
// - Prefer at least one improvement so the run is measuring something tool-relevant.
|
|
1247
|
-
const allowedRegression = Math.max(1, Math.floor(tasks.length * 0.2));
|
|
1248
|
-
expect(improved).toBeGreaterThanOrEqual(1);
|
|
1249
|
-
expect(toolsCorrect).toBeGreaterThanOrEqual(baselineCorrect - allowedRegression);
|
|
1250
|
-
expect(toolsCorrect).toBeGreaterThanOrEqual(1);
|
|
1251
|
-
}
|
|
1252
|
-
else {
|
|
1253
|
-
// Informational mode: ensure we actually ran and produced results.
|
|
1254
|
-
expect(results.length).toBe(tasks.length);
|
|
1255
|
-
expect(results.some((r) => r.error)).toBe(false);
|
|
1256
|
-
}
|
|
1257
|
-
}, 15 * 60_000);
|
|
1258
|
-
});
|
|
1259
|
-
//# sourceMappingURL=gaiaCapabilityEval.test.js.map
|