nodebench-mcp 2.70.0 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -41
- package/dist/agents/alertRouter.d.ts +38 -0
- package/dist/agents/alertRouter.js +151 -0
- package/dist/agents/alertRouter.js.map +1 -0
- package/dist/agents/entityMemory.d.ts +40 -0
- package/dist/agents/entityMemory.js +64 -0
- package/dist/agents/entityMemory.js.map +1 -0
- package/dist/agents/subAgents.d.ts +35 -0
- package/dist/agents/subAgents.js +62 -0
- package/dist/agents/subAgents.js.map +1 -0
- package/dist/benchmarks/benchmarkRunner.js +14 -0
- package/dist/benchmarks/benchmarkRunner.js.map +1 -1
- package/dist/benchmarks/chainEval.js +107 -0
- package/dist/benchmarks/chainEval.js.map +1 -1
- package/dist/benchmarks/llmJudgeEval.js +85 -0
- package/dist/benchmarks/llmJudgeEval.js.map +1 -1
- package/dist/benchmarks/searchQualityEval.js +118 -5
- package/dist/benchmarks/searchQualityEval.js.map +1 -1
- package/dist/cli/search.d.ts +13 -0
- package/dist/cli/search.js +130 -0
- package/dist/cli/search.js.map +1 -0
- package/dist/dashboard/operatingDashboardHtml.js +2 -1
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +3 -2
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/db.d.ts +6 -2
- package/dist/db.js +521 -6
- package/dist/db.js.map +1 -1
- package/dist/index.js +349 -67
- package/dist/index.js.map +1 -1
- package/dist/packageInfo.d.ts +3 -0
- package/dist/packageInfo.js +32 -0
- package/dist/packageInfo.js.map +1 -0
- package/dist/profiler/behaviorStore.d.ts +97 -0
- package/dist/profiler/behaviorStore.js +276 -0
- package/dist/profiler/behaviorStore.js.map +1 -0
- package/dist/profiler/eventCollector.d.ts +119 -0
- package/dist/profiler/eventCollector.js +267 -0
- package/dist/profiler/eventCollector.js.map +1 -0
- package/dist/profiler/index.d.ts +15 -0
- package/dist/profiler/index.js +16 -0
- package/dist/profiler/index.js.map +1 -0
- package/dist/profiler/mcpProxy.d.ts +49 -0
- package/dist/profiler/mcpProxy.js +123 -0
- package/dist/profiler/mcpProxy.js.map +1 -0
- package/dist/profiler/modelRouter.d.ts +30 -0
- package/dist/profiler/modelRouter.js +99 -0
- package/dist/profiler/modelRouter.js.map +1 -0
- package/dist/profiler/otelReceiver.d.ts +17 -0
- package/dist/profiler/otelReceiver.js +62 -0
- package/dist/profiler/otelReceiver.js.map +1 -0
- package/dist/profiler/proofEngine.d.ts +41 -0
- package/dist/profiler/proofEngine.js +93 -0
- package/dist/profiler/proofEngine.js.map +1 -0
- package/dist/profiler/workflowTemplates.d.ts +41 -0
- package/dist/profiler/workflowTemplates.js +95 -0
- package/dist/profiler/workflowTemplates.js.map +1 -0
- package/dist/providers/localMemoryProvider.js +3 -2
- package/dist/providers/localMemoryProvider.js.map +1 -1
- package/dist/runtimeConfig.d.ts +11 -0
- package/dist/runtimeConfig.js +27 -0
- package/dist/runtimeConfig.js.map +1 -0
- package/dist/sandboxApi.js +2 -1
- package/dist/sandboxApi.js.map +1 -1
- package/dist/security/auditLog.js +8 -3
- package/dist/security/auditLog.js.map +1 -1
- package/dist/subconscious/blocks.d.ts +43 -0
- package/dist/subconscious/blocks.js +158 -0
- package/dist/subconscious/blocks.js.map +1 -0
- package/dist/subconscious/classifier.d.ts +22 -0
- package/dist/subconscious/classifier.js +118 -0
- package/dist/subconscious/classifier.js.map +1 -0
- package/dist/subconscious/graphEngine.d.ts +65 -0
- package/dist/subconscious/graphEngine.js +234 -0
- package/dist/subconscious/graphEngine.js.map +1 -0
- package/dist/subconscious/index.d.ts +19 -0
- package/dist/subconscious/index.js +20 -0
- package/dist/subconscious/index.js.map +1 -0
- package/dist/subconscious/tools.d.ts +5 -0
- package/dist/subconscious/tools.js +255 -0
- package/dist/subconscious/tools.js.map +1 -0
- package/dist/subconscious/whisperPolicy.d.ts +20 -0
- package/dist/subconscious/whisperPolicy.js +171 -0
- package/dist/subconscious/whisperPolicy.js.map +1 -0
- package/dist/sweep/engine.d.ts +27 -0
- package/dist/sweep/engine.js +244 -0
- package/dist/sweep/engine.js.map +1 -0
- package/dist/sweep/index.d.ts +9 -0
- package/dist/sweep/index.js +8 -0
- package/dist/sweep/index.js.map +1 -0
- package/dist/sweep/sources/github_trending.d.ts +6 -0
- package/dist/sweep/sources/github_trending.js +37 -0
- package/dist/sweep/sources/github_trending.js.map +1 -0
- package/dist/sweep/sources/hackernews.d.ts +7 -0
- package/dist/sweep/sources/hackernews.js +57 -0
- package/dist/sweep/sources/hackernews.js.map +1 -0
- package/dist/sweep/sources/openbb_finance.d.ts +9 -0
- package/dist/sweep/sources/openbb_finance.js +46 -0
- package/dist/sweep/sources/openbb_finance.js.map +1 -0
- package/dist/sweep/sources/producthunt.d.ts +6 -0
- package/dist/sweep/sources/producthunt.js +41 -0
- package/dist/sweep/sources/producthunt.js.map +1 -0
- package/dist/sweep/sources/web_signals.d.ts +7 -0
- package/dist/sweep/sources/web_signals.js +63 -0
- package/dist/sweep/sources/web_signals.js.map +1 -0
- package/dist/sweep/sources/yahoo_finance.d.ts +6 -0
- package/dist/sweep/sources/yahoo_finance.js +47 -0
- package/dist/sweep/sources/yahoo_finance.js.map +1 -0
- package/dist/sweep/types.d.ts +50 -0
- package/dist/sweep/types.js +9 -0
- package/dist/sweep/types.js.map +1 -0
- package/dist/sync/founderEpisodeStore.d.ts +98 -0
- package/dist/sync/founderEpisodeStore.js +230 -0
- package/dist/sync/founderEpisodeStore.js.map +1 -0
- package/dist/sync/hyperloopArchive.d.ts +51 -0
- package/dist/sync/hyperloopArchive.js +153 -0
- package/dist/sync/hyperloopArchive.js.map +1 -0
- package/dist/sync/hyperloopEval.d.ts +123 -0
- package/dist/sync/hyperloopEval.js +389 -0
- package/dist/sync/hyperloopEval.js.map +1 -0
- package/dist/sync/protocol.d.ts +172 -0
- package/dist/sync/protocol.js +9 -0
- package/dist/sync/protocol.js.map +1 -0
- package/dist/sync/sessionMemory.d.ts +47 -0
- package/dist/sync/sessionMemory.js +138 -0
- package/dist/sync/sessionMemory.js.map +1 -0
- package/dist/sync/store.d.ts +384 -0
- package/dist/sync/store.js +1435 -0
- package/dist/sync/store.js.map +1 -0
- package/dist/sync/syncBridgeClient.d.ts +30 -0
- package/dist/sync/syncBridgeClient.js +172 -0
- package/dist/sync/syncBridgeClient.js.map +1 -0
- package/dist/tools/autonomousDeliveryTools.d.ts +2 -0
- package/dist/tools/autonomousDeliveryTools.js +1104 -0
- package/dist/tools/autonomousDeliveryTools.js.map +1 -0
- package/dist/tools/boilerplateTools.js +10 -9
- package/dist/tools/boilerplateTools.js.map +1 -1
- package/dist/tools/claudeCodeIngestTools.d.ts +10 -0
- package/dist/tools/claudeCodeIngestTools.js +347 -0
- package/dist/tools/claudeCodeIngestTools.js.map +1 -0
- package/dist/tools/coreWorkflowTools.d.ts +2 -0
- package/dist/tools/coreWorkflowTools.js +488 -0
- package/dist/tools/coreWorkflowTools.js.map +1 -0
- package/dist/tools/deltaTools.d.ts +15 -0
- package/dist/tools/deltaTools.js +1522 -0
- package/dist/tools/deltaTools.js.map +1 -0
- package/dist/tools/documentationTools.js +2 -1
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/entityLookupTools.d.ts +14 -0
- package/dist/tools/entityLookupTools.js +159 -0
- package/dist/tools/entityLookupTools.js.map +1 -0
- package/dist/tools/entityTemporalTools.d.ts +12 -0
- package/dist/tools/entityTemporalTools.js +330 -0
- package/dist/tools/entityTemporalTools.js.map +1 -0
- package/dist/tools/founderLocalPipeline.d.ts +215 -0
- package/dist/tools/founderLocalPipeline.js +1516 -2
- package/dist/tools/founderLocalPipeline.js.map +1 -1
- package/dist/tools/founderOperatingModel.d.ts +120 -0
- package/dist/tools/founderOperatingModel.js +469 -0
- package/dist/tools/founderOperatingModel.js.map +1 -0
- package/dist/tools/founderOperatingModelTools.d.ts +2 -0
- package/dist/tools/founderOperatingModelTools.js +169 -0
- package/dist/tools/founderOperatingModelTools.js.map +1 -0
- package/dist/tools/founderStrategicOpsTools.d.ts +2 -0
- package/dist/tools/founderStrategicOpsTools.js +1310 -0
- package/dist/tools/founderStrategicOpsTools.js.map +1 -0
- package/dist/tools/graphifyTools.d.ts +19 -0
- package/dist/tools/graphifyTools.js +375 -0
- package/dist/tools/graphifyTools.js.map +1 -0
- package/dist/tools/index.d.ts +3 -0
- package/dist/tools/index.js +4 -0
- package/dist/tools/index.js.map +1 -1
- package/dist/tools/monteCarloTools.d.ts +16 -0
- package/dist/tools/monteCarloTools.js +225 -0
- package/dist/tools/monteCarloTools.js.map +1 -0
- package/dist/tools/packetCompilerTools.d.ts +12 -0
- package/dist/tools/packetCompilerTools.js +322 -0
- package/dist/tools/packetCompilerTools.js.map +1 -0
- package/dist/tools/planSynthesisTools.d.ts +15 -0
- package/dist/tools/planSynthesisTools.js +455 -0
- package/dist/tools/planSynthesisTools.js.map +1 -0
- package/dist/tools/profilerTools.d.ts +20 -0
- package/dist/tools/profilerTools.js +364 -0
- package/dist/tools/profilerTools.js.map +1 -0
- package/dist/tools/progressiveDiscoveryTools.js +2 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/savingsTools.d.ts +11 -0
- package/dist/tools/savingsTools.js +155 -0
- package/dist/tools/savingsTools.js.map +1 -0
- package/dist/tools/scenarioCompilerTools.d.ts +14 -0
- package/dist/tools/scenarioCompilerTools.js +290 -0
- package/dist/tools/scenarioCompilerTools.js.map +1 -0
- package/dist/tools/sharedContextTools.d.ts +2 -0
- package/dist/tools/sharedContextTools.js +423 -0
- package/dist/tools/sharedContextTools.js.map +1 -0
- package/dist/tools/sitemapTools.d.ts +15 -0
- package/dist/tools/sitemapTools.js +560 -0
- package/dist/tools/sitemapTools.js.map +1 -0
- package/dist/tools/sweepTools.d.ts +9 -0
- package/dist/tools/sweepTools.js +112 -0
- package/dist/tools/sweepTools.js.map +1 -0
- package/dist/tools/syncBridgeTools.d.ts +2 -0
- package/dist/tools/syncBridgeTools.js +258 -0
- package/dist/tools/syncBridgeTools.js.map +1 -0
- package/dist/tools/toolRegistry.js +1223 -45
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/workspaceTools.d.ts +19 -0
- package/dist/tools/workspaceTools.js +762 -0
- package/dist/tools/workspaceTools.js.map +1 -0
- package/dist/toolsetRegistry.js +162 -3
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +39 -38
- package/rules/nodebench-agentic-reliability.md +32 -0
- package/rules/nodebench-analyst-diagnostic.md +25 -0
- package/rules/nodebench-auto-qa.md +31 -0
- package/rules/nodebench-completion-traceability.md +22 -0
- package/rules/nodebench-flywheel-continuous.md +25 -0
- package/rules/nodebench-pre-release-review.md +24 -0
- package/rules/nodebench-qa-dogfood.md +26 -0
- package/rules/nodebench-scenario-testing.md +30 -0
- package/rules/nodebench-self-direction.md +23 -0
- package/rules/nodebench-self-judge-loop.md +24 -0
- package/scripts/install.sh +215 -0
- package/dist/__tests__/analytics.test.d.ts +0 -11
- package/dist/__tests__/analytics.test.js +0 -546
- package/dist/__tests__/analytics.test.js.map +0 -1
- package/dist/__tests__/architectComplex.test.d.ts +0 -1
- package/dist/__tests__/architectComplex.test.js +0 -373
- package/dist/__tests__/architectComplex.test.js.map +0 -1
- package/dist/__tests__/architectSmoke.test.d.ts +0 -1
- package/dist/__tests__/architectSmoke.test.js +0 -92
- package/dist/__tests__/architectSmoke.test.js.map +0 -1
- package/dist/__tests__/audit-registry.d.ts +0 -1
- package/dist/__tests__/audit-registry.js +0 -60
- package/dist/__tests__/audit-registry.js.map +0 -1
- package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
- package/dist/__tests__/batchAutopilot.test.js +0 -218
- package/dist/__tests__/batchAutopilot.test.js.map +0 -1
- package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
- package/dist/__tests__/cliSubcommands.test.js +0 -138
- package/dist/__tests__/cliSubcommands.test.js.map +0 -1
- package/dist/__tests__/comparativeBench.test.d.ts +0 -1
- package/dist/__tests__/comparativeBench.test.js +0 -722
- package/dist/__tests__/comparativeBench.test.js.map +0 -1
- package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
- package/dist/__tests__/critterCalibrationEval.js +0 -370
- package/dist/__tests__/critterCalibrationEval.js.map +0 -1
- package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
- package/dist/__tests__/dynamicLoading.test.js +0 -280
- package/dist/__tests__/dynamicLoading.test.js.map +0 -1
- package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
- package/dist/__tests__/embeddingProvider.test.js +0 -86
- package/dist/__tests__/embeddingProvider.test.js.map +0 -1
- package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
- package/dist/__tests__/evalDatasetBench.test.js +0 -738
- package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
- package/dist/__tests__/evalHarness.test.d.ts +0 -1
- package/dist/__tests__/evalHarness.test.js +0 -1107
- package/dist/__tests__/evalHarness.test.js.map +0 -1
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
- package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
- package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
- package/dist/__tests__/forecastingDogfood.test.js +0 -284
- package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
- package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
- package/dist/__tests__/forecastingScoring.test.js +0 -202
- package/dist/__tests__/forecastingScoring.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
- package/dist/__tests__/helpers/answerMatch.js +0 -267
- package/dist/__tests__/helpers/answerMatch.js.map +0 -1
- package/dist/__tests__/helpers/textLlm.d.ts +0 -25
- package/dist/__tests__/helpers/textLlm.js +0 -214
- package/dist/__tests__/helpers/textLlm.js.map +0 -1
- package/dist/__tests__/localDashboard.test.d.ts +0 -1
- package/dist/__tests__/localDashboard.test.js +0 -226
- package/dist/__tests__/localDashboard.test.js.map +0 -1
- package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
- package/dist/__tests__/multiHopDogfood.test.js +0 -303
- package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
- package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
- package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
- package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
- package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
- package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
- package/dist/__tests__/openclawDogfood.test.js +0 -535
- package/dist/__tests__/openclawDogfood.test.js.map +0 -1
- package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
- package/dist/__tests__/openclawMessaging.test.js +0 -232
- package/dist/__tests__/openclawMessaging.test.js.map +0 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
- package/dist/__tests__/presetRealWorldBench.test.js +0 -859
- package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
- package/dist/__tests__/tools.test.d.ts +0 -1
- package/dist/__tests__/tools.test.js +0 -3201
- package/dist/__tests__/tools.test.js.map +0 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
- package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
- package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
- package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
- package/dist/__tests__/traceabilityDogfood.test.js +0 -241
- package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
- package/dist/__tests__/webmcpTools.test.d.ts +0 -7
- package/dist/__tests__/webmcpTools.test.js +0 -195
- package/dist/__tests__/webmcpTools.test.js.map +0 -1
- package/dist/benchmarks/testProviderBus.d.ts +0 -7
- package/dist/benchmarks/testProviderBus.js +0 -272
- package/dist/benchmarks/testProviderBus.js.map +0 -1
- package/dist/hooks/postCompaction.d.ts +0 -14
- package/dist/hooks/postCompaction.js +0 -51
- package/dist/hooks/postCompaction.js.map +0 -1
- package/dist/security/__tests__/security.test.d.ts +0 -8
- package/dist/security/__tests__/security.test.js +0 -295
- package/dist/security/__tests__/security.test.js.map +0 -1
- package/dist/tools/documentTools.d.ts +0 -5
- package/dist/tools/documentTools.js +0 -524
- package/dist/tools/documentTools.js.map +0 -1
- package/dist/tools/financialTools.d.ts +0 -10
- package/dist/tools/financialTools.js +0 -403
- package/dist/tools/financialTools.js.map +0 -1
- package/dist/tools/memoryTools.d.ts +0 -5
- package/dist/tools/memoryTools.js +0 -137
- package/dist/tools/memoryTools.js.map +0 -1
- package/dist/tools/planningTools.d.ts +0 -5
- package/dist/tools/planningTools.js +0 -147
- package/dist/tools/planningTools.js.map +0 -1
- package/dist/tools/searchTools.d.ts +0 -5
- package/dist/tools/searchTools.js +0 -145
- package/dist/tools/searchTools.js.map +0 -1
|
@@ -1,265 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GAIA audio-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local audio tools.
|
|
3
|
-
*
|
|
4
|
-
* This lane targets GAIA tasks that include audio attachments (MP3/WAV/etc).
|
|
5
|
-
* We provide deterministic local transcription via NodeBench MCP tools and score answers against
|
|
6
|
-
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
7
|
-
*
|
|
8
|
-
* Safety:
|
|
9
|
-
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
10
|
-
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
11
|
-
*
|
|
12
|
-
* Disabled by default (cost + rate limits). Run with:
|
|
13
|
-
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
14
|
-
*/
|
|
15
|
-
import { describe, expect, it } from "vitest";
|
|
16
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
17
|
-
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
18
|
-
import path from "node:path";
|
|
19
|
-
import { fileURLToPath } from "node:url";
|
|
20
|
-
import { performance } from "node:perf_hooks";
|
|
21
|
-
import { localFileTools } from "../tools/localFileTools.js";
|
|
22
|
-
import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
|
|
23
|
-
import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
|
|
24
|
-
const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
|
|
25
|
-
const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
|
|
26
|
-
async function safeWriteJson(filePath, payload) {
|
|
27
|
-
try {
|
|
28
|
-
await mkdir(path.dirname(filePath), { recursive: true });
|
|
29
|
-
await writeFile(filePath, JSON.stringify(payload, null, 2) + "\n", "utf8");
|
|
30
|
-
}
|
|
31
|
-
catch (err) {
|
|
32
|
-
console.warn(`[gaia-capability-audio] report write failed: ${err?.message ?? String(err)}`);
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
function resolveRepoRoot() {
|
|
36
|
-
const testDir = path.dirname(fileURLToPath(import.meta.url));
|
|
37
|
-
return path.resolve(testDir, "../../../..");
|
|
38
|
-
}
|
|
39
|
-
function resolveCapabilityAudioFixturePath() {
|
|
40
|
-
const override = process.env.NODEBENCH_GAIA_CAPABILITY_AUDIO_FIXTURE_PATH;
|
|
41
|
-
if (override) {
|
|
42
|
-
if (path.isAbsolute(override))
|
|
43
|
-
return override;
|
|
44
|
-
const repoRoot = resolveRepoRoot();
|
|
45
|
-
return path.resolve(repoRoot, override);
|
|
46
|
-
}
|
|
47
|
-
const config = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
|
|
48
|
-
const split = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
|
|
49
|
-
const repoRoot = resolveRepoRoot();
|
|
50
|
-
return path.join(repoRoot, ".cache", "gaia", `gaia_capability_audio_${config}_${split}.sample.json`);
|
|
51
|
-
}
|
|
52
|
-
function loadDotEnvLocalIfPresent() {
|
|
53
|
-
const repoRoot = resolveRepoRoot();
|
|
54
|
-
const envPath = path.join(repoRoot, ".env.local");
|
|
55
|
-
if (!existsSync(envPath))
|
|
56
|
-
return;
|
|
57
|
-
const text = readFileSync(envPath, "utf8");
|
|
58
|
-
for (const rawLine of text.split(/\r?\n/)) {
|
|
59
|
-
const line = rawLine.trim();
|
|
60
|
-
if (!line || line.startsWith("#"))
|
|
61
|
-
continue;
|
|
62
|
-
const idx = line.indexOf("=");
|
|
63
|
-
if (idx <= 0)
|
|
64
|
-
continue;
|
|
65
|
-
const key = line.slice(0, idx).trim();
|
|
66
|
-
let value = line.slice(idx + 1).trim();
|
|
67
|
-
if ((value.startsWith("\"") && value.endsWith("\"")) ||
|
|
68
|
-
(value.startsWith("'") && value.endsWith("'"))) {
|
|
69
|
-
value = value.slice(1, -1);
|
|
70
|
-
}
|
|
71
|
-
if (!process.env[key])
|
|
72
|
-
process.env[key] = value;
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
async function llmGenerateText(llm, history) {
|
|
76
|
-
const temperature = Number.parseFloat(process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0");
|
|
77
|
-
return generateTextFromHistory(llm, history, {
|
|
78
|
-
temperature: Number.isFinite(temperature) ? temperature : 0,
|
|
79
|
-
maxOutputTokens: 1024,
|
|
80
|
-
});
|
|
81
|
-
}
|
|
82
|
-
async function baselineAnswer(llm, task) {
|
|
83
|
-
const contents = [
|
|
84
|
-
{
|
|
85
|
-
role: "user",
|
|
86
|
-
parts: [
|
|
87
|
-
{
|
|
88
|
-
text: `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`,
|
|
89
|
-
},
|
|
90
|
-
],
|
|
91
|
-
},
|
|
92
|
-
];
|
|
93
|
-
return llmGenerateText(llm, contents);
|
|
94
|
-
}
|
|
95
|
-
async function loadFixture(filePath) {
|
|
96
|
-
const raw = await readFile(filePath, "utf8");
|
|
97
|
-
const json = JSON.parse(raw);
|
|
98
|
-
return json;
|
|
99
|
-
}
|
|
100
|
-
function createToolIndex(tools) {
|
|
101
|
-
const m = new Map();
|
|
102
|
-
for (const t of tools)
|
|
103
|
-
m.set(t.name, t);
|
|
104
|
-
return m;
|
|
105
|
-
}
|
|
106
|
-
async function toolAugmentedAnswerFromAudio(llm, task, opts) {
|
|
107
|
-
const localPath = String(task.localFilePath ?? "").trim();
|
|
108
|
-
if (!localPath)
|
|
109
|
-
throw new Error("Task missing localFilePath");
|
|
110
|
-
const toolIndex = createToolIndex(localFileTools);
|
|
111
|
-
const tool = toolIndex.get("transcribe_audio_file");
|
|
112
|
-
if (!tool)
|
|
113
|
-
throw new Error("Missing tool: transcribe_audio_file");
|
|
114
|
-
if (opts.maxToolCalls < 1) {
|
|
115
|
-
throw new Error("maxToolCalls must be >= 1 to run audio lane");
|
|
116
|
-
}
|
|
117
|
-
const transcript = (await tool.handler({
|
|
118
|
-
path: localPath,
|
|
119
|
-
model: process.env.NODEBENCH_AUDIO_MODEL ?? "tiny.en",
|
|
120
|
-
maxChars: 20000,
|
|
121
|
-
timeoutMs: 300000,
|
|
122
|
-
}));
|
|
123
|
-
const transcriptText = String(transcript?.text ?? "").trim();
|
|
124
|
-
if (!transcriptText) {
|
|
125
|
-
throw new Error("Empty transcript from transcribe_audio_file");
|
|
126
|
-
}
|
|
127
|
-
const contents = [
|
|
128
|
-
{
|
|
129
|
-
role: "user",
|
|
130
|
-
parts: [
|
|
131
|
-
{
|
|
132
|
-
text: `You are given a transcript of an attached audio file. Use it to answer the question.\n\nRules:\n- Do not browse the web.\n- Return ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}\n\nAudio transcript:\n${transcriptText}`,
|
|
133
|
-
},
|
|
134
|
-
],
|
|
135
|
-
},
|
|
136
|
-
];
|
|
137
|
-
const answer = await llmGenerateText(llm, contents);
|
|
138
|
-
return { answer, toolCalls: 1 };
|
|
139
|
-
}
|
|
140
|
-
describe("GAIA capability: audio lane", () => {
|
|
141
|
-
const testFn = shouldRun ? it : it.skip;
|
|
142
|
-
testFn("should measure accuracy delta on a small GAIA audio subset", async () => {
|
|
143
|
-
loadDotEnvLocalIfPresent();
|
|
144
|
-
const fixturePath = resolveCapabilityAudioFixturePath();
|
|
145
|
-
if (!existsSync(fixturePath)) {
|
|
146
|
-
throw new Error(`Missing GAIA audio fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityAudioFixture.py`);
|
|
147
|
-
}
|
|
148
|
-
const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
|
|
149
|
-
const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
|
|
150
|
-
const baselineLlm = await createTextLlmClient({ model: baselineModel });
|
|
151
|
-
const toolsLlm = await createTextLlmClient({ model: toolsModel });
|
|
152
|
-
const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
|
|
153
|
-
const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
|
|
154
|
-
const fixture = await loadFixture(fixturePath);
|
|
155
|
-
expect(Array.isArray(fixture.tasks)).toBe(true);
|
|
156
|
-
expect(fixture.tasks.length).toBeGreaterThan(0);
|
|
157
|
-
const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "4", 10);
|
|
158
|
-
const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 4));
|
|
159
|
-
const tasks = fixture.tasks.slice(0, taskLimit);
|
|
160
|
-
const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
|
|
161
|
-
const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
|
|
162
|
-
const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "1", 10);
|
|
163
|
-
// Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
|
|
164
|
-
const judge = await autoDiscoverJudge(toolsLlm);
|
|
165
|
-
const results = new Array(tasks.length);
|
|
166
|
-
let nextIndex = 0;
|
|
167
|
-
const workers = Array.from({ length: concurrency }, () => (async () => {
|
|
168
|
-
while (true) {
|
|
169
|
-
const idx = nextIndex++;
|
|
170
|
-
if (idx >= tasks.length)
|
|
171
|
-
return;
|
|
172
|
-
const task = tasks[idx];
|
|
173
|
-
try {
|
|
174
|
-
const baseStart = performance.now();
|
|
175
|
-
const base = await baselineAnswer(baselineLlm, task);
|
|
176
|
-
const baseMs = performance.now() - baseStart;
|
|
177
|
-
const toolsStart = performance.now();
|
|
178
|
-
const tools = await toolAugmentedAnswerFromAudio(toolsLlm, task, { maxToolCalls });
|
|
179
|
-
const toolsMs = performance.now() - toolsStart;
|
|
180
|
-
const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
|
|
181
|
-
const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
|
|
182
|
-
results[idx] = {
|
|
183
|
-
taskId: task.id,
|
|
184
|
-
baselineCorrect: baseJudge.match,
|
|
185
|
-
toolsCorrect: toolsJudge.match,
|
|
186
|
-
baselineMs: baseMs,
|
|
187
|
-
toolsMs,
|
|
188
|
-
toolCalls: tools.toolCalls,
|
|
189
|
-
judgeProvider: toolsJudge.judgeProvider,
|
|
190
|
-
judgeInvoked: toolsJudge.judgeInvoked,
|
|
191
|
-
};
|
|
192
|
-
}
|
|
193
|
-
catch (err) {
|
|
194
|
-
results[idx] = {
|
|
195
|
-
taskId: task.id,
|
|
196
|
-
baselineCorrect: false,
|
|
197
|
-
toolsCorrect: false,
|
|
198
|
-
baselineMs: 0,
|
|
199
|
-
toolsMs: 0,
|
|
200
|
-
toolCalls: 0,
|
|
201
|
-
error: err?.message ?? String(err),
|
|
202
|
-
};
|
|
203
|
-
}
|
|
204
|
-
}
|
|
205
|
-
})());
|
|
206
|
-
await Promise.all(workers);
|
|
207
|
-
const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
|
|
208
|
-
const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
|
|
209
|
-
const baselinePassRate = (baselineCorrect / results.length) * 100;
|
|
210
|
-
const toolsPassRate = (toolsCorrect / results.length) * 100;
|
|
211
|
-
const avgBaseMs = results.reduce((sum, r) => sum + r.baselineMs, 0) / results.length;
|
|
212
|
-
const avgToolsMs = results.reduce((sum, r) => sum + r.toolsMs, 0) / results.length;
|
|
213
|
-
const avgToolCalls = results.reduce((sum, r) => sum + r.toolCalls, 0) / results.length;
|
|
214
|
-
const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
|
|
215
|
-
const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
|
|
216
|
-
console.log(`[gaia-capability-audio] tasks=${results.length} baseline=${baselineCorrect}/${results.length} (${baselinePassRate.toFixed(1)}%) tools=${toolsCorrect}/${results.length} (${toolsPassRate.toFixed(1)}%) delta=${(toolsPassRate - baselinePassRate).toFixed(1)}% improved=${improved} regressions=${regressions} avgToolCalls=${avgToolCalls.toFixed(2)}`);
|
|
217
|
-
const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "audio").toLowerCase();
|
|
218
|
-
const publicSummary = {
|
|
219
|
-
suiteId: "gaia_capability_audio",
|
|
220
|
-
lane: "audio",
|
|
221
|
-
generatedAtIso: new Date().toISOString(),
|
|
222
|
-
config: fixture.config,
|
|
223
|
-
split: fixture.split,
|
|
224
|
-
taskCount: results.length,
|
|
225
|
-
concurrency,
|
|
226
|
-
baseline: {
|
|
227
|
-
model: baselineModelLabel,
|
|
228
|
-
correct: baselineCorrect,
|
|
229
|
-
passRatePct: baselinePassRate,
|
|
230
|
-
avgMs: avgBaseMs,
|
|
231
|
-
},
|
|
232
|
-
tools: {
|
|
233
|
-
model: toolsModelLabel,
|
|
234
|
-
mode: toolsMode,
|
|
235
|
-
correct: toolsCorrect,
|
|
236
|
-
passRatePct: toolsPassRate,
|
|
237
|
-
avgMs: avgToolsMs,
|
|
238
|
-
avgToolCalls,
|
|
239
|
-
},
|
|
240
|
-
improved,
|
|
241
|
-
regressions,
|
|
242
|
-
notes: "GAIA audio lane (audio attachments). No prompts/answers persisted; only aggregate metrics are written to public/evals.",
|
|
243
|
-
};
|
|
244
|
-
if (shouldWriteReport) {
|
|
245
|
-
const repoRoot = resolveRepoRoot();
|
|
246
|
-
await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_audio_latest.json"), publicSummary);
|
|
247
|
-
const detailed = {
|
|
248
|
-
...publicSummary,
|
|
249
|
-
results: results.map((r) => ({
|
|
250
|
-
taskId: r.taskId,
|
|
251
|
-
baselineCorrect: r.baselineCorrect,
|
|
252
|
-
toolsCorrect: r.toolsCorrect,
|
|
253
|
-
baselineMs: Math.round(r.baselineMs),
|
|
254
|
-
toolsMs: Math.round(r.toolsMs),
|
|
255
|
-
toolCalls: r.toolCalls,
|
|
256
|
-
...(r.error ? { error: r.error } : {}),
|
|
257
|
-
})),
|
|
258
|
-
};
|
|
259
|
-
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
260
|
-
await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_audio_${fixture.config}_${fixture.split}_${stamp}.json`), detailed);
|
|
261
|
-
}
|
|
262
|
-
expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
|
|
263
|
-
}, 600000);
|
|
264
|
-
});
|
|
265
|
-
//# sourceMappingURL=gaiaCapabilityAudioEval.test.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"gaiaCapabilityAudioEval.test.js","sourceRoot":"","sources":["../../src/__tests__/gaiaCapabilityAudioEval.test.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EACL,mBAAmB,EACnB,uBAAuB,GAGxB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AA4CpF,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,KAAK,GAAG,CAAC;AACpE,MAAM,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,2BAA2B,KAAK,GAAG,CAAC;AAwB1E,KAAK,UAAU,aAAa,CAAC,QAAgB,EAAE,OAAgB;IAC7D,IAAI,CAAC;QACH,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,MAAM,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,IAAI,CAAC,gDAAgD,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC9F,CAAC;AACH,CAAC;AAED,SAAS,eAAe;IACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,iCAAiC;IACxC,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC;IAC1E,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;YAAE,OAAO,QAAQ,CAAC;QAC/C,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gCAAgC,IAAI,UAAU,CAAC;IAC1E,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,+BAA+B,IAAI,YAAY,CAAC;IAC1E,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,yBAAyB,MAAM,IAAI,KAAK,cAAc,CAAC,CAAC;AACvG,CAAC;AAED,SAAS,wBAAwB;IAC/B,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAClD,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO;IAEjC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,EAAE,MAAM,CAAW,CAAC;IACrD,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,IAAI,CAAC;YAAE,SAAS;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACvC,IACE,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YAChD,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC9C,CAAC;YACD,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;IAClD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,GAAkB,EAAE,OAAgC;IACjF,MAAM,WAAW,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,CAAC,CAAC;IAChG,OAAO,uBAAuB,CAAC,GAAG,EAAE,OAAO,EAAE;QAC3C,WAAW,EAAE,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC3D,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,GAAkB,EAAE,IAAoB;IACpE,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,iJAAiJ,IAAI,CAAC,MAAM,EAAE;iBACrK;aACF;SACF;KACF,CAAC;IACF,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAsB,CAAC;IAClD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,KAAgB;IACvC,MAAM,CAAC,GAAG,IAAI,GAAG,EAAmB,CAAC;IACrC,KAAK,MAAM,CAAC,IAAI,KAAK;QAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACxC,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,4BAA4B,CACzC,GAAkB,EAClB,IAAoB,EACpB,IAA8B;IAE9B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1D,IAAI,CAAC,SAAS;QAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAE9D,MAAM,SAAS,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;IAClD,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IACpD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAElE,IAAI,IAAI,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,MAAM,IAAI,CAAC,OAAO,CAAC;QACrC,IAAI,EAAE,SAAS;QACf,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,qBAAqB,IAAI,SAAS;QACrD,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,MAAM;KAClB,CAAC,CAAQ,CAAC;IAEX,MAAM,cAAc,GAAG,MAAM,CAAC,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7D,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,2LAA2L,IAAI,CAAC,MAAM,0BAA0B,cAAc,EAAE;iBACvP;aACF;SACF;KACF,CAAC;IAEF,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACpD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;AAClC,CAAC;AAED,QAAQ,CAAC,6BAA6B,EAAE,GAAG,EAAE;IAC3C,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC;IAExC,MAAM,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;QAC9E,wBAAwB,EAAE,CAAC;QAE3B,MAAM,WAAW,GAAG,iCAAiC,EAAE,CAAC;QACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,4GAA4G,CACzJ,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,IAAI,wBAAwB,CAAC;QAC5F,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,aAAa,CAAC;QAC3E,MAAM,WAAW,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,aAAa,EAAE,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAC;QAClE,MAAM,kBAAkB,GAAG,GAAG,WAAW,CAAC,QAAQ,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;QAC1E,MAAM,eAAe,GAAG,GAAG,QAAQ,CAAC,QAAQ,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAEjE,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAAC,CAAC;QAC/C,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,cAAc,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QACpG,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CACrF,CAAC;QACF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAEhD,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC3G,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACzF,CAAC;QAEF,MAAM,YAAY,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,wCAAwC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAEtG,wEAAwE;QACxE,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAEhD,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxD,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,CACvD,CAAC,KAAK,IAAI,EAAE;YACV,OAAO,IAAI,EAAE,CAAC;gBACZ,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;gBACxB,IAAI,GAAG,IAAI,KAAK,CAAC,MAAM;oBAAE,OAAO;gBAEhC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;gBAExB,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACpC,MAAM,IAAI,GAAG,MAAM,cAAc,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;oBACrD,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;oBAE7C,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACrC,MAAM,KAAK,GAAG,MAAM,4BAA4B,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;oBACnF,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC;oBAE/C,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC;oBAChF,MAAM,UAAU,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;oBAEzF,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,SAAS,CAAC,KAAK;wBAChC,YAAY,EAAE,UAAU,CAAC,KAAK;wBAC9B,UAAU,EAAE,MAAM;wBAClB,OAAO;wBACP,SAAS,EAAE,KAAK,CAAC,SAAS;wBAC1B,aAAa,EAAE,UAAU,CAAC,aAAa;wBACvC,YAAY,EAAE,UAAU,CAAC,YAAY;qBACtC,CAAC;gBACJ,CAAC;gBAAC,OAAO,GAAQ,EAAE,CAAC;oBAClB,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,KAAK;wBACtB,YAAY,EAAE,KAAK;wBACnB,UAAU,EAAE,CAAC;wBACb,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,CAAC;wBACZ,KAAK,EAAE,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC;qBACnC,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC,CAAC,EAAE,CACL,CAAC;QAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAE3B,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAClE,MAAM,gBAAgB,GAAG,CAAC,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAClE,MAAM,aAAa,GAAG,CAAC,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAC5D,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACrF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACnF,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEvF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QACpF,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAEvF,OAAO,CAAC,GAAG,CACT,iCAAiC,OAAO,CAAC,MAAM,aAAa,eAAe,IAAI,OAAO,CAAC,MAAM,KAAK,gBAAgB,CAAC,OAAO,CACxH,CAAC,CACF,YAAY,YAAY,IAAI,OAAO,CAAC,MAAM,KAAK,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,CAClF,aAAa,GAAG,gBAAgB,CACjC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,QAAQ,gBAAgB,WAAW,iBAAiB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACxG,CAAC;QAEF,MAAM,SAAS,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAC9F,MAAM,aAAa,GAAqC;YACtD,OAAO,EAAE,uBAAuB;YAChC,IAAI,EAAE,OAAO;YACb,cAAc,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACxC,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE;gBACR,KAAK,EAAE,kBAAkB;gBACzB,OAAO,EAAE,eAAe;gBACxB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,SAAS;aACjB;YACD,KAAK,EAAE;gBACL,KAAK,EAAE,eAAe;gBACtB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,YAAY;gBACrB,WAAW,EAAE,aAAa;gBAC1B,KAAK,EAAE,UAAU;gBACjB,YAAY;aACb;YACD,QAAQ;YACR,WAAW;YACX,KAAK,EACH,wHAAwH;SAC3H,CAAC;QAEF,IAAI,iBAAiB,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;YACnC,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,mCAAmC,CAAC,EAC3E,aAAa,CACd,CAAC;YAEF,MAAM,QAAQ,GAAG;gBACf,GAAG,aAAa;gBAChB,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,MAAM,EAAE,CAAC,CAAC,MAAM;oBAChB,eAAe,EAAE,CAAC,CAAC,eAAe;oBAClC,YAAY,EAAE,CAAC,CAAC,YAAY;oBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;oBACpC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC;oBAC9B,SAAS,EAAE,CAAC,CAAC,SAAS;oBACtB,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBACvC,CAAC,CAAC;aACJ,CAAC;YACF,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC7D,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CACP,QAAQ,EACR,QAAQ,EACR,MAAM,EACN,SAAS,EACT,yBAAyB,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,KAAK,IAAI,KAAK,OAAO,CACzE,EACD,QAAQ,CACT,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,aAAa,CAAC,CAAC,sBAAsB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC,EAAE,MAAM,CAAC,CAAC;AACb,CAAC,CAAC,CAAC"}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GAIA capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP tools.
|
|
3
|
-
*
|
|
4
|
-
* This test attempts to solve a small GAIA subset and scores answers against
|
|
5
|
-
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
6
|
-
*
|
|
7
|
-
* Safety:
|
|
8
|
-
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
9
|
-
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
10
|
-
*
|
|
11
|
-
* Disabled by default (cost + rate limits + external network). Run with:
|
|
12
|
-
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
13
|
-
*/
|
|
14
|
-
export {};
|