nodebench-mcp 3.0.0 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +74 -67
- package/README.md +36 -34
- package/dist/dashboard/operatingDashboardHtml.js +2 -1
- package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
- package/dist/dashboard/operatingServer.js +3 -2
- package/dist/dashboard/operatingServer.js.map +1 -1
- package/dist/db.js +51 -3
- package/dist/db.js.map +1 -1
- package/dist/index.js +19 -18
- package/dist/index.js.map +1 -1
- package/dist/packageInfo.d.ts +3 -0
- package/dist/packageInfo.js +32 -0
- package/dist/packageInfo.js.map +1 -0
- package/dist/sandboxApi.js +2 -1
- package/dist/sandboxApi.js.map +1 -1
- package/dist/tools/boilerplateTools.js +10 -9
- package/dist/tools/boilerplateTools.js.map +1 -1
- package/dist/tools/documentationTools.js +2 -1
- package/dist/tools/documentationTools.js.map +1 -1
- package/dist/tools/progressiveDiscoveryTools.js +2 -1
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/toolRegistry.js +11 -0
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/toolsetRegistry.js +74 -1
- package/dist/toolsetRegistry.js.map +1 -1
- package/package.json +7 -6
- package/scripts/install.sh +14 -14
- package/dist/__tests__/analytics.test.d.ts +0 -11
- package/dist/__tests__/analytics.test.js +0 -546
- package/dist/__tests__/analytics.test.js.map +0 -1
- package/dist/__tests__/architectComplex.test.d.ts +0 -1
- package/dist/__tests__/architectComplex.test.js +0 -373
- package/dist/__tests__/architectComplex.test.js.map +0 -1
- package/dist/__tests__/architectSmoke.test.d.ts +0 -1
- package/dist/__tests__/architectSmoke.test.js +0 -92
- package/dist/__tests__/architectSmoke.test.js.map +0 -1
- package/dist/__tests__/audit-registry.d.ts +0 -1
- package/dist/__tests__/audit-registry.js +0 -60
- package/dist/__tests__/audit-registry.js.map +0 -1
- package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
- package/dist/__tests__/batchAutopilot.test.js +0 -218
- package/dist/__tests__/batchAutopilot.test.js.map +0 -1
- package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
- package/dist/__tests__/cliSubcommands.test.js +0 -138
- package/dist/__tests__/cliSubcommands.test.js.map +0 -1
- package/dist/__tests__/comparativeBench.test.d.ts +0 -1
- package/dist/__tests__/comparativeBench.test.js +0 -722
- package/dist/__tests__/comparativeBench.test.js.map +0 -1
- package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
- package/dist/__tests__/critterCalibrationEval.js +0 -370
- package/dist/__tests__/critterCalibrationEval.js.map +0 -1
- package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
- package/dist/__tests__/dynamicLoading.test.js +0 -280
- package/dist/__tests__/dynamicLoading.test.js.map +0 -1
- package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
- package/dist/__tests__/embeddingProvider.test.js +0 -86
- package/dist/__tests__/embeddingProvider.test.js.map +0 -1
- package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
- package/dist/__tests__/evalDatasetBench.test.js +0 -738
- package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
- package/dist/__tests__/evalHarness.test.d.ts +0 -1
- package/dist/__tests__/evalHarness.test.js +0 -1107
- package/dist/__tests__/evalHarness.test.js.map +0 -1
- package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
- package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
- package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
- package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
- package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
- package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
- package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
- package/dist/__tests__/forecastingDogfood.test.js +0 -284
- package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
- package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
- package/dist/__tests__/forecastingScoring.test.js +0 -202
- package/dist/__tests__/forecastingScoring.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
- package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
- package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
- package/dist/__tests__/helpers/answerMatch.js +0 -267
- package/dist/__tests__/helpers/answerMatch.js.map +0 -1
- package/dist/__tests__/helpers/textLlm.d.ts +0 -25
- package/dist/__tests__/helpers/textLlm.js +0 -214
- package/dist/__tests__/helpers/textLlm.js.map +0 -1
- package/dist/__tests__/localDashboard.test.d.ts +0 -1
- package/dist/__tests__/localDashboard.test.js +0 -226
- package/dist/__tests__/localDashboard.test.js.map +0 -1
- package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
- package/dist/__tests__/multiHopDogfood.test.js +0 -303
- package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
- package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
- package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
- package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
- package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
- package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
- package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
- package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
- package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
- package/dist/__tests__/openclawDogfood.test.js +0 -535
- package/dist/__tests__/openclawDogfood.test.js.map +0 -1
- package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
- package/dist/__tests__/openclawMessaging.test.js +0 -232
- package/dist/__tests__/openclawMessaging.test.js.map +0 -1
- package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
- package/dist/__tests__/presetRealWorldBench.test.js +0 -859
- package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
- package/dist/__tests__/tools.test.d.ts +0 -1
- package/dist/__tests__/tools.test.js +0 -3201
- package/dist/__tests__/tools.test.js.map +0 -1
- package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
- package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
- package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
- package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
- package/dist/__tests__/traceabilityDogfood.test.js +0 -241
- package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
- package/dist/__tests__/webmcpTools.test.d.ts +0 -7
- package/dist/__tests__/webmcpTools.test.js +0 -195
- package/dist/__tests__/webmcpTools.test.js.map +0 -1
- package/dist/benchmarks/testProviderBus.d.ts +0 -7
- package/dist/benchmarks/testProviderBus.js +0 -272
- package/dist/benchmarks/testProviderBus.js.map +0 -1
- package/dist/hooks/postCompaction.d.ts +0 -14
- package/dist/hooks/postCompaction.js +0 -51
- package/dist/hooks/postCompaction.js.map +0 -1
- package/dist/security/__tests__/security.test.d.ts +0 -8
- package/dist/security/__tests__/security.test.js +0 -295
- package/dist/security/__tests__/security.test.js.map +0 -1
- package/dist/sync/hyperloopEval.test.d.ts +0 -4
- package/dist/sync/hyperloopEval.test.js +0 -60
- package/dist/sync/hyperloopEval.test.js.map +0 -1
- package/dist/sync/store.test.d.ts +0 -4
- package/dist/sync/store.test.js +0 -43
- package/dist/sync/store.test.js.map +0 -1
- package/dist/tools/documentTools.d.ts +0 -5
- package/dist/tools/documentTools.js +0 -524
- package/dist/tools/documentTools.js.map +0 -1
- package/dist/tools/financialTools.d.ts +0 -10
- package/dist/tools/financialTools.js +0 -403
- package/dist/tools/financialTools.js.map +0 -1
- package/dist/tools/memoryTools.d.ts +0 -5
- package/dist/tools/memoryTools.js +0 -137
- package/dist/tools/memoryTools.js.map +0 -1
- package/dist/tools/planningTools.d.ts +0 -5
- package/dist/tools/planningTools.js +0 -147
- package/dist/tools/planningTools.js.map +0 -1
- package/dist/tools/searchTools.d.ts +0 -5
- package/dist/tools/searchTools.js +0 -145
- package/dist/tools/searchTools.js.map +0 -1
|
@@ -1,265 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GAIA audio-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local audio tools.
|
|
3
|
-
*
|
|
4
|
-
* This lane targets GAIA tasks that include audio attachments (MP3/WAV/etc).
|
|
5
|
-
* We provide deterministic local transcription via NodeBench MCP tools and score answers against
|
|
6
|
-
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
7
|
-
*
|
|
8
|
-
* Safety:
|
|
9
|
-
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
10
|
-
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
11
|
-
*
|
|
12
|
-
* Disabled by default (cost + rate limits). Run with:
|
|
13
|
-
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
14
|
-
*/
|
|
15
|
-
import { describe, expect, it } from "vitest";
|
|
16
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
17
|
-
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
18
|
-
import path from "node:path";
|
|
19
|
-
import { fileURLToPath } from "node:url";
|
|
20
|
-
import { performance } from "node:perf_hooks";
|
|
21
|
-
import { localFileTools } from "../tools/localFileTools.js";
|
|
22
|
-
import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
|
|
23
|
-
import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
|
|
24
|
-
// True only when NODEBENCH_RUN_GAIA_CAPABILITY is exactly "1"; selects `it` vs `it.skip` for the benchmark below.
const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
// True only when NODEBENCH_WRITE_GAIA_REPORT is exactly "1"; gates the JSON report writes at the end of the benchmark.
const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
|
|
26
|
-
/**
 * Best-effort JSON writer for benchmark reports.
 *
 * Creates the parent directory (recursively) and writes `payload` as
 * 2-space-indented JSON followed by a newline. Any failure is logged with
 * `console.warn` and swallowed, so a report problem never fails the caller.
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} payload - Value to serialize with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function safeWriteJson(filePath, payload) {
    try {
        const targetDir = path.dirname(filePath);
        await mkdir(targetDir, { recursive: true });
        const body = `${JSON.stringify(payload, null, 2)}\n`;
        await writeFile(filePath, body, "utf8");
    } catch (failure) {
        const reason = failure?.message ?? String(failure);
        console.warn(`[gaia-capability-audio] report write failed: ${reason}`);
    }
}
|
|
35
|
-
/**
 * Locates the repository root relative to this module.
 *
 * @returns {string} Absolute path four directory levels above the folder
 *   that contains this file.
 */
function resolveRepoRoot() {
    const thisFile = fileURLToPath(import.meta.url);
    const containingDir = path.dirname(thisFile);
    return path.resolve(containingDir, "../../../..");
}
|
|
39
|
-
/**
 * Determines where the GAIA audio fixture JSON is expected to live.
 *
 * Resolution order:
 *  1. `NODEBENCH_GAIA_CAPABILITY_AUDIO_FIXTURE_PATH`, when set and non-empty:
 *     returned unchanged if absolute, otherwise resolved against the repo root.
 *  2. `<repoRoot>/.cache/gaia/gaia_capability_audio_<config>_<split>.sample.json`,
 *     where config/split come from `NODEBENCH_GAIA_CAPABILITY_CONFIG`
 *     (default "2023_all") and `NODEBENCH_GAIA_CAPABILITY_SPLIT`
 *     (default "validation").
 *
 * @returns {string} Path to the fixture file (existence is not checked here).
 */
function resolveCapabilityAudioFixturePath() {
    const explicitPath = process.env.NODEBENCH_GAIA_CAPABILITY_AUDIO_FIXTURE_PATH;
    if (explicitPath) {
        return path.isAbsolute(explicitPath)
            ? explicitPath
            : path.resolve(resolveRepoRoot(), explicitPath);
    }
    const datasetConfig = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
    const datasetSplit = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
    const fileName = `gaia_capability_audio_${datasetConfig}_${datasetSplit}.sample.json`;
    return path.join(resolveRepoRoot(), ".cache", "gaia", fileName);
}
|
|
52
|
-
/**
 * Parses one line of a dotenv-style file.
 *
 * @param {string} rawLine - A single line, without its line terminator.
 * @returns {[string, string] | null} `[key, value]`, or `null` for blank
 *   lines, `#` comments, and lines that have no `KEY=` part.
 */
function parseDotEnvLine(rawLine) {
    const trimmed = rawLine.trim();
    if (!trimmed || trimmed.startsWith("#")) {
        return null;
    }
    // Accept shell-style `export KEY=value`; without this the key would be
    // the unusable string "export KEY".
    const line = trimmed.replace(/^export\s+/, "");
    const idx = line.indexOf("=");
    if (idx <= 0) {
        return null;
    }
    const key = line.slice(0, idx).trim();
    let value = line.slice(idx + 1).trim();
    // Strip one pair of matching quotes. The length check keeps a value that
    // is a lone quote character from being treated as an empty quoted string.
    const quote = value[0];
    const isQuoted = value.length >= 2 && (quote === "\"" || quote === "'") && value.endsWith(quote);
    if (isQuoted) {
        value = value.slice(1, -1);
    }
    return [key, value];
}

/**
 * Loads `<repoRoot>/.env.local` into `process.env` if the file exists.
 *
 * Each `KEY=value` line (optionally prefixed with `export`, optionally
 * wrapped in matching single or double quotes) is applied only when
 * `process.env[KEY]` is currently unset or empty, so values already present
 * in the environment take precedence over the file. Does nothing when the
 * file is missing.
 *
 * @returns {void}
 */
function loadDotEnvLocalIfPresent() {
    const envPath = path.join(resolveRepoRoot(), ".env.local");
    if (!existsSync(envPath)) {
        return;
    }
    const text = readFileSync(envPath, "utf8");
    for (const rawLine of text.split(/\r?\n/)) {
        const entry = parseDotEnvLine(rawLine);
        if (entry === null) {
            continue;
        }
        const [key, value] = entry;
        if (!process.env[key]) {
            process.env[key] = value;
        }
    }
}
|
|
75
|
-
/**
 * Generates text from a chat history with the benchmark's sampling settings.
 *
 * The temperature is read from `NODEBENCH_GAIA_CAPABILITY_TEMPERATURE`
 * (default "0"); a value that does not parse to a finite number falls back
 * to 0. Output is capped at 1024 tokens.
 *
 * @param {object} llm - Client passed through to `generateTextFromHistory`.
 * @param {Array<object>} history - Conversation turns passed through unchanged.
 * @returns {Promise<*>} Whatever `generateTextFromHistory` resolves to.
 */
async function llmGenerateText(llm, history) {
    const rawTemperature = process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0";
    let temperature = Number.parseFloat(rawTemperature);
    if (!Number.isFinite(temperature)) {
        temperature = 0;
    }
    const options = { temperature, maxOutputTokens: 1024 };
    return generateTextFromHistory(llm, history, options);
}
|
|
82
|
-
/**
 * Asks the model to answer a task from its own knowledge, with no tools.
 *
 * @param {object} llm - Client forwarded to `llmGenerateText`.
 * @param {{ prompt: string }} task - Task whose `prompt` is embedded in the request.
 * @returns {Promise<*>} The result of `llmGenerateText` for a single user turn.
 */
async function baselineAnswer(llm, task) {
    const instruction = [
        "Answer the question using your existing knowledge only. Do not browse the web.",
        "Return ONLY the final answer, no explanation.",
        `Question:\n${task.prompt}`,
    ].join("\n\n");
    const userTurn = { role: "user", parts: [{ text: instruction }] };
    return llmGenerateText(llm, [userTurn]);
}
|
|
95
|
-
/**
 * Reads a UTF-8 JSON fixture file and returns the parsed value.
 *
 * @param {string} filePath - Path of the fixture file to read.
 * @returns {Promise<any>} The parsed JSON content.
 * @throws {SyntaxError} When the file is not valid JSON. The message names
 *   the file and the original parse error is attached as `cause`.
 *   File-system errors from `readFile` propagate unchanged.
 */
async function loadFixture(filePath) {
    const raw = await readFile(filePath, "utf8");
    // `readFile` keeps a leading UTF-8 byte-order mark and `JSON.parse`
    // rejects it, so drop it before parsing.
    const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
    try {
        return JSON.parse(text);
    } catch (err) {
        throw new SyntaxError(`Invalid JSON in fixture ${filePath}: ${err?.message ?? String(err)}`, { cause: err });
    }
}
|
|
100
|
-
/**
 * Builds a name → tool lookup table.
 *
 * @param {Iterable<{ name: string }>} tools - Tool definitions to index.
 * @returns {Map<string, object>} Map keyed by `tool.name`; when names repeat,
 *   the later tool replaces the earlier one.
 */
function createToolIndex(tools) {
    const entries = [...tools].map((tool) => [tool.name, tool]);
    return new Map(entries);
}
|
|
106
|
-
/**
 * Answers a task by transcribing its audio attachment with the local
 * `transcribe_audio_file` tool and giving the transcript to the model.
 *
 * @param {object} llm - Client forwarded to `llmGenerateText`.
 * @param {{ prompt: string, localFilePath?: string }} task - Task to solve;
 *   `localFilePath` must point at the audio file.
 * @param {{ maxToolCalls: number }} opts - Tool budget; must be at least 1.
 * @returns {Promise<{ answer: *, toolCalls: number }>} The model's answer and
 *   the number of tool calls made (always 1).
 * @throws {Error} When the task has no `localFilePath`, the tool is not
 *   registered, `maxToolCalls` is not a number >= 1, or the transcript is empty.
 */
async function toolAugmentedAnswerFromAudio(llm, task, opts) {
    const localPath = String(task.localFilePath ?? "").trim();
    if (!localPath) {
        throw new Error("Task missing localFilePath");
    }
    const toolIndex = createToolIndex(localFileTools);
    const tool = toolIndex.get("transcribe_audio_file");
    if (!tool) {
        throw new Error("Missing tool: transcribe_audio_file");
    }
    // Written as `!(x >= 1)` rather than `x < 1` so that NaN (e.g. from a
    // failed parseInt) and a missing value are rejected instead of slipping
    // past the budget check.
    if (!(opts?.maxToolCalls >= 1)) {
        throw new Error("maxToolCalls must be >= 1 to run audio lane");
    }
    const transcript = await tool.handler({
        path: localPath,
        model: process.env.NODEBENCH_AUDIO_MODEL ?? "tiny.en",
        maxChars: 20000,
        timeoutMs: 300000,
    });
    const transcriptText = String(transcript?.text ?? "").trim();
    if (!transcriptText) {
        throw new Error("Empty transcript from transcribe_audio_file");
    }
    const contents = [
        {
            role: "user",
            parts: [
                {
                    text: `You are given a transcript of an attached audio file. Use it to answer the question.\n\nRules:\n- Do not browse the web.\n- Return ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}\n\nAudio transcript:\n${transcriptText}`,
                },
            ],
        },
    ];
    const answer = await llmGenerateText(llm, contents);
    return { answer, toolCalls: 1 };
}
|
|
140
|
-
/**
 * GAIA audio lane benchmark.
 *
 * For each task in the fixture (up to the configured limit) the test asks a
 * baseline model and a transcript-augmented model, scores both answers
 * against `task.expectedAnswer`, logs aggregate metrics, optionally writes
 * JSON reports, and finally asserts that the tool-augmented pass rate is not
 * lower than the baseline pass rate.
 */
describe("GAIA capability: audio lane", () => {
    // Registered as skipped unless `shouldRun` is true.
    const testFn = shouldRun ? it : it.skip;
    testFn("should measure accuracy delta on a small GAIA audio subset", async () => {
        loadDotEnvLocalIfPresent();
        const fixturePath = resolveCapabilityAudioFixturePath();
        if (!existsSync(fixturePath)) {
            throw new Error(`Missing GAIA audio fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityAudioFixture.py`);
        }
        const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
        const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
        const baselineLlm = await createTextLlmClient({ model: baselineModel });
        const toolsLlm = await createTextLlmClient({ model: toolsModel });
        const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
        const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
        const fixture = await loadFixture(fixturePath);
        expect(Array.isArray(fixture.tasks)).toBe(true);
        expect(fixture.tasks.length).toBeGreaterThan(0);
        // Task limit: clamped to [1, number of fixture tasks]; unparsable input falls back to 4.
        const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "4", 10);
        const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 4));
        const tasks = fixture.tasks.slice(0, taskLimit);
        // Concurrency: clamped to [1, number of selected tasks]; unparsable input falls back to 1.
        const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
        const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
        // Tool budget: unparsable input falls back to 1 (same convention as the
        // two settings above) instead of propagating NaN to the audio lane.
        const requestedMaxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "1", 10);
        const maxToolCalls = Number.isFinite(requestedMaxToolCalls) ? requestedMaxToolCalls : 1;
        // Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
        const judge = await autoDiscoverJudge(toolsLlm);
        const results = new Array(tasks.length);
        // Shared cursor for the worker pool. Each worker claims the next index
        // synchronously (`nextIndex++` runs before any await), so no two
        // workers process the same task.
        let nextIndex = 0;
        const workers = Array.from({ length: concurrency }, () => (async () => {
            while (true) {
                const idx = nextIndex++;
                if (idx >= tasks.length) {
                    return;
                }
                const task = tasks[idx];
                try {
                    const baseStart = performance.now();
                    const base = await baselineAnswer(baselineLlm, task);
                    const baseMs = performance.now() - baseStart;
                    const toolsStart = performance.now();
                    const tools = await toolAugmentedAnswerFromAudio(toolsLlm, task, { maxToolCalls });
                    const toolsMs = performance.now() - toolsStart;
                    const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
                    const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect: baseJudge.match,
                        toolsCorrect: toolsJudge.match,
                        baselineMs: baseMs,
                        toolsMs,
                        toolCalls: tools.toolCalls,
                        judgeProvider: toolsJudge.judgeProvider,
                        judgeInvoked: toolsJudge.judgeInvoked,
                    };
                }
                catch (err) {
                    // A failing task is recorded as incorrect for both lanes with
                    // zeroed timings; it does not abort the remaining tasks.
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect: false,
                        toolsCorrect: false,
                        baselineMs: 0,
                        toolsMs: 0,
                        toolCalls: 0,
                        error: err?.message ?? String(err),
                    };
                }
            }
        })());
        await Promise.all(workers);
        const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
        const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
        const baselinePassRate = (baselineCorrect / results.length) * 100;
        const toolsPassRate = (toolsCorrect / results.length) * 100;
        const avgBaseMs = results.reduce((sum, r) => sum + r.baselineMs, 0) / results.length;
        const avgToolsMs = results.reduce((sum, r) => sum + r.toolsMs, 0) / results.length;
        const avgToolCalls = results.reduce((sum, r) => sum + r.toolCalls, 0) / results.length;
        // improved: wrong at baseline, right with tools; regressions: the reverse.
        const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
        const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
        console.log(`[gaia-capability-audio] tasks=${results.length} baseline=${baselineCorrect}/${results.length} (${baselinePassRate.toFixed(1)}%) tools=${toolsCorrect}/${results.length} (${toolsPassRate.toFixed(1)}%) delta=${(toolsPassRate - baselinePassRate).toFixed(1)}% improved=${improved} regressions=${regressions} avgToolCalls=${avgToolCalls.toFixed(2)}`);
        const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "audio").toLowerCase();
        // Aggregate-only summary: contains counts, rates and timings, never per-task text.
        const publicSummary = {
            suiteId: "gaia_capability_audio",
            lane: "audio",
            generatedAtIso: new Date().toISOString(),
            config: fixture.config,
            split: fixture.split,
            taskCount: results.length,
            concurrency,
            baseline: {
                model: baselineModelLabel,
                correct: baselineCorrect,
                passRatePct: baselinePassRate,
                avgMs: avgBaseMs,
            },
            tools: {
                model: toolsModelLabel,
                mode: toolsMode,
                correct: toolsCorrect,
                passRatePct: toolsPassRate,
                avgMs: avgToolsMs,
                avgToolCalls,
            },
            improved,
            regressions,
            notes: "GAIA audio lane (audio attachments). No prompts/answers persisted; only aggregate metrics are written to public/evals.",
        };
        if (shouldWriteReport) {
            const repoRoot = resolveRepoRoot();
            await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_audio_latest.json"), publicSummary);
            // The detailed report adds per-task rows (task ID, correctness,
            // rounded timings, tool calls, error message) and goes under `.cache/gaia/reports`.
            const detailed = {
                ...publicSummary,
                results: results.map((r) => ({
                    taskId: r.taskId,
                    baselineCorrect: r.baselineCorrect,
                    toolsCorrect: r.toolsCorrect,
                    baselineMs: Math.round(r.baselineMs),
                    toolsMs: Math.round(r.toolsMs),
                    toolCalls: r.toolCalls,
                    ...(r.error ? { error: r.error } : {}),
                })),
            };
            // ISO timestamp with ":" and "." replaced by "-" for use in the file name.
            const stamp = new Date().toISOString().replace(/[:.]/g, "-");
            await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_audio_${fixture.config}_${fixture.split}_${stamp}.json`), detailed);
        }
        expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
    }, 600000);
});
|
|
265
|
-
//# sourceMappingURL=gaiaCapabilityAudioEval.test.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"gaiaCapabilityAudioEval.test.js","sourceRoot":"","sources":["../../src/__tests__/gaiaCapabilityAudioEval.test.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EACL,mBAAmB,EACnB,uBAAuB,GAGxB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AA4CpF,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,KAAK,GAAG,CAAC;AACpE,MAAM,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,2BAA2B,KAAK,GAAG,CAAC;AAwB1E,KAAK,UAAU,aAAa,CAAC,QAAgB,EAAE,OAAgB;IAC7D,IAAI,CAAC;QACH,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,MAAM,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,IAAI,CAAC,gDAAgD,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC9F,CAAC;AACH,CAAC;AAED,SAAS,eAAe;IACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,iCAAiC;IACxC,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC;IAC1E,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;YAAE,OAAO,QAAQ,CAAC;QAC/C,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gCAAgC,IAAI,UAAU,CAAC;IAC1E,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,+BAA+B,IAAI,YAAY,CAAC;IAC1E,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,yBAAyB,MAAM,IAAI,KAAK,cAAc,CAAC,CAAC;AACvG,CAAC;AAED,SAAS,wBAAwB;IAC/B,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAClD,IAAI,CAAC
,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO;IAEjC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,EAAE,MAAM,CAAW,CAAC;IACrD,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,IAAI,CAAC;YAAE,SAAS;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACvC,IACE,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YAChD,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC9C,CAAC;YACD,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;IAClD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,GAAkB,EAAE,OAAgC;IACjF,MAAM,WAAW,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,CAAC,CAAC;IAChG,OAAO,uBAAuB,CAAC,GAAG,EAAE,OAAO,EAAE;QAC3C,WAAW,EAAE,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC3D,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,GAAkB,EAAE,IAAoB;IACpE,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,iJAAiJ,IAAI,CAAC,MAAM,EAAE;iBACrK;aACF;SACF;KACF,CAAC;IACF,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAsB,CAAC;IAClD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,KAAgB;IACvC,MAAM,CAAC,GAAG,IAAI,GAAG,EAAmB,CAAC;IACrC,KAAK,MAAM,CAAC,IAAI,KAAK;QAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACxC,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,4BAA4B,CACzC,GAAkB,EAClB,IAAoB,EACpB,IAA8B;IAE9B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1D,IAAI,CAAC,SAAS;QAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,C
AAC;IAE9D,MAAM,SAAS,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;IAClD,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IACpD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAElE,IAAI,IAAI,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,MAAM,IAAI,CAAC,OAAO,CAAC;QACrC,IAAI,EAAE,SAAS;QACf,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,qBAAqB,IAAI,SAAS;QACrD,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,MAAM;KAClB,CAAC,CAAQ,CAAC;IAEX,MAAM,cAAc,GAAG,MAAM,CAAC,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7D,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,2LAA2L,IAAI,CAAC,MAAM,0BAA0B,cAAc,EAAE;iBACvP;aACF;SACF;KACF,CAAC;IAEF,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACpD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;AAClC,CAAC;AAED,QAAQ,CAAC,6BAA6B,EAAE,GAAG,EAAE;IAC3C,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC;IAExC,MAAM,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;QAC9E,wBAAwB,EAAE,CAAC;QAE3B,MAAM,WAAW,GAAG,iCAAiC,EAAE,CAAC;QACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,4GAA4G,CACzJ,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,IAAI,wBAAwB,CAAC;QAC5F,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,aAAa,CAAC;QAC3E,MAAM,WAAW,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,aAAa,EAAE,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAC;QAClE,MAAM,kBAAkB,GAAG,GAAG,WAAW,CAAC,QAAQ,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;QAC1E,MAAM,eAAe,GAAG,GAAG,QAAQ,CAAC,QAAQ,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAEjE,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAAC,CAAC;QAC/C,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,cAAc,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QACpG,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACx
B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CACrF,CAAC;QACF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAEhD,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC3G,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACzF,CAAC;QAEF,MAAM,YAAY,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,wCAAwC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAEtG,wEAAwE;QACxE,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAEhD,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxD,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,CACvD,CAAC,KAAK,IAAI,EAAE;YACV,OAAO,IAAI,EAAE,CAAC;gBACZ,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;gBACxB,IAAI,GAAG,IAAI,KAAK,CAAC,MAAM;oBAAE,OAAO;gBAEhC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;gBAExB,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACpC,MAAM,IAAI,GAAG,MAAM,cAAc,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;oBACrD,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;oBAE7C,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACrC,MAAM,KAAK,GAAG,MAAM,4BAA4B,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;oBACnF,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC;oBAE/C,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC;oBAChF,MAAM,UAAU,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;oBAEzF,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,SAAS,CAAC,KAAK;wBAChC,YAAY,EAAE,UAAU,CAAC,KAAK;wBAC9B,UAAU,EAAE,MAAM;wBAClB,OAAO;wBACP,SAAS,EAAE,KAAK,CAAC,SAAS;wBAC1B,aAAa,EAAE,UAAU,CAAC,aAAa;wBACvC,YAAY,EAAE,UAAU,CAAC,YAAY;qBACtC,CAAC;gBACJ,CAAC;gBAAC,OAAO,GAAQ,EAAE,CAAC;oBAClB,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,KAAK;wBACtB,YAAY,EAAE,KAAK;wBACnB,UAAU,EAAE,C
AAC;wBACb,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,CAAC;wBACZ,KAAK,EAAE,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC;qBACnC,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC,CAAC,EAAE,CACL,CAAC;QAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAE3B,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAClE,MAAM,gBAAgB,GAAG,CAAC,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAClE,MAAM,aAAa,GAAG,CAAC,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAC5D,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACrF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACnF,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEvF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QACpF,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAEvF,OAAO,CAAC,GAAG,CACT,iCAAiC,OAAO,CAAC,MAAM,aAAa,eAAe,IAAI,OAAO,CAAC,MAAM,KAAK,gBAAgB,CAAC,OAAO,CACxH,CAAC,CACF,YAAY,YAAY,IAAI,OAAO,CAAC,MAAM,KAAK,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,CAClF,aAAa,GAAG,gBAAgB,CACjC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,QAAQ,gBAAgB,WAAW,iBAAiB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACxG,CAAC;QAEF,MAAM,SAAS,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAC9F,MAAM,aAAa,GAAqC;YACtD,OAAO,EAAE,uBAAuB;YAChC,IAAI,EAAE,OAAO;YACb,cAAc,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACxC,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE;gBACR,KAAK,EAAE,kBAAkB;gBACzB,OAAO,EAAE,eAAe;gBACxB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,SAAS;aACjB;YACD,KAAK,EAAE;gBACL,KAAK,EAAE,eAAe;gBAC
tB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,YAAY;gBACrB,WAAW,EAAE,aAAa;gBAC1B,KAAK,EAAE,UAAU;gBACjB,YAAY;aACb;YACD,QAAQ;YACR,WAAW;YACX,KAAK,EACH,wHAAwH;SAC3H,CAAC;QAEF,IAAI,iBAAiB,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;YACnC,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,mCAAmC,CAAC,EAC3E,aAAa,CACd,CAAC;YAEF,MAAM,QAAQ,GAAG;gBACf,GAAG,aAAa;gBAChB,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,MAAM,EAAE,CAAC,CAAC,MAAM;oBAChB,eAAe,EAAE,CAAC,CAAC,eAAe;oBAClC,YAAY,EAAE,CAAC,CAAC,YAAY;oBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;oBACpC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC;oBAC9B,SAAS,EAAE,CAAC,CAAC,SAAS;oBACtB,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBACvC,CAAC,CAAC;aACJ,CAAC;YACF,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC7D,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CACP,QAAQ,EACR,QAAQ,EACR,MAAM,EACN,SAAS,EACT,yBAAyB,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,KAAK,IAAI,KAAK,OAAO,CACzE,EACD,QAAQ,CACT,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,aAAa,CAAC,CAAC,sBAAsB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC,EAAE,MAAM,CAAC,CAAC;AACb,CAAC,CAAC,CAAC"}
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* GAIA capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP tools.
|
|
3
|
-
*
|
|
4
|
-
* This test attempts to solve a small GAIA subset and scores answers against
|
|
5
|
-
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
6
|
-
*
|
|
7
|
-
* Safety:
|
|
8
|
-
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
9
|
-
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
10
|
-
*
|
|
11
|
-
* Disabled by default (cost + rate limits + external network). Run with:
|
|
12
|
-
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
13
|
-
*/
|
|
14
|
-
export {};
|