nodebench-mcp 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/NODEBENCH_AGENTS.md +74 -67
  2. package/README.md +36 -34
  3. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  4. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  5. package/dist/dashboard/operatingServer.js +3 -2
  6. package/dist/dashboard/operatingServer.js.map +1 -1
  7. package/dist/db.js +51 -3
  8. package/dist/db.js.map +1 -1
  9. package/dist/index.js +19 -18
  10. package/dist/index.js.map +1 -1
  11. package/dist/packageInfo.d.ts +3 -0
  12. package/dist/packageInfo.js +32 -0
  13. package/dist/packageInfo.js.map +1 -0
  14. package/dist/sandboxApi.js +2 -1
  15. package/dist/sandboxApi.js.map +1 -1
  16. package/dist/tools/boilerplateTools.js +10 -9
  17. package/dist/tools/boilerplateTools.js.map +1 -1
  18. package/dist/tools/documentationTools.js +2 -1
  19. package/dist/tools/documentationTools.js.map +1 -1
  20. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  21. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  22. package/dist/tools/toolRegistry.js +11 -0
  23. package/dist/tools/toolRegistry.js.map +1 -1
  24. package/dist/toolsetRegistry.js +74 -1
  25. package/dist/toolsetRegistry.js.map +1 -1
  26. package/package.json +7 -6
  27. package/scripts/install.sh +14 -14
  28. package/dist/__tests__/analytics.test.d.ts +0 -11
  29. package/dist/__tests__/analytics.test.js +0 -546
  30. package/dist/__tests__/analytics.test.js.map +0 -1
  31. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  32. package/dist/__tests__/architectComplex.test.js +0 -373
  33. package/dist/__tests__/architectComplex.test.js.map +0 -1
  34. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  35. package/dist/__tests__/architectSmoke.test.js +0 -92
  36. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  37. package/dist/__tests__/audit-registry.d.ts +0 -1
  38. package/dist/__tests__/audit-registry.js +0 -60
  39. package/dist/__tests__/audit-registry.js.map +0 -1
  40. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  41. package/dist/__tests__/batchAutopilot.test.js +0 -218
  42. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  43. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  44. package/dist/__tests__/cliSubcommands.test.js +0 -138
  45. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  46. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  47. package/dist/__tests__/comparativeBench.test.js +0 -722
  48. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  49. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  50. package/dist/__tests__/critterCalibrationEval.js +0 -370
  51. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  52. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  53. package/dist/__tests__/dynamicLoading.test.js +0 -280
  54. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  55. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  56. package/dist/__tests__/embeddingProvider.test.js +0 -86
  57. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  58. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  59. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  60. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  61. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  62. package/dist/__tests__/evalHarness.test.js +0 -1107
  63. package/dist/__tests__/evalHarness.test.js.map +0 -1
  64. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  65. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  66. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  67. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  69. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  70. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  72. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  73. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  74. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  75. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  76. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  78. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  79. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  80. package/dist/__tests__/forecastingScoring.test.js +0 -202
  81. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  83. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  84. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  86. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  87. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  90. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  91. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  92. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  93. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  94. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  95. package/dist/__tests__/helpers/answerMatch.js +0 -267
  96. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  97. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  98. package/dist/__tests__/helpers/textLlm.js +0 -214
  99. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  100. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  101. package/dist/__tests__/localDashboard.test.js +0 -226
  102. package/dist/__tests__/localDashboard.test.js.map +0 -1
  103. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  104. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  105. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  108. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  111. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  114. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  116. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  117. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  118. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  119. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  120. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  121. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  122. package/dist/__tests__/openclawDogfood.test.js +0 -535
  123. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  124. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  125. package/dist/__tests__/openclawMessaging.test.js +0 -232
  126. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  127. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  128. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  129. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  130. package/dist/__tests__/tools.test.d.ts +0 -1
  131. package/dist/__tests__/tools.test.js +0 -3201
  132. package/dist/__tests__/tools.test.js.map +0 -1
  133. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  134. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  135. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  136. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  137. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  138. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  139. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  140. package/dist/__tests__/webmcpTools.test.js +0 -195
  141. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  142. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  143. package/dist/benchmarks/testProviderBus.js +0 -272
  144. package/dist/benchmarks/testProviderBus.js.map +0 -1
  145. package/dist/hooks/postCompaction.d.ts +0 -14
  146. package/dist/hooks/postCompaction.js +0 -51
  147. package/dist/hooks/postCompaction.js.map +0 -1
  148. package/dist/security/__tests__/security.test.d.ts +0 -8
  149. package/dist/security/__tests__/security.test.js +0 -295
  150. package/dist/security/__tests__/security.test.js.map +0 -1
  151. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  152. package/dist/sync/hyperloopEval.test.js +0 -60
  153. package/dist/sync/hyperloopEval.test.js.map +0 -1
  154. package/dist/sync/store.test.d.ts +0 -4
  155. package/dist/sync/store.test.js +0 -43
  156. package/dist/sync/store.test.js.map +0 -1
  157. package/dist/tools/documentTools.d.ts +0 -5
  158. package/dist/tools/documentTools.js +0 -524
  159. package/dist/tools/documentTools.js.map +0 -1
  160. package/dist/tools/financialTools.d.ts +0 -10
  161. package/dist/tools/financialTools.js +0 -403
  162. package/dist/tools/financialTools.js.map +0 -1
  163. package/dist/tools/memoryTools.d.ts +0 -5
  164. package/dist/tools/memoryTools.js +0 -137
  165. package/dist/tools/memoryTools.js.map +0 -1
  166. package/dist/tools/planningTools.d.ts +0 -5
  167. package/dist/tools/planningTools.js +0 -147
  168. package/dist/tools/planningTools.js.map +0 -1
  169. package/dist/tools/searchTools.d.ts +0 -5
  170. package/dist/tools/searchTools.js +0 -145
  171. package/dist/tools/searchTools.js.map +0 -1
@@ -1,265 +0,0 @@
1
- /**
2
- * GAIA audio-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local audio tools.
3
- *
4
- * This lane targets GAIA tasks that include audio attachments (MP3/WAV/etc).
5
- * We provide deterministic local transcription via NodeBench MCP tools and score answers against
6
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
7
- *
8
- * Safety:
9
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
10
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
11
- *
12
- * Disabled by default (cost + rate limits). Run with:
13
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
14
- */
15
- import { describe, expect, it } from "vitest";
16
- import { existsSync, readFileSync } from "node:fs";
17
- import { mkdir, readFile, writeFile } from "node:fs/promises";
18
- import path from "node:path";
19
- import { fileURLToPath } from "node:url";
20
- import { performance } from "node:perf_hooks";
21
- import { localFileTools } from "../tools/localFileTools.js";
22
- import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
23
- import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
24
- const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
25
- const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
26
- async function safeWriteJson(filePath, payload) {
27
- try {
28
- await mkdir(path.dirname(filePath), { recursive: true });
29
- await writeFile(filePath, JSON.stringify(payload, null, 2) + "\n", "utf8");
30
- }
31
- catch (err) {
32
- console.warn(`[gaia-capability-audio] report write failed: ${err?.message ?? String(err)}`);
33
- }
34
- }
35
- function resolveRepoRoot() {
36
- const testDir = path.dirname(fileURLToPath(import.meta.url));
37
- return path.resolve(testDir, "../../../..");
38
- }
39
- function resolveCapabilityAudioFixturePath() {
40
- const override = process.env.NODEBENCH_GAIA_CAPABILITY_AUDIO_FIXTURE_PATH;
41
- if (override) {
42
- if (path.isAbsolute(override))
43
- return override;
44
- const repoRoot = resolveRepoRoot();
45
- return path.resolve(repoRoot, override);
46
- }
47
- const config = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
48
- const split = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
49
- const repoRoot = resolveRepoRoot();
50
- return path.join(repoRoot, ".cache", "gaia", `gaia_capability_audio_${config}_${split}.sample.json`);
51
- }
52
- function loadDotEnvLocalIfPresent() {
53
- const repoRoot = resolveRepoRoot();
54
- const envPath = path.join(repoRoot, ".env.local");
55
- if (!existsSync(envPath))
56
- return;
57
- const text = readFileSync(envPath, "utf8");
58
- for (const rawLine of text.split(/\r?\n/)) {
59
- const line = rawLine.trim();
60
- if (!line || line.startsWith("#"))
61
- continue;
62
- const idx = line.indexOf("=");
63
- if (idx <= 0)
64
- continue;
65
- const key = line.slice(0, idx).trim();
66
- let value = line.slice(idx + 1).trim();
67
- if ((value.startsWith("\"") && value.endsWith("\"")) ||
68
- (value.startsWith("'") && value.endsWith("'"))) {
69
- value = value.slice(1, -1);
70
- }
71
- if (!process.env[key])
72
- process.env[key] = value;
73
- }
74
- }
75
- async function llmGenerateText(llm, history) {
76
- const temperature = Number.parseFloat(process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0");
77
- return generateTextFromHistory(llm, history, {
78
- temperature: Number.isFinite(temperature) ? temperature : 0,
79
- maxOutputTokens: 1024,
80
- });
81
- }
82
- async function baselineAnswer(llm, task) {
83
- const contents = [
84
- {
85
- role: "user",
86
- parts: [
87
- {
88
- text: `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`,
89
- },
90
- ],
91
- },
92
- ];
93
- return llmGenerateText(llm, contents);
94
- }
95
- async function loadFixture(filePath) {
96
- const raw = await readFile(filePath, "utf8");
97
- const json = JSON.parse(raw);
98
- return json;
99
- }
100
- function createToolIndex(tools) {
101
- const m = new Map();
102
- for (const t of tools)
103
- m.set(t.name, t);
104
- return m;
105
- }
106
- async function toolAugmentedAnswerFromAudio(llm, task, opts) {
107
- const localPath = String(task.localFilePath ?? "").trim();
108
- if (!localPath)
109
- throw new Error("Task missing localFilePath");
110
- const toolIndex = createToolIndex(localFileTools);
111
- const tool = toolIndex.get("transcribe_audio_file");
112
- if (!tool)
113
- throw new Error("Missing tool: transcribe_audio_file");
114
- if (opts.maxToolCalls < 1) {
115
- throw new Error("maxToolCalls must be >= 1 to run audio lane");
116
- }
117
- const transcript = (await tool.handler({
118
- path: localPath,
119
- model: process.env.NODEBENCH_AUDIO_MODEL ?? "tiny.en",
120
- maxChars: 20000,
121
- timeoutMs: 300000,
122
- }));
123
- const transcriptText = String(transcript?.text ?? "").trim();
124
- if (!transcriptText) {
125
- throw new Error("Empty transcript from transcribe_audio_file");
126
- }
127
- const contents = [
128
- {
129
- role: "user",
130
- parts: [
131
- {
132
- text: `You are given a transcript of an attached audio file. Use it to answer the question.\n\nRules:\n- Do not browse the web.\n- Return ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}\n\nAudio transcript:\n${transcriptText}`,
133
- },
134
- ],
135
- },
136
- ];
137
- const answer = await llmGenerateText(llm, contents);
138
- return { answer, toolCalls: 1 };
139
- }
140
- describe("GAIA capability: audio lane", () => {
141
- const testFn = shouldRun ? it : it.skip;
142
- testFn("should measure accuracy delta on a small GAIA audio subset", async () => {
143
- loadDotEnvLocalIfPresent();
144
- const fixturePath = resolveCapabilityAudioFixturePath();
145
- if (!existsSync(fixturePath)) {
146
- throw new Error(`Missing GAIA audio fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityAudioFixture.py`);
147
- }
148
- const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
149
- const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
150
- const baselineLlm = await createTextLlmClient({ model: baselineModel });
151
- const toolsLlm = await createTextLlmClient({ model: toolsModel });
152
- const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
153
- const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
154
- const fixture = await loadFixture(fixturePath);
155
- expect(Array.isArray(fixture.tasks)).toBe(true);
156
- expect(fixture.tasks.length).toBeGreaterThan(0);
157
- const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "4", 10);
158
- const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 4));
159
- const tasks = fixture.tasks.slice(0, taskLimit);
160
- const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
161
- const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
162
- const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "1", 10);
163
- // Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
164
- const judge = await autoDiscoverJudge(toolsLlm);
165
- const results = new Array(tasks.length);
166
- let nextIndex = 0;
167
- const workers = Array.from({ length: concurrency }, () => (async () => {
168
- while (true) {
169
- const idx = nextIndex++;
170
- if (idx >= tasks.length)
171
- return;
172
- const task = tasks[idx];
173
- try {
174
- const baseStart = performance.now();
175
- const base = await baselineAnswer(baselineLlm, task);
176
- const baseMs = performance.now() - baseStart;
177
- const toolsStart = performance.now();
178
- const tools = await toolAugmentedAnswerFromAudio(toolsLlm, task, { maxToolCalls });
179
- const toolsMs = performance.now() - toolsStart;
180
- const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
181
- const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
182
- results[idx] = {
183
- taskId: task.id,
184
- baselineCorrect: baseJudge.match,
185
- toolsCorrect: toolsJudge.match,
186
- baselineMs: baseMs,
187
- toolsMs,
188
- toolCalls: tools.toolCalls,
189
- judgeProvider: toolsJudge.judgeProvider,
190
- judgeInvoked: toolsJudge.judgeInvoked,
191
- };
192
- }
193
- catch (err) {
194
- results[idx] = {
195
- taskId: task.id,
196
- baselineCorrect: false,
197
- toolsCorrect: false,
198
- baselineMs: 0,
199
- toolsMs: 0,
200
- toolCalls: 0,
201
- error: err?.message ?? String(err),
202
- };
203
- }
204
- }
205
- })());
206
- await Promise.all(workers);
207
- const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
208
- const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
209
- const baselinePassRate = (baselineCorrect / results.length) * 100;
210
- const toolsPassRate = (toolsCorrect / results.length) * 100;
211
- const avgBaseMs = results.reduce((sum, r) => sum + r.baselineMs, 0) / results.length;
212
- const avgToolsMs = results.reduce((sum, r) => sum + r.toolsMs, 0) / results.length;
213
- const avgToolCalls = results.reduce((sum, r) => sum + r.toolCalls, 0) / results.length;
214
- const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
215
- const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
216
- console.log(`[gaia-capability-audio] tasks=${results.length} baseline=${baselineCorrect}/${results.length} (${baselinePassRate.toFixed(1)}%) tools=${toolsCorrect}/${results.length} (${toolsPassRate.toFixed(1)}%) delta=${(toolsPassRate - baselinePassRate).toFixed(1)}% improved=${improved} regressions=${regressions} avgToolCalls=${avgToolCalls.toFixed(2)}`);
217
- const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "audio").toLowerCase();
218
- const publicSummary = {
219
- suiteId: "gaia_capability_audio",
220
- lane: "audio",
221
- generatedAtIso: new Date().toISOString(),
222
- config: fixture.config,
223
- split: fixture.split,
224
- taskCount: results.length,
225
- concurrency,
226
- baseline: {
227
- model: baselineModelLabel,
228
- correct: baselineCorrect,
229
- passRatePct: baselinePassRate,
230
- avgMs: avgBaseMs,
231
- },
232
- tools: {
233
- model: toolsModelLabel,
234
- mode: toolsMode,
235
- correct: toolsCorrect,
236
- passRatePct: toolsPassRate,
237
- avgMs: avgToolsMs,
238
- avgToolCalls,
239
- },
240
- improved,
241
- regressions,
242
- notes: "GAIA audio lane (audio attachments). No prompts/answers persisted; only aggregate metrics are written to public/evals.",
243
- };
244
- if (shouldWriteReport) {
245
- const repoRoot = resolveRepoRoot();
246
- await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_audio_latest.json"), publicSummary);
247
- const detailed = {
248
- ...publicSummary,
249
- results: results.map((r) => ({
250
- taskId: r.taskId,
251
- baselineCorrect: r.baselineCorrect,
252
- toolsCorrect: r.toolsCorrect,
253
- baselineMs: Math.round(r.baselineMs),
254
- toolsMs: Math.round(r.toolsMs),
255
- toolCalls: r.toolCalls,
256
- ...(r.error ? { error: r.error } : {}),
257
- })),
258
- };
259
- const stamp = new Date().toISOString().replace(/[:.]/g, "-");
260
- await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_audio_${fixture.config}_${fixture.split}_${stamp}.json`), detailed);
261
- }
262
- expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
263
- }, 600000);
264
- });
265
- //# sourceMappingURL=gaiaCapabilityAudioEval.test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"gaiaCapabilityAudioEval.test.js","sourceRoot":"","sources":["../../src/__tests__/gaiaCapabilityAudioEval.test.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EACL,mBAAmB,EACnB,uBAAuB,GAGxB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AA4CpF,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,KAAK,GAAG,CAAC;AACpE,MAAM,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,2BAA2B,KAAK,GAAG,CAAC;AAwB1E,KAAK,UAAU,aAAa,CAAC,QAAgB,EAAE,OAAgB;IAC7D,IAAI,CAAC;QACH,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,MAAM,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,IAAI,CAAC,gDAAgD,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC9F,CAAC;AACH,CAAC;AAED,SAAS,eAAe;IACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,iCAAiC;IACxC,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC;IAC1E,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;YAAE,OAAO,QAAQ,CAAC;QAC/C,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gCAAgC,IAAI,UAAU,CAAC;IAC1E,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,+BAA+B,IAAI,YAAY,CAAC;IAC1E,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,yBAAyB,MAAM,IAAI,KAAK,cAAc,CAAC,CAAC;AACvG,CAAC;AAED,SAAS,wBAAwB;IAC/B,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAClD,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO;IAEjC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,EAAE,MAAM,CAAW,CAAC;IACrD,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,IAAI,CAAC;YAAE,SAAS;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACvC,IACE,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YAChD,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC9C,CAAC;YACD,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;IAClD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,GAAkB,EAAE,OAAgC;IACjF,MAAM,WAAW,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,CAAC,CAAC;IAChG,OAAO,uBAAuB,CAAC,GAAG,EAAE,OAAO,EAAE;QAC3C,WAAW,EAAE,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC3D,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,GAAkB,EAAE,IAAoB;IACpE,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,iJAAiJ,IAAI,CAAC,MAAM,EAAE;iBACrK;aACF;SACF;KACF,CAAC;IACF,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAsB,CAAC;IAClD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,KAAgB;IACvC,MAAM,CAAC,GAAG,IAAI,GAAG,EAAmB,CAAC;IACrC,KAAK,MAAM,CAAC,IAAI,KAAK;QAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACxC,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,4BAA4B,CACzC,GAAkB,EAClB,IAAoB,EACpB,IAA8B;IAE9B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1D,IAAI,CAAC,SAAS;QAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAE9D,MAAM,SAAS,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;IAClD,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IACpD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAElE,IAAI,IAAI,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,MAAM,IAAI,CAAC,OAAO,CAAC;QACrC,IAAI,EAAE,SAAS;QACf,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,qBAAqB,IAAI,SAAS;QACrD,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,MAAM;KAClB,CAAC,CAAQ,CAAC;IAEX,MAAM,cAAc,GAAG,MAAM,CAAC,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7D,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,2LAA2L,IAAI,CAAC,MAAM,0BAA0B,cAAc,EAAE;iBACvP;aACF;SACF;KACF,CAAC;IAEF,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACpD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;AAClC,CAAC;AAED,QAAQ,CAAC,6BAA6B,EAAE,GAAG,EAAE;IAC3C,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC;IAExC,MAAM,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;QAC9E,wBAAwB,EAAE,CAAC;QAE3B,MAAM,WAAW,GAAG,iCAAiC,EAAE,CAAC;QACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,4GAA4G,CACzJ,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,IAAI,wBAAwB,CAAC;QAC5F,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,aAAa,CAAC;QAC3E,MAAM,WAAW,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,aAAa,EAAE,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAC;QAClE,MAAM,kBAAkB,GAAG,GAAG,WAAW,CAAC,QAAQ,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;QAC1E,MAAM,eAAe,GAAG,GAAG,QAAQ,CAAC,QAAQ,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAEjE,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAAC,CAAC;QAC/C,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,cAAc,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QACpG,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CACrF,CAAC;QACF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAEhD,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC3G,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACzF,CAAC;QAEF,MAAM,YAAY,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,wCAAwC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAEtG,wEAAwE;QACxE,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAEhD,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxD,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,CACvD,CAAC,KAAK,IAAI,EAAE;YACV,OAAO,IAAI,EAAE,CAAC;gBACZ,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;gBACxB,IAAI,GAAG,IAAI,KAAK,CAAC,MAAM;oBAAE,OAAO;gBAEhC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;gBAExB,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACpC,MAAM,IAAI,GAAG,MAAM,cAAc,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;oBACrD,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;oBAE7C,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACrC,MAAM,KAAK,GAAG,MAAM,4BAA4B,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;oBACnF,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC;oBAE/C,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC;oBAChF,MAAM,UAAU,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;oBAEzF,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,SAAS,CAAC,KAAK;wBAChC,YAAY,EAAE,UAAU,CAAC,KAAK;wBAC9B,UAAU,EAAE,MAAM;wBAClB,OAAO;wBACP,SAAS,EAAE,KAAK,CAAC,SAAS;wBAC1B,aAAa,EAAE,UAAU,CAAC,aAAa;wBACvC,YAAY,EAAE,UAAU,CAAC,YAAY;qBACtC,CAAC;gBACJ,CAAC;gBAAC,OAAO,GAAQ,EAAE,CAAC;oBAClB,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,KAAK;wBACtB,YAAY,EAAE,KAAK;wBACnB,UAAU,EAAE,CAAC;wBACb,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,CAAC;wBACZ,KAAK,EAAE,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC;qBACnC,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC,CAAC,EAAE,CACL,CAAC;QAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAE3B,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAClE,MAAM,gBAAgB,GAAG,CAAC,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAClE,MAAM,aAAa,GAAG,CAAC,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAC5D,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACrF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACnF,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEvF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QACpF,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAEvF,OAAO,CAAC,GAAG,CACT,iCAAiC,OAAO,CAAC,MAAM,aAAa,eAAe,IAAI,OAAO,CAAC,MAAM,KAAK,gBAAgB,CAAC,OAAO,CACxH,CAAC,CACF,YAAY,YAAY,IAAI,OAAO,CAAC,MAAM,KAAK,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,CAClF,aAAa,GAAG,gBAAgB,CACjC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,QAAQ,gBAAgB,WAAW,iBAAiB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACxG,CAAC;QAEF,MAAM,SAAS,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAC9F,MAAM,aAAa,GAAqC;YACtD,OAAO,EAAE,uBAAuB;YAChC,IAAI,EAAE,OAAO;YACb,cAAc,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACxC,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE;gBACR,KAAK,EAAE,kBAAkB;gBACzB,OAAO,EAAE,eAAe;gBACxB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,SAAS;aACjB;YACD,KAAK,EAAE;gBACL,KAAK,EAAE,eAAe;gBACtB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,YAAY;gBACrB,WAAW,EAAE,aAAa;gBAC1B,KAAK,EAAE,UAAU;gBACjB,YAAY;aACb;YACD,QAAQ;YACR,WAAW;YACX,KAAK,EACH,wHAAwH;SAC3H,CAAC;QAEF,IAAI,iBAAiB,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;YACnC,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,mCAAmC,CAAC,EAC3E,aAAa,CACd,CAAC;YAEF,MAAM,QAAQ,GAAG;gBACf,GAAG,aAAa;gBAChB,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,MAAM,EAAE,CAAC,CAAC,MAAM;oBAChB,eAAe,EAAE,CAAC,CAAC,eAAe;oBAClC,YAAY,EAAE,CAAC,CAAC,YAAY;oBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;oBACpC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC;oBAC9B,SAAS,EAAE,CAAC,CAAC,SAAS;oBACtB,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBACvC,CAAC,CAAC;aACJ,CAAC;YACF,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC7D,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CACP,QAAQ,EACR,QAAQ,EACR,MAAM,EACN,SAAS,EACT,yBAAyB,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,KAAK,IAAI,KAAK,OAAO,CACzE,EACD,QAAQ,CACT,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,aAAa,CAAC,CAAC,sBAAsB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC,EAAE,MAAM,CAAC,CAAC;AACb,CAAC,CAAC,CAAC"}
@@ -1,14 +0,0 @@
1
- /**
2
- * GAIA capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP tools.
3
- *
4
- * This test attempts to solve a small GAIA subset and scores answers against
5
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
6
- *
7
- * Safety:
8
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
9
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
10
- *
11
- * Disabled by default (cost + rate limits + external network). Run with:
12
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
13
- */
14
- export {};