nodebench-mcp 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168) hide show
  1. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  2. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  3. package/dist/dashboard/operatingServer.js +3 -2
  4. package/dist/dashboard/operatingServer.js.map +1 -1
  5. package/dist/db.js +51 -3
  6. package/dist/db.js.map +1 -1
  7. package/dist/index.js +13 -16
  8. package/dist/index.js.map +1 -1
  9. package/dist/packageInfo.d.ts +3 -0
  10. package/dist/packageInfo.js +32 -0
  11. package/dist/packageInfo.js.map +1 -0
  12. package/dist/sandboxApi.js +2 -1
  13. package/dist/sandboxApi.js.map +1 -1
  14. package/dist/tools/boilerplateTools.js +10 -9
  15. package/dist/tools/boilerplateTools.js.map +1 -1
  16. package/dist/tools/documentationTools.js +2 -1
  17. package/dist/tools/documentationTools.js.map +1 -1
  18. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  19. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  20. package/dist/tools/toolRegistry.js +11 -0
  21. package/dist/tools/toolRegistry.js.map +1 -1
  22. package/dist/toolsetRegistry.js +74 -1
  23. package/dist/toolsetRegistry.js.map +1 -1
  24. package/package.json +4 -3
  25. package/dist/__tests__/analytics.test.d.ts +0 -11
  26. package/dist/__tests__/analytics.test.js +0 -546
  27. package/dist/__tests__/analytics.test.js.map +0 -1
  28. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  29. package/dist/__tests__/architectComplex.test.js +0 -373
  30. package/dist/__tests__/architectComplex.test.js.map +0 -1
  31. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  32. package/dist/__tests__/architectSmoke.test.js +0 -92
  33. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  34. package/dist/__tests__/audit-registry.d.ts +0 -1
  35. package/dist/__tests__/audit-registry.js +0 -60
  36. package/dist/__tests__/audit-registry.js.map +0 -1
  37. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  38. package/dist/__tests__/batchAutopilot.test.js +0 -218
  39. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  40. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  41. package/dist/__tests__/cliSubcommands.test.js +0 -138
  42. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  43. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  44. package/dist/__tests__/comparativeBench.test.js +0 -722
  45. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  46. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  47. package/dist/__tests__/critterCalibrationEval.js +0 -370
  48. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  49. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  50. package/dist/__tests__/dynamicLoading.test.js +0 -280
  51. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  52. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  53. package/dist/__tests__/embeddingProvider.test.js +0 -86
  54. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  55. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  56. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  57. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  58. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  59. package/dist/__tests__/evalHarness.test.js +0 -1107
  60. package/dist/__tests__/evalHarness.test.js.map +0 -1
  61. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  62. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  63. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  64. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  65. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  66. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  67. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  69. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  70. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  72. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  73. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  74. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  75. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  76. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingScoring.test.js +0 -202
  78. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  79. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  80. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  81. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  83. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  84. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  86. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  87. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  90. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  91. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  92. package/dist/__tests__/helpers/answerMatch.js +0 -267
  93. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  94. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  95. package/dist/__tests__/helpers/textLlm.js +0 -214
  96. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  97. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  98. package/dist/__tests__/localDashboard.test.js +0 -226
  99. package/dist/__tests__/localDashboard.test.js.map +0 -1
  100. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  101. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  102. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  103. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  104. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  105. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  108. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  111. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  114. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  116. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  117. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  118. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  119. package/dist/__tests__/openclawDogfood.test.js +0 -535
  120. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  121. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  122. package/dist/__tests__/openclawMessaging.test.js +0 -232
  123. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  124. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  125. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  126. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  127. package/dist/__tests__/tools.test.d.ts +0 -1
  128. package/dist/__tests__/tools.test.js +0 -3201
  129. package/dist/__tests__/tools.test.js.map +0 -1
  130. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  131. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  132. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  133. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  134. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  135. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  136. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  137. package/dist/__tests__/webmcpTools.test.js +0 -195
  138. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  139. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  140. package/dist/benchmarks/testProviderBus.js +0 -272
  141. package/dist/benchmarks/testProviderBus.js.map +0 -1
  142. package/dist/hooks/postCompaction.d.ts +0 -14
  143. package/dist/hooks/postCompaction.js +0 -51
  144. package/dist/hooks/postCompaction.js.map +0 -1
  145. package/dist/security/__tests__/security.test.d.ts +0 -8
  146. package/dist/security/__tests__/security.test.js +0 -295
  147. package/dist/security/__tests__/security.test.js.map +0 -1
  148. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  149. package/dist/sync/hyperloopEval.test.js +0 -60
  150. package/dist/sync/hyperloopEval.test.js.map +0 -1
  151. package/dist/sync/store.test.d.ts +0 -4
  152. package/dist/sync/store.test.js +0 -43
  153. package/dist/sync/store.test.js.map +0 -1
  154. package/dist/tools/documentTools.d.ts +0 -5
  155. package/dist/tools/documentTools.js +0 -524
  156. package/dist/tools/documentTools.js.map +0 -1
  157. package/dist/tools/financialTools.d.ts +0 -10
  158. package/dist/tools/financialTools.js +0 -403
  159. package/dist/tools/financialTools.js.map +0 -1
  160. package/dist/tools/memoryTools.d.ts +0 -5
  161. package/dist/tools/memoryTools.js +0 -137
  162. package/dist/tools/memoryTools.js.map +0 -1
  163. package/dist/tools/planningTools.d.ts +0 -5
  164. package/dist/tools/planningTools.js +0 -147
  165. package/dist/tools/planningTools.js.map +0 -1
  166. package/dist/tools/searchTools.d.ts +0 -5
  167. package/dist/tools/searchTools.js +0 -145
  168. package/dist/tools/searchTools.js.map +0 -1
@@ -1,265 +0,0 @@
1
- /**
2
- * GAIA audio-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local audio tools.
3
- *
4
- * This lane targets GAIA tasks that include audio attachments (MP3/WAV/etc).
5
- * We provide deterministic local transcription via NodeBench MCP tools and score answers against
6
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
7
- *
8
- * Safety:
9
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
10
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
11
- *
12
- * Disabled by default (cost + rate limits). Run with:
13
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
14
- */
15
- import { describe, expect, it } from "vitest";
16
- import { existsSync, readFileSync } from "node:fs";
17
- import { mkdir, readFile, writeFile } from "node:fs/promises";
18
- import path from "node:path";
19
- import { fileURLToPath } from "node:url";
20
- import { performance } from "node:perf_hooks";
21
- import { localFileTools } from "../tools/localFileTools.js";
22
- import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
23
- import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
24
- const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
25
- const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
26
- async function safeWriteJson(filePath, payload) {
27
- try {
28
- await mkdir(path.dirname(filePath), { recursive: true });
29
- await writeFile(filePath, JSON.stringify(payload, null, 2) + "\n", "utf8");
30
- }
31
- catch (err) {
32
- console.warn(`[gaia-capability-audio] report write failed: ${err?.message ?? String(err)}`);
33
- }
34
- }
35
- function resolveRepoRoot() {
36
- const testDir = path.dirname(fileURLToPath(import.meta.url));
37
- return path.resolve(testDir, "../../../..");
38
- }
39
- function resolveCapabilityAudioFixturePath() {
40
- const override = process.env.NODEBENCH_GAIA_CAPABILITY_AUDIO_FIXTURE_PATH;
41
- if (override) {
42
- if (path.isAbsolute(override))
43
- return override;
44
- const repoRoot = resolveRepoRoot();
45
- return path.resolve(repoRoot, override);
46
- }
47
- const config = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
48
- const split = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
49
- const repoRoot = resolveRepoRoot();
50
- return path.join(repoRoot, ".cache", "gaia", `gaia_capability_audio_${config}_${split}.sample.json`);
51
- }
52
- function loadDotEnvLocalIfPresent() {
53
- const repoRoot = resolveRepoRoot();
54
- const envPath = path.join(repoRoot, ".env.local");
55
- if (!existsSync(envPath))
56
- return;
57
- const text = readFileSync(envPath, "utf8");
58
- for (const rawLine of text.split(/\r?\n/)) {
59
- const line = rawLine.trim();
60
- if (!line || line.startsWith("#"))
61
- continue;
62
- const idx = line.indexOf("=");
63
- if (idx <= 0)
64
- continue;
65
- const key = line.slice(0, idx).trim();
66
- let value = line.slice(idx + 1).trim();
67
- if ((value.startsWith("\"") && value.endsWith("\"")) ||
68
- (value.startsWith("'") && value.endsWith("'"))) {
69
- value = value.slice(1, -1);
70
- }
71
- if (!process.env[key])
72
- process.env[key] = value;
73
- }
74
- }
75
- async function llmGenerateText(llm, history) {
76
- const temperature = Number.parseFloat(process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0");
77
- return generateTextFromHistory(llm, history, {
78
- temperature: Number.isFinite(temperature) ? temperature : 0,
79
- maxOutputTokens: 1024,
80
- });
81
- }
82
- async function baselineAnswer(llm, task) {
83
- const contents = [
84
- {
85
- role: "user",
86
- parts: [
87
- {
88
- text: `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`,
89
- },
90
- ],
91
- },
92
- ];
93
- return llmGenerateText(llm, contents);
94
- }
95
- async function loadFixture(filePath) {
96
- const raw = await readFile(filePath, "utf8");
97
- const json = JSON.parse(raw);
98
- return json;
99
- }
100
- function createToolIndex(tools) {
101
- const m = new Map();
102
- for (const t of tools)
103
- m.set(t.name, t);
104
- return m;
105
- }
106
- async function toolAugmentedAnswerFromAudio(llm, task, opts) {
107
- const localPath = String(task.localFilePath ?? "").trim();
108
- if (!localPath)
109
- throw new Error("Task missing localFilePath");
110
- const toolIndex = createToolIndex(localFileTools);
111
- const tool = toolIndex.get("transcribe_audio_file");
112
- if (!tool)
113
- throw new Error("Missing tool: transcribe_audio_file");
114
- if (opts.maxToolCalls < 1) {
115
- throw new Error("maxToolCalls must be >= 1 to run audio lane");
116
- }
117
- const transcript = (await tool.handler({
118
- path: localPath,
119
- model: process.env.NODEBENCH_AUDIO_MODEL ?? "tiny.en",
120
- maxChars: 20000,
121
- timeoutMs: 300000,
122
- }));
123
- const transcriptText = String(transcript?.text ?? "").trim();
124
- if (!transcriptText) {
125
- throw new Error("Empty transcript from transcribe_audio_file");
126
- }
127
- const contents = [
128
- {
129
- role: "user",
130
- parts: [
131
- {
132
- text: `You are given a transcript of an attached audio file. Use it to answer the question.\n\nRules:\n- Do not browse the web.\n- Return ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}\n\nAudio transcript:\n${transcriptText}`,
133
- },
134
- ],
135
- },
136
- ];
137
- const answer = await llmGenerateText(llm, contents);
138
- return { answer, toolCalls: 1 };
139
- }
140
- describe("GAIA capability: audio lane", () => {
141
- const testFn = shouldRun ? it : it.skip;
142
- testFn("should measure accuracy delta on a small GAIA audio subset", async () => {
143
- loadDotEnvLocalIfPresent();
144
- const fixturePath = resolveCapabilityAudioFixturePath();
145
- if (!existsSync(fixturePath)) {
146
- throw new Error(`Missing GAIA audio fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityAudioFixture.py`);
147
- }
148
- const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
149
- const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
150
- const baselineLlm = await createTextLlmClient({ model: baselineModel });
151
- const toolsLlm = await createTextLlmClient({ model: toolsModel });
152
- const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
153
- const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
154
- const fixture = await loadFixture(fixturePath);
155
- expect(Array.isArray(fixture.tasks)).toBe(true);
156
- expect(fixture.tasks.length).toBeGreaterThan(0);
157
- const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "4", 10);
158
- const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 4));
159
- const tasks = fixture.tasks.slice(0, taskLimit);
160
- const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
161
- const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
162
- const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "1", 10);
163
- // Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
164
- const judge = await autoDiscoverJudge(toolsLlm);
165
- const results = new Array(tasks.length);
166
- let nextIndex = 0;
167
- const workers = Array.from({ length: concurrency }, () => (async () => {
168
- while (true) {
169
- const idx = nextIndex++;
170
- if (idx >= tasks.length)
171
- return;
172
- const task = tasks[idx];
173
- try {
174
- const baseStart = performance.now();
175
- const base = await baselineAnswer(baselineLlm, task);
176
- const baseMs = performance.now() - baseStart;
177
- const toolsStart = performance.now();
178
- const tools = await toolAugmentedAnswerFromAudio(toolsLlm, task, { maxToolCalls });
179
- const toolsMs = performance.now() - toolsStart;
180
- const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
181
- const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
182
- results[idx] = {
183
- taskId: task.id,
184
- baselineCorrect: baseJudge.match,
185
- toolsCorrect: toolsJudge.match,
186
- baselineMs: baseMs,
187
- toolsMs,
188
- toolCalls: tools.toolCalls,
189
- judgeProvider: toolsJudge.judgeProvider,
190
- judgeInvoked: toolsJudge.judgeInvoked,
191
- };
192
- }
193
- catch (err) {
194
- results[idx] = {
195
- taskId: task.id,
196
- baselineCorrect: false,
197
- toolsCorrect: false,
198
- baselineMs: 0,
199
- toolsMs: 0,
200
- toolCalls: 0,
201
- error: err?.message ?? String(err),
202
- };
203
- }
204
- }
205
- })());
206
- await Promise.all(workers);
207
- const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
208
- const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
209
- const baselinePassRate = (baselineCorrect / results.length) * 100;
210
- const toolsPassRate = (toolsCorrect / results.length) * 100;
211
- const avgBaseMs = results.reduce((sum, r) => sum + r.baselineMs, 0) / results.length;
212
- const avgToolsMs = results.reduce((sum, r) => sum + r.toolsMs, 0) / results.length;
213
- const avgToolCalls = results.reduce((sum, r) => sum + r.toolCalls, 0) / results.length;
214
- const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
215
- const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
216
- console.log(`[gaia-capability-audio] tasks=${results.length} baseline=${baselineCorrect}/${results.length} (${baselinePassRate.toFixed(1)}%) tools=${toolsCorrect}/${results.length} (${toolsPassRate.toFixed(1)}%) delta=${(toolsPassRate - baselinePassRate).toFixed(1)}% improved=${improved} regressions=${regressions} avgToolCalls=${avgToolCalls.toFixed(2)}`);
217
- const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "audio").toLowerCase();
218
- const publicSummary = {
219
- suiteId: "gaia_capability_audio",
220
- lane: "audio",
221
- generatedAtIso: new Date().toISOString(),
222
- config: fixture.config,
223
- split: fixture.split,
224
- taskCount: results.length,
225
- concurrency,
226
- baseline: {
227
- model: baselineModelLabel,
228
- correct: baselineCorrect,
229
- passRatePct: baselinePassRate,
230
- avgMs: avgBaseMs,
231
- },
232
- tools: {
233
- model: toolsModelLabel,
234
- mode: toolsMode,
235
- correct: toolsCorrect,
236
- passRatePct: toolsPassRate,
237
- avgMs: avgToolsMs,
238
- avgToolCalls,
239
- },
240
- improved,
241
- regressions,
242
- notes: "GAIA audio lane (audio attachments). No prompts/answers persisted; only aggregate metrics are written to public/evals.",
243
- };
244
- if (shouldWriteReport) {
245
- const repoRoot = resolveRepoRoot();
246
- await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_audio_latest.json"), publicSummary);
247
- const detailed = {
248
- ...publicSummary,
249
- results: results.map((r) => ({
250
- taskId: r.taskId,
251
- baselineCorrect: r.baselineCorrect,
252
- toolsCorrect: r.toolsCorrect,
253
- baselineMs: Math.round(r.baselineMs),
254
- toolsMs: Math.round(r.toolsMs),
255
- toolCalls: r.toolCalls,
256
- ...(r.error ? { error: r.error } : {}),
257
- })),
258
- };
259
- const stamp = new Date().toISOString().replace(/[:.]/g, "-");
260
- await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_audio_${fixture.config}_${fixture.split}_${stamp}.json`), detailed);
261
- }
262
- expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
263
- }, 600000);
264
- });
265
- //# sourceMappingURL=gaiaCapabilityAudioEval.test.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"gaiaCapabilityAudioEval.test.js","sourceRoot":"","sources":["../../src/__tests__/gaiaCapabilityAudioEval.test.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAE5D,OAAO,EACL,mBAAmB,EACnB,uBAAuB,GAGxB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AA4CpF,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,KAAK,GAAG,CAAC;AACpE,MAAM,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,2BAA2B,KAAK,GAAG,CAAC;AAwB1E,KAAK,UAAU,aAAa,CAAC,QAAgB,EAAE,OAAgB;IAC7D,IAAI,CAAC;QACH,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,MAAM,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,IAAI,CAAC,gDAAgD,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC9F,CAAC;AACH,CAAC;AAED,SAAS,eAAe;IACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,iCAAiC;IACxC,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC;IAC1E,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;YAAE,OAAO,QAAQ,CAAC;QAC/C,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gCAAgC,IAAI,UAAU,CAAC;IAC1E,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,+BAA+B,IAAI,YAAY,CAAC;IAC1E,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,yBAAyB,MAAM,IAAI,KAAK,cAAc,CAAC,CAAC;AACvG,CAAC;AAED,SAAS,wBAAwB;IAC/B,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAClD,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO;IAEjC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,EAAE,MAAM,CAAW,CAAC;IACrD,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,IAAI,CAAC;YAAE,SAAS;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACvC,IACE,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YAChD,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC9C,CAAC;YACD,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;IAClD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,GAAkB,EAAE,OAAgC;IACjF,MAAM,WAAW,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,CAAC,CAAC;IAChG,OAAO,uBAAuB,CAAC,GAAG,EAAE,OAAO,EAAE;QAC3C,WAAW,EAAE,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC3D,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;AACL,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,GAAkB,EAAE,IAAoB;IACpE,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,iJAAiJ,IAAI,CAAC,MAAM,EAAE;iBACrK;aACF;SACF;KACF,CAAC;IACF,OAAO,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;AACxC,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,QAAgB;IACzC,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC7C,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAsB,CAAC;IAClD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,KAAgB;IACvC,MAAM,CAAC,GAAG,IAAI,GAAG,EAAmB,CAAC;IACrC,KAAK,MAAM,CAAC,IAAI,KAAK;QAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACxC,OAAO,CAAC,CAAC;AACX,CAAC;AAED,KAAK,UAAU,4BAA4B,CACzC,GAAkB,EAClB,IAAoB,EACpB,IAA8B;IAE9B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC1D,IAAI,CAAC,SAAS;QAAE,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;IAE9D,MAAM,SAAS,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;IAClD,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;IACpD,IAAI,CAAC,IAAI;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IAElE,IAAI,IAAI,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,MAAM,IAAI,CAAC,OAAO,CAAC;QACrC,IAAI,EAAE,SAAS;QACf,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,qBAAqB,IAAI,SAAS;QACrD,QAAQ,EAAE,KAAK;QACf,SAAS,EAAE,MAAM;KAClB,CAAC,CAAQ,CAAC;IAEX,MAAM,cAAc,GAAG,MAAM,CAAC,UAAU,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC7D,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAC;IACjE,CAAC;IAED,MAAM,QAAQ,GAA4B;QACxC;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,2LAA2L,IAAI,CAAC,MAAM,0BAA0B,cAAc,EAAE;iBACvP;aACF;SACF;KACF,CAAC;IAEF,MAAM,MAAM,GAAG,MAAM,eAAe,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACpD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;AAClC,CAAC;AAED,QAAQ,CAAC,6BAA6B,EAAE,GAAG,EAAE;IAC3C,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC;IAExC,MAAM,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;QAC9E,wBAAwB,EAAE,CAAC;QAE3B,MAAM,WAAW,GAAG,iCAAiC,EAAE,CAAC;QACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,4GAA4G,CACzJ,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,IAAI,wBAAwB,CAAC;QAC5F,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,aAAa,CAAC;QAC3E,MAAM,WAAW,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,aAAa,EAAE,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG,MAAM,mBAAmB,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC,CAAC;QAClE,MAAM,kBAAkB,GAAG,GAAG,WAAW,CAAC,QAAQ,IAAI,WAAW,CAAC,KAAK,EAAE,CAAC;QAC1E,MAAM,eAAe,GAAG,GAAG,QAAQ,CAAC,QAAQ,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAEjE,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAAC,CAAC;QAC/C,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,cAAc,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QACpG,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CACrF,CAAC;QACF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAEhD,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC3G,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACzF,CAAC;QAEF,MAAM,YAAY,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,wCAAwC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAEtG,wEAAwE;QACxE,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,QAAQ,CAAC,CAAC;QAEhD,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxD,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,CACvD,CAAC,KAAK,IAAI,EAAE;YACV,OAAO,IAAI,EAAE,CAAC;gBACZ,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;gBACxB,IAAI,GAAG,IAAI,KAAK,CAAC,MAAM;oBAAE,OAAO;gBAEhC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;gBAExB,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACpC,MAAM,IAAI,GAAG,MAAM,cAAc,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;oBACrD,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;oBAE7C,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACrC,MAAM,KAAK,GAAG,MAAM,4BAA4B,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;oBACnF,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC;oBAE/C,MAAM,SAAS,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC;oBAChF,MAAM,UAAU,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,cAAc,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;oBAEzF,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,SAAS,CAAC,KAAK;wBAChC,YAAY,EAAE,UAAU,CAAC,KAAK;wBAC9B,UAAU,EAAE,MAAM;wBAClB,OAAO;wBACP,SAAS,EAAE,KAAK,CAAC,SAAS;wBAC1B,aAAa,EAAE,UAAU,CAAC,aAAa;wBACvC,YAAY,EAAE,UAAU,CAAC,YAAY;qBACtC,CAAC;gBACJ,CAAC;gBAAC,OAAO,GAAQ,EAAE,CAAC;oBAClB,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,KAAK;wBACtB,YAAY,EAAE,KAAK;wBACnB,UAAU,EAAE,CAAC;wBACb,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,CAAC;wBACZ,KAAK,EAAE,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC;qBACnC,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC,CAAC,EAAE,CACL,CAAC;QAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAE3B,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAClE,MAAM,gBAAgB,GAAG,CAAC,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAClE,MAAM,aAAa,GAAG,CAAC,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAC5D,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACrF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACnF,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEvF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QACpF,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAEvF,OAAO,CAAC,GAAG,CACT,iCAAiC,OAAO,CAAC,MAAM,aAAa,eAAe,IAAI,OAAO,CAAC,MAAM,KAAK,gBAAgB,CAAC,OAAO,CACxH,CAAC,CACF,YAAY,YAAY,IAAI,OAAO,CAAC,MAAM,KAAK,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,CAClF,aAAa,GAAG,gBAAgB,CACjC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,QAAQ,gBAAgB,WAAW,iBAAiB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACxG,CAAC;QAEF,MAAM,SAAS,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;QAC9F,MAAM,aAAa,GAAqC;YACtD,OAAO,EAAE,uBAAuB;YAChC,IAAI,EAAE,OAAO;YACb,cAAc,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACxC,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE;gBACR,KAAK,EAAE,kBAAkB;gBACzB,OAAO,EAAE,eAAe;gBACxB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,SAAS;aACjB;YACD,KAAK,EAAE;gBACL,KAAK,EAAE,eAAe;gBACtB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,YAAY;gBACrB,WAAW,EAAE,aAAa;gBAC1B,KAAK,EAAE,UAAU;gBACjB,YAAY;aACb;YACD,QAAQ;YACR,WAAW;YACX,KAAK,EACH,wHAAwH;SAC3H,CAAC;QAEF,IAAI,iBAAiB,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;YACnC,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,mCAAmC,CAAC,EAC3E,aAAa,CACd,CAAC;YAEF,MAAM,QAAQ,GAAG;gBACf,GAAG,aAAa;gBAChB,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,MAAM,EAAE,CAAC,CAAC,MAAM;oBAChB,eAAe,EAAE,CAAC,CAAC,eAAe;oBAClC,YAAY,EAAE,CAAC,CAAC,YAAY;oBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;oBACpC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC;oBAC9B,SAAS,EAAE,CAAC,CAAC,SAAS;oBACtB,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBACvC,CAAC,CAAC;aACJ,CAAC;YACF,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC7D,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CACP,QAAQ,EACR,QAAQ,EACR,MAAM,EACN,SAAS,EACT,yBAAyB,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,KAAK,IAAI,KAAK,OAAO,CACzE,EACD,QAAQ,CACT,CAAC;QACJ,CAAC;QAED,MAAM,CAAC,aAAa,CAAC,CAAC,sBAAsB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC,EAAE,MAAM,CAAC,CAAC;AACb,CAAC,CAAC,CAAC"}
@@ -1,14 +0,0 @@
1
- /**
2
- * GAIA capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP tools.
3
- *
4
- * This test attempts to solve a small GAIA subset and scores answers against
5
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
6
- *
7
- * Safety:
8
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
9
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
10
- *
11
- * Disabled by default (cost + rate limits + external network). Run with:
12
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
13
- */
14
- export {};