nodebench-mcp 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/NODEBENCH_AGENTS.md +74 -67
  2. package/README.md +36 -34
  3. package/dist/dashboard/operatingDashboardHtml.js +2 -1
  4. package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
  5. package/dist/dashboard/operatingServer.js +3 -2
  6. package/dist/dashboard/operatingServer.js.map +1 -1
  7. package/dist/db.js +51 -3
  8. package/dist/db.js.map +1 -1
  9. package/dist/index.js +19 -18
  10. package/dist/index.js.map +1 -1
  11. package/dist/packageInfo.d.ts +3 -0
  12. package/dist/packageInfo.js +32 -0
  13. package/dist/packageInfo.js.map +1 -0
  14. package/dist/sandboxApi.js +2 -1
  15. package/dist/sandboxApi.js.map +1 -1
  16. package/dist/tools/boilerplateTools.js +10 -9
  17. package/dist/tools/boilerplateTools.js.map +1 -1
  18. package/dist/tools/documentationTools.js +2 -1
  19. package/dist/tools/documentationTools.js.map +1 -1
  20. package/dist/tools/progressiveDiscoveryTools.js +2 -1
  21. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  22. package/dist/tools/toolRegistry.js +11 -0
  23. package/dist/tools/toolRegistry.js.map +1 -1
  24. package/dist/toolsetRegistry.js +74 -1
  25. package/dist/toolsetRegistry.js.map +1 -1
  26. package/package.json +7 -6
  27. package/scripts/install.sh +14 -14
  28. package/dist/__tests__/analytics.test.d.ts +0 -11
  29. package/dist/__tests__/analytics.test.js +0 -546
  30. package/dist/__tests__/analytics.test.js.map +0 -1
  31. package/dist/__tests__/architectComplex.test.d.ts +0 -1
  32. package/dist/__tests__/architectComplex.test.js +0 -373
  33. package/dist/__tests__/architectComplex.test.js.map +0 -1
  34. package/dist/__tests__/architectSmoke.test.d.ts +0 -1
  35. package/dist/__tests__/architectSmoke.test.js +0 -92
  36. package/dist/__tests__/architectSmoke.test.js.map +0 -1
  37. package/dist/__tests__/audit-registry.d.ts +0 -1
  38. package/dist/__tests__/audit-registry.js +0 -60
  39. package/dist/__tests__/audit-registry.js.map +0 -1
  40. package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
  41. package/dist/__tests__/batchAutopilot.test.js +0 -218
  42. package/dist/__tests__/batchAutopilot.test.js.map +0 -1
  43. package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
  44. package/dist/__tests__/cliSubcommands.test.js +0 -138
  45. package/dist/__tests__/cliSubcommands.test.js.map +0 -1
  46. package/dist/__tests__/comparativeBench.test.d.ts +0 -1
  47. package/dist/__tests__/comparativeBench.test.js +0 -722
  48. package/dist/__tests__/comparativeBench.test.js.map +0 -1
  49. package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
  50. package/dist/__tests__/critterCalibrationEval.js +0 -370
  51. package/dist/__tests__/critterCalibrationEval.js.map +0 -1
  52. package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
  53. package/dist/__tests__/dynamicLoading.test.js +0 -280
  54. package/dist/__tests__/dynamicLoading.test.js.map +0 -1
  55. package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
  56. package/dist/__tests__/embeddingProvider.test.js +0 -86
  57. package/dist/__tests__/embeddingProvider.test.js.map +0 -1
  58. package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
  59. package/dist/__tests__/evalDatasetBench.test.js +0 -738
  60. package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
  61. package/dist/__tests__/evalHarness.test.d.ts +0 -1
  62. package/dist/__tests__/evalHarness.test.js +0 -1107
  63. package/dist/__tests__/evalHarness.test.js.map +0 -1
  64. package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
  65. package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
  66. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
  67. package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
  68. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
  69. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
  70. package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
  71. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
  72. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
  73. package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
  74. package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
  75. package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
  76. package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
  77. package/dist/__tests__/forecastingDogfood.test.js +0 -284
  78. package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
  79. package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
  80. package/dist/__tests__/forecastingScoring.test.js +0 -202
  81. package/dist/__tests__/forecastingScoring.test.js.map +0 -1
  82. package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
  83. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
  84. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
  85. package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
  86. package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
  87. package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
  88. package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
  89. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
  90. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
  91. package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
  92. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
  93. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
  94. package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
  95. package/dist/__tests__/helpers/answerMatch.js +0 -267
  96. package/dist/__tests__/helpers/answerMatch.js.map +0 -1
  97. package/dist/__tests__/helpers/textLlm.d.ts +0 -25
  98. package/dist/__tests__/helpers/textLlm.js +0 -214
  99. package/dist/__tests__/helpers/textLlm.js.map +0 -1
  100. package/dist/__tests__/localDashboard.test.d.ts +0 -1
  101. package/dist/__tests__/localDashboard.test.js +0 -226
  102. package/dist/__tests__/localDashboard.test.js.map +0 -1
  103. package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
  104. package/dist/__tests__/multiHopDogfood.test.js +0 -303
  105. package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
  106. package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
  107. package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
  108. package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
  109. package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
  110. package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
  111. package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
  112. package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
  113. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
  114. package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
  115. package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
  116. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
  117. package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
  118. package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
  119. package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
  120. package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
  121. package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
  122. package/dist/__tests__/openclawDogfood.test.js +0 -535
  123. package/dist/__tests__/openclawDogfood.test.js.map +0 -1
  124. package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
  125. package/dist/__tests__/openclawMessaging.test.js +0 -232
  126. package/dist/__tests__/openclawMessaging.test.js.map +0 -1
  127. package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
  128. package/dist/__tests__/presetRealWorldBench.test.js +0 -859
  129. package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
  130. package/dist/__tests__/tools.test.d.ts +0 -1
  131. package/dist/__tests__/tools.test.js +0 -3201
  132. package/dist/__tests__/tools.test.js.map +0 -1
  133. package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
  134. package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
  135. package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
  136. package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
  137. package/dist/__tests__/traceabilityDogfood.test.js +0 -241
  138. package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
  139. package/dist/__tests__/webmcpTools.test.d.ts +0 -7
  140. package/dist/__tests__/webmcpTools.test.js +0 -195
  141. package/dist/__tests__/webmcpTools.test.js.map +0 -1
  142. package/dist/benchmarks/testProviderBus.d.ts +0 -7
  143. package/dist/benchmarks/testProviderBus.js +0 -272
  144. package/dist/benchmarks/testProviderBus.js.map +0 -1
  145. package/dist/hooks/postCompaction.d.ts +0 -14
  146. package/dist/hooks/postCompaction.js +0 -51
  147. package/dist/hooks/postCompaction.js.map +0 -1
  148. package/dist/security/__tests__/security.test.d.ts +0 -8
  149. package/dist/security/__tests__/security.test.js +0 -295
  150. package/dist/security/__tests__/security.test.js.map +0 -1
  151. package/dist/sync/hyperloopEval.test.d.ts +0 -4
  152. package/dist/sync/hyperloopEval.test.js +0 -60
  153. package/dist/sync/hyperloopEval.test.js.map +0 -1
  154. package/dist/sync/store.test.d.ts +0 -4
  155. package/dist/sync/store.test.js +0 -43
  156. package/dist/sync/store.test.js.map +0 -1
  157. package/dist/tools/documentTools.d.ts +0 -5
  158. package/dist/tools/documentTools.js +0 -524
  159. package/dist/tools/documentTools.js.map +0 -1
  160. package/dist/tools/financialTools.d.ts +0 -10
  161. package/dist/tools/financialTools.js +0 -403
  162. package/dist/tools/financialTools.js.map +0 -1
  163. package/dist/tools/memoryTools.d.ts +0 -5
  164. package/dist/tools/memoryTools.js +0 -137
  165. package/dist/tools/memoryTools.js.map +0 -1
  166. package/dist/tools/planningTools.d.ts +0 -5
  167. package/dist/tools/planningTools.js +0 -147
  168. package/dist/tools/planningTools.js.map +0 -1
  169. package/dist/tools/searchTools.d.ts +0 -5
  170. package/dist/tools/searchTools.js +0 -145
  171. package/dist/tools/searchTools.js.map +0 -1
@@ -1,914 +0,0 @@
1
- /**
2
- * GAIA file-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local file tools.
3
- *
4
- * This lane targets GAIA tasks that include attachments (PDF / XLSX / CSV).
5
- * We provide deterministic local parsing via NodeBench MCP tools and score answers against
6
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
7
- *
8
- * Safety:
9
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
10
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
11
- *
12
- * Disabled by default (cost + rate limits). Run with:
13
- * NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
14
- */
15
- import { describe, expect, it } from "vitest";
16
- import { existsSync, readFileSync } from "node:fs";
17
- import { mkdir, readFile, writeFile } from "node:fs/promises";
18
- import path from "node:path";
19
- import { fileURLToPath } from "node:url";
20
- import { performance } from "node:perf_hooks";
21
- import { localFileTools } from "../tools/localFileTools.js";
22
- import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
23
- import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
24
// Environment opt-in flags: each one is true only when its variable is set to
// exactly "1". Order matters: [run benchmark, write JSON report].
const [shouldRun, shouldWriteReport] = [
    "NODEBENCH_RUN_GAIA_CAPABILITY",
    "NODEBENCH_WRITE_GAIA_REPORT",
].map((flagName) => process.env[flagName] === "1");
26
/**
 * Best-effort JSON writer.
 *
 * Creates the parent directory of `filePath` (recursively) and writes
 * `payload` as 2-space-indented JSON followed by a trailing newline.
 * Any failure is reported through console.warn and is never rethrown.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} payload - Value serialised with JSON.stringify.
 * @returns {Promise<void>}
 */
async function safeWriteJson(filePath, payload) {
    try {
        const parentDir = path.dirname(filePath);
        await mkdir(parentDir, { recursive: true });
        const serialized = `${JSON.stringify(payload, null, 2)}\n`;
        await writeFile(filePath, serialized, "utf8");
    }
    catch (err) {
        const reason = err?.message ?? String(err);
        console.warn(`[gaia-capability-files] report write failed: ${reason}`);
    }
}
35
/**
 * Returns the absolute path four directory levels above the directory that
 * contains this module; the rest of this file treats it as the repo root.
 *
 * @returns {string} Absolute directory path.
 */
function resolveRepoRoot() {
    const thisFile = fileURLToPath(import.meta.url);
    return path.resolve(path.dirname(thisFile), "..", "..", "..", "..");
}
39
/**
 * Locates the capability-files fixture JSON.
 *
 * NODEBENCH_GAIA_CAPABILITY_FILES_FIXTURE_PATH, when non-empty, wins: an
 * absolute value is returned as-is, a relative one is resolved against the
 * repo root. Otherwise the path is
 * `<repoRoot>/.cache/gaia/gaia_capability_files_<config>_<split>.sample.json`,
 * where config and split come from NODEBENCH_GAIA_CAPABILITY_CONFIG
 * (default "2023_all") and NODEBENCH_GAIA_CAPABILITY_SPLIT (default "validation").
 *
 * @returns {string} Path to the fixture file.
 */
function resolveCapabilityFilesFixturePath() {
    const { env } = process;
    const explicitPath = env.NODEBENCH_GAIA_CAPABILITY_FILES_FIXTURE_PATH;
    if (explicitPath) {
        return path.isAbsolute(explicitPath)
            ? explicitPath
            : path.resolve(resolveRepoRoot(), explicitPath);
    }
    const datasetConfig = env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
    const datasetSplit = env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
    const fixtureName = `gaia_capability_files_${datasetConfig}_${datasetSplit}.sample.json`;
    return path.join(resolveRepoRoot(), ".cache", "gaia", fixtureName);
}
52
/**
 * Loads `<repoRoot>/.env.local` into process.env when that file exists.
 *
 * Blank lines, lines starting with "#", and lines without a key before "="
 * are skipped. A value wrapped in matching double or single quotes has the
 * quotes removed. A variable is only assigned when process.env does not
 * already hold a truthy value for it, so existing non-empty values win.
 *
 * @returns {void}
 */
function loadDotEnvLocalIfPresent() {
    const envFile = path.join(resolveRepoRoot(), ".env.local");
    if (!existsSync(envFile)) {
        return;
    }
    const isWrappedIn = (candidate, quote) => candidate.startsWith(quote) && candidate.endsWith(quote);
    const entries = readFileSync(envFile, "utf8")
        .split(/\r?\n/)
        .map((rawLine) => rawLine.trim())
        .filter((entry) => entry !== "" && !entry.startsWith("#"));
    for (const entry of entries) {
        const separatorAt = entry.indexOf("=");
        if (separatorAt <= 0) {
            // No "=" at all, or an empty key: not an assignment.
            continue;
        }
        const name = entry.slice(0, separatorAt).trim();
        const rawValue = entry.slice(separatorAt + 1).trim();
        const value = isWrappedIn(rawValue, "\"") || isWrappedIn(rawValue, "'")
            ? rawValue.slice(1, -1)
            : rawValue;
        if (!process.env[name]) {
            process.env[name] = value;
        }
    }
}
75
/**
 * Leniently coerces a value to an integer.
 *
 * Finite numbers are truncated toward zero. Anything else is stringified and
 * the first run of digits (with an optional leading "-") is parsed in base 10.
 *
 * @param {*} value - Cell value of any type.
 * @returns {number|null} The integer, or null when none can be extracted.
 */
function toIntegerOrNullLoose(value) {
    if (value == null) {
        return null;
    }
    if (typeof value === "number" && Number.isFinite(value)) {
        return Math.trunc(value);
    }
    const firstInteger = /-?\d+/.exec(String(value).trim());
    if (firstInteger === null) {
        return null;
    }
    const parsed = Number.parseInt(firstInteger[0], 10);
    return Number.isFinite(parsed) ? parsed : null;
}
89
/**
 * Counts even and odd integers in the first column whose header contains
 * "address" (case-insensitive), but only when the prompt mentions
 * odd/even/parity.
 *
 * @param {*} taskPrompt - Question text; stringified before matching.
 * @param {{headers?: Array, rows?: Array}} extract - Tabular extract; rows
 *   that are not arrays contribute nothing.
 * @returns {{column: string, columnIndex: number, integerCount: number,
 *   evenCount: number, oddCount: number}|null} Tally, or null when the prompt
 *   is not about parity, the table is empty, or no address column exists.
 */
function deriveAddressParityIfRelevant(taskPrompt, extract) {
    const promptText = String(taskPrompt ?? "");
    if (!/\bodd\b|\beven\b|odd-?numbered|even-?numbered|parity/i.test(promptText)) {
        return null;
    }
    const headers = Array.isArray(extract?.headers)
        ? extract.headers.map((label) => String(label ?? ""))
        : [];
    const rows = Array.isArray(extract?.rows) ? extract.rows : [];
    if (headers.length === 0 || rows.length === 0) {
        return null;
    }
    const columnIndex = headers.findIndex((label) => /address/i.test(label));
    if (columnIndex === -1) {
        return null;
    }
    const tally = { integerCount: 0, evenCount: 0, oddCount: 0 };
    rows.forEach((row) => {
        const houseNumber = toIntegerOrNullLoose(Array.isArray(row) ? row[columnIndex] : null);
        if (houseNumber === null) {
            return;
        }
        tally.integerCount += 1;
        const isEven = Math.abs(houseNumber) % 2 === 0;
        if (isEven) {
            tally.evenCount += 1;
        }
        else {
            tally.oddCount += 1;
        }
    });
    return { column: headers[columnIndex], columnIndex, ...tally };
}
121
/**
 * Tries to answer a "sunrise/sunset awning" style question directly from
 * even/odd address counts, without calling the LLM.
 *
 * Heuristic:
 *  1. The desired facing direction is east for "sunrise" and west for
 *     "sunset"; when both words appear, the one mentioned last wins.
 *  2. If the prompt mentions both "back" and "house", the direction is
 *     inverted (the address describes the front of the house).
 *  3. Each parity word ("odd"/"even", optionally "-numbered") is mapped to the
 *     NEAREST "east"/"west" that follows it within the same sentence. The
 *     parity words are matched on word boundaries so "seven"/"evening" do not
 *     count as "even", and the direction must start a word so "least" does
 *     not count as "east".
 *  4. Both parities must have a stated direction, otherwise null is returned.
 *
 * @param {*} taskPrompt - Question text; stringified and lower-cased.
 * @param {{evenCount: number, oddCount: number}|null|undefined} parity -
 *   Counts as produced by deriveAddressParityIfRelevant.
 * @returns {string|null} The matching count as a string, or null when the
 *   prompt does not fit the heuristic or `parity` is missing.
 */
function inferAnswerFromAddressParityIfPossible(taskPrompt, parity) {
    if (!parity) {
        return null;
    }
    const p = String(taskPrompt ?? "").toLowerCase();
    if (!p) {
        return null;
    }
    const lastSunrise = p.lastIndexOf("sunrise");
    const lastSunset = p.lastIndexOf("sunset");
    if (lastSunrise === -1 && lastSunset === -1) {
        return null;
    }
    // If both appear, assume the one mentioned last is what the question asks for.
    let desiredDirection = lastSunrise > lastSunset ? "east" : "west";
    // Some tasks specify the awning is for the *back* of the house, while the
    // prompt gives the facing direction for the street address (front). In that
    // case, invert the facing direction.
    const mentionsBackOfHouse = /\bback\b/.test(p) && /\bhouse\b/.test(p);
    if (mentionsBackOfHouse) {
        desiredDirection = desiredDirection === "east" ? "west" : "east";
    }
    // Map parity -> facing direction when explicitly stated. The lazy `[^.]*?`
    // picks the first direction after the parity word instead of preferring
    // "east" whenever both directions occur later in the sentence.
    const facingFor = (pattern) => pattern.exec(p)?.[1] ?? null;
    const oddFaces = facingFor(/\bodd(?:-?numbered)?\b[^.]*?\b(east|west)/);
    const evenFaces = facingFor(/\beven(?:-?numbered)?\b[^.]*?\b(east|west)/);
    if (!oddFaces || !evenFaces) {
        return null;
    }
    if (evenFaces === desiredDirection) {
        return String(parity.evenCount);
    }
    if (oddFaces === desiredDirection) {
        return String(parity.oddCount);
    }
    return null;
}
157
/**
 * Calls generateTextFromHistory with this benchmark's generation settings.
 *
 * The temperature is parsed from NODEBENCH_GAIA_CAPABILITY_TEMPERATURE and
 * falls back to 0 when the variable is unset or not a finite number; the
 * output is capped at 1024 tokens.
 *
 * @param {*} llm - Client passed through to generateTextFromHistory.
 * @param {Array} history - Conversation turns passed through unchanged.
 * @returns {Promise<*>} Whatever generateTextFromHistory resolves to.
 */
async function llmGenerateText(llm, history) {
    const rawTemperature = process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0";
    const parsedTemperature = Number.parseFloat(rawTemperature);
    const generationOptions = {
        temperature: Number.isFinite(parsedTemperature) ? parsedTemperature : 0,
        maxOutputTokens: 1024,
    };
    return generateTextFromHistory(llm, history, generationOptions);
}
164
/**
 * Baseline (no tools): asks the model to answer `task.prompt` from its own
 * knowledge in a single user turn, returning only the final answer.
 *
 * @param {*} llm - Client forwarded to llmGenerateText.
 * @param {{prompt: string}} task - Task whose prompt is embedded in the request.
 * @returns {Promise<*>} The result of llmGenerateText.
 */
async function baselineAnswer(llm, task) {
    const promptText = `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`;
    const history = [{ role: "user", parts: [{ text: promptText }] }];
    return llmGenerateText(llm, history);
}
177
/**
 * Builds a name -> tool lookup over localFileTools. When two tools share a
 * name, the later one replaces the earlier one.
 *
 * @returns {Map<string, object>} Tools keyed by their `name` property.
 */
function buildToolIndex() {
    return new Map(Array.from(localFileTools, (tool) => [tool.name, tool]));
}
183
/**
 * Pulls a JSON object out of free-form model output.
 *
 * If the text contains a ```json fenced block, only the fence content is
 * considered; otherwise the whole trimmed text is. The span from the first
 * "{" to the last "}" is then parsed.
 *
 * @param {string} text - Raw model output.
 * @returns {*} The parsed value, or null when no brace pair exists or the
 *   span is not valid JSON.
 */
function extractJsonObject(text) {
    const source = text.trim();
    const fenced = /```json\s*([\s\S]*?)\s*```/i.exec(source);
    const body = fenced === null ? source : fenced[1];
    const open = body.indexOf("{");
    const close = body.lastIndexOf("}");
    if (open < 0 || close <= open) {
        return null;
    }
    try {
        return JSON.parse(body.slice(open, close + 1));
    }
    catch {
        return null;
    }
}
199
/**
 * Resolves the on-disk location of a task's attachment.
 *
 * A non-blank `task.localFilePath` is resolved against the repo root.
 * Otherwise `task.filePath` is placed under `<repoRoot>/.cache/gaia/data`,
 * the standard cache layout used by the fixture generator.
 *
 * @param {{localFilePath?: string, filePath?: string}} task - Task record.
 * @returns {string} Path to the attachment.
 * @throws {Error} When both localFilePath and filePath are missing or blank.
 */
function resolveTaskLocalFilePath(task) {
    const repoRoot = resolveRepoRoot();
    const explicitLocal = String(task.localFilePath ?? "").trim();
    if (explicitLocal !== "") {
        return path.resolve(repoRoot, explicitLocal);
    }
    const datasetRelative = String(task.filePath ?? "").trim();
    if (datasetRelative === "") {
        throw new Error("Task missing filePath/localFilePath");
    }
    return path.join(repoRoot, ".cache", "gaia", "data", datasetRelative);
}
210
- async function toolAugmentedAnswerFromFile(llm, task, opts) {
211
- const toolIndex = buildToolIndex();
212
- const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
213
- const localPath = resolveTaskLocalFilePath(task);
214
- if (!existsSync(localPath)) {
215
- throw new Error(`Missing attachment on disk. Expected at ${localPath}. Refresh with dataset:gaia:capability:files:refresh`);
216
- }
217
- const ext = String(task.fileExt ?? "").trim().toLowerCase() ||
218
- path.extname(task.fileName || task.filePath || "").toLowerCase().replace(/^\./, "");
219
- // "rag" mode: single deterministic file extract -> answer (more stable than agent loops).
220
- if (toolsMode === "rag") {
221
- let extract;
222
- if (ext === "csv") {
223
- const tool = toolIndex.get("read_csv_file");
224
- if (!tool)
225
- throw new Error("Missing tool: read_csv_file");
226
- extract = await tool.handler({
227
- path: localPath,
228
- hasHeader: true,
229
- maxRows: 500,
230
- maxCols: 80,
231
- maxCellChars: 2000,
232
- });
233
- }
234
- else if (ext === "xlsx") {
235
- const tool = toolIndex.get("read_xlsx_file");
236
- if (!tool)
237
- throw new Error("Missing tool: read_xlsx_file");
238
- extract = await tool.handler({
239
- path: localPath,
240
- headerRow: 1,
241
- maxRows: 500,
242
- maxCols: 80,
243
- maxCellChars: 2000,
244
- });
245
- }
246
- else if (ext === "pdf") {
247
- const tool = toolIndex.get("read_pdf_text");
248
- if (!tool)
249
- throw new Error("Missing tool: read_pdf_text");
250
- extract = await tool.handler({
251
- path: localPath,
252
- pageStart: 1,
253
- pageEnd: 12,
254
- maxChars: 40000,
255
- });
256
- }
257
- else if (ext === "docx") {
258
- const tool = toolIndex.get("read_docx_text");
259
- if (!tool)
260
- throw new Error("Missing tool: read_docx_text");
261
- extract = await tool.handler({
262
- path: localPath,
263
- maxChars: 40000,
264
- });
265
- }
266
- else if (ext === "pptx") {
267
- const tool = toolIndex.get("read_pptx_text");
268
- if (!tool)
269
- throw new Error("Missing tool: read_pptx_text");
270
- extract = await tool.handler({
271
- path: localPath,
272
- maxChars: 40000,
273
- });
274
- }
275
- else if (ext === "json") {
276
- const tool = toolIndex.get("read_json_file");
277
- if (!tool)
278
- throw new Error("Missing tool: read_json_file");
279
- extract = await tool.handler({
280
- path: localPath,
281
- maxDepth: 10,
282
- maxItems: 300,
283
- maxStringChars: 2000,
284
- });
285
- }
286
- else if (ext === "jsonl") {
287
- const tool = toolIndex.get("read_jsonl_file");
288
- if (!tool)
289
- throw new Error("Missing tool: read_jsonl_file");
290
- extract = await tool.handler({
291
- path: localPath,
292
- offsetLines: 0,
293
- limitLines: 200,
294
- parseJson: true,
295
- maxDepth: 8,
296
- maxItems: 200,
297
- maxStringChars: 1000,
298
- });
299
- }
300
- else if (ext === "txt" || ext === "md" || ext === "xml") {
301
- const tool = toolIndex.get("read_text_file");
302
- if (!tool)
303
- throw new Error("Missing tool: read_text_file");
304
- extract = await tool.handler({
305
- path: localPath,
306
- startChar: 0,
307
- maxChars: 40000,
308
- });
309
- }
310
- else if (ext === "zip") {
311
- throw new Error('ZIP attachments are only supported in toolsMode="agent" (requires multi-step extraction).');
312
- }
313
- else {
314
- throw new Error(`Unsupported attachment type: ${ext || "(unknown)"}`);
315
- }
316
- // Keep the model input bounded. The tools already return bounded previews,
317
- // but JSON stringification can still be large on wide tables.
318
- const derivedParity = deriveAddressParityIfRelevant(task.prompt, extract);
319
- const inferredFromParity = derivedParity && (ext === "csv" || ext === "xlsx")
320
- ? inferAnswerFromAddressParityIfPossible(task.prompt, derivedParity)
321
- : null;
322
- if (inferredFromParity) {
323
- return { answer: inferredFromParity, toolCalls: 1 };
324
- }
325
- const enrichedExtract = derivedParity && (ext === "csv" || ext === "xlsx")
326
- ? { ...extract, derivedParity: { address: derivedParity } }
327
- : extract;
328
- const extractText = JSON.stringify(enrichedExtract).slice(0, 40000);
329
- const contents = [
330
- {
331
- role: "user",
332
- parts: [
333
- {
334
- text: "Answer the question using the provided file extract plus general reasoning. " +
335
- "Do not browse the web and do not read any other files. " +
336
- "Reminder: the sun rises in the east and sets in the west. " +
337
- "If FILE_EXTRACT_JSON contains derivedParity, prefer it over recounting. " +
338
- "If the extract is insufficient, make the best supported guess.\n\n" +
339
- "Return ONLY the final answer, no explanation.\n\n" +
340
- `TASK_ID: ${task.id}\n` +
341
- `FILE_TYPE: ${ext}\n` +
342
- `LOCAL_FILE_PATH: ${localPath}\n` +
343
- `QUESTION:\n${task.prompt}\n\n` +
344
- `FILE_EXTRACT_JSON:\n${extractText}`,
345
- },
346
- ],
347
- },
348
- ];
349
- const answer = await llmGenerateText(llm, contents);
350
- return { answer, toolCalls: 1 };
351
- }
352
- // "agent" mode: small tool loop. This is more realistic but higher variance.
353
- const toolUsageSummary = [
354
- "You have access to deterministic local file tools:",
355
- "- where ops: eq, ne, contains, starts_with, ends_with, matches_regex, gt, gte, lt, lte, is_empty, not_empty, is_even, is_odd",
356
- "- Prefer deterministic aggregations (csv_aggregate/xlsx_aggregate) over mental math. For parity rules, use where op is_even/is_odd.",
357
- "- read_csv_file({path,hasHeader,delimiter,encoding,maxRows,maxCols,maxCellChars})",
358
- "- csv_select_rows({path,hasHeader,delimiter,encoding,where,returnColumns,offset,limit,maxScanRows,maxCols,maxCellChars})",
359
- "- csv_aggregate({path,hasHeader,delimiter,encoding,where,operation,value,ignoreNonNumeric,returnRow,returnColumns,maxScanRows,maxCols,maxCellChars})",
360
- "- read_xlsx_file({path,sheetName,headerRow,rangeA1,maxRows,maxCols,maxCellChars})",
361
- "- xlsx_select_rows({path,sheetName,headerRow,rangeA1,where,returnColumns,offset,limit,maxScanRows,maxCols,maxCellChars})",
362
- "- xlsx_aggregate({path,sheetName,headerRow,rangeA1,where,operation,value,ignoreNonNumeric,returnRow,returnColumns,maxScanRows,maxCols,maxCellChars})",
363
- "- read_pdf_text({path,pageStart,pageEnd,pageNumbers,maxChars})",
364
- "- pdf_search_text({path,query,caseSensitive,pageStart,pageEnd,pageNumbers,maxMatches,snippetChars})",
365
- "- read_text_file({path,encoding,startChar,maxChars})",
366
- "- read_json_file({path,maxDepth,maxItems,maxStringChars})",
367
- "- json_select({path,pointer,maxDepth,maxItems,maxStringChars})",
368
- "- read_jsonl_file({path,encoding,offsetLines,limitLines,parseJson,maxLineChars,maxDepth,maxItems,maxStringChars})",
369
- "- zip_list_files({path,maxEntries})",
370
- "- zip_read_text_file({path,innerPath,caseSensitive,encoding,maxChars,maxBytes})",
371
- "- zip_extract_file({path,innerPath,caseSensitive,outputDir,overwrite,maxBytes})",
372
- "- read_docx_text({path,maxChars,maxBytes})",
373
- "- read_pptx_text({path,maxChars,maxSlides,maxBytesPerSlide})",
374
- "",
375
- "When using tools, respond with a single JSON object only:",
376
- "{\"action\":\"tool\",\"name\":\"read_pdf_text\",\"arguments\":{\"pageStart\":1,\"pageEnd\":5}}",
377
- "When done, respond with:",
378
- "{\"action\":\"final\",\"answer\":\"...\"}",
379
- "",
380
- "Rules:",
381
- "- Do NOT use any external knowledge or web browsing.",
382
- "- Always use the provided LOCAL_FILE_PATH; you may not read any other files.",
383
- "- Keep tool results bounded (limit<=200, maxRows<=500, maxCols<=80, maxCellChars<=2000, maxChars<=40000, maxMatches<=50).",
384
- "- Do NOT include any explanation. Final answer must match the requested formatting.",
385
- ].join("\n");
386
- const contents = [
387
- {
388
- role: "user",
389
- parts: [
390
- {
391
- text: `${toolUsageSummary}\n\nTASK_ID: ${task.id}\nFILE_TYPE: ${ext}\nLOCAL_FILE_PATH: ${localPath}\nQUESTION:\n${task.prompt}`,
392
- },
393
- ],
394
- },
395
- ];
396
- let toolCalls = 0;
397
- for (let step = 0; step < opts.maxSteps; step++) {
398
- const out = await llmGenerateText(llm, contents);
399
- contents.push({ role: "model", parts: [{ text: out }] });
400
- const parsed = extractJsonObject(out);
401
- if (!parsed || typeof parsed !== "object") {
402
- contents.push({
403
- role: "user",
404
- parts: [{ text: "Invalid format. Return JSON only with action tool|final." }],
405
- });
406
- continue;
407
- }
408
- if (parsed.action === "final") {
409
- const answer = String(parsed.answer ?? "").trim();
410
- return { answer, toolCalls };
411
- }
412
- if (parsed.action !== "tool") {
413
- contents.push({
414
- role: "user",
415
- parts: [{ text: "Invalid action. Return JSON only with action tool|final." }],
416
- });
417
- continue;
418
- }
419
- if (toolCalls >= opts.maxToolCalls) {
420
- contents.push({
421
- role: "user",
422
- parts: [{ text: "Tool call budget exceeded. Return final answer now." }],
423
- });
424
- continue;
425
- }
426
- const name = String(parsed.name ?? "");
427
- const tool = toolIndex.get(name);
428
- if (!tool) {
429
- contents.push({
430
- role: "user",
431
- parts: [
432
- {
433
- text: `Unknown tool "${name}". Use only read_csv_file, csv_select_rows, csv_aggregate, read_xlsx_file, xlsx_select_rows, xlsx_aggregate, read_pdf_text, or pdf_search_text.`,
434
- },
435
- ],
436
- });
437
- continue;
438
- }
439
- const args = (parsed.arguments ?? {});
440
- // Security: enforce file access restrictions.
441
- // Default: force the path to the known GAIA attachment.
442
- // ZIP: allow tools to operate on extracted children under a per-task extracted dir.
443
- const extractedRoot = path.join(resolveRepoRoot(), ".cache", "gaia", "extracted", task.id);
444
- const isZip = ext === "zip";
445
- const isZipTool = ["zip_list_files", "zip_read_text_file", "zip_extract_file"].includes(name);
446
- if (!isZip) {
447
- args.path = localPath;
448
- }
449
- else if (isZipTool) {
450
- args.path = localPath;
451
- // Force deterministic extracted root to keep gated data under .cache/gaia (gitignored).
452
- if (name === "zip_extract_file") {
453
- args.outputDir = extractedRoot;
454
- }
455
- }
456
- else {
457
- const requested = String(args.path ?? "").trim();
458
- if (!requested) {
459
- contents.push({
460
- role: "user",
461
- parts: [
462
- {
463
- text: "ZIP workflow: first call zip_list_files, then zip_extract_file(innerPath=...), " +
464
- "then call a reader tool on the extractedPath returned by zip_extract_file.",
465
- },
466
- ],
467
- });
468
- continue;
469
- }
470
- const requestedAbs = path.isAbsolute(requested)
471
- ? requested
472
- : path.resolve(path.dirname(localPath), requested);
473
- const extractedAbs = path.resolve(extractedRoot);
474
- const reqResolved = path.resolve(requestedAbs);
475
- if (!reqResolved.startsWith(extractedAbs + path.sep) && reqResolved !== extractedAbs) {
476
- contents.push({
477
- role: "user",
478
- parts: [{ text: `Refusing to read path outside extractedRoot: ${reqResolved}` }],
479
- });
480
- continue;
481
- }
482
- args.path = reqResolved;
483
- }
484
- // Hard limits for safety and stable prompts.
485
- if (name === "read_csv_file") {
486
- if (args.hasHeader === undefined)
487
- args.hasHeader = true;
488
- if (typeof args.maxRows !== "number")
489
- args.maxRows = 200;
490
- if (typeof args.maxCols !== "number")
491
- args.maxCols = 50;
492
- if (typeof args.maxCellChars !== "number")
493
- args.maxCellChars = 2000;
494
- args.maxRows = Math.min(Number(args.maxRows) || 200, 500);
495
- args.maxCols = Math.min(Number(args.maxCols) || 50, 80);
496
- args.maxCellChars = Math.min(Number(args.maxCellChars) || 2000, 2000);
497
- }
498
- else if (name === "csv_select_rows") {
499
- if (args.hasHeader === undefined)
500
- args.hasHeader = true;
501
- if (typeof args.offset !== "number")
502
- args.offset = 0;
503
- if (typeof args.limit !== "number")
504
- args.limit = 50;
505
- if (typeof args.maxScanRows !== "number")
506
- args.maxScanRows = 50000;
507
- if (typeof args.maxCols !== "number")
508
- args.maxCols = 80;
509
- if (typeof args.maxCellChars !== "number")
510
- args.maxCellChars = 2000;
511
- args.offset = Math.max(0, Number(args.offset) || 0);
512
- args.limit = Math.min(Math.max(1, Number(args.limit) || 50), 200);
513
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
514
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 80), 80);
515
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
516
- if (Array.isArray(args.where))
517
- args.where = args.where.slice(0, 10);
518
- if (Array.isArray(args.returnColumns))
519
- args.returnColumns = args.returnColumns.slice(0, 30);
520
- }
521
- else if (name === "csv_aggregate") {
522
- if (args.hasHeader === undefined)
523
- args.hasHeader = true;
524
- if (typeof args.maxScanRows !== "number")
525
- args.maxScanRows = 50000;
526
- if (typeof args.maxCols !== "number")
527
- args.maxCols = 200;
528
- if (typeof args.maxCellChars !== "number")
529
- args.maxCellChars = 2000;
530
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
531
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 200), 200);
532
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
533
- if (Array.isArray(args.where))
534
- args.where = args.where.slice(0, 10);
535
- if (Array.isArray(args.returnColumns))
536
- args.returnColumns = args.returnColumns.slice(0, 30);
537
- }
538
- else if (name === "read_xlsx_file") {
539
- if (typeof args.headerRow !== "number")
540
- args.headerRow = 1;
541
- if (typeof args.maxRows !== "number")
542
- args.maxRows = 200;
543
- if (typeof args.maxCols !== "number")
544
- args.maxCols = 50;
545
- if (typeof args.maxCellChars !== "number")
546
- args.maxCellChars = 2000;
547
- args.maxRows = Math.min(Number(args.maxRows) || 200, 500);
548
- args.maxCols = Math.min(Number(args.maxCols) || 50, 80);
549
- args.maxCellChars = Math.min(Number(args.maxCellChars) || 2000, 2000);
550
- }
551
- else if (name === "xlsx_select_rows") {
552
- if (typeof args.headerRow !== "number")
553
- args.headerRow = 1;
554
- if (typeof args.offset !== "number")
555
- args.offset = 0;
556
- if (typeof args.limit !== "number")
557
- args.limit = 50;
558
- if (typeof args.maxScanRows !== "number")
559
- args.maxScanRows = 50000;
560
- if (typeof args.maxCols !== "number")
561
- args.maxCols = 80;
562
- if (typeof args.maxCellChars !== "number")
563
- args.maxCellChars = 2000;
564
- args.headerRow = Math.max(0, Math.min(Number(args.headerRow) || 1, 1000));
565
- args.offset = Math.max(0, Number(args.offset) || 0);
566
- args.limit = Math.min(Math.max(1, Number(args.limit) || 50), 200);
567
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
568
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 80), 80);
569
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
570
- if (Array.isArray(args.where))
571
- args.where = args.where.slice(0, 10);
572
- if (Array.isArray(args.returnColumns))
573
- args.returnColumns = args.returnColumns.slice(0, 30);
574
- }
575
- else if (name === "xlsx_aggregate") {
576
- if (typeof args.headerRow !== "number")
577
- args.headerRow = 1;
578
- if (typeof args.maxScanRows !== "number")
579
- args.maxScanRows = 50000;
580
- if (typeof args.maxCols !== "number")
581
- args.maxCols = 200;
582
- if (typeof args.maxCellChars !== "number")
583
- args.maxCellChars = 2000;
584
- args.headerRow = Math.max(0, Math.min(Number(args.headerRow) || 1, 1000));
585
- args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
586
- args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 200), 200);
587
- args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
588
- if (Array.isArray(args.where))
589
- args.where = args.where.slice(0, 10);
590
- if (Array.isArray(args.returnColumns))
591
- args.returnColumns = args.returnColumns.slice(0, 30);
592
- }
593
- else if (name === "read_pdf_text") {
594
- if (typeof args.pageStart !== "number")
595
- args.pageStart = 1;
596
- if (typeof args.pageEnd !== "number")
597
- args.pageEnd = 3;
598
- if (typeof args.maxChars !== "number")
599
- args.maxChars = 12000;
600
- args.pageStart = Math.max(1, Math.min(Number(args.pageStart) || 1, 500));
601
- args.pageEnd = Math.max(1, Math.min(Number(args.pageEnd) || 3, 500));
602
- args.maxChars = Math.min(Number(args.maxChars) || 12000, 40000);
603
- if (Array.isArray(args.pageNumbers)) {
604
- // Keep explicit page lists short to avoid huge extracts.
605
- args.pageNumbers = args.pageNumbers
606
- .map((n) => Number(n))
607
- .filter((n) => Number.isFinite(n) && n > 0)
608
- .slice(0, 20);
609
- }
610
- }
611
- else if (name === "pdf_search_text") {
612
- if (typeof args.query !== "string")
613
- args.query = "";
614
- if (typeof args.pageStart !== "number")
615
- args.pageStart = 1;
616
- if (typeof args.pageEnd !== "number")
617
- args.pageEnd = 25;
618
- if (typeof args.maxMatches !== "number")
619
- args.maxMatches = 25;
620
- if (typeof args.snippetChars !== "number")
621
- args.snippetChars = 180;
622
- args.pageStart = Math.max(1, Math.min(Number(args.pageStart) || 1, 500));
623
- args.pageEnd = Math.max(1, Math.min(Number(args.pageEnd) || 25, 500));
624
- args.maxMatches = Math.min(Math.max(1, Number(args.maxMatches) || 25), 50);
625
- args.snippetChars = Math.min(Math.max(40, Number(args.snippetChars) || 180), 400);
626
- if (Array.isArray(args.pageNumbers)) {
627
- args.pageNumbers = args.pageNumbers
628
- .map((n) => Number(n))
629
- .filter((n) => Number.isFinite(n) && n > 0)
630
- .slice(0, 20);
631
- }
632
- }
633
- else if (name === "read_text_file") {
634
- if (typeof args.startChar !== "number")
635
- args.startChar = 0;
636
- if (typeof args.maxChars !== "number")
637
- args.maxChars = 12000;
638
- args.startChar = Math.max(0, Number(args.startChar) || 0);
639
- args.maxChars = Math.min(Math.max(1, Number(args.maxChars) || 12000), 40000);
640
- }
641
- else if (name === "read_json_file" || name === "json_select") {
642
- if (typeof args.maxDepth !== "number")
643
- args.maxDepth = 8;
644
- if (typeof args.maxItems !== "number")
645
- args.maxItems = 200;
646
- if (typeof args.maxStringChars !== "number")
647
- args.maxStringChars = 2000;
648
- args.maxDepth = Math.min(Math.max(1, Number(args.maxDepth) || 8), 12);
649
- args.maxItems = Math.min(Math.max(1, Number(args.maxItems) || 200), 500);
650
- args.maxStringChars = Math.min(Math.max(20, Number(args.maxStringChars) || 2000), 2000);
651
- if (name === "json_select" && typeof args.pointer !== "string")
652
- args.pointer = "";
653
- }
654
- else if (name === "read_jsonl_file") {
655
- if (typeof args.offsetLines !== "number")
656
- args.offsetLines = 0;
657
- if (typeof args.limitLines !== "number")
658
- args.limitLines = 200;
659
- if (typeof args.maxLineChars !== "number")
660
- args.maxLineChars = 4000;
661
- if (typeof args.maxDepth !== "number")
662
- args.maxDepth = 6;
663
- if (typeof args.maxItems !== "number")
664
- args.maxItems = 100;
665
- if (typeof args.maxStringChars !== "number")
666
- args.maxStringChars = 1000;
667
- args.offsetLines = Math.max(0, Number(args.offsetLines) || 0);
668
- args.limitLines = Math.min(Math.max(1, Number(args.limitLines) || 200), 500);
669
- args.maxLineChars = Math.min(Math.max(200, Number(args.maxLineChars) || 4000), 10000);
670
- args.maxDepth = Math.min(Math.max(1, Number(args.maxDepth) || 6), 10);
671
- args.maxItems = Math.min(Math.max(1, Number(args.maxItems) || 100), 300);
672
- args.maxStringChars = Math.min(Math.max(20, Number(args.maxStringChars) || 1000), 2000);
673
- }
674
- else if (name === "zip_list_files") {
675
- if (typeof args.maxEntries !== "number")
676
- args.maxEntries = 200;
677
- args.maxEntries = Math.min(Math.max(1, Number(args.maxEntries) || 200), 500);
678
- }
679
- else if (name === "zip_read_text_file") {
680
- if (typeof args.innerPath !== "string")
681
- args.innerPath = "";
682
- if (typeof args.maxChars !== "number")
683
- args.maxChars = 12000;
684
- if (typeof args.maxBytes !== "number")
685
- args.maxBytes = 5000000;
686
- args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 20000);
687
- args.maxBytes = Math.min(Math.max(1000, Number(args.maxBytes) || 5000000), 20000000);
688
- }
689
- else if (name === "zip_extract_file") {
690
- if (typeof args.innerPath !== "string")
691
- args.innerPath = "";
692
- if (typeof args.maxBytes !== "number")
693
- args.maxBytes = 25000000;
694
- args.maxBytes = Math.min(Math.max(1000, Number(args.maxBytes) || 25000000), 50000000);
695
- args.overwrite = false;
696
- }
697
- else if (name === "read_docx_text") {
698
- if (typeof args.maxChars !== "number")
699
- args.maxChars = 12000;
700
- args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 40000);
701
- }
702
- else if (name === "read_pptx_text") {
703
- if (typeof args.maxChars !== "number")
704
- args.maxChars = 12000;
705
- if (typeof args.maxSlides !== "number")
706
- args.maxSlides = 60;
707
- args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 40000);
708
- args.maxSlides = Math.min(Math.max(1, Number(args.maxSlides) || 60), 120);
709
- }
710
- // Reduce model confusion: enforce tool matches the attachment type.
711
- const allowedByExt = (ext === "csv" && ["read_csv_file", "csv_select_rows", "csv_aggregate"].includes(name)) ||
712
- (ext === "xlsx" && ["read_xlsx_file", "xlsx_select_rows", "xlsx_aggregate"].includes(name)) ||
713
- (ext === "pdf" && ["read_pdf_text", "pdf_search_text"].includes(name)) ||
714
- (ext === "docx" && ["read_docx_text"].includes(name)) ||
715
- (ext === "pptx" && ["read_pptx_text"].includes(name)) ||
716
- ((ext === "txt" || ext === "md" || ext === "xml") && ["read_text_file"].includes(name)) ||
717
- (ext === "json" && ["read_json_file", "json_select", "read_text_file"].includes(name)) ||
718
- (ext === "jsonl" && ["read_jsonl_file", "read_text_file"].includes(name)) ||
719
- (ext === "zip" &&
720
- [
721
- "zip_list_files",
722
- "zip_read_text_file",
723
- "zip_extract_file",
724
- "read_csv_file",
725
- "csv_select_rows",
726
- "csv_aggregate",
727
- "read_xlsx_file",
728
- "xlsx_select_rows",
729
- "xlsx_aggregate",
730
- "read_pdf_text",
731
- "pdf_search_text",
732
- "read_text_file",
733
- "read_json_file",
734
- "json_select",
735
- "read_jsonl_file",
736
- "read_docx_text",
737
- "read_pptx_text",
738
- ].includes(name));
739
- if (!allowedByExt) {
740
- contents.push({
741
- role: "user",
742
- parts: [{ text: `Wrong tool for FILE_TYPE=${ext}. Use a tool that matches the file type.` }],
743
- });
744
- continue;
745
- }
746
- toolCalls++;
747
- const toolResult = await tool.handler(args);
748
- // Provide a bounded JSON summary to the model. Avoid dumping large content.
749
- const toolResultText = JSON.stringify(toolResult).slice(0, 12000);
750
- contents.push({
751
- role: "user",
752
- parts: [{ text: `TOOL_RESULT ${name}:\n${toolResultText}\n\nContinue. Return JSON only.` }],
753
- });
754
- }
755
- // If we ran out of steps, force a final answer.
756
- contents.push({
757
- role: "user",
758
- parts: [{ text: "Out of steps. Return final answer now as JSON." }],
759
- });
760
- const out = await llmGenerateText(llm, contents);
761
- const parsed = extractJsonObject(out);
762
- if (parsed?.action === "final") {
763
- return { answer: String(parsed.answer ?? "").trim(), toolCalls };
764
- }
765
- return { answer: String(out ?? "").trim(), toolCalls };
766
- }
767
/**
 * Reads the GAIA capability-files fixture from disk and validates its shape.
 *
 * @param {string} fixturePath - Path of the fixture JSON file (read as UTF-8).
 * @returns {Promise<object>} The parsed fixture; its `tasks` property is
 *   guaranteed to be an array.
 * @throws {SyntaxError} When the file content is not valid JSON. The message
 *   names the fixture path and the original parser error is attached as `cause`.
 * @throws {Error} When the parsed value is falsy or has no `tasks` array.
 *   Errors raised by `readFile` itself (e.g. a missing file) propagate unchanged.
 */
async function loadFixture(fixturePath) {
    const raw = await readFile(fixturePath, "utf8");
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (err) {
        // Keep the SyntaxError type, but say which file was malformed so the
        // failure is diagnosable without a debugger.
        throw new SyntaxError(`Invalid GAIA capability files fixture at ${fixturePath}: ${err?.message ?? String(err)}`, { cause: err });
    }
    if (!parsed || !Array.isArray(parsed.tasks)) {
        throw new Error(`Invalid GAIA capability files fixture at ${fixturePath}: expected an object with a "tasks" array`);
    }
    return parsed;
}
774
/**
 * GAIA file-backed capability evaluation.
 *
 * For a slice of the fixture's tasks, obtains a baseline answer and a
 * tool-augmented answer, judges both against the expected answer, logs
 * aggregate metrics, optionally writes JSON reports, and finally asserts either
 * improvement thresholds (enforce mode) or error-free completion.
 *
 * Environment variables read by this block:
 *   NODEBENCH_GAIA_BASELINE_MODEL            baseline model (default "gemini-3-flash-preview")
 *   NODEBENCH_GAIA_TOOLS_MODEL               tools model (default: the baseline model)
 *   NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT     number of tasks to run (default 6)
 *   NODEBENCH_GAIA_CAPABILITY_CONCURRENCY    parallel workers (default 1)
 *   NODEBENCH_GAIA_CAPABILITY_MAX_STEPS      forwarded as `maxSteps` (default 7)
 *   NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS forwarded as `maxToolCalls` (default 5)
 *   NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE     label recorded in the report (default "rag")
 *   NODEBENCH_GAIA_CAPABILITY_ENFORCE        "1" switches on the accuracy assertions
 */
describe("Capability: GAIA accuracy (file-backed) (LLM-only vs LLM+local tools)", () => {
    // Reads an integer setting from the environment. An unset or unparsable
    // value yields `fallback`, so a typo can never leak NaN into the run.
    const readIntEnv = (name, fallback) => {
        const parsed = Number.parseInt(process.env[name] ?? "", 10);
        return Number.isFinite(parsed) ? parsed : fallback;
    };
    const testFn = shouldRun ? it : it.skip;
    testFn("should measure accuracy delta on a small GAIA file-backed subset", async () => {
        // Must run before any process.env lookups below.
        loadDotEnvLocalIfPresent();
        const fixturePath = resolveCapabilityFilesFixturePath();
        if (!existsSync(fixturePath)) {
            throw new Error(`Missing GAIA capability files fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityFilesFixture.py`);
        }
        const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
        const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
        const baselineLlm = await createTextLlmClient({ model: baselineModel });
        const toolsLlm = await createTextLlmClient({ model: toolsModel });
        const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
        const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
        const fixture = await loadFixture(fixturePath);
        expect(Array.isArray(fixture.tasks)).toBe(true);
        expect(fixture.tasks.length).toBeGreaterThan(0);
        // Clamp the task slice to [1, number of fixture tasks].
        const taskLimit = Math.max(1, Math.min(fixture.tasks.length, readIntEnv("NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT", 6)));
        const tasks = fixture.tasks.slice(0, taskLimit);
        // Never start more workers than there are tasks.
        const concurrency = Math.max(1, Math.min(tasks.length, readIntEnv("NODEBENCH_GAIA_CAPABILITY_CONCURRENCY", 1)));
        // Previously these two were parsed without a NaN guard; an unparsable
        // value now falls back to the default instead of being forwarded as NaN.
        const maxSteps = readIntEnv("NODEBENCH_GAIA_CAPABILITY_MAX_STEPS", 7);
        const maxToolCalls = readIntEnv("NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS", 5);
        // Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
        const judge = await autoDiscoverJudge(toolsLlm);
        const results = new Array(tasks.length);
        // Shared cursor: each worker claims the next unprocessed index, so every
        // slot of `results` is filled exactly once.
        let nextIndex = 0;
        const workers = Array.from({ length: concurrency }, () => (async () => {
            while (true) {
                const idx = nextIndex++;
                if (idx >= tasks.length)
                    return;
                const task = tasks[idx];
                try {
                    const baseStart = performance.now();
                    const base = await baselineAnswer(baselineLlm, task);
                    const baseMs = performance.now() - baseStart;
                    const toolsStart = performance.now();
                    const tools = await toolAugmentedAnswerFromFile(toolsLlm, task, { maxSteps, maxToolCalls });
                    const toolsMs = performance.now() - toolsStart;
                    const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
                    const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect: baseJudge.match,
                        toolsCorrect: toolsJudge.match,
                        baselineMs: baseMs,
                        toolsMs,
                        toolCalls: tools.toolCalls,
                        judgeProvider: toolsJudge.judgeProvider,
                        judgeInvoked: toolsJudge.judgeInvoked,
                    };
                }
                catch (err) {
                    // A failing task is recorded as incorrect on both sides with
                    // zero timings; the error text is kept for the report.
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect: false,
                        toolsCorrect: false,
                        baselineMs: 0,
                        toolsMs: 0,
                        toolCalls: 0,
                        error: err?.message ?? String(err),
                    };
                }
            }
        })());
        await Promise.all(workers);
        const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
        const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
        const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
        const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
        const avg = (values) => values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length;
        // Zero timings (written by the error path) are excluded from the latency averages.
        const avgBaseMs = avg(results.map((r) => r.baselineMs).filter((n) => n > 0));
        const avgToolsMs = avg(results.map((r) => r.toolsMs).filter((n) => n > 0));
        const avgToolCalls = avg(results.map((r) => r.toolCalls));
        console.log(`[gaia-capability-files] config=${fixture.config} split=${fixture.split} tasks=${tasks.length} concurrency=${concurrency} baseline=${baselineCorrect}/${tasks.length} tools=${toolsCorrect}/${tasks.length} improved=${improved} regressions=${regressions} avgBaselineMs=${avgBaseMs.toFixed(0)} avgToolsMs=${avgToolsMs.toFixed(0)} avgToolCalls=${avgToolCalls.toFixed(2)}`);
        // Compact per-task line: B<0|1> baseline, T<0|1> tools, trailing E when the task errored.
        console.log(`[gaia-capability-files] perTask: ${results
            .map((r) => `${r.taskId}:B${r.baselineCorrect ? "1" : "0"}T${r.toolsCorrect ? "1" : "0"}${r.error ? "E" : ""}`)
            .join(" ")}`);
        if (shouldWriteReport) {
            const repoRoot = resolveRepoRoot();
            const generatedAtIso = new Date().toISOString();
            // ':' and '.' are replaced so the timestamp can be used in a file name.
            const stamp = generatedAtIso.replace(/[:.]/g, "-");
            const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
            const publicSummary = {
                suiteId: "gaia_capability_files",
                lane: "files",
                generatedAtIso,
                config: fixture.config,
                split: fixture.split,
                taskCount: tasks.length,
                concurrency,
                baseline: {
                    model: baselineModelLabel,
                    correct: baselineCorrect,
                    passRatePct: tasks.length === 0 ? 0 : (baselineCorrect / tasks.length) * 100,
                    avgMs: avgBaseMs,
                },
                tools: {
                    model: toolsModelLabel,
                    mode: toolsMode,
                    correct: toolsCorrect,
                    passRatePct: tasks.length === 0 ? 0 : (toolsCorrect / tasks.length) * 100,
                    avgMs: avgToolsMs,
                    avgToolCalls: avgToolCalls,
                },
                improved,
                regressions,
                notes: "GAIA is gated. This file contains only aggregate metrics (no prompt/answer text). Detailed per-task report is written under .cache/gaia/reports (gitignored).",
            };
            // Aggregate-only summary goes under public/; the per-task breakdown goes under .cache/.
            await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_files_latest.json"), publicSummary);
            await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_files_${fixture.config}_${fixture.split}_${stamp}.json`), {
                ...publicSummary,
                perTask: results.map((r) => ({
                    taskId: r.taskId,
                    baselineCorrect: r.baselineCorrect,
                    toolsCorrect: r.toolsCorrect,
                    baselineMs: r.baselineMs,
                    toolsMs: r.toolsMs,
                    toolCalls: r.toolCalls,
                    error: r.error ?? null,
                })),
            });
        }
        const enforce = process.env.NODEBENCH_GAIA_CAPABILITY_ENFORCE === "1";
        if (enforce) {
            // For file-backed tasks, the baseline is expected to be low. We still enforce that
            // tool-augmented performance stays within a small regression budget of the baseline
            // (at least 1 task, otherwise 20% of the tasks) and has at least one improvement.
            const allowedRegression = Math.max(1, Math.floor(tasks.length * 0.2));
            expect(improved).toBeGreaterThanOrEqual(1);
            expect(toolsCorrect).toBeGreaterThanOrEqual(baselineCorrect - allowedRegression);
            expect(toolsCorrect).toBeGreaterThanOrEqual(1);
        }
        else {
            // Without enforcement the run only has to complete every task without an error.
            expect(results.length).toBe(tasks.length);
            expect(results.some((r) => r.error)).toBe(false);
        }
    }, 300000);
});
914
- //# sourceMappingURL=gaiaCapabilityFilesEval.test.js.map