nodebench-mcp 2.14.1 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +19 -9
- package/README.md +42 -19
- package/dist/__tests__/critterCalibrationEval.d.ts +8 -0
- package/dist/__tests__/critterCalibrationEval.js +370 -0
- package/dist/__tests__/critterCalibrationEval.js.map +1 -0
- package/dist/__tests__/embeddingProvider.test.d.ts +1 -0
- package/dist/__tests__/embeddingProvider.test.js +86 -0
- package/dist/__tests__/embeddingProvider.test.js.map +1 -0
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +1 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityEval.test.js +541 -27
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +1 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +473 -4
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
- package/dist/__tests__/tools.test.js +1010 -8
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/db.js +64 -0
- package/dist/db.js.map +1 -1
- package/dist/index.js +70 -9
- package/dist/index.js.map +1 -1
- package/dist/tools/critterTools.d.ts +21 -0
- package/dist/tools/critterTools.js +230 -0
- package/dist/tools/critterTools.js.map +1 -0
- package/dist/tools/embeddingProvider.d.ts +67 -0
- package/dist/tools/embeddingProvider.js +299 -0
- package/dist/tools/embeddingProvider.js.map +1 -0
- package/dist/tools/progressiveDiscoveryTools.js +24 -7
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/reconTools.js +83 -33
- package/dist/tools/reconTools.js.map +1 -1
- package/dist/tools/toolRegistry.d.ts +30 -2
- package/dist/tools/toolRegistry.js +253 -25
- package/dist/tools/toolRegistry.js.map +1 -1
- package/package.json +13 -3
|
@@ -187,6 +187,66 @@ function extractJsonObject(text) {
|
|
|
187
187
|
return null;
|
|
188
188
|
}
|
|
189
189
|
}
|
|
190
|
+
/**
 * Builds the path of the on-disk web cache used by the GAIA capability eval:
 * `<repo root>/.cache/gaia/web_cache.json`.
 *
 * @returns {string} The repo root joined with the fixed cache path segments.
 */
function resolveWebCachePath() {
    const cacheSegments = [".cache", "gaia", "web_cache.json"];
    return path.join(resolveRepoRoot(), ...cacheSegments);
}
|
|
193
|
+
// Process-wide memo of the web cache; populated on first loadWebCache() call.
let _webCache = null;
/**
 * Returns the in-memory web cache, loading it from disk on first use.
 *
 * The file at resolveWebCachePath() is read and parsed as JSON. Whatever was
 * parsed is normalised so the result always has object-valued `searches` and
 * `fetches` maps: a missing file, unreadable file, invalid JSON, a non-object
 * payload (e.g. `null` or an array) or a missing/invalid map all degrade to
 * empty maps instead of handing callers a shape they would crash on.
 *
 * @returns {{searches: Object, fetches: Object}} The shared cache object
 *   (the same instance on every subsequent call).
 */
function loadWebCache() {
    if (_webCache)
        return _webCache;
    const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
    const cachePath = resolveWebCachePath();
    let parsed = null;
    try {
        if (existsSync(cachePath)) {
            parsed = JSON.parse(readFileSync(cachePath, "utf8"));
        }
    }
    catch {
        // Best effort: an unreadable or corrupt cache file behaves like an empty cache.
    }
    const base = isRecord(parsed) ? parsed : {};
    _webCache = {
        ...base,
        searches: isRecord(base.searches) ? base.searches : {},
        fetches: isRecord(base.fetches) ? base.fetches : {},
    };
    return _webCache;
}
|
|
209
|
+
/**
 * Persists the in-memory web cache to resolveWebCachePath() as pretty-printed
 * JSON followed by a newline, creating the parent directory if needed.
 *
 * Does nothing when no cache has been loaded. Saving is best effort: a failed
 * mkdir/write never throws, but it is reported with a warning so a missing
 * cache file is not mistaken for a successful save.
 *
 * @returns {Promise<void>}
 */
async function saveWebCache() {
    if (!_webCache)
        return;
    const cachePath = resolveWebCachePath();
    try {
        await mkdir(path.dirname(cachePath), { recursive: true });
        await writeFile(cachePath, JSON.stringify(_webCache, null, 2) + "\n", "utf8");
    }
    catch (err) {
        // Deliberately non-fatal: the eval result matters more than the cache.
        console.warn(`[gaia-capability] failed to save web cache to ${cachePath}: ${err?.message ?? String(err)}`);
    }
}
|
|
219
|
+
/**
 * Canonicalises a search query for use as a cache key: lower-cased, trimmed,
 * and with every run of whitespace collapsed to a single space.
 *
 * @param {unknown} query - The raw query. `null`/`undefined` are treated as
 *   the empty string and other non-strings are stringified, so the function
 *   never throws on unexpected input.
 * @returns {string} The normalised key.
 */
function normalizeSearchKey(query) {
    return String(query ?? "").toLowerCase().trim().replace(/\s+/g, " ");
}
|
|
222
|
+
/**
 * Wraps a web_search handler with the shared web cache.
 *
 * - mode "replay": a cached entry for the normalised query is returned without
 *   calling the handler; on a miss the handler is called and the result stored.
 * - mode "record": the handler is always called and the result stored.
 * - any other mode: the handler is called and nothing is stored.
 *
 * Entries are looked up and stored strictly as own properties, so queries that
 * collide with inherited object members (e.g. "constructor", "__proto__") are
 * treated as ordinary keys rather than as spurious cache hits.
 *
 * @param {(args: object) => Promise<any>} originalHandler - The real search handler.
 * @param {string} mode - Cache mode ("record" or "replay").
 * @returns {(args: object) => Promise<any>} Handler with the same call shape.
 */
function createCachedWebSearch(originalHandler, mode) {
    const cache = loadWebCache();
    const lookup = (key) => Object.prototype.hasOwnProperty.call(cache.searches, key) ? cache.searches[key] : undefined;
    return async (args) => {
        const key = normalizeSearchKey(String(args?.query ?? ""));
        const cached = mode === "replay" ? lookup(key) : undefined;
        if (cached) {
            return cached.result;
        }
        const result = await originalHandler(args);
        if (mode === "record" || mode === "replay") {
            // defineProperty (not assignment) so a "__proto__" key becomes data, not a prototype swap.
            Object.defineProperty(cache.searches, key, {
                value: { query: key, result, timestamp: new Date().toISOString() },
                enumerable: true,
                writable: true,
                configurable: true,
            });
        }
        return result;
    };
}
|
|
236
|
+
/**
 * Wraps a fetch_url handler with the shared web cache, keyed by the trimmed URL.
 *
 * - mode "replay": a cached entry for the URL is returned without calling the
 *   handler; on a miss the handler is called and the result stored.
 * - mode "record": the handler is always called and the result stored.
 * - any other mode: the handler is called and nothing is stored.
 *
 * Entries are looked up and stored strictly as own properties so a key that
 * matches an inherited object member can never produce a spurious cache hit.
 *
 * @param {(args: object) => Promise<any>} originalHandler - The real fetch handler.
 * @param {string} mode - Cache mode ("record" or "replay").
 * @returns {(args: object) => Promise<any>} Handler with the same call shape.
 */
function createCachedFetchUrl(originalHandler, mode) {
    const cache = loadWebCache();
    const lookup = (key) => Object.prototype.hasOwnProperty.call(cache.fetches, key) ? cache.fetches[key] : undefined;
    return async (args) => {
        const key = String(args?.url ?? "").trim();
        const cached = mode === "replay" ? lookup(key) : undefined;
        if (cached) {
            return cached.result;
        }
        const result = await originalHandler(args);
        if (mode === "record" || mode === "replay") {
            // defineProperty (not assignment) so a "__proto__" key becomes data, not a prototype swap.
            Object.defineProperty(cache.fetches, key, {
                value: { url: key, result, timestamp: new Date().toISOString() },
                enumerable: true,
                writable: true,
                configurable: true,
            });
        }
        return result;
    };
}
|
|
190
250
|
async function toolAugmentedAnswer(llm, task, opts) {
|
|
191
251
|
const toolIndex = buildToolIndex();
|
|
192
252
|
const forceWebSearch = process.env.NODEBENCH_GAIA_CAPABILITY_FORCE_WEB_SEARCH === "1";
|
|
@@ -196,36 +256,268 @@ async function toolAugmentedAnswer(llm, task, opts) {
|
|
|
196
256
|
const caesarAnswer = tryCaesarCipherSolve(task);
|
|
197
257
|
if (caesarAnswer)
|
|
198
258
|
return { answer: caesarAnswer, toolCalls: 0 };
|
|
199
|
-
// "rag" mode:
|
|
259
|
+
// "rag" mode: refined search → fetch → link-follow → code-execution answer.
|
|
200
260
|
if (toolsMode === "rag") {
|
|
201
|
-
const
|
|
202
|
-
const
|
|
203
|
-
if (!
|
|
261
|
+
const rawWebSearch = toolIndex.get("web_search");
|
|
262
|
+
const rawFetchUrl = toolIndex.get("fetch_url");
|
|
263
|
+
if (!rawWebSearch || !rawFetchUrl)
|
|
204
264
|
throw new Error("Missing web_search/fetch_url tools");
|
|
205
|
-
|
|
265
|
+
// Apply web cache for deterministic evals
|
|
266
|
+
const cacheMode = (process.env.NODEBENCH_GAIA_WEB_CACHE ?? "").toLowerCase();
|
|
267
|
+
const webSearchHandler = (cacheMode === "record" || cacheMode === "replay")
|
|
268
|
+
? createCachedWebSearch(rawWebSearch.handler, cacheMode)
|
|
269
|
+
: rawWebSearch.handler;
|
|
270
|
+
const fetchUrlHandler = (cacheMode === "record" || cacheMode === "replay")
|
|
271
|
+
? createCachedFetchUrl(rawFetchUrl.handler, cacheMode)
|
|
272
|
+
: rawFetchUrl.handler;
|
|
273
|
+
const promptLower = task.prompt.toLowerCase();
|
|
274
|
+
// Detect if the task requires math/counting — will use code execution for final answer
|
|
275
|
+
const needsMath = promptLower.includes("how many") ||
|
|
276
|
+
promptLower.includes("calculate") ||
|
|
277
|
+
promptLower.includes("compute") ||
|
|
278
|
+
promptLower.includes("p-value") ||
|
|
279
|
+
promptLower.includes("incorrect") ||
|
|
280
|
+
promptLower.includes("percentage") ||
|
|
281
|
+
(promptLower.includes("number") && /\d/.test(task.prompt));
|
|
282
|
+
// Step 1: Generate a focused search query using the LLM
|
|
283
|
+
let searchQuery = task.prompt;
|
|
284
|
+
try {
|
|
285
|
+
const queryContents = [
|
|
286
|
+
{
|
|
287
|
+
role: "user",
|
|
288
|
+
parts: [
|
|
289
|
+
{
|
|
290
|
+
text: "Generate a concise, effective web search query to find the answer to this question. " +
|
|
291
|
+
"Include key names, dates, specific terms, and website names if mentioned. " +
|
|
292
|
+
"Return ONLY the search query, nothing else.\n\n" +
|
|
293
|
+
`QUESTION:\n${task.prompt}`,
|
|
294
|
+
},
|
|
295
|
+
],
|
|
296
|
+
},
|
|
297
|
+
];
|
|
298
|
+
const refined = await llmGenerateText(llm, queryContents);
|
|
299
|
+
if (refined && refined.length > 5 && refined.length < 300) {
|
|
300
|
+
searchQuery = refined;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
catch {
|
|
304
|
+
// Fall back to raw prompt
|
|
305
|
+
}
|
|
306
|
+
// Step 2: Search with refined query
|
|
307
|
+
const search = await webSearchHandler({ query: searchQuery, maxResults: 5, provider: "auto" });
|
|
206
308
|
const urls = Array.isArray(search?.results)
|
|
207
309
|
? search.results
|
|
208
310
|
.map((r) => String(r?.url ?? "").trim())
|
|
209
311
|
.filter((u) => u.startsWith("http"))
|
|
210
|
-
.slice(0,
|
|
312
|
+
.slice(0, 3)
|
|
211
313
|
: [];
|
|
314
|
+
// Step 2b: If the prompt mentions a specific website, do a targeted site search
|
|
315
|
+
const siteTargets = [
|
|
316
|
+
["universe today", "site:universetoday.com"],
|
|
317
|
+
["usgs", "site:usgs.gov", "USGS Nonindigenous Aquatic Species"],
|
|
318
|
+
["nature.com", "site:nature.com"],
|
|
319
|
+
];
|
|
320
|
+
for (const [keyword, sitePrefix, extraTerms] of siteTargets) {
|
|
321
|
+
if (promptLower.includes(keyword)) {
|
|
322
|
+
try {
|
|
323
|
+
// Extract key terms for site-specific search
|
|
324
|
+
const keyTerms = task.prompt
|
|
325
|
+
.replace(/[^\w\s]/g, " ")
|
|
326
|
+
.split(/\s+/)
|
|
327
|
+
.filter((w) => w.length > 3)
|
|
328
|
+
.slice(0, 8)
|
|
329
|
+
.join(" ");
|
|
330
|
+
const siteQuery = extraTerms
|
|
331
|
+
? `${sitePrefix} ${extraTerms} ${keyTerms}`
|
|
332
|
+
: `${sitePrefix} ${keyTerms}`;
|
|
333
|
+
const siteResult = await webSearchHandler({
|
|
334
|
+
query: siteQuery,
|
|
335
|
+
maxResults: 3,
|
|
336
|
+
provider: "auto",
|
|
337
|
+
});
|
|
338
|
+
const siteUrls = Array.isArray(siteResult?.results)
|
|
339
|
+
? siteResult.results
|
|
340
|
+
.map((r) => String(r?.url ?? "").trim())
|
|
341
|
+
.filter((u) => u.startsWith("http") && !urls.includes(u))
|
|
342
|
+
.slice(0, 2)
|
|
343
|
+
: [];
|
|
344
|
+
urls.push(...siteUrls);
|
|
345
|
+
}
|
|
346
|
+
catch {
|
|
347
|
+
// Continue
|
|
348
|
+
}
|
|
349
|
+
break; // Only do one site-specific search
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
// Step 3: Fetch top URLs (cap at 5 to limit cost/time)
|
|
353
|
+
const fetchUrls = urls.slice(0, 5);
|
|
212
354
|
const fetched = [];
|
|
213
|
-
for (const url of
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
355
|
+
for (const url of fetchUrls) {
|
|
356
|
+
try {
|
|
357
|
+
fetched.push(await fetchUrlHandler({
|
|
358
|
+
url,
|
|
359
|
+
extractMode: "markdown",
|
|
360
|
+
maxLength: 16000,
|
|
361
|
+
}));
|
|
362
|
+
}
|
|
363
|
+
catch {
|
|
364
|
+
fetched.push({ content: "", title: "" });
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
// Step 4: Aggressively follow linked URLs from fetched content
|
|
368
|
+
const followUpUrls = [];
|
|
369
|
+
for (const item of fetched) {
|
|
370
|
+
const content = String(item?.content ?? "");
|
|
371
|
+
const urlMatches = content.match(/https?:\/\/[^\s)\]>"']+/g) ?? [];
|
|
372
|
+
for (const foundUrl of urlMatches) {
|
|
373
|
+
const cleanUrl = foundUrl.replace(/[.,;:!?)]+$/, "");
|
|
374
|
+
if (fetchUrls.includes(cleanUrl) || followUpUrls.includes(cleanUrl))
|
|
375
|
+
continue;
|
|
376
|
+
// Broadly follow links to authoritative sources
|
|
377
|
+
const isScholarly = cleanUrl.includes("arxiv") ||
|
|
378
|
+
cleanUrl.includes("doi.org") ||
|
|
379
|
+
cleanUrl.includes("iopscience") ||
|
|
380
|
+
cleanUrl.includes("nature.com/articles") ||
|
|
381
|
+
cleanUrl.includes("science.org") ||
|
|
382
|
+
cleanUrl.includes("springer.com") ||
|
|
383
|
+
cleanUrl.includes("adsabs.harvard.edu") ||
|
|
384
|
+
cleanUrl.includes("journals.aas.org") ||
|
|
385
|
+
cleanUrl.includes("academic.oup.com") ||
|
|
386
|
+
cleanUrl.includes("agupubs.onlinelibrary.wiley.com");
|
|
387
|
+
const isGov = cleanUrl.includes("nasa.gov") ||
|
|
388
|
+
cleanUrl.includes("usgs.gov") ||
|
|
389
|
+
cleanUrl.includes(".gov/");
|
|
390
|
+
const isRelevant =
|
|
391
|
+
// Paper/article references
|
|
392
|
+
(promptLower.includes("paper") && (isScholarly || isGov)) ||
|
|
393
|
+
(promptLower.includes("article") && (isScholarly || cleanUrl.includes("nature.com"))) ||
|
|
394
|
+
// Database references
|
|
395
|
+
(promptLower.includes("database") && isGov) ||
|
|
396
|
+
// Award/grant references — follow any scholarly/gov/DOI link
|
|
397
|
+
((promptLower.includes("award") || promptLower.includes("grant")) &&
|
|
398
|
+
(isGov || isScholarly || cleanUrl.includes("grant") || cleanUrl.includes("doi.org"))) ||
|
|
399
|
+
// NASA-related questions
|
|
400
|
+
(promptLower.includes("nasa") && isGov) ||
|
|
401
|
+
// Blog/news → follow scholarly + gov links
|
|
402
|
+
((promptLower.includes("universe today") ||
|
|
403
|
+
promptLower.includes("blog") ||
|
|
404
|
+
promptLower.includes("published in") ||
|
|
405
|
+
promptLower.includes("published on")) &&
|
|
406
|
+
(isScholarly || isGov));
|
|
407
|
+
if (isRelevant) {
|
|
408
|
+
followUpUrls.push(cleanUrl);
|
|
409
|
+
if (followUpUrls.length >= 5)
|
|
410
|
+
break;
|
|
411
|
+
}
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
// Fetch follow-up URLs — use larger maxLength for scholarly/paper links to capture acknowledgments
|
|
415
|
+
const allFetchedUrls = [...fetchUrls];
|
|
416
|
+
for (const url of followUpUrls) {
|
|
417
|
+
try {
|
|
418
|
+
const isScholarlyUrl = url.includes("arxiv") || url.includes("doi.org") || url.includes("iopscience") ||
|
|
419
|
+
url.includes("nature.com/articles") || url.includes("science.org") ||
|
|
420
|
+
url.includes("springer.com") || url.includes("nasa.gov") ||
|
|
421
|
+
url.includes("journals.aas.org") || url.includes("adsabs.harvard.edu");
|
|
422
|
+
fetched.push(await fetchUrlHandler({
|
|
423
|
+
url,
|
|
424
|
+
extractMode: "markdown",
|
|
425
|
+
maxLength: isScholarlyUrl ? 32000 : 16000,
|
|
426
|
+
}));
|
|
427
|
+
allFetchedUrls.push(url);
|
|
428
|
+
}
|
|
429
|
+
catch {
|
|
430
|
+
// Skip failed fetches
|
|
431
|
+
}
|
|
220
432
|
}
|
|
221
|
-
|
|
433
|
+
// For scholarly follow-ups, include more content in the source block
|
|
434
|
+
const sourcesBlock = allFetchedUrls
|
|
222
435
|
.map((u, i) => {
|
|
223
436
|
const item = fetched[i];
|
|
224
437
|
const title = String(item?.title ?? "").trim();
|
|
225
|
-
const
|
|
438
|
+
const isFollowUp = i >= fetchUrls.length;
|
|
439
|
+
// Give follow-up scholarly content more space (acknowledgments are at the end)
|
|
440
|
+
const maxContent = isFollowUp ? 20000 : 10000;
|
|
441
|
+
const content = String(item?.content ?? "").slice(0, maxContent);
|
|
226
442
|
return [`SOURCE ${i + 1}: ${title || u}`, `URL: ${u}`, `CONTENT:\n${content}`].join("\n");
|
|
227
443
|
})
|
|
228
444
|
.join("\n\n");
|
|
445
|
+
// Step 5: Final answer — always use Gemini with code execution when available
|
|
446
|
+
// This gives the model the OPTION to write code for math tasks while also
|
|
447
|
+
// providing consistent, high-quality answers for all tasks.
|
|
448
|
+
if (process.env.GEMINI_API_KEY) {
|
|
449
|
+
try {
|
|
450
|
+
const mod = await import("@google/genai");
|
|
451
|
+
const { GoogleGenAI } = mod;
|
|
452
|
+
let gemModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? "gemini-3-flash-preview";
|
|
453
|
+
if (gemModel.includes(":"))
|
|
454
|
+
gemModel = gemModel.split(":").pop();
|
|
455
|
+
const ai = new GoogleGenAI({ apiKey: process.env.GEMINI_API_KEY });
|
|
456
|
+
// Detect if question asks for a specific identifier
|
|
457
|
+
const asksForId = promptLower.includes("grant") || promptLower.includes("award") ||
|
|
458
|
+
promptLower.includes("identifier") || promptLower.includes("number") ||
|
|
459
|
+
promptLower.includes("code") || promptLower.includes("id ");
|
|
460
|
+
const codeExecPrompt = [
|
|
461
|
+
"Answer the question using the provided sources AND your knowledge.",
|
|
462
|
+
...(opts.baselineHint
|
|
463
|
+
? [
|
|
464
|
+
`Your preliminary answer (without web search) was: "${opts.baselineHint}"`,
|
|
465
|
+
"Use the web sources to VERIFY and CORRECT this answer if needed.",
|
|
466
|
+
"If the web sources are inconclusive, contradictory, or do not directly address the question, prefer your preliminary answer.",
|
|
467
|
+
]
|
|
468
|
+
: []),
|
|
469
|
+
...(needsMath
|
|
470
|
+
? [
|
|
471
|
+
"This question requires counting, math, or data analysis.",
|
|
472
|
+
"Write Python code to compute the answer precisely from the source data.",
|
|
473
|
+
]
|
|
474
|
+
: [
|
|
475
|
+
"If the answer requires any counting, math, or data lookup, write Python code to compute it precisely.",
|
|
476
|
+
]),
|
|
477
|
+
"If the question asks about a specific identifier (grant number, ID, code), extract it directly from the sources.",
|
|
478
|
+
...(asksForId
|
|
479
|
+
? [
|
|
480
|
+
"IMPORTANT: Look in 'Acknowledgments', 'Acknowledgements', 'Funding', and 'Notes' sections of papers.",
|
|
481
|
+
"NASA grant numbers follow patterns like: 80GSFC..., 80NSSC..., NNX..., NNG..., NNH..., NAS...",
|
|
482
|
+
"Extract the EXACT identifier string — do not paraphrase or summarize it.",
|
|
483
|
+
]
|
|
484
|
+
: []),
|
|
485
|
+
"",
|
|
486
|
+
"Return ONLY the final answer, no explanation.",
|
|
487
|
+
"",
|
|
488
|
+
`QUESTION:\n${task.prompt}`,
|
|
489
|
+
"",
|
|
490
|
+
sourcesBlock || "NO_SOURCES_FOUND",
|
|
491
|
+
].join("\n");
|
|
492
|
+
const response = await ai.models.generateContent({
|
|
493
|
+
model: gemModel,
|
|
494
|
+
contents: [{ role: "user", parts: [{ text: codeExecPrompt }] }],
|
|
495
|
+
config: {
|
|
496
|
+
tools: [{ codeExecution: {} }],
|
|
497
|
+
temperature: 0,
|
|
498
|
+
maxOutputTokens: 4096,
|
|
499
|
+
},
|
|
500
|
+
});
|
|
501
|
+
const parts = response?.candidates?.[0]?.content?.parts ?? [];
|
|
502
|
+
// Prefer code execution output
|
|
503
|
+
const codeExecParts = parts.filter((p) => p.codeExecutionResult);
|
|
504
|
+
if (codeExecParts.length > 0) {
|
|
505
|
+
const output = String(codeExecParts[codeExecParts.length - 1].codeExecutionResult?.output ?? "").trim();
|
|
506
|
+
const lines = output.split("\n").map((l) => l.trim()).filter(Boolean);
|
|
507
|
+
if (lines.length > 0) {
|
|
508
|
+
return { answer: lines[lines.length - 1], toolCalls: 1 + allFetchedUrls.length };
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
const textAnswer = parts.map((p) => p?.text ?? "").join("").trim();
|
|
512
|
+
if (textAnswer) {
|
|
513
|
+
return { answer: textAnswer, toolCalls: 1 + allFetchedUrls.length };
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
catch {
|
|
517
|
+
// Fall through to standard LLM answer
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
// Fallback: standard LLM answer (no Gemini API key, or the Gemini call failed or returned nothing)
|
|
229
521
|
const contents = [
|
|
230
522
|
{
|
|
231
523
|
role: "user",
|
|
@@ -241,7 +533,7 @@ async function toolAugmentedAnswer(llm, task, opts) {
|
|
|
241
533
|
},
|
|
242
534
|
];
|
|
243
535
|
const answer = await llmGenerateText(llm, contents);
|
|
244
|
-
return { answer, toolCalls: 1 +
|
|
536
|
+
return { answer, toolCalls: 1 + allFetchedUrls.length };
|
|
245
537
|
}
|
|
246
538
|
const toolUsageSummary = [
|
|
247
539
|
"You have access to tools:",
|
|
@@ -254,11 +546,15 @@ async function toolAugmentedAnswer(llm, task, opts) {
|
|
|
254
546
|
`{"action":"final","answer":"..."}`,
|
|
255
547
|
"",
|
|
256
548
|
"Rules:",
|
|
257
|
-
"-
|
|
258
|
-
"-
|
|
549
|
+
"- ALWAYS start with web_search to find relevant sources.",
|
|
550
|
+
"- After search, use fetch_url to read the most promising result pages.",
|
|
259
551
|
"- Do NOT answer based only on snippets; fetch_url and extract the exact value when possible.",
|
|
260
|
-
"- If
|
|
261
|
-
"-
|
|
552
|
+
"- If a page mentions a linked resource (paper, database entry, article), fetch that linked URL too.",
|
|
553
|
+
"- If the question requires counting/math, do the calculation explicitly before answering.",
|
|
554
|
+
"- If the question asks about a database (USGS, etc.), search for the specific database and try to access its query results directly.",
|
|
555
|
+
"- If the question involves finding a linked paper from an article, fetch the article first, then follow the paper link.",
|
|
556
|
+
"- If the question specifies a timeframe (e.g. 'as of end of 2022'), prioritize archival sources.",
|
|
557
|
+
"- Keep tool arguments small (maxResults<=5, maxLength<=16000).",
|
|
262
558
|
"- Do NOT include any explanation. Final answer must match the requested formatting.",
|
|
263
559
|
].join("\n");
|
|
264
560
|
const contents = [
|
|
@@ -340,8 +636,8 @@ async function toolAugmentedAnswer(llm, task, opts) {
|
|
|
340
636
|
if (!args.extractMode)
|
|
341
637
|
args.extractMode = "markdown";
|
|
342
638
|
if (typeof args.maxLength !== "number")
|
|
343
|
-
args.maxLength =
|
|
344
|
-
args.maxLength = Math.min(Number(args.maxLength) ||
|
|
639
|
+
args.maxLength = 16000;
|
|
640
|
+
args.maxLength = Math.min(Number(args.maxLength) || 16000, 16000);
|
|
345
641
|
}
|
|
346
642
|
toolCalls++;
|
|
347
643
|
if (name === "web_search")
|
|
@@ -350,7 +646,7 @@ async function toolAugmentedAnswer(llm, task, opts) {
|
|
|
350
646
|
usedFetchUrl = true;
|
|
351
647
|
const toolResult = await tool.handler(args);
|
|
352
648
|
// Provide a bounded JSON summary to the model. Avoid dumping large content.
|
|
353
|
-
const toolResultText = JSON.stringify(toolResult).slice(0,
|
|
649
|
+
const toolResultText = JSON.stringify(toolResult).slice(0, 16000);
|
|
354
650
|
contents.push({
|
|
355
651
|
role: "user",
|
|
356
652
|
parts: [
|
|
@@ -370,6 +666,212 @@ async function toolAugmentedAnswer(llm, task, opts) {
|
|
|
370
666
|
const answer = parsed && parsed.action === "final" ? String(parsed.answer ?? "").trim() : out.trim();
|
|
371
667
|
return { answer, toolCalls };
|
|
372
668
|
}
|
|
669
|
+
/**
 * Enhanced RAG with Gemini code execution for web tasks.
 * Uses multi-query search, aggressive link following, and Gemini's built-in
 * codeExecution so the model can write Python for math/counting tasks.
 * (Gemini 3 preview doesn't support functionDeclarations, so we orchestrate
 * tool calls ourselves and let the model reason with code execution.)
 *
 * Pipeline: Caesar-cipher shortcut → up to 2 generated search queries →
 * fetch the top 4 result URLs → follow at most 3 relevant links found in the
 * fetched pages → ask Gemini for the final answer over the collected sources.
 *
 * @param {{prompt: string}} task - The task; only `task.prompt` is read here
 *   (the whole task is also handed to tryCaesarCipherSolve).
 * @param {object} [opts] - Accepted for call-site parity with
 *   toolAugmentedAnswer; no field of it is read by this function.
 * @returns {Promise<{answer: string, toolCalls: number}>} The answer text and
 *   the number of search/fetch handler calls that completed without throwing.
 * @throws {Error} If GEMINI_API_KEY is unset or the web_search/fetch_url tools
 *   are missing from the tool index.
 */
async function toolAugmentedAnswerNativeFC(task, opts) {
    // Pre-check: deterministic solvers
    const caesarAnswer = tryCaesarCipherSolve(task);
    if (caesarAnswer)
        return { answer: caesarAnswer, toolCalls: 0 };
    const apiKey = process.env.GEMINI_API_KEY;
    if (!apiKey)
        throw new Error("GEMINI_API_KEY required");
    let model = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? "gemini-3-flash-preview";
    // Keep only the part after the last ":" (e.g. a "provider:model" style value).
    if (model.includes(":"))
        model = model.split(":").pop();
    const toolIndex = buildToolIndex();
    const webSearch = toolIndex.get("web_search");
    const fetchUrl = toolIndex.get("fetch_url");
    if (!webSearch || !fetchUrl)
        throw new Error("Missing web_search/fetch_url tools");
    const mod = await import("@google/genai");
    const { GoogleGenAI } = mod;
    const ai = new GoogleGenAI({ apiKey });
    // Helper: generate text with Gemini, optionally with code execution
    async function geminiGenerate(prompt, genOpts) {
        const config = {
            temperature: 0,
            maxOutputTokens: genOpts?.maxOutputTokens ?? 4096,
        };
        if (genOpts?.codeExecution)
            config.tools = [{ codeExecution: {} }];
        const response = await ai.models.generateContent({
            model,
            contents: [{ role: "user", parts: [{ text: prompt }] }],
            config,
        });
        const parts = response?.candidates?.[0]?.content?.parts ?? [];
        // Prefer code execution output if available: the last non-empty line
        // printed by the last executed snippet is taken as the answer.
        const codeExecParts = parts.filter((p) => p.codeExecutionResult);
        if (codeExecParts.length > 0) {
            const output = String(codeExecParts[codeExecParts.length - 1].codeExecutionResult?.output ?? "").trim();
            const lines = output.split("\n").map((l) => l.trim()).filter(Boolean);
            if (lines.length > 0)
                return lines[lines.length - 1];
        }
        return parts.map((p) => p?.text ?? "").join("").trim();
    }
    let toolCalls = 0;
    const promptLower = task.prompt.toLowerCase();
    // Detect if the task involves math/counting/computation
    const needsMath = promptLower.includes("how many") ||
        promptLower.includes("calculate") ||
        promptLower.includes("compute") ||
        promptLower.includes("p-value") ||
        promptLower.includes("incorrect") ||
        promptLower.includes("percentage") ||
        /\d+.*\d+/.test(task.prompt);
    // Step 1: Generate two search queries — one direct, one from a different angle
    let searchQueries = [];
    try {
        const queryPrompt = [
            "Generate exactly 2 web search queries to find the answer to this question.",
            "Query 1: A concise, direct query with key names, dates, and specific terms.",
            "Query 2: A different-angle query targeting the underlying source (paper, database, official page, grant).",
            "Return exactly 2 lines, one query per line, nothing else.",
            "",
            `QUESTION:\n${task.prompt}`,
        ].join("\n");
        const queryText = await geminiGenerate(queryPrompt, { maxOutputTokens: 512 });
        // Strip list numbering, "Query N:" labels and surrounding quotes from each line.
        searchQueries = queryText
            .split("\n")
            .map((q) => q
            .replace(/^\d+[\.\)]\s*/, "")
            .replace(/^(Query \d+:\s*)/i, "")
            .replace(/^["']|["']$/g, "")
            .trim())
            .filter((q) => q.length > 5 && q.length < 300);
    }
    catch {
        // Fall through: the raw prompt is used as the query below.
    }
    if (searchQueries.length === 0)
        searchQueries = [task.prompt];
    searchQueries = searchQueries.slice(0, 2);
    // Step 2: Search with both queries, collecting de-duplicated http(s) URLs in result order
    const allUrls = [];
    for (const query of searchQueries) {
        try {
            const result = await webSearch.handler({
                query,
                maxResults: 5,
                provider: "auto",
            });
            toolCalls++;
            const results = Array.isArray(result?.results) ? result.results : [];
            for (const r of results) {
                const url = String(r?.url ?? "").trim();
                if (url.startsWith("http") && !allUrls.includes(url)) {
                    allUrls.push(url);
                }
            }
        }
        catch {
            // Continue with the remaining queries
        }
    }
    // Step 3: Fetch top 4 URLs
    const fetchLimit = Math.min(allUrls.length, 4);
    const fetchedContent = [];
    for (let i = 0; i < fetchLimit; i++) {
        try {
            const result = await fetchUrl.handler({
                url: allUrls[i],
                extractMode: "markdown",
                maxLength: 16000,
            });
            toolCalls++;
            fetchedContent.push({
                url: allUrls[i],
                title: String(result?.title ?? ""),
                content: String(result?.content ?? "").slice(0, 12000),
            });
        }
        catch {
            // Skip failed fetches
        }
    }
    // Step 4: Extract and follow relevant linked URLs from fetched content
    const maxFollowUps = 3;
    const followUpUrls = [];
    for (const item of fetchedContent) {
        // The cap is global: stop scanning further pages once it is reached.
        // (Breaking only the inner loop would let every later page add one more URL.)
        if (followUpUrls.length >= maxFollowUps)
            break;
        const urlMatches = item.content.match(/https?:\/\/[^\s)\]>"']+/g) ?? [];
        for (const foundUrl of urlMatches) {
            // Drop trailing sentence punctuation picked up by the URL regex.
            const cleanUrl = foundUrl.replace(/[.,;:!?)]+$/, "");
            if (allUrls.includes(cleanUrl) || followUpUrls.includes(cleanUrl))
                continue;
            // Broadly follow links to authoritative sources
            const isScholarly = cleanUrl.includes("arxiv") ||
                cleanUrl.includes("doi.org") ||
                cleanUrl.includes("iopscience") ||
                cleanUrl.includes("nature.com/articles") ||
                cleanUrl.includes("science.org") ||
                cleanUrl.includes("springer.com");
            const isGov = cleanUrl.includes("nasa.gov") ||
                cleanUrl.includes("usgs.gov") ||
                cleanUrl.includes(".gov/");
            const isRelevant = (promptLower.includes("paper") && (isScholarly || isGov)) ||
                (promptLower.includes("database") && isGov) ||
                (promptLower.includes("article") && (isScholarly || cleanUrl.includes("nature.com"))) ||
                (promptLower.includes("award") && (isGov || cleanUrl.includes("grant"))) ||
                (promptLower.includes("nasa") && isGov) ||
                // Any question mentioning a website/blog — follow scholarly + gov links found in content
                ((promptLower.includes("universe today") ||
                    promptLower.includes("blog") ||
                    promptLower.includes("published")) &&
                    (isScholarly || isGov));
            if (isRelevant) {
                followUpUrls.push(cleanUrl);
                if (followUpUrls.length >= maxFollowUps)
                    break;
            }
        }
    }
    for (const url of followUpUrls) {
        try {
            const result = await fetchUrl.handler({
                url,
                extractMode: "markdown",
                maxLength: 16000,
            });
            toolCalls++;
            fetchedContent.push({
                url,
                title: String(result?.title ?? ""),
                content: String(result?.content ?? "").slice(0, 12000),
            });
        }
        catch {
            // Skip failed follow-up fetches
        }
    }
    // Step 5: Final answer — use code execution only when math is needed
    const sourcesBlock = fetchedContent
        .map((item, i) => `SOURCE ${i + 1}: ${item.title || item.url}\nURL: ${item.url}\nCONTENT:\n${item.content}`)
        .join("\n\n");
    const answerPrompt = [
        "Answer the question using ONLY the provided sources.",
        ...(needsMath
            ? [
                "This question requires precise computation. Write Python code to calculate the answer.",
                "Parse the relevant data from the sources and compute the result programmatically.",
            ]
            : []),
        "If the sources are insufficient, make the best supported guess.",
        "",
        "Return ONLY the final answer, no explanation.",
        "",
        `QUESTION:\n${task.prompt}`,
        "",
        sourcesBlock || "NO_SOURCES_FOUND",
    ].join("\n");
    const answer = await geminiGenerate(answerPrompt, { codeExecution: needsMath });
    return { answer, toolCalls };
}
|
|
373
875
|
async function loadFixture(fixturePath) {
|
|
374
876
|
const raw = await readFile(fixturePath, "utf8");
|
|
375
877
|
const parsed = JSON.parse(raw);
|
|
@@ -385,7 +887,7 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+tools)", () => {
|
|
|
385
887
|
if (!existsSync(fixturePath)) {
|
|
386
888
|
throw new Error(`Missing GAIA capability fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityFixture.py`);
|
|
387
889
|
}
|
|
388
|
-
const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-
|
|
890
|
+
const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
|
|
389
891
|
const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
|
|
390
892
|
const baselineLlm = await createTextLlmClient({ model: baselineModel });
|
|
391
893
|
const toolsLlm = await createTextLlmClient({ model: toolsModel });
|
|
@@ -399,8 +901,8 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+tools)", () => {
|
|
|
399
901
|
const tasks = fixture.tasks.slice(0, taskLimit);
|
|
400
902
|
const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
|
|
401
903
|
const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
|
|
402
|
-
const maxSteps = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_STEPS ?? "
|
|
403
|
-
const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "
|
|
904
|
+
const maxSteps = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_STEPS ?? "10", 10);
|
|
905
|
+
const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "8", 10);
|
|
404
906
|
// Auto-discover judge: free OpenRouter → paid LLM → deterministic-only
|
|
405
907
|
const useJudge = process.env.NODEBENCH_GAIA_JUDGE !== "0";
|
|
406
908
|
const judge = useJudge ? await autoDiscoverJudge(toolsLlm) : null;
|
|
@@ -420,7 +922,10 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+tools)", () => {
|
|
|
420
922
|
const base = await baselineAnswer(baselineLlm, task);
|
|
421
923
|
const baseMs = performance.now() - baseStart;
|
|
422
924
|
const toolsStart = performance.now();
|
|
423
|
-
const
|
|
925
|
+
const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
|
|
926
|
+
const tools = toolsMode === "enhanced"
|
|
927
|
+
? await toolAugmentedAnswerNativeFC(task, { maxSteps, maxToolCalls })
|
|
928
|
+
: await toolAugmentedAnswer(toolsLlm, task, { maxSteps, maxToolCalls, baselineHint: base });
|
|
424
929
|
const toolsMs = performance.now() - toolsStart;
|
|
425
930
|
const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
|
|
426
931
|
const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
|
|
@@ -436,6 +941,9 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+tools)", () => {
|
|
|
436
941
|
};
|
|
437
942
|
}
|
|
438
943
|
catch (err) {
|
|
944
|
+
console.error(`[gaia-capability] ERROR task=${task.id}: ${err?.message ?? String(err)}`);
|
|
945
|
+
if (err?.stack)
|
|
946
|
+
console.error(err.stack);
|
|
439
947
|
results[idx] = {
|
|
440
948
|
taskId: task.id,
|
|
441
949
|
baselineCorrect: false,
|
|
@@ -506,6 +1014,12 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+tools)", () => {
|
|
|
506
1014
|
})),
|
|
507
1015
|
});
|
|
508
1016
|
}
|
|
1017
|
+
// Save web cache when recording or replaying (replay-mode cache misses are recorded too)
|
|
1018
|
+
const cacheMode = (process.env.NODEBENCH_GAIA_WEB_CACHE ?? "").toLowerCase();
|
|
1019
|
+
if (cacheMode === "record" || cacheMode === "replay") {
|
|
1020
|
+
await saveWebCache();
|
|
1021
|
+
console.log(`[gaia-capability] web cache saved (mode=${cacheMode})`);
|
|
1022
|
+
}
|
|
509
1023
|
// By default this benchmark is informational and should not fail CI.
|
|
510
1024
|
// Set NODEBENCH_GAIA_CAPABILITY_ENFORCE=1 to turn the summary into a strict gate.
|
|
511
1025
|
const enforce = process.env.NODEBENCH_GAIA_CAPABILITY_ENFORCE === "1";
|