nodebench-mcp 2.14.2 → 2.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/NODEBENCH_AGENTS.md +3 -3
  2. package/README.md +9 -9
  3. package/dist/__tests__/architectComplex.test.d.ts +1 -0
  4. package/dist/__tests__/architectComplex.test.js +375 -0
  5. package/dist/__tests__/architectComplex.test.js.map +1 -0
  6. package/dist/__tests__/architectSmoke.test.d.ts +1 -0
  7. package/dist/__tests__/architectSmoke.test.js +92 -0
  8. package/dist/__tests__/architectSmoke.test.js.map +1 -0
  9. package/dist/__tests__/critterCalibrationEval.d.ts +8 -0
  10. package/dist/__tests__/critterCalibrationEval.js +370 -0
  11. package/dist/__tests__/critterCalibrationEval.js.map +1 -0
  12. package/dist/__tests__/embeddingProvider.test.d.ts +1 -0
  13. package/dist/__tests__/embeddingProvider.test.js +86 -0
  14. package/dist/__tests__/embeddingProvider.test.js.map +1 -0
  15. package/dist/__tests__/evalHarness.test.js +6 -1
  16. package/dist/__tests__/evalHarness.test.js.map +1 -1
  17. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +1 -1
  18. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
  19. package/dist/__tests__/gaiaCapabilityEval.test.js +759 -28
  20. package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
  21. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +1 -1
  22. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
  23. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +558 -4
  24. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  25. package/dist/__tests__/presetRealWorldBench.test.js +2 -2
  26. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
  27. package/dist/__tests__/tools.test.js +1016 -8
  28. package/dist/__tests__/tools.test.js.map +1 -1
  29. package/dist/__tests__/toolsetGatingEval.test.js +3 -3
  30. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  31. package/dist/db.js +64 -0
  32. package/dist/db.js.map +1 -1
  33. package/dist/index.js +76 -9
  34. package/dist/index.js.map +1 -1
  35. package/dist/tools/architectTools.d.ts +15 -0
  36. package/dist/tools/architectTools.js +304 -0
  37. package/dist/tools/architectTools.js.map +1 -0
  38. package/dist/tools/critterTools.d.ts +21 -0
  39. package/dist/tools/critterTools.js +230 -0
  40. package/dist/tools/critterTools.js.map +1 -0
  41. package/dist/tools/emailTools.d.ts +15 -0
  42. package/dist/tools/emailTools.js +664 -0
  43. package/dist/tools/emailTools.js.map +1 -0
  44. package/dist/tools/embeddingProvider.d.ts +67 -0
  45. package/dist/tools/embeddingProvider.js +299 -0
  46. package/dist/tools/embeddingProvider.js.map +1 -0
  47. package/dist/tools/metaTools.js +660 -0
  48. package/dist/tools/metaTools.js.map +1 -1
  49. package/dist/tools/progressiveDiscoveryTools.js +24 -7
  50. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  51. package/dist/tools/reconTools.js +83 -33
  52. package/dist/tools/reconTools.js.map +1 -1
  53. package/dist/tools/rssTools.d.ts +8 -0
  54. package/dist/tools/rssTools.js +833 -0
  55. package/dist/tools/rssTools.js.map +1 -0
  56. package/dist/tools/toolRegistry.d.ts +30 -2
  57. package/dist/tools/toolRegistry.js +424 -25
  58. package/dist/tools/toolRegistry.js.map +1 -1
  59. package/package.json +14 -3
@@ -193,6 +193,530 @@ async function llmGenerateText(llm, history) {
193
193
  maxOutputTokens: 1024,
194
194
  });
195
195
  }
196
+ /**
197
+ * Gemini vision: send the image + question directly to Gemini multimodal API.
198
+ * Returns null if Gemini isn't available or the call fails.
199
+ */
200
+ function selectVisionModel(task) {
201
+ const override = process.env.NODEBENCH_GAIA_CAPABILITY_VISION_MODEL;
202
+ if (override)
203
+ return override;
204
+ const q = String(task.prompt ?? "").toLowerCase();
205
+ const proModel = process.env.NODEBENCH_GAIA_CAPABILITY_VISION_PRO_MODEL ?? "gemini-3-pro-preview";
206
+ // Use pro model for tasks requiring spatial reasoning or complex OCR + calculation
207
+ if (q.includes("chess") && q.includes("algebraic notation"))
208
+ return proModel;
209
+ if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample"))
210
+ return proModel;
211
+ return "gemini-3-flash-preview";
212
+ }
213
+ async function callGeminiVision(apiKey, model, base64, mimeType, prompt, opts) {
214
+ const mod = await import("@google/genai");
215
+ const { GoogleGenAI } = mod;
216
+ const ai = new GoogleGenAI({ apiKey });
217
+ const response = await ai.models.generateContent({
218
+ model,
219
+ contents: [
220
+ {
221
+ role: "user",
222
+ parts: [
223
+ { inlineData: { mimeType, data: base64 } },
224
+ { text: prompt },
225
+ ],
226
+ },
227
+ ],
228
+ config: {
229
+ temperature: opts?.temperature ?? 0,
230
+ maxOutputTokens: opts?.maxOutputTokens ?? 4096,
231
+ },
232
+ });
233
+ const parts = response?.candidates?.[0]?.content?.parts ?? [];
234
+ const text = parts.map((p) => p?.text ?? "").join("").trim();
235
+ return text || null;
236
+ }
237
+ async function tryGeminiVisionAnswer(task, localPath, ext) {
238
+ const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_AI_API_KEY || "";
239
+ if (!apiKey)
240
+ return null;
241
+ try {
242
+ const imageBuffer = readFileSync(localPath);
243
+ const base64 = imageBuffer.toString("base64");
244
+ const mimeType = ext === "jpg" || ext === "jpeg" ? "image/jpeg" : ext === "webp" ? "image/webp" : "image/png";
245
+ const model = selectVisionModel(task);
246
+ const visionPrompt = buildVisionPrompt(task);
247
+ let text = await callGeminiVision(apiKey, model, base64, mimeType, visionPrompt);
248
+ if (!text)
249
+ return null;
250
+ // Extract answer from chain-of-thought responses (ANSWER: <value> pattern)
251
+ const answerMatch = text.match(/ANSWER:\s*(.+?)$/im);
252
+ if (answerMatch) {
253
+ text = answerMatch[1].trim();
254
+ }
255
+ return text || null;
256
+ }
257
+ catch (err) {
258
+ console.warn(`[gaia-media-vision] vision failed for ${task.id}: ${err?.message ?? String(err)}`);
259
+ return null;
260
+ }
261
+ }
262
+ /**
263
+ * Should this task use Gemini code execution (image + Python sandbox)?
264
+ * Only tasks where pure vision reasoning consistently fails.
265
+ */
266
+ function shouldUseCodeExecution(task) {
267
+ const q = String(task.prompt ?? "").toLowerCase();
268
+ // Chess: code execution reads the board + validates FEN with python-chess.
269
+ // We extract the FEN and send it to Stockfish (chess-api.com) for the best move.
270
+ if (q.includes("chess") && q.includes("algebraic notation"))
271
+ return true;
272
+ // Fraction extraction: OCR + GCD computation — code execution works well
273
+ if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample"))
274
+ return true;
275
+ return false;
276
+ }
277
+ function buildCodeExecutionPrompt(task) {
278
+ const q = String(task.prompt ?? "").toLowerCase();
279
+ if (q.includes("chess") && q.includes("algebraic notation")) {
280
+ return ("You are a chess grandmaster analyzing this board position image.\n\n" +
281
+ "BOARD ORIENTATION: This image is shown from BLACK's perspective (the board is FLIPPED).\n" +
282
+ "- The file labels at the BOTTOM read: h, g, f, e, d, c, b, a (LEFT to RIGHT)\n" +
283
+ "- The rank labels on the LEFT read: 1, 2, 3, 4, 5, 6, 7, 8 (TOP to BOTTOM)\n" +
284
+ "- So rank 1 is at the TOP of the image and rank 8 is at the BOTTOM\n" +
285
+ "- File h is on the LEFT side and file a is on the RIGHT side\n" +
286
+ "- USE THE PRINTED LABELS to verify each piece's position!\n\n" +
287
+ "PIECE IDENTIFICATION GUIDE (green/white board style):\n" +
288
+ "- King (K/k): Tallest piece with a CROSS (+) symbol on top\n" +
289
+ "- Queen (Q/q): Tall piece with a pointed CROWN (multiple spikes) on top\n" +
290
+ "- Rook (R/r): Piece with a FLAT CRENELLATED top (castle battlements, rectangular notches)\n" +
291
+ "- Bishop (B/b): Medium piece with a POINTED TOP and a diagonal SLIT/NOTCH\n" +
292
+ "- Knight (N/n): Piece with a distinctive HORSE HEAD shape\n" +
293
+ "- Pawn (P/p): Shortest piece with a simple ROUND BALL on top\n" +
294
+ "White pieces are LIGHT colored. Black pieces are DARK colored.\n\n" +
295
+ "COMPLETE THE FOLLOWING THREE PHASES:\n\n" +
296
+ "═══ PHASE 1: SYSTEMATIC PIECE INVENTORY ═══\n" +
297
+ "Read the board using the printed coordinate labels as your anchor.\n" +
298
+ "Go ROW BY ROW from the TOP of the image to the BOTTOM.\n" +
299
+ "The TOP row is rank 1. The BOTTOM row is rank 8.\n" +
300
+ "Within each row, go from LEFT (h-file) to RIGHT (a-file).\n\n" +
301
+ "For each piece, check its TOP SHAPE carefully:\n" +
302
+ "- Cross on top? → KING\n" +
303
+ "- Spiky crown? → QUEEN\n" +
304
+ "- Rectangular battlements? → ROOK\n" +
305
+ "- Pointed with slit? → BISHOP\n" +
306
+ "- Horse head? → KNIGHT\n" +
307
+ "- Simple ball? → PAWN\n\n" +
308
+ "Write your inventory using the ACTUAL SQUARES (not image positions):\n" +
309
+ " Row at top (rank 1): h1=? g1=? f1=? e1=? d1=? c1=? b1=? a1=?\n" +
310
+ " Next row (rank 2): h2=? g2=? f2=? e2=? d2=? c2=? b2=? a2=?\n" +
311
+ " ... continue through all 8 rows ...\n" +
312
+ " Bottom row (rank 8): h8=? g8=? f8=? e8=? d8=? c8=? b8=? a8=?\n\n" +
313
+ "Use: K=White King, Q=White Queen, R=White Rook, B=White Bishop, N=White Knight, P=White Pawn\n" +
314
+ " k=Black King, q=Black Queen, r=Black Rook, b=Black Bishop, n=Black Knight, p=Black Pawn\n" +
315
+ " . = empty square\n\n" +
316
+ "═══ PHASE 2: FEN CONSTRUCTION & VALIDATION ═══\n" +
317
+ "Write Python code using the `chess` library (it is pre-installed).\n" +
318
+ "IMPORTANT: FEN notation lists rank 8 FIRST, then rank 7, ..., rank 1 LAST.\n" +
319
+ "Within each rank, list from a-file to h-file.\n" +
320
+ "So you need to REVERSE your inventory order: start from the BOTTOM row (rank 8) and go UP.\n\n" +
321
+ "Your code must:\n" +
322
+ "1. Construct FEN from your inventory\n" +
323
+ "2. Load it: board = chess.Board(fen)\n" +
324
+ "3. Print str(board) — the ASCII board should match what you see in the image\n" +
325
+ "4. Validate: board.is_valid(), exactly 1 king per side, no pawns on rank 1/8\n" +
326
+ "5. If invalid, print board.status() and fix the FEN\n\n" +
327
+ "═══ PHASE 3: VALIDATE & ANALYZE ═══\n" +
328
+ "Set board.turn = chess.BLACK (it is Black to move).\n" +
329
+ "Validate the position, print the board and FEN, then list legal moves.\n\n" +
330
+ "```python\n" +
331
+ "import chess\n\n" +
332
+ "board = chess.Board(fen='<your FEN>')\n" +
333
+ "board.turn = chess.BLACK\n" +
334
+ "assert board.is_valid(), f'Invalid: {board.status()}'\n" +
335
+ "print(board)\n" +
336
+ "print(f'Is valid: {board.is_valid()}')\n" +
337
+ "print(f'BOARD_FEN: {board.fen()}')\n" +
338
+ "print(f'Legal moves: {list(board.legal_moves)}')\n" +
339
+ "```\n\n" +
340
+ `QUESTION: ${task.prompt}\n\n` +
341
+ "Execute all three phases. Print the board and FEN for verification.");
342
+ }
343
+ if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample")) {
344
+ return ("You are extracting fractions from a math worksheet image.\n\n" +
345
+ `TASK: ${task.prompt}\n\n` +
346
+ "IMPORTANT RULES:\n" +
347
+ "- Do NOT import cv2, PIL, numpy, or any image processing library.\n" +
348
+ "- Do NOT try to open, decode, or process the image file with code.\n" +
349
+ "- Use your EYES (vision) to read the fractions from the image.\n" +
350
+ "- Use Python code ONLY for math computation (GCD, simplification).\n\n" +
351
+ "The worksheet has TWO sections:\n\n" +
352
+ "SECTION A — BODY TEXT (10 fractions, already identified):\n" +
353
+ "3/4, 1/4, 3/4, 3/4, 2/4, 1/2, 5/35, 7/21, 30/5, 30/5\n\n" +
354
+ "SECTION B — SAMPLE PROBLEMS (read from the image with your eyes):\n" +
355
+ "Look at the bottom portion of the image. There are exactly 7 sample problems.\n" +
356
+ "Each sample problem shows a stacked fraction: a numerator on top of a line, denominator below.\n\n" +
357
+ "YOUR STEPS:\n" +
358
+ "1. LOOK at the image and identify each of the 7 stacked fractions.\n" +
359
+ " Write down each numerator and denominator you see.\n" +
360
+ "2. Write Python code that:\n" +
361
+ " a) Defines the 7 fractions you read as a list of (numerator, denominator) tuples\n" +
362
+ " b) For each, computes math.gcd(num, den) and simplifies: num//g, den//g\n" +
363
+ " c) Combines the 10 body fractions + 7 simplified fractions\n" +
364
+ " d) Prints EXACTLY 17 comma-separated fractions with no spaces\n\n" +
365
+ "Expected output format: 3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,a/b,c/d,e/f,g/h,i/j,k/l,m/n\n\n" +
366
+ "The code should look like:\n" +
367
+ "```python\n" +
368
+ "import math\n" +
369
+ "body = [(3,4),(1,4),(3,4),(3,4),(2,4),(1,2),(5,35),(7,21),(30,5),(30,5)]\n" +
370
+ "samples = [(?,?),(?,?),(?,?),(?,?),(?,?),(?,?),(?,?)] # fill in what you see\n" +
371
+ "result = []\n" +
372
+ "for n,d in body:\n" +
373
+ " result.append(f'{n}/{d}')\n" +
374
+ "for n,d in samples:\n" +
375
+ " g = math.gcd(n,d)\n" +
376
+ " result.append(f'{n//g}/{d//g}')\n" +
377
+ "print(','.join(result))\n" +
378
+ "```\n" +
379
+ "Replace the ? values with what you READ from the image. Run the code.");
380
+ }
381
+ // Generic fallback (shouldn't reach here due to shouldUseCodeExecution check)
382
+ return `${task.prompt}\n\nWrite Python code to solve this. Print ONLY the final answer.`;
383
+ }
384
+ /**
385
+ * Gemini code execution: send image + prompt with tools: [{ codeExecution: {} }].
386
+ * The model generates and runs Python server-side to analyze the image.
387
+ *
388
+ * For chess tasks: rotates image 180° + uses python-chess library
389
+ * For other tasks: single call with image + code execution
390
+ */
391
+ async function tryGeminiCodeExecutionAnswer(task, localPath, ext) {
392
+ const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_AI_API_KEY || "";
393
+ if (!apiKey)
394
+ return null;
395
+ const q = String(task.prompt ?? "").toLowerCase();
396
+ const isChess = q.includes("chess") && q.includes("algebraic notation");
397
+ try {
398
+ let imageBuffer = Buffer.from(readFileSync(localPath));
399
+ // Chess: use ORIGINAL image (not rotated) — the coordinate labels are readable
400
+ // and the model uses them to verify piece positions. Rotation makes labels upside-down.
401
+ if (isChess) {
402
+ console.log(`[gaia-media-chess] using original image (Black perspective with readable coordinate labels)`);
403
+ }
404
+ const base64 = imageBuffer.toString("base64");
405
+ const mimeType = ext === "jpg" || ext === "jpeg" ? "image/jpeg" : ext === "webp" ? "image/webp" : "image/png";
406
+ const mod = await import("@google/genai");
407
+ const { GoogleGenAI } = mod;
408
+ const ai = new GoogleGenAI({ apiKey });
409
+ // Use the Flash model for all code-execution tasks, chess included (reliable and faster than Pro)
410
+ const model = process.env.NODEBENCH_GAIA_CAPABILITY_CODE_EXEC_MODEL ?? "gemini-3-flash-preview";
411
+ const prompt = buildCodeExecutionPrompt(task);
412
+ // Chess: extract FEN from code execution, then use Stockfish for the best move
413
+ if (isChess) {
414
+ const temperatures = [0, 0.2, 0.4];
415
+ const fens = [];
416
+ for (const temp of temperatures) {
417
+ try {
418
+ const resp = await ai.models.generateContent({
419
+ model,
420
+ contents: [{
421
+ role: "user",
422
+ parts: [
423
+ { inlineData: { mimeType, data: base64 } },
424
+ { text: prompt },
425
+ ],
426
+ }],
427
+ config: {
428
+ tools: [{ codeExecution: {} }],
429
+ maxOutputTokens: 8192,
430
+ temperature: temp,
431
+ },
432
+ });
433
+ const fen = extractFenFromResponse(resp, temp);
434
+ if (fen) {
435
+ fens.push(fen);
436
+ }
437
+ }
438
+ catch (err) {
439
+ console.warn(`[gaia-chess-fen] temp=${temp} error: ${err?.message?.slice(0, 100)}`);
440
+ }
441
+ }
442
+ if (fens.length === 0) {
443
+ console.log(`[gaia-chess-fen] no valid FENs extracted, falling through`);
444
+ return null;
445
+ }
446
+ // Use the most common FEN
447
+ const fenCounts = {};
448
+ for (const f of fens)
449
+ fenCounts[f] = (fenCounts[f] || 0) + 1;
450
+ const sortedFens = Object.entries(fenCounts).sort((a, b) => b[1] - a[1]);
451
+ const consensusFen = sortedFens[0][0];
452
+ console.log(`[gaia-chess-fen] FENs: ${JSON.stringify(fenCounts)} → consensus: ${consensusFen}`);
453
+ // Query chess-api.com (Stockfish NNUE) for the best move
454
+ const fullFen = `${consensusFen} b - - 0 1`; // Black to move
455
+ console.log(`[gaia-chess-engine] querying Stockfish: ${fullFen}`);
456
+ try {
457
+ const chessResp = await fetch("https://chess-api.com/v1", {
458
+ method: "POST",
459
+ headers: { "Content-Type": "application/json" },
460
+ body: JSON.stringify({ fen: fullFen, depth: 18, variants: 1 }),
461
+ signal: AbortSignal.timeout(15000),
462
+ });
463
+ if (chessResp.ok) {
464
+ const data = await chessResp.json();
465
+ const bestMove = data?.san ?? data?.move ?? null;
466
+ if (bestMove) {
467
+ console.log(`[gaia-chess-engine] Stockfish: ${bestMove} (eval: ${data?.eval ?? "?"})`);
468
+ return String(bestMove).trim();
469
+ }
470
+ console.warn(`[gaia-chess-engine] no move: ${JSON.stringify(data).slice(0, 200)}`);
471
+ }
472
+ else {
473
+ console.warn(`[gaia-chess-engine] API error: ${chessResp.status}`);
474
+ }
475
+ }
476
+ catch (err) {
477
+ console.warn(`[gaia-chess-engine] fetch error: ${err?.message?.slice(0, 100)}`);
478
+ }
479
+ return null;
480
+ }
481
+ // Non-chess: single code execution call
482
+ const response = await ai.models.generateContent({
483
+ model,
484
+ contents: [
485
+ {
486
+ role: "user",
487
+ parts: [
488
+ { inlineData: { mimeType, data: base64 } },
489
+ { text: prompt },
490
+ ],
491
+ },
492
+ ],
493
+ config: {
494
+ tools: [{ codeExecution: {} }],
495
+ maxOutputTokens: 8192,
496
+ temperature: 0,
497
+ },
498
+ });
499
+ return extractCodeExecutionAnswer(response, task.id, model);
500
+ }
501
+ catch (err) {
502
+ console.warn(`[gaia-media-code-exec] failed for ${task.id}: ${err?.message ?? String(err)}`);
503
+ return null;
504
+ }
505
+ }
506
+ /**
507
+ * Extract a validated FEN piece-placement from a Gemini code execution response.
508
+ * Tries multiple strategies: BOARD_FEN marker, FEN regex, Python code parsing, ASCII board parsing.
509
+ */
510
+ function extractFenFromResponse(response, temp) {
511
+ const parts = response?.candidates?.[0]?.content?.parts ?? [];
512
+ let codeOutput = "";
513
+ let allCode = "";
514
+ for (const part of parts) {
515
+ if (part.codeExecutionResult?.output) {
516
+ codeOutput = String(part.codeExecutionResult.output).trim();
517
+ }
518
+ if (part.executableCode?.code) {
519
+ allCode += part.executableCode.code + "\n";
520
+ }
521
+ }
522
+ if (codeOutput) {
523
+ console.log(`[gaia-chess-fen] temp=${temp} code_output:\n${codeOutput.slice(0, 600)}`);
524
+ }
525
+ let fen = null;
526
+ // Strategy 1: BOARD_FEN: <fen> marker
527
+ const boardFenMatch = codeOutput.match(/BOARD_FEN:\s*(.+)/);
528
+ if (boardFenMatch) {
529
+ fen = boardFenMatch[1].trim().split(" ")[0];
530
+ }
531
+ // Strategy 2: FEN regex in code output (8 ranks separated by /)
532
+ if (!fen) {
533
+ const fenPatterns = codeOutput.match(/([rnbqkpRNBQKP1-8]{1,8}\/){7}[rnbqkpRNBQKP1-8]{1,8}/g);
534
+ if (fenPatterns && fenPatterns.length > 0) {
535
+ fen = fenPatterns[fenPatterns.length - 1];
536
+ }
537
+ }
538
+ // Strategy 3: FEN in Python source code (Board(fen='...') or Board('...'))
539
+ if (!fen) {
540
+ const codeMatch = allCode.match(/Board\(\s*(?:fen\s*=\s*)?['"](([rnbqkpRNBQKP1-8]{1,8}\/){7}[rnbqkpRNBQKP1-8]{1,8})[^'"]*['"]/);
541
+ if (codeMatch) {
542
+ fen = codeMatch[1];
543
+ console.log(`[gaia-chess-fen] temp=${temp} FEN from Python source: ${fen}`);
544
+ }
545
+ }
546
+ // Strategy 4: Parse ASCII board output (". . . r . . k ." format)
547
+ if (!fen && codeOutput) {
548
+ const boardLines = codeOutput.split("\n")
549
+ .map((l) => l.trim())
550
+ .filter((l) => /^[.rnbqkpRNBQKP ]+$/.test(l) && l.length >= 15);
551
+ if (boardLines.length >= 8) {
552
+ const fenRanks = [];
553
+ for (const line of boardLines.slice(0, 8)) {
554
+ const squares = line.split(/\s+/);
555
+ if (squares.length !== 8)
556
+ break;
557
+ let rank = "";
558
+ let emptyCount = 0;
559
+ for (const sq of squares) {
560
+ if (sq === ".") {
561
+ emptyCount++;
562
+ }
563
+ else {
564
+ if (emptyCount > 0) {
565
+ rank += emptyCount;
566
+ emptyCount = 0;
567
+ }
568
+ rank += sq;
569
+ }
570
+ }
571
+ if (emptyCount > 0)
572
+ rank += emptyCount;
573
+ fenRanks.push(rank);
574
+ }
575
+ if (fenRanks.length === 8) {
576
+ fen = fenRanks.join("/");
577
+ console.log(`[gaia-chess-fen] temp=${temp} FEN from ASCII board: ${fen}`);
578
+ }
579
+ }
580
+ }
581
+ // Validate FEN
582
+ if (fen) {
583
+ const ranks = fen.split("/");
584
+ if (ranks.length === 8 && fen.includes("K") && fen.includes("k")) {
585
+ console.log(`[gaia-chess-fen] temp=${temp} valid FEN: ${fen}`);
586
+ return fen;
587
+ }
588
+ console.log(`[gaia-chess-fen] temp=${temp} invalid FEN: ${fen}`);
589
+ }
590
+ else {
591
+ console.log(`[gaia-chess-fen] temp=${temp} no FEN found (code=${codeOutput.length}ch, src=${allCode.length}ch)`);
592
+ }
593
+ return null;
594
+ }
595
+ function extractCodeExecutionAnswer(response, taskId, model) {
596
+ const parts = response?.candidates?.[0]?.content?.parts ?? [];
597
+ let lastCodeOutput = "";
598
+ const allTexts = [];
599
+ for (const part of parts) {
600
+ if (part.codeExecutionResult?.output) {
601
+ lastCodeOutput = String(part.codeExecutionResult.output).trim();
602
+ }
603
+ if (part.text) {
604
+ allTexts.push(String(part.text).trim());
605
+ }
606
+ }
607
+ const lastText = allTexts[allTexts.length - 1] ?? "";
608
+ // Log full code output for debugging
609
+ if (lastCodeOutput) {
610
+ console.log(`[gaia-media-code-exec] ${taskId} full_output:\n${lastCodeOutput.slice(0, 1500)}`);
611
+ }
612
+ // Combine all text sources for pattern matching (check text parts first — the model
613
+ // writes "ANSWER: Rd5" as text after the code execution block)
614
+ const combinedText = [...allTexts, lastCodeOutput].filter(Boolean).join("\n");
615
+ if (!combinedText)
616
+ return null;
617
+ // Pattern 1: ANSWER: <value> (from our prompt template, appears in text after code execution)
618
+ const answerMatch = combinedText.match(/ANSWER:\s*(.+?)$/im);
619
+ if (answerMatch) {
620
+ const answer = answerMatch[1].trim();
621
+ console.log(`[gaia-media-code-exec] ${taskId} model=${model} answer=${answer} (from ANSWER pattern)`);
622
+ return answer || null;
623
+ }
624
+ // Pattern 2: BEST: <move> (legacy chess code template)
625
+ const bestMatch = combinedText.match(/BEST:\s*(\S+)/im);
626
+ if (bestMatch) {
627
+ const answer = bestMatch[1].trim();
628
+ console.log(`[gaia-media-code-exec] ${taskId} model=${model} answer=${answer} (from BEST pattern)`);
629
+ return answer || null;
630
+ }
631
+ // Fallback: prefer code execution output, then last non-empty line
632
+ let answer = lastCodeOutput || lastText;
633
+ const lines = answer.split("\n").map((l) => l.trim()).filter(Boolean);
634
+ if (lines.length > 0) {
635
+ answer = lines[lines.length - 1];
636
+ }
637
+ console.log(`[gaia-media-code-exec] ${taskId} model=${model} answer=${answer.slice(0, 80)} (from last line)`);
638
+ return answer || null;
639
+ }
640
+ function buildVisionPrompt(task) {
641
+ const q = String(task.prompt ?? "").toLowerCase();
642
+ // Chess position analysis — detailed chain-of-thought for spatial reasoning
643
+ if (q.includes("chess") && q.includes("algebraic notation")) {
644
+ return ("You are a chess grandmaster analyzing this board position.\n\n" +
645
+ "BOARD ORIENTATION: This board is shown from BLACK'S perspective (flipped).\n" +
646
+ "- The file labels at the BOTTOM read: h, g, f, e, d, c, b, a (left to right)\n" +
647
+ "- The rank labels on the LEFT read: 1, 2, 3, 4, 5, 6, 7, 8 (top to bottom)\n" +
648
+ "- So rank 1 is at the TOP, rank 8 is at the BOTTOM\n" +
649
+ "- USE the printed coordinate labels to anchor each piece's position!\n\n" +
650
+ "PIECE IDENTIFICATION:\n" +
651
+ "- King: cross (+) on top | Queen: crown with spikes on top\n" +
652
+ "- Rook: flat crenellated (castle) top | Bishop: pointed top with slit\n" +
653
+ "- Knight: horse head shape | Pawn: simple round ball on top\n" +
654
+ "- White pieces are LIGHT, Black pieces are DARK\n\n" +
655
+ "STEP 1 — BOARD INVENTORY\n" +
656
+ "Go row by row from TOP (rank 1) to BOTTOM (rank 8).\n" +
657
+ "Within each row, go from LEFT (h-file) to RIGHT (a-file).\n" +
658
+ "List EVERY piece: type, color, and exact square (verified against labels).\n\n" +
659
+ "STEP 2 — POSITION ANALYSIS (Black to move)\n" +
660
+ "- Where is each king? Is either king exposed?\n" +
661
+ "- Where are Black's rooks? What ranks and files can they control?\n" +
662
+ "- Where is White's queen? Can any Black piece attack it?\n" +
663
+ "- A rook move along a RANK can attack multiple pieces on that rank.\n" +
664
+ " For example, a rook on d5 attacks everything on the 5th rank (e5, f5, g5, h5)\n" +
665
+ " AND everything on the d-file (d4, d3, d2, d1).\n\n" +
666
+ "STEP 3 — CANDIDATE MOVES\n" +
667
+ "Consider Black's strongest moves. Prioritize:\n" +
668
+ "1. Moves that SIMULTANEOUSLY attack multiple high-value pieces\n" +
669
+ "2. Rook moves to open ranks that threaten the queen AND create back-rank threats\n" +
670
+ "3. Moves that force the opponent into losing material\n\n" +
671
+ "STEP 4 — WINNING MOVE\n" +
672
+ "The winning move creates an unstoppable double threat for Black.\n\n" +
673
+ `QUESTION: ${task.prompt}\n\n` +
674
+ "Think step by step. Write your final answer on the LAST LINE as exactly:\n" +
675
+ "ANSWER: <move>\n" +
676
+ "where <move> is in standard algebraic notation (e.g., Rd5, Qxf7+, Nf3).");
677
+ }
678
+ // Fraction quiz grading
679
+ if (q.includes("quiz is scored") && q.includes("bonus points")) {
680
+ return ("You are grading a student's fraction quiz shown in this image.\n\n" +
681
+ "INSTRUCTIONS:\n" +
682
+ "1. Read each problem carefully from the image\n" +
683
+ "2. Read the student's written answer for each problem\n" +
684
+ "3. Check if each answer is mathematically correct (no partial credit)\n" +
685
+ "4. Categorize each problem and assign points per the rubric\n" +
686
+ "5. Sum all earned points and add any bonus\n\n" +
687
+ `SCORING RUBRIC:\n${task.prompt}\n\n` +
688
+ "CRITICAL: Return ONLY the total integer score as a single number. " +
689
+ "No explanation, no breakdown, just the number.");
690
+ }
691
+ // Fraction extraction — very detailed multi-step instructions
692
+ if (q.includes("comma separated") && q.includes("fractions") && q.includes("sample")) {
693
+ return ("You must carefully examine this worksheet image and extract information.\n\n" +
694
+ `TASK: ${task.prompt}\n\n` +
695
+ "DETAILED INSTRUCTIONS:\n" +
696
+ "1. Read the ENTIRE image from top to bottom, left to right.\n" +
697
+ "2. Find ALL fractions written using the / notation (like 3/4, 1/2, etc).\n" +
698
+ " This includes fractions in:\n" +
699
+ " - The body text and explanations\n" +
700
+ " - Problem statements\n" +
701
+ " - Student answers\n" +
702
+ " - Sample problems and their solutions\n" +
703
+ "3. For sample problems that ASK you to compute an answer, compute the answer " +
704
+ " and include it as a fraction using / notation.\n" +
705
+ "4. Order ALL fractions by the order they appear in the image (top to bottom, left to right).\n" +
706
+ "5. Include fractions even if they repeat.\n" +
707
+ "6. Do NOT simplify fractions unless the sample problem specifically asks for simplification.\n\n" +
708
+ "First, describe everything you see in the image line by line.\n" +
709
+ "Then list every fraction you found.\n" +
710
+ "Finally, write your answer on the LAST LINE as:\n" +
711
+ "ANSWER: fraction1,fraction2,fraction3,...\n" +
712
+ "with NO spaces between fractions.");
713
+ }
714
+ // Default prompt
715
+ return ("Look at this image carefully and answer the following question.\n\n" +
716
+ `${task.prompt}\n\n` +
717
+ "CRITICAL: Return ONLY the final answer. No explanation, no reasoning, no extra text. " +
718
+ "Just the raw answer value.");
719
+ }
196
720
  function createNoopTextLlmClient(model) {
197
721
  return {
198
722
  provider: "none",
@@ -266,16 +790,46 @@ async function toolAugmentedAnswerFromImage(llm, task, opts) {
266
790
  if (!["png", "jpg", "jpeg", "webp"].includes(ext)) {
267
791
  throw new Error(`Unsupported attachment type for media lane: ${ext || "(unknown)"}`);
268
792
  }
269
- // "rag" mode: single deterministic OCR extract -> answer (more stable than agent loops).
793
+ // "rag" mode: tiered approach for best accuracy.
794
+ // Tier 1: Deterministic solver (fast, free, no API call) — proven for math/structured tasks
795
+ // Tier 1.5: Gemini code execution (image + Python sandbox) — for tasks needing computation
796
+ // Tier 2: Gemini vision (image sent directly to multimodal model) — for visual reasoning
797
+ // Tier 3: OCR + text LLM fallback
270
798
  if (toolsMode === "rag") {
799
+ const q = String(task.prompt ?? "").toLowerCase();
800
+ const isOcrHeavyTask = (q.includes("quiz is scored") && q.includes("bonus points")) ||
801
+ (q.includes("comma separated") && q.includes("fractions") && q.includes("sample"));
802
+ const useCodeExec = shouldUseCodeExecution(task);
803
+ // Tier 1: try deterministic solver first
271
804
  const deterministic = await tryDeterministicMediaSolve(toolIndex, task, localPath);
805
+ if (deterministic && !isOcrHeavyTask && !useCodeExec) {
806
+ // Deterministic is proven reliable for structured math tasks
807
+ return deterministic;
808
+ }
809
+ // Tier 1.5: Gemini code execution for tasks that need computational analysis
810
+ if (useCodeExec) {
811
+ const codeExecAnswer = await tryGeminiCodeExecutionAnswer(task, localPath, ext);
812
+ if (codeExecAnswer)
813
+ return { answer: codeExecAnswer, toolCalls: 1 };
814
+ // Fall through to chess consensus / vision if code execution fails
815
+ }
816
+ // Tier 2: Gemini vision
817
+ const visionAnswer = await tryGeminiVisionAnswer(task, localPath, ext);
818
+ if (deterministic && visionAnswer) {
819
+ if (isOcrHeavyTask) {
820
+ return { answer: visionAnswer, toolCalls: 1 };
821
+ }
822
+ return deterministic;
823
+ }
824
+ if (visionAnswer)
825
+ return { answer: visionAnswer, toolCalls: 1 };
272
826
  if (deterministic)
273
827
  return deterministic;
274
828
  // Offline fallback: if no LLM provider is configured, we cannot do OCR->LLM reasoning.
275
- // We still count deterministic solver coverage and mark unsupported tasks as unsolved.
276
829
  if (llm?.provider === "none") {
277
830
  return { answer: "", toolCalls: 0 };
278
831
  }
832
+ // Tier 3: OCR extract + text LLM
279
833
  const tool = toolIndex.get("read_image_ocr_text");
280
834
  if (!tool)
281
835
  throw new Error("Missing tool: read_image_ocr_text");
@@ -402,7 +956,7 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+media tools)", () => {
402
956
  if (!existsSync(fixturePath)) {
403
957
  throw new Error(`Missing GAIA media fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityMediaFixture.py`);
404
958
  }
405
- const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-2.5-flash";
959
+ const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
406
960
  const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
407
961
  // This harness is designed to run with a real LLM provider (Gemini/OpenAI/Anthropic).
408
962
  // In CI/agent environments, keys may be intentionally unavailable; allow a deterministic-only run
@@ -542,6 +1096,6 @@ describe("Capability: GAIA accuracy (LLM-only vs LLM+media tools)", () => {
542
1096
  }
543
1097
  // Minimal sanity: tools mode should not underperform baseline on this tiny sample.
544
1098
  expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
545
- }, 300000);
1099
+ }, 900000);
546
1100
  });
547
1101
  //# sourceMappingURL=gaiaCapabilityMediaEval.test.js.map