npm - nodebench-mcp - Versions diffs - 3.0.0 → 3.0.2 - Mend

nodebench-mcp 3.0.0 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

package/NODEBENCH_AGENTS.md +74 -67
package/README.md +36 -34
package/dist/dashboard/operatingDashboardHtml.js +2 -1
package/dist/dashboard/operatingDashboardHtml.js.map +1 -1
package/dist/dashboard/operatingServer.js +3 -2
package/dist/dashboard/operatingServer.js.map +1 -1
package/dist/db.js +51 -3
package/dist/db.js.map +1 -1
package/dist/index.js +19 -18
package/dist/index.js.map +1 -1
package/dist/packageInfo.d.ts +3 -0
package/dist/packageInfo.js +32 -0
package/dist/packageInfo.js.map +1 -0
package/dist/sandboxApi.js +2 -1
package/dist/sandboxApi.js.map +1 -1
package/dist/tools/boilerplateTools.js +10 -9
package/dist/tools/boilerplateTools.js.map +1 -1
package/dist/tools/documentationTools.js +2 -1
package/dist/tools/documentationTools.js.map +1 -1
package/dist/tools/progressiveDiscoveryTools.js +2 -1
package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
package/dist/tools/toolRegistry.js +11 -0
package/dist/tools/toolRegistry.js.map +1 -1
package/dist/toolsetRegistry.js +74 -1
package/dist/toolsetRegistry.js.map +1 -1
package/package.json +7 -6
package/scripts/install.sh +14 -14
package/dist/__tests__/analytics.test.d.ts +0 -11
package/dist/__tests__/analytics.test.js +0 -546
package/dist/__tests__/analytics.test.js.map +0 -1
package/dist/__tests__/architectComplex.test.d.ts +0 -1
package/dist/__tests__/architectComplex.test.js +0 -373
package/dist/__tests__/architectComplex.test.js.map +0 -1
package/dist/__tests__/architectSmoke.test.d.ts +0 -1
package/dist/__tests__/architectSmoke.test.js +0 -92
package/dist/__tests__/architectSmoke.test.js.map +0 -1
package/dist/__tests__/audit-registry.d.ts +0 -1
package/dist/__tests__/audit-registry.js +0 -60
package/dist/__tests__/audit-registry.js.map +0 -1
package/dist/__tests__/batchAutopilot.test.d.ts +0 -8
package/dist/__tests__/batchAutopilot.test.js +0 -218
package/dist/__tests__/batchAutopilot.test.js.map +0 -1
package/dist/__tests__/cliSubcommands.test.d.ts +0 -1
package/dist/__tests__/cliSubcommands.test.js +0 -138
package/dist/__tests__/cliSubcommands.test.js.map +0 -1
package/dist/__tests__/comparativeBench.test.d.ts +0 -1
package/dist/__tests__/comparativeBench.test.js +0 -722
package/dist/__tests__/comparativeBench.test.js.map +0 -1
package/dist/__tests__/critterCalibrationEval.d.ts +0 -8
package/dist/__tests__/critterCalibrationEval.js +0 -370
package/dist/__tests__/critterCalibrationEval.js.map +0 -1
package/dist/__tests__/dynamicLoading.test.d.ts +0 -1
package/dist/__tests__/dynamicLoading.test.js +0 -280
package/dist/__tests__/dynamicLoading.test.js.map +0 -1
package/dist/__tests__/embeddingProvider.test.d.ts +0 -1
package/dist/__tests__/embeddingProvider.test.js +0 -86
package/dist/__tests__/embeddingProvider.test.js.map +0 -1
package/dist/__tests__/evalDatasetBench.test.d.ts +0 -1
package/dist/__tests__/evalDatasetBench.test.js +0 -738
package/dist/__tests__/evalDatasetBench.test.js.map +0 -1
package/dist/__tests__/evalHarness.test.d.ts +0 -1
package/dist/__tests__/evalHarness.test.js +0 -1107
package/dist/__tests__/evalHarness.test.js.map +0 -1
package/dist/__tests__/fixtures/bfcl_v3_long_context.sample.json +0 -264
package/dist/__tests__/fixtures/generateBfclLongContextFixture.d.ts +0 -10
package/dist/__tests__/fixtures/generateBfclLongContextFixture.js +0 -135
package/dist/__tests__/fixtures/generateBfclLongContextFixture.js.map +0 -1
package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.d.ts +0 -14
package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js +0 -189
package/dist/__tests__/fixtures/generateSwebenchVerifiedFixture.js.map +0 -1
package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.d.ts +0 -16
package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js +0 -154
package/dist/__tests__/fixtures/generateToolbenchInstructionFixture.js.map +0 -1
package/dist/__tests__/fixtures/swebench_verified.sample.json +0 -162
package/dist/__tests__/fixtures/toolbench_instruction.sample.json +0 -109
package/dist/__tests__/forecastingDogfood.test.d.ts +0 -9
package/dist/__tests__/forecastingDogfood.test.js +0 -284
package/dist/__tests__/forecastingDogfood.test.js.map +0 -1
package/dist/__tests__/forecastingScoring.test.d.ts +0 -9
package/dist/__tests__/forecastingScoring.test.js +0 -202
package/dist/__tests__/forecastingScoring.test.js.map +0 -1
package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +0 -15
package/dist/__tests__/gaiaCapabilityAudioEval.test.js +0 -265
package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +0 -1
package/dist/__tests__/gaiaCapabilityEval.test.d.ts +0 -14
package/dist/__tests__/gaiaCapabilityEval.test.js +0 -1259
package/dist/__tests__/gaiaCapabilityEval.test.js.map +0 -1
package/dist/__tests__/gaiaCapabilityFilesEval.test.d.ts +0 -15
package/dist/__tests__/gaiaCapabilityFilesEval.test.js +0 -914
package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +0 -1
package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +0 -15
package/dist/__tests__/gaiaCapabilityMediaEval.test.js +0 -1101
package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +0 -1
package/dist/__tests__/helpers/answerMatch.d.ts +0 -41
package/dist/__tests__/helpers/answerMatch.js +0 -267
package/dist/__tests__/helpers/answerMatch.js.map +0 -1
package/dist/__tests__/helpers/textLlm.d.ts +0 -25
package/dist/__tests__/helpers/textLlm.js +0 -214
package/dist/__tests__/helpers/textLlm.js.map +0 -1
package/dist/__tests__/localDashboard.test.d.ts +0 -1
package/dist/__tests__/localDashboard.test.js +0 -226
package/dist/__tests__/localDashboard.test.js.map +0 -1
package/dist/__tests__/multiHopDogfood.test.d.ts +0 -12
package/dist/__tests__/multiHopDogfood.test.js +0 -303
package/dist/__tests__/multiHopDogfood.test.js.map +0 -1
package/dist/__tests__/openDatasetParallelEval.test.d.ts +0 -7
package/dist/__tests__/openDatasetParallelEval.test.js +0 -209
package/dist/__tests__/openDatasetParallelEval.test.js.map +0 -1
package/dist/__tests__/openDatasetParallelEvalGaia.test.d.ts +0 -7
package/dist/__tests__/openDatasetParallelEvalGaia.test.js +0 -279
package/dist/__tests__/openDatasetParallelEvalGaia.test.js.map +0 -1
package/dist/__tests__/openDatasetParallelEvalSwebench.test.d.ts +0 -7
package/dist/__tests__/openDatasetParallelEvalSwebench.test.js +0 -220
package/dist/__tests__/openDatasetParallelEvalSwebench.test.js.map +0 -1
package/dist/__tests__/openDatasetParallelEvalToolbench.test.d.ts +0 -7
package/dist/__tests__/openDatasetParallelEvalToolbench.test.js +0 -218
package/dist/__tests__/openDatasetParallelEvalToolbench.test.js.map +0 -1
package/dist/__tests__/openDatasetPerfComparison.test.d.ts +0 -10
package/dist/__tests__/openDatasetPerfComparison.test.js +0 -318
package/dist/__tests__/openDatasetPerfComparison.test.js.map +0 -1
package/dist/__tests__/openclawDogfood.test.d.ts +0 -23
package/dist/__tests__/openclawDogfood.test.js +0 -535
package/dist/__tests__/openclawDogfood.test.js.map +0 -1
package/dist/__tests__/openclawMessaging.test.d.ts +0 -14
package/dist/__tests__/openclawMessaging.test.js +0 -232
package/dist/__tests__/openclawMessaging.test.js.map +0 -1
package/dist/__tests__/presetRealWorldBench.test.d.ts +0 -1
package/dist/__tests__/presetRealWorldBench.test.js +0 -859
package/dist/__tests__/presetRealWorldBench.test.js.map +0 -1
package/dist/__tests__/tools.test.d.ts +0 -1
package/dist/__tests__/tools.test.js +0 -3201
package/dist/__tests__/tools.test.js.map +0 -1
package/dist/__tests__/toolsetGatingEval.test.d.ts +0 -1
package/dist/__tests__/toolsetGatingEval.test.js +0 -1099
package/dist/__tests__/toolsetGatingEval.test.js.map +0 -1
package/dist/__tests__/traceabilityDogfood.test.d.ts +0 -12
package/dist/__tests__/traceabilityDogfood.test.js +0 -241
package/dist/__tests__/traceabilityDogfood.test.js.map +0 -1
package/dist/__tests__/webmcpTools.test.d.ts +0 -7
package/dist/__tests__/webmcpTools.test.js +0 -195
package/dist/__tests__/webmcpTools.test.js.map +0 -1
package/dist/benchmarks/testProviderBus.d.ts +0 -7
package/dist/benchmarks/testProviderBus.js +0 -272
package/dist/benchmarks/testProviderBus.js.map +0 -1
package/dist/hooks/postCompaction.d.ts +0 -14
package/dist/hooks/postCompaction.js +0 -51
package/dist/hooks/postCompaction.js.map +0 -1
package/dist/security/__tests__/security.test.d.ts +0 -8
package/dist/security/__tests__/security.test.js +0 -295
package/dist/security/__tests__/security.test.js.map +0 -1
package/dist/sync/hyperloopEval.test.d.ts +0 -4
package/dist/sync/hyperloopEval.test.js +0 -60
package/dist/sync/hyperloopEval.test.js.map +0 -1
package/dist/sync/store.test.d.ts +0 -4
package/dist/sync/store.test.js +0 -43
package/dist/sync/store.test.js.map +0 -1
package/dist/tools/documentTools.d.ts +0 -5
package/dist/tools/documentTools.js +0 -524
package/dist/tools/documentTools.js.map +0 -1
package/dist/tools/financialTools.d.ts +0 -10
package/dist/tools/financialTools.js +0 -403
package/dist/tools/financialTools.js.map +0 -1
package/dist/tools/memoryTools.d.ts +0 -5
package/dist/tools/memoryTools.js +0 -137
package/dist/tools/memoryTools.js.map +0 -1
package/dist/tools/planningTools.d.ts +0 -5
package/dist/tools/planningTools.js +0 -147
package/dist/tools/planningTools.js.map +0 -1
package/dist/tools/searchTools.d.ts +0 -5
package/dist/tools/searchTools.js +0 -145
package/dist/tools/searchTools.js.map +0 -1

package/dist/__tests__/gaiaCapabilityFilesEval.test.js DELETED Viewed

@@ -1,914 +0,0 @@
-/**
- * GAIA file-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local file tools.
- *
- * This lane targets GAIA tasks that include attachments (PDF / XLSX / CSV).
- * We provide deterministic local parsing via NodeBench MCP tools and score answers against
- * the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
- *
- * Safety:
- * - GAIA is gated. Do not commit fixtures that contain prompts/answers.
- * - This test logs only task IDs and aggregate metrics (no prompt/answer text).
- *
- * Disabled by default (cost + rate limits). Run with:
- *   NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
- */
-import { describe, expect, it } from "vitest";
-import { existsSync, readFileSync } from "node:fs";
-import { mkdir, readFile, writeFile } from "node:fs/promises";
-import path from "node:path";
-import { fileURLToPath } from "node:url";
-import { performance } from "node:perf_hooks";
-import { localFileTools } from "../tools/localFileTools.js";
-import { createTextLlmClient, generateTextFromHistory, } from "./helpers/textLlm.js";
-import { answersMatchWithJudge, autoDiscoverJudge } from "./helpers/answerMatch.js";
-const shouldRun = process.env.NODEBENCH_RUN_GAIA_CAPABILITY === "1";
-const shouldWriteReport = process.env.NODEBENCH_WRITE_GAIA_REPORT === "1";
-async function safeWriteJson(filePath, payload) {
-    try {
-        await mkdir(path.dirname(filePath), { recursive: true });
-        await writeFile(filePath, JSON.stringify(payload, null, 2) + "\n", "utf8");
-    }
-    catch (err) {
-        console.warn(`[gaia-capability-files] report write failed: ${err?.message ?? String(err)}`);
-    }
-}
-function resolveRepoRoot() {
-    const testDir = path.dirname(fileURLToPath(import.meta.url));
-    return path.resolve(testDir, "../../../..");
-}
-function resolveCapabilityFilesFixturePath() {
-    const override = process.env.NODEBENCH_GAIA_CAPABILITY_FILES_FIXTURE_PATH;
-    if (override) {
-        if (path.isAbsolute(override))
-            return override;
-        const repoRoot = resolveRepoRoot();
-        return path.resolve(repoRoot, override);
-    }
-    const config = process.env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
-    const split = process.env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
-    const repoRoot = resolveRepoRoot();
-    return path.join(repoRoot, ".cache", "gaia", `gaia_capability_files_${config}_${split}.sample.json`);
-}
-function loadDotEnvLocalIfPresent() {
-    const repoRoot = resolveRepoRoot();
-    const envPath = path.join(repoRoot, ".env.local");
-    if (!existsSync(envPath))
-        return;
-    const text = readFileSync(envPath, "utf8");
-    for (const rawLine of text.split(/\r?\n/)) {
-        const line = rawLine.trim();
-        if (!line || line.startsWith("#"))
-            continue;
-        const idx = line.indexOf("=");
-        if (idx <= 0)
-            continue;
-        const key = line.slice(0, idx).trim();
-        let value = line.slice(idx + 1).trim();
-        if ((value.startsWith("\"") && value.endsWith("\"")) ||
-            (value.startsWith("'") && value.endsWith("'"))) {
-            value = value.slice(1, -1);
-        }
-        if (!process.env[key])
-            process.env[key] = value;
-    }
-}
-function toIntegerOrNullLoose(value) {
-    if (value === null || value === undefined)
-        return null;
-    if (typeof value === "number" && Number.isFinite(value))
-        return Math.trunc(value);
-    const s = String(value).trim();
-    if (!s)
-        return null;
-    const m = s.match(/-?\d+/);
-    if (!m)
-        return null;
-    const n = Number.parseInt(m[0], 10);
-    return Number.isFinite(n) ? n : null;
-}
-function deriveAddressParityIfRelevant(taskPrompt, extract) {
-    const wantsParity = /\bodd\b|\beven\b|odd-?numbered|even-?numbered|parity/i.test(String(taskPrompt ?? ""));
-    if (!wantsParity)
-        return null;
-    const headers = Array.isArray(extract?.headers) ? extract.headers.map((h) => String(h ?? "")) : [];
-    const rows = Array.isArray(extract?.rows) ? extract.rows : [];
-    if (headers.length === 0 || rows.length === 0)
-        return null;
-    const addrIdx = headers.findIndex((h) => /address/i.test(h));
-    if (addrIdx < 0)
-        return null;
-    let integerCount = 0;
-    let evenCount = 0;
-    let oddCount = 0;
-    for (const r of rows) {
-        const n = toIntegerOrNullLoose(Array.isArray(r) ? r[addrIdx] : null);
-        if (n === null)
-            continue;
-        integerCount++;
-        if (Math.abs(n) % 2 === 0)
-            evenCount++;
-        else
-            oddCount++;
-    }
-    return {
-        column: headers[addrIdx],
-        columnIndex: addrIdx,
-        integerCount,
-        evenCount,
-        oddCount,
-    };
-}
-function inferAnswerFromAddressParityIfPossible(taskPrompt, parity) {
-    const p = String(taskPrompt ?? "").toLowerCase();
-    if (!p)
-        return null;
-    const oddEast = /odd[^.]*east/.test(p);
-    const oddWest = /odd[^.]*west/.test(p);
-    const evenEast = /even[^.]*east/.test(p);
-    const evenWest = /even[^.]*west/.test(p);
-    const lastSunrise = p.lastIndexOf("sunrise");
-    const lastSunset = p.lastIndexOf("sunset");
-    if (lastSunrise === -1 && lastSunset === -1)
-        return null;
-    // If both appear, assume the one mentioned last is what the question asks for.
-    const wantsSunrise = lastSunrise > lastSunset;
-    const wantsSunset = lastSunset > lastSunrise;
-    let desiredDirection = wantsSunrise ? "east" : wantsSunset ? "west" : null;
-    if (!desiredDirection)
-        return null;
-    // Some tasks specify the awning is for the *back* of the house, while the
-    // prompt gives the facing direction for the street address (front). In that
-    // case, invert the facing direction.
-    const mentionsBackOfHouse = /\bback\b/.test(p) && /\bhouse\b/.test(p);
-    if (mentionsBackOfHouse) {
-        desiredDirection = desiredDirection === "east" ? "west" : "east";
-    }
-    // Map parity -> facing direction when explicitly stated.
-    const oddFaces = oddEast ? "east" : oddWest ? "west" : null;
-    const evenFaces = evenEast ? "east" : evenWest ? "west" : null;
-    if (!oddFaces || !evenFaces)
-        return null;
-    if (evenFaces === desiredDirection)
-        return String(parity.evenCount);
-    if (oddFaces === desiredDirection)
-        return String(parity.oddCount);
-    return null;
-}
-async function llmGenerateText(llm, history) {
-    const temperature = Number.parseFloat(process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0");
-    return generateTextFromHistory(llm, history, {
-        temperature: Number.isFinite(temperature) ? temperature : 0,
-        maxOutputTokens: 1024,
-    });
-}
-async function baselineAnswer(llm, task) {
-    const contents = [
-        {
-            role: "user",
-            parts: [
-                {
-                    text: `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`,
-                },
-            ],
-        },
-    ];
-    return llmGenerateText(llm, contents);
-}
-function buildToolIndex() {
-    const byName = new Map();
-    for (const tool of localFileTools)
-        byName.set(tool.name, tool);
-    return byName;
-}
-function extractJsonObject(text) {
-    const trimmed = text.trim();
-    const fenceMatch = trimmed.match(/```json\s*([\s\S]*?)\s*```/i);
-    const candidate = fenceMatch ? fenceMatch[1] : trimmed;
-    const start = candidate.indexOf("{");
-    const end = candidate.lastIndexOf("}");
-    if (start === -1 || end === -1 || end <= start)
-        return null;
-    const slice = candidate.slice(start, end + 1);
-    try {
-        return JSON.parse(slice);
-    }
-    catch {
-        return null;
-    }
-}
-function resolveTaskLocalFilePath(task) {
-    const repoRoot = resolveRepoRoot();
-    const rel = String(task.localFilePath ?? "").trim();
-    if (rel)
-        return path.resolve(repoRoot, rel);
-    // Fallback to the standard cache layout used by the fixture generator.
-    const filePath = String(task.filePath ?? "").trim();
-    if (!filePath)
-        throw new Error("Task missing filePath/localFilePath");
-    return path.join(repoRoot, ".cache", "gaia", "data", filePath);
-}
-async function toolAugmentedAnswerFromFile(llm, task, opts) {
-    const toolIndex = buildToolIndex();
-    const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
-    const localPath = resolveTaskLocalFilePath(task);
-    if (!existsSync(localPath)) {
-        throw new Error(`Missing attachment on disk. Expected at ${localPath}. Refresh with dataset:gaia:capability:files:refresh`);
-    }
-    const ext = String(task.fileExt ?? "").trim().toLowerCase() ||
-        path.extname(task.fileName || task.filePath || "").toLowerCase().replace(/^\./, "");
-    // "rag" mode: single deterministic file extract -> answer (more stable than agent loops).
-    if (toolsMode === "rag") {
-        let extract;
-        if (ext === "csv") {
-            const tool = toolIndex.get("read_csv_file");
-            if (!tool)
-                throw new Error("Missing tool: read_csv_file");
-            extract = await tool.handler({
-                path: localPath,
-                hasHeader: true,
-                maxRows: 500,
-                maxCols: 80,
-                maxCellChars: 2000,
-            });
-        }
-        else if (ext === "xlsx") {
-            const tool = toolIndex.get("read_xlsx_file");
-            if (!tool)
-                throw new Error("Missing tool: read_xlsx_file");
-            extract = await tool.handler({
-                path: localPath,
-                headerRow: 1,
-                maxRows: 500,
-                maxCols: 80,
-                maxCellChars: 2000,
-            });
-        }
-        else if (ext === "pdf") {
-            const tool = toolIndex.get("read_pdf_text");
-            if (!tool)
-                throw new Error("Missing tool: read_pdf_text");
-            extract = await tool.handler({
-                path: localPath,
-                pageStart: 1,
-                pageEnd: 12,
-                maxChars: 40000,
-            });
-        }
-        else if (ext === "docx") {
-            const tool = toolIndex.get("read_docx_text");
-            if (!tool)
-                throw new Error("Missing tool: read_docx_text");
-            extract = await tool.handler({
-                path: localPath,
-                maxChars: 40000,
-            });
-        }
-        else if (ext === "pptx") {
-            const tool = toolIndex.get("read_pptx_text");
-            if (!tool)
-                throw new Error("Missing tool: read_pptx_text");
-            extract = await tool.handler({
-                path: localPath,
-                maxChars: 40000,
-            });
-        }
-        else if (ext === "json") {
-            const tool = toolIndex.get("read_json_file");
-            if (!tool)
-                throw new Error("Missing tool: read_json_file");
-            extract = await tool.handler({
-                path: localPath,
-                maxDepth: 10,
-                maxItems: 300,
-                maxStringChars: 2000,
-            });
-        }
-        else if (ext === "jsonl") {
-            const tool = toolIndex.get("read_jsonl_file");
-            if (!tool)
-                throw new Error("Missing tool: read_jsonl_file");
-            extract = await tool.handler({
-                path: localPath,
-                offsetLines: 0,
-                limitLines: 200,
-                parseJson: true,
-                maxDepth: 8,
-                maxItems: 200,
-                maxStringChars: 1000,
-            });
-        }
-        else if (ext === "txt" || ext === "md" || ext === "xml") {
-            const tool = toolIndex.get("read_text_file");
-            if (!tool)
-                throw new Error("Missing tool: read_text_file");
-            extract = await tool.handler({
-                path: localPath,
-                startChar: 0,
-                maxChars: 40000,
-            });
-        }
-        else if (ext === "zip") {
-            throw new Error('ZIP attachments are only supported in toolsMode="agent" (requires multi-step extraction).');
-        }
-        else {
-            throw new Error(`Unsupported attachment type: ${ext || "(unknown)"}`);
-        }
-        // Keep the model input bounded. The tools already return bounded previews,
-        // but JSON stringification can still be large on wide tables.
-        const derivedParity = deriveAddressParityIfRelevant(task.prompt, extract);
-        const inferredFromParity = derivedParity && (ext === "csv" || ext === "xlsx")
-            ? inferAnswerFromAddressParityIfPossible(task.prompt, derivedParity)
-            : null;
-        if (inferredFromParity) {
-            return { answer: inferredFromParity, toolCalls: 1 };
-        }
-        const enrichedExtract = derivedParity && (ext === "csv" || ext === "xlsx")
-            ? { ...extract, derivedParity: { address: derivedParity } }
-            : extract;
-        const extractText = JSON.stringify(enrichedExtract).slice(0, 40000);
-        const contents = [
-            {
-                role: "user",
-                parts: [
-                    {
-                        text: "Answer the question using the provided file extract plus general reasoning. " +
-                            "Do not browse the web and do not read any other files. " +
-                            "Reminder: the sun rises in the east and sets in the west. " +
-                            "If FILE_EXTRACT_JSON contains derivedParity, prefer it over recounting. " +
-                            "If the extract is insufficient, make the best supported guess.\n\n" +
-                            "Return ONLY the final answer, no explanation.\n\n" +
-                            `TASK_ID: ${task.id}\n` +
-                            `FILE_TYPE: ${ext}\n` +
-                            `LOCAL_FILE_PATH: ${localPath}\n` +
-                            `QUESTION:\n${task.prompt}\n\n` +
-                            `FILE_EXTRACT_JSON:\n${extractText}`,
-                    },
-                ],
-            },
-        ];
-        const answer = await llmGenerateText(llm, contents);
-        return { answer, toolCalls: 1 };
-    }
-    // "agent" mode: small tool loop. This is more realistic but higher variance.
-    const toolUsageSummary = [
-        "You have access to deterministic local file tools:",
-        "- where ops: eq, ne, contains, starts_with, ends_with, matches_regex, gt, gte, lt, lte, is_empty, not_empty, is_even, is_odd",
-        "- Prefer deterministic aggregations (csv_aggregate/xlsx_aggregate) over mental math. For parity rules, use where op is_even/is_odd.",
-        "- read_csv_file({path,hasHeader,delimiter,encoding,maxRows,maxCols,maxCellChars})",
-        "- csv_select_rows({path,hasHeader,delimiter,encoding,where,returnColumns,offset,limit,maxScanRows,maxCols,maxCellChars})",
-        "- csv_aggregate({path,hasHeader,delimiter,encoding,where,operation,value,ignoreNonNumeric,returnRow,returnColumns,maxScanRows,maxCols,maxCellChars})",
-        "- read_xlsx_file({path,sheetName,headerRow,rangeA1,maxRows,maxCols,maxCellChars})",
-        "- xlsx_select_rows({path,sheetName,headerRow,rangeA1,where,returnColumns,offset,limit,maxScanRows,maxCols,maxCellChars})",
-        "- xlsx_aggregate({path,sheetName,headerRow,rangeA1,where,operation,value,ignoreNonNumeric,returnRow,returnColumns,maxScanRows,maxCols,maxCellChars})",
-        "- read_pdf_text({path,pageStart,pageEnd,pageNumbers,maxChars})",
-        "- pdf_search_text({path,query,caseSensitive,pageStart,pageEnd,pageNumbers,maxMatches,snippetChars})",
-        "- read_text_file({path,encoding,startChar,maxChars})",
-        "- read_json_file({path,maxDepth,maxItems,maxStringChars})",
-        "- json_select({path,pointer,maxDepth,maxItems,maxStringChars})",
-        "- read_jsonl_file({path,encoding,offsetLines,limitLines,parseJson,maxLineChars,maxDepth,maxItems,maxStringChars})",
-        "- zip_list_files({path,maxEntries})",
-        "- zip_read_text_file({path,innerPath,caseSensitive,encoding,maxChars,maxBytes})",
-        "- zip_extract_file({path,innerPath,caseSensitive,outputDir,overwrite,maxBytes})",
-        "- read_docx_text({path,maxChars,maxBytes})",
-        "- read_pptx_text({path,maxChars,maxSlides,maxBytesPerSlide})",
-        "",
-        "When using tools, respond with a single JSON object only:",
-        "{\"action\":\"tool\",\"name\":\"read_pdf_text\",\"arguments\":{\"pageStart\":1,\"pageEnd\":5}}",
-        "When done, respond with:",
-        "{\"action\":\"final\",\"answer\":\"...\"}",
-        "",
-        "Rules:",
-        "- Do NOT use any external knowledge or web browsing.",
-        "- Always use the provided LOCAL_FILE_PATH; you may not read any other files.",
-        "- Keep tool results bounded (limit<=200, maxRows<=500, maxCols<=80, maxCellChars<=2000, maxChars<=40000, maxMatches<=50).",
-        "- Do NOT include any explanation. Final answer must match the requested formatting.",
-    ].join("\n");
-    const contents = [
-        {
-            role: "user",
-            parts: [
-                {
-                    text: `${toolUsageSummary}\n\nTASK_ID: ${task.id}\nFILE_TYPE: ${ext}\nLOCAL_FILE_PATH: ${localPath}\nQUESTION:\n${task.prompt}`,
-                },
-            ],
-        },
-    ];
-    let toolCalls = 0;
-    for (let step = 0; step < opts.maxSteps; step++) {
-        const out = await llmGenerateText(llm, contents);
-        contents.push({ role: "model", parts: [{ text: out }] });
-        const parsed = extractJsonObject(out);
-        if (!parsed || typeof parsed !== "object") {
-            contents.push({
-                role: "user",
-                parts: [{ text: "Invalid format. Return JSON only with action tool|final." }],
-            });
-            continue;
-        }
-        if (parsed.action === "final") {
-            const answer = String(parsed.answer ?? "").trim();
-            return { answer, toolCalls };
-        }
-        if (parsed.action !== "tool") {
-            contents.push({
-                role: "user",
-                parts: [{ text: "Invalid action. Return JSON only with action tool|final." }],
-            });
-            continue;
-        }
-        if (toolCalls >= opts.maxToolCalls) {
-            contents.push({
-                role: "user",
-                parts: [{ text: "Tool call budget exceeded. Return final answer now." }],
-            });
-            continue;
-        }
-        const name = String(parsed.name ?? "");
-        const tool = toolIndex.get(name);
-        if (!tool) {
-            contents.push({
-                role: "user",
-                parts: [
-                    {
-                        text: `Unknown tool "${name}". Use only read_csv_file, csv_select_rows, csv_aggregate, read_xlsx_file, xlsx_select_rows, xlsx_aggregate, read_pdf_text, or pdf_search_text.`,
-                    },
-                ],
-            });
-            continue;
-        }
-        const args = (parsed.arguments ?? {});
-        // Security: enforce file access restrictions.
-        // Default: force the path to the known GAIA attachment.
-        // ZIP: allow tools to operate on extracted children under a per-task extracted dir.
-        const extractedRoot = path.join(resolveRepoRoot(), ".cache", "gaia", "extracted", task.id);
-        const isZip = ext === "zip";
-        const isZipTool = ["zip_list_files", "zip_read_text_file", "zip_extract_file"].includes(name);
-        if (!isZip) {
-            args.path = localPath;
-        }
-        else if (isZipTool) {
-            args.path = localPath;
-            // Force deterministic extracted root to keep gated data under .cache/gaia (gitignored).
-            if (name === "zip_extract_file") {
-                args.outputDir = extractedRoot;
-            }
-        }
-        else {
-            const requested = String(args.path ?? "").trim();
-            if (!requested) {
-                contents.push({
-                    role: "user",
-                    parts: [
-                        {
-                            text: "ZIP workflow: first call zip_list_files, then zip_extract_file(innerPath=...), " +
-                                "then call a reader tool on the extractedPath returned by zip_extract_file.",
-                        },
-                    ],
-                });
-                continue;
-            }
-            const requestedAbs = path.isAbsolute(requested)
-                ? requested
-                : path.resolve(path.dirname(localPath), requested);
-            const extractedAbs = path.resolve(extractedRoot);
-            const reqResolved = path.resolve(requestedAbs);
-            if (!reqResolved.startsWith(extractedAbs + path.sep) && reqResolved !== extractedAbs) {
-                contents.push({
-                    role: "user",
-                    parts: [{ text: `Refusing to read path outside extractedRoot: ${reqResolved}` }],
-                });
-                continue;
-            }
-            args.path = reqResolved;
-        }
-        // Hard limits for safety and stable prompts.
-        if (name === "read_csv_file") {
-            if (args.hasHeader === undefined)
-                args.hasHeader = true;
-            if (typeof args.maxRows !== "number")
-                args.maxRows = 200;
-            if (typeof args.maxCols !== "number")
-                args.maxCols = 50;
-            if (typeof args.maxCellChars !== "number")
-                args.maxCellChars = 2000;
-            args.maxRows = Math.min(Number(args.maxRows) || 200, 500);
-            args.maxCols = Math.min(Number(args.maxCols) || 50, 80);
-            args.maxCellChars = Math.min(Number(args.maxCellChars) || 2000, 2000);
-        }
-        else if (name === "csv_select_rows") {
-            if (args.hasHeader === undefined)
-                args.hasHeader = true;
-            if (typeof args.offset !== "number")
-                args.offset = 0;
-            if (typeof args.limit !== "number")
-                args.limit = 50;
-            if (typeof args.maxScanRows !== "number")
-                args.maxScanRows = 50000;
-            if (typeof args.maxCols !== "number")
-                args.maxCols = 80;
-            if (typeof args.maxCellChars !== "number")
-                args.maxCellChars = 2000;
-            args.offset = Math.max(0, Number(args.offset) || 0);
-            args.limit = Math.min(Math.max(1, Number(args.limit) || 50), 200);
-            args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
-            args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 80), 80);
-            args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
-            if (Array.isArray(args.where))
-                args.where = args.where.slice(0, 10);
-            if (Array.isArray(args.returnColumns))
-                args.returnColumns = args.returnColumns.slice(0, 30);
-        }
-        else if (name === "csv_aggregate") {
-            if (args.hasHeader === undefined)
-                args.hasHeader = true;
-            if (typeof args.maxScanRows !== "number")
-                args.maxScanRows = 50000;
-            if (typeof args.maxCols !== "number")
-                args.maxCols = 200;
-            if (typeof args.maxCellChars !== "number")
-                args.maxCellChars = 2000;
-            args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
-            args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 200), 200);
-            args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
-            if (Array.isArray(args.where))
-                args.where = args.where.slice(0, 10);
-            if (Array.isArray(args.returnColumns))
-                args.returnColumns = args.returnColumns.slice(0, 30);
-        }
-        else if (name === "read_xlsx_file") {
-            if (typeof args.headerRow !== "number")
-                args.headerRow = 1;
-            if (typeof args.maxRows !== "number")
-                args.maxRows = 200;
-            if (typeof args.maxCols !== "number")
-                args.maxCols = 50;
-            if (typeof args.maxCellChars !== "number")
-                args.maxCellChars = 2000;
-            args.maxRows = Math.min(Number(args.maxRows) || 200, 500);
-            args.maxCols = Math.min(Number(args.maxCols) || 50, 80);
-            args.maxCellChars = Math.min(Number(args.maxCellChars) || 2000, 2000);
-        }
-        else if (name === "xlsx_select_rows") {
-            if (typeof args.headerRow !== "number")
-                args.headerRow = 1;
-            if (typeof args.offset !== "number")
-                args.offset = 0;
-            if (typeof args.limit !== "number")
-                args.limit = 50;
-            if (typeof args.maxScanRows !== "number")
-                args.maxScanRows = 50000;
-            if (typeof args.maxCols !== "number")
-                args.maxCols = 80;
-            if (typeof args.maxCellChars !== "number")
-                args.maxCellChars = 2000;
-            args.headerRow = Math.max(0, Math.min(Number(args.headerRow) || 1, 1000));
-            args.offset = Math.max(0, Number(args.offset) || 0);
-            args.limit = Math.min(Math.max(1, Number(args.limit) || 50), 200);
-            args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
-            args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 80), 80);
-            args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
-            if (Array.isArray(args.where))
-                args.where = args.where.slice(0, 10);
-            if (Array.isArray(args.returnColumns))
-                args.returnColumns = args.returnColumns.slice(0, 30);
-        }
-        else if (name === "xlsx_aggregate") {
-            if (typeof args.headerRow !== "number")
-                args.headerRow = 1;
-            if (typeof args.maxScanRows !== "number")
-                args.maxScanRows = 50000;
-            if (typeof args.maxCols !== "number")
-                args.maxCols = 200;
-            if (typeof args.maxCellChars !== "number")
-                args.maxCellChars = 2000;
-            args.headerRow = Math.max(0, Math.min(Number(args.headerRow) || 1, 1000));
-            args.maxScanRows = Math.min(Math.max(1, Number(args.maxScanRows) || 50000), 50000);
-            args.maxCols = Math.min(Math.max(1, Number(args.maxCols) || 200), 200);
-            args.maxCellChars = Math.min(Math.max(20, Number(args.maxCellChars) || 2000), 2000);
-            if (Array.isArray(args.where))
-                args.where = args.where.slice(0, 10);
-            if (Array.isArray(args.returnColumns))
-                args.returnColumns = args.returnColumns.slice(0, 30);
-        }
-        else if (name === "read_pdf_text") {
-            if (typeof args.pageStart !== "number")
-                args.pageStart = 1;
-            if (typeof args.pageEnd !== "number")
-                args.pageEnd = 3;
-            if (typeof args.maxChars !== "number")
-                args.maxChars = 12000;
-            args.pageStart = Math.max(1, Math.min(Number(args.pageStart) || 1, 500));
-            args.pageEnd = Math.max(1, Math.min(Number(args.pageEnd) || 3, 500));
-            args.maxChars = Math.min(Number(args.maxChars) || 12000, 40000);
-            if (Array.isArray(args.pageNumbers)) {
-                // Keep explicit page lists short to avoid huge extracts.
-                args.pageNumbers = args.pageNumbers
-                    .map((n) => Number(n))
-                    .filter((n) => Number.isFinite(n) && n > 0)
-                    .slice(0, 20);
-            }
-        }
-        else if (name === "pdf_search_text") {
-            if (typeof args.query !== "string")
-                args.query = "";
-            if (typeof args.pageStart !== "number")
-                args.pageStart = 1;
-            if (typeof args.pageEnd !== "number")
-                args.pageEnd = 25;
-            if (typeof args.maxMatches !== "number")
-                args.maxMatches = 25;
-            if (typeof args.snippetChars !== "number")
-                args.snippetChars = 180;
-            args.pageStart = Math.max(1, Math.min(Number(args.pageStart) || 1, 500));
-            args.pageEnd = Math.max(1, Math.min(Number(args.pageEnd) || 25, 500));
-            args.maxMatches = Math.min(Math.max(1, Number(args.maxMatches) || 25), 50);
-            args.snippetChars = Math.min(Math.max(40, Number(args.snippetChars) || 180), 400);
-            if (Array.isArray(args.pageNumbers)) {
-                args.pageNumbers = args.pageNumbers
-                    .map((n) => Number(n))
-                    .filter((n) => Number.isFinite(n) && n > 0)
-                    .slice(0, 20);
-            }
-        }
-        else if (name === "read_text_file") {
-            if (typeof args.startChar !== "number")
-                args.startChar = 0;
-            if (typeof args.maxChars !== "number")
-                args.maxChars = 12000;
-            args.startChar = Math.max(0, Number(args.startChar) || 0);
-            args.maxChars = Math.min(Math.max(1, Number(args.maxChars) || 12000), 40000);
-        }
-        else if (name === "read_json_file" || name === "json_select") {
-            if (typeof args.maxDepth !== "number")
-                args.maxDepth = 8;
-            if (typeof args.maxItems !== "number")
-                args.maxItems = 200;
-            if (typeof args.maxStringChars !== "number")
-                args.maxStringChars = 2000;
-            args.maxDepth = Math.min(Math.max(1, Number(args.maxDepth) || 8), 12);
-            args.maxItems = Math.min(Math.max(1, Number(args.maxItems) || 200), 500);
-            args.maxStringChars = Math.min(Math.max(20, Number(args.maxStringChars) || 2000), 2000);
-            if (name === "json_select" && typeof args.pointer !== "string")
-                args.pointer = "";
-        }
-        else if (name === "read_jsonl_file") {
-            if (typeof args.offsetLines !== "number")
-                args.offsetLines = 0;
-            if (typeof args.limitLines !== "number")
-                args.limitLines = 200;
-            if (typeof args.maxLineChars !== "number")
-                args.maxLineChars = 4000;
-            if (typeof args.maxDepth !== "number")
-                args.maxDepth = 6;
-            if (typeof args.maxItems !== "number")
-                args.maxItems = 100;
-            if (typeof args.maxStringChars !== "number")
-                args.maxStringChars = 1000;
-            args.offsetLines = Math.max(0, Number(args.offsetLines) || 0);
-            args.limitLines = Math.min(Math.max(1, Number(args.limitLines) || 200), 500);
-            args.maxLineChars = Math.min(Math.max(200, Number(args.maxLineChars) || 4000), 10000);
-            args.maxDepth = Math.min(Math.max(1, Number(args.maxDepth) || 6), 10);
-            args.maxItems = Math.min(Math.max(1, Number(args.maxItems) || 100), 300);
-            args.maxStringChars = Math.min(Math.max(20, Number(args.maxStringChars) || 1000), 2000);
-        }
-        else if (name === "zip_list_files") {
-            if (typeof args.maxEntries !== "number")
-                args.maxEntries = 200;
-            args.maxEntries = Math.min(Math.max(1, Number(args.maxEntries) || 200), 500);
-        }
-        else if (name === "zip_read_text_file") {
-            if (typeof args.innerPath !== "string")
-                args.innerPath = "";
-            if (typeof args.maxChars !== "number")
-                args.maxChars = 12000;
-            if (typeof args.maxBytes !== "number")
-                args.maxBytes = 5000000;
-            args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 20000);
-            args.maxBytes = Math.min(Math.max(1000, Number(args.maxBytes) || 5000000), 20000000);
-        }
-        else if (name === "zip_extract_file") {
-            if (typeof args.innerPath !== "string")
-                args.innerPath = "";
-            if (typeof args.maxBytes !== "number")
-                args.maxBytes = 25000000;
-            args.maxBytes = Math.min(Math.max(1000, Number(args.maxBytes) || 25000000), 50000000);
-            args.overwrite = false;
-        }
-        else if (name === "read_docx_text") {
-            if (typeof args.maxChars !== "number")
-                args.maxChars = 12000;
-            args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 40000);
-        }
-        else if (name === "read_pptx_text") {
-            if (typeof args.maxChars !== "number")
-                args.maxChars = 12000;
-            if (typeof args.maxSlides !== "number")
-                args.maxSlides = 60;
-            args.maxChars = Math.min(Math.max(200, Number(args.maxChars) || 12000), 40000);
-            args.maxSlides = Math.min(Math.max(1, Number(args.maxSlides) || 60), 120);
-        }
-        // Reduce model confusion: enforce tool matches the attachment type.
-        const allowedByExt = (ext === "csv" && ["read_csv_file", "csv_select_rows", "csv_aggregate"].includes(name)) ||
-            (ext === "xlsx" && ["read_xlsx_file", "xlsx_select_rows", "xlsx_aggregate"].includes(name)) ||
-            (ext === "pdf" && ["read_pdf_text", "pdf_search_text"].includes(name)) ||
-            (ext === "docx" && ["read_docx_text"].includes(name)) ||
-            (ext === "pptx" && ["read_pptx_text"].includes(name)) ||
-            ((ext === "txt" || ext === "md" || ext === "xml") && ["read_text_file"].includes(name)) ||
-            (ext === "json" && ["read_json_file", "json_select", "read_text_file"].includes(name)) ||
-            (ext === "jsonl" && ["read_jsonl_file", "read_text_file"].includes(name)) ||
-            (ext === "zip" &&
-                [
-                    "zip_list_files",
-                    "zip_read_text_file",
-                    "zip_extract_file",
-                    "read_csv_file",
-                    "csv_select_rows",
-                    "csv_aggregate",
-                    "read_xlsx_file",
-                    "xlsx_select_rows",
-                    "xlsx_aggregate",
-                    "read_pdf_text",
-                    "pdf_search_text",
-                    "read_text_file",
-                    "read_json_file",
-                    "json_select",
-                    "read_jsonl_file",
-                    "read_docx_text",
-                    "read_pptx_text",
-                ].includes(name));
-        if (!allowedByExt) {
-            contents.push({
-                role: "user",
-                parts: [{ text: `Wrong tool for FILE_TYPE=${ext}. Use a tool that matches the file type.` }],
-            });
-            continue;
-        }
-        toolCalls++;
-        const toolResult = await tool.handler(args);
-        // Provide a bounded JSON summary to the model. Avoid dumping large content.
-        const toolResultText = JSON.stringify(toolResult).slice(0, 12000);
-        contents.push({
-            role: "user",
-            parts: [{ text: `TOOL_RESULT ${name}:\n${toolResultText}\n\nContinue. Return JSON only.` }],
-        });
-    }
-    // If we ran out of steps, force a final answer.
-    contents.push({
-        role: "user",
-        parts: [{ text: "Out of steps. Return final answer now as JSON." }],
-    });
-    const out = await llmGenerateText(llm, contents);
-    const parsed = extractJsonObject(out);
-    if (parsed?.action === "final") {
-        return { answer: String(parsed.answer ?? "").trim(), toolCalls };
-    }
-    return { answer: String(out ?? "").trim(), toolCalls };
-}
-async function loadFixture(fixturePath) {
-    const raw = await readFile(fixturePath, "utf8");
-    const parsed = JSON.parse(raw);
-    if (!parsed || !Array.isArray(parsed.tasks))
-        throw new Error("Invalid GAIA capability files fixture");
-    return parsed;
-}
-describe("Capability: GAIA accuracy (file-backed) (LLM-only vs LLM+local tools)", () => {
-    const testFn = shouldRun ? it : it.skip;
-    testFn("should measure accuracy delta on a small GAIA file-backed subset", async () => {
-        loadDotEnvLocalIfPresent();
-        const fixturePath = resolveCapabilityFilesFixturePath();
-        if (!existsSync(fixturePath)) {
-            throw new Error(`Missing GAIA capability files fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityFilesFixture.py`);
-        }
-        const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-3-flash-preview";
-        const toolsModel = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? baselineModel;
-        const baselineLlm = await createTextLlmClient({ model: baselineModel });
-        const toolsLlm = await createTextLlmClient({ model: toolsModel });
-        const baselineModelLabel = `${baselineLlm.provider}:${baselineLlm.model}`;
-        const toolsModelLabel = `${toolsLlm.provider}:${toolsLlm.model}`;
-        const fixture = await loadFixture(fixturePath);
-        expect(Array.isArray(fixture.tasks)).toBe(true);
-        expect(fixture.tasks.length).toBeGreaterThan(0);
-        const requestedLimit = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT ?? "6", 10);
-        const taskLimit = Math.max(1, Math.min(fixture.tasks.length, Number.isFinite(requestedLimit) ? requestedLimit : 6));
-        const tasks = fixture.tasks.slice(0, taskLimit);
-        const requestedConcurrency = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_CONCURRENCY ?? "1", 10);
-        const concurrency = Math.max(1, Math.min(tasks.length, Number.isFinite(requestedConcurrency) ? requestedConcurrency : 1));
-        const maxSteps = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_STEPS ?? "7", 10);
-        const maxToolCalls = Number.parseInt(process.env.NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS ?? "5", 10);
-        // Auto-discover judge (free OpenRouter → paid LLM → deterministic-only)
-        const judge = await autoDiscoverJudge(toolsLlm);
-        const results = new Array(tasks.length);
-        let nextIndex = 0;
-        const workers = Array.from({ length: concurrency }, () => (async () => {
-            while (true) {
-                const idx = nextIndex++;
-                if (idx >= tasks.length)
-                    return;
-                const task = tasks[idx];
-                try {
-                    const baseStart = performance.now();
-                    const base = await baselineAnswer(baselineLlm, task);
-                    const baseMs = performance.now() - baseStart;
-                    const toolsStart = performance.now();
-                    const tools = await toolAugmentedAnswerFromFile(toolsLlm, task, { maxSteps, maxToolCalls });
-                    const toolsMs = performance.now() - toolsStart;
-                    const baseJudge = await answersMatchWithJudge(task.expectedAnswer, base, judge);
-                    const toolsJudge = await answersMatchWithJudge(task.expectedAnswer, tools.answer, judge);
-                    results[idx] = {
-                        taskId: task.id,
-                        baselineCorrect: baseJudge.match,
-                        toolsCorrect: toolsJudge.match,
-                        baselineMs: baseMs,
-                        toolsMs,
-                        toolCalls: tools.toolCalls,
-                        judgeProvider: toolsJudge.judgeProvider,
-                        judgeInvoked: toolsJudge.judgeInvoked,
-                    };
-                }
-                catch (err) {
-                    results[idx] = {
-                        taskId: task.id,
-                        baselineCorrect: false,
-                        toolsCorrect: false,
-                        baselineMs: 0,
-                        toolsMs: 0,
-                        toolCalls: 0,
-                        error: err?.message ?? String(err),
-                    };
-                }
-            }
-        })());
-        await Promise.all(workers);
-        const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
-        const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
-        const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
-        const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
-        const avg = (values) => values.length === 0 ? 0 : values.reduce((a, b) => a + b, 0) / values.length;
-        const avgBaseMs = avg(results.map((r) => r.baselineMs).filter((n) => n > 0));
-        const avgToolsMs = avg(results.map((r) => r.toolsMs).filter((n) => n > 0));
-        const avgToolCalls = avg(results.map((r) => r.toolCalls));
-        console.log(`[gaia-capability-files] config=${fixture.config} split=${fixture.split} tasks=${tasks.length} concurrency=${concurrency} baseline=${baselineCorrect}/${tasks.length} tools=${toolsCorrect}/${tasks.length} improved=${improved} regressions=${regressions} avgBaselineMs=${avgBaseMs.toFixed(0)} avgToolsMs=${avgToolsMs.toFixed(0)} avgToolCalls=${avgToolCalls.toFixed(2)}`);
-        console.log(`[gaia-capability-files] perTask: ${results
-            .map((r) => `${r.taskId}:B${r.baselineCorrect ? "1" : "0"}T${r.toolsCorrect ? "1" : "0"}${r.error ? "E" : ""}`)
-            .join(" ")}`);
-        if (shouldWriteReport) {
-            const repoRoot = resolveRepoRoot();
-            const generatedAtIso = new Date().toISOString();
-            const stamp = generatedAtIso.replace(/[:.]/g, "-");
-            const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
-            const publicSummary = {
-                suiteId: "gaia_capability_files",
-                lane: "files",
-                generatedAtIso,
-                config: fixture.config,
-                split: fixture.split,
-                taskCount: tasks.length,
-                concurrency,
-                baseline: {
-                    model: baselineModelLabel,
-                    correct: baselineCorrect,
-                    passRatePct: tasks.length === 0 ? 0 : (baselineCorrect / tasks.length) * 100,
-                    avgMs: avgBaseMs,
-                },
-                tools: {
-                    model: toolsModelLabel,
-                    mode: toolsMode,
-                    correct: toolsCorrect,
-                    passRatePct: tasks.length === 0 ? 0 : (toolsCorrect / tasks.length) * 100,
-                    avgMs: avgToolsMs,
-                    avgToolCalls: avgToolCalls,
-                },
-                improved,
-                regressions,
-                notes: "GAIA is gated. This file contains only aggregate metrics (no prompt/answer text). Detailed per-task report is written under .cache/gaia/reports (gitignored).",
-            };
-            await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_files_latest.json"), publicSummary);
-            await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_files_${fixture.config}_${fixture.split}_${stamp}.json`), {
-                ...publicSummary,
-                perTask: results.map((r) => ({
-                    taskId: r.taskId,
-                    baselineCorrect: r.baselineCorrect,
-                    toolsCorrect: r.toolsCorrect,
-                    baselineMs: r.baselineMs,
-                    toolsMs: r.toolsMs,
-                    toolCalls: r.toolCalls,
-                    error: r.error ?? null,
-                })),
-            });
-        }
-        const enforce = process.env.NODEBENCH_GAIA_CAPABILITY_ENFORCE === "1";
-        if (enforce) {
-            // For file-backed tasks, the baseline is expected to be low. We still enforce that
-            // tool-augmented performance is not worse than baseline and has at least one improvement.
-            const allowedRegression = Math.max(1, Math.floor(tasks.length * 0.2));
-            expect(improved).toBeGreaterThanOrEqual(1);
-            expect(toolsCorrect).toBeGreaterThanOrEqual(baselineCorrect - allowedRegression);
-            expect(toolsCorrect).toBeGreaterThanOrEqual(1);
-        }
-        else {
-            expect(results.length).toBe(tasks.length);
-            expect(results.some((r) => r.error)).toBe(false);
-        }
-    }, 300000);
-});
-//# sourceMappingURL=gaiaCapabilityFilesEval.test.js.map