nodebench-mcp 2.8.1 → 2.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +72 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.d.ts +15 -0
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +291 -0
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -0
- package/dist/__tests__/gaiaCapabilityMediaEval.test.d.ts +15 -0
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +421 -0
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -0
- package/dist/__tests__/tools.test.js +153 -4
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/tools/localFileTools.d.ts +1 -0
- package/dist/tools/localFileTools.js +353 -0
- package/dist/tools/localFileTools.js.map +1 -1
- package/dist/tools/toolRegistry.js +93 -6
- package/dist/tools/toolRegistry.js.map +1 -1
- package/package.json +10 -5
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GAIA media-backed capability/accuracy benchmark: LLM-only vs LLM+NodeBench MCP local OCR tools.
|
|
3
|
+
*
|
|
4
|
+
* This lane targets GAIA tasks that include image attachments (PNG/JPG/WEBP).
|
|
5
|
+
* We provide deterministic local OCR via NodeBench MCP tools and score answers against
|
|
6
|
+
* the ground-truth "Final answer" (stored locally under `.cache/gaia`, gitignored).
|
|
7
|
+
*
|
|
8
|
+
* Safety:
|
|
9
|
+
* - GAIA is gated. Do not commit fixtures that contain prompts/answers.
|
|
10
|
+
* - This test logs only task IDs and aggregate metrics (no prompt/answer text).
|
|
11
|
+
*
|
|
12
|
+
* Disabled by default (cost + rate limits). Run with:
|
|
13
|
+
* NODEBENCH_RUN_GAIA_CAPABILITY=1 npm --prefix packages/mcp-local run test
|
|
14
|
+
*/
|
|
15
|
+
import { describe, expect, it } from "vitest";
|
|
16
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
17
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { fileURLToPath } from "node:url";
|
|
20
|
+
import { performance } from "node:perf_hooks";
|
|
21
|
+
import { localFileTools } from "../tools/localFileTools.js";
|
|
22
|
+
// Opt-in switches read from the environment; a flag counts as "on" only when it is exactly "1".
const isEnvFlagEnabled = (flagName) => process.env[flagName] === "1";
// Gates the whole suite (it is skipped unless enabled).
const shouldRun = isEnvFlagEnabled("NODEBENCH_RUN_GAIA_CAPABILITY");
// Gates writing the JSON reports after the run.
const shouldWriteReport = isEnvFlagEnabled("NODEBENCH_WRITE_GAIA_REPORT");
|
|
24
|
+
/**
 * Best-effort JSON writer for benchmark reports.
 *
 * Creates the parent directory (recursively) and writes `payload` as
 * 2-space-indented JSON followed by a newline. Any failure is reported with
 * `console.warn` and swallowed, so a report problem never fails the caller.
 *
 * @param {string} filePath - Destination file path.
 * @param {unknown} payload - Value to serialize.
 * @returns {Promise<void>}
 */
async function safeWriteJson(filePath, payload) {
    try {
        const parentDir = path.dirname(filePath);
        await mkdir(parentDir, { recursive: true });
        const serialized = `${JSON.stringify(payload, null, 2)}\n`;
        await writeFile(filePath, serialized, "utf8");
    }
    catch (failure) {
        const reason = failure?.message ?? String(failure);
        console.warn(`[gaia-capability-media] report write failed: ${reason}`);
    }
}
|
|
33
|
+
/**
 * Locate the repository root relative to this module.
 *
 * @returns {string} Absolute path four directory levels above the directory
 *   that contains this file.
 */
function resolveRepoRoot() {
    const thisFile = fileURLToPath(import.meta.url);
    const thisDir = path.dirname(thisFile);
    return path.resolve(thisDir, "..", "..", "..", "..");
}
|
|
37
|
+
/**
 * Work out where the GAIA media fixture JSON lives.
 *
 * If NODEBENCH_GAIA_CAPABILITY_MEDIA_FIXTURE_PATH is set it wins: absolute
 * values are returned unchanged, relative ones are resolved against the repo
 * root. Otherwise the path is
 * `<repoRoot>/.cache/gaia/gaia_capability_media_<config>_<split>.sample.json`,
 * where config/split come from NODEBENCH_GAIA_CAPABILITY_CONFIG (default
 * "2023_all") and NODEBENCH_GAIA_CAPABILITY_SPLIT (default "validation").
 *
 * @returns {string} Path to the fixture file (its existence is not checked here).
 */
function resolveCapabilityMediaFixturePath() {
    const { env } = process;
    const explicitPath = env.NODEBENCH_GAIA_CAPABILITY_MEDIA_FIXTURE_PATH;
    if (explicitPath) {
        return path.isAbsolute(explicitPath)
            ? explicitPath
            : path.resolve(resolveRepoRoot(), explicitPath);
    }
    const datasetConfig = env.NODEBENCH_GAIA_CAPABILITY_CONFIG ?? "2023_all";
    const datasetSplit = env.NODEBENCH_GAIA_CAPABILITY_SPLIT ?? "validation";
    const fixtureFileName = `gaia_capability_media_${datasetConfig}_${datasetSplit}.sample.json`;
    return path.join(resolveRepoRoot(), ".cache", "gaia", fixtureFileName);
}
|
|
50
|
+
/**
 * Populate `process.env` from `<repoRoot>/.env.local` when that file exists.
 *
 * Parsing rules:
 * - blank lines and lines starting with "#" are ignored;
 * - each remaining line is split at its first "="; lines with no "=" or an
 *   empty key are ignored;
 * - key and value are trimmed, and one pair of matching surrounding quotes
 *   (single or double) is removed from the value;
 * - a variable is only assigned when its current value is unset or empty, so
 *   values already present in the environment take precedence.
 *
 * @returns {void}
 */
function loadDotEnvLocalIfPresent() {
    const envFile = path.join(resolveRepoRoot(), ".env.local");
    if (!existsSync(envFile)) {
        return;
    }
    const isWrappedIn = (candidate, quote) => candidate.startsWith(quote) && candidate.endsWith(quote);
    const entries = readFileSync(envFile, "utf8")
        .split(/\r?\n/)
        .map((rawLine) => rawLine.trim())
        .filter((entry) => entry !== "" && !entry.startsWith("#"));
    for (const entry of entries) {
        const separatorAt = entry.indexOf("=");
        if (separatorAt <= 0) {
            continue;
        }
        const name = entry.slice(0, separatorAt).trim();
        const rawValue = entry.slice(separatorAt + 1).trim();
        const isQuoted = isWrappedIn(rawValue, "\"") || isWrappedIn(rawValue, "'");
        const value = isQuoted ? rawValue.slice(1, -1) : rawValue;
        if (!process.env[name]) {
            process.env[name] = value;
        }
    }
}
|
|
73
|
+
/**
 * Probe whether a module specifier can be dynamically imported.
 *
 * @param {string} pkg - Module specifier passed to `import()`.
 * @returns {Promise<boolean>} `true` when the import resolves, `false` when it rejects.
 */
async function canImport(pkg) {
    return import(pkg).then(() => true, () => false);
}
|
|
82
|
+
/**
 * Canonicalize an answer so model output and ground truth can be compared
 * with strict string equality.
 *
 * Steps: coerce to string, trim, drop carriage returns, collapse whitespace
 * runs to a single space, strip one leading and one trailing quote character,
 * strip trailing periods, trim again, lowercase.
 *
 * Non-string input is coerced instead of throwing: JSON fixtures can carry
 * numeric answers, and `null`/`undefined` normalize to the empty string.
 * The second trim removes padding that only becomes leading/trailing once the
 * wrapping quotes or trailing periods are gone (e.g. `" x "` -> `x`).
 *
 * @param {unknown} value - Raw answer text (or any JSON scalar).
 * @returns {string} Normalized, lowercased answer.
 */
function normalizeAnswer(value) {
    const text = typeof value === "string" ? value : String(value ?? "");
    return text
        .trim()
        .replace(/\r/g, "")
        .replace(/\s+/g, " ")
        .replace(/^["']|["']$/g, "")
        .replace(/[.]+$/g, "")
        .trim()
        .toLowerCase();
}
|
|
91
|
+
/**
 * Create a Google GenAI client from environment credentials.
 *
 * The SDK is loaded lazily via dynamic import. The API key is taken from
 * GEMINI_API_KEY, falling back to GOOGLE_AI_API_KEY.
 *
 * @returns {Promise<object>} A `GoogleGenAI` instance constructed with `{ apiKey }`.
 * @throws {Error} When neither environment variable holds a non-empty value
 *   (or when the SDK import itself fails).
 */
async function createGeminiClient() {
    const { GoogleGenAI } = await import("@google/genai");
    const candidateKeys = [process.env.GEMINI_API_KEY, process.env.GOOGLE_AI_API_KEY];
    const apiKey = candidateKeys.find(Boolean) ?? "";
    if (apiKey === "") {
        throw new Error("Missing GEMINI_API_KEY (or GOOGLE_AI_API_KEY)");
    }
    return new GoogleGenAI({ apiKey });
}
|
|
100
|
+
/**
 * Run one `generateContent` call and return the text of the first candidate.
 *
 * Temperature comes from NODEBENCH_GAIA_CAPABILITY_TEMPERATURE (default 0;
 * unparseable values also fall back to 0). Output is capped at 1024 tokens.
 *
 * @param {object} ai - Client exposing `models.generateContent(request)`.
 * @param {string} model - Model identifier forwarded to the client.
 * @param {Array} contents - Conversation turns forwarded to the client.
 * @returns {Promise<string>} The `text` of every part of the first candidate,
 *   concatenated and trimmed; "" when the response has no parts.
 */
async function geminiGenerateText(ai, model, contents) {
    const rawTemperature = process.env.NODEBENCH_GAIA_CAPABILITY_TEMPERATURE ?? "0";
    const parsedTemperature = Number.parseFloat(rawTemperature);
    const request = {
        model,
        contents,
        config: {
            temperature: Number.isFinite(parsedTemperature) ? parsedTemperature : 0,
            maxOutputTokens: 1024,
        },
    };
    const response = await ai.models.generateContent(request);
    const firstCandidateParts = response?.candidates?.[0]?.content?.parts ?? [];
    // Parts without a text field contribute nothing to the result.
    const combined = firstCandidateParts.reduce((acc, part) => acc + (part?.text ?? ""), "");
    return combined.trim();
}
|
|
114
|
+
/**
 * Ask the baseline model to answer a task from the prompt text alone (no tools).
 *
 * The model is NODEBENCH_GAIA_BASELINE_MODEL, defaulting to "gemini-2.5-flash".
 *
 * @param {object} ai - Client passed through to `geminiGenerateText`.
 * @param {{prompt: string}} task - Task whose `prompt` is embedded in the question.
 * @returns {Promise<string>} The model's answer text.
 */
async function baselineAnswer(ai, task) {
    const baselineModel = process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-2.5-flash";
    const promptText = `Answer the question using your existing knowledge only. Do not browse the web.\n\nReturn ONLY the final answer, no explanation.\n\nQuestion:\n${task.prompt}`;
    const userTurn = { role: "user", parts: [{ text: promptText }] };
    return geminiGenerateText(ai, baselineModel, [userTurn]);
}
|
|
127
|
+
/**
 * Index the imported `localFileTools` by tool name.
 *
 * @returns {Map<string, object>} Map from `tool.name` to the tool; if two
 *   tools share a name, the later one wins.
 */
function buildToolIndex() {
    return new Map(Array.from(localFileTools, (tool) => [tool.name, tool]));
}
|
|
133
|
+
/**
 * Pull a JSON object out of free-form model output.
 *
 * If the text contains a ```json fenced block, only the fence contents are
 * searched; otherwise the whole trimmed text is. The candidate is the span
 * from the first "{" to the last "}".
 *
 * @param {string} text - Raw model output.
 * @returns {any|null} The parsed value, or `null` when no brace-delimited span
 *   exists or the span is not valid JSON.
 */
function extractJsonObject(text) {
    const body = text.trim();
    const fenced = /```json\s*([\s\S]*?)\s*```/i.exec(body);
    const haystack = fenced === null ? body : fenced[1];
    const openAt = haystack.indexOf("{");
    const closeAt = haystack.lastIndexOf("}");
    // A usable span needs an opening brace followed later by a closing brace.
    const hasBraceSpan = openAt !== -1 && closeAt > openAt;
    if (!hasBraceSpan) {
        return null;
    }
    try {
        return JSON.parse(haystack.slice(openAt, closeAt + 1));
    }
    catch {
        return null;
    }
}
|
|
149
|
+
/**
 * Read and minimally validate a GAIA capability fixture file.
 *
 * @param {string} fixturePath - Path to a UTF-8 JSON file.
 * @returns {Promise<object>} The parsed fixture, which has an array `tasks` property.
 * @throws {Error} "Invalid GAIA capability fixture" when the parsed value is
 *   falsy or has no `tasks` array; read and JSON syntax errors propagate as-is.
 */
async function loadFixture(fixturePath) {
    const fixture = JSON.parse(await readFile(fixturePath, "utf8"));
    if (!Array.isArray(fixture?.tasks)) {
        throw new Error("Invalid GAIA capability fixture");
    }
    return fixture;
}
|
|
156
|
+
/**
 * Resolve the on-disk location of a task's attachment.
 *
 * `task.localFilePath` (when non-blank) is resolved against the repo root.
 * Otherwise `task.filePath` is placed under `<repoRoot>/.cache/gaia/data`.
 *
 * @param {{localFilePath?: string, filePath?: string}} task - Fixture task entry.
 * @returns {string} Path to the attachment (existence is not checked here).
 * @throws {Error} When both fields are missing or blank.
 */
function resolveTaskLocalFilePath(task) {
    const cleaned = (raw) => String(raw ?? "").trim();
    const repoRoot = resolveRepoRoot();
    const explicitRelative = cleaned(task.localFilePath);
    if (explicitRelative !== "") {
        return path.resolve(repoRoot, explicitRelative);
    }
    const datasetRelative = cleaned(task.filePath);
    if (datasetRelative === "") {
        throw new Error("Task missing filePath/localFilePath");
    }
    return path.join(repoRoot, ".cache", "gaia", "data", datasetRelative);
}
|
|
166
|
+
/**
 * Turn model-supplied tool arguments into a safe argument object for
 * `read_image_ocr_text`.
 *
 * - Anything that is not a plain object (string, number, array, null) is
 *   replaced by `{}`; assigning a property to a primitive would otherwise
 *   throw a TypeError in strict-mode (ESM) code.
 * - `path` is always overwritten with `localPath`, so the model cannot point
 *   the tool at a different file.
 * - `maxChars` is kept only when it is a finite number >= 1, capped at 40000;
 *   every other value becomes 40000.
 *
 * @param {unknown} rawArgs - The `arguments` value parsed from the model's JSON.
 * @param {string} localPath - Attachment path the tool is allowed to read.
 * @returns {object} A fresh argument object (the input is not mutated).
 */
function sanitizeOcrToolArguments(rawArgs, localPath) {
    const isPlainObject = rawArgs !== null && typeof rawArgs === "object" && !Array.isArray(rawArgs);
    const args = isPlainObject ? { ...rawArgs } : {};
    // Security: enforce file access restrictions.
    args.path = localPath;
    const requested = args.maxChars;
    const isUsableLimit = typeof requested === "number" && Number.isFinite(requested) && requested >= 1;
    args.maxChars = isUsableLimit ? Math.min(requested, 40000) : 40000;
    return args;
}
/**
 * Answer a GAIA image task with help from the local `read_image_ocr_text` tool.
 *
 * The mode comes from NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE (default "rag"):
 * - "rag": run OCR once and ask the model to answer from the extract.
 * - any other value: run a small JSON-protocol tool loop in which the model
 *   may request OCR calls before returning a final answer.
 *
 * The model is NODEBENCH_GAIA_TOOLS_MODEL (default "gemini-2.5-flash").
 *
 * @param {object} ai - Client passed through to `geminiGenerateText`.
 * @param {object} task - Fixture task; reads `id`, `prompt`, `fileExt`,
 *   `fileName`, `filePath` and (via `resolveTaskLocalFilePath`) `localFilePath`.
 * @param {{maxSteps: number, maxToolCalls: number}} opts - Agent-loop limits:
 *   maximum model turns and maximum executed tool calls.
 * @returns {Promise<{answer: string, toolCalls: number}>} The answer text and
 *   the number of tool invocations performed.
 * @throws {Error} When the attachment is missing on disk, its extension is not
 *   png/jpg/jpeg/webp, or (rag mode) the OCR tool is not registered.
 */
async function toolAugmentedAnswerFromImage(ai, task, opts) {
    const toolIndex = buildToolIndex();
    const model = process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? "gemini-2.5-flash";
    const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
    const localPath = resolveTaskLocalFilePath(task);
    if (!existsSync(localPath)) {
        throw new Error(`Missing attachment on disk. Expected at ${localPath}. Refresh with dataset:gaia:capability:media:refresh`);
    }
    // Accept both "png" and ".png" in the fixture's fileExt, matching how the
    // extension derived from the file name has its leading dot stripped.
    const ext = String(task.fileExt ?? "").trim().toLowerCase().replace(/^\./, "") ||
        path.extname(task.fileName || task.filePath || "").toLowerCase().replace(/^\./, "");
    if (!["png", "jpg", "jpeg", "webp"].includes(ext)) {
        throw new Error(`Unsupported attachment type for media lane: ${ext || "(unknown)"}`);
    }
    // "rag" mode: single deterministic OCR extract -> answer (more stable than agent loops).
    if (toolsMode === "rag") {
        const tool = toolIndex.get("read_image_ocr_text");
        if (!tool)
            throw new Error("Missing tool: read_image_ocr_text");
        const extract = await tool.handler({
            path: localPath,
            lang: "eng",
            preprocess: true,
            maxChars: 40000,
        });
        // Bound the serialized extract so the prompt size stays predictable.
        const extractText = JSON.stringify(extract).slice(0, 40000);
        const contents = [
            {
                role: "user",
                parts: [
                    {
                        text: "Answer the question using ONLY the provided OCR extract. " +
                            "If the extract is insufficient, make the best supported guess.\n\n" +
                            "Return ONLY the final answer, no explanation.\n\n" +
                            `TASK_ID: ${task.id}\n` +
                            `FILE_TYPE: ${ext}\n` +
                            `LOCAL_FILE_PATH: ${localPath}\n` +
                            `QUESTION:\n${task.prompt}\n\n` +
                            `OCR_EXTRACT_JSON:\n${extractText}`,
                    },
                ],
            },
        ];
        const answer = await geminiGenerateText(ai, model, contents);
        return { answer, toolCalls: 1 };
    }
    // "agent" mode: small tool loop. This is more realistic but higher variance.
    const toolUsageSummary = [
        "You have access to deterministic local media tools:",
        "- read_image_ocr_text({path,lang,langPath,preprocess,maxChars})",
        "",
        "When using tools, respond with a single JSON object only:",
        "{\"action\":\"tool\",\"name\":\"read_image_ocr_text\",\"arguments\":{\"maxChars\":20000}}",
        "When done, respond with:",
        "{\"action\":\"final\",\"answer\":\"...\"}",
        "",
        "Rules:",
        "- Do NOT use any external knowledge or web browsing.",
        "- Always use the provided LOCAL_FILE_PATH; you may not read any other files.",
        "- Keep tool results bounded (maxChars<=40000).",
        "- Do NOT include any explanation. Final answer must match the requested formatting.",
    ].join("\n");
    const contents = [
        {
            role: "user",
            parts: [
                {
                    text: `${toolUsageSummary}\n\nTASK_ID: ${task.id}\nFILE_TYPE: ${ext}\nLOCAL_FILE_PATH: ${localPath}\nQUESTION:\n${task.prompt}`,
                },
            ],
        },
    ];
    let toolCalls = 0;
    for (let step = 0; step < opts.maxSteps; step++) {
        const out = await geminiGenerateText(ai, model, contents);
        // Record the model turn so every corrective message has context.
        contents.push({ role: "model", parts: [{ text: out }] });
        const parsed = extractJsonObject(out);
        if (!parsed || typeof parsed !== "object") {
            contents.push({
                role: "user",
                parts: [{ text: "Invalid format. Return JSON only with action tool|final." }],
            });
            continue;
        }
        if (parsed.action === "final") {
            const answer = String(parsed.answer ?? "").trim();
            return { answer, toolCalls };
        }
        if (parsed.action !== "tool") {
            contents.push({
                role: "user",
                parts: [{ text: "Invalid action. Return JSON only with action tool|final." }],
            });
            continue;
        }
        if (toolCalls >= opts.maxToolCalls) {
            contents.push({
                role: "user",
                parts: [{ text: "Tool call budget exceeded. Return final answer now." }],
            });
            continue;
        }
        const name = String(parsed.name ?? "");
        const tool = toolIndex.get(name);
        // Only the OCR tool may be invoked, even if other tools are registered.
        if (!tool || name !== "read_image_ocr_text") {
            contents.push({
                role: "user",
                parts: [{ text: `Unknown tool "${name}". Use only read_image_ocr_text.` }],
            });
            continue;
        }
        // Pin the path and bound maxChars regardless of what the model sent.
        const args = sanitizeOcrToolArguments(parsed.arguments, localPath);
        toolCalls++;
        const toolResult = await tool.handler(args);
        const toolResultText = JSON.stringify(toolResult).slice(0, 12000);
        contents.push({
            role: "user",
            parts: [{ text: `TOOL_RESULT ${name}:\n${toolResultText}\n\nContinue. Return JSON only.` }],
        });
    }
    // Step budget exhausted: request a final answer once more and accept raw
    // text if the model still does not follow the JSON protocol.
    contents.push({
        role: "user",
        parts: [{ text: "Out of steps. Return final answer now as JSON." }],
    });
    const out = await geminiGenerateText(ai, model, contents);
    const parsed = extractJsonObject(out);
    const answer = parsed && parsed.action === "final" ? String(parsed.answer ?? "").trim() : out.trim();
    return { answer, toolCalls };
}
|
|
299
|
+
// Benchmark suite: runs each GAIA image task through the baseline (LLM-only)
// and tool-augmented paths, scores both against the normalized expected
// answer, logs aggregate metrics, and optionally writes JSON reports.
// Skipped unless NODEBENCH_RUN_GAIA_CAPABILITY=1.
describe("Capability: GAIA accuracy (LLM-only vs LLM+media tools)", () => {
    const testFn = shouldRun ? it : it.skip;
    testFn("should measure accuracy delta on a small GAIA image subset", async () => {
        loadDotEnvLocalIfPresent();
        const fixturePath = resolveCapabilityMediaFixturePath();
        if (!existsSync(fixturePath)) {
            throw new Error(`Missing GAIA media fixture at ${fixturePath}. Generate it with: python packages/mcp-local/src/__tests__/fixtures/generateGaiaCapabilityMediaFixture.py`);
        }
        const hasGemini = await canImport("@google/genai");
        expect(hasGemini).toBe(true);
        const ai = await createGeminiClient();
        const fixture = await loadFixture(fixturePath);
        expect(Array.isArray(fixture.tasks)).toBe(true);
        expect(fixture.tasks.length).toBeGreaterThan(0);
        // Read an integer knob from the environment, falling back when it is
        // unset or unparseable. Applied uniformly so a typo in MAX_STEPS or
        // MAX_TOOL_CALLS cannot turn into NaN (zero agent steps / no tool budget).
        const readIntEnv = (name, fallback) => {
            const parsed = Number.parseInt(process.env[name] ?? "", 10);
            return Number.isFinite(parsed) ? parsed : fallback;
        };
        const taskLimit = Math.max(1, Math.min(fixture.tasks.length, readIntEnv("NODEBENCH_GAIA_CAPABILITY_TASK_LIMIT", 6)));
        const tasks = fixture.tasks.slice(0, taskLimit);
        const concurrency = Math.max(1, Math.min(tasks.length, readIntEnv("NODEBENCH_GAIA_CAPABILITY_CONCURRENCY", 1)));
        const maxSteps = readIntEnv("NODEBENCH_GAIA_CAPABILITY_MAX_STEPS", 7);
        const maxToolCalls = readIntEnv("NODEBENCH_GAIA_CAPABILITY_MAX_TOOL_CALLS", 3);
        const results = new Array(tasks.length);
        // Shared cursor: each worker claims the next unprocessed index. The
        // claim is synchronous, so no two workers take the same task.
        let nextIndex = 0;
        const workers = Array.from({ length: concurrency }, () => (async () => {
            while (true) {
                const idx = nextIndex++;
                if (idx >= tasks.length)
                    return;
                const task = tasks[idx];
                try {
                    // Inside the try so one malformed task is recorded as a
                    // failure instead of rejecting the whole run.
                    const expected = normalizeAnswer(task.expectedAnswer);
                    const baseStart = performance.now();
                    const base = await baselineAnswer(ai, task);
                    const baseMs = performance.now() - baseStart;
                    const toolsStart = performance.now();
                    const tools = await toolAugmentedAnswerFromImage(ai, task, { maxSteps, maxToolCalls });
                    const toolsMs = performance.now() - toolsStart;
                    const baselineCorrect = normalizeAnswer(base) === expected;
                    const toolsCorrect = normalizeAnswer(tools.answer) === expected;
                    results[idx] = {
                        taskId: task.id,
                        baselineCorrect,
                        toolsCorrect,
                        baselineMs: baseMs,
                        toolsMs,
                        toolCalls: tools.toolCalls,
                    };
                }
                catch (err) {
                    // An errored task counts as incorrect for both lanes.
                    results[idx] = {
                        taskId: task?.id,
                        baselineCorrect: false,
                        toolsCorrect: false,
                        baselineMs: 0,
                        toolsMs: 0,
                        toolCalls: 0,
                        error: err?.message ?? String(err),
                    };
                }
            }
        })());
        await Promise.all(workers);
        const baselineCorrect = results.filter((r) => r.baselineCorrect).length;
        const toolsCorrect = results.filter((r) => r.toolsCorrect).length;
        const baselinePassRate = (baselineCorrect / results.length) * 100;
        const toolsPassRate = (toolsCorrect / results.length) * 100;
        const avgBaseMs = results.reduce((sum, r) => sum + r.baselineMs, 0) / results.length;
        const avgToolsMs = results.reduce((sum, r) => sum + r.toolsMs, 0) / results.length;
        const avgToolCalls = results.reduce((sum, r) => sum + r.toolCalls, 0) / results.length;
        const improved = results.filter((r) => !r.baselineCorrect && r.toolsCorrect).length;
        const regressions = results.filter((r) => r.baselineCorrect && !r.toolsCorrect).length;
        // Human-readable console output (no prompts/answers).
        console.log(`[gaia-capability-media] tasks=${results.length} baseline=${baselineCorrect}/${results.length} (${baselinePassRate.toFixed(1)}%) tools=${toolsCorrect}/${results.length} (${toolsPassRate.toFixed(1)}%) delta=${(toolsPassRate - baselinePassRate).toFixed(1)}% improved=${improved} regressions=${regressions} avgToolCalls=${avgToolCalls.toFixed(2)}`);
        // Surface execution failures (task IDs only, no prompt/answer text) so
        // they are not mistaken for ordinary wrong answers.
        const erroredTaskIds = results.filter((r) => r.error).map((r) => r.taskId);
        if (erroredTaskIds.length > 0) {
            console.warn(`[gaia-capability-media] ${erroredTaskIds.length} task(s) failed to execute: ${erroredTaskIds.join(", ")}`);
        }
        const toolsMode = (process.env.NODEBENCH_GAIA_CAPABILITY_TOOLS_MODE ?? "rag").toLowerCase();
        const publicSummary = {
            suiteId: "gaia_capability_media",
            lane: "media",
            generatedAtIso: new Date().toISOString(),
            config: fixture.config,
            split: fixture.split,
            taskCount: results.length,
            concurrency,
            baseline: {
                model: process.env.NODEBENCH_GAIA_BASELINE_MODEL ?? "gemini-2.5-flash",
                correct: baselineCorrect,
                passRatePct: baselinePassRate,
                avgMs: avgBaseMs,
            },
            tools: {
                model: process.env.NODEBENCH_GAIA_TOOLS_MODEL ?? "gemini-2.5-flash",
                mode: toolsMode,
                correct: toolsCorrect,
                passRatePct: toolsPassRate,
                avgMs: avgToolsMs,
                avgToolCalls,
            },
            improved,
            regressions,
            notes: "GAIA media lane (image attachments). No prompts/answers persisted; only aggregate metrics are written to public/evals.",
        };
        if (shouldWriteReport) {
            const repoRoot = resolveRepoRoot();
            await safeWriteJson(path.join(repoRoot, "public", "evals", "gaia_capability_media_latest.json"), publicSummary);
            const detailed = {
                ...publicSummary,
                results: results.map((r) => ({
                    taskId: r.taskId,
                    baselineCorrect: r.baselineCorrect,
                    toolsCorrect: r.toolsCorrect,
                    baselineMs: Math.round(r.baselineMs),
                    toolsMs: Math.round(r.toolsMs),
                    toolCalls: r.toolCalls,
                    ...(r.error ? { error: r.error } : {}),
                })),
            };
            // ":" and "." are replaced so the timestamp is safe in a file name.
            const stamp = new Date().toISOString().replace(/[:.]/g, "-");
            await safeWriteJson(path.join(repoRoot, ".cache", "gaia", "reports", `gaia_capability_media_${fixture.config}_${fixture.split}_${stamp}.json`), detailed);
        }
        // If every task errored, both pass rates are 0% and the comparison
        // below would pass without measuring anything; fail instead.
        expect(erroredTaskIds.length).toBeLessThan(results.length);
        // Minimal sanity: tools mode should not underperform baseline on this tiny sample.
        expect(toolsPassRate).toBeGreaterThanOrEqual(baselinePassRate);
    });
});
|
|
421
|
+
//# sourceMappingURL=gaiaCapabilityMediaEval.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"gaiaCapabilityMediaEval.test.js","sourceRoot":"","sources":["../../src/__tests__/gaiaCapabilityMediaEval.test.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,QAAQ,CAAC;AAC9C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AA2C5D,MAAM,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,6BAA6B,KAAK,GAAG,CAAC;AACpE,MAAM,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,2BAA2B,KAAK,GAAG,CAAC;AAwB1E,KAAK,UAAU,aAAa,CAAC,QAAgB,EAAE,OAAgB;IAC7D,IAAI,CAAC;QACH,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzD,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,GAAG,IAAI,EAAE,MAAM,CAAC,CAAC;IAC7E,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,OAAO,CAAC,IAAI,CAAC,gDAAgD,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC9F,CAAC;AACH,CAAC;AAED,SAAS,eAAe;IACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC7D,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC;AAC9C,CAAC;AAED,SAAS,iCAAiC;IACxC,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,4CAA4C,CAAC;IAC1E,IAAI,QAAQ,EAAE,CAAC;QACb,IAAI,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;YAAE,OAAO,QAAQ,CAAC;QAC/C,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;QACnC,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAC1C,CAAC;IAED,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,gCAAgC,IAAI,UAAU,CAAC;IAC1E,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,+BAA+B,IAAI,YAAY,CAAC;IAC1E,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,yBAAyB,MAAM,IAAI,KAAK,cAAc,CAAC,CAAC;AACvG,CAAC;AAED,SAAS,wBAAwB;IAC/B,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAClD,IAAI,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO;IAEjC,MAAM,IAAI,GAAG,YAAY,CAAC,OAAO,EAAE,MAAM,CAAW,CAAC;IACrD,KAAK,MAAM,OAAO,IAAI,IAA
I,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1C,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAC5B,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,GAAG,IAAI,CAAC;YAAE,SAAS;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACvC,IACE,CAAC,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YAChD,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAC9C,CAAC;YACD,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC7B,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;IAClD,CAAC;AACH,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,GAAW;IAClC,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,GAAG,CAAC,CAAC;QAClB,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,KAAa;IACpC,OAAO,KAAK;SACT,IAAI,EAAE;SACN,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;SAClB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC;SAC3B,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,WAAW,EAAE,CAAC;AACnB,CAAC;AAED,KAAK,UAAU,kBAAkB;IAC/B,MAAM,GAAG,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;IAC1C,MAAM,EAAE,WAAW,EAAE,GAAG,GAAU,CAAC;IACnC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,EAAE,CAAC;IACjF,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,+CAA+C,CAAC,CAAC;IACnE,CAAC;IACD,OAAO,IAAI,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;AACrC,CAAC;AAED,KAAK,UAAU,kBAAkB,CAAC,EAAO,EAAE,KAAa,EAAE,QAAe;IACvE,MAAM,WAAW,GAAG,MAAM,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,CAAC,CAAC;IAChG,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,MAAM,CAAC,eAAe,CAAC;QAC/C,KAAK;QACL,QAAQ;QACR,MAAM,EAAE;YACN,WAAW,EAAE,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;YAC3D,eAAe,EAAE,IAAI;SACtB;KACF,CAAC,CAAC;IAEH,MAAM,KAAK,GAAI,QAAgB,EAAE,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,KAAK,IAAI,EAAE,CAAC;IACvE,MAAM,IAAI,GAAG,KAAK,C
AAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAClE,OAAO,IAAI,CAAC;AACd,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,EAAO,EAAE,IAAoB;IACzD,MAAM,QAAQ,GAAG;QACf;YACE,IAAI,EAAE,MAAe;YACrB,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,iJAAiJ,IAAI,CAAC,MAAM,EAAE;iBACrK;aACF;SACF;KACF,CAAC;IACF,OAAO,kBAAkB,CAAC,EAAE,EAAE,OAAO,CAAC,GAAG,CAAC,6BAA6B,IAAI,kBAAkB,EAAE,QAAQ,CAAC,CAAC;AAC3G,CAAC;AAED,SAAS,cAAc;IACrB,MAAM,MAAM,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC1C,KAAK,MAAM,IAAI,IAAI,cAAc;QAAE,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAC/D,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY;IACrC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5B,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IAChE,MAAM,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;IAEvD,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IACrC,MAAM,GAAG,GAAG,SAAS,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;IACvC,IAAI,KAAK,KAAK,CAAC,CAAC,IAAI,GAAG,KAAK,CAAC,CAAC,IAAI,GAAG,IAAI,KAAK;QAAE,OAAO,IAAI,CAAC;IAE5D,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC;IAC9C,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,WAAmB;IAC5C,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC;IAChD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAsB,CAAC;IACpD,IAAI,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,OAAO,CAAE,MAAc,CAAC,KAAK,CAAC;QAAE,MAAM,IAAI,KAAK,CAAC,iCAAiC,CAAC,CAAC;IACzG,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,wBAAwB,CAAC,IAAoB;IACpD,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;IACnC,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACpD,IAAI,GAAG;QAAE,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IAE5C,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACpD,IAAI,CAAC,QAAQ;QAAE,MAAM,IAAI,KAAK,CAAC,qCAAqC,CAAC,CAAC;IACtE,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;AACjE,CAAC;AAED,KAAK,UAAU,
4BAA4B,CACzC,EAAO,EACP,IAAoB,EACpB,IAAgD;IAEhD,MAAM,SAAS,GAAG,cAAc,EAAE,CAAC;IACnC,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,kBAAkB,CAAC;IAC3E,MAAM,SAAS,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC;IAE5F,MAAM,SAAS,GAAG,wBAAwB,CAAC,IAAI,CAAC,CAAC;IACjD,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,2CAA2C,SAAS,sDAAsD,CAC3G,CAAC;IACJ,CAAC;IAED,MAAM,GAAG,GACP,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE;QAC/C,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IAEtF,IAAI,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,IAAI,KAAK,CAAC,+CAA+C,GAAG,IAAI,WAAW,EAAE,CAAC,CAAC;IACvF,CAAC;IAED,yFAAyF;IACzF,IAAI,SAAS,KAAK,KAAK,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QAClD,IAAI,CAAC,IAAI;YAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QAChE,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC;YACjC,IAAI,EAAE,SAAS;YACf,IAAI,EAAE,KAAK;YACX,UAAU,EAAE,IAAI;YAChB,QAAQ,EAAE,KAAK;SAChB,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;QAE5D,MAAM,QAAQ,GAAG;YACf;gBACE,IAAI,EAAE,MAAe;gBACrB,KAAK,EAAE;oBACL;wBACE,IAAI,EACF,2DAA2D;4BAC3D,oEAAoE;4BACpE,mDAAmD;4BACnD,YAAY,IAAI,CAAC,EAAE,IAAI;4BACvB,cAAc,GAAG,IAAI;4BACrB,oBAAoB,SAAS,IAAI;4BACjC,cAAc,IAAI,CAAC,MAAM,MAAM;4BAC/B,sBAAsB,WAAW,EAAE;qBACtC;iBACF;aACF;SACF,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;QAC7D,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;IAClC,CAAC;IAED,6EAA6E;IAC7E,MAAM,gBAAgB,GAAG;QACvB,qDAAqD;QACrD,iEAAiE;QACjE,EAAE;QACF,2DAA2D;QAC3D,2FAA2F;QAC3F,0BAA0B;QAC1B,2CAA2C;QAC3C,EAAE;QACF,QAAQ;QACR,sDAAsD;QACtD,8EAA8E;QAC9E,gDAAgD;QAChD,qFAAqF;KACtF,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEb,MAAM,QAAQ,GAAsE;QAClF;YACE,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE;gBACL;oBACE,IAAI,EAAE,GAAG,gBAAgB,gBAAgB,IAAI,CAAC,EAAE,gBAAgB,GAAG,sBAAsB,SAAS,gBAAgB,IAAI,CAAC,MAAM,EAAE
;iBAChI;aACF;SACF;KACF,CAAC;IAEF,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE,CAAC;QAChD,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;QAC1D,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;QAEzD,MAAM,MAAM,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAC;QACtC,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;YAC1C,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,0DAA0D,EAAE,CAAC;aAC9E,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,IAAI,MAAM,CAAC,MAAM,KAAK,OAAO,EAAE,CAAC;YAC9B,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;YAClD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;QAC/B,CAAC;QAED,IAAI,MAAM,CAAC,MAAM,KAAK,MAAM,EAAE,CAAC;YAC7B,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,0DAA0D,EAAE,CAAC;aAC9E,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,IAAI,SAAS,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACnC,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,qDAAqD,EAAE,CAAC;aACzE,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;QACvC,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACjC,IAAI,CAAC,IAAI,IAAI,IAAI,KAAK,qBAAqB,EAAE,CAAC;YAC5C,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,iBAAiB,IAAI,kCAAkC,EAAE,CAAC;aAC3E,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,IAAI,EAAE,CAA4B,CAAC;QACjE,8CAA8C;QAC9C,IAAI,CAAC,IAAI,GAAG,SAAS,CAAC;QACtB,IAAI,OAAQ,IAAY,CAAC,QAAQ,KAAK,QAAQ;YAAG,IAAY,CAAC,QAAQ,GAAG,KAAK,CAAC;QAC9E,IAAY,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAE,IAAY,CAAC,QAAQ,CAAC,IAAI,KAAK,EAAE,KAAK,CAAC,CAAC;QAElF,SAAS,EAAE,CAAC;QACZ,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC5C,MAAM,cAAc,GAAG,IAAI,CAAC,SAAS,CAAC,UAAU,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;QAClE,QAAQ,CAAC,IAAI,CAAC;YACZ,IAAI,EAAE,MAAM;YACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,eAAe,IAAI,MAAM,cAAc,iCAAi
C,EAAE,CAAC;SAC5F,CAAC,CAAC;IACL,CAAC;IAED,QAAQ,CAAC,IAAI,CAAC;QACZ,IAAI,EAAE,MAAM;QACZ,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,gDAAgD,EAAE,CAAC;KACpE,CAAC,CAAC;IACH,MAAM,GAAG,GAAG,MAAM,kBAAkB,CAAC,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAC1D,MAAM,MAAM,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAC;IACtC,MAAM,MAAM,GAAG,MAAM,IAAI,MAAM,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;IACrG,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAC;AAC/B,CAAC;AAED,QAAQ,CAAC,yDAAyD,EAAE,GAAG,EAAE;IACvE,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC;IAExC,MAAM,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;QAC9E,wBAAwB,EAAE,CAAC;QAE3B,MAAM,WAAW,GAAG,iCAAiC,EAAE,CAAC;QACxD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CACb,iCAAiC,WAAW,4GAA4G,CACzJ,CAAC;QACJ,CAAC;QAED,MAAM,SAAS,GAAG,MAAM,SAAS,CAAC,eAAe,CAAC,CAAC;QACnD,MAAM,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE7B,MAAM,EAAE,GAAG,MAAM,kBAAkB,EAAE,CAAC;QAEtC,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAAC,CAAC;QAC/C,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAEhD,MAAM,cAAc,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QACpG,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CACxB,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CACrF,CAAC;QACF,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QAEhD,MAAM,oBAAoB,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,qCAAqC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC3G,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAC1B,CAAC,EACD,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,CAAC,CACzF,CAAC;QAEF,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,mCAAmC,IAAI,GAAG,EAAE,EAAE,CAAC,CAAC;QAC7F,MAAM,YAAY,GAAG,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,wCAAwC,IA
AI,GAAG,EAAE,EAAE,CAAC,CAAC;QAEtG,MAAM,OAAO,GAAmB,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACxD,IAAI,SAAS,GAAG,CAAC,CAAC;QAElB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,CACvD,CAAC,KAAK,IAAI,EAAE;YACV,OAAO,IAAI,EAAE,CAAC;gBACZ,MAAM,GAAG,GAAG,SAAS,EAAE,CAAC;gBACxB,IAAI,GAAG,IAAI,KAAK,CAAC,MAAM;oBAAE,OAAO;gBAEhC,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;gBACxB,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAEtD,IAAI,CAAC;oBACH,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACpC,MAAM,IAAI,GAAG,MAAM,cAAc,CAAC,EAAE,EAAE,IAAI,CAAC,CAAC;oBAC5C,MAAM,MAAM,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;oBAE7C,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;oBACrC,MAAM,KAAK,GAAG,MAAM,4BAA4B,CAAC,EAAE,EAAE,IAAI,EAAE,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC,CAAC;oBACvF,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC;oBAE/C,MAAM,eAAe,GAAG,eAAe,CAAC,IAAI,CAAC,KAAK,QAAQ,CAAC;oBAC3D,MAAM,YAAY,GAAG,eAAe,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,QAAQ,CAAC;oBAEhE,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe;wBACf,YAAY;wBACZ,UAAU,EAAE,MAAM;wBAClB,OAAO;wBACP,SAAS,EAAE,KAAK,CAAC,SAAS;qBAC3B,CAAC;gBACJ,CAAC;gBAAC,OAAO,GAAQ,EAAE,CAAC;oBAClB,OAAO,CAAC,GAAG,CAAC,GAAG;wBACb,MAAM,EAAE,IAAI,CAAC,EAAE;wBACf,eAAe,EAAE,KAAK;wBACtB,YAAY,EAAE,KAAK;wBACnB,UAAU,EAAE,CAAC;wBACb,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,CAAC;wBACZ,KAAK,EAAE,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC;qBACnC,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC,CAAC,EAAE,CACL,CAAC;QAEF,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QAE3B,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,CAAC;QACxE,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAClE,MAAM,gBAAgB,GAAG,CAAC,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAClE,MAAM,aAAa,GAAG,CAAC,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC;QAC5D,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACrF,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,
CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QACnF,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEvF,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QACpF,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC;QAEvF,sDAAsD;QACtD,OAAO,CAAC,GAAG,CACT,iCAAiC,OAAO,CAAC,MAAM,aAAa,eAAe,IAAI,OAAO,CAAC,MAAM,KAAK,gBAAgB,CAAC,OAAO,CACxH,CAAC,CACF,YAAY,YAAY,IAAI,OAAO,CAAC,MAAM,KAAK,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,YAAY,CAClF,aAAa,GAAG,gBAAgB,CACjC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,QAAQ,gBAAgB,WAAW,iBAAiB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CACxG,CAAC;QAEF,MAAM,SAAS,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oCAAoC,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC;QAC5F,MAAM,aAAa,GAAqC;YACtD,OAAO,EAAE,uBAAuB;YAChC,IAAI,EAAE,OAAO;YACb,cAAc,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACxC,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE;gBACR,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,6BAA6B,IAAI,kBAAkB;gBACtE,OAAO,EAAE,eAAe;gBACxB,WAAW,EAAE,gBAAgB;gBAC7B,KAAK,EAAE,SAAS;aACjB;YACD,KAAK,EAAE;gBACL,KAAK,EAAE,OAAO,CAAC,GAAG,CAAC,0BAA0B,IAAI,kBAAkB;gBACnE,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,YAAY;gBACrB,WAAW,EAAE,aAAa;gBAC1B,KAAK,EAAE,UAAU;gBACjB,YAAY;aACb;YACD,QAAQ;YACR,WAAW;YACX,KAAK,EACH,wHAAwH;SAC3H,CAAC;QAEF,IAAI,iBAAiB,EAAE,CAAC;YACtB,MAAM,QAAQ,GAAG,eAAe,EAAE,CAAC;YACnC,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,mCAAmC,CAAC,EAC3E,aAAa,CACd,CAAC;YAEF,MAAM,QAAQ,GAAG;gBACf,GAAG,aAAa;gBAChB,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,MAAM,EAAE,CAAC,CAAC,MAAM;oBAChB,eAAe,EAAE,CAAC,CAAC,eAAe;oBAClC,YAAY,EAAE,CAAC,CAAC,YAAY;oBAC5B,UAAU,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC;oBACpC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC;oBAC9B,SAAS,EAAE,CAA
C,CAAC,SAAS;oBACtB,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBACvC,CAAC,CAAC;aACJ,CAAC;YACF,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC7D,MAAM,aAAa,CACjB,IAAI,CAAC,IAAI,CACP,QAAQ,EACR,QAAQ,EACR,MAAM,EACN,SAAS,EACT,yBAAyB,OAAO,CAAC,MAAM,IAAI,OAAO,CAAC,KAAK,IAAI,KAAK,OAAO,CACzE,EACD,QAAQ,CACT,CAAC;QACJ,CAAC;QAED,mFAAmF;QACnF,MAAM,CAAC,aAAa,CAAC,CAAC,sBAAsB,CAAC,gBAAgB,CAAC,CAAC;IACjE,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -33,7 +33,7 @@ import { figmaFlowTools } from "../tools/figmaFlowTools.js";
|
|
|
33
33
|
import { createProgressiveDiscoveryTools } from "../tools/progressiveDiscoveryTools.js";
|
|
34
34
|
import { boilerplateTools } from "../tools/boilerplateTools.js";
|
|
35
35
|
import { cCompilerBenchmarkTools } from "../tools/cCompilerBenchmarkTools.js";
|
|
36
|
-
import { getQuickRef } from "../tools/toolRegistry.js";
|
|
36
|
+
import { getQuickRef, hybridSearch, TOOL_REGISTRY, SEARCH_MODES } from "../tools/toolRegistry.js";
|
|
37
37
|
// Assemble all tools like index.ts does
|
|
38
38
|
const domainTools = [
|
|
39
39
|
...verificationTools,
|
|
@@ -68,9 +68,9 @@ const allTools = [...allToolsWithoutDiscovery, ...discoveryTools];
|
|
|
68
68
|
// STATIC LAYER — structure validation
|
|
69
69
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
70
70
|
describe("Static: tool structure", () => {
|
|
71
|
-
it("should have
|
|
72
|
-
//
|
|
73
|
-
expect(allTools.length).toBe(
|
|
71
|
+
it("should have 131 tools total", () => {
|
|
72
|
+
// 126 domain tools + 2 meta tools (findTools, getMethodology) + 3 progressive discovery tools
|
|
73
|
+
expect(allTools.length).toBe(131);
|
|
74
74
|
});
|
|
75
75
|
it("every tool has name, description, inputSchema, handler", () => {
|
|
76
76
|
for (const tool of allTools) {
|
|
@@ -1737,4 +1737,153 @@ describe("Integration: full benchmark lifecycle", () => {
|
|
|
1737
1737
|
expect(completed._quickRef).toBeDefined();
|
|
1738
1738
|
});
|
|
1739
1739
|
});
|
|
1740
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
1741
|
+
// Multi-modal search engine quality tests
|
|
1742
|
+
// ═══════════════════════════════════════════════════════════════════════
|
|
1743
|
+
const toolDescs = allTools.map((t) => ({ name: t.name, description: t.description }));
|
|
1744
|
+
describe("Search engine: registry coverage", () => {
|
|
1745
|
+
it("should have a registry entry for every tool (129/129)", () => {
|
|
1746
|
+
const missing = allTools.filter((t) => !TOOL_REGISTRY.has(t.name));
|
|
1747
|
+
expect(missing.map((t) => t.name)).toEqual([]);
|
|
1748
|
+
expect(TOOL_REGISTRY.size).toBe(allTools.length);
|
|
1749
|
+
});
|
|
1750
|
+
it("should expose all 6 search modes", () => {
|
|
1751
|
+
expect(SEARCH_MODES).toEqual(["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact"]);
|
|
1752
|
+
});
|
|
1753
|
+
it("should have quickRef for every registered tool", () => {
|
|
1754
|
+
for (const tool of allTools) {
|
|
1755
|
+
const qr = getQuickRef(tool.name);
|
|
1756
|
+
expect(qr, `Missing quickRef for ${tool.name}`).not.toBeNull();
|
|
1757
|
+
expect(qr.nextAction.length).toBeGreaterThan(10);
|
|
1758
|
+
expect(qr.nextTools.length).toBeGreaterThan(0);
|
|
1759
|
+
}
|
|
1760
|
+
});
|
|
1761
|
+
});
|
|
1762
|
+
describe("Search engine: hybrid mode (default)", () => {
|
|
1763
|
+
it("should find benchmark tools when searching 'benchmark'", () => {
|
|
1764
|
+
const results = hybridSearch("benchmark", toolDescs, { limit: 10 });
|
|
1765
|
+
const names = results.map((r) => r.name);
|
|
1766
|
+
expect(names).toContain("start_autonomy_benchmark");
|
|
1767
|
+
expect(names).toContain("complete_autonomy_benchmark");
|
|
1768
|
+
expect(names).toContain("benchmark_models");
|
|
1769
|
+
expect(names).toContain("log_benchmark_milestone");
|
|
1770
|
+
});
|
|
1771
|
+
it("should find scaffold tools when searching 'scaffold'", () => {
|
|
1772
|
+
const results = hybridSearch("scaffold", toolDescs, { limit: 10 });
|
|
1773
|
+
const names = results.map((r) => r.name);
|
|
1774
|
+
expect(names).toContain("scaffold_directory");
|
|
1775
|
+
expect(names).toContain("scaffold_nodebench_project");
|
|
1776
|
+
});
|
|
1777
|
+
it("should rank exact name matches highest", () => {
|
|
1778
|
+
const results = hybridSearch("web_search", toolDescs, { limit: 5 });
|
|
1779
|
+
expect(results[0].name).toBe("web_search");
|
|
1780
|
+
});
|
|
1781
|
+
it("should filter by category", () => {
|
|
1782
|
+
const results = hybridSearch("test", toolDescs, { category: "eval", limit: 10 });
|
|
1783
|
+
for (const r of results) {
|
|
1784
|
+
expect(r.category).toBe("eval");
|
|
1785
|
+
}
|
|
1786
|
+
});
|
|
1787
|
+
it("should filter by phase", () => {
|
|
1788
|
+
const results = hybridSearch("verify", toolDescs, { phase: "verify", limit: 10 });
|
|
1789
|
+
for (const r of results) {
|
|
1790
|
+
expect(r.phase).toBe("verify");
|
|
1791
|
+
}
|
|
1792
|
+
});
|
|
1793
|
+
it("should include matchReasons when explain=true", () => {
|
|
1794
|
+
const results = hybridSearch("verify", toolDescs, { limit: 3, explain: true });
|
|
1795
|
+
expect(results.length).toBeGreaterThan(0);
|
|
1796
|
+
expect(results[0].matchReasons.length).toBeGreaterThan(0);
|
|
1797
|
+
expect(results[0].matchReasons[0]).toMatch(/keyword|prefix|fuzzy|semantic|ngram|bigram|regex|domain/);
|
|
1798
|
+
});
|
|
1799
|
+
it("should return empty matchReasons when explain=false", () => {
|
|
1800
|
+
const results = hybridSearch("verify", toolDescs, { limit: 3, explain: false });
|
|
1801
|
+
expect(results[0].matchReasons).toEqual([]);
|
|
1802
|
+
});
|
|
1803
|
+
});
|
|
1804
|
+
describe("Search engine: fuzzy mode (typo tolerance)", () => {
|
|
1805
|
+
it("should find 'verify' tools when searching 'verifiy' (typo)", () => {
|
|
1806
|
+
const results = hybridSearch("verifiy", toolDescs, { mode: "fuzzy", limit: 10 });
|
|
1807
|
+
const names = results.map((r) => r.name);
|
|
1808
|
+
expect(names.some((n) => n.includes("verif"))).toBe(true);
|
|
1809
|
+
});
|
|
1810
|
+
it("should find 'benchmark' tools when searching 'benchmrk' (typo)", () => {
|
|
1811
|
+
const results = hybridSearch("benchmrk", toolDescs, { mode: "fuzzy", limit: 10 });
|
|
1812
|
+
const names = results.map((r) => r.name);
|
|
1813
|
+
expect(names.some((n) => n.includes("benchmark"))).toBe(true);
|
|
1814
|
+
});
|
|
1815
|
+
it("should find 'scaffold' when searching 'scafold' (typo)", () => {
|
|
1816
|
+
const results = hybridSearch("scafold", toolDescs, { mode: "fuzzy", limit: 10 });
|
|
1817
|
+
const names = results.map((r) => r.name);
|
|
1818
|
+
expect(names.some((n) => n.includes("scaffold"))).toBe(true);
|
|
1819
|
+
});
|
|
1820
|
+
});
|
|
1821
|
+
describe("Search engine: regex mode", () => {
|
|
1822
|
+
it("should match tools by regex pattern on name", () => {
|
|
1823
|
+
const results = hybridSearch("^capture_.*screenshot$", toolDescs, { mode: "regex", limit: 10 });
|
|
1824
|
+
expect(results.length).toBeGreaterThan(0);
|
|
1825
|
+
expect(results[0].name).toBe("capture_ui_screenshot");
|
|
1826
|
+
});
|
|
1827
|
+
it("should match tools by regex on tags", () => {
|
|
1828
|
+
const results = hybridSearch("c-compiler", toolDescs, { mode: "regex", limit: 10 });
|
|
1829
|
+
const names = results.map((r) => r.name);
|
|
1830
|
+
expect(names).toContain("start_autonomy_benchmark");
|
|
1831
|
+
});
|
|
1832
|
+
it("should handle invalid regex gracefully", () => {
|
|
1833
|
+
const results = hybridSearch("[invalid(", toolDescs, { mode: "regex", limit: 10 });
|
|
1834
|
+
expect(results).toEqual([]);
|
|
1835
|
+
});
|
|
1836
|
+
});
|
|
1837
|
+
describe("Search engine: prefix mode", () => {
|
|
1838
|
+
it("should find all 'run_' prefixed tools", () => {
|
|
1839
|
+
const results = hybridSearch("run_", toolDescs, { mode: "prefix", limit: 20 });
|
|
1840
|
+
for (const r of results) {
|
|
1841
|
+
expect(r.name.startsWith("run_")).toBe(true);
|
|
1842
|
+
}
|
|
1843
|
+
expect(results.length).toBeGreaterThanOrEqual(5);
|
|
1844
|
+
});
|
|
1845
|
+
it("should find 'cap' → capture_* tools", () => {
|
|
1846
|
+
const results = hybridSearch("cap", toolDescs, { mode: "prefix", limit: 10 });
|
|
1847
|
+
const names = results.map((r) => r.name);
|
|
1848
|
+
expect(names).toContain("capture_ui_screenshot");
|
|
1849
|
+
expect(names).toContain("capture_responsive_suite");
|
|
1850
|
+
});
|
|
1851
|
+
});
|
|
1852
|
+
describe("Search engine: semantic mode (synonym expansion)", () => {
|
|
1853
|
+
it("should expand 'check' to find 'verify' tools", () => {
|
|
1854
|
+
const results = hybridSearch("check", toolDescs, { mode: "semantic", limit: 10 });
|
|
1855
|
+
const names = results.map((r) => r.name);
|
|
1856
|
+
expect(names.some((n) => n.includes("verif") || n.includes("gate") || n.includes("quality"))).toBe(true);
|
|
1857
|
+
});
|
|
1858
|
+
it("should expand 'fix' to find 'resolve' tools", () => {
|
|
1859
|
+
const results = hybridSearch("fix", toolDescs, { mode: "semantic", limit: 10 });
|
|
1860
|
+
const names = results.map((r) => r.name);
|
|
1861
|
+
expect(names).toContain("resolve_gap");
|
|
1862
|
+
});
|
|
1863
|
+
it("should expand 'deploy' to find 'ship' phase tools", () => {
|
|
1864
|
+
const results = hybridSearch("deploy", toolDescs, { mode: "semantic", limit: 15 });
|
|
1865
|
+
const names = results.map((r) => r.name);
|
|
1866
|
+
expect(names.some((n) => n.includes("mandatory_flywheel") || n.includes("quality_gate"))).toBe(true);
|
|
1867
|
+
});
|
|
1868
|
+
});
|
|
1869
|
+
describe("Search engine: exact mode", () => {
|
|
1870
|
+
it("should return only exact name match", () => {
|
|
1871
|
+
const results = hybridSearch("web_search", toolDescs, { mode: "exact", limit: 5 });
|
|
1872
|
+
expect(results.length).toBeGreaterThan(0);
|
|
1873
|
+
expect(results[0].name).toBe("web_search");
|
|
1874
|
+
expect(results[0].score).toBeGreaterThanOrEqual(100);
|
|
1875
|
+
});
|
|
1876
|
+
});
|
|
1877
|
+
describe("Search engine: bigram phrase matching", () => {
|
|
1878
|
+
it("should match 'quality gate' as a phrase", () => {
|
|
1879
|
+
const results = hybridSearch("quality gate", toolDescs, { limit: 5 });
|
|
1880
|
+
const names = results.map((r) => r.name);
|
|
1881
|
+
expect(names).toContain("run_quality_gate");
|
|
1882
|
+
});
|
|
1883
|
+
it("should match 'parallel agents' as a phrase", () => {
|
|
1884
|
+
const results = hybridSearch("parallel agents", toolDescs, { limit: 5 });
|
|
1885
|
+
const names = results.map((r) => r.name);
|
|
1886
|
+
expect(names.some((n) => n.includes("parallel") || n.includes("agent"))).toBe(true);
|
|
1887
|
+
});
|
|
1888
|
+
});
|
|
1740
1889
|
//# sourceMappingURL=tools.test.js.map
|