nodebench-mcp 2.14.2 → 2.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/NODEBENCH_AGENTS.md +3 -3
  2. package/README.md +9 -9
  3. package/dist/__tests__/critterCalibrationEval.d.ts +8 -0
  4. package/dist/__tests__/critterCalibrationEval.js +370 -0
  5. package/dist/__tests__/critterCalibrationEval.js.map +1 -0
  6. package/dist/__tests__/embeddingProvider.test.d.ts +1 -0
  7. package/dist/__tests__/embeddingProvider.test.js +86 -0
  8. package/dist/__tests__/embeddingProvider.test.js.map +1 -0
  9. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +1 -1
  10. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
  11. package/dist/__tests__/gaiaCapabilityEval.test.js +541 -27
  12. package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
  13. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +1 -1
  14. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
  15. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +473 -4
  16. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  17. package/dist/__tests__/tools.test.js +1010 -8
  18. package/dist/__tests__/tools.test.js.map +1 -1
  19. package/dist/db.js +64 -0
  20. package/dist/db.js.map +1 -1
  21. package/dist/index.js +70 -9
  22. package/dist/index.js.map +1 -1
  23. package/dist/tools/critterTools.d.ts +21 -0
  24. package/dist/tools/critterTools.js +230 -0
  25. package/dist/tools/critterTools.js.map +1 -0
  26. package/dist/tools/embeddingProvider.d.ts +67 -0
  27. package/dist/tools/embeddingProvider.js +299 -0
  28. package/dist/tools/embeddingProvider.js.map +1 -0
  29. package/dist/tools/progressiveDiscoveryTools.js +24 -7
  30. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  31. package/dist/tools/reconTools.js +83 -33
  32. package/dist/tools/reconTools.js.map +1 -1
  33. package/dist/tools/toolRegistry.d.ts +30 -2
  34. package/dist/tools/toolRegistry.js +253 -25
  35. package/dist/tools/toolRegistry.js.map +1 -1
  36. package/package.json +7 -3
@@ -3,7 +3,7 @@
3
3
  * Covers: static, unit, integration layers.
4
4
  * Live E2E layer is tested via bash pipe in the flywheel step.
5
5
  */
6
- import { describe, it, expect } from "vitest";
6
+ import { describe, it, expect, afterEach } from "vitest";
7
7
  import os from "node:os";
8
8
  import path from "node:path";
9
9
  import { mkdtemp, writeFile } from "node:fs/promises";
@@ -39,7 +39,8 @@ import { patternTools } from "../tools/patternTools.js";
39
39
  import { gitWorkflowTools } from "../tools/gitWorkflowTools.js";
40
40
  import { seoTools } from "../tools/seoTools.js";
41
41
  import { voiceBridgeTools } from "../tools/voiceBridgeTools.js";
42
- import { getQuickRef, hybridSearch, TOOL_REGISTRY, SEARCH_MODES, ALL_REGISTRY_ENTRIES, WORKFLOW_CHAINS } from "../tools/toolRegistry.js";
42
+ import { critterTools } from "../tools/critterTools.js";
43
+ import { getQuickRef, hybridSearch, TOOL_REGISTRY, SEARCH_MODES, ALL_REGISTRY_ENTRIES, WORKFLOW_CHAINS, tokenize, buildDenseIndex, getToolComplexity } from "../tools/toolRegistry.js";
43
44
  // Assemble all tools like index.ts does
44
45
  const domainTools = [
45
46
  ...verificationTools,
@@ -72,6 +73,7 @@ const domainTools = [
72
73
  ...gitWorkflowTools,
73
74
  ...seoTools,
74
75
  ...voiceBridgeTools,
76
+ ...critterTools,
75
77
  ];
76
78
  const metaTools = createMetaTools(domainTools);
77
79
  const allToolsWithoutDiscovery = [...domainTools, ...metaTools];
@@ -81,9 +83,9 @@ const allTools = [...allToolsWithoutDiscovery, ...discoveryTools];
81
83
  // STATIC LAYER — structure validation
82
84
  // ═══════════════════════════════════════════════════════════════════════════
83
85
  describe("Static: tool structure", () => {
84
- it("should have 162 tools total", () => {
85
- // 157 domain tools + 2 meta tools (findTools, getMethodology) + 3 progressive discovery tools
86
- expect(allTools.length).toBe(162);
86
+ it("should have 163 tools total", () => {
87
+ // 158 domain tools + 2 meta tools (findTools, getMethodology) + 3 progressive discovery tools
88
+ expect(allTools.length).toBe(163);
87
89
  });
88
90
  it("every tool has name, description, inputSchema, handler", () => {
89
91
  for (const tool of allTools) {
@@ -99,6 +101,18 @@ describe("Static: tool structure", () => {
99
101
  const names = allTools.map((t) => t.name);
100
102
  expect(new Set(names).size).toBe(names.length);
101
103
  });
104
+ it("every registered tool has MCP annotations (category, phase, complexity)", () => {
105
+ // MCP 2025-11-25 spec: annotations field provides behavior hints for models.
106
+ // We surface category, phase, complexity from the registry as annotations.
107
+ for (const tool of allTools) {
108
+ const entry = TOOL_REGISTRY.get(tool.name);
109
+ expect(entry, `Missing registry entry for ${tool.name}`).toBeDefined();
110
+ expect(entry.category).toBeTruthy();
111
+ expect(entry.phase).toBeTruthy();
112
+ const complexity = getToolComplexity(tool.name);
113
+ expect(["low", "medium", "high"]).toContain(complexity);
114
+ }
115
+ });
102
116
  it("abandon_cycle tool exists in verificationTools", () => {
103
117
  const tool = verificationTools.find((t) => t.name === "abandon_cycle");
104
118
  expect(tool).toBeDefined();
@@ -1799,13 +1813,31 @@ describe("Integration: full benchmark lifecycle", () => {
1799
1813
  // ═══════════════════════════════════════════════════════════════════════
1800
1814
  const toolDescs = allTools.map((t) => ({ name: t.name, description: t.description }));
1801
1815
  describe("Search engine: registry coverage", () => {
1802
- it("should have a registry entry for every tool (162/162)", () => {
1816
+ it("should have a registry entry for every tool (163/163)", () => {
1803
1817
  const missing = allTools.filter((t) => !TOOL_REGISTRY.has(t.name));
1804
1818
  expect(missing.map((t) => t.name)).toEqual([]);
1805
1819
  expect(TOOL_REGISTRY.size).toBe(allTools.length);
1806
1820
  });
1807
- it("should expose all 7 search modes", () => {
1808
- expect(SEARCH_MODES).toEqual(["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact", "dense"]);
1821
+ it("should expose all 8 search modes", () => {
1822
+ expect(SEARCH_MODES).toEqual(["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact", "dense", "embedding"]);
1823
+ });
1824
+ it("discover_tools category enum covers every registry category", () => {
1825
+ const discoverTool = allTools.find((t) => t.name === "discover_tools");
1826
+ expect(discoverTool).toBeDefined();
1827
+ const categoryEnum = discoverTool.inputSchema.properties.category.enum;
1828
+ const registryCategories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
1829
+ for (const cat of registryCategories) {
1830
+ expect(categoryEnum, `category "${cat}" missing from discover_tools enum`).toContain(cat);
1831
+ }
1832
+ });
1833
+ it("get_workflow_chain enum covers every WORKFLOW_CHAINS key", () => {
1834
+ const chainTool = allTools.find((t) => t.name === "get_workflow_chain");
1835
+ expect(chainTool).toBeDefined();
1836
+ const chainEnum = chainTool.inputSchema.properties.chain.enum;
1837
+ for (const key of Object.keys(WORKFLOW_CHAINS)) {
1838
+ expect(chainEnum, `chain "${key}" missing from get_workflow_chain enum`).toContain(key);
1839
+ }
1840
+ expect(chainEnum).toContain("list");
1809
1841
  });
1810
1842
  it("should have quickRef for every registered tool", () => {
1811
1843
  for (const tool of allTools) {
@@ -1943,6 +1975,281 @@ describe("Search engine: bigram phrase matching", () => {
1943
1975
  expect(names.some((n) => n.includes("parallel") || n.includes("agent"))).toBe(true);
1944
1976
  });
1945
1977
  });
1978
+ // ── Dense search NDCG regression guard ──────────────────────────────────
1979
+ // Tested BM25 vs TF-IDF cosine (v2.14.2): TF-IDF won 0.692 vs 0.691.
1980
+ // BM25's length normalization adds no value for short tool descriptions.
1981
+ // Keeping TF-IDF cosine. This test guards against ranking regressions.
1982
+ describe("Search engine: dense search NDCG@5 regression guard", () => {
1983
+ function ndcg(rankedNames, idealNames, k) {
1984
+ const relevance = new Map();
1985
+ idealNames.forEach((name, i) => relevance.set(name, idealNames.length - i));
1986
+ let dcg = 0;
1987
+ for (let i = 0; i < Math.min(k, rankedNames.length); i++) {
1988
+ const rel = relevance.get(rankedNames[i]) ?? 0;
1989
+ dcg += rel / Math.log2(i + 2);
1990
+ }
1991
+ let idcg = 0;
1992
+ const idealRels = idealNames.map((_, i) => idealNames.length - i).sort((a, b) => b - a);
1993
+ for (let i = 0; i < Math.min(k, idealRels.length); i++) {
1994
+ idcg += idealRels[i] / Math.log2(i + 2);
1995
+ }
1996
+ return idcg === 0 ? 0 : dcg / idcg;
1997
+ }
1998
+ const EVAL_QUERIES = [
1999
+ { query: "verify my implementation", ideal: ["start_verification_cycle", "get_verification_status", "log_test_result", "run_quality_gate", "triple_verify"] },
2000
+ { query: "security audit", ideal: ["scan_dependencies", "run_code_analysis", "scan_terminal_security", "assess_risk", "check_git_compliance"] },
2001
+ { query: "write an academic paper", ideal: ["polish_academic_text", "check_paper_logic", "generate_academic_caption", "review_paper_as_reviewer", "compress_or_expand_text"] },
2002
+ { query: "deploy my changes", ideal: ["run_mandatory_flywheel", "run_quality_gate", "assess_risk", "run_closed_loop", "log_test_result"] },
2003
+ { query: "parallel agent coordination", ideal: ["claim_agent_task", "get_parallel_status", "assign_agent_role", "bootstrap_parallel_agents", "release_agent_task"] },
2004
+ { query: "seo website performance", ideal: ["seo_audit_url", "check_page_performance", "analyze_seo_content", "check_wordpress_site", "scan_wordpress_updates"] },
2005
+ { query: "voice pipeline latency", ideal: ["benchmark_voice_latency", "design_voice_pipeline", "analyze_voice_config", "generate_voice_scaffold"] },
2006
+ { query: "session notes context", ideal: ["save_session_note", "load_session_notes", "refresh_task_context"] },
2007
+ { query: "git compliance merge", ideal: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
2008
+ { query: "benchmark autonomous capability", ideal: ["start_autonomy_benchmark", "complete_autonomy_benchmark", "log_benchmark_milestone", "benchmark_models"] },
2009
+ { query: "find tools for testing", ideal: ["discover_tools", "findTools", "log_test_result", "run_tests_cli", "start_eval_run"] },
2010
+ { query: "knowledge learning record", ideal: ["record_learning", "search_all_knowledge", "save_session_note"] },
2011
+ ];
2012
+ it("TF-IDF cosine dense search should maintain NDCG@5 >= 0.60 across eval queries", () => {
2013
+ const { vectors, idf } = buildDenseIndex();
2014
+ const K = 5;
2015
+ let totalNDCG = 0;
2016
+ for (const { query, ideal } of EVAL_QUERIES) {
2017
+ const queryTokens = tokenize(query.toLowerCase());
2018
+ const queryTf = new Map();
2019
+ for (const t of queryTokens)
2020
+ queryTf.set(t, (queryTf.get(t) ?? 0) + 1);
2021
+ const maxFreq = Math.max(...queryTf.values(), 1);
2022
+ for (const [k, v] of queryTf)
2023
+ queryTf.set(k, v / maxFreq);
2024
+ const queryVec = new Map();
2025
+ for (const [term, tfVal] of queryTf) {
2026
+ queryVec.set(term, tfVal * (idf.get(term) ?? 1));
2027
+ }
2028
+ const scores = [];
2029
+ for (const [name, docVec] of vectors) {
2030
+ let dot = 0, normA = 0, normB = 0;
2031
+ for (const [k, v] of queryVec) {
2032
+ normA += v * v;
2033
+ const bv = docVec.get(k);
2034
+ if (bv !== undefined)
2035
+ dot += v * bv;
2036
+ }
2037
+ for (const v of docVec.values())
2038
+ normB += v * v;
2039
+ const sim = (normA === 0 || normB === 0) ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
2040
+ if (sim > 0)
2041
+ scores.push({ name, sim });
2042
+ }
2043
+ scores.sort((a, b) => b.sim - a.sim);
2044
+ const rankedNames = scores.slice(0, K).map(r => r.name);
2045
+ totalNDCG += ndcg(rankedNames, ideal, K);
2046
+ }
2047
+ const avgNDCG = totalNDCG / EVAL_QUERIES.length;
2048
+ expect(avgNDCG).toBeGreaterThanOrEqual(0.60);
2049
+ });
2050
+ });
2051
+ // ── FTS5+BM25 A/B test: search_all_knowledge (recon_findings + gaps) ────
2052
+ // Verifies that FTS5 BM25 ranking produces relevance-ordered results
2053
+ // for variable-length recon findings and gaps, compared to LIKE (recency-only).
2054
+ describe("FTS5 BM25: search_all_knowledge relevance ranking", () => {
2055
+ const searchTool = reconTools.find((t) => t.name === "search_all_knowledge");
2056
+ const logFinding = reconTools.find((t) => t.name === "log_recon_finding");
2057
+ const runRecon = reconTools.find((t) => t.name === "run_recon");
2058
+ const startCycle = verificationTools.find((t) => t.name === "start_verification_cycle");
2059
+ const logGap = verificationTools.find((t) => t.name === "log_gap");
2060
+ it("should rank recon findings by BM25 relevance (term-specific > generic mentions)", async () => {
2061
+ // Setup: create a recon session with varied findings
2062
+ const session = (await runRecon.handler({ target: "BM25 FTS5 test session" }));
2063
+ const sid = session.sessionId;
2064
+ // Insert findings — the "MCP SDK breaking change" finding is highly relevant
2065
+ await logFinding.handler({
2066
+ sessionId: sid,
2067
+ category: "breaking_change",
2068
+ summary: "MCP SDK v2.0 introduces breaking changes to the transport layer requiring migration",
2069
+ relevance: "All MCP servers must update their transport initialization code",
2070
+ actionItems: "Update transport from stdio to new StreamableHTTP pattern",
2071
+ });
2072
+ await logFinding.handler({
2073
+ sessionId: sid,
2074
+ category: "best_practice",
2075
+ summary: "React 19 compiler optimizations reduce bundle size by 15%",
2076
+ relevance: "Frontend build pipeline could benefit from upgrade",
2077
+ actionItems: "Evaluate React 19 migration path",
2078
+ });
2079
+ await logFinding.handler({
2080
+ sessionId: sid,
2081
+ category: "new_feature",
2082
+ summary: "New MCP SDK sampling API enables server-initiated LLM requests",
2083
+ relevance: "MCP servers can now call LLMs directly through the protocol",
2084
+ actionItems: "Integrate sampling API into MCP tool handlers",
2085
+ });
2086
+ // Query for "MCP SDK breaking" — should rank MCP findings above React
2087
+ const result = (await searchTool.handler({ query: "MCP SDK breaking" }));
2088
+ const findings = result.reconFindings;
2089
+ // At minimum, MCP-related findings should appear (FTS5 MATCH or LIKE fallback)
2090
+ expect(findings.length).toBeGreaterThan(0);
2091
+ // If FTS5 BM25 is working, the breaking_change finding should rank first
2092
+ // (it has the most term overlap with "MCP SDK breaking")
2093
+ if (findings.length >= 2) {
2094
+ const firstSummary = findings[0].summary.toLowerCase();
2095
+ expect(firstSummary).toContain("breaking");
2096
+ }
2097
+ });
2098
+ it("should rank gaps by BM25 relevance (specific match > loose mention)", async () => {
2099
+ // Setup: create a verification cycle with varied gaps
2100
+ const cycle = (await startCycle.handler({
2101
+ title: "BM25 gaps FTS5 test cycle",
2102
+ }));
2103
+ const cid = cycle.cycleId;
2104
+ await logGap.handler({
2105
+ cycleId: cid,
2106
+ severity: "HIGH",
2107
+ title: "SQLite WAL mode lock contention under parallel writes",
2108
+ description: "When multiple agents write to SQLite simultaneously, WAL mode lock contention causes timeout errors after 5 seconds",
2109
+ fixStrategy: "Implement write queue with retry backoff for SQLite parallel access",
2110
+ });
2111
+ await logGap.handler({
2112
+ cycleId: cid,
2113
+ severity: "MEDIUM",
2114
+ title: "API rate limiting not implemented",
2115
+ description: "External API calls have no rate limiting or retry logic",
2116
+ fixStrategy: "Add exponential backoff with jitter for API calls",
2117
+ });
2118
+ await logGap.handler({
2119
+ cycleId: cid,
2120
+ severity: "LOW",
2121
+ title: "Test coverage below 80% for SQLite module",
2122
+ description: "SQLite database module has only 60% test coverage, missing edge cases for concurrent access",
2123
+ fixStrategy: "Add integration tests for SQLite concurrent write scenarios",
2124
+ });
2125
+ // Query for "SQLite parallel" — should rank SQLite-specific gaps above API gap
2126
+ const result = (await searchTool.handler({ query: "SQLite parallel" }));
2127
+ const gaps = result.gaps;
2128
+ expect(gaps.length).toBeGreaterThan(0);
2129
+ // If FTS5 BM25 is working, the WAL lock contention gap (HIGH severity, most term overlap) ranks first
2130
+ if (gaps.length >= 2) {
2131
+ const firstTitle = gaps[0].title.toLowerCase();
2132
+ expect(firstTitle).toContain("sqlite");
2133
+ }
2134
+ });
2135
+ });
2136
+ // ── Gateway BM25 meta-tool A/B test ────────────────────────────────────
2137
+ // Tests BM25 scoring in the gateway metaTools findTools — verifies that
2138
+ // IDF-weighted scoring ranks specific tools higher than generic matches.
2139
+ describe("Gateway BM25: findTools IDF-weighted ranking", () => {
2140
+ // Simulate the gateway's BM25 scorer with inline implementation
2141
+ function tokenize(text) {
2142
+ return text.toLowerCase().match(/[a-z_]+/g) ?? [];
2143
+ }
2144
+ // Word-count baseline (old approach)
2145
+ function wordCountSearch(query, tools) {
2146
+ const words = query.toLowerCase().split(/\s+/).filter(Boolean);
2147
+ return tools
2148
+ .map((t) => {
2149
+ const text = `${t.name} ${t.description}`.toLowerCase();
2150
+ const hits = words.filter((w) => text.includes(w)).length;
2151
+ return { name: t.name, hits };
2152
+ })
2153
+ .filter((t) => t.hits > 0)
2154
+ .sort((a, b) => b.hits - a.hits)
2155
+ .map((t) => t.name);
2156
+ }
2157
+ // BM25 search (new approach)
2158
+ function bm25Search(query, tools) {
2159
+ const corpus = new Map();
2160
+ for (const t of tools) {
2161
+ corpus.set(t.name, tokenize(`${t.name} ${t.description}`));
2162
+ }
2163
+ let totalLen = 0;
2164
+ for (const tokens of corpus.values())
2165
+ totalLen += tokens.length;
2166
+ const avgDl = corpus.size > 0 ? totalLen / corpus.size : 1;
2167
+ const docFreq = new Map();
2168
+ for (const tokens of corpus.values()) {
2169
+ const unique = new Set(tokens);
2170
+ for (const t of unique)
2171
+ docFreq.set(t, (docFreq.get(t) ?? 0) + 1);
2172
+ }
2173
+ const N = corpus.size;
2174
+ const idf = new Map();
2175
+ for (const [term, df] of docFreq) {
2176
+ idf.set(term, Math.log((N - df + 0.5) / (df + 0.5) + 1));
2177
+ }
2178
+ const queryTokens = tokenize(query);
2179
+ const k1 = 1.2, b = 0.75;
2180
+ return tools
2181
+ .map((t) => {
2182
+ const docTokens = corpus.get(t.name) ?? [];
2183
+ const dl = docTokens.length;
2184
+ const tf = new Map();
2185
+ for (const tok of docTokens)
2186
+ tf.set(tok, (tf.get(tok) ?? 0) + 1);
2187
+ let score = 0;
2188
+ for (const qt of queryTokens) {
2189
+ const termTf = tf.get(qt) ?? 0;
2190
+ if (termTf === 0)
2191
+ continue;
2192
+ const termIdf = idf.get(qt) ?? 0;
2193
+ score += termIdf * (termTf * (k1 + 1)) / (termTf + k1 * (1 - b + b * (dl / avgDl)));
2194
+ }
2195
+ return { name: t.name, score };
2196
+ })
2197
+ .filter((t) => t.score > 0)
2198
+ .sort((a, b) => b.score - a.score)
2199
+ .map((t) => t.name);
2200
+ }
2201
+ // Use the real tool list from allTools
2202
+ const toolEntries = allTools.map((t) => ({ name: t.name, description: t.description }));
2203
+ // Queries where IDF matters — rare terms should beat common ones
2204
+ const IDF_QUERIES = [
2205
+ {
2206
+ query: "flicker detection android",
2207
+ mustRankHigher: "start_flicker_analysis",
2208
+ mustRankLower: "web_search",
2209
+ reason: "'flicker' is rare (high IDF), 'search' is common (low IDF)",
2210
+ },
2211
+ {
2212
+ query: "autonomous benchmark c compiler",
2213
+ mustRankHigher: "start_autonomy_benchmark",
2214
+ mustRankLower: "run_quality_gate",
2215
+ reason: "'autonomy' and 'benchmark' are specific (high IDF)",
2216
+ },
2217
+ {
2218
+ query: "toon encode token",
2219
+ mustRankHigher: "toon_encode",
2220
+ mustRankLower: "record_learning",
2221
+ reason: "'toon' is extremely rare (high IDF), should dominate scoring",
2222
+ },
2223
+ ];
2224
+ it("BM25 should outperform word-count on IDF-sensitive queries", () => {
2225
+ let bm25Wins = 0;
2226
+ let wordCountWins = 0;
2227
+ for (const { query, mustRankHigher, mustRankLower } of IDF_QUERIES) {
2228
+ const bm25Results = bm25Search(query, toolEntries);
2229
+ const wordResults = wordCountSearch(query, toolEntries);
2230
+ const bm25IdxHigh = bm25Results.indexOf(mustRankHigher);
2231
+ const bm25IdxLow = bm25Results.indexOf(mustRankLower);
2232
+ const wordIdxHigh = wordResults.indexOf(mustRankHigher);
2233
+ const wordIdxLow = wordResults.indexOf(mustRankLower);
2234
+ // BM25 correctly ranks the specific tool higher
2235
+ if (bm25IdxHigh !== -1 && (bm25IdxLow === -1 || bm25IdxHigh < bm25IdxLow))
2236
+ bm25Wins++;
2237
+ if (wordIdxHigh !== -1 && (wordIdxLow === -1 || wordIdxHigh < wordIdxLow))
2238
+ wordCountWins++;
2239
+ }
2240
+ // BM25 should win at least as many IDF-sensitive queries as word-count
2241
+ expect(bm25Wins).toBeGreaterThanOrEqual(wordCountWins);
2242
+ // BM25 should get at least 2 of 3 IDF-sensitive queries correct
2243
+ expect(bm25Wins).toBeGreaterThanOrEqual(2);
2244
+ });
2245
+ it("BM25 should return results for all eval queries (no regressions)", () => {
2246
+ const queries = ["verify implementation", "search the web", "create document", "find stock prices", "security audit"];
2247
+ for (const q of queries) {
2248
+ const results = bm25Search(q, toolEntries);
2249
+ expect(results.length).toBeGreaterThan(0);
2250
+ }
2251
+ });
2252
+ });
1946
2253
  // ── Contract Compliance Tool Tests ──────────────────────────────────────
1947
2254
  describe("check_contract_compliance", () => {
1948
2255
  it("should return N/A score when no tool call data exists", async () => {
@@ -2180,4 +2487,699 @@ describe("Workflow chains: ablation_eval and task_bank_setup", () => {
2180
2487
  expect(WORKFLOW_CHAINS.task_bank_setup.steps.length).toBe(9);
2181
2488
  });
2182
2489
  });
2490
+ // ── Embedding search A/B: natural language queries where synonym map misses ──
2491
+ // ═══════════════════════════════════════════════════════════════════════════
2492
+ // CRITTER TOOL — intentionality check
2493
+ // ═══════════════════════════════════════════════════════════════════════════
2494
+ describe("Unit: critter_check", () => {
2495
+ const tool = critterTools.find((t) => t.name === "critter_check");
2496
+ it("scores a well-intentioned task as proceed", async () => {
2497
+ const result = await tool.handler({
2498
+ task: "Add embedding-based semantic search to discover_tools",
2499
+ why: "Natural language queries like 'keep track of what I learned' miss record_learning because lexical search can't bridge vocabulary gaps",
2500
+ who: "AI agents using the MCP server who think in natural language, not tool names",
2501
+ success_looks_like: "A/B eval shows 60% lexical → 85%+ hybrid hit rate with zero drops",
2502
+ });
2503
+ expect(result.score).toBeGreaterThanOrEqual(70);
2504
+ expect(result.verdict).toBe("proceed");
2505
+ });
2506
+ it("catches circular reasoning and vague audience", async () => {
2507
+ const result = await tool.handler({
2508
+ task: "Add user authentication and login system to the application",
2509
+ why: "Because we need user authentication and login system in the application",
2510
+ who: "users",
2511
+ });
2512
+ // Circular (-30) + vague audience (-20) = 50, well under 70
2513
+ expect(result.score).toBeLessThan(70);
2514
+ expect(result.feedback.some((f) => f.toLowerCase().includes("circular") || f.toLowerCase().includes("vague"))).toBe(true);
2515
+ });
2516
+ it("catches deference over understanding", async () => {
2517
+ const result = await tool.handler({
2518
+ task: "Refactor the database layer",
2519
+ why: "I was told to refactor it in the ticket",
2520
+ who: "Backend developers maintaining the codebase",
2521
+ });
2522
+ expect(result.feedback.some((f) => f.toLowerCase().includes("deference") || f.toLowerCase().includes("authority"))).toBe(true);
2523
+ });
2524
+ it("rewards specificity bonuses", async () => {
2525
+ const result = await tool.handler({
2526
+ task: "Migrate from REST to GraphQL",
2527
+ why: "Our mobile app makes 12 API calls per screen load because REST endpoints return fixed shapes — GraphQL lets us fetch exactly what each screen needs in one round trip",
2528
+ who: "Mobile team (3 iOS + 2 Android devs) who spend 40% of sprint time on API pagination workarounds",
2529
+ success_looks_like: "Screen load API calls drop from 12 to 1-2, mobile team velocity increases by at least 20%",
2530
+ simplest_version: "Start with the 3 highest-traffic screens, keep REST endpoints alive for backwards compat",
2531
+ });
2532
+ expect(result.score).toBeGreaterThanOrEqual(90);
2533
+ expect(result.verdict).toBe("proceed");
2534
+ });
2535
+ it("persists the check to SQLite", async () => {
2536
+ const result = await tool.handler({
2537
+ task: "Test persistence",
2538
+ why: "Verifying that critter checks are saved for accountability",
2539
+ who: "The test suite validating the critter tool",
2540
+ });
2541
+ expect(result.id).toBeDefined();
2542
+ expect(result.id).toMatch(/^crit_/);
2543
+ });
2544
+ });
2545
+ // These tests verify that when a neural embedding provider IS available,
2546
+ // natural language queries that lexical search struggles with get boosted.
2547
+ // When no provider is available, they gracefully skip.
2548
+ import { _setIndexForTesting, _resetForTesting as resetEmbedding } from "../tools/embeddingProvider.js";
2549
+ import { _resetCooccurrenceCache, _setCooccurrenceForTesting, _setWrrfParamsForTesting, _resetWrrfParamsForTesting } from "../tools/toolRegistry.js";
2550
+ describe("Embedding search: RRF integration with hybridSearch", () => {
2551
+ it("hybridSearch accepts embeddingQueryVec option without error", () => {
2552
+ // Even without an embedding index loaded, hybridSearch should not throw
2553
+ const results = hybridSearch("verify code", toolDescs, {
2554
+ mode: "hybrid",
2555
+ limit: 5,
2556
+ embeddingQueryVec: new Float32Array([0.5, 0.3, 0.1]),
2557
+ });
2558
+ // Should still return results from lexical strategies
2559
+ expect(results.length).toBeGreaterThan(0);
2560
+ });
2561
+ it("embedding mode without index has no embedding reasons", () => {
2562
+ resetEmbedding();
2563
+ const results = hybridSearch("keep track of lessons", toolDescs, {
2564
+ mode: "embedding",
2565
+ limit: 5,
2566
+ explain: true,
2567
+ });
2568
+ // Without an embedding index, no results should have embedding reasons
2569
+ for (const r of results) {
2570
+ expect(r.matchReasons.some((m) => m.startsWith("embedding:"))).toBe(false);
2571
+ }
2572
+ });
2573
+ it("embedding RRF adds score when index is loaded with mock vectors", () => {
2574
+ // Build a simple mock index: record_learning gets a vector close to the query
2575
+ const mockEntries = toolDescs.map((t) => ({
2576
+ name: t.name,
2577
+ // Give record_learning a "close" vector, everything else a distant one
2578
+ vector: t.name === "record_learning"
2579
+ ? new Float32Array([0.9, 0.1, 0.0])
2580
+ : new Float32Array([0.1, 0.1, 0.9]),
2581
+ nodeType: "tool",
2582
+ }));
2583
+ _setIndexForTesting(mockEntries);
2584
+ const queryVec = new Float32Array([1.0, 0.0, 0.0]);
2585
+ const results = hybridSearch("remember what I learned", toolDescs, {
2586
+ mode: "hybrid",
2587
+ limit: 10,
2588
+ explain: true,
2589
+ embeddingQueryVec: queryVec,
2590
+ });
2591
+ // record_learning should appear and have an embedding:tool_rrf reason
2592
+ const recordLearning = results.find((r) => r.name === "record_learning");
2593
+ expect(recordLearning).toBeDefined();
2594
+ expect(recordLearning.matchReasons.some((r) => r.startsWith("embedding:tool_rrf"))).toBe(true);
2595
+ // Clean up
2596
+ resetEmbedding();
2597
+ });
2598
+ it("embedding-only mode with mock index ranks by RRF", () => {
2599
+ // Set up mock where start_verification_cycle is closest to query
2600
+ const mockEntries = toolDescs.map((t) => ({
2601
+ name: t.name,
2602
+ vector: t.name === "start_verification_cycle"
2603
+ ? new Float32Array([0.95, 0.05, 0.0])
2604
+ : t.name === "run_quality_gate"
2605
+ ? new Float32Array([0.7, 0.3, 0.0])
2606
+ : new Float32Array([0.05, 0.05, 0.9]),
2607
+ nodeType: "tool",
2608
+ }));
2609
+ _setIndexForTesting(mockEntries);
2610
+ const queryVec = new Float32Array([1.0, 0.0, 0.0]);
2611
+ const results = hybridSearch("ensure correctness", toolDescs, {
2612
+ mode: "embedding",
2613
+ limit: 5,
2614
+ explain: true,
2615
+ embeddingQueryVec: queryVec,
2616
+ });
2617
+ // In embedding-only mode, results should come from embedding RRF only
2618
+ expect(results.length).toBeGreaterThan(0);
2619
+ expect(results[0].name).toBe("start_verification_cycle");
2620
+ expect(results[0].matchReasons.some((r) => r.startsWith("embedding:tool_rrf"))).toBe(true);
2621
+ resetEmbedding();
2622
+ });
2623
+ });
2624
+ // ── Agent-as-a-Graph: structural property tests ──────────────────────────
2625
+ // These tests verify the STRUCTURAL properties of the bipartite graph search:
2626
+ // 1. Domain-only proximity lifts siblings (upward traversal)
2627
+ // 2. Type-specific wRRF weight asymmetry (α_D=1.5 > α_T=1.0, per paper + ablation)
2628
+ // 3. Strong lexical matches survive noisy embeddings (non-regression)
2629
+ // 4. Execution trace edges boost co-occurring tools
2630
+ //
2631
+ // Unlike tautological tests that mock the "right answer" as close, these tests
2632
+ // prove the ALGORITHM works by testing its structural invariants.
2633
describe("Agent-as-a-Graph: bipartite wRRF structural properties", () => {
  /**
   * Build a bipartite embedding index where ONLY the given categories'
   * domain nodes sit close to the canonical query direction [1, 0, 0];
   * every tool node is placed far away. Any rank lift observed with this
   * index can therefore only come from domain-level proximity.
   * @param {Set<string>} closeDomains - category names whose domain nodes should be near the query.
   * @returns {{name: string, vector: Float32Array, nodeType: string}[]} mock index entries.
   */
  function buildDomainOnlyIndex(closeDomains) {
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    // ALL tool nodes are distant from query — no direct tool match
    const toolEntries = toolDescs.map((t) => ({
      name: t.name,
      vector: new Float32Array([0.1, 0.1, 0.8]),
      nodeType: "tool",
    }));
    // Only specified domains are close to query
    const domainEntries = [...categories].map((cat) => ({
      name: `domain:${cat}`,
      vector: closeDomains.has(cat)
        ? new Float32Array([0.85, 0.15, 0.0])
        : new Float32Array([0.05, 0.05, 0.9]),
      nodeType: "domain",
    }));
    return [...toolEntries, ...domainEntries];
  }
  afterEach(() => {
    resetEmbedding();
    _resetCooccurrenceCache();
  });
  it("domain-only embedding proximity causes measurable rank lift for sibling tools", () => {
    // Prove CAUSATION, not just presence: compare ranks WITH vs WITHOUT domain proximity.
    // Use a query that gives moderate lexical scores to research_writing tools,
    // then show domain_rrf lifts them higher.
    const query = "polish text for submission";
    // Step 1: Baseline — lexical only (no embeddings)
    resetEmbedding();
    const baseline = hybridSearch(query, toolDescs, {
      mode: "hybrid",
      limit: 30,
      explain: true,
    });
    // Find a research_writing tool in baseline and record its rank
    const rwToolBaseline = baseline.findIndex((r) => r.category === "research_writing");
    // It should exist somewhere (polish/text/submission have some keyword overlap)
    expect(rwToolBaseline).toBeGreaterThanOrEqual(0);
    const rwToolName = baseline[rwToolBaseline].name;
    const rwBaselineScore = baseline[rwToolBaseline].score;
    // Step 2: With domain-only embeddings (research_writing domain close, NO tools close)
    const mockIndex = buildDomainOnlyIndex(new Set(["research_writing"]));
    _setIndexForTesting(mockIndex);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const enhanced = hybridSearch(query, toolDescs, {
      mode: "hybrid",
      limit: 30,
      explain: true,
      embeddingQueryVec: queryVec,
    });
    const rwToolEnhanced = enhanced.find((r) => r.name === rwToolName);
    expect(rwToolEnhanced).toBeDefined();
    // CAUSATION: score increased due to domain_rrf
    expect(rwToolEnhanced.score).toBeGreaterThan(rwBaselineScore);
    expect(rwToolEnhanced.matchReasons.some((r) => r.includes("domain_rrf"))).toBe(true);
    // No tool_rrf (all tools are equally distant)
    expect(rwToolEnhanced.matchReasons.some((r) => r.includes("tool_rrf"))).toBe(false);
    // Rank should improve (lower index = higher rank)
    const rwEnhancedIdx = enhanced.findIndex((r) => r.name === rwToolName);
    expect(rwEnhancedIdx).toBeLessThanOrEqual(rwToolBaseline);
  });
  it("multiple close domains each lift their own sibling tools independently", () => {
    // Setup: security AND vision domains close, but no tools close
    const mockIndex = buildDomainOnlyIndex(new Set(["security", "vision"]));
    _setIndexForTesting(mockIndex);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const results = hybridSearch("analyze security visual", toolDescs, {
      mode: "embedding",
      limit: 30,
      explain: true,
      embeddingQueryVec: queryVec,
    });
    const securityTools = results.filter((r) => r.category === "security" && r.matchReasons.some((mr) => mr.includes("domain_rrf(security")));
    const visionTools = results.filter((r) => r.category === "vision" && r.matchReasons.some((mr) => mr.includes("domain_rrf(vision")));
    // Both categories should have siblings lifted
    expect(securityTools.length).toBeGreaterThanOrEqual(1);
    expect(visionTools.length).toBeGreaterThanOrEqual(1);
  });
  it("type-specific wRRF: domain_rrf score exceeds tool_rrf (paper calibration α_D=1.5 > α_T=1.0)", () => {
    // After ablation (see "wRRF α ratio ablation" test), paper's domain emphasis wins.
    // At rank 1: α_D * 1000/(K+1) = 1.5 * 1000/61 ≈ 25, α_T * 1000/(K+1) = 1.0 * 1000/61 ≈ 16.
    // Domain emphasis means category-level matches contribute MORE than individual tool matches,
    // which helps surface all tools in a matching domain (upward traversal).
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const targetTool = "polish_academic_text";
    // Only the target tool AND its domain are close to the query, so the
    // target accumulates BOTH a tool_rrf and a domain_rrf contribution.
    const toolEntries = toolDescs.map((t) => ({
      name: t.name,
      vector: t.name === targetTool
        ? new Float32Array([0.95, 0.05, 0.0])
        : new Float32Array([0.1, 0.1, 0.8]),
      nodeType: "tool",
    }));
    const domainEntries = [...categories].map((cat) => ({
      name: `domain:${cat}`,
      vector: cat === "research_writing"
        ? new Float32Array([0.90, 0.10, 0.0])
        : new Float32Array([0.05, 0.05, 0.9]),
      nodeType: "domain",
    }));
    _setIndexForTesting([...toolEntries, ...domainEntries]);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const results = hybridSearch("academic writing", toolDescs, {
      mode: "embedding",
      limit: 20,
      explain: true,
      embeddingQueryVec: queryVec,
    });
    const target = results.find((r) => r.name === targetTool);
    expect(target).toBeDefined();
    // Extract individual RRF scores from matchReasons
    const toolRrfReason = target.matchReasons.find((r) => r.startsWith("embedding:tool_rrf"));
    const domainRrfReason = target.matchReasons.find((r) => r.startsWith("embedding:domain_rrf"));
    expect(toolRrfReason).toBeDefined();
    expect(domainRrfReason).toBeDefined();
    // Number.parseInt with an explicit radix — the global no-radix form can
    // misparse strings with leading zeros in older environments.
    const toolScore = Number.parseInt(toolRrfReason.match(/\+(\d+)/)?.[1] ?? "0", 10);
    const domainScore = Number.parseInt(domainRrfReason.match(/\+(\d+)/)?.[1] ?? "0", 10);
    // α_D=1.5 > α_T=1.0 → domain_rrf contributes more than tool_rrf at similar ranks
    expect(domainScore).toBeGreaterThan(toolScore);
  });
  it("strong lexical matches are not displaced by noisy embeddings", () => {
    // "start verification cycle" should easily find start_verification_cycle lexically.
    // Adding uniformly noisy embeddings should NOT knock it from #1.
    resetEmbedding();
    const lexicalResults = hybridSearch("start verification cycle", toolDescs, {
      mode: "hybrid",
      limit: 5,
    });
    expect(lexicalResults[0].name).toBe("start_verification_cycle");
    // Add noisy embeddings — all vectors point roughly the same direction
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const toolEntries = toolDescs.map((t, i) => ({
      name: t.name,
      vector: new Float32Array([0.2 + (i % 10) * 0.01, 0.3, 0.7]),
      nodeType: "tool",
    }));
    const domainEntries = [...categories].map((cat, i) => ({
      name: `domain:${cat}`,
      vector: new Float32Array([0.15 + i * 0.02, 0.25, 0.7]),
      nodeType: "domain",
    }));
    _setIndexForTesting([...toolEntries, ...domainEntries]);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const graphResults = hybridSearch("start verification cycle", toolDescs, {
      mode: "hybrid",
      limit: 5,
      embeddingQueryVec: queryVec,
    });
    // Lexical dominance should preserve #1 position
    expect(graphResults[0].name).toBe("start_verification_cycle");
  });
});
2786
// ── Agent-as-a-Graph: execution trace edge tests ──────────────────────────
// Validates that co-occurrence edges mined from tool_call_log boost results.
// Uses _setCooccurrenceForTesting to inject deterministic edges.
//
// Key insight: trace edges only boost tools that ALREADY scored > 0 from
// lexical matching. They lift borderline tools, not create results from nothing.
// Tests use a data-driven approach: run baseline first, then inject edges
// targeting actual result entries.
describe("Agent-as-a-Graph: execution trace edges", () => {
  const TRACE_QUERY = "verify test quality";
  // Every test in this suite issues the identical hybrid query; centralise it.
  const runTraceQuery = () =>
    hybridSearch(TRACE_QUERY, toolDescs, { mode: "hybrid", limit: 15, explain: true });
  // Predicate: did a result entry receive the exact +4 trace boost marker?
  const hasTraceBoost = (entry) =>
    entry.matchReasons.some((reason) => reason === "trace_edge:+4");
  afterEach(() => {
    resetEmbedding();
    _resetCooccurrenceCache();
  });
  it("co-occurrence edges boost a non-top-5 tool by exactly +4", () => {
    // Step 1: observe the natural ranking with no trace edges at all.
    _setCooccurrenceForTesting(new Map());
    const natural = runTraceQuery();
    expect(natural.length).toBeGreaterThanOrEqual(6);
    const leader = natural[0].name;
    const candidate = natural[5].name; // position 6 — NOT in the top 5
    const candidateScore = natural[5].score;
    // Step 2: wire a trace edge leader → candidate, then search again.
    _resetCooccurrenceCache();
    _setCooccurrenceForTesting(new Map([[leader, [candidate]]]));
    const rerun = runTraceQuery();
    const lifted = rerun.find((r) => r.name === candidate);
    expect(lifted).toBeDefined();
    expect(lifted.score).toBe(candidateScore + 4);
    expect(hasTraceBoost(lifted)).toBe(true);
  });
  it("top-5 tools do NOT receive trace edge self-boost", () => {
    // Observe the natural ranking first.
    _setCooccurrenceForTesting(new Map());
    const natural = runTraceQuery();
    const leader = natural[0].name;
    const leaderScore = natural[0].score;
    const runnerUp = natural[1].name;
    // Edge FROM the runner-up TO the leader — the leader is already top-5.
    _resetCooccurrenceCache();
    _setCooccurrenceForTesting(new Map([[runnerUp, [leader]]]));
    const rerun = runTraceQuery();
    const top = rerun.find((r) => r.name === leader);
    expect(top).toBeDefined();
    // Score must stay put — top-5 tools are excluded from the trace boost.
    expect(top.score).toBe(leaderScore);
    expect(hasTraceBoost(top)).toBe(false);
  });
  it("empty co-occurrence map produces no trace_edge boosts", () => {
    _setCooccurrenceForTesting(new Map());
    for (const entry of runTraceQuery()) {
      expect(entry.matchReasons.some((mr) => mr.includes("trace_edge"))).toBe(false);
    }
  });
  it("trace edges from multiple top tools merge — both targets get +4", () => {
    // Observe the natural ranking first.
    _setCooccurrenceForTesting(new Map());
    const natural = runTraceQuery();
    expect(natural.length).toBeGreaterThanOrEqual(8);
    const first = natural[0].name;
    const second = natural[1].name;
    const targetA = natural[6].name;
    const targetB = natural[7].name;
    const targetABase = natural[6].score;
    const targetBBase = natural[7].score;
    // Two distinct top tools each point at a different lower-ranked target.
    _resetCooccurrenceCache();
    _setCooccurrenceForTesting(new Map([
      [first, [targetA]],
      [second, [targetB]],
    ]));
    const rerun = runTraceQuery();
    const liftedA = rerun.find((r) => r.name === targetA);
    const liftedB = rerun.find((r) => r.name === targetB);
    expect(liftedA).toBeDefined();
    expect(liftedB).toBeDefined();
    expect(liftedA.score).toBe(targetABase + 4);
    expect(liftedB.score).toBe(targetBBase + 4);
    expect(hasTraceBoost(liftedA)).toBe(true);
    expect(hasTraceBoost(liftedB)).toBe(true);
  });
});
2901
// ── Industry-Standard IR Metrics: Recall@K, mAP@K, NDCG@K ──────────────
// Every tool retrieval paper (ToolBench, AnyTool, Agent-as-a-Graph, TOOLRET)
// reports these metrics. We evaluate hybrid search against 15 intent-based
// queries with ground-truth relevant tool sets.
//
// Standards compared against:
// - Agent-as-a-Graph (arxiv:2511.18194): Recall@5=0.85, NDCG@5=0.47
// - TOOLRET (ACL 2025): best NDCG@10=33.83 (bi-encoder only)
// - ToolBench: NDCG@5=84.9 (contrastive-trained Sentence-BERT)
//
// Our system is different (single MCP server, 163 tools, 14-strategy ensemble)
// so absolute numbers aren't comparable, but we should track and not regress.
describe("Industry-standard IR metrics: Recall@K, mAP@K, NDCG@K", () => {
  // Ground truth: query → set of relevant tools (any order).
  // Each query has 3-6 relevant tools, reflecting realistic intent breadth.
  const EVAL_QUERIES = [
    { query: "verify my implementation is correct", relevant: ["start_verification_cycle", "get_verification_status", "log_test_result", "run_quality_gate", "triple_verify"] },
    { query: "search past findings and lessons", relevant: ["search_all_knowledge", "record_learning", "load_session_notes"] },
    { query: "run security audit on codebase", relevant: ["scan_dependencies", "run_code_analysis", "scan_terminal_security", "assess_risk"] },
    { query: "write and polish academic paper", relevant: ["polish_academic_text", "check_paper_logic", "generate_academic_caption", "review_paper_as_reviewer"] },
    { query: "coordinate parallel agent tasks", relevant: ["claim_agent_task", "get_parallel_status", "assign_agent_role", "bootstrap_parallel_agents", "release_agent_task"] },
    { query: "check website performance and SEO", relevant: ["seo_audit_url", "check_page_performance", "analyze_seo_content"] },
    { query: "save and recall context between sessions", relevant: ["save_session_note", "load_session_notes", "refresh_task_context"] },
    { query: "review git compliance before merge", relevant: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
    { query: "benchmark model autonomy", relevant: ["start_autonomy_benchmark", "complete_autonomy_benchmark", "log_benchmark_milestone"] },
    { query: "capture screenshot of UI state", relevant: ["capture_screenshot", "capture_full_page", "compare_screenshots"] },
    { query: "encode data in compact token format", relevant: ["toon_encode", "toon_decode"] },
    { query: "mine patterns from past sessions", relevant: ["mine_session_patterns", "predict_risks_from_patterns"] },
    { query: "detect video flicker artifacts", relevant: ["analyze_video_flicker", "compare_video_segments", "get_flicker_report"] },
    { query: "design voice interaction pipeline", relevant: ["design_voice_pipeline", "analyze_voice_config", "generate_voice_scaffold", "benchmark_voice_latency"] },
    { query: "check if this task is worth doing", relevant: ["critter_check"] },
  ];
  /**
   * Recall@K: fraction of the relevant set appearing in the top-K ranked names.
   * @param {string[]} ranked - tool names in rank order.
   * @param {Set<string>} relevant - ground-truth relevant tool names.
   * @param {number} k - rank cutoff.
   * @returns {number} recall in [0, 1].
   */
  function recallAtK(ranked, relevant, k) {
    // Guard the empty-relevant case (0/0 would yield NaN) — mirrors the
    // guard already present in averagePrecisionAtK.
    if (relevant.size === 0)
      return 0;
    const topK = ranked.slice(0, k);
    const found = topK.filter((name) => relevant.has(name)).length;
    return found / relevant.size;
  }
  /**
   * Average Precision@K with binary relevance: sum of precision values at
   * each rank where a relevant item appears, normalized by |relevant|.
   * @param {string[]} ranked - tool names in rank order.
   * @param {Set<string>} relevant - ground-truth relevant tool names.
   * @param {number} k - rank cutoff.
   * @returns {number} AP@K in [0, 1].
   */
  function averagePrecisionAtK(ranked, relevant, k) {
    let hits = 0;
    let sumPrecision = 0;
    for (let i = 0; i < Math.min(k, ranked.length); i++) {
      if (relevant.has(ranked[i])) {
        hits++;
        sumPrecision += hits / (i + 1);
      }
    }
    return relevant.size === 0 ? 0 : sumPrecision / relevant.size;
  }
  /**
   * NDCG@K with binary relevance (gain 1 if relevant, 0 otherwise),
   * normalized by the ideal DCG of min(k, |relevant|) hits at the top ranks.
   * @param {string[]} ranked - tool names in rank order.
   * @param {Set<string>} relevant - ground-truth relevant tool names.
   * @param {number} k - rank cutoff.
   * @returns {number} NDCG@K in [0, 1].
   */
  function ndcgAtK(ranked, relevant, k) {
    // Binary relevance: 1 if relevant, 0 otherwise
    let dcg = 0;
    for (let i = 0; i < Math.min(k, ranked.length); i++) {
      if (relevant.has(ranked[i]))
        dcg += 1 / Math.log2(i + 2);
    }
    let idcg = 0;
    const idealCount = Math.min(k, relevant.size);
    for (let i = 0; i < idealCount; i++) {
      idcg += 1 / Math.log2(i + 2);
    }
    return idcg === 0 ? 0 : dcg / idcg;
  }
  /**
   * Run every EVAL_QUERY through `searchFn` and average the five metrics.
   * @param {string} configLabel - human-readable tag for this configuration.
   *   Currently unused at runtime; kept so call sites stay self-documenting.
   * @param {(query: string) => string[]} searchFn - returns ranked tool names.
   * @returns {{recall1: number, recall3: number, recall5: number, map5: number, ndcg5: number}}
   */
  function evaluateConfig(configLabel, searchFn) {
    let totalRecall1 = 0, totalRecall3 = 0, totalRecall5 = 0, totalMap5 = 0, totalNdcg5 = 0;
    for (const { query, relevant } of EVAL_QUERIES) {
      const relevantSet = new Set(relevant);
      const ranked = searchFn(query);
      totalRecall1 += recallAtK(ranked, relevantSet, 1);
      totalRecall3 += recallAtK(ranked, relevantSet, 3);
      totalRecall5 += recallAtK(ranked, relevantSet, 5);
      totalMap5 += averagePrecisionAtK(ranked, relevantSet, 5);
      totalNdcg5 += ndcgAtK(ranked, relevantSet, 5);
    }
    const n = EVAL_QUERIES.length;
    return {
      recall1: totalRecall1 / n,
      recall3: totalRecall3 / n,
      recall5: totalRecall5 / n,
      map5: totalMap5 / n,
      ndcg5: totalNdcg5 / n,
    };
  }
  afterEach(() => {
    resetEmbedding();
    _resetCooccurrenceCache();
    _resetWrrfParamsForTesting();
  });
  it("hybrid search (lexical only) meets minimum IR thresholds", () => {
    // Baseline: no embeddings, pure lexical ensemble (keyword + fuzzy + n-gram + semantic + dense)
    resetEmbedding();
    const metrics = evaluateConfig("lexical-only", (query) => {
      const results = hybridSearch(query, toolDescs, { mode: "hybrid", limit: 10 });
      return results.map((r) => r.name);
    });
    // Minimum thresholds for our 14-strategy lexical ensemble
    // These are regression guards — if we drop below, something broke.
    expect(metrics.recall5).toBeGreaterThanOrEqual(0.55);
    expect(metrics.map5).toBeGreaterThanOrEqual(0.40);
    expect(metrics.ndcg5).toBeGreaterThanOrEqual(0.50);
  });
  it("hybrid + embedding search improves over lexical-only baseline", () => {
    // Build a realistic mock index: tools close to their own category
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const catList = [...categories];
    // Each category gets a unique direction in a high-dim space
    const mockIndex = toolDescs.map((t) => {
      const entry = TOOL_REGISTRY.get(t.name);
      const catIdx = catList.indexOf(entry?.category ?? "");
      // Tools in same category share a similar vector direction
      const vec = new Float32Array(catList.length + 1);
      if (catIdx >= 0)
        vec[catIdx] = 0.8;
      vec[catList.length] = 0.2; // small shared component
      // Normalize to unit length (non-zero by construction: shared component)
      let norm = 0;
      for (let i = 0; i < vec.length; i++)
        norm += vec[i] * vec[i];
      norm = Math.sqrt(norm);
      for (let i = 0; i < vec.length; i++)
        vec[i] /= norm;
      return { name: t.name, vector: vec, nodeType: "tool" };
    });
    const domainIndex = catList.map((cat, catIdx) => {
      const vec = new Float32Array(catList.length + 1);
      vec[catIdx] = 0.9;
      vec[catList.length] = 0.1;
      let norm = 0;
      for (let i = 0; i < vec.length; i++)
        norm += vec[i] * vec[i];
      norm = Math.sqrt(norm);
      for (let i = 0; i < vec.length; i++)
        vec[i] /= norm;
      return { name: `domain:${cat}`, vector: vec, nodeType: "domain" };
    });
    _setIndexForTesting([...mockIndex, ...domainIndex]);
    // Lexical baseline (resetEmbedding clears the injected index first)
    resetEmbedding();
    const lexicalMetrics = evaluateConfig("lexical", (query) => {
      const results = hybridSearch(query, toolDescs, { mode: "hybrid", limit: 10 });
      return results.map((r) => r.name);
    });
    // Hybrid + embedding: re-inject the index for the second pass
    _setIndexForTesting([...mockIndex, ...domainIndex]);
    const embeddingMetrics = evaluateConfig("hybrid+embedding", (query) => {
      // Simulate query embedding: average of relevant category vectors
      const queryWords = query.toLowerCase().split(/\s+/);
      const queryVec = new Float32Array(catList.length + 1);
      for (const cat of catList) {
        if (queryWords.some((w) => cat.includes(w) || w.includes(cat.slice(0, 4)))) {
          queryVec[catList.indexOf(cat)] = 0.7;
        }
      }
      queryVec[catList.length] = 0.3;
      let norm = 0;
      for (let i = 0; i < queryVec.length; i++)
        norm += queryVec[i] * queryVec[i];
      norm = Math.sqrt(norm) || 1;
      for (let i = 0; i < queryVec.length; i++)
        queryVec[i] /= norm;
      const results = hybridSearch(query, toolDescs, {
        mode: "hybrid",
        limit: 10,
        embeddingQueryVec: queryVec,
      });
      return results.map((r) => r.name);
    });
    // Embedding should not degrade any metric (non-regression)
    expect(embeddingMetrics.ndcg5).toBeGreaterThanOrEqual(lexicalMetrics.ndcg5 - 0.02);
  });
});
3071
// ── wRRF α ratio ablation: paper vs our calibration ──────────────────────
// Agent-as-a-Graph (arxiv:2511.18194) optimal: α_A=1.5, α_T=1.0, K=60
// Our calibration: α_T=1.0, α_D=0.6, K=20
//
// The paper optimizes for agent SELECTION across 70 MCP servers.
// We optimize for tool RETRIEVAL within a single server.
// This ablation verifies our deviation is justified by measuring Recall@5.
describe("wRRF α ratio ablation: paper vs NodeBench calibration", () => {
  const ABLATION_QUERIES = [
    { query: "verify my implementation", relevant: ["start_verification_cycle", "get_verification_status", "log_test_result"] },
    { query: "search past findings", relevant: ["search_all_knowledge", "record_learning", "load_session_notes"] },
    { query: "run security checks", relevant: ["scan_dependencies", "run_code_analysis", "scan_terminal_security"] },
    { query: "coordinate parallel work", relevant: ["claim_agent_task", "get_parallel_status", "assign_agent_role"] },
    { query: "capture UI screenshots", relevant: ["capture_screenshot", "capture_full_page", "compare_screenshots"] },
    { query: "review git compliance", relevant: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
    { query: "write academic paper", relevant: ["polish_academic_text", "check_paper_logic", "generate_academic_caption"] },
    { query: "check website performance", relevant: ["seo_audit_url", "check_page_performance", "analyze_seo_content"] },
  ];
  // Scale a vector to unit length in place. No zero guard: index vectors
  // built below always carry at least one non-zero component.
  function unitScale(vec) {
    let sumSq = 0;
    for (let i = 0; i < vec.length; i++)
      sumSq += vec[i] * vec[i];
    const len = Math.sqrt(sumSq);
    for (let i = 0; i < vec.length; i++)
      vec[i] /= len;
  }
  // Build a mock index where tools cluster around their category axis and
  // each domain node sits almost exactly on that axis.
  function buildCategoryAwareIndex() {
    const catList = [...new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category))];
    const toolEntries = toolDescs.map((t) => {
      const registryEntry = TOOL_REGISTRY.get(t.name);
      const catIdx = catList.indexOf(registryEntry?.category ?? "");
      const vec = new Float32Array(catList.length);
      if (catIdx >= 0) {
        vec[catIdx] = 0.85;
      }
      // Small per-name perturbation so same-category tools differ slightly
      const nameHash = t.name
        .split("")
        .reduce((h, c) => ((h << 5) - h + c.charCodeAt(0)) | 0, 0);
      vec[Math.abs(nameHash) % catList.length] += 0.1;
      unitScale(vec);
      return { name: t.name, vector: vec, nodeType: "tool" };
    });
    const domainEntries = catList.map((cat, catIdx) => {
      const vec = new Float32Array(catList.length);
      vec[catIdx] = 0.95;
      unitScale(vec);
      return { name: `domain:${cat}`, vector: vec, nodeType: "domain" };
    });
    return [...toolEntries, ...domainEntries];
  }
  // Synthesize a query vector by lighting up every category axis whose name
  // loosely overlaps a query word, then normalizing (guarded: a query may
  // match no category at all, leaving a zero vector).
  function makeQueryVec(query, catList) {
    const words = query.toLowerCase().split(/\s+/);
    const vec = new Float32Array(catList.length);
    catList.forEach((cat, idx) => {
      if (words.some((w) => cat.includes(w) || w.includes(cat.slice(0, 4)))) {
        vec[idx] = 0.8;
      }
    });
    let sumSq = 0;
    for (let i = 0; i < vec.length; i++)
      sumSq += vec[i] * vec[i];
    const len = Math.sqrt(sumSq) || 1;
    for (let i = 0; i < vec.length; i++)
      vec[i] /= len;
    return vec;
  }
  // Mean Recall@5 across ABLATION_QUERIES under the currently-set wRRF params.
  // `label` is accepted for call-site readability; it is not used at runtime.
  function runAblation(label) {
    const catList = [...new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category))];
    const perQueryRecall = ABLATION_QUERIES.map(({ query, relevant }) => {
      const relevantSet = new Set(relevant);
      const queryVec = makeQueryVec(query, catList);
      const ranked = hybridSearch(query, toolDescs, {
        mode: "hybrid",
        limit: 10,
        embeddingQueryVec: queryVec,
      });
      const hits = ranked
        .slice(0, 5)
        .map((r) => r.name)
        .filter((n) => relevantSet.has(n)).length;
      return hits / relevantSet.size;
    });
    return perQueryRecall.reduce((sum, r) => sum + r, 0) / ABLATION_QUERIES.length;
  }
  afterEach(() => {
    resetEmbedding();
    _resetWrrfParamsForTesting();
  });
  it("ablation grid: find optimal α_D and K for single-server tool retrieval", () => {
    const mockIndex = buildCategoryAwareIndex();
    const configs = [
      { label: "old(T=1.0,D=0.6,K=20)", alphaT: 1.0, alphaD: 0.6, k: 20 },
      { label: "paper(T=1.0,D=1.5,K=60)", alphaT: 1.0, alphaD: 1.5, k: 60 },
      { label: "paperK20(T=1.0,D=1.5,K=20)", alphaT: 1.0, alphaD: 1.5, k: 20 },
      { label: "balanced(T=1.0,D=1.0,K=20)", alphaT: 1.0, alphaD: 1.0, k: 20 },
      { label: "gentleDom(T=1.0,D=1.2,K=20)", alphaT: 1.0, alphaD: 1.2, k: 20 },
      { label: "strongDom(T=1.0,D=2.0,K=20)", alphaT: 1.0, alphaD: 2.0, k: 20 },
    ];
    const results = configs.map((cfg) => {
      _setIndexForTesting(mockIndex);
      _setWrrfParamsForTesting({ alphaT: cfg.alphaT, alphaD: cfg.alphaD, k: cfg.k });
      return { label: cfg.label, recall: runAblation(cfg.label) };
    });
    // Sort by recall descending to find winner
    results.sort((a, b) => b.recall - a.recall);
    console.log(`wRRF ablation grid — Recall@5:\n${results.map((r) => ` ${r.label}: ${r.recall.toFixed(3)}`).join("\n")}`);
    // The winning config should be used as our production default.
    // Assert the winner beats the old default by at least not being worse.
    const oldResult = results.find((r) => r.label.startsWith("old"));
    const bestResult = results[0];
    expect(bestResult.recall).toBeGreaterThanOrEqual(oldResult.recall);
  });
});
2183
3185
  //# sourceMappingURL=tools.test.js.map