nodebench-mcp 2.14.1 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/NODEBENCH_AGENTS.md +19 -9
- package/README.md +42 -19
- package/dist/__tests__/critterCalibrationEval.d.ts +8 -0
- package/dist/__tests__/critterCalibrationEval.js +370 -0
- package/dist/__tests__/critterCalibrationEval.js.map +1 -0
- package/dist/__tests__/embeddingProvider.test.d.ts +1 -0
- package/dist/__tests__/embeddingProvider.test.js +86 -0
- package/dist/__tests__/embeddingProvider.test.js.map +1 -0
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js +1 -1
- package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityEval.test.js +541 -27
- package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js +1 -1
- package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js +473 -4
- package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
- package/dist/__tests__/tools.test.js +1010 -8
- package/dist/__tests__/tools.test.js.map +1 -1
- package/dist/db.js +64 -0
- package/dist/db.js.map +1 -1
- package/dist/index.js +70 -9
- package/dist/index.js.map +1 -1
- package/dist/tools/critterTools.d.ts +21 -0
- package/dist/tools/critterTools.js +230 -0
- package/dist/tools/critterTools.js.map +1 -0
- package/dist/tools/embeddingProvider.d.ts +67 -0
- package/dist/tools/embeddingProvider.js +299 -0
- package/dist/tools/embeddingProvider.js.map +1 -0
- package/dist/tools/progressiveDiscoveryTools.js +24 -7
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/reconTools.js +83 -33
- package/dist/tools/reconTools.js.map +1 -1
- package/dist/tools/toolRegistry.d.ts +30 -2
- package/dist/tools/toolRegistry.js +253 -25
- package/dist/tools/toolRegistry.js.map +1 -1
- package/package.json +13 -3
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* Covers: static, unit, integration layers.
|
|
4
4
|
* Live E2E layer is tested via bash pipe in the flywheel step.
|
|
5
5
|
*/
|
|
6
|
-
import { describe, it, expect } from "vitest";
|
|
6
|
+
import { describe, it, expect, afterEach } from "vitest";
|
|
7
7
|
import os from "node:os";
|
|
8
8
|
import path from "node:path";
|
|
9
9
|
import { mkdtemp, writeFile } from "node:fs/promises";
|
|
@@ -39,7 +39,8 @@ import { patternTools } from "../tools/patternTools.js";
|
|
|
39
39
|
import { gitWorkflowTools } from "../tools/gitWorkflowTools.js";
|
|
40
40
|
import { seoTools } from "../tools/seoTools.js";
|
|
41
41
|
import { voiceBridgeTools } from "../tools/voiceBridgeTools.js";
|
|
42
|
-
import {
|
|
42
|
+
import { critterTools } from "../tools/critterTools.js";
|
|
43
|
+
import { getQuickRef, hybridSearch, TOOL_REGISTRY, SEARCH_MODES, ALL_REGISTRY_ENTRIES, WORKFLOW_CHAINS, tokenize, buildDenseIndex, getToolComplexity } from "../tools/toolRegistry.js";
|
|
43
44
|
// Assemble all tools like index.ts does
|
|
44
45
|
const domainTools = [
|
|
45
46
|
...verificationTools,
|
|
@@ -72,6 +73,7 @@ const domainTools = [
|
|
|
72
73
|
...gitWorkflowTools,
|
|
73
74
|
...seoTools,
|
|
74
75
|
...voiceBridgeTools,
|
|
76
|
+
...critterTools,
|
|
75
77
|
];
|
|
76
78
|
const metaTools = createMetaTools(domainTools);
|
|
77
79
|
const allToolsWithoutDiscovery = [...domainTools, ...metaTools];
|
|
@@ -81,9 +83,9 @@ const allTools = [...allToolsWithoutDiscovery, ...discoveryTools];
|
|
|
81
83
|
// STATIC LAYER — structure validation
|
|
82
84
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
83
85
|
describe("Static: tool structure", () => {
|
|
84
|
-
it("should have
|
|
85
|
-
//
|
|
86
|
-
expect(allTools.length).toBe(
|
|
86
|
+
it("should have 163 tools total", () => {
|
|
87
|
+
// 158 domain tools + 2 meta tools (findTools, getMethodology) + 3 progressive discovery tools
|
|
88
|
+
expect(allTools.length).toBe(163);
|
|
87
89
|
});
|
|
88
90
|
it("every tool has name, description, inputSchema, handler", () => {
|
|
89
91
|
for (const tool of allTools) {
|
|
@@ -99,6 +101,18 @@ describe("Static: tool structure", () => {
|
|
|
99
101
|
const names = allTools.map((t) => t.name);
|
|
100
102
|
expect(new Set(names).size).toBe(names.length);
|
|
101
103
|
});
|
|
104
|
+
it("every registered tool has MCP annotations (category, phase, complexity)", () => {
|
|
105
|
+
// MCP 2025-11-25 spec: annotations field provides behavior hints for models.
|
|
106
|
+
// We surface category, phase, complexity from the registry as annotations.
|
|
107
|
+
for (const tool of allTools) {
|
|
108
|
+
const entry = TOOL_REGISTRY.get(tool.name);
|
|
109
|
+
expect(entry, `Missing registry entry for ${tool.name}`).toBeDefined();
|
|
110
|
+
expect(entry.category).toBeTruthy();
|
|
111
|
+
expect(entry.phase).toBeTruthy();
|
|
112
|
+
const complexity = getToolComplexity(tool.name);
|
|
113
|
+
expect(["low", "medium", "high"]).toContain(complexity);
|
|
114
|
+
}
|
|
115
|
+
});
|
|
102
116
|
it("abandon_cycle tool exists in verificationTools", () => {
|
|
103
117
|
const tool = verificationTools.find((t) => t.name === "abandon_cycle");
|
|
104
118
|
expect(tool).toBeDefined();
|
|
@@ -1799,13 +1813,31 @@ describe("Integration: full benchmark lifecycle", () => {
|
|
|
1799
1813
|
// ═══════════════════════════════════════════════════════════════════════
|
|
1800
1814
|
const toolDescs = allTools.map((t) => ({ name: t.name, description: t.description }));
|
|
1801
1815
|
describe("Search engine: registry coverage", () => {
|
|
1802
|
-
it("should have a registry entry for every tool (
|
|
1816
|
+
it("should have a registry entry for every tool (163/163)", () => {
|
|
1803
1817
|
const missing = allTools.filter((t) => !TOOL_REGISTRY.has(t.name));
|
|
1804
1818
|
expect(missing.map((t) => t.name)).toEqual([]);
|
|
1805
1819
|
expect(TOOL_REGISTRY.size).toBe(allTools.length);
|
|
1806
1820
|
});
|
|
1807
|
-
it("should expose all
|
|
1808
|
-
expect(SEARCH_MODES).toEqual(["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact", "dense"]);
|
|
1821
|
+
it("should expose all 8 search modes", () => {
|
|
1822
|
+
expect(SEARCH_MODES).toEqual(["hybrid", "fuzzy", "regex", "prefix", "semantic", "exact", "dense", "embedding"]);
|
|
1823
|
+
});
|
|
1824
|
+
it("discover_tools category enum covers every registry category", () => {
|
|
1825
|
+
const discoverTool = allTools.find((t) => t.name === "discover_tools");
|
|
1826
|
+
expect(discoverTool).toBeDefined();
|
|
1827
|
+
const categoryEnum = discoverTool.inputSchema.properties.category.enum;
|
|
1828
|
+
const registryCategories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
|
|
1829
|
+
for (const cat of registryCategories) {
|
|
1830
|
+
expect(categoryEnum, `category "${cat}" missing from discover_tools enum`).toContain(cat);
|
|
1831
|
+
}
|
|
1832
|
+
});
|
|
1833
|
+
it("get_workflow_chain enum covers every WORKFLOW_CHAINS key", () => {
|
|
1834
|
+
const chainTool = allTools.find((t) => t.name === "get_workflow_chain");
|
|
1835
|
+
expect(chainTool).toBeDefined();
|
|
1836
|
+
const chainEnum = chainTool.inputSchema.properties.chain.enum;
|
|
1837
|
+
for (const key of Object.keys(WORKFLOW_CHAINS)) {
|
|
1838
|
+
expect(chainEnum, `chain "${key}" missing from get_workflow_chain enum`).toContain(key);
|
|
1839
|
+
}
|
|
1840
|
+
expect(chainEnum).toContain("list");
|
|
1809
1841
|
});
|
|
1810
1842
|
it("should have quickRef for every registered tool", () => {
|
|
1811
1843
|
for (const tool of allTools) {
|
|
@@ -1943,6 +1975,281 @@ describe("Search engine: bigram phrase matching", () => {
|
|
|
1943
1975
|
expect(names.some((n) => n.includes("parallel") || n.includes("agent"))).toBe(true);
|
|
1944
1976
|
});
|
|
1945
1977
|
});
|
|
1978
|
+
// ── Dense search NDCG regression guard ──────────────────────────────────
|
|
1979
|
+
// Tested BM25 vs TF-IDF cosine (v2.14.2): TF-IDF won 0.692 vs 0.691.
|
|
1980
|
+
// BM25's length normalization adds no value for short tool descriptions.
|
|
1981
|
+
// Keeping TF-IDF cosine. This test guards against ranking regressions.
|
|
1982
|
+
describe("Search engine: dense search NDCG@5 regression guard", () => {
|
|
1983
|
+
function ndcg(rankedNames, idealNames, k) {
|
|
1984
|
+
const relevance = new Map();
|
|
1985
|
+
idealNames.forEach((name, i) => relevance.set(name, idealNames.length - i));
|
|
1986
|
+
let dcg = 0;
|
|
1987
|
+
for (let i = 0; i < Math.min(k, rankedNames.length); i++) {
|
|
1988
|
+
const rel = relevance.get(rankedNames[i]) ?? 0;
|
|
1989
|
+
dcg += rel / Math.log2(i + 2);
|
|
1990
|
+
}
|
|
1991
|
+
let idcg = 0;
|
|
1992
|
+
const idealRels = idealNames.map((_, i) => idealNames.length - i).sort((a, b) => b - a);
|
|
1993
|
+
for (let i = 0; i < Math.min(k, idealRels.length); i++) {
|
|
1994
|
+
idcg += idealRels[i] / Math.log2(i + 2);
|
|
1995
|
+
}
|
|
1996
|
+
return idcg === 0 ? 0 : dcg / idcg;
|
|
1997
|
+
}
|
|
1998
|
+
const EVAL_QUERIES = [
|
|
1999
|
+
{ query: "verify my implementation", ideal: ["start_verification_cycle", "get_verification_status", "log_test_result", "run_quality_gate", "triple_verify"] },
|
|
2000
|
+
{ query: "security audit", ideal: ["scan_dependencies", "run_code_analysis", "scan_terminal_security", "assess_risk", "check_git_compliance"] },
|
|
2001
|
+
{ query: "write an academic paper", ideal: ["polish_academic_text", "check_paper_logic", "generate_academic_caption", "review_paper_as_reviewer", "compress_or_expand_text"] },
|
|
2002
|
+
{ query: "deploy my changes", ideal: ["run_mandatory_flywheel", "run_quality_gate", "assess_risk", "run_closed_loop", "log_test_result"] },
|
|
2003
|
+
{ query: "parallel agent coordination", ideal: ["claim_agent_task", "get_parallel_status", "assign_agent_role", "bootstrap_parallel_agents", "release_agent_task"] },
|
|
2004
|
+
{ query: "seo website performance", ideal: ["seo_audit_url", "check_page_performance", "analyze_seo_content", "check_wordpress_site", "scan_wordpress_updates"] },
|
|
2005
|
+
{ query: "voice pipeline latency", ideal: ["benchmark_voice_latency", "design_voice_pipeline", "analyze_voice_config", "generate_voice_scaffold"] },
|
|
2006
|
+
{ query: "session notes context", ideal: ["save_session_note", "load_session_notes", "refresh_task_context"] },
|
|
2007
|
+
{ query: "git compliance merge", ideal: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
|
|
2008
|
+
{ query: "benchmark autonomous capability", ideal: ["start_autonomy_benchmark", "complete_autonomy_benchmark", "log_benchmark_milestone", "benchmark_models"] },
|
|
2009
|
+
{ query: "find tools for testing", ideal: ["discover_tools", "findTools", "log_test_result", "run_tests_cli", "start_eval_run"] },
|
|
2010
|
+
{ query: "knowledge learning record", ideal: ["record_learning", "search_all_knowledge", "save_session_note"] },
|
|
2011
|
+
];
|
|
2012
|
+
it("TF-IDF cosine dense search should maintain NDCG@5 >= 0.60 across eval queries", () => {
|
|
2013
|
+
const { vectors, idf } = buildDenseIndex();
|
|
2014
|
+
const K = 5;
|
|
2015
|
+
let totalNDCG = 0;
|
|
2016
|
+
for (const { query, ideal } of EVAL_QUERIES) {
|
|
2017
|
+
const queryTokens = tokenize(query.toLowerCase());
|
|
2018
|
+
const queryTf = new Map();
|
|
2019
|
+
for (const t of queryTokens)
|
|
2020
|
+
queryTf.set(t, (queryTf.get(t) ?? 0) + 1);
|
|
2021
|
+
const maxFreq = Math.max(...queryTf.values(), 1);
|
|
2022
|
+
for (const [k, v] of queryTf)
|
|
2023
|
+
queryTf.set(k, v / maxFreq);
|
|
2024
|
+
const queryVec = new Map();
|
|
2025
|
+
for (const [term, tfVal] of queryTf) {
|
|
2026
|
+
queryVec.set(term, tfVal * (idf.get(term) ?? 1));
|
|
2027
|
+
}
|
|
2028
|
+
const scores = [];
|
|
2029
|
+
for (const [name, docVec] of vectors) {
|
|
2030
|
+
let dot = 0, normA = 0, normB = 0;
|
|
2031
|
+
for (const [k, v] of queryVec) {
|
|
2032
|
+
normA += v * v;
|
|
2033
|
+
const bv = docVec.get(k);
|
|
2034
|
+
if (bv !== undefined)
|
|
2035
|
+
dot += v * bv;
|
|
2036
|
+
}
|
|
2037
|
+
for (const v of docVec.values())
|
|
2038
|
+
normB += v * v;
|
|
2039
|
+
const sim = (normA === 0 || normB === 0) ? 0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
2040
|
+
if (sim > 0)
|
|
2041
|
+
scores.push({ name, sim });
|
|
2042
|
+
}
|
|
2043
|
+
scores.sort((a, b) => b.sim - a.sim);
|
|
2044
|
+
const rankedNames = scores.slice(0, K).map(r => r.name);
|
|
2045
|
+
totalNDCG += ndcg(rankedNames, ideal, K);
|
|
2046
|
+
}
|
|
2047
|
+
const avgNDCG = totalNDCG / EVAL_QUERIES.length;
|
|
2048
|
+
expect(avgNDCG).toBeGreaterThanOrEqual(0.60);
|
|
2049
|
+
});
|
|
2050
|
+
});
|
|
2051
|
+
// ── FTS5+BM25 A/B test: search_all_knowledge (recon_findings + gaps) ────
|
|
2052
|
+
// Verifies that FTS5 BM25 ranking produces relevance-ordered results
|
|
2053
|
+
// for variable-length recon findings and gaps, compared to LIKE (recency-only).
|
|
2054
|
+
describe("FTS5 BM25: search_all_knowledge relevance ranking", () => {
|
|
2055
|
+
const searchTool = reconTools.find((t) => t.name === "search_all_knowledge");
|
|
2056
|
+
const logFinding = reconTools.find((t) => t.name === "log_recon_finding");
|
|
2057
|
+
const runRecon = reconTools.find((t) => t.name === "run_recon");
|
|
2058
|
+
const startCycle = verificationTools.find((t) => t.name === "start_verification_cycle");
|
|
2059
|
+
const logGap = verificationTools.find((t) => t.name === "log_gap");
|
|
2060
|
+
it("should rank recon findings by BM25 relevance (term-specific > generic mentions)", async () => {
|
|
2061
|
+
// Setup: create a recon session with varied findings
|
|
2062
|
+
const session = (await runRecon.handler({ target: "BM25 FTS5 test session" }));
|
|
2063
|
+
const sid = session.sessionId;
|
|
2064
|
+
// Insert findings — the "MCP SDK breaking change" finding is highly relevant
|
|
2065
|
+
await logFinding.handler({
|
|
2066
|
+
sessionId: sid,
|
|
2067
|
+
category: "breaking_change",
|
|
2068
|
+
summary: "MCP SDK v2.0 introduces breaking changes to the transport layer requiring migration",
|
|
2069
|
+
relevance: "All MCP servers must update their transport initialization code",
|
|
2070
|
+
actionItems: "Update transport from stdio to new StreamableHTTP pattern",
|
|
2071
|
+
});
|
|
2072
|
+
await logFinding.handler({
|
|
2073
|
+
sessionId: sid,
|
|
2074
|
+
category: "best_practice",
|
|
2075
|
+
summary: "React 19 compiler optimizations reduce bundle size by 15%",
|
|
2076
|
+
relevance: "Frontend build pipeline could benefit from upgrade",
|
|
2077
|
+
actionItems: "Evaluate React 19 migration path",
|
|
2078
|
+
});
|
|
2079
|
+
await logFinding.handler({
|
|
2080
|
+
sessionId: sid,
|
|
2081
|
+
category: "new_feature",
|
|
2082
|
+
summary: "New MCP SDK sampling API enables server-initiated LLM requests",
|
|
2083
|
+
relevance: "MCP servers can now call LLMs directly through the protocol",
|
|
2084
|
+
actionItems: "Integrate sampling API into MCP tool handlers",
|
|
2085
|
+
});
|
|
2086
|
+
// Query for "MCP SDK breaking" — should rank MCP findings above React
|
|
2087
|
+
const result = (await searchTool.handler({ query: "MCP SDK breaking" }));
|
|
2088
|
+
const findings = result.reconFindings;
|
|
2089
|
+
// At minimum, MCP-related findings should appear (FTS5 MATCH or LIKE fallback)
|
|
2090
|
+
expect(findings.length).toBeGreaterThan(0);
|
|
2091
|
+
// If FTS5 BM25 is working, the breaking_change finding should rank first
|
|
2092
|
+
// (it has the most term overlap with "MCP SDK breaking")
|
|
2093
|
+
if (findings.length >= 2) {
|
|
2094
|
+
const firstSummary = findings[0].summary.toLowerCase();
|
|
2095
|
+
expect(firstSummary).toContain("breaking");
|
|
2096
|
+
}
|
|
2097
|
+
});
|
|
2098
|
+
it("should rank gaps by BM25 relevance (specific match > loose mention)", async () => {
|
|
2099
|
+
// Setup: create a verification cycle with varied gaps
|
|
2100
|
+
const cycle = (await startCycle.handler({
|
|
2101
|
+
title: "BM25 gaps FTS5 test cycle",
|
|
2102
|
+
}));
|
|
2103
|
+
const cid = cycle.cycleId;
|
|
2104
|
+
await logGap.handler({
|
|
2105
|
+
cycleId: cid,
|
|
2106
|
+
severity: "HIGH",
|
|
2107
|
+
title: "SQLite WAL mode lock contention under parallel writes",
|
|
2108
|
+
description: "When multiple agents write to SQLite simultaneously, WAL mode lock contention causes timeout errors after 5 seconds",
|
|
2109
|
+
fixStrategy: "Implement write queue with retry backoff for SQLite parallel access",
|
|
2110
|
+
});
|
|
2111
|
+
await logGap.handler({
|
|
2112
|
+
cycleId: cid,
|
|
2113
|
+
severity: "MEDIUM",
|
|
2114
|
+
title: "API rate limiting not implemented",
|
|
2115
|
+
description: "External API calls have no rate limiting or retry logic",
|
|
2116
|
+
fixStrategy: "Add exponential backoff with jitter for API calls",
|
|
2117
|
+
});
|
|
2118
|
+
await logGap.handler({
|
|
2119
|
+
cycleId: cid,
|
|
2120
|
+
severity: "LOW",
|
|
2121
|
+
title: "Test coverage below 80% for SQLite module",
|
|
2122
|
+
description: "SQLite database module has only 60% test coverage, missing edge cases for concurrent access",
|
|
2123
|
+
fixStrategy: "Add integration tests for SQLite concurrent write scenarios",
|
|
2124
|
+
});
|
|
2125
|
+
// Query for "SQLite parallel" — should rank SQLite-specific gaps above API gap
|
|
2126
|
+
const result = (await searchTool.handler({ query: "SQLite parallel" }));
|
|
2127
|
+
const gaps = result.gaps;
|
|
2128
|
+
expect(gaps.length).toBeGreaterThan(0);
|
|
2129
|
+
// If FTS5 BM25 is working, the WAL lock contention gap (HIGH severity, most term overlap) ranks first
|
|
2130
|
+
if (gaps.length >= 2) {
|
|
2131
|
+
const firstTitle = gaps[0].title.toLowerCase();
|
|
2132
|
+
expect(firstTitle).toContain("sqlite");
|
|
2133
|
+
}
|
|
2134
|
+
});
|
|
2135
|
+
});
|
|
2136
|
+
// ── Gateway BM25 meta-tool A/B test ────────────────────────────────────
|
|
2137
|
+
// Tests BM25 scoring in the gateway metaTools findTools — verifies that
|
|
2138
|
+
// IDF-weighted scoring ranks specific tools higher than generic matches.
|
|
2139
|
+
describe("Gateway BM25: findTools IDF-weighted ranking", () => {
|
|
2140
|
+
// Simulate the gateway's BM25 scorer with inline implementation
|
|
2141
|
+
function tokenize(text) {
|
|
2142
|
+
return text.toLowerCase().match(/[a-z_]+/g) ?? [];
|
|
2143
|
+
}
|
|
2144
|
+
// Word-count baseline (old approach)
|
|
2145
|
+
function wordCountSearch(query, tools) {
|
|
2146
|
+
const words = query.toLowerCase().split(/\s+/).filter(Boolean);
|
|
2147
|
+
return tools
|
|
2148
|
+
.map((t) => {
|
|
2149
|
+
const text = `${t.name} ${t.description}`.toLowerCase();
|
|
2150
|
+
const hits = words.filter((w) => text.includes(w)).length;
|
|
2151
|
+
return { name: t.name, hits };
|
|
2152
|
+
})
|
|
2153
|
+
.filter((t) => t.hits > 0)
|
|
2154
|
+
.sort((a, b) => b.hits - a.hits)
|
|
2155
|
+
.map((t) => t.name);
|
|
2156
|
+
}
|
|
2157
|
+
// BM25 search (new approach)
|
|
2158
|
+
function bm25Search(query, tools) {
|
|
2159
|
+
const corpus = new Map();
|
|
2160
|
+
for (const t of tools) {
|
|
2161
|
+
corpus.set(t.name, tokenize(`${t.name} ${t.description}`));
|
|
2162
|
+
}
|
|
2163
|
+
let totalLen = 0;
|
|
2164
|
+
for (const tokens of corpus.values())
|
|
2165
|
+
totalLen += tokens.length;
|
|
2166
|
+
const avgDl = corpus.size > 0 ? totalLen / corpus.size : 1;
|
|
2167
|
+
const docFreq = new Map();
|
|
2168
|
+
for (const tokens of corpus.values()) {
|
|
2169
|
+
const unique = new Set(tokens);
|
|
2170
|
+
for (const t of unique)
|
|
2171
|
+
docFreq.set(t, (docFreq.get(t) ?? 0) + 1);
|
|
2172
|
+
}
|
|
2173
|
+
const N = corpus.size;
|
|
2174
|
+
const idf = new Map();
|
|
2175
|
+
for (const [term, df] of docFreq) {
|
|
2176
|
+
idf.set(term, Math.log((N - df + 0.5) / (df + 0.5) + 1));
|
|
2177
|
+
}
|
|
2178
|
+
const queryTokens = tokenize(query);
|
|
2179
|
+
const k1 = 1.2, b = 0.75;
|
|
2180
|
+
return tools
|
|
2181
|
+
.map((t) => {
|
|
2182
|
+
const docTokens = corpus.get(t.name) ?? [];
|
|
2183
|
+
const dl = docTokens.length;
|
|
2184
|
+
const tf = new Map();
|
|
2185
|
+
for (const tok of docTokens)
|
|
2186
|
+
tf.set(tok, (tf.get(tok) ?? 0) + 1);
|
|
2187
|
+
let score = 0;
|
|
2188
|
+
for (const qt of queryTokens) {
|
|
2189
|
+
const termTf = tf.get(qt) ?? 0;
|
|
2190
|
+
if (termTf === 0)
|
|
2191
|
+
continue;
|
|
2192
|
+
const termIdf = idf.get(qt) ?? 0;
|
|
2193
|
+
score += termIdf * (termTf * (k1 + 1)) / (termTf + k1 * (1 - b + b * (dl / avgDl)));
|
|
2194
|
+
}
|
|
2195
|
+
return { name: t.name, score };
|
|
2196
|
+
})
|
|
2197
|
+
.filter((t) => t.score > 0)
|
|
2198
|
+
.sort((a, b) => b.score - a.score)
|
|
2199
|
+
.map((t) => t.name);
|
|
2200
|
+
}
|
|
2201
|
+
// Use the real tool list from allTools
|
|
2202
|
+
const toolEntries = allTools.map((t) => ({ name: t.name, description: t.description }));
|
|
2203
|
+
// Queries where IDF matters — rare terms should beat common ones
|
|
2204
|
+
const IDF_QUERIES = [
|
|
2205
|
+
{
|
|
2206
|
+
query: "flicker detection android",
|
|
2207
|
+
mustRankHigher: "start_flicker_analysis",
|
|
2208
|
+
mustRankLower: "web_search",
|
|
2209
|
+
reason: "'flicker' is rare (high IDF), 'search' is common (low IDF)",
|
|
2210
|
+
},
|
|
2211
|
+
{
|
|
2212
|
+
query: "autonomous benchmark c compiler",
|
|
2213
|
+
mustRankHigher: "start_autonomy_benchmark",
|
|
2214
|
+
mustRankLower: "run_quality_gate",
|
|
2215
|
+
reason: "'autonomy' and 'benchmark' are specific (high IDF)",
|
|
2216
|
+
},
|
|
2217
|
+
{
|
|
2218
|
+
query: "toon encode token",
|
|
2219
|
+
mustRankHigher: "toon_encode",
|
|
2220
|
+
mustRankLower: "record_learning",
|
|
2221
|
+
reason: "'toon' is extremely rare (high IDF), should dominate scoring",
|
|
2222
|
+
},
|
|
2223
|
+
];
|
|
2224
|
+
it("BM25 should outperform word-count on IDF-sensitive queries", () => {
|
|
2225
|
+
let bm25Wins = 0;
|
|
2226
|
+
let wordCountWins = 0;
|
|
2227
|
+
for (const { query, mustRankHigher, mustRankLower } of IDF_QUERIES) {
|
|
2228
|
+
const bm25Results = bm25Search(query, toolEntries);
|
|
2229
|
+
const wordResults = wordCountSearch(query, toolEntries);
|
|
2230
|
+
const bm25IdxHigh = bm25Results.indexOf(mustRankHigher);
|
|
2231
|
+
const bm25IdxLow = bm25Results.indexOf(mustRankLower);
|
|
2232
|
+
const wordIdxHigh = wordResults.indexOf(mustRankHigher);
|
|
2233
|
+
const wordIdxLow = wordResults.indexOf(mustRankLower);
|
|
2234
|
+
// BM25 correctly ranks the specific tool higher
|
|
2235
|
+
if (bm25IdxHigh !== -1 && (bm25IdxLow === -1 || bm25IdxHigh < bm25IdxLow))
|
|
2236
|
+
bm25Wins++;
|
|
2237
|
+
if (wordIdxHigh !== -1 && (wordIdxLow === -1 || wordIdxHigh < wordIdxLow))
|
|
2238
|
+
wordCountWins++;
|
|
2239
|
+
}
|
|
2240
|
+
// BM25 should win at least as many IDF-sensitive queries as word-count
|
|
2241
|
+
expect(bm25Wins).toBeGreaterThanOrEqual(wordCountWins);
|
|
2242
|
+
// BM25 should get at least 2 of 3 IDF-sensitive queries correct
|
|
2243
|
+
expect(bm25Wins).toBeGreaterThanOrEqual(2);
|
|
2244
|
+
});
|
|
2245
|
+
it("BM25 should return results for all eval queries (no regressions)", () => {
|
|
2246
|
+
const queries = ["verify implementation", "search the web", "create document", "find stock prices", "security audit"];
|
|
2247
|
+
for (const q of queries) {
|
|
2248
|
+
const results = bm25Search(q, toolEntries);
|
|
2249
|
+
expect(results.length).toBeGreaterThan(0);
|
|
2250
|
+
}
|
|
2251
|
+
});
|
|
2252
|
+
});
|
|
1946
2253
|
// ── Contract Compliance Tool Tests ──────────────────────────────────────
|
|
1947
2254
|
describe("check_contract_compliance", () => {
|
|
1948
2255
|
it("should return N/A score when no tool call data exists", async () => {
|
|
@@ -2180,4 +2487,699 @@ describe("Workflow chains: ablation_eval and task_bank_setup", () => {
|
|
|
2180
2487
|
expect(WORKFLOW_CHAINS.task_bank_setup.steps.length).toBe(9);
|
|
2181
2488
|
});
|
|
2182
2489
|
});
|
|
2490
|
+
// ── Embedding search A/B: natural language queries where synonym map misses ──
|
|
2491
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
2492
|
+
// CRITTER TOOL — intentionality check
|
|
2493
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
2494
|
+
describe("Unit: critter_check", () => {
|
|
2495
|
+
const tool = critterTools.find((t) => t.name === "critter_check");
|
|
2496
|
+
it("scores a well-intentioned task as proceed", async () => {
|
|
2497
|
+
const result = await tool.handler({
|
|
2498
|
+
task: "Add embedding-based semantic search to discover_tools",
|
|
2499
|
+
why: "Natural language queries like 'keep track of what I learned' miss record_learning because lexical search can't bridge vocabulary gaps",
|
|
2500
|
+
who: "AI agents using the MCP server who think in natural language, not tool names",
|
|
2501
|
+
success_looks_like: "A/B eval shows 60% lexical → 85%+ hybrid hit rate with zero drops",
|
|
2502
|
+
});
|
|
2503
|
+
expect(result.score).toBeGreaterThanOrEqual(70);
|
|
2504
|
+
expect(result.verdict).toBe("proceed");
|
|
2505
|
+
});
|
|
2506
|
+
it("catches circular reasoning and vague audience", async () => {
|
|
2507
|
+
const result = await tool.handler({
|
|
2508
|
+
task: "Add user authentication and login system to the application",
|
|
2509
|
+
why: "Because we need user authentication and login system in the application",
|
|
2510
|
+
who: "users",
|
|
2511
|
+
});
|
|
2512
|
+
// Circular (-30) + vague audience (-20) = 50, well under 70
|
|
2513
|
+
expect(result.score).toBeLessThan(70);
|
|
2514
|
+
expect(result.feedback.some((f) => f.toLowerCase().includes("circular") || f.toLowerCase().includes("vague"))).toBe(true);
|
|
2515
|
+
});
|
|
2516
|
+
it("catches deference over understanding", async () => {
|
|
2517
|
+
const result = await tool.handler({
|
|
2518
|
+
task: "Refactor the database layer",
|
|
2519
|
+
why: "I was told to refactor it in the ticket",
|
|
2520
|
+
who: "Backend developers maintaining the codebase",
|
|
2521
|
+
});
|
|
2522
|
+
expect(result.feedback.some((f) => f.toLowerCase().includes("deference") || f.toLowerCase().includes("authority"))).toBe(true);
|
|
2523
|
+
});
|
|
2524
|
+
it("rewards specificity bonuses", async () => {
|
|
2525
|
+
const result = await tool.handler({
|
|
2526
|
+
task: "Migrate from REST to GraphQL",
|
|
2527
|
+
why: "Our mobile app makes 12 API calls per screen load because REST endpoints return fixed shapes — GraphQL lets us fetch exactly what each screen needs in one round trip",
|
|
2528
|
+
who: "Mobile team (3 iOS + 2 Android devs) who spend 40% of sprint time on API pagination workarounds",
|
|
2529
|
+
success_looks_like: "Screen load API calls drop from 12 to 1-2, mobile team velocity increases by at least 20%",
|
|
2530
|
+
simplest_version: "Start with the 3 highest-traffic screens, keep REST endpoints alive for backwards compat",
|
|
2531
|
+
});
|
|
2532
|
+
expect(result.score).toBeGreaterThanOrEqual(90);
|
|
2533
|
+
expect(result.verdict).toBe("proceed");
|
|
2534
|
+
});
|
|
2535
|
+
it("persists the check to SQLite", async () => {
|
|
2536
|
+
const result = await tool.handler({
|
|
2537
|
+
task: "Test persistence",
|
|
2538
|
+
why: "Verifying that critter checks are saved for accountability",
|
|
2539
|
+
who: "The test suite validating the critter tool",
|
|
2540
|
+
});
|
|
2541
|
+
expect(result.id).toBeDefined();
|
|
2542
|
+
expect(result.id).toMatch(/^crit_/);
|
|
2543
|
+
});
|
|
2544
|
+
});
|
|
2545
|
+
// These tests verify that when a neural embedding provider IS available,
|
|
2546
|
+
// natural language queries that lexical search struggles with get boosted.
|
|
2547
|
+
// When no provider is available, they gracefully skip.
|
|
2548
|
+
import { _setIndexForTesting, _resetForTesting as resetEmbedding } from "../tools/embeddingProvider.js";
|
|
2549
|
+
import { _resetCooccurrenceCache, _setCooccurrenceForTesting, _setWrrfParamsForTesting, _resetWrrfParamsForTesting } from "../tools/toolRegistry.js";
|
|
2550
|
+
describe("Embedding search: RRF integration with hybridSearch", () => {
|
|
2551
|
+
it("hybridSearch accepts embeddingQueryVec option without error", () => {
|
|
2552
|
+
// Even without an embedding index loaded, hybridSearch should not throw
|
|
2553
|
+
const results = hybridSearch("verify code", toolDescs, {
|
|
2554
|
+
mode: "hybrid",
|
|
2555
|
+
limit: 5,
|
|
2556
|
+
embeddingQueryVec: new Float32Array([0.5, 0.3, 0.1]),
|
|
2557
|
+
});
|
|
2558
|
+
// Should still return results from lexical strategies
|
|
2559
|
+
expect(results.length).toBeGreaterThan(0);
|
|
2560
|
+
});
|
|
2561
|
+
it("embedding mode without index has no embedding reasons", () => {
|
|
2562
|
+
resetEmbedding();
|
|
2563
|
+
const results = hybridSearch("keep track of lessons", toolDescs, {
|
|
2564
|
+
mode: "embedding",
|
|
2565
|
+
limit: 5,
|
|
2566
|
+
explain: true,
|
|
2567
|
+
});
|
|
2568
|
+
// Without an embedding index, no results should have embedding reasons
|
|
2569
|
+
for (const r of results) {
|
|
2570
|
+
expect(r.matchReasons.some((m) => m.startsWith("embedding:"))).toBe(false);
|
|
2571
|
+
}
|
|
2572
|
+
});
|
|
2573
|
+
it("embedding RRF adds score when index is loaded with mock vectors", () => {
|
|
2574
|
+
// Build a simple mock index: record_learning gets a vector close to the query
|
|
2575
|
+
const mockEntries = toolDescs.map((t) => ({
|
|
2576
|
+
name: t.name,
|
|
2577
|
+
// Give record_learning a "close" vector, everything else a distant one
|
|
2578
|
+
vector: t.name === "record_learning"
|
|
2579
|
+
? new Float32Array([0.9, 0.1, 0.0])
|
|
2580
|
+
: new Float32Array([0.1, 0.1, 0.9]),
|
|
2581
|
+
nodeType: "tool",
|
|
2582
|
+
}));
|
|
2583
|
+
_setIndexForTesting(mockEntries);
|
|
2584
|
+
const queryVec = new Float32Array([1.0, 0.0, 0.0]);
|
|
2585
|
+
const results = hybridSearch("remember what I learned", toolDescs, {
|
|
2586
|
+
mode: "hybrid",
|
|
2587
|
+
limit: 10,
|
|
2588
|
+
explain: true,
|
|
2589
|
+
embeddingQueryVec: queryVec,
|
|
2590
|
+
});
|
|
2591
|
+
// record_learning should appear and have an embedding:tool_rrf reason
|
|
2592
|
+
const recordLearning = results.find((r) => r.name === "record_learning");
|
|
2593
|
+
expect(recordLearning).toBeDefined();
|
|
2594
|
+
expect(recordLearning.matchReasons.some((r) => r.startsWith("embedding:tool_rrf"))).toBe(true);
|
|
2595
|
+
// Clean up
|
|
2596
|
+
resetEmbedding();
|
|
2597
|
+
});
|
|
2598
|
+
it("embedding-only mode with mock index ranks by RRF", () => {
|
|
2599
|
+
// Set up mock where start_verification_cycle is closest to query
|
|
2600
|
+
const mockEntries = toolDescs.map((t) => ({
|
|
2601
|
+
name: t.name,
|
|
2602
|
+
vector: t.name === "start_verification_cycle"
|
|
2603
|
+
? new Float32Array([0.95, 0.05, 0.0])
|
|
2604
|
+
: t.name === "run_quality_gate"
|
|
2605
|
+
? new Float32Array([0.7, 0.3, 0.0])
|
|
2606
|
+
: new Float32Array([0.05, 0.05, 0.9]),
|
|
2607
|
+
nodeType: "tool",
|
|
2608
|
+
}));
|
|
2609
|
+
_setIndexForTesting(mockEntries);
|
|
2610
|
+
const queryVec = new Float32Array([1.0, 0.0, 0.0]);
|
|
2611
|
+
const results = hybridSearch("ensure correctness", toolDescs, {
|
|
2612
|
+
mode: "embedding",
|
|
2613
|
+
limit: 5,
|
|
2614
|
+
explain: true,
|
|
2615
|
+
embeddingQueryVec: queryVec,
|
|
2616
|
+
});
|
|
2617
|
+
// In embedding-only mode, results should come from embedding RRF only
|
|
2618
|
+
expect(results.length).toBeGreaterThan(0);
|
|
2619
|
+
expect(results[0].name).toBe("start_verification_cycle");
|
|
2620
|
+
expect(results[0].matchReasons.some((r) => r.startsWith("embedding:tool_rrf"))).toBe(true);
|
|
2621
|
+
resetEmbedding();
|
|
2622
|
+
});
|
|
2623
|
+
});
|
|
2624
|
+
// ── Agent-as-a-Graph: structural property tests ──────────────────────────
|
|
2625
|
+
// These tests verify the STRUCTURAL properties of the bipartite graph search:
|
|
2626
|
+
// 1. Domain-only proximity lifts siblings (upward traversal)
|
|
2627
|
+
// 2. Type-specific wRRF weight asymmetry (α_D=1.5 > α_T=1.0, per paper + ablation)
|
|
2628
|
+
// 3. Strong lexical matches survive noisy embeddings (non-regression)
|
|
2629
|
+
// 4. Execution trace edges boost co-occurring tools
|
|
2630
|
+
//
|
|
2631
|
+
// Unlike tautological tests that mock the "right answer" as close, these tests
|
|
2632
|
+
// prove the ALGORITHM works by testing its structural invariants.
|
|
2633
|
+
describe("Agent-as-a-Graph: bipartite wRRF structural properties", () => {
  /**
   * Build a bipartite mock index where ONLY the given categories' domain
   * nodes sit close to the probe vector [1,0,0]; every tool node is distant.
   * Any lift a tool receives must therefore come from its parent domain
   * (upward traversal), never from a direct tool-level embedding match.
   * @param {Set<string>} closeDomains - category names whose domain nodes are near the query
   * @returns {Array<{name: string, vector: Float32Array, nodeType: string}>} mock index entries
   */
  function buildDomainOnlyIndex(closeDomains) {
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    // ALL tool nodes are distant from query — no direct tool match
    const toolEntries = toolDescs.map((t) => ({
      name: t.name,
      vector: new Float32Array([0.1, 0.1, 0.8]),
      nodeType: "tool",
    }));
    // Only specified domains are close to query
    const domainEntries = [...categories].map((cat) => ({
      name: `domain:${cat}`,
      vector: closeDomains.has(cat)
        ? new Float32Array([0.85, 0.15, 0.0])
        : new Float32Array([0.05, 0.05, 0.9]),
      nodeType: "domain",
    }));
    return [...toolEntries, ...domainEntries];
  }
  /**
   * Extract the numeric "+N" score component from an RRF matchReason string,
   * e.g. "embedding:domain_rrf(security):+25" -> 25. Returns 0 when absent.
   * Fix: parseInt is now always called with an explicit radix of 10.
   * @param {string} reason - a matchReason entry
   * @returns {number} the parsed score contribution
   */
  function rrfScoreFrom(reason) {
    return Number.parseInt(reason.match(/\+(\d+)/)?.[1] ?? "0", 10);
  }
  afterEach(() => {
    resetEmbedding();
    _resetCooccurrenceCache();
  });
  it("domain-only embedding proximity causes measurable rank lift for sibling tools", () => {
    // Prove CAUSATION, not just presence: compare ranks WITH vs WITHOUT domain proximity.
    // Use a query that gives moderate lexical scores to research_writing tools,
    // then show domain_rrf lifts them higher.
    const query = "polish text for submission";
    // Step 1: Baseline — lexical only (no embeddings)
    resetEmbedding();
    const baseline = hybridSearch(query, toolDescs, {
      mode: "hybrid",
      limit: 30,
      explain: true,
    });
    // Find a research_writing tool in baseline and record its rank
    const rwToolBaseline = baseline.findIndex((r) => r.category === "research_writing");
    // It should exist somewhere (polish/text/submission have some keyword overlap)
    expect(rwToolBaseline).toBeGreaterThanOrEqual(0);
    const rwToolName = baseline[rwToolBaseline].name;
    const rwBaselineScore = baseline[rwToolBaseline].score;
    // Step 2: With domain-only embeddings (research_writing domain close, NO tools close)
    const mockIndex = buildDomainOnlyIndex(new Set(["research_writing"]));
    _setIndexForTesting(mockIndex);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const enhanced = hybridSearch(query, toolDescs, {
      mode: "hybrid",
      limit: 30,
      explain: true,
      embeddingQueryVec: queryVec,
    });
    const rwToolEnhanced = enhanced.find((r) => r.name === rwToolName);
    expect(rwToolEnhanced).toBeDefined();
    // CAUSATION: score increased due to domain_rrf
    expect(rwToolEnhanced.score).toBeGreaterThan(rwBaselineScore);
    expect(rwToolEnhanced.matchReasons.some((r) => r.includes("domain_rrf"))).toBe(true);
    // No tool_rrf (all tools are equally distant)
    expect(rwToolEnhanced.matchReasons.some((r) => r.includes("tool_rrf"))).toBe(false);
    // Rank should improve (lower index = higher rank)
    const rwEnhancedIdx = enhanced.findIndex((r) => r.name === rwToolName);
    expect(rwEnhancedIdx).toBeLessThanOrEqual(rwToolBaseline);
  });
  it("multiple close domains each lift their own sibling tools independently", () => {
    // Setup: security AND vision domains close, but no tools close
    const mockIndex = buildDomainOnlyIndex(new Set(["security", "vision"]));
    _setIndexForTesting(mockIndex);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const results = hybridSearch("analyze security visual", toolDescs, {
      mode: "embedding",
      limit: 30,
      explain: true,
      embeddingQueryVec: queryVec,
    });
    const securityTools = results.filter((r) => r.category === "security" && r.matchReasons.some((mr) => mr.includes("domain_rrf(security")));
    const visionTools = results.filter((r) => r.category === "vision" && r.matchReasons.some((mr) => mr.includes("domain_rrf(vision")));
    // Both categories should have siblings lifted
    expect(securityTools.length).toBeGreaterThanOrEqual(1);
    expect(visionTools.length).toBeGreaterThanOrEqual(1);
  });
  it("type-specific wRRF: domain_rrf score exceeds tool_rrf (paper calibration α_D=1.5 > α_T=1.0)", () => {
    // After ablation (see "wRRF α ratio ablation" test), paper's domain emphasis wins.
    // At rank 1: α_D * 1000/(K+1) = 1.5 * 1000/61 ≈ 25, α_T * 1000/(K+1) = 1.0 * 1000/61 ≈ 16.
    // Domain emphasis means category-level matches contribute MORE than individual tool matches,
    // which helps surface all tools in a matching domain (upward traversal).
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const targetTool = "polish_academic_text";
    const toolEntries = toolDescs.map((t) => ({
      name: t.name,
      vector: t.name === targetTool
        ? new Float32Array([0.95, 0.05, 0.0])
        : new Float32Array([0.1, 0.1, 0.8]),
      nodeType: "tool",
    }));
    const domainEntries = [...categories].map((cat) => ({
      name: `domain:${cat}`,
      vector: cat === "research_writing"
        ? new Float32Array([0.90, 0.10, 0.0])
        : new Float32Array([0.05, 0.05, 0.9]),
      nodeType: "domain",
    }));
    _setIndexForTesting([...toolEntries, ...domainEntries]);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const results = hybridSearch("academic writing", toolDescs, {
      mode: "embedding",
      limit: 20,
      explain: true,
      embeddingQueryVec: queryVec,
    });
    const target = results.find((r) => r.name === targetTool);
    expect(target).toBeDefined();
    // Extract individual RRF scores from matchReasons
    const toolRrfReason = target.matchReasons.find((r) => r.startsWith("embedding:tool_rrf"));
    const domainRrfReason = target.matchReasons.find((r) => r.startsWith("embedding:domain_rrf"));
    expect(toolRrfReason).toBeDefined();
    expect(domainRrfReason).toBeDefined();
    const toolScore = rrfScoreFrom(toolRrfReason);
    const domainScore = rrfScoreFrom(domainRrfReason);
    // α_D=1.5 > α_T=1.0 → domain_rrf contributes more than tool_rrf at similar ranks
    expect(domainScore).toBeGreaterThan(toolScore);
  });
  it("strong lexical matches are not displaced by noisy embeddings", () => {
    // "start verification cycle" should easily find start_verification_cycle lexically.
    // Adding uniformly noisy embeddings should NOT knock it from #1.
    resetEmbedding();
    const lexicalResults = hybridSearch("start verification cycle", toolDescs, {
      mode: "hybrid",
      limit: 5,
    });
    expect(lexicalResults[0].name).toBe("start_verification_cycle");
    // Add noisy embeddings — all vectors point roughly the same direction
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const toolEntries = toolDescs.map((t, i) => ({
      name: t.name,
      vector: new Float32Array([0.2 + (i % 10) * 0.01, 0.3, 0.7]),
      nodeType: "tool",
    }));
    const domainEntries = [...categories].map((cat, i) => ({
      name: `domain:${cat}`,
      vector: new Float32Array([0.15 + i * 0.02, 0.25, 0.7]),
      nodeType: "domain",
    }));
    _setIndexForTesting([...toolEntries, ...domainEntries]);
    const queryVec = new Float32Array([1.0, 0.0, 0.0]);
    const graphResults = hybridSearch("start verification cycle", toolDescs, {
      mode: "hybrid",
      limit: 5,
      embeddingQueryVec: queryVec,
    });
    // Lexical dominance should preserve #1 position
    expect(graphResults[0].name).toBe("start_verification_cycle");
  });
});
|
|
2786
|
+
// ── Agent-as-a-Graph: execution trace edge tests ──────────────────────────
|
|
2787
|
+
// Validates that co-occurrence edges mined from tool_call_log boost results.
|
|
2788
|
+
// Uses _setCooccurrenceForTesting to inject deterministic edges.
|
|
2789
|
+
//
|
|
2790
|
+
// Key insight: trace edges only boost tools that ALREADY scored > 0 from
|
|
2791
|
+
// lexical matching. They lift borderline tools, not create results from nothing.
|
|
2792
|
+
// Tests use a data-driven approach: run baseline first, then inject edges
|
|
2793
|
+
// targeting actual result entries.
|
|
2794
|
+
describe("Agent-as-a-Graph: execution trace edges", () => {
  const TRACE_QUERY = "verify test quality";
  // Every test in this suite searches with the same query and options.
  const runTraceSearch = () =>
    hybridSearch(TRACE_QUERY, toolDescs, { mode: "hybrid", limit: 15, explain: true });
  afterEach(() => {
    resetEmbedding();
    _resetCooccurrenceCache();
  });
  it("co-occurrence edges boost a non-top-5 tool by exactly +4", () => {
    // First pass: natural ranking with no trace edges at all.
    _setCooccurrenceForTesting(new Map());
    const withoutEdges = runTraceSearch();
    expect(withoutEdges.length).toBeGreaterThanOrEqual(6);
    const leader = withoutEdges[0].name;
    const candidate = withoutEdges[5].name; // rank 6 — outside the top 5
    const candidateScore = withoutEdges[5].score;
    // Second pass: inject a trace edge from the leader to the rank-6 candidate.
    _resetCooccurrenceCache();
    _setCooccurrenceForTesting(new Map([[leader, [candidate]]]));
    const withEdges = runTraceSearch();
    const boostedEntry = withEdges.find((entry) => entry.name === candidate);
    expect(boostedEntry).toBeDefined();
    expect(boostedEntry.score).toBe(candidateScore + 4);
    expect(boostedEntry.matchReasons.some((reason) => reason === "trace_edge:+4")).toBe(true);
  });
  it("top-5 tools do NOT receive trace edge self-boost", () => {
    // Natural ranking first.
    _setCooccurrenceForTesting(new Map());
    const natural = runTraceSearch();
    const leader = natural[0].name;
    const leaderScore = natural[0].score;
    const runnerUp = natural[1].name;
    // Edge points FROM the runner-up TO the leader — but the leader is already top-5.
    _resetCooccurrenceCache();
    _setCooccurrenceForTesting(new Map([[runnerUp, [leader]]]));
    const rerun = runTraceSearch();
    const leaderEntry = rerun.find((entry) => entry.name === leader);
    expect(leaderEntry).toBeDefined();
    // Top-5 tools are excluded from the trace boost, so the score is unchanged.
    expect(leaderEntry.score).toBe(leaderScore);
    expect(leaderEntry.matchReasons.some((reason) => reason === "trace_edge:+4")).toBe(false);
  });
  it("empty co-occurrence map produces no trace_edge boosts", () => {
    _setCooccurrenceForTesting(new Map());
    const results = runTraceSearch();
    results.forEach((entry) => {
      expect(entry.matchReasons.some((reason) => reason.includes("trace_edge"))).toBe(false);
    });
  });
  it("trace edges from multiple top tools merge — both targets get +4", () => {
    // Natural ranking first.
    _setCooccurrenceForTesting(new Map());
    const natural = runTraceSearch();
    expect(natural.length).toBeGreaterThanOrEqual(8);
    const first = natural[0].name;
    const second = natural[1].name;
    const targetA = natural[6].name;
    const targetB = natural[7].name;
    const targetAScore = natural[6].score;
    const targetBScore = natural[7].score;
    // Two distinct top tools each point at a different lower-ranked target.
    _resetCooccurrenceCache();
    _setCooccurrenceForTesting(new Map([
      [first, [targetA]],
      [second, [targetB]],
    ]));
    const rerun = runTraceSearch();
    const entryA = rerun.find((entry) => entry.name === targetA);
    const entryB = rerun.find((entry) => entry.name === targetB);
    expect(entryA).toBeDefined();
    expect(entryB).toBeDefined();
    expect(entryA.score).toBe(targetAScore + 4);
    expect(entryB.score).toBe(targetBScore + 4);
    expect(entryA.matchReasons.some((reason) => reason === "trace_edge:+4")).toBe(true);
    expect(entryB.matchReasons.some((reason) => reason === "trace_edge:+4")).toBe(true);
  });
});
|
|
2901
|
+
// ── Industry-Standard IR Metrics: Recall@K, mAP@K, NDCG@K ──────────────
|
|
2902
|
+
// Every tool retrieval paper (ToolBench, AnyTool, Agent-as-a-Graph, TOOLRET)
|
|
2903
|
+
// reports these metrics. We evaluate hybrid search against 15 intent-based
|
|
2904
|
+
// queries with ground-truth relevant tool sets.
|
|
2905
|
+
//
|
|
2906
|
+
// Standards compared against:
|
|
2907
|
+
// - Agent-as-a-Graph (arxiv:2511.18194): Recall@5=0.85, NDCG@5=0.47
|
|
2908
|
+
// - TOOLRET (ACL 2025): best NDCG@10=33.83 (bi-encoder only)
|
|
2909
|
+
// - ToolBench: NDCG@5=84.9 (contrastive-trained Sentence-BERT)
|
|
2910
|
+
//
|
|
2911
|
+
// Our system is different (single MCP server, 163 tools, 14-strategy ensemble)
|
|
2912
|
+
// so absolute numbers aren't comparable, but we should track and not regress.
|
|
2913
|
+
describe("Industry-standard IR metrics: Recall@K, mAP@K, NDCG@K", () => {
  // Ground truth: query → set of relevant tools (any order).
  // Each query has 3-6 relevant tools, reflecting realistic intent breadth.
  const EVAL_QUERIES = [
    { query: "verify my implementation is correct", relevant: ["start_verification_cycle", "get_verification_status", "log_test_result", "run_quality_gate", "triple_verify"] },
    { query: "search past findings and lessons", relevant: ["search_all_knowledge", "record_learning", "load_session_notes"] },
    { query: "run security audit on codebase", relevant: ["scan_dependencies", "run_code_analysis", "scan_terminal_security", "assess_risk"] },
    { query: "write and polish academic paper", relevant: ["polish_academic_text", "check_paper_logic", "generate_academic_caption", "review_paper_as_reviewer"] },
    { query: "coordinate parallel agent tasks", relevant: ["claim_agent_task", "get_parallel_status", "assign_agent_role", "bootstrap_parallel_agents", "release_agent_task"] },
    { query: "check website performance and SEO", relevant: ["seo_audit_url", "check_page_performance", "analyze_seo_content"] },
    { query: "save and recall context between sessions", relevant: ["save_session_note", "load_session_notes", "refresh_task_context"] },
    { query: "review git compliance before merge", relevant: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
    { query: "benchmark model autonomy", relevant: ["start_autonomy_benchmark", "complete_autonomy_benchmark", "log_benchmark_milestone"] },
    { query: "capture screenshot of UI state", relevant: ["capture_screenshot", "capture_full_page", "compare_screenshots"] },
    { query: "encode data in compact token format", relevant: ["toon_encode", "toon_decode"] },
    { query: "mine patterns from past sessions", relevant: ["mine_session_patterns", "predict_risks_from_patterns"] },
    { query: "detect video flicker artifacts", relevant: ["analyze_video_flicker", "compare_video_segments", "get_flicker_report"] },
    { query: "design voice interaction pipeline", relevant: ["design_voice_pipeline", "analyze_voice_config", "generate_voice_scaffold", "benchmark_voice_latency"] },
    { query: "check if this task is worth doing", relevant: ["critter_check"] },
  ];
  /**
   * L2-normalize a vector in place. A zero vector is left untouched (the
   * divisor is guarded with `|| 1`) rather than becoming NaN.
   * Extracted from three duplicated inline loops in this suite.
   * @param {Float32Array} vec - vector to normalize (mutated)
   * @returns {Float32Array} the same vector, for use in expressions
   */
  function normalizeInPlace(vec) {
    let norm = 0;
    for (let i = 0; i < vec.length; i++) {
      norm += vec[i] * vec[i];
    }
    norm = Math.sqrt(norm) || 1;
    for (let i = 0; i < vec.length; i++) {
      vec[i] /= norm;
    }
    return vec;
  }
  /**
   * Recall@K: fraction of the relevant set present in the top-K ranked names.
   * @param {string[]} ranked - ranked tool names
   * @param {Set<string>} relevant - ground-truth relevant tool names
   * @param {number} k - cutoff rank
   * @returns {number} recall in [0, 1]
   */
  function recallAtK(ranked, relevant, k) {
    const topK = ranked.slice(0, k);
    const found = topK.filter((name) => relevant.has(name)).length;
    return found / relevant.size;
  }
  /**
   * Average Precision@K: mean of precision values at each relevant hit,
   * normalized by |relevant| (0 when the relevant set is empty).
   * @param {string[]} ranked - ranked tool names
   * @param {Set<string>} relevant - ground-truth relevant tool names
   * @param {number} k - cutoff rank
   * @returns {number} AP@K in [0, 1]
   */
  function averagePrecisionAtK(ranked, relevant, k) {
    let hits = 0;
    let sumPrecision = 0;
    for (let i = 0; i < Math.min(k, ranked.length); i++) {
      if (relevant.has(ranked[i])) {
        hits++;
        sumPrecision += hits / (i + 1);
      }
    }
    return relevant.size === 0 ? 0 : sumPrecision / relevant.size;
  }
  /**
   * NDCG@K with binary relevance (gain 1 if relevant, 0 otherwise),
   * normalized against the ideal DCG of min(k, |relevant|) hits.
   * @param {string[]} ranked - ranked tool names
   * @param {Set<string>} relevant - ground-truth relevant tool names
   * @param {number} k - cutoff rank
   * @returns {number} NDCG@K in [0, 1]
   */
  function ndcgAtK(ranked, relevant, k) {
    // Binary relevance: 1 if relevant, 0 otherwise
    let dcg = 0;
    for (let i = 0; i < Math.min(k, ranked.length); i++) {
      if (relevant.has(ranked[i]))
        dcg += 1 / Math.log2(i + 2);
    }
    let idcg = 0;
    const idealCount = Math.min(k, relevant.size);
    for (let i = 0; i < idealCount; i++) {
      idcg += 1 / Math.log2(i + 2);
    }
    return idcg === 0 ? 0 : dcg / idcg;
  }
  /**
   * Run every EVAL_QUERY through searchFn and average all five IR metrics.
   * @param {string} configLabel - label for this configuration (call-site readability only)
   * @param {(query: string) => string[]} searchFn - returns ranked tool names for a query
   * @returns {{recall1: number, recall3: number, recall5: number, map5: number, ndcg5: number}}
   */
  function evaluateConfig(configLabel, searchFn) {
    let totalRecall1 = 0, totalRecall3 = 0, totalRecall5 = 0, totalMap5 = 0, totalNdcg5 = 0;
    for (const { query, relevant } of EVAL_QUERIES) {
      const relevantSet = new Set(relevant);
      const ranked = searchFn(query);
      totalRecall1 += recallAtK(ranked, relevantSet, 1);
      totalRecall3 += recallAtK(ranked, relevantSet, 3);
      totalRecall5 += recallAtK(ranked, relevantSet, 5);
      totalMap5 += averagePrecisionAtK(ranked, relevantSet, 5);
      totalNdcg5 += ndcgAtK(ranked, relevantSet, 5);
    }
    const n = EVAL_QUERIES.length;
    return {
      recall1: totalRecall1 / n,
      recall3: totalRecall3 / n,
      recall5: totalRecall5 / n,
      map5: totalMap5 / n,
      ndcg5: totalNdcg5 / n,
    };
  }
  afterEach(() => {
    resetEmbedding();
    _resetCooccurrenceCache();
    _resetWrrfParamsForTesting();
  });
  it("hybrid search (lexical only) meets minimum IR thresholds", () => {
    // Baseline: no embeddings, pure lexical ensemble (keyword + fuzzy + n-gram + semantic + dense)
    resetEmbedding();
    const metrics = evaluateConfig("lexical-only", (query) => {
      const results = hybridSearch(query, toolDescs, { mode: "hybrid", limit: 10 });
      return results.map((r) => r.name);
    });
    // Minimum thresholds for our 14-strategy lexical ensemble
    // These are regression guards — if we drop below, something broke.
    expect(metrics.recall5).toBeGreaterThanOrEqual(0.55);
    expect(metrics.map5).toBeGreaterThanOrEqual(0.40);
    expect(metrics.ndcg5).toBeGreaterThanOrEqual(0.50);
  });
  it("hybrid + embedding search improves over lexical-only baseline", () => {
    // Build a realistic mock index: tools close to their own category
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const catList = [...categories];
    // Each category gets a unique direction in a high-dim space
    const mockIndex = toolDescs.map((t) => {
      const entry = TOOL_REGISTRY.get(t.name);
      const catIdx = catList.indexOf(entry?.category ?? "");
      // Tools in same category share a similar vector direction
      const vec = new Float32Array(catList.length + 1);
      if (catIdx >= 0)
        vec[catIdx] = 0.8;
      vec[catList.length] = 0.2; // small shared component
      return { name: t.name, vector: normalizeInPlace(vec), nodeType: "tool" };
    });
    const domainIndex = catList.map((cat, catIdx) => {
      const vec = new Float32Array(catList.length + 1);
      vec[catIdx] = 0.9;
      vec[catList.length] = 0.1;
      return { name: `domain:${cat}`, vector: normalizeInPlace(vec), nodeType: "domain" };
    });
    _setIndexForTesting([...mockIndex, ...domainIndex]);
    // Lexical baseline
    resetEmbedding();
    const lexicalMetrics = evaluateConfig("lexical", (query) => {
      const results = hybridSearch(query, toolDescs, { mode: "hybrid", limit: 10 });
      return results.map((r) => r.name);
    });
    // Hybrid + embedding
    _setIndexForTesting([...mockIndex, ...domainIndex]);
    const embeddingMetrics = evaluateConfig("hybrid+embedding", (query) => {
      // Simulate query embedding: average of relevant category vectors
      const queryWords = query.toLowerCase().split(/\s+/);
      const queryVec = new Float32Array(catList.length + 1);
      for (const cat of catList) {
        if (queryWords.some((w) => cat.includes(w) || w.includes(cat.slice(0, 4)))) {
          queryVec[catList.indexOf(cat)] = 0.7;
        }
      }
      queryVec[catList.length] = 0.3;
      normalizeInPlace(queryVec);
      const results = hybridSearch(query, toolDescs, {
        mode: "hybrid",
        limit: 10,
        embeddingQueryVec: queryVec,
      });
      return results.map((r) => r.name);
    });
    // Embedding should not degrade any metric (non-regression)
    expect(embeddingMetrics.ndcg5).toBeGreaterThanOrEqual(lexicalMetrics.ndcg5 - 0.02);
  });
});
|
|
3071
|
+
// ── wRRF α ratio ablation: paper vs our calibration ──────────────────────
|
|
3072
|
+
// Agent-as-a-Graph (arxiv:2511.18194) optimal: α_A=1.5, α_T=1.0, K=60
|
|
3073
|
+
// Our calibration: α_T=1.0, α_D=0.6, K=20
|
|
3074
|
+
//
|
|
3075
|
+
// The paper optimizes for agent SELECTION across 70 MCP servers.
|
|
3076
|
+
// We optimize for tool RETRIEVAL within a single server.
|
|
3077
|
+
// This ablation verifies our deviation is justified by measuring Recall@5.
|
|
3078
|
+
describe("wRRF α ratio ablation: paper vs NodeBench calibration", () => {
  const ABLATION_QUERIES = [
    { query: "verify my implementation", relevant: ["start_verification_cycle", "get_verification_status", "log_test_result"] },
    { query: "search past findings", relevant: ["search_all_knowledge", "record_learning", "load_session_notes"] },
    { query: "run security checks", relevant: ["scan_dependencies", "run_code_analysis", "scan_terminal_security"] },
    { query: "coordinate parallel work", relevant: ["claim_agent_task", "get_parallel_status", "assign_agent_role"] },
    { query: "capture UI screenshots", relevant: ["capture_screenshot", "capture_full_page", "compare_screenshots"] },
    { query: "review git compliance", relevant: ["check_git_compliance", "enforce_merge_gate", "review_pr_checklist"] },
    { query: "write academic paper", relevant: ["polish_academic_text", "check_paper_logic", "generate_academic_caption"] },
    { query: "check website performance", relevant: ["seo_audit_url", "check_page_performance", "analyze_seo_content"] },
  ];
  /**
   * L2-normalize a vector in place; a zero vector is left as-is (guarded
   * divisor) instead of producing NaN. Extracted from three duplicated
   * inline normalization loops in this suite.
   * @param {Float32Array} vec - vector to normalize (mutated)
   * @returns {Float32Array} the same vector, for use in expressions
   */
  function normalizeInPlace(vec) {
    let norm = 0;
    for (let i = 0; i < vec.length; i++) {
      norm += vec[i] * vec[i];
    }
    norm = Math.sqrt(norm) || 1;
    for (let i = 0; i < vec.length; i++) {
      vec[i] /= norm;
    }
    return vec;
  }
  /**
   * Build a mock bipartite index: each tool leans toward its own category
   * axis (plus a small name-hash perturbation so tools within a category do
   * not have identical vectors), and each domain node sits on its axis.
   * @returns {Array<{name: string, vector: Float32Array, nodeType: string}>}
   */
  function buildCategoryAwareIndex() {
    const categories = new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category));
    const catList = [...categories];
    const toolEntries = toolDescs.map((t) => {
      const entry = TOOL_REGISTRY.get(t.name);
      const catIdx = catList.indexOf(entry?.category ?? "");
      const vec = new Float32Array(catList.length);
      if (catIdx >= 0)
        vec[catIdx] = 0.85;
      // Add small noise per tool so not all tools in same cat have identical vectors
      const nameHash = t.name.split("").reduce((h, c) => ((h << 5) - h + c.charCodeAt(0)) | 0, 0);
      vec[Math.abs(nameHash) % catList.length] += 0.1;
      return { name: t.name, vector: normalizeInPlace(vec), nodeType: "tool" };
    });
    const domainEntries = catList.map((cat, catIdx) => {
      const vec = new Float32Array(catList.length);
      vec[catIdx] = 0.95;
      return { name: `domain:${cat}`, vector: normalizeInPlace(vec), nodeType: "domain" };
    });
    return [...toolEntries, ...domainEntries];
  }
  /**
   * Crude query-embedding stand-in: activate the axis of every category whose
   * name loosely matches a query word, then normalize.
   * @param {string} query - natural-language query
   * @param {string[]} catList - ordered category names (axis order)
   * @returns {Float32Array} unit-length query vector
   */
  function makeQueryVec(query, catList) {
    const words = query.toLowerCase().split(/\s+/);
    const vec = new Float32Array(catList.length);
    for (const cat of catList) {
      if (words.some((w) => cat.includes(w) || w.includes(cat.slice(0, 4)))) {
        vec[catList.indexOf(cat)] = 0.8;
      }
    }
    return normalizeInPlace(vec);
  }
  /**
   * Mean Recall@5 over ABLATION_QUERIES under the currently-set wRRF params.
   * @param {string} label - configuration label (unused; kept for call-site readability)
   * @returns {number} mean Recall@5 in [0, 1]
   */
  function runAblation(label) {
    const catList = [...new Set(ALL_REGISTRY_ENTRIES.map((e) => e.category))];
    let totalRecall = 0;
    for (const { query, relevant } of ABLATION_QUERIES) {
      const relevantSet = new Set(relevant);
      const queryVec = makeQueryVec(query, catList);
      const results = hybridSearch(query, toolDescs, {
        mode: "hybrid",
        limit: 10,
        embeddingQueryVec: queryVec,
      });
      const topK = results.slice(0, 5).map((r) => r.name);
      const found = topK.filter((n) => relevantSet.has(n)).length;
      totalRecall += found / relevantSet.size;
    }
    return totalRecall / ABLATION_QUERIES.length;
  }
  afterEach(() => {
    resetEmbedding();
    _resetWrrfParamsForTesting();
  });
  it("ablation grid: find optimal α_D and K for single-server tool retrieval", () => {
    const mockIndex = buildCategoryAwareIndex();
    const configs = [
      { label: "old(T=1.0,D=0.6,K=20)", alphaT: 1.0, alphaD: 0.6, k: 20 },
      { label: "paper(T=1.0,D=1.5,K=60)", alphaT: 1.0, alphaD: 1.5, k: 60 },
      { label: "paperK20(T=1.0,D=1.5,K=20)", alphaT: 1.0, alphaD: 1.5, k: 20 },
      { label: "balanced(T=1.0,D=1.0,K=20)", alphaT: 1.0, alphaD: 1.0, k: 20 },
      { label: "gentleDom(T=1.0,D=1.2,K=20)", alphaT: 1.0, alphaD: 1.2, k: 20 },
      { label: "strongDom(T=1.0,D=2.0,K=20)", alphaT: 1.0, alphaD: 2.0, k: 20 },
    ];
    const results = [];
    for (const cfg of configs) {
      _setIndexForTesting(mockIndex);
      _setWrrfParamsForTesting({ alphaT: cfg.alphaT, alphaD: cfg.alphaD, k: cfg.k });
      results.push({ label: cfg.label, recall: runAblation(cfg.label) });
    }
    // Sort by recall descending to find winner
    results.sort((a, b) => b.recall - a.recall);
    console.log(`wRRF ablation grid — Recall@5:\n${results.map((r) => ` ${r.label}: ${r.recall.toFixed(3)}`).join("\n")}`);
    // The winning config should be used as our production default.
    // Assert the winner beats the old default by at least not being worse.
    const oldResult = results.find((r) => r.label.startsWith("old"));
    const bestResult = results[0];
    expect(bestResult.recall).toBeGreaterThanOrEqual(oldResult.recall);
  });
});
|
|
2183
3185
|
//# sourceMappingURL=tools.test.js.map
|