explainmyrepo 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/kb/ask-kb.mjs ADDED
@@ -0,0 +1,1487 @@
1
+ #!/usr/bin/env node
2
+ // ask-kb.mjs — self-contained CLI to query a Cognitum RVF knowledge base and print
3
+ // the FULL top-k passages (not previews). Joins .rvf vector hits to the full-text
4
+ // passages sidecar (.passages.jsonl) by id.
5
+ //
6
+ // Usage:
7
+ // node kb/ask-kb.mjs <store-slug> "your question" [k] (store slugs come from kb.config.mjs)
8
+ //
9
+ // Deps: @ruvector/rvf + @xenova/transformers (resolved PORTABLY — see resolve-deps.mjs:
10
+ // project node_modules first, then RVF_MODULE_PATH/XENOVA_PATH env, then author Mac paths)
11
+ // + the bundled kb/*.rvf and kb/*.passages.jsonl files. So `cd kb && npm i` then run.
12
+ // Model cache is configurable via KB_MODEL_CACHE (offline if cached, else downloads MiniLM
13
+ // from HuggingFace — works on a fresh machine).
14
+
15
+ import fs from 'node:fs';
16
+ import path from 'node:path';
17
+ import readline from 'node:readline';
18
+ import { fileURLToPath } from 'node:url';
19
+ import { loadRvf, loadTransformers, configureModel } from './resolve-deps.mjs';
20
+ import { targets as CONFIG_TARGETS, defaultTarget as CONFIG_DEFAULT } from './kb.config.mjs';
21
+
22
// Module bootstrap. loadRvf() comes from ./resolve-deps.mjs and returns the loaded @ruvector/rvf
// module (`mod`) plus `via` — presumably a label for the resolution route that succeeded; it is only
// ever printed, and only when KB_DEBUG is set.
+ const { mod: rvfMod, via: rvfVia } = loadRvf();
22
+ const { RvfDatabase } = rvfMod;
23
+ if (process.env.KB_DEBUG) console.error(`[ask-kb] @ruvector/rvf via: ${rvfVia}`);
24
+
25
// ES modules have no built-in __filename, so it is derived from import.meta.url. KB_DIR is the
// directory containing this script; storeDir() resolves KB data files relative to it.
+ const __filename = fileURLToPath(import.meta.url); // decodes %20 etc.
26
+ const KB_DIR = path.dirname(__filename);
28
+
29
+ // ===================================================================================
30
+ // CONFIG BRIDGE — per-store config is read from kb.config.mjs (NO hard-coded repo names).
31
+ // Everything the intent/rerank layer needs that used to be a hard-coded ruvector/ruview map is
32
+ // now derived here from the target entry: metaName, productNames, componentRoots, componentWord,
33
+ // disambiguation, offtopicMagnets, primerSlugs (auto = discovered from the live sidecar at query
34
+ // time). knownStore() replaces the hard-coded `store !== 'ruvector' && store !== 'ruview'` checks.
35
+ // ===================================================================================
36
// The valid store slugs are exactly the own enumerable keys of the `targets` export of kb.config.mjs.
+ const KNOWN_STORES = new Set(Object.keys(CONFIG_TARGETS));
36
// True when `store` is one of the configured slugs.
+ function knownStore(store) { return KNOWN_STORES.has(store); }
37
// Config entry for `store`, or null when there is none (or the entry is falsy).
// NOTE(review): this is a plain property lookup, so a slug such as "constructor" would return an
// inherited Object.prototype member if `targets` is an ordinary object literal — confirm callers
// always gate on knownStore() first.
+ function cfgFor(store) { return CONFIG_TARGETS[store] || null; }
39
+
40
+ // componentWord synonym group (e.g. ['crate','package','module','component']) — injected into the
41
+ // intent regexes so an npm-package repo (packages/<name>) fires the same routes a crate repo did.
42
// Escape every RegExp metacharacter in `text` so it can be embedded verbatim in a pattern source.
// Shared by the component helpers below (the same expression used to be repeated inline).
function _escapeRegexLiteral(text) {
  return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// Normalize a config value that may be missing, a single string, or an array of strings into a
// non-empty array of non-empty strings; returns `fallback` when nothing usable was configured.
// Accepting a bare string matters: `componentWord: 'package'` has a truthy `.length` but no `.map`.
function _configList(value, fallback) {
  if (!value) return fallback;
  const items = (Array.isArray(value) ? value : [value])
    .filter((item) => item !== undefined && item !== null)
    .map((item) => String(item))
    .filter((item) => item.length > 0);
  return items.length > 0 ? items : fallback;
}

/**
 * Lower-cased synonym group for "component" in the given store.
 * @param {string} store - store slug (looked up through cfgFor).
 * @returns {string[]} the configured `componentWord` entries, or the default
 *   ['crate', 'package', 'module', 'component'] when none are configured.
 */
function componentWords(store) {
  const t = cfgFor(store);
  return _configList(t && t.componentWord, ['crate', 'package', 'module', 'component'])
    .map((word) => word.toLowerCase());
}

/**
 * Regex SOURCE for a non-capturing alternation of the store's component words,
 * e.g. `(?:crate|package|module|component)`. Each word is regex-escaped.
 * @param {string} store - store slug.
 * @returns {string}
 */
function componentWordAlt(store) {
  return `(?:${componentWords(store).map(_escapeRegexLiteral).join('|')})`;
}

/**
 * Directory segment(s) that hold components for the store (e.g. ['packages'] or ['crates']).
 * @param {string} store - store slug.
 * @returns {string[]} the configured `componentRoots`, or ['crates', 'packages'] by default.
 */
function componentRoots(store) {
  const t = cfgFor(store);
  return _configList(t && t.componentRoots, ['crates', 'packages']);
}

/**
 * Regex SOURCE matching `(?:vN/)?<root>/<NAME>` anchored at the path start or after a `/`.
 * @param {string} store - store slug; supplies the (escaped) root alternation.
 * @param {string} nameSrc - regex source for the component name; inserted AS-IS, so the caller
 *   must escape literal tokens itself (inComponentRe does).
 * @returns {string}
 */
function componentPrefixSrc(store, nameSrc) {
  const rootsAlt = componentRoots(store).map(_escapeRegexLiteral).join('|');
  return `(?:^|/)(?:v\\d+/)?(?:${rootsAlt})/${nameSrc}`;
}

/**
 * Case-insensitive RegExp testing whether a path lives inside a specific named component.
 * The token may be a prefix of the directory name: "ruvector-core" also matches
 * "crates/ruvector-core-wasm/..." because an optional `-suffix` is allowed before the `/`.
 * @param {string} store - store slug.
 * @param {string} token - literal component name (regex-escaped here).
 * @returns {RegExp}
 */
function inComponentRe(store, token) {
  const tok = _escapeRegexLiteral(token);
  return new RegExp(componentPrefixSrc(store, `${tok}(?:-[a-z0-9-]+)?/`), 'i');
}
69
+ // The product-name matcher for a store, built from config.productNames (escaped, word-ish bounded).
70
+ // Used by isProductOverviewQuery / conceptNouns. Falls back to the store slug.
71
/**
 * Build the case-insensitive product-name matcher for a store from config.productNames.
 *
 * - Names are trimmed, regex-escaped, and internal whitespace becomes `[\s-]?` so
 *   "Agent Harness", "agent-harness" and "agentharness" all match.
 * - Alternatives are ordered longest-first so a long name wins over one of its prefixes.
 * - Falls back to the store slug when no usable name is configured (missing, empty, or
 *   blank-only entries). Previously blank-only entries produced `\b(?:)\b`, which matches at
 *   every word boundary, i.e. practically every query.
 * - A single string is accepted in place of an array.
 * - Boundaries use `(?<!\w)` / `(?!\w)` instead of `\b`. For names that start and end with a word
 *   character this is identical; it additionally lets names such as "c++" match, which `\b` cannot.
 *
 * @param {string} store - store slug.
 * @returns {RegExp}
 */
function productNameRe(store) {
  const t = cfgFor(store);
  const configured = (t && t.productNames && t.productNames.length) ? t.productNames : [store];
  const clean = (list) => [].concat(list).map((n) => String(n).trim()).filter(Boolean);

  let names = clean(configured);
  if (names.length === 0) names = clean([store]);
  // Nothing at all to match on: return a pattern that can never match rather than one that always does.
  if (names.length === 0) return /(?!)/;

  const alt = names
    .map((n) => n.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\s+/g, '[\\s-]?'))
    .sort((a, b) => b.length - a.length) // longest-first so "create-agent-harness" beats a prefix
    .join('|');
  return new RegExp(`(?<!\\w)(?:${alt})(?!\\w)`, 'i');
}
82
+
83
+ // ---------- variant-aware store resolution ----------
84
+ // Two builds ship per repo (same passages, different embedder):
85
+ // small (384-dim MiniLM) — the Seed-compatible default; files: <store>-kb.rvf
86
+ // big (768-dim bge) — sharper, for Mac/PC; files: <store>-kb.big.rvf
87
+ // One tool serves both: the embedder for a query is read from the <rvf>.embed.json
88
+ // sidecar the build wrote next to each .rvf, so the query is always embedded with the
89
+ // SAME model the corpus was. Absent that sidecar we fall back to MiniLM (the small build).
90
// Default embedder config. resolveConf() spreads it UNDER whatever the <rvf>.embed.json sidecar
// provides, and embed() uses it when no cfg is passed. Fields: `model` (pipeline model id),
// `pooling` / `normalize` (forwarded to the feature-extraction call), `queryPrefix` (prepended to
// the query text before embedding; empty here).
+ const MINILM_CFG = { model: 'Xenova/all-MiniLM-L6-v2', pooling: 'mean', normalize: true, queryPrefix: '' };
91
+
92
+ // KB data files live in kb/stores/<store>/ when organized that way (clear, per-repo), and
93
+ // fall back to a flat kb/ layout otherwise (bundles unzip flat). One rule, both layouts.
94
/**
 * Directory that holds a store's KB data files.
 * @param {string} store - store slug.
 * @returns {string} `<KB_DIR>/stores/<store>` when that path exists, otherwise KB_DIR itself
 *   (the flat layout).
 */
function storeDir(store) {
  const perStoreDir = path.join(KB_DIR, 'stores', store);
  if (fs.existsSync(perStoreDir)) {
    return perStoreDir;
  }
  return KB_DIR;
}
98
+
99
/**
 * Compute the file set for a (store, variant) pair. Nothing is opened; only existence is probed.
 *
 * @param {string} store - store slug; files are named `<store>-kb.*` inside storeDir(store).
 * @param {string} variant - 'big' selects `<store>-kb.big.rvf`; any other value selects the small
 *   build: the un-suffixed `<store>-kb.rvf` when it exists, else the legacy `<store>-kb.small.rvf`.
 * @returns {{rvf: string, passages: string, meta: string, embedCfgPath: string}}
 *   `passages` and `meta` do not depend on `variant` (they are shared by both builds);
 *   `meta` prefers `*.ids.json`, uses the legacy `*.meta.json` only when that is the one present,
 *   and defaults to the `*.ids.json` name when neither exists.
 */
function variantPaths(store, variant) {
  const dir = storeDir(store);
  const stem = path.join(dir, `${store}-kb`);

  // Metadata sidecar: generic builder name first, legacy name second, generic name as the default.
  const idsName = `${store}-kb.ids.json`;
  const legacyName = `${store}-kb.meta.json`;
  let metaFile = idsName;
  if (!fs.existsSync(path.join(dir, idsName)) && fs.existsSync(path.join(dir, legacyName))) {
    metaFile = legacyName;
  }

  // Vector index: explicit .big.rvf, or canonical .rvf with the .small.rvf tag as fallback.
  let rvf;
  if (variant === 'big') {
    rvf = `${stem}.big.rvf`;
  } else {
    const canonical = `${stem}.rvf`;
    rvf = fs.existsSync(canonical) ? canonical : `${stem}.small.rvf`;
  }

  const passages = `${stem}.passages.jsonl`;
  const meta = path.join(dir, metaFile);
  const embedCfgPath = `${rvf}.embed.json`;
  return { rvf, passages, meta, embedCfgPath };
}
126
+
127
+ // Resolve the file set + embedder config for a (store, variant). variant defaults to
128
+ // 'big' when a big build is present (best answers), else 'small' — so a fresh checkout
129
+ // with only the Seed build still works, and a Mac bundle auto-uses the sharp one.
130
/**
 * Resolve the file set and embedder config for a (store, variant).
 *
 * @param {string} store - store slug; must satisfy knownStore().
 * @param {string} [variant] - 'big' or 'small'. Any other value auto-selects: 'big' when
 *   `<store>-kb.big.rvf` exists in the store directory, otherwise 'small'.
 * @returns {{rvf: string, passages: string, meta: string, embedCfgPath: string,
 *            embedCfg: object, variant: string}}
 * @throws {Error} when `store` is not a configured slug.
 *
 * Notes:
 * - variantPaths() returns the same `passages` and `meta` paths for both variants, so no
 *   big -> small sidecar fallback is needed here (the former fallback re-assigned identical paths).
 * - The <rvf>.embed.json sidecar is merged over MINILM_CFG only when it parses to a plain JSON
 *   object; anything else (bad JSON, array, string, number, null) keeps the MiniLM defaults and
 *   is reported when KB_DEBUG is set.
 */
function resolveConf(store, variant) {
  if (!knownStore(store)) throw new Error(`unknown store: ${store} (use ${[...KNOWN_STORES].join('|')})`);

  let chosen = variant;
  if (chosen !== 'big' && chosen !== 'small') {
    chosen = fs.existsSync(path.join(storeDir(store), `${store}-kb.big.rvf`)) ? 'big' : 'small';
  }
  const files = variantPaths(store, chosen);

  let embedCfg = { ...MINILM_CFG };
  if (fs.existsSync(files.embedCfgPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(files.embedCfgPath, 'utf8'));
      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        throw new TypeError('expected a JSON object');
      }
      embedCfg = { ...MINILM_CFG, ...parsed };
    } catch (e) {
      // Best-effort by design: a broken sidecar must not stop the query, it just means MiniLM defaults.
      if (process.env.KB_DEBUG) console.error(`[ask-kb] bad embed.json (${files.embedCfgPath}): ${e.message}`);
    }
  }
  return { ...files, embedCfg, variant: chosen };
}
151
+
152
+ // ---------- embedder (lazy, per-model, offline-first with remote fallback) ----------
153
+ // Cached per model name so a single process can serve both the small (MiniLM) and big
154
+ // (bge) builds without reloading. Remote download is allowed only when THAT model isn't
155
+ // already cached locally (so a Seed-only box never reaches for the network).
156
// model name -> Promise of the feature-extraction pipeline for that model.
// The PROMISE is cached (not the resolved pipeline) so that concurrent first calls for the same
// model share one load instead of each building its own pipeline.
const _feCache = new Map();

/**
 * Get (loading on first use) the feature-extraction pipeline for `model`.
 *
 * Remote download is enabled only when `<modelCache>/<model>` does not exist locally.
 * A failed load is evicted from the cache so a later call can retry.
 *
 * NOTE(review): `T.env.localModelPath` / `T.env.allowRemoteModels` are settings on the shared
 * transformers module object, so two DIFFERENT models loading at the same time could observe each
 * other's `allowRemoteModels` value — confirm callers load models one at a time.
 *
 * @param {string} model - model id passed to T.pipeline.
 * @returns {Promise<Function>} the pipeline returned by T.pipeline('feature-extraction', ...).
 */
async function getEmbedder(model) {
  const cached = _feCache.get(model);
  if (cached) return cached;

  const pending = (async () => {
    const { T, modelCache, via } = await loadTransformers();
    T.env.localModelPath = modelCache;
    T.env.allowRemoteModels = !fs.existsSync(path.join(modelCache, model));
    if (process.env.KB_DEBUG) {
      console.error(`[ask-kb] transformers via: ${via} | model ${model} | cache: ${modelCache} `
        + `(${T.env.allowRemoteModels ? 'remote download' : 'local'})`);
    }
    return T.pipeline('feature-extraction', model, { quantized: true });
  })();
  _feCache.set(model, pending);

  try {
    return await pending;
  } catch (err) {
    // Do not pin a rejected promise in the cache; only evict our own entry.
    if (_feCache.get(model) === pending) _feCache.delete(model);
    throw err;
  }
}
170
+
171
+ // Embed a QUERY with the build's embedder config. bge-style builds carry a queryPrefix
172
+ // (asymmetric retrieval — passages were embedded with NO prefix at build time, queries
173
+ // get the instruction prefix here); MiniLM uses no prefix and mean pooling.
174
/**
 * Embed a query string with a build's embedder config.
 *
 * @param {string} text - the query; `cfg.queryPrefix` (if any) is prepended before embedding.
 * @param {object} [cfg=MINILM_CFG] - embedder config: `model` (defaults to MINILM_CFG.model),
 *   `pooling` (defaults to 'mean'), `normalize` (on unless explicitly `false`), `queryPrefix`.
 * @returns {Promise<Float32Array>} a copy of the pipeline output's `data`.
 */
async function embed(text, cfg = MINILM_CFG) {
  const modelName = cfg.model || MINILM_CFG.model;
  const extractor = await getEmbedder(modelName);

  const prefix = cfg.queryPrefix || '';
  const options = {
    pooling: cfg.pooling || 'mean',
    normalize: cfg.normalize !== false,
  };
  const result = await extractor([prefix + text], options);
  return Float32Array.from(result.data);
}
182
+
183
+ // ---------- passages sidecar loader ----------
184
+ // Returns { byId, byPath } where:
185
+ // byId : Map id(str) -> { id(num), text, path, title }
186
+ // byPath : Map path -> [ {id,text,...}, ... ] sorted by numeric id (== chunk order)
187
+ // Numeric id order reconstructs document chunk order: the builder assigns ids sequentially
188
+ // while walking a document, so a path's chunks are id-ordered (verified on both KBs).
189
/**
 * Stream-load the passages sidecar (one JSON object per line).
 *
 * @param {string} file - path to the `.passages.jsonl` file.
 * @returns {Promise<{byId: Map<string, object>, byPath: Map<string, object[]>}>}
 *   byId   : String(id) -> { id (Number), text, path, title }
 *   byPath : path -> records for that path, sorted by numeric id ascending.
 *   Missing `text` becomes '', missing `path` / `title` become '(unknown path)' / '(unknown title)'.
 * Rejects when the file does not exist or cannot be read.
 *
 * Blank lines are ignored. Lines that are not valid JSON, or whose JSON value is not an object,
 * are skipped; the number skipped is reported when KB_DEBUG is set.
 */
function loadPassages(file) {
  return new Promise((resolve, reject) => {
    if (!fs.existsSync(file)) {
      reject(new Error(`passages sidecar not found: ${file}`));
      return;
    }

    const byId = new Map();
    const byPath = new Map();
    let skipped = 0;
    let failed = false;

    const input = fs.createReadStream(file, 'utf8');
    const rl = readline.createInterface({ input, crlfDelay: Infinity });

    // Read failures (EISDIR, EACCES, file removed after the existence check, ...) are emitted on
    // the STREAM. Listen there as well as on the interface, otherwise the promise may never settle.
    const fail = (err) => {
      if (failed) return;
      failed = true;
      rl.close();
      input.destroy();
      reject(err);
    };
    input.on('error', fail);
    rl.on('error', fail);

    rl.on('line', (line) => {
      if (!line.trim()) return;
      let o;
      try {
        o = JSON.parse(line);
      } catch {
        skipped += 1; // malformed line
        return;
      }
      if (o === null || typeof o !== 'object' || Array.isArray(o)) {
        skipped += 1; // valid JSON but not a record
        return;
      }
      const rec = {
        id: Number(o.id),
        text: o.text || '',
        path: o.path || '(unknown path)',
        title: o.title || '(unknown title)',
      };
      byId.set(String(o.id), rec);
      if (!byPath.has(rec.path)) byPath.set(rec.path, []);
      byPath.get(rec.path).push(rec);
    });

    rl.on('close', () => {
      if (failed) return; // close() triggered by fail(); the promise is already rejected
      for (const arr of byPath.values()) arr.sort((a, b) => a.id - b.id);
      if (skipped > 0 && process.env.KB_DEBUG) {
        console.error(`[ask-kb] skipped ${skipped} malformed line(s) in ${file}`);
      }
      resolve({ byId, byPath });
    });
  });
}
212
+
213
+ // ---------- kind metadata sidecar loader ----------
214
+ // The passages sidecar (.passages.jsonl) carries only {id,text,path,title}. The per-chunk
215
+ // `kind` (source / crate-src / adr / doc / doc-deep / primer-orientation / …) lives in the
216
+ // build metadata sidecar (.ids.json / .meta.json) keyed by the SAME numeric id. The intent
217
+ // layer (code-vs-doc routing, ADR-vs-code pairing, PRIMER detection) needs `kind`, so we load
218
+ // it once and fold it down to a per-PATH kind. If the sidecar is missing the layer degrades
219
+ // gracefully (kind unknown -> no kind-based adjustments; vector+rerank still works).
220
/**
 * Load the build metadata sidecar and fold its per-chunk `kind` down to one kind per path.
 *
 * The file is expected to be JSON with an `entries` object whose values carry `path` and `kind`;
 * values missing either field are ignored. For each path the most frequent kind wins; on a tie
 * the kind encountered first is kept.
 *
 * @param {string} file - path to the `.ids.json` / `.meta.json` sidecar (may be falsy).
 * @returns {Map<string, string>} path -> dominant kind. Empty when the file is missing,
 *   unreadable, or not valid JSON — deliberately best-effort, so callers degrade gracefully.
 *   The failure reason is reported on stderr when KB_DEBUG is set.
 */
function loadKinds(file) {
  const byPathKind = new Map(); // path -> representative kind (the doc's dominant content kind)
  try {
    if (!file || !fs.existsSync(file)) return byPathKind;
    const parsed = JSON.parse(fs.readFileSync(file, 'utf8'));
    const entries = (parsed && parsed.entries) || {};

    const counts = new Map(); // path -> Map(kind -> occurrences)
    for (const entry of Object.values(entries)) {
      if (!entry || !entry.path || !entry.kind) continue;
      if (!counts.has(entry.path)) counts.set(entry.path, new Map());
      const perKind = counts.get(entry.path);
      perKind.set(entry.kind, (perKind.get(entry.kind) || 0) + 1);
    }

    for (const [docPath, perKind] of counts) {
      let best = null;
      let bestN = -1;
      for (const [kind, n] of perKind) {
        if (n > bestN) { best = kind; bestN = n; } // strict '>' keeps the first kind on ties
      }
      byPathKind.set(docPath, best);
    }
  } catch (e) {
    // Sidecar unreadable -> return what we have (normally an empty map); never fail the query.
    if (process.env.KB_DEBUG) console.error(`[ask-kb] bad kind sidecar (${file}): ${e.message}`);
  }
  return byPathKind;
}
241
+
242
+ // A path is "source code" if its dominant kind is a code kind.
243
// Kinds treated as source code. Membership is an exact, case-sensitive string match.
+ const SOURCE_KINDS = new Set(['source', 'crate-src', 'example']);
244
// True when `kind` is one of SOURCE_KINDS; any other value (including undefined) yields false.
+ function isSourceKind(kind) { return SOURCE_KINDS.has(kind); }
245
+
246
+ // ===================================================================================
247
+ // SPECIFIC-ENTITY DETECTION (FIX 1 — orientation over-fire). A query that names a SPECIFIC
248
+ // entity (a crate, an ADR id, a filename/.rs token, or a Capitalized multiword proper noun)
249
+ // is NOT a generic product-orientation question, even if it begins "what does …". For such a
250
+ // query we suppress the generic PRIMER-orientation lift AND demote primer-orientation docs so a
251
+ // vector-closer deep doc (source / adr / crate-src / doc) wins. The crate-INVENTORY archetype
252
+ // ("which crates make up X") is handled separately (it carries no hyphen-crate token) and still
253
+ // routes to the inventory PRIMER.
254
+ // ===================================================================================
255
+
256
+ // Build the set of component-style path prefixes actually present in the KB (data-driven, so the
257
+ // detector never fires on a generic hyphenated word like "end-to-end" — only on real components).
258
+ // Prefixes are taken from the store's componentRoots segments (e.g. `packages/<name>` for AHG,
259
+ // `crates/<name>` / `v2/crates/<name>` for a Rust monorepo) — NO hard-coded root name.
260
/**
 * Collect the component names that actually occur in the KB's paths.
 *
 * @param {Map<string, *>} byPath - map keyed by document path (only the keys are read).
 * @param {string} store - store slug; selects the component root directories.
 * @returns {Set<string>} lower-cased names captured after `<root>/` (optionally under `vN/`),
 *   at most one per path (the first match in that path).
 */
function crateTokenSet(byPath, store) {
  const nameRe = new RegExp(componentPrefixSrc(store, '([a-z0-9][a-z0-9-]+)'), 'i');
  const names = new Set();
  for (const docPath of byPath.keys()) {
    const hit = docPath.match(nameRe);
    if (hit === null) continue;
    names.add(hit[1].toLowerCase());
  }
  return names;
}
269
+
270
+ // A capitalized multiword proper noun, e.g. "Coherent Human Channel" / "RuvSense Domain".
271
// A capitalized multiword proper noun, e.g. "Coherent Human Channel" / "RuvSense Domain".
// Global flag: only ever used through String.prototype.matchAll, which works on a clone, so the
// shared regex's lastIndex is never left in a dirty state.
const PROPER_NOUN_RE = /\b([A-Z][a-z0-9]+(?:\s+[A-Z][a-z0-9]+){1,})\b/g;
// A file token, e.g. "lib.rs", "main.rs", "versioning.rs", "config.toml".
const FILE_TOKEN_RE = /\b[a-z0-9_]+\.(rs|toml|md|ts|js|mjs|py|json|yaml|yml)\b/i;
// Common English / orientation words that may appear Title-Cased ("How Complete Is RuVector",
// "Getting Started Guide"). A proper-noun candidate built ONLY from these is NOT a named entity, so
// Title-Cased orientation queries still route correctly. Entries are compared against single
// lower-cased words, so the two-word entry 'rust core' can never match in specificEntity
// ('rust' and 'core' are listed individually); it is kept only to leave the set unchanged.
const COMMON_TITLE_WORDS = new Set(['how', 'what', 'when', 'where', 'why', 'which', 'who', 'is', 'are', 'the', 'a', 'an',
  'and', 'or', 'of', 'to', 'in', 'for', 'on', 'with', 'do', 'does', 'complete', 'mature', 'maturity', 'production',
  'ready', 'overview', 'introduction', 'getting', 'started', 'start', 'guide', 'setup', 'install', 'use', 'using',
  'core', 'capabilities', 'capability', 'feature', 'features', 'concept', 'concepts', 'docs', 'documentation',
  'tutorial', 'tutorials', 'example', 'examples', 'crate', 'crates', 'inventory', 'about', 'it', 'this', 'that',
  'ruvector', 'ruview', 'playbook', 'quickstart', 'end', 'reference',
  // Product feature names / host names that appear capitalized in orientation queries but do NOT
  // indicate a specific-entity deep-dive ("Darwin Mode", "Claude Code", "Rust Core", ...).
  'darwin', 'mode', 'claude', 'code', 'harness', 'metaharness', 'layer', 'layers', 'surface', 'rust', 'rust core',
  'host', 'hosts', 'adapter', 'adapters', 'kernel', 'wizard', 'composer', 'scaffold', 'stage', 'stages', 'genome',
  'router', 'factory', 'framework', 'model', 'wrapper', 'account', 'server', 'threat', 'scan', 'posture']);

/**
 * Decide whether a query names a SPECIFIC entity (as opposed to asking a generic orientation
 * question).
 *
 * A query is "named" when any of these holds:
 *  - it contains a hyphenated token that is a known component, a prefix of one at a hyphen
 *    boundary ("ruvector-snapshot" for component "ruvector-snapshot-store"), or an extension of
 *    one at a hyphen boundary ("ruvector-core-wasm" for component "ruvector-core");
 *  - it mentions an ADR id ("ADR-069", "adr 12");
 *  - it contains a file token ("lib.rs", "config.toml");
 *  - it contains a Title-Cased multiword phrase with at least one word outside COMMON_TITLE_WORDS.
 *
 * The hyphen boundary on the "extension" case is required: without it a short component name such
 * as "cli" would make the generic word "click-through" count as a component.
 *
 * @param {string} query - raw user query.
 * @param {Set<string>} crateTokens - lower-cased component names present in the KB.
 * @returns {{named: boolean, crates: string[]}} `crates` holds the matched hyphen tokens,
 *   lower-cased, in query order.
 */
function specificEntity(query, crateTokens) {
  const hyphenTokens = (query.match(/\b[a-z][a-z0-9]+-[a-z0-9][a-z0-9-]*\b/gi) || [])
    .map((t) => t.toLowerCase());

  // Materialize the set once instead of once per token.
  const known = [...crateTokens];
  const crates = hyphenTokens.filter((t) =>
    crateTokens.has(t) || known.some((c) => c.startsWith(`${t}-`) || t.startsWith(`${c}-`)));

  let hasProperNoun = false;
  for (const m of query.matchAll(PROPER_NOUN_RE)) {
    const words = m[1].split(/\s+/);
    if (words.some((w) => !COMMON_TITLE_WORDS.has(w.toLowerCase()))) { hasProperNoun = true; break; }
  }

  const hasAdr = /\badr[-\s_]?\d{1,4}\b/i.test(query);
  const hasFile = FILE_TOKEN_RE.test(query);
  const named = crates.length > 0 || hasAdr || hasFile || hasProperNoun;
  return { named, crates };
}
314
+
315
+ // Demotion penalty added to a primer-orientation doc when the query names a specific entity.
316
+ // Pushes primer-orientation BELOW source/adr/crate-src/doc for that query (FIX 1). Large enough to
317
+ // out-weigh the (now-suppressed) generic lift but applied as a positive penalty on eff distance.
318
// Expressed on the effective-distance scale (larger = worse rank). For comparison, the generic
// orientation lift in orientationBoost() is 0.55, so this penalty is slightly larger than that lift.
// NOTE(review): the code that applies this constant is not in this part of the file.
+ const PRIMER_DEMOTE_WHEN_SPECIFIC = 0.60;
319
+
320
+ // ===================================================================================
321
+ // Retrieval-quality layer (retrieval-only; KBs are NOT rebuilt).
322
+ // FIX 1 whole-document return, FIX 2 demote low-signal files,
323
+ // FIX 3 exact-term/ADR/title boost, FIX 4 "Cognitum Seed" disambiguation.
324
+ // ===================================================================================
325
+
326
// Unit is characters. NOTE(review): the truncation site is not in this part of the file.
+ const MAX_DOC_CHARS = 12000; // cap for an assembled full document
327
+ // Chunks fetched from the vector index to group into documents and rerank. Kept generous so the
328
+ // small TOP-DOWN ORIENTATION LAYER (a few dozen `PRIMER#` section chunks) reliably enters the
329
+ // candidate pool for orientation queries — a tiny synthesized section can sit just outside a
330
+ // narrow window yet be the correct whole-document answer once reranked (FIX 5). Reranking is
331
+ // order-stable for the closest deep docs, so widening does not disturb non-orientation results.
332
// This is a count of chunk-level hits, not of documents.
+ const RAW_HITS = 96;
333
+
334
+ // FIX 2 — low-signal path patterns and the query keyword that *re-enables* each.
335
+ // A penalty is added to a doc's effective distance UNLESS the query mentions the kind.
336
// Each rule: `re` is tested against a document PATH, `pen` is the amount added to that document's
// effective distance, and `allow` is tested against the QUERY — when it matches, the query is
// explicitly asking for that kind of file. All patterns are case-insensitive.
// NOTE(review): whether penalties from several matching rules accumulate is decided by the caller,
// which is not in this part of the file.
+ const LOW_SIGNAL = [
337
+ { re: /(^|\/)readme[^/]*$/i, pen: 0.18, allow: /\breadme\b/i },
338
+ { re: /-checklist\.md$/i, pen: 0.15, allow: /\bchecklist\b/i },
339
+ { re: /overview[^/]*\.md$/i, pen: 0.10, allow: /\boverview\b/i }, // TOC / link-list pages
340
+ { re: /(^|\/)(index|toc|table-of-contents)[^/]*\.md$/i, pen: 0.10, allow: /\b(index|toc|contents)\b/i },
341
+ { re: /(^|\/)archive\//i, pen: 0.20, allow: /\barchiv/i },
342
// Only Rust files under example(s)/ are penalized here; other extensions are untouched.
+ { re: /(^|\/)examples?\/.*\.rs$/i, pen: 0.18, allow: /\bexamples?\b/i },
343
+ { re: /(^|\/)benches?\//i, pen: 0.22, allow: /\b(bench|benchmark)/i },
344
+ { re: /(^|\/)tests?\//i, pen: 0.16, allow: /\btest/i },
345
+ { re: /(_test\.rs|\.test\.[jt]s|_spec\.rb)$/i, pen: 0.16, allow: /\btest/i },
346
+ ];
347
+
348
+ // FIX 4 — "Cognitum Seed" product disambiguation. When the query is about the Seed
349
+ // product/onboarding, bias toward onboarding/Seed docs and away from RNG/pretraining seeds.
350
// Matches queries about the Seed PRODUCT: "cognitum seed", "seed onboarding/pipeline/product",
// or "onboard… seed". Tested against the query text.
+ const SEED_QUERY_RE = /\b(cognitum\s+seed|seed\s+(onboard\w*|pipeline|product)|onboard\w*\s+seed)\b/i;
351
// Signals of a Seed-product document: ADR 069 / ADR 116, a path segment starting with "seed" or
// "cog-", or "onboard" anywhere. NOTE(review): presumably tested against document paths — confirm.
+ const SEED_GOOD_RE = /(adr[-_]?069|adr[-_]?116|(^|\/)seed|onboard|(^|\/)cog-)/i;
352
// Signals of the OTHER meaning of "seed" (random-number / pretraining seeds). These are unanchored
// substring matches, so e.g. "rng" also matches inside longer words.
+ const SEED_BAD_RE = /(rng|random|pretrain|nvsim|prng|np\.random|torch\.manual_seed)/i;
353
+
354
+ // FIX 5 — TOP-DOWN ORIENTATION LAYER. The primers are indexed as synthetic `PRIMER#<section>`
355
+ // documents (kind 'primer-orientation') that synthesize the answers to the six comprehension-
356
+ // journey archetypes a raw repo lacks: what-is-it / concepts / how-each-works / maturity /
357
+ // where-are-the-docs / how-to-use-end-to-end. When a query is one of those top-down orientation
358
+ // questions, bias toward the matching PRIMER section so the SYNTHESIZED answer wins over a deep
359
+ // ADR/source fragment. Deep ADR/source still wins for narrow how-X-works questions (no orient cue).
360
// A synthetic primer document: its path starts with the literal "PRIMER#".
const PRIMER_PATH_RE = /^PRIMER#/;

// Generic orientation cue: the query is asking to be oriented to the product as a whole.
// Every alternative that starts with a word is anchored with \b; the first one previously was not,
// which let "somewhat is ..." count as a "what is" question.
const ORIENT_QUERY_RE = new RegExp([
  '\\bwhat\\s+(is|are|does)\\b',               // "what is X" / "what does X do" / "what are the concepts"
  '\\bwhat\\s+can\\b',
  '\\bcore\\s+(capabilit|concept|feature)',    // "core capabilities/concepts"
  '\\bcapabilit(y|ies)\\b',
  '\\bhow\\s+(mature|complete)\\b',            // maturity archetype
  '\\b(production|experimental)\\b',
  '\\bwhat\\s+works\\b',
  '\\bwhere\\s+(is|are)\\b.*\\b(doc|documentation|adr)', // docs/ADR-location archetype
  '\\bdocumentation\\b.*\\badr',
  '\\badr\\s+index\\b',
  '\\b(install|set\\s*up|setup|use)\\b.*\\bend[\\s-]*to[\\s-]*end\\b', // end-to-end usage
  '\\bend[\\s-]*to[\\s-]*end\\b',
  '\\bget(ting)?\\s+started\\b',
  '\\boverview\\b',
].join('|'), 'i');

// Route cues: when `q` matches the QUERY and `sec` matches a PRIMER section's "path title" text,
// that section gets an extra boost of `w`. Used to pick between competing PRIMER sections
// (e.g. "where are the ADRs" -> the section titled "ADR index"). Matching cues accumulate.
const PRIMER_ROUTE_CUES = [
  { q: /\b(adr|decision\s+record)/i, sec: /\badr\b|decision/i, w: 0.20 },
  { q: /\b(doc|documentation)/i, sec: /\bdoc|where everything lives|tutorial/i, w: 0.10 },
  { q: /\b(mature|maturity|complete|production|experimental|works|graded|honest)/i, sec: /matur|gotcha|graded|honest|complete/i, w: 0.18 },
  { q: /\b(capabilit|concept|feature)/i, sec: /capabilit|concept|crate inventory|big/i, w: 0.16 },
  { q: /\b(install|set\s*up|setup|quickstart|get\s*started|use|end[\s-]*to[\s-]*end|playbook)/i, sec: /executive summary|install|quickstart|playbook|knowledge base|use it/i, w: 0.14 },
  { q: /\b(crate|inventory)/i, sec: /crate inventory|inventory/i, w: 0.16 },
];

/**
 * Amount to SUBTRACT from a PRIMER document's effective distance for this query.
 *
 * @param {string} query - raw user query.
 * @param {string} path - document path; anything that does not start with "PRIMER#" gets 0.
 * @param {string} [title=''] - document title; route cues are matched against `${path} ${title}`.
 * @param {boolean} [suppressGeneric=false] - when true the generic lift is dropped and only the
 *   route cues contribute.
 * @returns {number} non-negative boost = generic lift + sum of matching route-cue weights.
 *   The generic lift is 0.55 when the query matches ORIENT_QUERY_RE and 0.12 otherwise
 *   (so every PRIMER document receives at least 0.12 unless `suppressGeneric` is set).
 */
function orientationBoost(query, path, title = '', suppressGeneric = false) {
  if (!PRIMER_PATH_RE.test(path)) return 0;

  let generic = 0;
  if (!suppressGeneric) {
    generic = ORIENT_QUERY_RE.test(query) ? 0.55 : 0.12;
  }

  const hay = `${path} ${title}`;
  const route = PRIMER_ROUTE_CUES.reduce(
    (sum, cue) => (cue.q.test(query) && cue.sec.test(hay) ? sum + cue.w : sum),
    0,
  );
  return generic + route;
}
409
+
410
+ // ===================================================================================
411
+ // INTENT ROUTING LAYER (the second structural fix). MiniLM collapses every top-down
412
+ // "orientation" query onto the generic "what X is" PRIMER section, and implementation
413
+ // queries don't reliably surface code. This adds DETERMINISTIC intent classification on
414
+ // top of the vector+rerank pipeline: it (a) detects an orientation archetype and force-routes
415
+ // to the matching PRIMER#<slug> for that store, (b) hard-routes an exact "ADR-NNN" query to
416
+ // the real ADR document (beating the index table), (c) tilts ranking toward code or toward
417
+ // design docs by intent, and (d) guarantees an ADR proposal is paired with its built source.
418
+ // All of this is layered as effective-distance adjustments / hard rank overrides — no rebuild.
419
+ // ===================================================================================
420
+
421
+ // Per-store map of orientation archetype -> the EXACT PRIMER# slug that answers it. Slugs were
422
+ // discovered from the live sidecars (grep 'PRIMER#…' on the ids/meta files); only real slugs are
423
+ // listed. `adr` is the docs-location archetype's ADR-specific sub-target (the ADR index table).
424
+ // Legacy hard-coded PRIMER slug maps are kept ONLY as a fallback for the prototype stores so an old
425
+ // ruvector/ruview KB still routes; new targets supply slugs via config (an explicit per-archetype
426
+ // map) or use primerSlugs:'auto' (slugs discovered from the live sidecar at query time — see
427
+ // resolvePrimerSlug). NO new repo names are added here.
428
// Shape: store slug -> { archetype name -> full document path ("PRIMER#<slug>") or null }.
// resolvePrimerSlug() returns the stored value as-is, so a null entry resolves to "no primer"
// for that archetype (presumably because the store has no such section).
+ const LEGACY_PRIMER_SLUGS = {
429
+ ruvector: {
430
+ maturity: 'PRIMER#8-maturity-gotchas',
431
+ capabilities: 'PRIMER#2-the-big-capabilities-and-how-to-actually-call-them',
432
+ docs: 'PRIMER#5-docs-tutorials-examples-skills',
433
+ adr: 'PRIMER#4-adr-index-the-complete-table-208-main-series-files-in-docs-adr-54-in-4-sub-ser',
434
+ playbook: 'PRIMER#0-executive-summary-which-crate-do-i-need',
435
+ whatis: 'PRIMER#1-what-ruvector-is',
436
+ crates: 'PRIMER#3-complete-crate-inventory',
437
+ hardware: null,
438
+ glossary: null,
439
+ },
440
+ ruview: {
441
// maturity and capabilities intentionally share one section for this store.
+ maturity: 'PRIMER#7-capabilities-graded-honestly',
442
+ capabilities: 'PRIMER#7-capabilities-graded-honestly',
443
+ docs: 'PRIMER#9-docs-tutorials-scripts-firmware-where-everything-lives',
444
+ adr: 'PRIMER#8-the-complete-adr-index-160-adr-numbered-files-156-unique-numbers',
445
+ playbook: 'PRIMER#0-1-instant-playbooks-task-exact-steps',
446
+ whatis: 'PRIMER#1-what-ruview-is',
447
+ crates: 'PRIMER#3-the-crates-v2-workspace-39-incl-the-ruv-neural-git-submodule-and-homecore-plug',
448
+ hardware: 'PRIMER#10-hardware-matrix',
449
+ glossary: 'PRIMER#0-3-glossary-so-terms-are-never-guessed',
450
+ },
451
+ };
452
+
453
+ // Keyword cues per archetype used by 'auto' slug discovery (matched against a PRIMER#<slug> path).
454
+ // First section whose slug matches the archetype's cues wins; whatis prefers an early "what-is"/#1.
455
// Keyword cues per archetype for 'auto' slug discovery. Each regex is tested against a full
// "PRIMER#<slug>" path; a path qualifies for an archetype when ANY of its cues matches.
//
// whatis[1] is the "section number 1" cue. It is written as /#1(?:-|$)/ so that it matches
// "PRIMER#1-..." or exactly "PRIMER#1" only. The earlier form /#?1[-\b]/ did not express that:
// inside a character class `\b` is a backspace (not a word boundary) and the `#` was optional,
// so it matched any "1-" — including "PRIMER#11-..." and "PRIMER#0-1-...".
const AUTO_SLUG_CUES = {
  whatis: [/what[-\s]?is/i, /#1(?:-|$)/i, /overview/i, /introduction/i],
  capabilities: [/capabilit/i, /feature/i, /what.*can/i],
  crates: [/inventory/i, /package/i, /crate/i, /module/i, /component/i],
  maturity: [/matur/i, /gotcha/i, /graded/i, /honest/i, /production/i, /limit/i],
  docs: [/docs?/i, /where.*live/i, /tutorial/i, /reference/i],
  adr: [/\badr\b/i, /decision/i, /index/i],
  playbook: [/playbook/i, /quickstart/i, /get[-\s]?started/i, /executive/i, /usage/i, /how[-\s]?to/i],
  hardware: [/hardware/i, /board/i, /device/i],
  glossary: [/glossary/i, /terms/i],
};
466
+
467
+ // Resolve the PRIMER#<slug> for an archetype, store, and the live sidecar paths.
468
+ // 1. explicit config map (target.primerSlugs is an object): use it.
469
+ // 2. legacy hard-coded map (ruvector/ruview): use it.
470
+ // 3. 'auto' (or anything else): discover from byPath PRIMER#… keys via AUTO_SLUG_CUES.
471
/**
 * Resolve the PRIMER document path that answers an orientation archetype for a store.
 *
 * Resolution order:
 *  1. `primerSlugs` in the store's config, when it is an object: its entry for the archetype
 *     (null when absent).
 *  2. The legacy built-in map, when it has an own entry for the store (null when the archetype
 *     is absent or null there).
 *  3. Auto-discovery: the PRIMER paths present in `byPath` are visited in SECTION order and the
 *     first path matched by any of the archetype's cues is returned.
 *
 * Section order is numeric on the number following "PRIMER#", so "PRIMER#2-..." is visited before
 * "PRIMER#10-..." (a plain string sort would visit #10 first). Paths with the same section number,
 * or without one, fall back to plain string order; un-numbered paths come last.
 *
 * @param {string} archetype - archetype name (e.g. 'whatis', 'docs'); falsy yields null.
 * @param {string} store - store slug.
 * @param {Map<string, *>} [byPath] - map keyed by document path; only the keys are read.
 * @returns {string|null} the PRIMER path, or null when none applies.
 */
function resolvePrimerSlug(archetype, store, byPath) {
  if (!archetype) return null;

  const t = cfgFor(store);
  const ps = t && t.primerSlugs;
  if (ps && typeof ps === 'object') return ps[archetype] ?? null;

  // Own-property check so a slug like "constructor" cannot resolve through Object.prototype.
  if (Object.prototype.hasOwnProperty.call(LEGACY_PRIMER_SLUGS, store) && LEGACY_PRIMER_SLUGS[store]) {
    return LEGACY_PRIMER_SLUGS[store][archetype] ?? null;
  }

  // 'auto' discovery from the live sidecar.
  const cues = AUTO_SLUG_CUES[archetype];
  if (!cues || !byPath) return null;

  const sectionNumber = (p) => {
    const m = /^PRIMER#(\d+)/.exec(p);
    return m ? Number(m[1]) : Number.POSITIVE_INFINITY;
  };
  const bySection = (a, b) => {
    const na = sectionNumber(a);
    const nb = sectionNumber(b);
    if (na !== nb) return na < nb ? -1 : 1;
    if (a === b) return 0;
    return a < b ? -1 : 1;
  };

  const primerPaths = [...byPath.keys()].filter((p) => PRIMER_PATH_RE.test(p)).sort(bySection);
  for (const p of primerPaths) {
    if (cues.some((re) => re.test(p))) return p;
  }
  return null;
}
486
+
487
+ // The store's PRODUCT NAME matcher, built from config.productNames (see productNameRe above). A
488
+ // what-is/concept query naming ONLY the product (no other concrete concept noun) is a product
489
+ // overview -> force-route to the "what X is" primer; a query carrying a concrete concept noun is
490
+ // NOT a product overview -> let vector+rerank find the DEFINING doc.
491
// store slug -> compiled product-name RegExp (built at most once per store per process).
const _productReCache = new Map();

/**
 * Memoized product-name matcher for a store.
 * @param {string} store - store slug.
 * @returns {RegExp} the result of productNameRe(store), reused on later calls.
 */
function productRe(store) {
  if (_productReCache.has(store)) {
    return _productReCache.get(store);
  }
  const compiled = productNameRe(store);
  _productReCache.set(store, compiled);
  return compiled;
}
496
+
497
+ // Archetype detectors, ORDERED most-specific-first (first match wins). Each regex tests the raw
498
+ // query. Patterns mirror the spec's intent buckets. `adr` is folded into `docs` but additionally
499
+ // flips the docs target to the ADR-index slug when the query is specifically about ADRs.
500
// Ordered archetype table (first match wins). Written as [name, regex] pairs and expanded into the
// { name, re } records the classifier iterates over. Every regex is tested against the raw query.
const ARCHETYPE_RES = [
  // maturity / production-readiness / release status / what-is-NOT. The "not a" branch is kept
  // deliberately narrow (specific maturity negations only) so "why is it called a factory and NOT
  // a framework" stays a whatis query; the \bNOT[?!]?\s*$ tail catches "What is metaharness NOT?".
  ['maturity', /(\b(mature|maturity|production[- ]?ready|production\b|how (good|solid|reliable|complete)|how complete|is it (ready|done|complete)|works?\b.*\b(experiment|stub|today|yet)|ready for production|battle[- ]?tested|graded honestly|how many\b.*(stable|newer|addition|host)|which\b.*(stable|newer|addition)|release status|release pipeline|what release|not a (?:chatbot|no[- ]code|hosted service|fine[- ]?tune)|should i assert|fixed test count|honest limits|posture\b|default[- ]deny|guarantee privacy|privacy guarantee|real hardware|simulator or hardware|hardware or a? ?simulator|out of scope|beat a cnn|beats? a cnn|state[- ]of[- ]the[- ]art)\b|\bNOT[?!]?\s*$)/i],
  // capabilities / features / "what can it do" / named tools and sub-commands. Listed BEFORE
  // composer/crates so "What does Darwin Mode do" / "What does genome report" land here.
  ['capabilities', /\b(capabilit(y|ies)|what can (it|the tool) do|what can ruv\w+ do|what can photonlayer do|features?\b|what does it (do|offer)|what does ruv\w+ (do|offer)|big (capabilities|features)|does\b.*\bexecute\b|ever execute|how does\b.*\bcut\b|how (do i |to )?score\b|genome\b.*\breport|what does\b.*(genome|darwin|threat[- ]?model|mcp[- ]scan|router|mode) (do|report|produce)|what artifact\b|what can the\b|\bgenome\b|\bdarwin mode\b|mcp[- ]scan\b|threat[- ]model\b|npm audit\b|audit for agent\b|how much\b.*\bcompress|compress(es|ion)?\b.*\bcapture|what accuracy\b|gradient training\b.*\breach|reach\b.*\baccuracy|what does the (receipt|blake3 receipt)|receipt prove|reproduce the (exact|same)|multi[- ]?plane cascade\b.*(achieve|do))\b/i],
  // docs / tutorials / examples / ADRs / "where is X described or documented". Listed BEFORE
  // composer and crates so "where is the composer 9-stage flow documented" routes to docs.
  ['docs', /\b(where (are|is|can i find|do i find|do i read).*(doc|documentation|tutorial|example|adr|guide|find|live|architecture|exist|why|described|documented)|documentation\b|tutorials?\b|list of adrs?|adr index|where everything lives|where.*\b(docs?|guides?)\b|which doc\b|where is.*\b(documented|described)\b|where does.*live\b|read[- ]in[- ]order\b|adr (series|index)\b|plain[- ]language usage\b)\b/i],
  // composer / scaffold stages / overlays / HarnessChoice / template selection / pipeline steps.
  ['composer', /\b(how does\b.*\b(composer\b.*scaffold|scaffold\b.*\bharness)\b|how many stages\b|template overlays?\b|overlay\b.*merge|merge\b.*overlay|default agents?\b|default skills?\b|harnesschoice\b|which (composer )?stage\b|toggles? kernel\b|7[- ]arc\b|teaching outline\b|last\b.*\bstage\b|stage\b.*\bgeneration\b|primitives\b.*\btoggle\b|9 stages?\b|what object drives\b|drives template\b|(optical )?pipeline\b.*\bstep[- ]by[- ]step\b|step[- ]by[- ]step\b.*\bpipeline\b|how (does|do)\b.*\b(optical )?pipeline\b.*\bwork\b|how (does|do)\b.*\b(a |the )?(quantum )?circuit\b.*\b(run|works?|simulat)|how (do i |to )?run a (circuit|simulation)\b|run a simulation\b.*\b(in javascript|entirely in|client-?side)|build (and run )?a bell state|steps to (build|run)|how does\b.*\bpick which backend|how does\b.*\bbackend\b.*\b(work|chosen|selected|pick)|how does\b.*\bspeed up\b.*\bsimulat|stages?\b.*\bfrom\b.*\b(image|input)\b.*\bto\b.*\b(decision|answer|output)\b)\b/i],
  // hardware / boards / devices — enumeration of supported physical hardware.
  ['hardware', /\b(hardware|boards?|devices?|which (chip|board|sensor)|supported (hardware|board|device))\b/i],
  // component inventory (literal "crate" wording plus layers / subsystems / adapters / kernel).
  // The store-aware componentInventoryRe(store) variant is consulted before this table.
  ['crates', /\b(which crates|crate inventory|what crates|crates (that |which )?(make up|in|for|comprise)|list of crates|\w+ domain crates|what are the (layers?|three layers?|subsystems?|adapters?)\b|which (host |adapter )?(packages?|adapters?)\b|kernel (boundary|subsystems?)\b|what is (in |the )?the kernel\b|what is the kernel\b|subsystems? (bundled|in)\b|surface layer\b|user[- ]facing (surface|layer|packages?)\b|model router (package|component)\b)\b/i],
  // playbook / setup / onboarding / end-to-end usage / wizard / publish / release gate.
  ['playbook', /\b(how (do i |to |can i )?(use|set ?up|onboard|get started|getting started|start|deploy|build|run|publish|check|generate|try|install)|try\b.*\bwithout\b.*\binstall|without\b.*\binstall\w*\b.*\banything|no[- ]install\b|try it (out|now|in (a |the )?browser)|end[- ]to[- ]end|end to end|quick ?start|playbook|walkthrough|step[- ]by[- ]step|get up and running|wizard\b|what does the wizard\b|fastest path\b|one[- ]liner\b|after scaffolding\b|own the files\b|what command\b.*\brelease\b|release gate\b|scaffold is healthy\b|harness doctor\b|harness validate\b|publish my harness\b|what do my users run\b|users run\b)\b/i],
  // what-is / overview / introduce / is-X-a-Y / why-called / do-I-need (product identity).
  ['whatis', /\b(what is|what'?s |overview of|introduce|introduction to|tell me about|difference between|role of|what does\b.*\b(produce|turn|make into|forbid|mean)\b|is\b.*\b(model|wrapper|framework|factory|account|server)\b|why (is it|called|a factory)\b|do i need\b|in one line\b|called a factory\b|not another\b|why.*\bfactory\b|why.*\bframework\b)\b/i],
].map(([name, re]) => ({ name, re }));
539
+
540
+ // Strong playbook verbs — when present, the playbook force-route fires EVEN for a long query
541
+ // (the word-count cap is bypassed). These signal an end-to-end "do this from scratch" walkthrough.
542
// "step by step" accepts the hyphenated spelling too, matching the `step[- ]by[- ]step` form the
// playbook and composer archetype regexes already use.
const STRONG_PLAYBOOK_RE = /\b(set ?up|end[- ]to[- ]end|get started|getting started|from scratch|walk ?through|unbox|first time|step[- ]by[- ]step)\b/i;

// A query is "clearly orientation" only when it is short & conceptual: no concrete symbol, file
// path, ADR number, code-y token, or function/struct reference. This keeps the force-route from
// firing on a deep how-X-works-in-the-code question that happens to contain "how to".
const SPECIFIC_SIGNAL_RE = /(\badr[-\s_]?\d|[a-z_]+\.[a-z]{1,4}\b|\bfn\b|\bstruct\b|\bimpl\b|::|\/|\b[a-z_]+\(\)|\bcrate::|\bsrc\b)/i;

// Decide whether a query is an orientation question eligible for archetype force-routing.
// query: raw query string.
// Returns false for a query longer than 14 words unless it carries a strong playbook verb (those
// end-to-end requests run long but should still reach the playbook), and false for any query that
// contains a specific-signal token (path, symbol, ADR number, ...); true otherwise.
function isOrientationQuery(query) {
  const MAX_WORDS = 14;
  if (!STRONG_PLAYBOOK_RE.test(query)) {
    const wordCount = (query.trim().match(/\S+/g) || []).length;
    if (wordCount > MAX_WORDS) return false; // long queries are usually specific
  }
  return !SPECIFIC_SIGNAL_RE.test(query);
}
560
+
561
+ // Is a what-is/concept query about the PRODUCT ITSELF (force PRIMER#1) vs about a CONCRETE concept
562
+ // noun (let vector+rerank find the defining doc)? Product-only: names the store product (ruvector/
563
+ // ruview) and contains NO other concrete concept noun, OR is literally "what is this / it". A query
564
+ // carrying any concept noun OTHER than the product name (rvf, witness, hnsw, gnn, segment, presence,
565
+ // occupancy, quantization, …) is a concept query, not a product overview. (FIX 1)
566
// True when a what-is query is about the product as a whole rather than a concrete concept.
// query: raw query string; store: store slug (also treated as a non-concept term).
// The product name is removed first; if any meaningful term other than the store slug, "product"
// or "overview" is left, the query is a concept query and the result is false.
function isProductOverviewQuery(query, store) {
  const productPattern = productRe(store);
  const withoutProduct = productPattern ? query.replace(productPattern, ' ') : query;
  const leftovers = queryTerms(withoutProduct)
    .filter((term) => !(term === store || term === 'product' || term === 'overview'));
  const namesProduct = Boolean(productPattern) && productPattern.test(query);
  if (leftovers.length !== 0) return false; // a concept noun remains -> not an overview
  if (namesProduct) return true; // "what is <product>"
  return /\bwhat'?s?\s+(is\s+)?(this|it)\b/i.test(query); // "what is this"
}
575
+
576
+ // Store-aware component-inventory detector — the componentWord synonym group (crate|package|module|
577
+ // component, from config) is injected so "what packages make up X" / "which modules comprise Y" fire
578
+ // the SAME 'crates' archetype the prototype only fired for "which crates". Cached per store.
579
const _compInvCache = new Map();
// Build (once per store) the component-inventory detector with the store's component-word synonym
// group injected, e.g. "which packages", "what modules", "crates that make up".
// store: store slug. Returns a case-insensitive RegExp, cached per store.
function componentInventoryRe(store) {
  const cached = _compInvCache.get(store);
  if (cached) return cached;
  const w = componentWordAlt(store); // e.g. (?:crate|package|module|component)
  const wp = `${w}s?`; // allow plural
  // The verb group mirrors the static 'crates' archetype: (make up|in|for|comprise). The previous
  // pattern listed "comprise" twice; the duplicate branch could never match anything new.
  const source = [
    `which ${wp}`,
    `${w} inventory`,
    `what ${wp}`,
    `${wp} (that |which )?(make up|in|for|comprise)`,
    `list of ${wp}`,
    `\\w+ domain ${wp}`,
  ].join('|');
  const re = new RegExp(`\\b(${source})\\b`, 'i');
  _compInvCache.set(store, re);
  return re;
}
590
+
591
+ // Patterns in the whatis archetype that are ALWAYS a product-overview (force PRIMER#1), regardless
592
+ // of whether other concept nouns appear. These paraphrased forms ("what does X produce", "is it a
593
+ // wrapper", "why called a factory", "do I need an account", "in one line") are orientation queries
594
+ // about the product as a whole, not about a sub-concept. They bypass the isProductOverviewQuery
595
+ // concept-noun strip so they force-route to PRIMER#1 rather than falling to whatis-concept.
596
const WHATIS_FORCE_RE = /\b(what does\b.*\b(produce|turn into|make into|forbid|turn light into|output)\b|is\b.*\b(model|wrapper|framework|factory|account|server)\b|why (is it |a )?called\b|do i need\b|in one line\b|not another\b|called a factory\b|why a factory\b|published cli\b|cli name\b.*versus|versus\b.*cli name|what.*\bversus\b.*\balias\b|learned phase mask\b|what is a phase mask\b|optical neural network\b|why.*\bfront end\b|described as a front end\b)\b/i;
// Enumeration guard: "What are VQE, Grover, and QAOA in one line each?" lists named features, so
// it is a capabilities-style question and must not be force-routed as a product overview just
// because it contains "in one line". The "and" / "each" enumeration is the tell. (Generic; no
// repo names.)
const WHATIS_ENUMERATION_RE = /\b(what (are|is)\b[^?]*\b(and|,)\b[^?]*\b(in one line|each)\b|each\b.*\bin one line|in one line each)\b/i;

// Classify the orientation archetype of a query (most-specific-first).
// query: raw query string; store: store slug (drives the component-word synonyms and the
// product-overview split).
// Returns an archetype name, 'adr' (docs query about ADRs), 'whatis-concept' (what-is query that
// is not a product overview), or null when the query is not an orientation query / nothing matches.
function classifyArchetype(query, store) {
  if (!isOrientationQuery(query)) return null;
  // Store-aware component inventory is checked before the static table.
  if (componentInventoryRe(store).test(query)) return 'crates';

  const hit = ARCHETYPE_RES.find((entry) => entry.re.test(query));
  if (!hit) return null;

  switch (hit.name) {
    case 'docs':
      // A docs query that is specifically about ADRs targets the ADR index instead.
      return /\b(adr|decision record)\b/i.test(query) ? 'adr' : 'docs';
    case 'whatis': {
      // Enumerations are never a product overview.
      if (WHATIS_ENUMERATION_RE.test(query)) return 'whatis-concept';
      // Forced phrasings bypass the concept-noun strip; otherwise ask the product-overview test.
      const isOverview = WHATIS_FORCE_RE.test(query) || isProductOverviewQuery(query, store);
      return isOverview ? 'whatis' : 'whatis-concept';
    }
    default:
      return hit.name;
  }
}
630
+
631
+ // Code-vs-doc intent. Returns 'code' | 'design' | null.
632
const CODE_INTENT_RE = /\b(in (the )?code|in source|implementation|how is .*(computed|implemented|calculated|done)|\bfunction\b|\bstruct\b|signature|source code|actual code|which file|in the source|the (rust|code))\b/i;
const DESIGN_INTENT_RE = /\b(why\b|rationale|design decision|design choice|proposed\b|proposal\b|trade[- ]?off|tradeoff|motivation|reasoning behind|the decision to)\b/i;
// Detect whether a query wants code or design rationale.
// query: raw query string. Returns 'code', 'design', or null when neither pattern matches.
// Probes run in priority order, so code intent wins when both match (an explicit "in the code"
// beats a stray "why").
function codeDocIntent(query) {
  const probes = [
    ['code', CODE_INTENT_RE],
    ['design', DESIGN_INTENT_RE],
  ];
  const match = probes.find(([, pattern]) => pattern.test(query));
  return match ? match[0] : null;
}
640
+
641
+ // ===================================================================================
642
+ // FIX A — "how-works-in-code" / implementation intent (RETRIEVAL POLISH). A query that asks
643
+ // how something is IMPLEMENTED/coded ("how is X implemented", "how does X work in code",
644
+ // "implementation of X", "where is X coded") wants the REAL algorithm source in the crate's
645
+ // own src/ — NOT a vendored/copied dependency, NOT the CLI entrypoint, NOT the manifest. So we:
646
+ // • DEMOTE wrong-file types: vendored/patched dep copies (patches/** + any hnsw_rs-style
647
+ // copied-dep tree), bare entrypoints (**/main.rs, **/bin/**), and Cargo.toml.
648
+ // • PROMOTE the crate's own src/**/*.rs (excluding main.rs) — with an EXTRA promotion for a
649
+ // file whose name token-matches the named operation (e.g. "insert" -> *insert*.rs,
650
+ // "count/counting" -> *count*.rs), so the operation's implementation module wins.
651
+ // Scoped to the named crate(s) from the query (entity.crates) where possible; the vendored-dep
652
+ // demotion is global (a copied dep is never the answer to "how is X implemented HERE").
653
+ // ===================================================================================
654
// Alternatives of the implementation-intent detector, joined with '|' below:
//   1. "X implemented" / "implementation of X"
//   2. "how is/does X work ... in (the) code/source"
//   3. "how X is/works coded"
//   4. "where is X coded/implemented"
const IMPL_INTENT_RE = new RegExp(
  [
    '\\bimplement(ed|ation)?\\b',
    '\\bhow\\s+(is|does)\\b.*\\b(work|works|coded|done)\\b.*\\bin\\s+(the\\s+)?(code|source)\\b',
    '\\bhow\\s+\\w+\\s+(is|works?)\\s+coded\\b',
    '\\bwhere\\s+is\\s+\\w+\\s+(coded|implemented)\\b',
  ].join('|'),
  'i',
);
// query: raw query string. Returns true when the query asks how something is implemented/coded.
function isImplIntent(query) {
  return IMPL_INTENT_RE.test(query);
}
661
+
662
+ // Vendored / copied-dependency trees that are NEVER the answer to "how is X implemented here".
663
+ // patches/** is the explicit vendored-patch tree; the hnsw_rs token catches the copied upstream
664
+ // HNSW crate wherever it lands (e.g. scripts/patches/hnsw_rs/**). Kept conservative.
665
const VENDORED_DEP_RE = /(^|\/)(patches)\/|(^|\/)hnsw_rs\//i;

// Generic implementation-question words that never name the operation being asked about.
const IMPL_STOP = new Set(['how','does','work','works','implement','implemented','implementation',
  'code','coded','source','where','the','rust','module','crate','function','method','logic']);

// The operation noun(s) an implementation query is about: the query's meaningful terms MINUS the
// named crate token(s) MINUS generic impl words. Used to give an extra promotion to a source file
// whose name token-matches the operation (e.g. "insert" -> insert.rs / *insert*.rs).
// query: raw query string; crates: named crate tokens (may be null/empty).
// Returns an array of lower-case terms as produced by queryTerms.
function implOperationNouns(query, crates) {
  // Query terms are lower-cased, so compare against lower-cased crate names for BOTH the exact
  // and the substring check; previously the substring check used the raw crate name, so a
  // mixed-case name let its own tokens through as "operation" nouns.
  const crateNames = (crates || []).map((c) => String(c).toLowerCase());
  return queryTerms(query).filter((term) => {
    if (IMPL_STOP.has(term)) return false;
    // includes() also covers the exact-name case.
    return !crateNames.some((name) => name.includes(term));
  });
}
678
+
679
+ // Implementation-intent path adjustment (negative = promote, positive = demote). `crateTok` is the
680
+ // named crate the query is about (or null for unscoped). `opNouns` are the operation tokens.
681
// Signed ranking adjustment for an implementation-intent query (negative promotes, positive
// demotes).
// path: candidate document path; crateTok: named crate the query is about, or null when unscoped;
// opNouns: operation tokens from the query; store: store slug (passed to inComponentRe).
function implAdjust(path, crateTok, opNouns, store) {
  const fileName = (path.split('/').pop() || '').toLowerCase();
  const isMainEntry = /(^|\/)main\.rs$/i.test(path);
  const isUnderBin = /(^|\/)bin\//i.test(path);
  const isManifest = /(^|\/)cargo\.toml$/i.test(path);

  // Global demotions: vendored/copied dependency trees, bare entrypoints, the manifest.
  let delta = VENDORED_DEP_RE.test(path) ? 0.55 : 0;
  if (isMainEntry || isUnderBin) delta += 0.30;
  if (isManifest) delta += 0.30;

  // Promotions apply only inside the named crate.
  if (!crateTok || !inComponentRe(store, crateTok).test(path)) return delta;

  // The crate's own src/**/*.rs, excluding main.rs.
  const isOwnSource = /(?:^|\/)src\/.+\.rs$/i.test(path) && !isMainEntry;
  if (!isOwnSource) return delta;

  delta -= 0.40; // promote the crate's own algorithm source
  // Extra promotion when the file name contains a named operation token (3+ chars).
  const namesOperation = Boolean(opNouns)
    && opNouns.some((tok) => tok.length >= 3 && fileName.includes(tok));
  if (namesOperation) delta -= 0.30;
  return delta;
}
708
+
709
+ // FIX D — NAMED-CRATE source boost (no rebuild). A query that explicitly names a real crate but
710
+ // carries NO implementation verb ("HNSW index in ruvector-core", "ruvector-mmwave radar parser")
711
+ // got no crate scoping before — so a sibling crate / example / bridge could outrank the named
712
+ // crate's own source. When the query names crate(s) AND it is not an impl query (implAdjust already
713
+ // covers those), GENTLY promote the named crate's own src/**/*.rs (a little extra when the filename
714
+ // token-matches an operation noun). Gentle enough that a clearly-better vector match still wins.
715
// Gentle promotion for documents inside a crate the query names explicitly (negative promotes).
// path: candidate document path; crateToks: crate tokens named in the query; opNouns: operation
// tokens from the query; store: store slug (passed to inComponentRe).
// Returns 0 when no named crate contains the path, -0.10 for a non-source file in a named crate,
// -0.28 for its own src/**/*.rs (main.rs excluded), with a further -0.22 when the file name
// contains an operation token of 3+ characters.
function namedCrateAdjust(path, crateToks, opNouns, store) {
  if (!crateToks || !crateToks.length) return 0;

  const inNamedCrate = crateToks.some((tok) => inComponentRe(store, tok).test(path));
  if (!inNamedCrate) return 0;

  const isOwnSource = /(?:^|\/)src\/.+\.rs$/i.test(path) && !/(?:^|\/)main\.rs$/i.test(path);
  if (!isOwnSource) return -0.10; // mild lift for anything else in the named crate

  const fileName = (path.split('/').pop() || '').toLowerCase();
  const namesOperation = Boolean(opNouns)
    && opNouns.some((tok) => tok.length >= 3 && fileName.includes(tok));
  return namesOperation ? -0.28 - 0.22 : -0.28;
}
731
+
732
+ // Exact ADR-by-number, e.g. "ADR-027" / "adr 27" -> zero-padded "027". Returns [nums] or [].
733
// Collect every ADR number mentioned in a query ("ADR-027", "adr 27", "adr_5").
// query: raw query string. Returns the digit strings left-padded with zeros to at least three
// characters, in order of appearance; an empty array when none are mentioned.
function adrNumbers(query) {
  const found = [];
  for (const hit of query.matchAll(/\badr[-\s_]?(\d{1,4})\b/gi)) {
    found.push(hit[1].padStart(3, '0'));
  }
  return found;
}
737
+ // Does a path point at the REAL ADR doc for this number (not the index table / a passing mention)?
738
// Does a path point at the ADR document with this number?
// p: candidate path; num: ADR number as a digit string (callers pass it zero-padded, e.g. "027").
// The number is reduced to its significant digits before matching so the `0*` prefix really is
// padding-agnostic: "027" now matches ADR-27, ADR-027 and ADR-0027 alike (previously an unpadded
// or shorter-padded file name could never match a padded number), while ADR-270 still does not.
function pathIsAdrDoc(p, num) {
  const digits = String(num).replace(/^0+(?=\d)/, '');
  const atSegmentStart = new RegExp(`(^|/)adr[-_]?0*${digits}\\b`, 'i');
  // `_` is a word character, so "adr-027_title" needs the explicit separator form.
  const beforeSeparator = new RegExp(`adr[-_]?0*${digits}[-_]`, 'i');
  return atSegmentStart.test(p) || beforeSeparator.test(p);
}
741
+
742
+ // FIX 4 — parse an ADR's Status (Proposed / Accepted / Implemented / Superseded / Rejected /
743
+ // Deprecated) from the top of the document. ADRs in this corpus carry the status in any of:
744
+ // a metadata table row: | **Status** | Proposed | or | Status | Proposed |
745
+ // an inline header: **Status**: Proposed or Status: Proposed
746
+ // a section + bold value: ## Status\n**Proposed**
747
+ // Returns the normalized UPPERCASE status string, or null if none is found. We scan the doc's
748
+ // first chunk(s) where the header lives.
749
const ADR_STATUS_WORDS = '(proposed|accepted|implemented|superseded|rejected|deprecated|draft|in[\\s-]?progress)';
// Parse an ADR's status from the text of its first two chunks.
// chunks: array of objects with a `text` string (a missing text is treated as empty).
// Returns the status in UPPERCASE, or null when no status header is found. "in progress",
// "in-progress" and "inprogress" all come back as the single canonical form "IN-PROGRESS", the
// spelling statusIsProposed tests for (the run-together form used to come back as "INPROGRESS").
function parseAdrStatus(chunks) {
  if (!chunks || !chunks.length) return null;
  const head = chunks.slice(0, 2).map((c) => c?.text ?? '').join('\n');
  const patterns = [
    // table row: | **Status** | Proposed | / | Status | Accepted |
    new RegExp(`\\|\\s*\\**\\s*status\\s*\\**\\s*\\|\\s*\\**\\s*${ADR_STATUS_WORDS}`, 'i'),
    // inline: **Status**: Proposed / Status: Proposed
    new RegExp(`\\**status\\**\\s*:\\s*\\**\\s*${ADR_STATUS_WORDS}`, 'i'),
    // section: ## Status\n**Proposed**
    new RegExp(`#+\\s*status\\b[^\\n]*\\n+\\s*\\**\\s*${ADR_STATUS_WORDS}`, 'i'),
  ];
  for (const re of patterns) {
    const m = head.match(re);
    if (m && m[1]) {
      const status = m[1].toUpperCase();
      return /^IN[\s-]?PROGRESS$/.test(status) ? 'IN-PROGRESS' : status;
    }
  }
  return null;
}
767
+ // A status that means "design intent, not confirmed shipped" (vs ACCEPTED/IMPLEMENTED = built).
768
// status: a status string (any case) or a falsy value.
// Returns true only for the statuses treated as "design intent, not confirmed shipped":
// PROPOSED, DRAFT, IN-PROGRESS, REJECTED, DEPRECATED, SUPERSEDED. Falsy input yields false.
function statusIsProposed(status) {
  if (!status) return false;
  return /^(PROPOSED|DRAFT|IN-PROGRESS|REJECTED|DEPRECATED|SUPERSEDED)$/i.test(status);
}
771
+ // Back-compat: does the doc carry ANY status header? (trigger for INTENT(4) ADR-vs-code pairing.)
772
// chunks: array of objects with a `text` string.
// Returns true when parseAdrStatus finds a status, or when the first two chunks contain any
// status header form (heading, bold/plain "status:", or a table cell starting with "status").
function adrHasStatus(chunks) {
  if (!chunks || !chunks.length) return false;
  if (parseAdrStatus(chunks) !== null) return true;
  const head = chunks.slice(0, 2).map((c) => c.text).join('\n');
  return /(^|\n)\s*(#+\s*status\b|\*\*status\*\*\s*:|status\s*:|\|\s*\**status)/i.test(head);
}
778
+
779
const STOPWORDS = new Set(['the','a','an','and','or','of','to','in','for','on','with','how','do','i','is','are',
  'what','when','where','why','it','this','that','kb','query','question','search','find','show','me','please','about']);

// Tokenize a query into meaningful lexical terms (hybrid lexical matching).
// q: raw query string.
// Returns lower-case tokens of letters/digits with inner '.', '_' or '-' kept (so "ruvector-core"
// and "lib.rs" survive), at least 3 characters long and not in STOPWORDS. Trailing '.', '_' and
// '-' are stripped so sentence punctuation does not stick to the last word: "indexing." becomes
// "indexing", which is what the downstream substring/equality checks against paths expect.
function queryTerms(q) {
  return (q.toLowerCase().match(/[a-z0-9][a-z0-9._-]*/g) || [])
    .map((token) => token.replace(/[._-]+$/, ''))
    .filter((token) => token.length >= 3 && !STOPWORDS.has(token));
}
787
+
788
+ // Concept nouns from a concept what-is query (FIX 1). The query's meaningful terms MINUS the
789
+ // product name MINUS generic question words = the concrete concept(s) being asked about (rvf,
790
+ // witness, hnsw, gnn, segment, presence, occupancy, …). A doc whose title/path token-overlaps one
791
+ // of these is more likely to DEFINE it, so it gets a mild boost (below) — letting the real defining
792
+ // ADR/source/doc out-rank the thin product blurb without a hard force-route.
793
const CONCEPT_STOP = new Set(['what','difference','between','role','format','file','files','does',
  'store','stores','support','supported','index','indices','chain','segment','detection','counting',
  'augmented','work','works']);
// Extract the concept noun(s) of a what-is query.
// query: raw query string; store: store slug.
// Returns the query's terms after removing the product name, the store slug itself and the
// generic words in CONCEPT_STOP.
function conceptNouns(query, store) {
  const productPattern = productRe(store);
  const withoutProduct = productPattern ? query.replace(productPattern, ' ') : query;
  const isConcept = (term) => term !== store && !CONCEPT_STOP.has(term);
  return queryTerms(withoutProduct).filter(isConcept);
}
801
+
802
+ // Concept boost: SUBTRACT from a doc that names a concept noun from a concept what-is query.
803
+ // FIX 3 — the DEFINING doc must beat an ADJACENT one. A concept noun that appears in the doc's
804
+ // FILENAME SLUG or TITLE is a strong "this doc DEFINES the concept" signal (e.g. the file
805
+ // `ADR-029-ruvsense-multistatic-sensing-mode.md` / title containing "multistatic" defines
806
+ // "multistatic", while a sibling ADR that merely mentions it in the body does not). So we weight
807
+ // a filename-slug / title hit FAR above a bare path-substring hit, which makes the title/slug-exact
808
+ // defining doc out-rank an adjacent ADR. PRIMER#1 (thin product blurb) is excluded. NON-NEGATIVE.
809
// Boost (non-negative, capped at 0.62) for a document that names a concept noun.
// nouns: concept nouns from the query; path: document path; title: document title.
// A noun found in the file name or the title counts as a strong hit (0.30 each); a noun found
// only elsewhere in "path title" counts as a weak hit (0.06 each). A primer path whose path/title
// looks like the "what is" section gets no boost.
function conceptBoost(nouns, path, title) {
  if (!nouns || !nouns.length) return 0;
  const isWhatIsPrimer = PRIMER_PATH_RE.test(path) && /what\b.*\bis\b|#1-/i.test(`${path} ${title}`);
  if (isWhatIsPrimer) return 0;

  const fileName = (path.split('/').pop() || '').toLowerCase(); // the defining signal
  const lowerTitle = String(title || '').toLowerCase();
  const haystack = `${path} ${title}`.toLowerCase();

  const tally = nouns.reduce((acc, noun) => {
    if (fileName.includes(noun) || lowerTitle.includes(noun)) acc.strong += 1;
    else if (haystack.includes(noun)) acc.weak += 1;
    return acc;
  }, { strong: 0, weak: 0 });

  // With no hits the weighted sum is 0, so no separate early exit is needed.
  return Math.min(0.62, 0.30 * tally.strong + 0.06 * tally.weak);
}
828
+
829
+ // FIX 2 — crate-overview / metric intent. "what does crate X do" or "X compression ratio /
830
+ // throughput / benchmark" should surface the crate's README.md / BENCHMARK.md / docs (where the
831
+ // headline numbers live) ABOVE its benches/ + main.rs harness + bare Cargo.toml. Returns the crate
832
+ // token the query is about, or null. (Distinct from the crate-INVENTORY archetype "which crates…".)
833
const CRATE_METRIC_RE = /\b(compression(\s+ratio)?|throughput|benchmark|latency|qps|recall|speed ?up|ops\/s|ratio|perf(ormance)?)\b/i;
const CRATE_OVERVIEW_RE = /\b(what (does|is)|overview of|tell me about|describe)\b/i;
// query: raw query string; entityCrates: crate tokens named in the query (may be null/empty).
// Returns the first named crate when the query is a metric or overview question, else null.
function crateOverviewTarget(query, entityCrates) {
  if (!entityCrates || !entityCrates.length) return null;
  const wantsOverview = [CRATE_METRIC_RE, CRATE_OVERVIEW_RE].some((pattern) => pattern.test(query));
  return wantsOverview ? entityCrates[0] : null;
}
840
// Signed ranking adjustment for a crate-overview / metric query (negative promotes, positive
// demotes). Only paths inside the targeted crate are adjusted:
//   README.md / BENCHMARK.md / anything under docs/  -> -0.45 (the prose with the numbers)
//   benches/ (or bench/) and main.rs                 -> +0.20 (harness)
//   Cargo.toml                                       -> +0.18 (bare manifest)
// crateTok: targeted crate token (falsy -> 0); path: candidate path; store: store slug.
function crateOverviewAdjust(crateTok, path, store) {
  if (!crateTok || !inComponentRe(store, crateTok).test(path)) return 0;

  const isProse = /\/(readme|benchmark)\.md$/i.test(path) || /\/docs\//i.test(path);
  const isHarness = /\/benches?\//i.test(path) || /\/main\.rs$/i.test(path);
  const isManifest = /\/cargo\.toml$/i.test(path);

  let delta = 0;
  if (isProse) delta -= 0.45;
  if (isHarness) delta += 0.20;
  if (isManifest) delta += 0.18;
  return delta;
}
853
+
854
+ // ===================================================================================
855
+ // FIX B — targeted off-topic-magnet down-weight (RETRIEVAL POLISH). A specific document that
856
+ // keeps surfacing as off-topic noise on UNRELATED queries gets a mild penalty UNLESS the query is
857
+ // actually about that document's subject. General mechanism (a small allow-listed down-weight
858
+ // table); currently a single entry for ADR-096 (rvCSI crate layout), which intruded on unrelated
859
+ // queries (e.g. worldgraph spatial relationships). The penalty is mild so an on-topic query
860
+ // (about rvCSI crate layout / structure) still surfaces it normally via its allow regex.
861
+ // ===================================================================================
862
+ // Legacy default magnet table (ruvector/ruview only): ADR-096 (rvCSI crate layout) intruded on
863
+ // unrelated queries. New targets supply their own via config.offtopicMagnets (filled during gate-A
864
+ // tuning, P6a) — each entry { re|reSource, pen, allow|allowSource }.
865
// Legacy default magnet table. The allow regex uses `crate\s+organi\w*`: the previous bare
// `crate\s+organi` sat directly before the closing `\b`, so it could never match
// "crate organization" / "crate organisation" (no word boundary after "organi").
const LEGACY_OFFTOPIC_MAGNETS = {
  ruvector: [
    { re: /(^|\/)adr[-_]?0*96\b/i, pen: 0.22,
      allow: /\b(rvcsi|rv[-_]?csi|crate\s+layout|crate\s+structure|crate\s+organi\w*|workspace\s+layout|adr[-\s_]?0*96)\b/i },
  ],
  ruview: [
    { re: /(^|\/)adr[-_]?0*96\b/i, pen: 0.22,
      allow: /\b(rvcsi|rv[-_]?csi|crate\s+layout|crate\s+structure|crate\s+organi\w*|workspace\s+layout|adr[-\s_]?0*96)\b/i },
  ],
};

// Compile one magnet entry (which may carry string sources or RegExp objects) into a usable
// { re, allow, pen } record. Returns null for an entry with no path pattern at all: compiling an
// empty source would yield a regex that matches every path and penalizes the whole corpus.
function compileMagnet(entry) {
  if (!entry) return null;
  let re;
  if (entry.re instanceof RegExp) {
    re = entry.re;
  } else {
    const source = entry.reSource || (entry.re ? String(entry.re) : '');
    if (!source) return null;
    re = new RegExp(source, 'i');
  }
  let allow = null;
  if (entry.allow instanceof RegExp) {
    allow = entry.allow;
  } else if (entry.allow || entry.allowSource) {
    allow = new RegExp(entry.allowSource || String(entry.allow), 'i');
  }
  return { re, allow, pen: typeof entry.pen === 'number' ? entry.pen : 0.22 };
}

// Compiled magnets are cached per store so they are not recompiled on each call.
const _magnetCache = new Map();
// store: store slug. Returns the compiled magnet list for the store: the config's
// offtopicMagnets when it is a non-empty array, otherwise the legacy table entry (or []).
function magnetsFor(store) {
  if (_magnetCache.has(store)) return _magnetCache.get(store);
  const t = cfgFor(store);
  let list = (t && Array.isArray(t.offtopicMagnets) && t.offtopicMagnets.length) ? t.offtopicMagnets : null;
  if (!list) list = LEGACY_OFFTOPIC_MAGNETS[store] || [];
  const compiled = list.map(compileMagnet).filter(Boolean);
  _magnetCache.set(store, compiled);
  return compiled;
}
891
// Total off-topic-magnet penalty for a path.
// query: raw query string; path: candidate path; store: store slug.
// Sums `pen` over every magnet of the store whose `re` matches the path, skipping a magnet whose
// `allow` pattern matches the query (the query really is about that document).
function offtopicMagnetPenalty(query, path, store) {
  return magnetsFor(store).reduce((total, magnet) => {
    const queryIsOnTopic = Boolean(magnet.allow) && magnet.allow.test(query);
    const applies = magnet.re.test(path) && !queryIsOnTopic;
    return applies ? total + magnet.pen : total;
  }, 0);
}
898
+
899
+ // ===================================================================================
900
+ // FIX C — crate-specific maturity → the crate's OWN README/BENCHMARK (RETRIEVAL POLISH). A query
901
+ // like "is <crate> production-ready / experimental / complete" is answered by that crate's OWN
902
+ // README.md / BENCHMARK.md (which usually carry a status/maturity note), NOT by the global
903
+ // capabilities-graded primer or a cross-crate benchmark doc. Returns the named crate token when
904
+ // the query is a crate-scoped maturity question, else null.
905
const MATURITY_QUERY_RE = /\b(production[- ]?ready|production\b|experimental|prototype|complete|completeness|mature|maturity|stable|ready\s+for\s+production|battle[- ]?tested|is\s+it\s+(done|ready))\b/i;
// query: raw query string; entityCrates: crate tokens named in the query (may be null/empty).
// Returns the first named crate when the query is a maturity question, else null.
function crateMaturityTarget(query, entityCrates) {
  const namesCrate = Boolean(entityCrates) && entityCrates.length > 0;
  if (!namesCrate) return null;
  return MATURITY_QUERY_RE.test(query) ? entityCrates[0] : null;
}
911
+ // Boost the named crate's OWN README.md / BENCHMARK.md for a crate-maturity query (negative = boost).
912
// Boost the named crate's OWN README.md / BENCHMARK.md for a crate-maturity query.
// crateTok: named crate token (falsy -> 0); path: candidate path; store: store slug.
// Returns -0.50 (a boost) when the path is <prefix><crate>[-suffix]/(readme|benchmark).md, where
// the prefix comes from componentPrefixSrc; otherwise 0.
function crateMaturityAdjust(crateTok, path, store) {
  if (!crateTok) return 0;
  // Escape regex metacharacters so the crate token is matched literally.
  const literalTok = crateTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const tail = `${literalTok}(?:-[a-z0-9-]+)?/(?:readme|benchmark)\\.md$`;
  const ownDoc = new RegExp(componentPrefixSrc(store, tail), 'i');
  if (ownDoc.test(path)) return -0.50;
  return 0;
}
917
+
918
+ // FIX 3 — lexical boost: ADR-number exact hit, then proper-noun/title token overlap on
919
+ // the doc's path+title. Returns a NON-NEGATIVE amount to SUBTRACT from effective distance.
920
// Lexical boost: a NON-NEGATIVE amount to SUBTRACT from the effective distance.
// query: raw query string; terms: lexical query terms (null/undefined is treated as empty);
// path / title: the document's path and title.
//   +0.30 once, when the query names an ADR number carried by the document's path or title.
//   +0.06 per query term found in "path title", capped at 0.18.
function lexicalBoost(query, terms, path, title) {
  let boost = 0;
  const hay = `${path} ${title}`.toLowerCase();

  // ADR ids are compared by their significant digits, so "adr 27" and "ADR-027" both match a
  // document named ADR-27-*, ADR-027-* or ADR-0027-* (a pre-padded number used to miss any
  // shorter-padded file name). The query regex carries the same \b anchors as adrNumbers, so
  // "ADR-12345" is not truncated to "1234".
  const adrIds = Array.from(
    query.matchAll(/\badr[-\s_]?(\d{1,4})\b/gi),
    (m) => m[1].replace(/^0+(?=\d)/, ''),
  );
  if (adrIds.some((digits) => new RegExp(`adr[-_]?0*${digits}\\b`, 'i').test(hay))) boost += 0.30;

  // Title / path token overlap.
  let overlap = 0;
  for (const t of terms || []) {
    if (hay.includes(t)) overlap += 1;
  }
  if (overlap > 0) boost += Math.min(0.18, 0.06 * overlap);

  return boost;
}
940
+
941
// FIX 2 — demotion penalty for a path given the query.
// Walks every LOW_SIGNAL rule ({ re, allow, pen }): a rule contributes its `pen` when its `re`
// matches the path, unless the query itself matches the rule's `allow` pattern (i.e. the user
// explicitly asked about that kind of file). Returns the summed penalty (0 when nothing applies).
function demotionPenalty(query, path) {
  let total = 0;
  for (const rule of LOW_SIGNAL) {
    if (!rule.re.test(path)) continue;     // path is not of this low-signal kind
    if (rule.allow.test(query)) continue;  // query references the kind: do not demote
    total += rule.pen;
  }
  return total;
}
949
+
950
// Substance boost — favours a self-contained, answer-bearing document over a tiny fragment that
// merely happens to sit closer in vector space. Gentle and capped so it only breaks near-ties.
//   chunks — all chunks of one document (each with a `text` string); null/empty => 0.
// Scoring: +0.06 for >= 2 chunks, +0.06 more for >= 4 chunks, +0.06 for >= 4000 total chars,
// -0.06 for < 400 total chars (a stub). Result is clamped to [-0.06, 0.18].
function substanceBoost(chunks) {
  if (!chunks || !chunks.length) return 0;

  let totalChars = 0;
  for (const chunk of chunks) totalChars += chunk.text.length;

  const chunkCount = chunks.length;
  let score = 0;
  if (chunkCount >= 2) score += 0.06;
  if (chunkCount >= 4) score += 0.06;
  if (totalChars >= 4000) score += 0.06;
  if (totalChars < 400) score -= 0.06; // a sub-400-char stub is a fragment, demote it

  // Clamp to [-0.06, 0.18].
  if (score > 0.18) return 0.18;
  if (score < -0.06) return -0.06;
  return score;
}
964
+
965
// Disambiguation adjustment (negative = boost a "good" path, positive = penalize a "bad" path) for
// queries matching a config disambiguation entry. Each entry: { whenSource|when, goodSource|good,
// badSource|bad, goodBoost?, badPenalty? }. New targets fill target.disambiguation during gate-A
// tuning (P6a). Legacy ruvector/ruview keep the hard-coded "Cognitum Seed" rule as a fallback only.
const _disambigCache = new Map();

// Compile (once per store, then cached) the store's disambiguation rules into
// { when, good, bad, goodBoost, badPenalty } with ready-to-test RegExp objects.
//   store — store slug; its config is read through cfgFor(store).
// Rule source: the config's non-empty `disambiguation` array; otherwise the legacy Cognitum Seed
// rule for 'ruvector' / 'ruview'; otherwise no rules.
// Compilation notes:
//   - string sources are compiled case-insensitively ('i');
//   - an entry with neither `when` nor `whenSource` compiles to an empty pattern, which matches
//     every query;
//   - `good` / `bad` are null when the entry does not define them;
//   - goodBoost / badPenalty default to 0.25 / 0.30 when not numeric.
function disambigFor(store) {
  if (_disambigCache.has(store)) return _disambigCache.get(store);
  const t = cfgFor(store);
  let list = (t && Array.isArray(t.disambiguation) && t.disambiguation.length) ? t.disambiguation : null;
  if (!list) {
    // legacy fallback: the prototype's "Cognitum Seed" product disambiguation.
    list = (store === 'ruvector' || store === 'ruview')
      ? [{ when: SEED_QUERY_RE, good: SEED_GOOD_RE, bad: SEED_BAD_RE, goodBoost: 0.25, badPenalty: 0.30 }]
      : [];
  }

  // The compiled rules are cached and re-used with .test() many times (once per candidate doc).
  // A RegExp carrying the `g` or `y` flag keeps `lastIndex` between .test() calls and would give
  // alternating answers, so such flags are dropped from config-supplied RegExp objects.
  const reusable = (re) => ((re.global || re.sticky)
    ? new RegExp(re.source, re.flags.replace(/[gy]/g, ''))
    : re);
  // Optional pattern: RegExp as-is (made reusable), string/source compiled with 'i', else null.
  const optional = (value, source) => {
    if (value instanceof RegExp) return reusable(value);
    return (value || source) ? new RegExp(source || String(value), 'i') : null;
  };

  const compiled = list.map((d) => ({
    when: d.when instanceof RegExp ? reusable(d.when) : new RegExp(d.whenSource || String(d.when || ''), 'i'),
    good: optional(d.good, d.goodSource),
    bad: optional(d.bad, d.badSource),
    goodBoost: typeof d.goodBoost === 'number' ? d.goodBoost : 0.25,
    badPenalty: typeof d.badPenalty === 'number' ? d.badPenalty : 0.30,
  }));
  _disambigCache.set(store, compiled);
  return compiled;
}
992
// Sum the disambiguation adjustment for one candidate path.
// For every compiled rule of `store` whose `when` pattern matches the query: subtract goodBoost
// if the rule's `good` pattern matches the path, add badPenalty if its `bad` pattern matches.
// Returns the net adjustment (negative = boost, positive = penalty, 0 = no rule applied).
function seedAdjust(query, path, store) {
  return disambigFor(store).reduce((adj, rule) => {
    if (!rule.when.test(query)) return adj;
    let next = adj;
    if (rule.good?.test(path)) next -= rule.goodBoost;
    if (rule.bad?.test(path)) next += rule.badPenalty;
    return next;
  }, 0);
}
1001
+
1002
// disambigPrimerTargets — for each config disambiguation rule whose `when` matches the query,
// collect every LITERAL PRIMER section reference found in that rule's goodSource string
// (e.g. 'PRIMER#2-what-can-ruqu-do-for-you'). Callers force-inject these into the candidate pool
// so the rule's goodBoost can rank a section that fell outside the raw vector window.
// Config-driven; no repo baked in. Returns the distinct references in first-seen order.
function disambigPrimerTargets(query, store) {
  const target = cfgFor(store);
  const rawRules = (target && Array.isArray(target.disambiguation)) ? target.disambiguation : [];
  const found = new Set();
  for (const [i, rawRule] of rawRules.entries()) {
    // Compiled rules are index-aligned with the raw config entries.
    const rule = disambigFor(store)[i];
    if (!rule || !rule.when.test(query)) continue;
    const goodSource = String(rawRule.goodSource || '');
    for (const [slug] of goodSource.matchAll(/PRIMER#[0-9][A-Za-z0-9#-]*/g)) found.add(slug);
  }
  return [...found];
}
1018
+
1019
// The KB builder emits OVERLAPPING chunks (a sliding window repeats part of each neighbour), so
// plain concatenation would duplicate paragraphs. stitch() finds the LONGEST suffix of the running
// text that is also a prefix of the next chunk and returns the next chunk with that prefix removed.
//   prevTail — tail of the text assembled so far.
//   next     — the chunk about to be appended.
// Only overlaps of 24..2000 characters are considered; with no such overlap `next` is returned
// unchanged.
function stitch(prevTail, next) {
  const longest = Math.min(prevTail.length, next.length, 2000);
  let len = longest;
  while (len >= 24) {
    if (prevTail.endsWith(next.slice(0, len))) {
      return next.slice(len); // drop the duplicated overlap
    }
    len -= 1;
  }
  return next;
}
1031
+
1032
// Assemble one readable document from its id-ordered chunks, removing the sliding-window overlap
// between neighbours as it goes.
//   chunks    — all chunks of the document, in order (each { id, text, … }).
//   matchedId — id of the chunk that scored the hit; unknown/null falls back to the first chunk.
// Returns { fullText, chunksJoined, truncated }.
// When everything fits under MAX_DOC_CHARS the whole document is returned. Otherwise a contiguous
// window is grown outward from the matched chunk — alternately trying the following and the
// preceding chunk — so the answer-bearing region is always kept, and a trailing note reports how
// many earlier/later chunks were left out. The size budget is computed on raw chunk lengths (plus
// separators), i.e. before de-overlapping.
function assembleDocument(chunks, matchedId) {
  const SEP = '\n\n';
  if (!chunks.length) return { fullText: '', chunksJoined: 0, truncated: false };

  // Index of the matched chunk; first chunk when there is no (findable) match.
  const hitIdx = matchedId != null ? chunks.findIndex((c) => c.id === matchedId) : -1;
  const center = hitIdx >= 0 ? hitIdx : 0;

  // Contiguous kept window [lo, hi] and the cursors for the next candidate on each side.
  let lo = center;
  let hi = center;
  let used = chunks[center].text.length;
  let backCursor = center - 1;
  let fwdCursor = center + 1;

  // Try to take the next following chunk; on overflow, give up on the forward side for good.
  const growForward = () => {
    if (fwdCursor >= chunks.length) return false;
    const cost = SEP.length + chunks[fwdCursor].text.length;
    if (used + cost <= MAX_DOC_CHARS) {
      used += cost;
      hi = fwdCursor;
      fwdCursor += 1;
      return true;
    }
    fwdCursor = chunks.length; // stop growing forward once it no longer fits
    return false;
  };
  // Try to take the next preceding chunk; on overflow, give up on the backward side for good.
  const growBackward = () => {
    if (backCursor < 0) return false;
    const cost = SEP.length + chunks[backCursor].text.length;
    if (used + cost <= MAX_DOC_CHARS) {
      used += cost;
      lo = backCursor;
      backCursor -= 1;
      return true;
    }
    backCursor = -1; // stop growing backward once it no longer fits
    return false;
  };

  // Alternate the preferred direction each round, starting with forward.
  for (let forwardFirst = true; backCursor >= 0 || fwdCursor < chunks.length; forwardFirst = !forwardFirst) {
    const grew = forwardFirst
      ? (growForward() || growBackward())
      : (growBackward() || growForward());
    if (!grew) break;
  }

  // Stitch the kept contiguous window [lo..hi] into one clean document.
  let text = '';
  let joined = 0;
  for (let i = lo; i <= hi; i += 1) {
    const chunkText = chunks[i].text;
    if (text) {
      // Only the last 2000 chars can overlap with the next chunk.
      const fresh = stitch(text.slice(-2000), chunkText);
      if (fresh) text += SEP + fresh;
    } else {
      text = chunkText;
    }
    joined += 1;
  }

  if (chunks.length - joined <= 0) {
    return { fullText: text, chunksJoined: joined, truncated: false };
  }

  // Something was left out: describe what, on which side.
  const earlier = lo;
  const later = chunks.length - 1 - hi;
  const parts = [];
  if (earlier > 0) parts.push(`${earlier} earlier`);
  if (later > 0) parts.push(`${later} later`);
  const note = `${SEP}${SEP}[... ${parts.join(' + ')} chunk(s) omitted; window centered on the `
    + `matched section, capped at ${MAX_DOC_CHARS} chars ...]`;
  return { fullText: text + note, chunksJoined: joined, truncated: true };
}
1100
+
1101
+ // ---------- core search: returns whole-document results ----------
1102
+ // searchKb({ query, k = 6, store, n, variant }): embeds `query`, fetches max(RAW_HITS, k * 4) raw chunk hits from the store's .rvf, collapses them into documents keyed by path, reranks by an effective distance (best chunk distance + rankScale * summed intent/lexical offsets) and resolves to the top `n` documents (default 5, minimum 1 — `k` only sizes the raw fetch), optionally followed by one paired source document. Throws when the resolved .rvf file does not exist.
1103
+ // Each result: { path, title, fullText, bestDistance, effDistance, kind, adrStatus, statusLabel, label, chunksJoined, truncated, distance (alias of bestDistance), text (alias of fullText) }; the first result may additionally carry designIntentWarning.
1104
+ export async function searchKb({ query, k = 6, store, n, variant }) {
1105
+ const conf = resolveConf(store, variant);
1106
+ if (!fs.existsSync(conf.rvf)) throw new Error(`rvf not found: ${conf.rvf} (variant=${conf.variant}; run \`npm i\` then build, or copy the bundle in)`);
1107
+ const topN = Math.max(1, n || 5); // number of ranked documents returned (n, default 5, at least 1)
1108
+ const [qv, { byId, byPath }] = await Promise.all([embed(query, conf.embedCfg), loadPassages(conf.passages)]);
1109
+ const byPathKind = loadKinds(conf.meta); // intent layer: per-path content kind
1110
+ const terms = queryTerms(query);
1111
+
1112
+ // ---- INTENT CLASSIFICATION (deterministic, computed once per query) ----
1113
+ // FIX 1 — specific-entity detection. If the query names a crate / ADR id / file / proper noun it
1114
+ // is NOT a generic product-orientation question: suppress the orientation force-route + generic
1115
+ // PRIMER lift and demote primer-orientation docs. The crate-INVENTORY archetype ("which crates…")
1116
+ // is exempt (carries no hyphen-crate token) so it still routes to the inventory PRIMER.
1117
+ const crateTokens = crateTokenSet(byPath, store);
1118
+ const entity = specificEntity(query, crateTokens);
1119
+ let archetype = classifyArchetype(query, store); // 'maturity'|'capabilities'|…|'whatis-concept'|null
1120
+ // Suppress force-routing archetypes when a specific entity is named — EXCEPT:
1121
+ // (a) the crate inventory archetype ('crates') — a legitimate enumeration even with a hyphen token.
1122
+ // (b) a product-overview 'whatis' query that names a product alias (e.g. "CLI name versus
1123
+ // create-agent-harness") — the alias IS the product, so the orientation PRIMER should still win.
1124
+ const isProductWhatis = archetype === 'whatis' && WHATIS_FORCE_RE.test(query);
1125
+ if (entity.named && archetype && archetype !== 'crates' && archetype !== 'whatis-concept' && !isProductWhatis) {
1126
+ archetype = null;
1127
+ }
1128
+ // 'whatis-concept' is a NON-routing archetype: no force-route to a PRIMER, instead a mild concept
1129
+ // boost (below) lets the vector+rerank pipeline surface the true DEFINING doc. Other archetypes
1130
+ // force-route to their PRIMER slug (null slug -> no force-route, e.g. ruvector hardware).
1131
+ const targetPrimerSlug = (archetype && archetype !== 'whatis-concept')
1132
+ ? resolvePrimerSlug(archetype, store, byPath)
1133
+ : null;
1134
+ // FIX 2 — crate-overview / metric target: the crate whose README/BENCHMARK/docs should win.
1135
+ const crateOverviewTok = crateOverviewTarget(query, entity.crates);
1136
+ // Concept nouns drive the mild concept boost for concept what-is queries (FIX 1).
1137
+ const concepts = archetype === 'whatis-concept' ? conceptNouns(query, store) : [];
1138
+ // FIX 3 — DEFINING-DOC nouns: a concept/topic query that is NOT a product-overview, NOT a
1139
+ // force-routed orientation archetype, and NOT a specific-entity query should still surface the
1140
+ // DEFINING doc (e.g. "multistatic vs monostatic sensing" -> the ADR whose filename slug names
1141
+ // "multistatic"). These nouns drive the slug/title concept boost AND pull a slug-named defining
1142
+ // doc into the candidate pool, so a title/slug-exact doc beats an adjacent one. The whatis-concept
1143
+ // nouns are folded in so that path keeps its existing behavior.
1144
+ const definingNouns = (concepts.length)
1145
+ ? concepts
1146
+ : (!entity.named && !targetPrimerSlug && archetype !== 'crates'
1147
+ ? conceptNouns(query, store)
1148
+ : []);
1149
+ // For ruview, a concept what-is query may also softly boost the glossary section (a real defining
1150
+ // doc still wins when present, because the glossary is short/synthesized with a worse distance).
1151
+ const glossarySlug = archetype === 'whatis-concept' ? resolvePrimerSlug('glossary', store, byPath) : null;
1152
+ const adrNums = adrNumbers(query);
1153
+ const intent = codeDocIntent(query); // 'code' | 'design' | null
1154
+ // FIX A — implementation ("how is X coded") intent: demote wrong-file types (vendored deps,
1155
+ // bare entrypoints, manifests), promote the named crate's own src/**/*.rs.
1156
+ const implIntent = isImplIntent(query);
1157
+ const implCrateTok = (implIntent && entity.crates.length) ? entity.crates[0] : null;
1158
+ const implOpNouns = implIntent ? implOperationNouns(query, entity.crates) : [];
1159
+ // FIX D — named crate(s) WITHOUT impl intent: gently scope to the named crate's own source.
1160
+ // BUT NOT for a definitional "what is X" / overview query: there the user wants the concept doc,
1161
+ // and a crate-family token in the name (e.g. "wifi-densepose") must NOT boost crate src over the
1162
+ // plain-answer PRIMER. So suppress on the whatis-concept archetype + product-overview queries.
1163
+ const definitionalQ = archetype === 'whatis-concept' || isProductOverviewQuery(query, store);
1164
+ const namedCrateToks = (!implIntent && !definitionalQ && entity.crates.length) ? entity.crates : [];
1165
+ const namedOpNouns = namedCrateToks.length ? implOperationNouns(query, entity.crates) : [];
1166
+ // FIX C — crate-scoped maturity question: prefer the crate's OWN README/BENCHMARK over the
1167
+ // global capabilities primer / cross-crate benchmark doc.
1168
+ const crateMaturityTok = crateMaturityTarget(query, entity.crates);
1169
+
1170
+ const db = await RvfDatabase.openReadonly(conf.rvf);
1171
+ let hits;
1172
+ try {
1173
+ // Fetch plenty of raw chunk hits so we have material to group into documents and rerank.
1174
+ hits = await db.query(qv, Math.max(RAW_HITS, k * 4));
1175
+ } finally {
1176
+ await db.close(); // always release the database handle, even when the query throws
1177
+ }
1178
+
1179
+ // FIX 1 — collapse chunk hits into documents keyed by path; doc score = best (min) distance.
1180
+ const docs = new Map(); // path -> { path, title, bestDistance, matchedId }
1181
+ for (const h of hits) {
1182
+ const rec = byId.get(String(h.id));
1183
+ if (!rec) continue; // hit has no passage record in the sidecar: skip it
1184
+ const cur = docs.get(rec.path);
1185
+ if (!cur || h.distance < cur.bestDistance) {
1186
+ docs.set(rec.path, { path: rec.path, title: rec.title, bestDistance: h.distance, matchedId: rec.id });
1187
+ }
1188
+ }
1189
+
1190
+ // INTENT: ensure force-routed targets are IN the candidate pool even if MiniLM ranked them out
1191
+ // of the raw window. (a) the target PRIMER slug for an orientation archetype; (b) the real ADR
1192
+ // document for an exact ADR-NNN query. We synthesize a doc entry from byPath so it can be ranked
1193
+ // and then hard-boosted below. Without this, a force-route could point at a doc not in `docs`.
1194
+ const ensureDoc = (p) => {
1195
+ if (!p || docs.has(p)) return;
1196
+ const chunks = byPath.get(p);
1197
+ if (!chunks || !chunks.length) return;
1198
+ docs.set(p, { path: p, title: chunks[0].title, bestDistance: 1.0, matchedId: chunks[0].id }); // synthesized entry (no vector hit): fixed placeholder distance 1.0, first chunk as the match
1199
+ };
1200
+ if (targetPrimerSlug) ensureDoc(targetPrimerSlug);
1201
+ if (glossarySlug) ensureDoc(glossarySlug); // concept query: glossary may softly win for ruview
1202
+ // Config disambiguation: pull any PRIMER section named in a matched rule's goodSource into the
1203
+ // pool so its goodBoost can rank it even if bge ranked the synthesized section out of the window.
1204
+ // The goodSource names a PRIMER by PREFIX (e.g. "PRIMER#3-what-is-ruqu-made-of"); resolve it to
1205
+ // the actual full slug path(s) in byPath (e.g. "...-the-five-crates") so ensureDoc finds them.
1206
+ for (const pp of disambigPrimerTargets(query, store)) {
1207
+ if (byPath.has(pp)) { ensureDoc(pp); continue; }
1208
+ for (const realPath of byPath.keys()) {
1209
+ if (realPath.startsWith(pp) && PRIMER_PATH_RE.test(realPath)) ensureDoc(realPath);
1210
+ }
1211
+ }
1212
+ // For an exact ADR query, find the real ADR doc path(s) by scanning the passages index.
1213
+ const adrDocPaths = [];
1214
+ if (adrNums.length) {
1215
+ for (const num of adrNums) {
1216
+ for (const p of byPath.keys()) {
1217
+ if (pathIsAdrDoc(p, num) && !PRIMER_PATH_RE.test(p)) { adrDocPaths.push(p); ensureDoc(p); }
1218
+ }
1219
+ }
1220
+ }
1221
+ // FIX 3 — a concept what-is query's DEFINING doc may sit OUTSIDE the raw vector window (the defining
1222
+ // ADR can be titled by its codename, not the concept). Pull any doc whose FILENAME SLUG names a
1223
+ // concept noun into the candidate pool so the strengthened concept boost can rank it; the boost
1224
+ // (not a hard route) decides whether it actually wins. Capped scan (at most 40 additions) to stay cheap.
1225
+ if (definingNouns.length) {
1226
+ let added = 0;
1227
+ for (const p of byPath.keys()) {
1228
+ if (docs.has(p) || PRIMER_PATH_RE.test(p)) continue;
1229
+ const slug = (p.split('/').pop() || '').toLowerCase();
1230
+ if (definingNouns.some((t) => slug.includes(t))) { ensureDoc(p); if (++added >= 40) break; }
1231
+ }
1232
+ }
1233
+ // FIX 2 — ensure the targeted crate's README/BENCHMARK/docs are in the pool even if MiniLM ranked
1234
+ // them out (the harness file is often the closer vector match).
1235
+ if (crateOverviewTok) {
1236
+ const esc = crateOverviewTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
1237
+ const cre = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/(?:readme|benchmark)\\.md$`), 'i');
1238
+ for (const p of byPath.keys()) { if (cre.test(p)) ensureDoc(p); }
1239
+ }
1240
+ // FIX A — for an implementation query naming a component, pull that component's own src/** source
1241
+ // into the pool so the real algorithm source can be promoted above a vendored copy / entrypoint.
1242
+ if (implCrateTok) {
1243
+ const esc = implCrateTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
1244
+ const srcRe = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/src/.+\\.(?:rs|ts|tsx|js|mjs|py)$`), 'i');
1245
+ let added = 0;
1246
+ for (const p of byPath.keys()) {
1247
+ if (docs.has(p) || /(?:^|\/)main\.rs$/i.test(p)) continue;
1248
+ if (srcRe.test(p)) { ensureDoc(p); if (++added >= 40) break; }
1249
+ }
1250
+ }
1251
+ // FIX D — for a (non-impl) query naming component(s), pull each named component's own src/** into
1252
+ // the pool so the named component's source can be promoted above a sibling / example / bridge.
1253
+ for (const c of namedCrateToks) {
1254
+ const esc = c.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
1255
+ const srcRe = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/src/.+\\.(?:rs|ts|tsx|js|mjs|py)$`), 'i');
1256
+ let added = 0;
1257
+ for (const p of byPath.keys()) {
1258
+ if (docs.has(p) || /(?:^|\/)main\.rs$/i.test(p)) continue;
1259
+ if (srcRe.test(p)) { ensureDoc(p); if (++added >= 40) break; }
1260
+ }
1261
+ }
1262
+ // FIX C — for a component-maturity query, ensure the component's own README/BENCHMARK are pooled.
1263
+ if (crateMaturityTok) {
1264
+ const esc = crateMaturityTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
1265
+ const cre = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/(?:readme|benchmark)\\.md$`), 'i');
1266
+ for (const p of byPath.keys()) { if (cre.test(p)) ensureDoc(p); }
1267
+ }
1268
+
1269
+ // FIXes 2/3/4 + INTENT — compute effective distance per document.
1270
+ // Hard routes use a large negative adjustment so the routed doc wins decisively; intent tilts
1271
+ // are gentle (break ties / nudge) so they don't override a clearly-better vector match.
1272
+ const HARD_WIN = 5.0; // dominates any plausible distance + penalty (force #1)
1273
+ const ranked = [...docs.values()].map((d) => {
1274
+ const dkind = byPathKind.get(d.path) || null;
1275
+ const pen = demotionPenalty(query, d.path);
1276
+ const boost = lexicalBoost(query, terms, d.path, d.title);
1277
+ const seed = seedAdjust(query, d.path, store);
1278
+ const sub = substanceBoost(byPath.get(d.path));
1279
+ // FIX 1 — when a specific entity is named, the generic orientation lift is suppressed AND
1280
+ // primer-orientation docs are demoted below source/adr/crate-src/doc for this query.
1281
+ const suppressOrient = entity.named || archetype === 'whatis-concept';
1282
+ const orient = orientationBoost(query, d.path, d.title, suppressOrient); // FIX 5 — top-down orientation layer
1283
+ const primerDemote = (entity.named && dkind === 'primer-orientation') ? PRIMER_DEMOTE_WHEN_SPECIFIC : 0;
1284
+ // FIX 2 — crate-overview / metric: boost the crate's README/BENCHMARK/docs, demote its harness.
1285
+ const crateAdj = crateOverviewAdjust(crateOverviewTok, d.path, store); // negative=boost, positive=demote
1286
+ // FIX A — implementation intent: demote vendored deps / entrypoints / manifest, promote the
1287
+ // named component's own src/**/* (extra for an operation-token-matching filename).
1288
+ const implAdj = implIntent ? implAdjust(d.path, implCrateTok, implOpNouns, store) : 0;
1289
+ // FIX D — named component (no impl verb): gently promote the named component's own source.
1290
+ const namedCrateAdj = namedCrateToks.length ? namedCrateAdjust(d.path, namedCrateToks, namedOpNouns, store) : 0;
1291
+ // FIX B — targeted off-topic-magnet down-weight (config disambiguation/offtopicMagnets).
1292
+ const magnetPen = offtopicMagnetPenalty(query, d.path, store);
1293
+ // FIX C — crate-maturity: boost the named component's OWN README/BENCHMARK.
1294
+ const matAdj = crateMaturityAdjust(crateMaturityTok, d.path, store); // negative=boost
1295
+ // FIX 1/3 — concept boost: nudge docs whose slug/title names the concept (defining doc beats
1296
+ // adjacent); extra nudge for the glossary section on a whatis-concept query.
1297
+ let concept = conceptBoost(definingNouns, d.path, d.title);
1298
+ if (glossarySlug && d.path === glossarySlug && concepts.length) concept += 0.06;
1299
+ let intentAdj = 0; // accumulates boosts as POSITIVE amounts; it is SUBTRACTED in the offsets sum below
1300
+
1301
+ // INTENT (1) — orientation archetype force-route to the matching PRIMER slug.
1302
+ if (targetPrimerSlug && d.path === targetPrimerSlug) intentAdj += HARD_WIN;
1303
+
1304
+ // INTENT (2) — exact ADR-by-number hard route to the real ADR doc (must beat the index table).
1305
+ if (adrDocPaths.includes(d.path)) intentAdj += HARD_WIN;
1306
+
1307
+ // INTENT (3) — code-vs-doc tilt. Use the doc's content kind from the metadata sidecar.
1308
+ if (intent) {
1309
+ const kind = byPathKind.get(d.path);
1310
+ if (intent === 'code' && isSourceKind(kind)) intentAdj += 0.30; // prefer real code body
1311
+ if (intent === 'design' && kind === 'adr') intentAdj += 0.22; // prefer the ADR/doc
1312
+ }
1313
+
1314
+ // RANK SCALE — the additive offsets below were calibrated against MiniLM's distance scale
1315
+ // (relevant ~0.9–1.1). bge-base packs relevant docs much tighter (~0.4–0.8), so the same
1316
+ // raw offset over-corrects and inverts good raw rankings. Scale the WHOLE offset bundle by
1317
+ // the per-variant rankScale (small=1.0 → unchanged; big<1 → gentler, trusts bge's raw order).
1318
+ const RANK_SCALE = conf.embedCfg.rankScale ?? 1;
1319
+ const offsets = pen - boost + seed - sub - orient - concept - intentAdj
1320
+ + primerDemote + crateAdj + implAdj + namedCrateAdj + magnetPen + matAdj;
1321
+ const effDistance = d.bestDistance + RANK_SCALE * offsets;
1322
+ return { ...d, effDistance, kind: dkind };
1323
+ }).sort((a, b) => a.effDistance - b.effDistance); // ascending: the smallest effective distance ranks first
1324
+
1325
+ // INTENT (4) — ADR-vs-code pairing for completeness. If #1 is an ADR carrying a Status: header
1326
+ // (a proposal/decision = intent, not built reality) and NO source doc is in the top-N, pull the
1327
+ // best-matching source doc into the returned set so the reader sees proposal vs built code. We
1328
+ // only ADD a result (and tag it); we never displace the routed #1 or break whole-doc return.
1329
+ let pairedSource = null;
1330
+ if (ranked.length) {
1331
+ const top = ranked[0];
1332
+ const topKind = top.kind || byPathKind.get(top.path);
1333
+ if (topKind === 'adr' && adrHasStatus(byPath.get(top.path))) {
1334
+ const inTop = ranked.slice(0, topN).some((d) => isSourceKind(d.kind));
1335
+ if (!inTop) {
1336
+ pairedSource = ranked.find((d) => isSourceKind(d.kind)) || null;
1337
+ }
1338
+ }
1339
+ }
1340
+
1341
+ // FIX 1 — assemble the FULL document for the top-N distinct documents.
1342
+ const assemble = (d, label) => {
1343
+ const chunks = byPath.get(d.path) || [];
1344
+ const { fullText, chunksJoined, truncated } = chunks.length
1345
+ ? assembleDocument(chunks, d.matchedId)
1346
+ : { fullText: '(NO PASSAGE TEXT — path not found in sidecar)', chunksJoined: 0, truncated: false };
1347
+ // FIX 4 — parse + surface the ADR Status (Proposed/Accepted/Implemented/…) for ADR docs (or any
1348
+ // doc whose head carries a Status block). `adrStatus` rides the result object so the MCP path
1349
+ // carries it too; `statusLabel` is the human-visible header tag.
1350
+ const kind = d.kind || byPathKind.get(d.path) || null;
1351
+ const adrStatus = (kind === 'adr' || adrHasStatus(chunks)) ? parseAdrStatus(chunks) : null;
1352
+ const statusLabel = adrStatus
1353
+ ? `ADR STATUS: ${adrStatus}${statusIsProposed(adrStatus)
1354
+ ? ' — design intent, NOT confirmed shipped'
1355
+ : ' — accepted/implemented'}`
1356
+ : null;
1357
+ return {
1358
+ path: d.path,
1359
+ title: d.title,
1360
+ fullText,
1361
+ bestDistance: d.bestDistance,
1362
+ effDistance: d.effDistance,
1363
+ kind,
1364
+ adrStatus, // FIX 4 — parsed ADR status (null if none) — carried to MCP
1365
+ statusLabel, // FIX 4 — human-visible "[ADR STATUS: …]" tag
1366
+ label: label || null, // intent label (e.g. 'paired-source') for callers/UI
1367
+ chunksJoined,
1368
+ truncated,
1369
+ // back-compat aliases for callers that still read .text / .distance
1370
+ text: fullText,
1371
+ distance: d.bestDistance,
1372
+ };
1373
+ };
1374
+
1375
+ const out = ranked.slice(0, topN).map((d) => assemble(d));
1376
+
1377
+ // INTENT (4) — append the paired implementing source so proposal-vs-built-reality is visible.
1378
+ // Appended (not inserted) so the routed/ranked order — including the whole-doc #1 ADR — is intact.
1379
+ if (pairedSource && !out.some((r) => r.path === pairedSource.path)) {
1380
+ out.push(assemble(pairedSource, 'paired-source (implements the ADR above)'));
1381
+ }
1382
+
1383
+ // FIX 4 — proposal-as-reality guard. If the #1 result is a Proposed/not-yet-Implemented ADR (or a
1384
+ // design/DDD doc with no parseable status) AND no kind:'source' implementing file is in the set,
1385
+ // attach a clear design-intent warning so the reader never treats a proposal as shipped reality.
1386
+ if (out.length) {
1387
+ const top = out[0];
1388
+ const isDesignTop = top.kind === 'adr'
1389
+ || top.kind === 'doc' || top.kind === 'doc-deep' || top.kind === 'ddd';
1390
+ const proposed = top.adrStatus ? statusIsProposed(top.adrStatus) : (top.kind !== 'source' && top.kind !== 'crate-src');
1391
+ const hasSource = out.some((r) => isSourceKind(r.kind));
1392
+ if (isDesignTop && proposed && !hasSource) {
1393
+ const st = top.adrStatus || 'unstated (design/DDD doc)';
1394
+ top.designIntentWarning =
1395
+ `⚠ This is design intent (ADR status: ${st}); no implementing source was retrieved — `
1396
+ + `treat as proposed, not confirmed built.`;
1397
+ }
1398
+ }
1399
+
1400
+ return out;
1401
+ }
1402
+
1403
// ---------- structured lookups (exact, not semantic) ----------
// The drop-in for-ai/ ships <store>-symbols.json / -dep-graph.json / -entrypoints.json. These give
// an AI EXACT answers (a function signature, who-depends-on-what, the build/test/run commands)
// without a vector search. Exported so the MCP server can surface them too.
//
// loadStructured(store) reads the three JSON files from the store's directory and returns
// { symbols, depGraph, entrypoints }. Best-effort by design: a file that is missing, unreadable
// or not valid JSON yields null for that field instead of throwing.
export function loadStructured(store) {
  const baseDir = storeDir(store);
  const readJson = (suffix) => {
    try {
      const raw = fs.readFileSync(path.join(baseDir, `${store}-${suffix}.json`), 'utf8');
      return JSON.parse(raw);
    } catch {
      return null;
    }
  };
  const symbols = readJson('symbols');
  const depGraph = readJson('dep-graph');
  const entrypoints = readJson('entrypoints');
  return { symbols, depGraph, entrypoints };
}
1412
+
1413
// Exact symbol lookup (no vector search).
//   store — store slug whose <store>-symbols.json is consulted.
//   name  — case-insensitive substring matched against each symbol's name OR module; an empty /
//           missing name matches every symbol.
//   kind  — optional exact filter on the symbol's `kind`.
//   limit — maximum number of matches returned (default 25).
// Returns { available: false, matches: [] } when the symbols file could not be loaded, otherwise
// { available: true, count (total matches before the limit), method, matches (first `limit`) }.
export function lookupSymbol(store, name, { kind, limit = 25 } = {}) {
  const { symbols: index } = loadStructured(store);
  if (!index) return { available: false, matches: [] };

  const needle = String(name || '').toLowerCase();
  const textHit = (s) => !needle
    || s.name.toLowerCase().includes(needle)
    || (s.module || '').toLowerCase().includes(needle);

  const byText = index.symbols.filter(textHit);
  const hits = kind ? byText.filter((s) => s.kind === kind) : byText;
  return { available: true, count: hits.length, method: index.method, matches: hits.slice(0, limit) };
}
1423
+
1424
// Exact entrypoints passthrough: the parsed <store>-entrypoints.json as returned by
// loadStructured, or null when that file could not be loaded.
export function getEntrypoints(store) {
  const { entrypoints } = loadStructured(store);
  return entrypoints;
}
1426
// Exact dep-graph passthrough: the parsed <store>-dep-graph.json as returned by loadStructured,
// or null when that file could not be loaded.
export function getDepGraph(store) {
  const { depGraph } = loadStructured(store);
  return depGraph;
}
1427
+
1428
// ---------- CLI ----------
// Command-line entry point. Two modes:
//   1. Structured lookup (exact, no vector search):
//        [store] --symbol NAME | [store] --entrypoints | [store] --deps
//      The store may be omitted (falls back to the configured default) and may appear before or
//      after the flag.
//   2. Semantic search:
//        <store> "question" [k] [big|small]
// Exit status: 2 for a usage error or an unknown store, 1 when a requested structured file is
// not present. Search failures reject the returned promise.
async function main() {
  const argv = process.argv.slice(2);

  // Structured-lookup subcommands (exact, no vector search): --symbol NAME | --entrypoints | --deps.
  const sIdx = argv.findIndex((a) => a === '--symbol' || a === '--entrypoints' || a === '--deps');
  if (sIdx !== -1) {
    const flag = argv[sIdx];
    // The token right after --symbol is the symbol NAME, never the store. Skipping it keeps
    // `--symbol foo` on the default store and lets `--symbol foo <store>` work, instead of
    // mistaking "foo" for a store slug.
    const nameIdx = flag === '--symbol' ? sIdx + 1 : -1;
    const store = argv.find((a, i) => i !== nameIdx && !a.startsWith('--')) || CONFIG_DEFAULT;
    if (!knownStore(store)) { console.error(`unknown store: ${store}`); process.exit(2); }
    if (flag === '--symbol') {
      const name = argv[sIdx + 1] || '';
      const r = lookupSymbol(store, name, { limit: 40 });
      if (!r.available) { console.error(`no ${store}-symbols.json present (run extract-symbols.mjs)`); process.exit(1); }
      console.log(`\n=== ${store} symbols matching "${name}" (${r.count} via ${r.method}) ===\n`);
      for (const s of r.matches) console.log(`${s.kind.padEnd(9)} ${s.signature}\n @ ${s.file}:${s.line}${s.doc ? `\n ${s.doc}` : ''}\n`);
    } else if (flag === '--entrypoints') {
      const e = getEntrypoints(store);
      if (!e) { console.error(`no ${store}-entrypoints.json present`); process.exit(1); }
      console.log(JSON.stringify({ workspace: e.workspace, install: e.install, quickstart: e.quickstart, binaries: e.binaries, commands: e.commands }, null, 2));
    } else {
      const g = getDepGraph(store);
      if (!g) { console.error(`no ${store}-dep-graph.json present`); process.exit(1); }
      console.log(JSON.stringify({ nodes: g.nodes.map((n) => ({ name: n.name, ecosystem: n.ecosystem, description: n.description })), internalEdges: g.internalEdges, externalDepNames: g.externalDepNames }, null, 2));
    }
    return;
  }

  // optional trailing [big|small] variant selector; default auto-picks big if present
  let variant;
  const vIdx = argv.findIndex((a) => a === 'big' || a === 'small');
  if (vIdx !== -1) variant = argv.splice(vIdx, 1)[0];
  const [store, query, kArg] = argv;
  if (!store || !query) {
    console.error(`Usage: node kb/ask-kb.mjs <${[...KNOWN_STORES].join('|')}> "question" [k] [big|small]`);
    console.error(` or: node kb/ask-kb.mjs <store> --symbol <name> | --entrypoints | --deps`);
    process.exit(2);
  }
  // k falls back to 6 when absent or not a positive integer.
  const k = Math.max(1, parseInt(kArg || '6', 10) || 6);
  const conf = resolveConf(store, variant);
  const results = await searchKb({ query, k, store, variant });
  console.log(`\n=== ${store} KB (${conf.variant} · ${conf.embedCfg.model}) — "${query}" — top ${results.length} documents ===\n`);
  results.forEach((r, i) => {
    console.log(`#${i + 1} distance=${r.bestDistance.toFixed(4)} (eff=${r.effDistance.toFixed(4)})`
      + `${r.kind ? ` kind=${r.kind}` : ''}${r.label ? ` [${r.label}]` : ''}`
      + `${r.statusLabel ? ` [${r.statusLabel}]` : ''}`); // FIX 4 — surface ADR status in the header
    console.log(`path : ${r.path}`);
    console.log(`title: ${r.title}`);
    if (r.designIntentWarning) console.log(r.designIntentWarning); // FIX 4 — proposal-as-reality guard
    console.log(`chars: ${r.fullText.length} | chunks: ${r.chunksJoined}${r.truncated ? ' (truncated)' : ''}`);
    console.log('----- full document -----');
    console.log(r.fullText);
    console.log('===================================================================\n');
  });
}
1483
+
1484
// Run as CLI when invoked directly (handles spaces in path: __filename is already URL-decoded).
// Compare REAL paths: when the script is started through a symlink, process.argv[1] is the link
// path while __filename (derived from import.meta.url) is the resolved target, so a plain
// path.resolve() comparison would silently skip main(). If a real path cannot be resolved, fall
// back to the plain resolved-path comparison.
{
  const invokedPath = process.argv[1];
  let isDirectRun = false;
  if (invokedPath) {
    try {
      isDirectRun = fs.realpathSync(invokedPath) === fs.realpathSync(__filename);
    } catch {
      isDirectRun = path.resolve(invokedPath) === path.resolve(__filename);
    }
  }
  if (isDirectRun) {
    main().catch((e) => { console.error('ERROR:', e.message); process.exit(1); });
  }
}