explainmyrepo 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -0
- package/assets/design-system/design-system.css +833 -0
- package/assets/design-system/theme-example.css +83 -0
- package/bin/explainmyrepo.mjs +115 -0
- package/kb/ask-kb.mjs +1487 -0
- package/kb/build-kb.mjs +353 -0
- package/kb/corpus-rules.mjs +341 -0
- package/kb/dep-graph.mjs +184 -0
- package/kb/entrypoints.mjs +207 -0
- package/kb/extract-symbols.mjs +322 -0
- package/kb/index-primer.mjs +255 -0
- package/kb/kb-mcp-server.mjs +186 -0
- package/kb/kb.config.mjs +1362 -0
- package/kb/make-dropin.mjs +224 -0
- package/kb/resolve-deps.mjs +126 -0
- package/package.json +52 -0
- package/src/brain.mjs +298 -0
- package/src/build-context.mjs +66 -0
- package/src/claude.mjs +97 -0
- package/src/env.mjs +77 -0
- package/src/orchestrator.mjs +419 -0
- package/src/run-tool.mjs +49 -0
- package/tools/CONTRACT.md +301 -0
- package/tools/assemble-page.mjs +631 -0
- package/tools/build-kb.mjs +159 -0
- package/tools/clone-repo.mjs +161 -0
- package/tools/deploy.mjs +160 -0
- package/tools/generate-image.mjs +280 -0
- package/tools/make-diagrams.mjs +835 -0
- package/tools/make-favicon.mjs +145 -0
- package/tools/make-pack.mjs +295 -0
- package/tools/make-social-card.mjs +198 -0
- package/tools/notify.mjs +327 -0
- package/tools/publish-repo.mjs +156 -0
- package/tools/quality-grade.mjs +746 -0
- package/tools/readme-enhance.mjs +310 -0
- package/tools/repo-seo.mjs +143 -0
package/kb/ask-kb.mjs
ADDED
|
@@ -0,0 +1,1487 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// ask-kb.mjs — self-contained CLI to query a Cognitum RVF knowledge base and print
|
|
3
|
+
// the FULL top-k passages (not previews). Joins .rvf vector hits to the full-text
|
|
4
|
+
// passages sidecar (.passages.jsonl) by id.
|
|
5
|
+
//
|
|
6
|
+
// Usage:
|
|
7
|
+
// node kb/ask-kb.mjs <store-slug> "your question" [k] (store slugs come from kb.config.mjs)
|
|
8
|
+
//
|
|
9
|
+
// Deps: @ruvector/rvf + @xenova/transformers (resolved PORTABLY — see resolve-deps.mjs:
|
|
10
|
+
// project node_modules first, then RVF_MODULE_PATH/XENOVA_PATH env, then author Mac paths)
|
|
11
|
+
// + the bundled kb/*.rvf and kb/*.passages.jsonl files. So `cd kb && npm i` then run.
|
|
12
|
+
// Model cache is configurable via KB_MODEL_CACHE (offline if cached, else downloads MiniLM
|
|
13
|
+
// from HuggingFace — works on a fresh machine).
|
|
14
|
+
|
|
15
|
+
import fs from 'node:fs';
|
|
16
|
+
import path from 'node:path';
|
|
17
|
+
import readline from 'node:readline';
|
|
18
|
+
import { fileURLToPath } from 'node:url';
|
|
19
|
+
import { loadRvf, loadTransformers, configureModel } from './resolve-deps.mjs';
|
|
20
|
+
import { targets as CONFIG_TARGETS, defaultTarget as CONFIG_DEFAULT } from './kb.config.mjs';
|
|
21
|
+
|
|
22
|
+
const { mod: rvfMod, via: rvfVia } = loadRvf();
|
|
23
|
+
const { RvfDatabase } = rvfMod;
|
|
24
|
+
if (process.env.KB_DEBUG) console.error(`[ask-kb] @ruvector/rvf via: ${rvfVia}`);
|
|
25
|
+
|
|
26
|
+
const __filename = fileURLToPath(import.meta.url); // decodes %20 etc.
|
|
27
|
+
const KB_DIR = path.dirname(__filename);
|
|
28
|
+
|
|
29
|
+
// ===================================================================================
|
|
30
|
+
// CONFIG BRIDGE — per-store config is read from kb.config.mjs (NO hard-coded repo names).
|
|
31
|
+
// Everything the intent/rerank layer needs that used to be a hard-coded ruvector/ruview map is
|
|
32
|
+
// now derived here from the target entry: metaName, productNames, componentRoots, componentWord,
|
|
33
|
+
// disambiguation, offtopicMagnets, primerSlugs (auto = discovered from the live sidecar at query
|
|
34
|
+
// time). knownStore() replaces the hard-coded `store !== 'ruvector' && store !== 'ruview'` checks.
|
|
35
|
+
// ===================================================================================
|
|
36
|
+
// Set of valid store slugs, built once at module load from the own keys of the `targets`
// export of kb.config.mjs (imported above as CONFIG_TARGETS).
const KNOWN_STORES = new Set(Object.keys(CONFIG_TARGETS));
|
|
37
|
+
// Membership test against the configured store slugs.
//   store   : slug to look up
//   returns : true when the slug is present in KNOWN_STORES, false otherwise
function knownStore(store) {
  const isDeclared = KNOWN_STORES.has(store);
  return isDeclared;
}
|
|
38
|
+
// Return the config entry for `store`, or null when the store is not configured.
//   store   : store slug (a key of CONFIG_TARGETS)
//   returns : the target entry, or null
// Only OWN keys count: a plain bracket lookup would hand back inherited members such as
// `constructor` / `toString` for a slug that happens to collide with Object.prototype.
function cfgFor(store) {
  if (!Object.prototype.hasOwnProperty.call(CONFIG_TARGETS, store)) return null;
  return CONFIG_TARGETS[store] || null;
}
|
|
39
|
+
|
|
40
|
+
// componentWord synonym group (e.g. ['crate','package','module','component']) — injected into the
|
|
41
|
+
// intent regexes so an npm-package repo (packages/<name>) fires the same routes a crate repo did.
|
|
42
|
+
// Lower-cased synonym group for "component" in this store (from config.componentWord).
//   store   : store slug
//   returns : a fresh array of lower-cased words; the built-in default group when the config
//             supplies nothing usable
// Accepts either an array or a single string (a bare string used to pass the `.length` check and
// then blow up on `.map`). Blank entries are dropped so the regex alternation built from this
// list can never contain an empty alternative.
function componentWords(store) {
  const fallback = ['crate', 'package', 'module', 'component'];
  const t = cfgFor(store);
  const raw = t ? t.componentWord : null;
  let list = [];
  if (Array.isArray(raw)) list = raw;
  else if (typeof raw === 'string') list = [raw];
  const words = list
    .map((x) => String(x).trim().toLowerCase())
    .filter(Boolean);
  return words.length ? words : fallback;
}
|
|
47
|
+
// Regex alternation of the component words, e.g. (?:crate|package|module|component).
|
|
48
|
+
// Regex SOURCE for a non-capturing alternation of the store's component words,
// e.g. "(?:crate|package|module|component)". Each word is escaped so it is matched literally.
//   store   : store slug
//   returns : regex source string
function componentWordAlt(store) {
  const escaped = [];
  for (const word of componentWords(store)) {
    escaped.push(word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  }
  return '(?:' + escaped.join('|') + ')';
}
|
|
51
|
+
// componentRoots (e.g. ['packages'] or ['crates']) — the directory segment(s) that hold components.
|
|
52
|
+
// A component path looks like `<root>/<name>` (optionally under a `vN/` prefix, legacy crates case).
|
|
53
|
+
// Directory segment(s) that hold components for this store (from config.componentRoots).
//   store   : store slug
//   returns : array of root segments without leading/trailing slashes; ['crates', 'packages']
//             when the config supplies nothing usable
// Accepts an array or a single string (a bare string used to pass the `.length` check and crash
// later when the caller mapped over it). Surrounding slashes are stripped because the caller
// appends its own "/" — "packages/" would otherwise build "packages//<name>" and never match.
function componentRoots(store) {
  const t = cfgFor(store);
  const raw = t ? t.componentRoots : null;
  let list = [];
  if (Array.isArray(raw)) list = raw;
  else if (typeof raw === 'string') list = [raw];
  const roots = list
    .map((r) => String(r).trim().replace(/^\/+|\/+$/g, ''))
    .filter(Boolean);
  return roots.length ? roots : ['crates', 'packages'];
}
|
|
57
|
+
// Build a regex SOURCE that matches `(?:vN/)?<root>/<NAME>` for the store's componentRoots, with the
|
|
58
|
+
// component-name capture being `nameSrc` (caller supplies a literal token or a generic name pattern).
|
|
59
|
+
// Regex SOURCE matching "(vN/)?<root>/<name>" at the start of a path or after a "/".
//   store   : store slug (selects the component roots)
//   nameSrc : regex source for the component-name part; inserted verbatim (NOT escaped), so the
//             caller may pass a capture group or an already-escaped literal
//   returns : regex source string
function componentPrefixSrc(store, nameSrc) {
  const escapeRe = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const alternatives = componentRoots(store).map((root) => escapeRe(root));
  return '(?:^|/)(?:v\\d+/)?(?:' + alternatives.join('|') + ')/' + nameSrc;
}
|
|
63
|
+
// A regex testing whether `path` lives inside a SPECIFIC named component (token may itself be a
|
|
64
|
+
// prefix, e.g. "ruvector-core" matches "crates/ruvector-core/..." or "packages/ruvector-core/...").
|
|
65
|
+
// Case-insensitive RegExp that tests whether a path lies inside the component named `token`,
// or inside a component whose name is `token` plus a "-suffix" (so "ruvector-core" also covers
// "ruvector-core-wasm/").
//   store   : store slug (selects the component roots)
//   token   : component name, matched literally
//   returns : RegExp
function inComponentRe(store, token) {
  const literal = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const nameSrc = literal + '(?:-[a-z0-9-]+)?/';
  const source = componentPrefixSrc(store, nameSrc);
  return new RegExp(source, 'i');
}
|
|
69
|
+
// The product-name matcher for a store, built from config.productNames (escaped, word-ish bounded).
|
|
70
|
+
// Used by isProductOverviewQuery / conceptNouns. Falls back to the store slug.
|
|
71
|
+
// Case-insensitive matcher for the store's product name(s), built from config.productNames and
// falling back to the store slug.
//   store   : store slug
//   returns : RegExp that matches any product name as a whole "word"
//
// Details:
//  - names are trimmed, regex-escaped, and internal whitespace is relaxed to "[\s-]?" so
//    "Agent Harness" also matches "agent-harness" / "agentharness";
//  - alternatives are ordered longest-first so a longer name beats one that is its prefix;
//  - if every configured name is blank we fall back to the slug instead of building an EMPTY
//    alternation (which would match any query that contains a word boundary);
//  - boundaries use look-arounds instead of \b so names that start or end with a non-word
//    character (e.g. "@scope/pkg") can still match; for ordinary names this is equivalent to \b.
function productNameRe(store) {
  const toAlternatives = (names) => names
    .map((n) => String(n).trim())
    .filter(Boolean)
    .map((n) => n.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\s+/g, '[\\s-]?'))
    .sort((a, b) => b.length - a.length);

  const t = cfgFor(store);
  const raw = t ? t.productNames : null;
  let configured = [];
  if (Array.isArray(raw)) configured = raw;
  else if (typeof raw === 'string') configured = [raw];

  let alts = toAlternatives(configured);
  if (!alts.length) alts = toAlternatives(store == null ? [] : [store]);
  // Nothing at all to match on: return a pattern that can never match.
  if (!alts.length) return new RegExp('(?!)', 'i');
  return new RegExp(`(?<![A-Za-z0-9_])(?:${alts.join('|')})(?![A-Za-z0-9_])`, 'i');
}
|
|
82
|
+
|
|
83
|
+
// ---------- variant-aware store resolution ----------
|
|
84
|
+
// Two builds ship per repo (same passages, different embedder):
|
|
85
|
+
// small (384-dim MiniLM) — the Seed-compatible default; files: <store>-kb.rvf
|
|
86
|
+
// big (768-dim bge) — sharper, for Mac/PC; files: <store>-kb.big.rvf
|
|
87
|
+
// One tool serves both: the embedder for a query is read from the <rvf>.embed.json
|
|
88
|
+
// sidecar the build wrote next to each .rvf, so the query is always embedded with the
|
|
89
|
+
// SAME model the corpus was. Absent that sidecar we fall back to MiniLM (the small build).
|
|
90
|
+
// Default embedder settings: used as the base that a store's <rvf>.embed.json is merged over in
// resolveConf(), and as the default `cfg` of embed(). Empty queryPrefix = no prefix is prepended.
const MINILM_CFG = { model: 'Xenova/all-MiniLM-L6-v2', pooling: 'mean', normalize: true, queryPrefix: '' };
|
|
91
|
+
|
|
92
|
+
// KB data files live in kb/stores/<store>/ when organized that way (clear, per-repo), and
|
|
93
|
+
// fall back to a flat kb/ layout otherwise (bundles unzip flat). One rule, both layouts.
|
|
94
|
+
// Directory holding a store's KB data files.
//   store   : store slug
//   returns : <KB_DIR>/stores/<store> when that directory entry exists, otherwise KB_DIR itself
//             (the flat layout)
function storeDir(store) {
  const perStoreDir = path.join(KB_DIR, 'stores', store);
  if (fs.existsSync(perStoreDir)) {
    return perStoreDir;
  }
  return KB_DIR;
}
|
|
98
|
+
|
|
99
|
+
// Compute the file set for one (store, variant) pair. Nothing is opened; only existence checks.
//   store   : store slug
//   variant : 'big' selects <store>-kb.big.rvf; anything else is treated as the small build
//   returns : { rvf, passages, meta, embedCfgPath } (absolute or KB_DIR-relative paths, as
//             produced by storeDir)
function variantPaths(store, variant) {
  const dir = storeDir(store);
  const stem = `${store}-kb`;
  const base = path.join(dir, stem);
  const present = (name) => fs.existsSync(path.join(dir, name));

  // Metadata sidecar: prefer *.ids.json; use the legacy *.meta.json only when it exists and
  // ids.json does not. When neither exists the ids.json name is still reported.
  const idsName = `${stem}.ids.json`;
  const legacyName = `${stem}.meta.json`;
  let metaFile = idsName;
  if (!present(idsName) && present(legacyName)) {
    metaFile = legacyName;
  }

  // Vector file: the big build is always tagged; the small build prefers the un-suffixed
  // <store>-kb.rvf and falls back to the legacy .small.rvf tag.
  let rvf;
  if (variant === 'big') {
    rvf = `${base}.big.rvf`;
  } else {
    const untagged = `${base}.rvf`;
    rvf = fs.existsSync(untagged) ? untagged : `${base}.small.rvf`;
  }

  // Passages are shared (un-tagged) by both variants; the embed config sits next to the .rvf.
  const passages = `${base}.passages.jsonl`;
  const meta = path.join(dir, metaFile);
  const embedCfgPath = `${rvf}.embed.json`;
  return { rvf, passages, meta, embedCfgPath };
}
|
|
126
|
+
|
|
127
|
+
// Resolve the file set + embedder config for a (store, variant). variant defaults to
|
|
128
|
+
// 'big' when a big build is present (best answers), else 'small' — so a fresh checkout
|
|
129
|
+
// with only the Seed build still works, and a Mac bundle auto-uses the sharp one.
|
|
130
|
+
// Resolve the file set plus embedder config for a (store, variant).
//   store   : store slug; must satisfy knownStore()
//   variant : 'big' | 'small'; any other value means "auto" — 'big' when <store>-kb.big.rvf
//             exists in the store directory, otherwise 'small'
//   returns : { rvf, passages, meta, embedCfgPath, embedCfg, variant }
//   throws  : Error("unknown store: …") for an unconfigured slug
function resolveConf(store, variant) {
  if (!knownStore(store)) {
    throw new Error(`unknown store: ${store} (use ${[...KNOWN_STORES].join('|')})`);
  }

  let chosen = variant;
  if (chosen !== 'big' && chosen !== 'small') {
    const bigRvf = path.join(storeDir(store), `${store}-kb.big.rvf`);
    chosen = fs.existsSync(bigRvf) ? 'big' : 'small';
  }

  const files = variantPaths(store, chosen);

  // For the big variant, borrow the small build's passages/meta paths when the big ones are
  // missing on disk.
  if (chosen === 'big') {
    const fallback = variantPaths(store, 'small');
    for (const key of ['passages', 'meta']) {
      if (!fs.existsSync(files[key])) files[key] = fallback[key];
    }
  }

  // Embedder config: MiniLM defaults, overlaid with the sidecar when it exists and parses.
  // An unreadable sidecar is reported only under KB_DEBUG and the defaults are kept.
  let embedCfg = { ...MINILM_CFG };
  if (fs.existsSync(files.embedCfgPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(files.embedCfgPath, 'utf8'));
      embedCfg = { ...MINILM_CFG, ...parsed };
    } catch (e) {
      if (process.env.KB_DEBUG) console.error(`[ask-kb] bad embed.json (${files.embedCfgPath}): ${e.message}`);
    }
  }

  return { ...files, embedCfg, variant: chosen };
}
|
|
151
|
+
|
|
152
|
+
// ---------- embedder (lazy, per-model, offline-first with remote fallback) ----------
|
|
153
|
+
// Cached per model name so a single process can serve both the small (MiniLM) and big
|
|
154
|
+
// (bge) builds without reloading. Remote download is allowed only when THAT model isn't
|
|
155
|
+
// already cached locally (so a Seed-only box never reaches for the network).
|
|
156
|
+
// model name -> loaded feature-extraction pipeline (filled by getEmbedder).
const _feCache = new Map();

// Return the feature-extraction pipeline for `model`, loading it on first use.
//   model   : model id passed to the transformers `pipeline` factory
//   returns : Promise resolving to the pipeline (the same object on later calls)
// Side effects on the transformers module: sets env.localModelPath to the model cache and
// env.allowRemoteModels to true only when <modelCache>/<model> does not exist on disk.
async function getEmbedder(model) {
  if (_feCache.has(model)) {
    return _feCache.get(model);
  }

  const { T, modelCache, via } = await loadTransformers();
  T.env.localModelPath = modelCache;
  const cachedOnDisk = fs.existsSync(path.join(modelCache, model));
  T.env.allowRemoteModels = !cachedOnDisk;

  if (process.env.KB_DEBUG) {
    const origin = T.env.allowRemoteModels ? 'remote download' : 'local';
    console.error(`[ask-kb] transformers via: ${via} | model ${model} | cache: ${modelCache} (${origin})`);
  }

  const extractor = await T.pipeline('feature-extraction', model, { quantized: true });
  _feCache.set(model, extractor);
  return extractor;
}
|
|
170
|
+
|
|
171
|
+
// Embed a QUERY with the build's embedder config. bge-style builds carry a queryPrefix
|
|
172
|
+
// (asymmetric retrieval — passages were embedded with NO prefix at build time, queries
|
|
173
|
+
// get the instruction prefix here); MiniLM uses no prefix and mean pooling.
|
|
174
|
+
// Embed one query string with the given embedder config.
//   text    : the query
//   cfg     : { model, queryPrefix, pooling, normalize }; missing/falsy fields fall back to the
//             MiniLM model, no prefix, 'mean' pooling; normalize is on unless it is exactly false
//   returns : Promise<Float32Array> copied from the pipeline output's `data`
async function embed(text, cfg = MINILM_CFG) {
  const modelName = cfg.model || MINILM_CFG.model;
  const extractor = await getEmbedder(modelName);

  const prefix = cfg.queryPrefix || '';
  const options = {
    pooling: cfg.pooling || 'mean',
    normalize: cfg.normalize !== false,
  };

  const output = await extractor([prefix + text], options);
  return Float32Array.from(output.data);
}
|
|
182
|
+
|
|
183
|
+
// ---------- passages sidecar loader ----------
|
|
184
|
+
// Returns { byId, byPath } where:
|
|
185
|
+
// byId : Map id(str) -> { id(num), text, path, title }
|
|
186
|
+
// byPath : Map path -> [ {id,text,...}, ... ] sorted by numeric id (== chunk order)
|
|
187
|
+
// Numeric id order reconstructs document chunk order: the builder assigns ids sequentially
|
|
188
|
+
// while walking a document, so a path's chunks are id-ordered (verified on both KBs).
|
|
189
|
+
// Stream-load the passages sidecar (.passages.jsonl, one JSON object per line).
//   file     : path to the sidecar
//   returns  : Promise<{ byId, byPath }>
//                byId   : Map String(id) -> { id(Number), text, path, title }
//                byPath : Map path -> records for that path, sorted by numeric id
//   rejects  : Error("passages sidecar not found: …") when the file does not exist, or the
//              underlying stream error when the file cannot be read
// Blank lines and lines that fail to parse are skipped (best effort). Missing text/path/title
// are replaced by '' / '(unknown path)' / '(unknown title)'.
function loadPassages(file) {
  return new Promise((resolve, reject) => {
    const byId = new Map();
    const byPath = new Map();
    if (!fs.existsSync(file)) return reject(new Error(`passages sidecar not found: ${file}`));

    const input = fs.createReadStream(file, 'utf8');
    const rl = readline.createInterface({ input, crlfDelay: Infinity });

    // Read failures (EISDIR, EACCES, file removed after the existence check, …) are raised on
    // the STREAM. Listen there directly rather than relying on readline to re-emit them, so the
    // promise always settles instead of the process dying on an unhandled 'error' event.
    let failed = false;
    const fail = (err) => {
      if (failed) return;
      failed = true;
      rl.close();
      input.destroy();
      reject(err);
    };
    input.on('error', fail);
    rl.on('error', fail);

    rl.on('line', (line) => {
      if (!line.trim()) return;
      try {
        const o = JSON.parse(line);
        const rec = { id: Number(o.id), text: o.text || '', path: o.path || '(unknown path)', title: o.title || '(unknown title)' };
        byId.set(String(o.id), rec);
        if (!byPath.has(rec.path)) byPath.set(rec.path, []);
        byPath.get(rec.path).push(rec);
      } catch { /* skip malformed line */ }
    });

    rl.on('close', () => {
      if (failed) return; // closed by fail(); the promise is already rejected
      for (const arr of byPath.values()) arr.sort((a, b) => a.id - b.id);
      resolve({ byId, byPath });
    });
  });
}
|
|
212
|
+
|
|
213
|
+
// ---------- kind metadata sidecar loader ----------
|
|
214
|
+
// The passages sidecar (.passages.jsonl) carries only {id,text,path,title}. The per-chunk
|
|
215
|
+
// `kind` (source / crate-src / adr / doc / doc-deep / primer-orientation / …) lives in the
|
|
216
|
+
// build metadata sidecar (.ids.json / .meta.json) keyed by the SAME numeric id. The intent
|
|
217
|
+
// layer (code-vs-doc routing, ADR-vs-code pairing, PRIMER detection) needs `kind`, so we load
|
|
218
|
+
// it once and fold it down to a per-PATH kind. If the sidecar is missing the layer degrades
|
|
219
|
+
// gracefully (kind unknown -> no kind-based adjustments; vector+rerank still works).
|
|
220
|
+
// Load the build metadata sidecar and fold its per-entry `kind` down to one kind per path.
//   file    : path to the sidecar JSON (expects a top-level `entries` object whose values carry
//             `path` and `kind`)
//   returns : Map path -> most frequent kind among that path's entries (on a tie, the kind seen
//             first wins). Empty map when `file` is falsy, missing, or unreadable/unparseable.
function loadKinds(file) {
  const byPathKind = new Map();
  try {
    if (!file || !fs.existsSync(file)) return byPathKind;

    const parsed = JSON.parse(fs.readFileSync(file, 'utf8'));

    // Pass 1: tally kinds per path. Entries lacking a path or a kind are ignored.
    const tallies = new Map(); // path -> Map(kind -> count)
    for (const entry of Object.values(parsed.entries || {})) {
      if (!entry || !entry.path || !entry.kind) continue;
      let perKind = tallies.get(entry.path);
      if (!perKind) {
        perKind = new Map();
        tallies.set(entry.path, perKind);
      }
      perKind.set(entry.kind, (perKind.get(entry.kind) || 0) + 1);
    }

    // Pass 2: keep the kind with the highest count (strict ">" so the earliest kind wins ties).
    for (const [docPath, perKind] of tallies) {
      let dominant = null;
      let top = -1;
      for (const [kind, count] of perKind) {
        if (count > top) {
          dominant = kind;
          top = count;
        }
      }
      byPathKind.set(docPath, dominant);
    }
  } catch { /* sidecar unreadable -> empty map, graceful degrade */ }
  return byPathKind;
}
|
|
241
|
+
|
|
242
|
+
// A path is "source code" if its dominant kind is a code kind.
|
|
243
|
+
// Kinds that count as source code.
const SOURCE_KINDS = new Set(['source', 'crate-src', 'example']);

// True when `kind` is one of the code kinds listed in SOURCE_KINDS.
function isSourceKind(kind) {
  const isCode = SOURCE_KINDS.has(kind);
  return isCode;
}
|
|
245
|
+
|
|
246
|
+
// ===================================================================================
|
|
247
|
+
// SPECIFIC-ENTITY DETECTION (FIX 1 — orientation over-fire). A query that names a SPECIFIC
|
|
248
|
+
// entity (a crate, an ADR id, a filename/.rs token, or a Capitalized multiword proper noun)
|
|
249
|
+
// is NOT a generic product-orientation question, even if it begins "what does …". For such a
|
|
250
|
+
// query we suppress the generic PRIMER-orientation lift AND demote primer-orientation docs so a
|
|
251
|
+
// vector-closer deep doc (source / adr / crate-src / doc) wins. The crate-INVENTORY archetype
|
|
252
|
+
// ("which crates make up X") is handled separately (it carries no hyphen-crate token) and still
|
|
253
|
+
// routes to the inventory PRIMER.
|
|
254
|
+
// ===================================================================================
|
|
255
|
+
|
|
256
|
+
// Build the set of component-style path prefixes actually present in the KB (data-driven, so the
|
|
257
|
+
// detector never fires on a generic hyphenated word like "end-to-end" — only on real components).
|
|
258
|
+
// Prefixes are taken from the store's componentRoots segments (e.g. `packages/<name>` for AHG,
|
|
259
|
+
// `crates/<name>` / `v2/crates/<name>` for a Rust monorepo) — NO hard-coded root name.
|
|
260
|
+
// Collect the component names that actually occur in the KB's paths.
//   byPath  : Map keyed by document path (only the keys are read)
//   store   : store slug (selects the component roots)
//   returns : Set of lower-cased component names — the first "<root>/<name>" match in each path,
//             where <name> is two or more characters from [a-z0-9-] starting with a letter/digit
function crateTokenSet(byPath, store) {
  const componentRe = new RegExp(componentPrefixSrc(store, '([a-z0-9][a-z0-9-]+)'), 'i');
  const names = [];
  for (const docPath of byPath.keys()) {
    const hit = docPath.match(componentRe);
    if (hit) {
      names.push(hit[1].toLowerCase());
    }
  }
  return new Set(names);
}
|
|
269
|
+
|
|
270
|
+
// A capitalized multiword proper noun, e.g. "Coherent Human Channel" / "RuvSense Domain".
|
|
271
|
+
// Two or more consecutive whitespace-separated words, each an uppercase letter followed by one or
// more lowercase letters/digits. Carries the `g` flag: specificEntity() consumes it through
// String.prototype.matchAll; direct .test()/.exec() calls on it would be stateful (lastIndex).
const PROPER_NOUN_RE = /\b([A-Z][a-z0-9]+(?:\s+[A-Z][a-z0-9]+){1,})\b/g;
|
|
272
|
+
// A file / .rs token, e.g. "lib.rs", "main.rs", "versioning.rs", "config.toml".
|
|
273
|
+
// Case-insensitive; the basename part allows only letters, digits and "_" and the extension must
// be one of the listed ones. No `g` flag, so .test() is stateless.
const FILE_TOKEN_RE = /\b[a-z0-9_]+\.(rs|toml|md|ts|js|mjs|py|json|yaml|yml)\b/i;
|
|
274
|
+
// Common English / orientation words that may appear Title-Cased ("How Complete Is RuVector",
|
|
275
|
+
// "Getting Started Guide") — a proper-noun candidate built ONLY from these is NOT a named entity, so
|
|
276
|
+
// Title-Cased orientation queries still route correctly. (The product name is also a common word, so
|
|
277
|
+
// "RuVector"/"RuView" alone in a Title-Cased orientation query doesn't count as a specific entity.)
|
|
278
|
+
// Lower-case words that do NOT make a Title-Cased phrase a "named entity". specificEntity() looks
// up each whitespace-separated token of a proper-noun candidate (lower-cased) in this set.
// NOTE(review): 'rust core' contains a space, so that per-token lookup can never hit it ('rust'
// and 'core' are already listed individually) — confirm whether any other caller needs it.
const COMMON_TITLE_WORDS = new Set(['how','what','when','where','why','which','who','is','are','the','a','an',
  'and','or','of','to','in','for','on','with','do','does','complete','mature','maturity','production',
  'ready','overview','introduction','getting','started','start','guide','setup','install','use','using',
  'core','capabilities','capability','feature','features','concept','concepts','docs','documentation',
  'tutorial','tutorials','example','examples','crate','crates','inventory','about','it','this','that',
  'ruvector','ruview','playbook','quickstart','end','reference',
  // Product feature names / host names that appear capitalized in orientation queries but do NOT
  // indicate a specific-entity deep-dive (MetaHarness orientation queries often say "Darwin Mode",
  // "Claude Code", "Rust Core", "Mode", "Claude", "Code", "Layer" etc. — treat as orientation terms).
  'darwin','mode','claude','code','harness','metaharness','layer','layers','surface','rust','rust core',
  'host','hosts','adapter','adapters','kernel','wizard','composer','scaffold','stage','stages','genome',
  'router','factory','framework','model','wrapper','account','server','threat','scan','posture']);
|
|
290
|
+
|
|
291
|
+
// Does the query name a SPECIFIC entity? Returns the matched hyphen-crate token(s) (lowercased)
|
|
292
|
+
// plus a boolean. Used to (a) suppress generic orientation lift, (b) demote primer-orientation,
|
|
293
|
+
// (c) drive the crate-overview README/BENCHMARK boost (FIX 2).
|
|
294
|
+
// Decide whether a query names a SPECIFIC entity.
//   query       : the raw user query
//   crateTokens : Set of known lower-cased component names (see crateTokenSet)
//   returns     : { named, crates }
//                   crates : lower-cased hyphenated query tokens that relate to a known component
//                   named  : true when crates is non-empty, or the query contains an ADR id
//                            ("adr-12", "ADR 12", …), a file token, or a Title-Cased multiword
//                            phrase with at least one word outside COMMON_TITLE_WORDS
function specificEntity(query, crateTokens) {
  const hyphenTokens = (query.match(/\b[a-z][a-z0-9]+-[a-z0-9][a-z0-9-]*\b/gi) || [])
    .map((t) => t.toLowerCase());

  // Materialize the set once (and only when there is something to compare) instead of spreading
  // it again for every hyphen token.
  const known = hyphenTokens.length ? [...crateTokens] : [];

  // Keep a hyphen token when it IS a known component, is the "-"-bounded prefix of one, or
  // starts with one.
  // NOTE(review): the last test (`t.startsWith(c)`) has no "-" boundary, so a short component
  // name such as "cli" also accepts "client-side" — confirm whether that is intended.
  const crates = hyphenTokens.filter((t) =>
    crateTokens.has(t) || known.some((c) => c.startsWith(`${t}-`) || t.startsWith(c)));

  // A Title-Cased multiword phrase counts only if some word in it is not a common/orientation
  // word, so "Getting Started Guide" does not register but "Coherent Human Channel" does.
  let hasProperNoun = false;
  for (const m of query.matchAll(PROPER_NOUN_RE)) {
    const toks = m[1].split(/\s+/);
    if (toks.some((w) => !COMMON_TITLE_WORDS.has(w.toLowerCase()))) { hasProperNoun = true; break; }
  }

  const hasAdr = /\badr[-\s_]?\d{1,4}\b/i.test(query);
  const hasFile = FILE_TOKEN_RE.test(query);
  const named = crates.length > 0 || hasAdr || hasFile || hasProperNoun;
  return { named, crates };
}
|
|
314
|
+
|
|
315
|
+
// Demotion penalty added to a primer-orientation doc when the query names a specific entity.
|
|
316
|
+
// Pushes primer-orientation BELOW source/adr/crate-src/doc for that query (FIX 1). Large enough to
|
|
317
|
+
// out-weigh the (now-suppressed) generic lift but applied as a positive penalty on eff distance.
|
|
318
|
+
// Penalty magnitude for primer-orientation docs when the query names a specific entity.
const PRIMER_DEMOTE_WHEN_SPECIFIC = 0.60;
|
|
319
|
+
|
|
320
|
+
// ===================================================================================
|
|
321
|
+
// Retrieval-quality layer (retrieval-only; KBs are NOT rebuilt).
|
|
322
|
+
// FIX 1 whole-document return, FIX 2 demote low-signal files,
|
|
323
|
+
// FIX 3 exact-term/ADR/title boost, FIX 4 "Cognitum Seed" disambiguation.
|
|
324
|
+
// ===================================================================================
|
|
325
|
+
|
|
326
|
+
// Upper bound, in characters, on an assembled whole-document result.
const MAX_DOC_CHARS = 12000; // cap for an assembled full document
|
|
327
|
+
// Chunks fetched from the vector index to group into documents and rerank. Kept generous so the
|
|
328
|
+
// small TOP-DOWN ORIENTATION LAYER (a few dozen `PRIMER#` section chunks) reliably enters the
|
|
329
|
+
// candidate pool for orientation queries — a tiny synthesized section can sit just outside a
|
|
330
|
+
// narrow window yet be the correct whole-document answer once reranked (FIX 5). Reranking is
|
|
331
|
+
// order-stable for the closest deep docs, so widening does not disturb non-orientation results.
|
|
332
|
+
// Number of raw chunk hits to pull from the vector index before grouping and reranking.
const RAW_HITS = 96;
|
|
333
|
+
|
|
334
|
+
// FIX 2 — low-signal path patterns and the query keyword that *re-enables* each.
|
|
335
|
+
// A penalty is added to a doc's effective distance UNLESS the query mentions the kind.
|
|
336
|
+
// Low-signal path table. Each row:
//   re    : pattern tested against a document path
//   pen   : penalty amount associated with a matching path
//   allow : pattern tested against the query; a query that mentions the kind re-enables it
const LOW_SIGNAL = [
  { re: /(^|\/)readme[^/]*$/i, pen: 0.18, allow: /\breadme\b/i },
  { re: /-checklist\.md$/i, pen: 0.15, allow: /\bchecklist\b/i },
  { re: /overview[^/]*\.md$/i, pen: 0.10, allow: /\boverview\b/i }, // TOC / link-list pages
  { re: /(^|\/)(index|toc|table-of-contents)[^/]*\.md$/i, pen: 0.10, allow: /\b(index|toc|contents)\b/i },
  { re: /(^|\/)archive\//i, pen: 0.20, allow: /\barchiv/i },
  { re: /(^|\/)examples?\/.*\.rs$/i, pen: 0.18, allow: /\bexamples?\b/i },
  { re: /(^|\/)benches?\//i, pen: 0.22, allow: /\b(bench|benchmark)/i },
  { re: /(^|\/)tests?\//i, pen: 0.16, allow: /\btest/i },
  { re: /(_test\.rs|\.test\.[jt]s|_spec\.rb)$/i, pen: 0.16, allow: /\btest/i },
];
|
|
347
|
+
|
|
348
|
+
// FIX 4 — "Cognitum Seed" product disambiguation. When the query is about the Seed
|
|
349
|
+
// product/onboarding, bias toward onboarding/Seed docs and away from RNG/pretraining seeds.
|
|
350
|
+
// Query-side trigger: "cognitum seed", "seed onboard…/pipeline/product", or "onboard… seed".
const SEED_QUERY_RE = /\b(cognitum\s+seed|seed\s+(onboard\w*|pipeline|product)|onboard\w*\s+seed)\b/i;
// Markers of Seed-product / onboarding material (ADR-069, ADR-116, a path segment starting with
// "seed" or "cog-", or "onboard" anywhere).
const SEED_GOOD_RE = /(adr[-_]?069|adr[-_]?116|(^|\/)seed|onboard|(^|\/)cog-)/i;
// Markers of the OTHER meaning of "seed" (random-number / pretraining seeds). Unanchored
// substring match.
const SEED_BAD_RE = /(rng|random|pretrain|nvsim|prng|np\.random|torch\.manual_seed)/i;
|
|
353
|
+
|
|
354
|
+
// FIX 5 — TOP-DOWN ORIENTATION LAYER. The primers are indexed as synthetic `PRIMER#<section>`
|
|
355
|
+
// documents (kind 'primer-orientation') that synthesize the answers to the six comprehension-
|
|
356
|
+
// journey archetypes a raw repo lacks: what-is-it / concepts / how-each-works / maturity /
|
|
357
|
+
// where-are-the-docs / how-to-use-end-to-end. When a query is one of those top-down orientation
|
|
358
|
+
// questions, bias toward the matching PRIMER section so the SYNTHESIZED answer wins over a deep
|
|
359
|
+
// ADR/source fragment. Deep ADR/source still wins for narrow how-X-works questions (no orient cue).
|
|
360
|
+
// Matches a path that starts with the literal "PRIMER#" prefix (case-sensitive).
const PRIMER_PATH_RE = /^PRIMER#/;
|
|
361
|
+
// Generic orientation cue: the query is asking to be oriented to the product as a whole.
|
|
362
|
+
// Case-insensitive test for a generic "orient me to the product" query. Built as an alternation
// of cue patterns, one per array entry. Every cue starts at a word boundary — the first one
// previously did not, so e.g. "somewhat is" was read as a "what is" question.
const ORIENT_QUERY_RE = new RegExp([
  '\\bwhat\\s+(is|are|does)\\b', // "what is X" / "what does X do" / "what are the concepts"
  '\\bwhat\\s+can\\b',
  '\\bcore\\s+(capabilit|concept|feature)', // "core capabilities/concepts"
  '\\bcapabilit(y|ies)\\b',
  '\\bhow\\s+(mature|complete)\\b', // maturity archetype
  '\\b(production|experimental)\\b',
  '\\bwhat\\s+works\\b',
  '\\bwhere\\s+(is|are)\\b.*\\b(doc|documentation|adr)', // docs/ADR-location archetype
  '\\bdocumentation\\b.*\\badr',
  '\\badr\\s+index\\b',
  '\\b(install|set\\s*up|setup|use)\\b.*\\bend[\\s-]*to[\\s-]*end\\b', // end-to-end usage
  '\\bend[\\s-]*to[\\s-]*end\\b',
  '\\bget(ting)?\\s+started\\b',
  '\\boverview\\b',
].join('|'), 'i');
|
|
378
|
+
|
|
379
|
+
// Archetype → words that, when present in BOTH the query and a PRIMER section's title/path, mean
|
|
380
|
+
// THIS section is the better-routed orientation answer (e.g. "where are the ADRs" -> the section
|
|
381
|
+
// titled "ADR index"). Used to nudge between competing PRIMER sections so the closest-titled one
|
|
382
|
+
// wins, without overriding the generic orientation lift. Each matched cue adds a small extra boost.
|
|
383
|
+
// Route-cue table consumed by orientationBoost(). Each row:
//   q   : pattern tested against the query
//   sec : pattern tested against "<path> <title>" of a PRIMER section
//   w   : boost added when BOTH patterns match (boosts from several rows accumulate)
const PRIMER_ROUTE_CUES = [
  { q: /\b(adr|decision\s+record)/i, sec: /\badr\b|decision/i, w: 0.20 },
  { q: /\b(doc|documentation)/i, sec: /\bdoc|where everything lives|tutorial/i, w: 0.10 },
  { q: /\b(mature|maturity|complete|production|experimental|works|graded|honest)/i, sec: /matur|gotcha|graded|honest|complete/i, w: 0.18 },
  { q: /\b(capabilit|concept|feature)/i, sec: /capabilit|concept|crate inventory|big/i, w: 0.16 },
  { q: /\b(install|set\s*up|setup|quickstart|get\s*started|use|end[\s-]*to[\s-]*end|playbook)/i, sec: /executive summary|install|quickstart|playbook|knowledge base|use it/i, w: 0.14 },
  { q: /\b(crate|inventory)/i, sec: /crate inventory|inventory/i, w: 0.16 },
];
|
|
391
|
+
|
|
392
|
+
// Returns a NON-NEGATIVE amount to SUBTRACT from a PRIMER document's effective distance when the
|
|
393
|
+
// query is an orientation question. The generic lift makes the synthesized layer beat a vector-
|
|
394
|
+
// closer deep doc; the route cues then nudge between PRIMER sections toward the best-titled one.
|
|
395
|
+
// Gentle enough that a clearly-better deep match still wins for narrow how-X-works questions.
|
|
396
|
+
// Amount to subtract from a PRIMER document's effective distance.
//   query           : the user query
//   path            : document path; anything not matching PRIMER_PATH_RE gets 0
//   title           : document title (optional), searched together with the path by the cues
//   suppressGeneric : when true the generic lift is skipped and only route cues contribute
//   returns         : generic lift (0.55 for an orientation query, else 0.12, or 0 when
//                     suppressed) plus the summed weights of every matching route cue
function orientationBoost(query, path, title = '', suppressGeneric = false) {
  if (!PRIMER_PATH_RE.test(path)) {
    return 0;
  }

  let generic = 0;
  if (!suppressGeneric) {
    generic = ORIENT_QUERY_RE.test(query) ? 0.55 : 0.12;
  }

  // A cue applies only when its query pattern AND its section pattern both match.
  const haystack = `${path} ${title}`;
  const route = PRIMER_ROUTE_CUES
    .filter((cue) => cue.q.test(query) && cue.sec.test(haystack))
    .reduce((sum, cue) => sum + cue.w, 0);

  return generic + route;
}
|
|
409
|
+
|
|
410
|
+
// ===================================================================================
|
|
411
|
+
// INTENT ROUTING LAYER (the second structural fix). MiniLM collapses every top-down
|
|
412
|
+
// "orientation" query onto the generic "what X is" PRIMER section, and implementation
|
|
413
|
+
// queries don't reliably surface code. This adds DETERMINISTIC intent classification on
|
|
414
|
+
// top of the vector+rerank pipeline: it (a) detects an orientation archetype and force-routes
|
|
415
|
+
// to the matching PRIMER#<slug> for that store, (b) hard-routes an exact "ADR-NNN" query to
|
|
416
|
+
// the real ADR document (beating the index table), (c) tilts ranking toward code or toward
|
|
417
|
+
// design docs by intent, and (d) guarantees an ADR proposal is paired with its built source.
|
|
418
|
+
// All of this is layered as effective-distance adjustments / hard rank overrides — no rebuild.
|
|
419
|
+
// ===================================================================================
|
|
420
|
+
|
|
421
|
+
// Per-store map of orientation archetype -> the EXACT PRIMER# slug that answers it. Slugs were
|
|
422
|
+
// discovered from the live sidecars (grep 'PRIMER#…' on the ids/meta files); only real slugs are
|
|
423
|
+
// listed. `adr` is the docs-location archetype's ADR-specific sub-target (the ADR index table).
|
|
424
|
+
// Legacy hard-coded PRIMER slug maps are kept ONLY as a fallback for the prototype stores so an old
|
|
425
|
+
// ruvector/ruview KB still routes; new targets supply slugs via config (an explicit per-archetype
|
|
426
|
+
// map) or use primerSlugs:'auto' (slugs discovered from the live sidecar at query time — see
|
|
427
|
+
// resolvePrimerSlug). NO new repo names are added here.
|
|
428
|
+
// Fallback table: store name -> (orientation archetype -> PRIMER#<slug>). A `null` entry means
// the store has no section for that archetype; resolvePrimerSlug returns null for it.
const LEGACY_PRIMER_SLUGS = {
  ruvector: {
    maturity: 'PRIMER#8-maturity-gotchas',
    capabilities: 'PRIMER#2-the-big-capabilities-and-how-to-actually-call-them',
    docs: 'PRIMER#5-docs-tutorials-examples-skills',
    adr: 'PRIMER#4-adr-index-the-complete-table-208-main-series-files-in-docs-adr-54-in-4-sub-ser',
    playbook: 'PRIMER#0-executive-summary-which-crate-do-i-need',
    whatis: 'PRIMER#1-what-ruvector-is',
    crates: 'PRIMER#3-complete-crate-inventory',
    hardware: null,
    glossary: null,
  },
  ruview: {
    // maturity and capabilities intentionally share the same section for this store.
    maturity: 'PRIMER#7-capabilities-graded-honestly',
    capabilities: 'PRIMER#7-capabilities-graded-honestly',
    docs: 'PRIMER#9-docs-tutorials-scripts-firmware-where-everything-lives',
    adr: 'PRIMER#8-the-complete-adr-index-160-adr-numbered-files-156-unique-numbers',
    playbook: 'PRIMER#0-1-instant-playbooks-task-exact-steps',
    whatis: 'PRIMER#1-what-ruview-is',
    crates: 'PRIMER#3-the-crates-v2-workspace-39-incl-the-ruv-neural-git-submodule-and-homecore-plug',
    hardware: 'PRIMER#10-hardware-matrix',
    glossary: 'PRIMER#0-3-glossary-so-terms-are-never-guessed',
  },
};
|
|
452
|
+
|
|
453
|
+
// Keyword cues per archetype used by 'auto' slug discovery (matched against a PRIMER#<slug> path).
|
|
454
|
+
// First section whose slug matches the archetype's cues wins; whatis prefers an early "what-is"/#1.
|
|
455
|
+
// Archetype -> ordered list of regex cues, each tested against a `PRIMER#<slug>` path.
const AUTO_SLUG_CUES = {
  // The section-number cue is anchored on `#` (or the start of the string) and must be followed
  // by `-` or end-of-string. The previous `/#?1[-\b]/` treated `\b` as a backspace (it sits inside
  // a character class) and, with the optional `#`, matched any bare "1-" — so
  // `PRIMER#0-1-instant-playbooks…` and `PRIMER#11-…` were mistaken for section #1.
  whatis: [/what[-\s]?is/i, /(?:^|#)1(?:-|$)/i, /overview/i, /introduction/i],
  capabilities: [/capabilit/i, /feature/i, /what.*can/i],
  crates: [/inventory/i, /package/i, /crate/i, /module/i, /component/i],
  maturity: [/matur/i, /gotcha/i, /graded/i, /honest/i, /production/i, /limit/i],
  docs: [/docs?/i, /where.*live/i, /tutorial/i, /reference/i],
  adr: [/\badr\b/i, /decision/i, /index/i],
  playbook: [/playbook/i, /quickstart/i, /get[-\s]?started/i, /executive/i, /usage/i, /how[-\s]?to/i],
  hardware: [/hardware/i, /board/i, /device/i],
  glossary: [/glossary/i, /terms/i],
};
|
|
466
|
+
|
|
467
|
+
// Resolve the PRIMER#<slug> for an archetype, store, and the live sidecar paths.
|
|
468
|
+
// 1. explicit config map (target.primerSlugs is an object): use it.
|
|
469
|
+
// 2. legacy hard-coded map (ruvector/ruview): use it.
|
|
470
|
+
// 3. 'auto' (or anything else): discover from byPath PRIMER#… keys via AUTO_SLUG_CUES.
|
|
471
|
+
/**
 * Pick the PRIMER#<slug> that answers `archetype` for `store`.
 * Precedence: explicit config map, then the legacy table, then cue-based discovery over the
 * keys of `byPath`.
 * @param {string|null} archetype orientation archetype name
 * @param {string} store store identifier
 * @param {Map<string, unknown>|null|undefined} byPath sidecar entries keyed by path
 * @returns {string|null} the chosen slug/path, or null when nothing applies
 */
function resolvePrimerSlug(archetype, store, byPath) {
  if (!archetype) return null;

  // 1. Explicit per-archetype map supplied by the target config.
  const explicit = cfgFor(store)?.primerSlugs;
  if (explicit && typeof explicit === 'object') {
    return explicit[archetype] ?? null;
  }

  // 2. Hard-coded fallback for the prototype stores.
  const legacy = LEGACY_PRIMER_SLUGS[store];
  if (legacy) {
    return legacy[archetype] ?? null;
  }

  // 3. Discovery: first PRIMER path (in sorted order) that matches any cue of the archetype.
  const patterns = AUTO_SLUG_CUES[archetype];
  if (!patterns || !byPath) return null;

  const candidates = Array.from(byPath.keys())
    .filter((key) => PRIMER_PATH_RE.test(key))
    .sort();
  const winner = candidates.find((key) => patterns.some((re) => re.test(key)));
  return winner ?? null;
}
|
|
486
|
+
|
|
487
|
+
// The store's PRODUCT NAME matcher, built from config.productNames (see productNameRe above). A
|
|
488
|
+
// what-is/concept query naming ONLY the product (no other concrete concept noun) is a product
|
|
489
|
+
// overview -> force-route to the "what X is" primer; a query carrying a concrete concept noun is
|
|
490
|
+
// NOT a product overview -> let vector+rerank find the DEFINING doc.
|
|
491
|
+
// Per-store memo of the product-name matcher; a store is computed at most once.
const _productReCache = new Map();

/**
 * Memoized wrapper around productNameRe(store).
 * @param {string} store store identifier
 * @returns {*} whatever productNameRe(store) produced the first time it was called for `store`
 */
function productRe(store) {
  if (_productReCache.has(store)) {
    return _productReCache.get(store);
  }
  const built = productNameRe(store);
  _productReCache.set(store, built);
  return built;
}
|
|
496
|
+
|
|
497
|
+
// Archetype detectors, ORDERED most-specific-first (first match wins). Each regex tests the raw
|
|
498
|
+
// query. Patterns mirror the spec's intent buckets. `adr` is folded into `docs` but additionally
|
|
499
|
+
// flips the docs target to the ADR-index slug when the query is specifically about ADRs.
|
|
500
|
+
// Ordered list of { name, re } detectors. classifyArchetype walks this array in order and stops at
// the first entry whose `re` matches the query, so the position of an entry is part of its meaning.
const ARCHETYPE_RES = [
  // maturity / production-readiness / "how good/solid/reliable" / works-today-vs-experiment /
  // host counts / release status / should-I / which-are-newer / what-is-NOT.
  // NOTE: "not a" is intentionally NARROW — only specific maturity-context negations to avoid
  // catching "why is it called a factory and NOT a framework" (a whatis query).
  // The \bNOT[?!]?\s*$ tail catches "What is metaharness NOT?" (query ends in NOT + punctuation).
  { name: 'maturity', re: /(\b(mature|maturity|production[- ]?ready|production\b|how (good|solid|reliable|complete)|how complete|is it (ready|done|complete)|works?\b.*\b(experiment|stub|today|yet)|ready for production|battle[- ]?tested|graded honestly|how many\b.*(stable|newer|addition|host)|which\b.*(stable|newer|addition)|release status|release pipeline|what release|not a (?:chatbot|no[- ]code|hosted service|fine[- ]?tune)|should i assert|fixed test count|honest limits|posture\b|default[- ]deny|guarantee privacy|privacy guarantee|real hardware|simulator or hardware|hardware or a? ?simulator|out of scope|beat a cnn|beats? a cnn|state[- ]of[- ]the[- ]art)\b|\bNOT[?!]?\s*$)/i },
  // capabilities / features / "what can it do" / tool actions / sub-commands / specific tools
  // (score, genome, mcp-scan, threat-model, Darwin Mode, router, execute, cost, npm audit).
  // Capabilities BEFORE composer/crates so "What does Darwin Mode do" / "What does genome report"
  // / "What is the npm audit for agent tools command" routes here, not to whatis or composer.
  { name: 'capabilities', re: /\b(capabilit(y|ies)|what can (it|the tool) do|what can ruv\w+ do|what can photonlayer do|features?\b|what does it (do|offer)|what does ruv\w+ (do|offer)|big (capabilities|features)|does\b.*\bexecute\b|ever execute|how does\b.*\bcut\b|how (do i |to )?score\b|genome\b.*\breport|what does\b.*(genome|darwin|threat[- ]?model|mcp[- ]scan|router|mode) (do|report|produce)|what artifact\b|what can the\b|\bgenome\b|\bdarwin mode\b|mcp[- ]scan\b|threat[- ]model\b|npm audit\b|audit for agent\b|how much\b.*\bcompress|compress(es|ion)?\b.*\bcapture|what accuracy\b|gradient training\b.*\breach|reach\b.*\baccuracy|what does the (receipt|blake3 receipt)|receipt prove|reproduce the (exact|same)|multi[- ]?plane cascade\b.*(achieve|do))\b/i },
  // docs / tutorials / examples / ADRs / "where do I find/read" / "which doc covers/gives/lives" /
  // "where is X described/documented". BEFORE composer and crates so "where is the composer 9-stage
  // flow documented" / "where is the three-layer model described" route to docs (PRIMER#6), not
  // composer (PRIMER#4) or crates (PRIMER#3).
  { name: 'docs', re: /\b(where (are|is|can i find|do i find|do i read).*(doc|documentation|tutorial|example|adr|guide|find|live|architecture|exist|why|described|documented)|documentation\b|tutorials?\b|list of adrs?|adr index|where everything lives|where.*\b(docs?|guides?)\b|which doc\b|where is.*\b(documented|described)\b|where does.*live\b|read[- ]in[- ]order\b|adr (series|index)\b|plain[- ]language usage\b)\b/i },
  // composer / scaffold stages / overlays / HarnessChoice / template selection — PRIMER#4.
  // "scaffold" only matches when NOT preceded by "one-liner" or "show" (those are playbook).
  // "composer" without "where" context to avoid catching "where is the composer flow documented".
  { name: 'composer', re: /\b(how does\b.*\b(composer\b.*scaffold|scaffold\b.*\bharness)\b|how many stages\b|template overlays?\b|overlay\b.*merge|merge\b.*overlay|default agents?\b|default skills?\b|harnesschoice\b|which (composer )?stage\b|toggles? kernel\b|7[- ]arc\b|teaching outline\b|last\b.*\bstage\b|stage\b.*\bgeneration\b|primitives\b.*\btoggle\b|9 stages?\b|what object drives\b|drives template\b|(optical )?pipeline\b.*\bstep[- ]by[- ]step\b|step[- ]by[- ]step\b.*\bpipeline\b|how (does|do)\b.*\b(optical )?pipeline\b.*\bwork\b|how (does|do)\b.*\b(a |the )?(quantum )?circuit\b.*\b(run|works?|simulat)|how (do i |to )?run a (circuit|simulation)\b|run a simulation\b.*\b(in javascript|entirely in|client-?side)|build (and run )?a bell state|steps to (build|run)|how does\b.*\bpick which backend|how does\b.*\bbackend\b.*\b(work|chosen|selected|pick)|how does\b.*\bspeed up\b.*\bsimulat|stages?\b.*\bfrom\b.*\b(image|input)\b.*\bto\b.*\b(decision|answer|output)\b)\b/i },
  // hardware / boards / devices — enumeration of supported physical hardware
  { name: 'hardware', re: /\b(hardware|boards?|devices?|which (chip|board|sensor)|supported (hardware|board|device))\b/i },
  // component inventory — enumeration of the components that make up the workspace / a domain.
  // NOTE: this static entry only covers the literal "crate" word; the store-aware
  // componentInventoryRe(store) (injected with the componentWord synonym group: crate|package|
  // module|component) is checked FIRST in classifyArchetype so an npm-package repo fires too.
  // Broadened to also catch: "what are the layers", "which host adapter packages", subsystems,
  // kernel boundary, which packages (surface/layer), model router package, kernel subsystems.
  // "three-layer" is excluded here (handled by docs) for "where is the three-layer model described".
  { name: 'crates', re: /\b(which crates|crate inventory|what crates|crates (that |which )?(make up|in|for|comprise)|list of crates|\w+ domain crates|what are the (layers?|three layers?|subsystems?|adapters?)\b|which (host |adapter )?(packages?|adapters?)\b|kernel (boundary|subsystems?)\b|what is (in |the )?the kernel\b|what is the kernel\b|subsystems? (bundled|in)\b|surface layer\b|user[- ]facing (surface|layer|packages?)\b|model router (package|component)\b)\b/i },
  // playbook / setup / onboarding / end-to-end usage / wizard / publish / scaffold health /
  // fastest-path / one-liner / after-scaffolding / own-files / release-gate.
  { name: 'playbook', re: /\b(how (do i |to |can i )?(use|set ?up|onboard|get started|getting started|start|deploy|build|run|publish|check|generate|try|install)|try\b.*\bwithout\b.*\binstall|without\b.*\binstall\w*\b.*\banything|no[- ]install\b|try it (out|now|in (a |the )?browser)|end[- ]to[- ]end|end to end|quick ?start|playbook|walkthrough|step[- ]by[- ]step|get up and running|wizard\b|what does the wizard\b|fastest path\b|one[- ]liner\b|after scaffolding\b|own the files\b|what command\b.*\brelease\b|release gate\b|scaffold is healthy\b|harness doctor\b|harness validate\b|publish my harness\b|what do my users run\b|users run\b)\b/i },
  // what-is / overview / introduce / what-does-produce / is-X-a-Y / why-called / do-I-need.
  // "not another" catches "not another agent framework" (product identity). "why.*factory|framework"
  // catches "why is it called a factory and not a framework" without matching generic maturity.
  { name: 'whatis', re: /\b(what is|what'?s |overview of|introduce|introduction to|tell me about|difference between|role of|what does\b.*\b(produce|turn|make into|forbid|mean)\b|is\b.*\b(model|wrapper|framework|factory|account|server)\b|why (is it|called|a factory)\b|do i need\b|in one line\b|called a factory\b|not another\b|why.*\bfactory\b|why.*\bframework\b)\b/i },
];
|
|
539
|
+
|
|
540
|
+
// Strong playbook verbs — when present, the playbook force-route fires EVEN for a long query
|
|
541
|
+
// (the word-count cap is bypassed). These signal an end-to-end "do this from scratch" walkthrough.
|
|
542
|
+
const STRONG_PLAYBOOK_RE = /\b(set ?up|end[- ]to[- ]end|get started|getting started|from scratch|walk ?through|unbox|first time|step by step)\b/i;

// A query is "clearly orientation" only when it is short & conceptual: no concrete symbol, file
// path, ADR number, code-y token, or function/struct reference. This keeps the force-route from
// firing on a deep how-X-works-in-the-code question that happens to contain "how to".
const SPECIFIC_SIGNAL_RE = /(\badr[-\s_]?\d|[a-z_]+\.[a-z]{1,4}\b|\bfn\b|\bstruct\b|\bimpl\b|::|\/|\b[a-z_]+\(\)|\bcrate::|\bsrc\b)/i;

/**
 * Decide whether `query` is a short, conceptual orientation question.
 * @param {string} query raw user query
 * @returns {boolean} false when the query is too long (and carries no strong playbook verb) or
 *   contains a specific-signal token; true otherwise
 */
function isOrientationQuery(query) {
  // A strong playbook verb exempts the query from the length cap: end-to-end walkthrough
  // requests tend to be wordy yet are still orientation questions.
  const capExempt = STRONG_PLAYBOOK_RE.test(query);
  if (!capExempt) {
    const wordCount = query.trim().split(/\s+/).filter(Boolean).length;
    if (wordCount > 14) return false; // long queries are usually specific
  }
  return !SPECIFIC_SIGNAL_RE.test(query);
}
|
|
560
|
+
|
|
561
|
+
// Is a what-is/concept query about the PRODUCT ITSELF (force PRIMER#1) vs about a CONCRETE concept
|
|
562
|
+
// noun (let vector+rerank find the defining doc)? Product-only: names the store product (ruvector/
|
|
563
|
+
// ruview) and contains NO other concrete concept noun, OR is literally "what is this / it". A query
|
|
564
|
+
// carrying any concept noun OTHER than the product name (rvf, witness, hnsw, gnn, segment, presence,
|
|
565
|
+
// occupancy, quantization, …) is a concept query, not a product overview. (FIX 1)
|
|
566
|
+
/**
 * Tell a product-overview question ("what is <product>", "what is this") apart from a question
 * about some other concept.
 * @param {string} query raw user query
 * @param {string} store store identifier (also filtered out of the leftover terms)
 * @returns {boolean} true only when, after removing the product name, no meaningful term remains
 *   and the query either names the product or asks "what is this/it"
 */
function isProductOverviewQuery(query, store) {
  const productPattern = productRe(store);
  const withoutProduct = productPattern ? query.replace(productPattern, ' ') : query;

  const ignorable = new Set([store, 'product', 'overview']);
  const leftovers = queryTerms(withoutProduct).filter((term) => !ignorable.has(term));

  // Evaluated unconditionally, mirroring the original evaluation order.
  const namesProduct = productPattern ? productPattern.test(query) : false;

  if (leftovers.length !== 0) return false;
  if (namesProduct) return true; // "what is <product>"
  return /\bwhat'?s?\s+(is\s+)?(this|it)\b/i.test(query); // "what is this"
}
|
|
575
|
+
|
|
576
|
+
// Store-aware component-inventory detector — the componentWord synonym group (crate|package|module|
|
|
577
|
+
// component, from config) is injected so "what packages make up X" / "which modules comprise Y" fire
|
|
578
|
+
// the SAME 'crates' archetype the prototype only fired for "which crates". Cached per store.
|
|
579
|
+
// Per-store memo of the compiled component-inventory regex.
const _compInvCache = new Map();

/**
 * Build (once per store) the component-inventory detector with the store's component-word
 * alternation injected.
 * @param {string} store store identifier passed to componentWordAlt
 * @returns {RegExp} case-insensitive matcher for "which <word>s", "<word> inventory", …
 */
function componentInventoryRe(store) {
  if (_compInvCache.has(store)) return _compInvCache.get(store);

  const word = componentWordAlt(store); // e.g. (?:crate|package|module|component)
  const words = `${word}s?`; // allow plural
  // The verb group previously listed `comprise` twice; the duplicate alternative was dead weight
  // and is dropped so the group mirrors the static `crates` archetype pattern.
  const re = new RegExp(
    `\\b(which ${words}|${word} inventory|what ${words}|${words} (that |which )?(make up|in|for|comprise)|`
    + `list of ${words}|\\w+ domain ${words})\\b`, 'i');

  _compInvCache.set(store, re);
  return re;
}
|
|
590
|
+
|
|
591
|
+
// Patterns in the whatis archetype that are ALWAYS a product-overview (force PRIMER#1), regardless
|
|
592
|
+
// of whether other concept nouns appear. These paraphrased forms ("what does X produce", "is it a
|
|
593
|
+
// wrapper", "why called a factory", "do I need an account", "in one line") are orientation queries
|
|
594
|
+
// about the product as a whole, not about a sub-concept. They bypass the isProductOverviewQuery
|
|
595
|
+
// concept-noun strip so they force-route to PRIMER#1 rather than falling to whatis-concept.
|
|
596
|
+
// Case-insensitive; the whole alternation is wrapped in \b…\b so every phrase matches on word
// boundaries. classifyArchetype returns 'whatis' when this (or isProductOverviewQuery) matches.
const WHATIS_FORCE_RE = /\b(what does\b.*\b(produce|turn into|make into|forbid|turn light into|output)\b|is\b.*\b(model|wrapper|framework|factory|account|server)\b|why (is it |a )?called\b|do i need\b|in one line\b|not another\b|called a factory\b|why a factory\b|published cli\b|cli name\b.*versus|versus\b.*cli name|what.*\bversus\b.*\balias\b|learned phase mask\b|what is a phase mask\b|optical neural network\b|why.*\bfront end\b|described as a front end\b)\b/i;
|
|
597
|
+
// A "what is X in one line" phrasing is a PRODUCT overview ONLY when it is NOT enumerating named
|
|
598
|
+
// features/algorithms/commands. "What are VQE, Grover, and QAOA in one line each?" is a CAPABILITIES
|
|
599
|
+
// query, not a product overview — the comma/"and"/"each" enumeration is the tell. This guard keeps
|
|
600
|
+
// WHATIS_FORCE's "in one line" from hijacking feature-list questions. (Generic; no repo names.)
|
|
601
|
+
// The separator group is `(\band\b|,)`: only the word "and" needs word boundaries. Wrapping the
// comma in \b…\b (as before) required a word character on BOTH sides of it, so the usual
// "X, Y, Z" spelling (comma followed by a space) could never match via the comma.
const WHATIS_ENUMERATION_RE = /\b(what (are|is)\b[^?]*(\band\b|,)[^?]*\b(in one line|each)\b|each\b.*\bin one line|in one line each)\b/i;
|
|
602
|
+
|
|
603
|
+
// Classify the orientation archetype (most-specific-first). Returns archetype name or null.
|
|
604
|
+
// `store` lets the what-is split distinguish a product-overview query from a concept query, and
|
|
605
|
+
// drives the componentWord synonym injection for the component-inventory archetype.
|
|
606
|
+
/**
 * Map a query to its orientation archetype.
 * @param {string} query raw user query
 * @param {string} store store identifier (drives the component-word and product-name checks)
 * @returns {string|null} an archetype name ('crates', 'adr', 'whatis', 'whatis-concept', or the
 *   `name` of the first matching ARCHETYPE_RES entry), or null when nothing applies
 */
function classifyArchetype(query, store) {
  if (!isOrientationQuery(query)) return null;

  // The store-aware inventory detector outranks every static detector.
  if (componentInventoryRe(store).test(query)) return 'crates';

  const hit = ARCHETYPE_RES.find(({ re }) => re.test(query));
  if (!hit) return null;

  switch (hit.name) {
    case 'docs':
      // ADR-specific docs questions are redirected to the ADR index.
      return /\b(adr|decision record)\b/i.test(query) ? 'adr' : 'docs';
    case 'whatis': {
      // Enumerations ("what are X, Y and Z in one line each") are never a product overview.
      if (WHATIS_ENUMERATION_RE.test(query)) return 'whatis-concept';
      const isOverview = WHATIS_FORCE_RE.test(query) || isProductOverviewQuery(query, store);
      return isOverview ? 'whatis' : 'whatis-concept';
    }
    default:
      return hit.name;
  }
}
|
|
630
|
+
|
|
631
|
+
// Code-vs-doc intent. Returns 'code' | 'design' | null.
|
|
632
|
+
const CODE_INTENT_RE = /\b(in (the )?code|in source|implementation|how is .*(computed|implemented|calculated|done)|\bfunction\b|\bstruct\b|signature|source code|actual code|which file|in the source|the (rust|code))\b/i;
const DESIGN_INTENT_RE = /\b(why\b|rationale|design decision|design choice|proposed\b|proposal\b|trade[- ]?off|tradeoff|motivation|reasoning behind|the decision to)\b/i;

/**
 * Classify whether a query leans toward source code or toward design documents.
 * @param {string} query raw user query
 * @returns {'code'|'design'|null} 'code' wins when both patterns match
 */
function codeDocIntent(query) {
  // Checked in priority order: an explicit code cue outranks a design cue.
  const ranked = [
    ['code', CODE_INTENT_RE],
    ['design', DESIGN_INTENT_RE],
  ];
  for (const [label, pattern] of ranked) {
    if (pattern.test(query)) return label;
  }
  return null;
}
|
|
640
|
+
|
|
641
|
+
// ===================================================================================
|
|
642
|
+
// FIX A — "how-works-in-code" / implementation intent (RETRIEVAL POLISH). A query that asks
|
|
643
|
+
// how something is IMPLEMENTED/coded ("how is X implemented", "how does X work in code",
|
|
644
|
+
// "implementation of X", "where is X coded") wants the REAL algorithm source in the crate's
|
|
645
|
+
// own src/ — NOT a vendored/copied dependency, NOT the CLI entrypoint, NOT the manifest. So we:
|
|
646
|
+
// • DEMOTE wrong-file types: vendored/patched dep copies (patches/** + any hnsw_rs-style
|
|
647
|
+
// copied-dep tree), bare entrypoints (**/main.rs, **/bin/**), and Cargo.toml.
|
|
648
|
+
// • PROMOTE the crate's own src/**/*.rs (excluding main.rs) — with an EXTRA promotion for a
|
|
649
|
+
// file whose name token-matches the named operation (e.g. "insert" -> *insert*.rs,
|
|
650
|
+
// "count/counting" -> *count*.rs), so the operation's implementation module wins.
|
|
651
|
+
// Scoped to the named crate(s) from the query (entity.crates) where possible; the vendored-dep
|
|
652
|
+
// demotion is global (a copied dep is never the answer to "how is X implemented HERE").
|
|
653
|
+
// ===================================================================================
|
|
654
|
+
// One alternative per phrasing; String.raw keeps the regex source readable without doubled
// backslashes. The alternatives are OR-ed together and matched case-insensitively.
const IMPL_INTENT_RE = new RegExp(
  [
    String.raw`\bimplement(ed|ation)?\b`, // "X implemented", "implementation of X"
    String.raw`\bhow\s+(is|does)\b.*\b(work|works|coded|done)\b.*\bin\s+(the\s+)?(code|source)\b`,
    String.raw`\bhow\s+\w+\s+(is|works?)\s+coded\b`,
    String.raw`\bwhere\s+is\s+\w+\s+(coded|implemented)\b`,
  ].join('|'),
  'i',
);

/**
 * @param {string} query raw user query
 * @returns {boolean} true when the query matches one of the implementation-intent phrasings
 */
function isImplIntent(query) {
  const asksForImplementation = IMPL_INTENT_RE.test(query);
  return asksForImplementation;
}
|
|
661
|
+
|
|
662
|
+
// Vendored / copied-dependency trees that are NEVER the answer to "how is X implemented here".
|
|
663
|
+
// patches/** is the explicit vendored-patch tree; the hnsw_rs token catches the copied upstream
|
|
664
|
+
// HNSW crate wherever it lands (e.g. scripts/patches/hnsw_rs/**). Kept conservative.
|
|
665
|
+
// Matches a `patches/` or `hnsw_rs/` directory segment, either at the start of the path or right
// after a `/`; case-insensitive.
const VENDORED_DEP_RE = /(^|\/)(patches)\/|(^|\/)hnsw_rs\//i;
|
|
666
|
+
|
|
667
|
+
// The operation noun(s) the impl query is about: meaningful terms MINUS the named crate token(s)
|
|
668
|
+
// MINUS generic impl words. Used to give an extra promotion to a src file whose name token-matches
|
|
669
|
+
// the operation (e.g. "insert" -> insert.rs / *insert*.rs).
|
|
670
|
+
// Generic implementation/question words that never name an operation.
const IMPL_STOP = new Set(['how','does','work','works','implement','implemented','implementation',
  'code','coded','source','where','the','rust','module','crate','function','method','logic']);

/**
 * Extract the operation terms of an implementation query: the query's meaningful terms minus
 * generic implementation words and minus anything that is (part of) a named crate.
 * @param {string} query raw user query
 * @param {string[]|null|undefined} crates crate names recognised in the query
 * @returns {string[]} remaining terms, in query order
 */
function implOperationNouns(query, crates) {
  // Normalise crate names ONCE. Query terms are always lower-case, so comparing them against the
  // raw crate names (as the substring check used to) silently missed mixed-case names.
  const crateNames = (crates || []).map((c) => String(c).toLowerCase());
  return queryTerms(query).filter(
    // `includes` also covers exact equality, so one check handles both "is the crate" and
    // "is a fragment of the crate name".
    (term) => !IMPL_STOP.has(term) && !crateNames.some((name) => name.includes(term)),
  );
}
|
|
678
|
+
|
|
679
|
+
// Implementation-intent path adjustment (negative = promote, positive = demote). `crateTok` is the
|
|
680
|
+
// named crate the query is about (or null for unscoped). `opNouns` are the operation tokens.
|
|
681
|
+
/**
 * Effective-distance adjustment for an implementation-intent query.
 * @param {string} path candidate document path
 * @param {string|null} crateTok crate the query names, or null when unscoped
 * @param {string[]|null|undefined} opNouns operation terms taken from the query
 * @param {string} store store identifier (forwarded to inComponentRe)
 * @returns {number} positive values demote the path, negative values promote it
 */
function implAdjust(path, crateTok, opNouns, store) {
  const fileName = (path.split('/').pop() || '').toLowerCase();
  const isEntrypoint = /(^|\/)main\.rs$/i.test(path);
  const underBin = /(^|\/)bin\//i.test(path);
  const isManifest = /(^|\/)cargo\.toml$/i.test(path);

  let delta = 0;
  // Demotions apply regardless of which crate (if any) the query names.
  if (VENDORED_DEP_RE.test(path)) delta += 0.55;
  if (isEntrypoint || underBin) delta += 0.30;
  if (isManifest) delta += 0.30;

  // Promotions only apply to real source files of the named crate.
  if (!crateTok) return delta;
  if (!inComponentRe(store, crateTok).test(path)) return delta;

  const isOwnSource = /(?:^|\/)src\/.+\.rs$/i.test(path) && !isEntrypoint;
  if (!isOwnSource) return delta;

  delta -= 0.40;
  // Extra lift when the file name contains one of the operation terms (3+ characters).
  const namesOperation = Boolean(opNouns)
    && opNouns.some((term) => term.length >= 3 && fileName.includes(term));
  if (namesOperation) delta -= 0.30;
  return delta;
}
|
|
708
|
+
|
|
709
|
+
// FIX D — NAMED-CRATE source boost (no rebuild). A query that explicitly names a real crate but
|
|
710
|
+
// carries NO implementation verb ("HNSW index in ruvector-core", "ruvector-mmwave radar parser")
|
|
711
|
+
// got no crate scoping before — so a sibling crate / example / bridge could outrank the named
|
|
712
|
+
// crate's own source. When the query names crate(s) AND it is not an impl query (implAdjust already
|
|
713
|
+
// covers those), GENTLY promote the named crate's own src/**/*.rs (a little extra when the filename
|
|
714
|
+
// token-matches an operation noun). Gentle enough that a clearly-better vector match still wins.
|
|
715
|
+
/**
 * Gentle promotion for documents that belong to a crate the query names.
 * @param {string} path candidate document path
 * @param {string[]|null|undefined} crateToks crate names recognised in the query
 * @param {string[]|null|undefined} opNouns operation terms taken from the query
 * @param {string} store store identifier (forwarded to inComponentRe)
 * @returns {number} 0 when the path is in none of the named crates; otherwise a negative
 *   adjustment (-0.10 generic, -0.28 for src/*.rs, a further -0.22 on a file-name match)
 */
function namedCrateAdjust(path, crateToks, opNouns, store) {
  if (!crateToks || !crateToks.length) return 0;

  const inNamedCrate = Array.from(crateToks)
    .some((tok) => inComponentRe(store, tok).test(path));
  if (!inNamedCrate) return 0;

  const isOwnSource = /(?:^|\/)src\/.+\.rs$/i.test(path) && !/(?:^|\/)main\.rs$/i.test(path);
  if (!isOwnSource) return -0.10; // mild lift for anything else in the named crate

  const fileName = (path.split('/').pop() || '').toLowerCase();
  const namesOperation = Boolean(opNouns)
    && opNouns.some((term) => term.length >= 3 && fileName.includes(term));
  return namesOperation ? -0.28 - 0.22 : -0.28;
}
|
|
731
|
+
|
|
732
|
+
// Exact ADR-by-number, e.g. "ADR-027" / "adr 27" -> zero-padded "027". Returns [nums] or [].
|
|
733
|
+
/**
 * Collect every ADR number mentioned in the query ("ADR-027", "adr 27", "adr_5", …).
 * @param {string} query raw user query
 * @returns {string[]} the digits of each mention, left-padded with zeros to at least 3 characters
 */
function adrNumbers(query) {
  const found = [];
  for (const mention of query.matchAll(/\badr[-\s_]?(\d{1,4})\b/gi)) {
    found.push(mention[1].padStart(3, '0'));
  }
  return found;
}
|
|
737
|
+
// Does a path point at the REAL ADR doc for this number (not the index table / a passing mention)?
|
|
738
|
+
/**
 * Check whether a path names the ADR document with the given number.
 * @param {string} p candidate path
 * @param {string} num ADR number (digits; extra leading zeros in the path are tolerated)
 * @returns {boolean} true when a path segment starts with `adr<num>` at a word boundary, or when
 *   `adr<num>` followed by `-`/`_` appears anywhere in the path
 */
function pathIsAdrDoc(p, num) {
  const stem = `adr[-_]?0*${num}`;
  if (new RegExp(`(^|/)${stem}\\b`, 'i').test(p)) return true;
  return new RegExp(`${stem}[-_]`, 'i').test(p);
}
|
|
741
|
+
|
|
742
|
+
// FIX 4 — parse an ADR's Status (Proposed / Accepted / Implemented / Superseded / Rejected /
|
|
743
|
+
// Deprecated) from the top of the document. ADRs in this corpus carry the status in any of:
|
|
744
|
+
// a metadata table row: | **Status** | Proposed | or | Status | Proposed |
|
|
745
|
+
// an inline header: **Status**: Proposed or Status: Proposed
|
|
746
|
+
// a section + bold value: ## Status\n**Proposed**
|
|
747
|
+
// Returns the normalized UPPERCASE status string, or null if none is found. We scan the doc's
|
|
748
|
+
// first chunk(s) where the header lives.
|
|
749
|
+
const ADR_STATUS_WORDS = '(proposed|accepted|implemented|superseded|rejected|deprecated|draft|in[\\s-]?progress)';

/**
 * Read an ADR's status from the head of the document (its first two chunks).
 * @param {Array<{text?: string}>|null|undefined} chunks document chunks in order
 * @returns {string|null} the status in UPPERCASE ("in progress" in any spelling is returned as
 *   "IN-PROGRESS"), or null when no status header is found
 */
function parseAdrStatus(chunks) {
  if (!chunks || !chunks.length) return null;
  // A chunk without text contributes an empty line instead of the literal string "undefined".
  const head = chunks.slice(0, 2).map((c) => String(c?.text ?? '')).join('\n');
  const patterns = [
    // table row:  | **Status** | Proposed |   /   | Status | Accepted |
    new RegExp(`\\|\\s*\\**\\s*status\\s*\\**\\s*\\|\\s*\\**\\s*${ADR_STATUS_WORDS}`, 'i'),
    // inline:     **Status**: Proposed   /   Status: Proposed
    new RegExp(`\\**status\\**\\s*:\\s*\\**\\s*${ADR_STATUS_WORDS}`, 'i'),
    // section:    ## Status\n**Proposed**
    new RegExp(`#+\\s*status\\b[^\\n]*\\n+\\s*\\**\\s*${ADR_STATUS_WORDS}`, 'i'),
  ];
  for (const re of patterns) {
    const m = head.match(re);
    if (m && m[1]) {
      const raw = m[1].toUpperCase();
      // ADR_STATUS_WORDS accepts "in progress", "in-progress" AND "inprogress"; emit a single
      // canonical spelling so consumers comparing against "IN-PROGRESS" see all three.
      return /^IN[\s-]?PROGRESS$/.test(raw) ? 'IN-PROGRESS' : raw;
    }
  }
  return null;
}
|
|
767
|
+
// A status that means "design intent, not confirmed shipped" (vs ACCEPTED/IMPLEMENTED = built).
|
|
768
|
+
/**
 * @param {string|null|undefined} status normalized ADR status
 * @returns {boolean} true for PROPOSED, DRAFT, IN-PROGRESS, REJECTED, DEPRECATED or SUPERSEDED
 *   (case-insensitive, whole-string match); false for anything else, including empty input
 */
function statusIsProposed(status) {
  if (!status) return false;
  const notShipped = /^(PROPOSED|DRAFT|IN-PROGRESS|REJECTED|DEPRECATED|SUPERSEDED)$/i;
  return notShipped.test(status);
}
|
|
771
|
+
// Back-compat: does the doc carry ANY status header? (trigger for INTENT(4) ADR-vs-code pairing.)
|
|
772
|
+
/**
 * Does the head of the document (its first two chunks) carry any status header at all?
 * @param {Array<{text: string}>|null|undefined} chunks document chunks in order
 * @returns {boolean} true when parseAdrStatus recognises a status, or when a bare status
 *   heading / label / table cell is present without a recognised value
 */
function adrHasStatus(chunks) {
  if (!chunks?.length) return false;

  const leading = [];
  for (const chunk of chunks.slice(0, 2)) leading.push(chunk.text);
  const head = leading.join('\n');

  if (parseAdrStatus(chunks) !== null) return true;
  return /(^|\n)\s*(#+\s*status\b|\*\*status\*\*\s*:|status\s*:|\|\s*\**status)/i.test(head);
}
|
|
778
|
+
|
|
779
|
+
const STOPWORDS = new Set(['the','a','an','and','or','of','to','in','for','on','with','how','do','i','is','are',
  'what','when','where','why','it','this','that','kb','query','question','search','find','show','me','please','about']);

/**
 * Tokenize a query into meaningful lexical terms.
 * @param {string} q raw user query
 * @returns {string[]} lower-cased tokens of 3+ characters that are not stopwords, in query order.
 *   Tokens may contain inner `.`, `_`, `-` (e.g. "main.rs", "adr-027") but never end with one.
 */
function queryTerms(q) {
  const rawTokens = q.toLowerCase().match(/[a-z0-9][a-z0-9._-]*/g) || [];
  return rawTokens
    // The token pattern greedily swallows sentence punctuation ("hnsw." / "end-"), which then
    // defeats the stopword check, the length check and every downstream `includes` comparison.
    // Trim it BEFORE filtering.
    .map((t) => t.replace(/[._-]+$/, ''))
    .filter((t) => t.length >= 3 && !STOPWORDS.has(t));
}
|
|
787
|
+
|
|
788
|
+
// Concept nouns from a concept what-is query (FIX 1). The query's meaningful terms MINUS the
|
|
789
|
+
// product name MINUS generic question words = the concrete concept(s) being asked about (rvf,
|
|
790
|
+
// witness, hnsw, gnn, segment, presence, occupancy, …). A doc whose title/path token-overlaps one
|
|
791
|
+
// of these is more likely to DEFINE it, so it gets a mild boost (below) — letting the real defining
|
|
792
|
+
// ADR/source/doc out-rank the thin product blurb without a hard force-route.
|
|
793
|
+
// Generic words that survive tokenization but never identify a concept on their own.
const CONCEPT_STOP = new Set(['what','difference','between','role','format','file','files','does',
  'store','stores','support','supported','index','indices','chain','segment','detection','counting',
  'augmented','work','works']);

/**
 * Extract the concept terms of a what-is query.
 * @param {string} query raw user query
 * @param {string} store store identifier (its own name is never a concept)
 * @returns {string[]} query terms left after removing the product name, the store name and the
 *   CONCEPT_STOP words
 */
function conceptNouns(query, store) {
  const productPattern = productRe(store);
  let text = query;
  if (productPattern) text = query.replace(productPattern, ' ');

  const nouns = [];
  for (const term of queryTerms(text)) {
    if (term === store || CONCEPT_STOP.has(term)) continue;
    nouns.push(term);
  }
  return nouns;
}
|
|
801
|
+
|
|
802
|
+
// Concept boost: SUBTRACT from a doc that names a concept noun from a concept what-is query.
|
|
803
|
+
// FIX 3 — the DEFINING doc must beat an ADJACENT one. A concept noun that appears in the doc's
|
|
804
|
+
// FILENAME SLUG or TITLE is a strong "this doc DEFINES the concept" signal (e.g. the file
|
|
805
|
+
// `ADR-029-ruvsense-multistatic-sensing-mode.md` / title containing "multistatic" defines
|
|
806
|
+
// "multistatic", while a sibling ADR that merely mentions it in the body does not). So we weight
|
|
807
|
+
// a filename-slug / title hit FAR above a bare path-substring hit, which makes the title/slug-exact
|
|
808
|
+
// defining doc out-rank an adjacent ADR. PRIMER#1 (thin product blurb) is excluded. NON-NEGATIVE.
|
|
809
|
+
/**
 * Score how strongly a document appears to DEFINE one of the query's concept nouns.
 * @param {string[]|null|undefined} nouns concept terms from the query (lower-case)
 * @param {string} path document path
 * @param {string} title document title
 * @returns {number} a value in [0, 0.62]: 0.30 per noun found in the file name or title, 0.06
 *   per noun found only elsewhere in "path title"; 0 for the "what is" primer section
 */
function conceptBoost(nouns, path, title) {
  if (!nouns || !nouns.length) return 0;

  const combined = `${path} ${title}`;
  // The thin product-overview primer section never receives a concept boost.
  if (PRIMER_PATH_RE.test(path) && /what\b.*\bis\b|#1-/i.test(combined)) return 0;

  const fileName = (path.split('/').pop() || '').toLowerCase(); // filename slug (the defining signal)
  const lowerTitle = String(title || '').toLowerCase();
  const lowerCombined = combined.toLowerCase();

  let defining = 0; // noun appears in the file name or the title
  let mention = 0; // noun appears only elsewhere in the path
  for (const noun of nouns) {
    const inNameOrTitle = fileName.includes(noun) || lowerTitle.includes(noun);
    if (inNameOrTitle) {
      defining += 1;
    } else if (lowerCombined.includes(noun)) {
      mention += 1;
    }
  }

  // With no hits the weighted sum is 0, so no separate early return is needed.
  return Math.min(0.62, 0.30 * defining + 0.06 * mention);
}
|
|
828
|
+
|
|
829
|
+
// FIX 2 — crate-overview / metric intent. "what does crate X do" or "X compression ratio /
|
|
830
|
+
// throughput / benchmark" should surface the crate's README.md / BENCHMARK.md / docs (where the
|
|
831
|
+
// headline numbers live) ABOVE its benches/ + main.rs harness + bare Cargo.toml. Returns the crate
|
|
832
|
+
// token the query is about, or null. (Distinct from the crate-INVENTORY archetype "which crates…".)
|
|
833
|
+
// Query mentions a headline metric (compression ratio, throughput, latency, …).
const CRATE_METRIC_RE = /\b(compression(\s+ratio)?|throughput|benchmark|latency|qps|recall|speed ?up|ops\/s|ratio|perf(ormance)?)\b/i;
// Query is phrased as an overview request ("what does…", "tell me about…", …).
const CRATE_OVERVIEW_RE = /\b(what (does|is)|overview of|tell me about|describe)\b/i;

/**
 * Decide which crate (if any) a crate-overview / metric query is about.
 *
 * @param {string} query - the raw user query.
 * @param {string[]|null|undefined} entityCrates - crate tokens detected in the query.
 * @returns {string|null} the first detected crate when the query matches either the metric or the
 *   overview pattern; null when no crate was detected or neither pattern matches.
 */
function crateOverviewTarget(query, entityCrates) {
  const hasCrate = Boolean(entityCrates && entityCrates.length);
  if (!hasCrate) return null;
  const isOverviewOrMetric = [CRATE_METRIC_RE, CRATE_OVERVIEW_RE].some((re) => re.test(query));
  return isOverviewOrMetric ? entityCrates[0] : null;
}
|
|
840
|
+
/**
 * Ranking adjustment for a crate-overview / metric query, applied only to paths inside the targeted
 * crate (as decided by inComponentRe(store, crateTok)).
 *
 * The result is SIGNED and is added to the doc's distance offsets by the caller:
 *   - README.md / BENCHMARK.md / anything under docs/  -> -0.45 (boost: prose with the numbers)
 *   - anything under bench/ or benches/, or a main.rs  -> +0.20 (penalty: harness)
 *   - Cargo.toml                                       -> +0.18 (penalty: bare manifest)
 * The three rules are independent, so a path matching more than one gets the sum.
 *
 * @param {string|null} crateTok - targeted crate token; falsy -> 0.
 * @param {string} path - the document path.
 * @param {string} store - store/target name, forwarded to inComponentRe().
 * @returns {number} negative = boost, positive = penalty, 0 = not applicable.
 */
function crateOverviewAdjust(crateTok, path, store) {
  if (!crateTok) return 0;
  if (!inComponentRe(store, crateTok).test(path)) return 0;

  const isProse = /\/(readme|benchmark)\.md$/i.test(path) || /\/docs\//i.test(path);
  const isHarness = /\/benches?\//i.test(path) || /\/main\.rs$/i.test(path);
  const isManifest = /\/cargo\.toml$/i.test(path);

  let adjustment = 0;
  if (isProse) adjustment -= 0.45;
  if (isHarness) adjustment += 0.20;
  if (isManifest) adjustment += 0.18;
  return adjustment;
}
|
|
853
|
+
|
|
854
|
+
// ===================================================================================
|
|
855
|
+
// FIX B — targeted off-topic-magnet down-weight (RETRIEVAL POLISH). A specific document that
|
|
856
|
+
// keeps surfacing as off-topic noise on UNRELATED queries gets a mild penalty UNLESS the query is
|
|
857
|
+
// actually about that document's subject. General mechanism (a small allow-listed down-weight
|
|
858
|
+
// table); currently a single entry for ADR-096 (rvCSI crate layout), which intruded on unrelated
|
|
859
|
+
// queries (e.g. worldgraph spatial relationships). The penalty is mild so an on-topic query
|
|
860
|
+
// (about rvCSI crate layout / structure) still surfaces it normally via its allow regex.
|
|
861
|
+
// ===================================================================================
|
|
862
|
+
// Legacy default magnet table (ruvector/ruview only): ADR-096 (rvCSI crate layout) intruded on
|
|
863
|
+
// unrelated queries. New targets supply their own via config.offtopicMagnets (filled during gate-A
|
|
864
|
+
// tuning, P6a) — each entry { re|reSource, pen, allow|allowSource }.
|
|
865
|
+
// Built-in magnet table, keyed by store name. Both legacy stores carry the same single rule: a path
// naming ADR-096 is penalised by 0.22 unless the query matches the rule's `allow` pattern.
const LEGACY_OFFTOPIC_MAGNETS = (() => {
  // Fresh object per store so the two lists never share an entry instance.
  const adr096Rule = () => ({
    re: /(^|\/)adr[-_]?0*96\b/i,
    pen: 0.22,
    allow: /\b(rvcsi|rv[-_]?csi|crate\s+layout|crate\s+structure|crate\s+organi|workspace\s+layout|adr[-\s_]?0*96)\b/i,
  });
  return {
    ruvector: [adr096Rule()],
    ruview: [adr096Rule()],
  };
})();
|
|
875
|
+
// Compile a config magnet entry (which may carry string sources or RegExp objects) into a usable
|
|
876
|
+
// { re, pen, allow } triple. Cached per store so we don't recompile each call.
|
|
877
|
+
// store name -> compiled magnet list, so each store's rules are compiled at most once.
const _magnetCache = new Map();

// Return `re` unchanged unless it carries the `g`/`y` flag. Those flags make .test() advance
// `lastIndex` between calls, and compiled magnets are cached and reused for every document, so a
// stateful pattern would give alternating answers. In that case return a flag-stripped copy.
function _statelessMagnetRe(re) {
  return (re.global || re.sticky) ? new RegExp(re.source, re.flags.replace(/[gy]/g, '')) : re;
}

// Compile one magnet entry ({ re|reSource, pen, allow|allowSource }) into { re, allow, pen }.
// Returns null for an entry with no path pattern at all: compiling an empty source would produce a
// pattern that matches EVERY path and so penalise every document.
function _compileMagnet(entry) {
  if (!entry) return null;

  let re;
  if (entry.re instanceof RegExp) {
    re = _statelessMagnetRe(entry.re);
  } else {
    const source = entry.reSource || String(entry.re || '');
    if (!source) return null;
    re = new RegExp(source, 'i');
  }

  let allow = null;
  if (entry.allow instanceof RegExp) {
    allow = _statelessMagnetRe(entry.allow);
  } else if (entry.allow || entry.allowSource) {
    allow = new RegExp(entry.allowSource || String(entry.allow), 'i');
  }

  return { re, allow, pen: typeof entry.pen === 'number' ? entry.pen : 0.22 };
}

/**
 * Compiled off-topic-magnet rules for a store.
 *
 * Uses the store config's non-empty `offtopicMagnets` array when cfgFor(store) provides one,
 * otherwise the LEGACY_OFFTOPIC_MAGNETS entry for the store, otherwise no rules. String sources are
 * compiled case-insensitively; `pen` defaults to 0.22. The result is cached per store.
 *
 * @param {string} store - store/target name.
 * @returns {{re: RegExp, allow: RegExp|null, pen: number}[]} compiled rules (possibly empty).
 * @throws {SyntaxError} if a configured pattern source is not a valid regular expression.
 */
function magnetsFor(store) {
  if (_magnetCache.has(store)) return _magnetCache.get(store);

  const target = cfgFor(store);
  const configured = (target && Array.isArray(target.offtopicMagnets) && target.offtopicMagnets.length)
    ? target.offtopicMagnets
    : null;
  const legacy = LEGACY_OFFTOPIC_MAGNETS[store];
  const list = configured || (Array.isArray(legacy) ? legacy : []);

  const compiled = list.map(_compileMagnet).filter(Boolean);
  _magnetCache.set(store, compiled);
  return compiled;
}
|
|
891
|
+
/**
 * Total off-topic-magnet penalty for one document.
 *
 * Each rule from magnetsFor(store) contributes its `pen` when its `re` matches the path, unless the
 * rule has an `allow` pattern that matches the query (meaning the query really is about that doc).
 *
 * @param {string} query - the raw user query.
 * @param {string} path - the document path.
 * @param {string} store - store/target name.
 * @returns {number} the summed penalty; 0 when no rule applies.
 */
function offtopicMagnetPenalty(query, path, store) {
  return magnetsFor(store).reduce((total, magnet) => {
    if (!magnet.re.test(path)) return total;
    if (magnet.allow && magnet.allow.test(query)) return total;
    return total + magnet.pen;
  }, 0);
}
|
|
898
|
+
|
|
899
|
+
// ===================================================================================
|
|
900
|
+
// FIX C — crate-specific maturity → the crate's OWN README/BENCHMARK (RETRIEVAL POLISH). A query
|
|
901
|
+
// like "is <crate> production-ready / experimental / complete" is answered by that crate's OWN
|
|
902
|
+
// README.md / BENCHMARK.md (which usually carry a status/maturity note), NOT by the global
|
|
903
|
+
// capabilities-graded primer or a cross-crate benchmark doc. Returns the named crate token when
|
|
904
|
+
// the query is a crate-scoped maturity question, else null.
|
|
905
|
+
// Query asks about maturity / readiness ("production-ready", "experimental", "stable", …).
const MATURITY_QUERY_RE = /\b(production[- ]?ready|production\b|experimental|prototype|complete|completeness|mature|maturity|stable|ready\s+for\s+production|battle[- ]?tested|is\s+it\s+(done|ready))\b/i;

/**
 * Decide which crate (if any) a crate-scoped maturity question is about.
 *
 * @param {string} query - the raw user query.
 * @param {string[]|null|undefined} entityCrates - crate tokens detected in the query.
 * @returns {string|null} the first detected crate when the query matches MATURITY_QUERY_RE;
 *   null when no crate was detected or the query is not a maturity question.
 */
function crateMaturityTarget(query, entityCrates) {
  if (!(entityCrates && entityCrates.length)) return null;
  return MATURITY_QUERY_RE.test(query) ? entityCrates[0] : null;
}
|
|
911
|
+
// Boost the named crate's OWN README.md / BENCHMARK.md for a crate-maturity query (negative = boost).
|
|
912
|
+
/**
 * Ranking adjustment for a crate-maturity query: boost the named crate's own README.md /
 * BENCHMARK.md.
 *
 * The path pattern is `<crateTok>` (regex-escaped), optionally followed by a `-suffix`, then
 * `/readme.md` or `/benchmark.md` at the end of the path, wrapped by componentPrefixSrc(store, …)
 * and matched case-insensitively.
 *
 * @param {string|null} crateTok - targeted crate token; falsy -> 0.
 * @param {string} path - the document path.
 * @param {string} store - store/target name, forwarded to componentPrefixSrc().
 * @returns {number} -0.50 (a boost; the caller adds it to the distance offsets) or 0.
 */
function crateMaturityAdjust(crateTok, path, store) {
  if (!crateTok) return 0;
  const escapedTok = crateTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const tail = `${escapedTok}(?:-[a-z0-9-]+)?/(?:readme|benchmark)\\.md$`;
  const ownDocRe = new RegExp(componentPrefixSrc(store, tail), 'i');
  if (ownDocRe.test(path)) return -0.50;
  return 0;
}
|
|
917
|
+
|
|
918
|
+
// FIX 3 — lexical boost: ADR-number exact hit, then proper-noun/title token overlap on
|
|
919
|
+
// the doc's path+title. Returns a NON-NEGATIVE amount to SUBTRACT from effective distance.
|
|
920
|
+
/**
 * Lexical boost: how much to SUBTRACT from a doc's effective distance because its path/title
 * lexically matches the query.
 *
 *   - +0.30 (once) when the query names an ADR number that the doc's path/title also carries.
 *     Numbers are compared ignoring leading zeros, so "ADR-27", "ADR-027" and "ADR 0027" all match
 *     a doc named adr-27, ADR-027 or ADR_0027.
 *   - +0.06 per query term found in the path/title, capped at 0.18.
 *
 * @param {string} query - the raw user query.
 * @param {string[]|null|undefined} terms - lower-cased query terms; missing -> no overlap boost.
 * @param {string} path - the document path.
 * @param {string} [title] - the document title; a missing title is treated as empty text.
 * @returns {number} a non-negative boost.
 */
function lexicalBoost(query, terms, path, title) {
  // A missing title must contribute no text (raw interpolation would inject "undefined").
  const hay = `${path} ${title ?? ''}`.toLowerCase();
  let boost = 0;

  // ADR ids named in the query, normalised to their bare number (leading zeros dropped). The doc
  // pattern below re-allows any zero padding via `0*`, so padding on either side is irrelevant.
  const adrNums = (query.match(/adr[-\s_]?(\d{1,4})/gi) || [])
    .map((m) => String(Number.parseInt(m.replace(/[^0-9]/g, ''), 10)));
  if (adrNums.some((num) => new RegExp(`adr[-_]?0*${num}\\b`, 'i').test(hay))) boost += 0.30;

  // Title / path token overlap.
  let overlap = 0;
  for (const term of terms || []) {
    if (hay.includes(term)) overlap += 1;
  }
  if (overlap > 0) boost += Math.min(0.18, 0.06 * overlap);

  return boost;
}
|
|
940
|
+
|
|
941
|
+
// FIX 2 — demotion penalty for a path given the query (skipped if query references the kind).
|
|
942
|
+
/**
 * Total low-signal demotion penalty for one document.
 *
 * Each LOW_SIGNAL rule contributes its `pen` when its `re` matches the path, unless the rule's
 * `allow` pattern matches the query (i.e. the query explicitly asks for that kind of file).
 *
 * @param {string} query - the raw user query.
 * @param {string} path - the document path.
 * @returns {number} the summed penalty; 0 when no rule applies.
 */
function demotionPenalty(query, path) {
  let total = 0;
  LOW_SIGNAL.forEach((rule) => {
    const isLowSignalPath = rule.re.test(path);
    if (isLowSignalPath && !rule.allow.test(query)) total += rule.pen;
  });
  return total;
}
|
|
949
|
+
|
|
950
|
+
// Substance boost — a self-contained answer-bearing document (multiple chunks / real length)
|
|
951
|
+
// should not be out-ranked by a vector-closer but tiny one-line doc-comment fragment. This
|
|
952
|
+
// keeps results SELF-CONTAINED (the grading bar) without re-embedding. Capped & gentle so it
|
|
953
|
+
// only breaks near-ties, never overrides a clearly-better match.
|
|
954
|
+
/**
 * Substance boost: a small signed nudge based on how much text a document has, so a substantial
 * document is not out-ranked by a tiny fragment on a near-tie.
 *
 * Steps (each ±0.06): at least 2 chunks, at least 4 chunks, at least 4000 total characters; and a
 * demotion when the total is under 400 characters. The result is clamped to [-0.06, 0.18].
 *
 * @param {{text: string}[]|null|undefined} chunks - the document's chunks; empty/missing -> 0.
 * @returns {number} the boost (the caller subtracts it from the distance); may be negative.
 */
function substanceBoost(chunks) {
  if (!(chunks && chunks.length)) return 0;

  let totalChars = 0;
  for (const chunk of chunks) totalChars += chunk.text.length;

  const steps = [
    [chunks.length >= 2, 0.06],
    [chunks.length >= 4, 0.06],
    [totalChars >= 4000, 0.06],
    [totalChars < 400, -0.06], // a sub-400-char stub is a fragment, demote it
  ];
  let score = 0;
  for (const [applies, delta] of steps) {
    if (applies) score += delta;
  }
  return Math.min(0.18, Math.max(-0.06, score));
}
|
|
964
|
+
|
|
965
|
+
// Disambiguation adjustment (negative = boost a "good" path, positive = penalize a "bad" path) for
|
|
966
|
+
// queries matching a config disambiguation entry. Each entry: { whenSource|when, goodSource|good,
|
|
967
|
+
// badSource|bad, goodBoost?, badPenalty? }. New targets fill target.disambiguation during gate-A
|
|
968
|
+
// tuning (P6a). Legacy ruvector/ruview keep the hard-coded "Cognitum Seed" rule as a fallback only.
|
|
969
|
+
// store name -> compiled disambiguation rules, so each store's rules are compiled at most once.
const _disambigCache = new Map();

/**
 * Compiled disambiguation rules for a store.
 *
 * Uses the store config's non-empty `disambiguation` array when cfgFor(store) provides one. With no
 * config rules, the 'ruvector' and 'ruview' stores fall back to one built-in rule made of the
 * SEED_* patterns; any other store gets no rules. Compiled entries keep the same order and count as
 * the source list. String sources are compiled case-insensitively; `good`/`bad` are null when the
 * entry supplies neither a value nor a source. Defaults: goodBoost 0.25, badPenalty 0.30.
 *
 * @param {string} store - store/target name.
 * @returns {{when: RegExp, good: RegExp|null, bad: RegExp|null, goodBoost: number, badPenalty: number}[]}
 */
function disambigFor(store) {
  if (_disambigCache.has(store)) return _disambigCache.get(store);

  const target = cfgFor(store);
  const hasConfigRules = Boolean(target)
    && Array.isArray(target.disambiguation)
    && target.disambiguation.length > 0;

  let rules;
  if (hasConfigRules) {
    rules = target.disambiguation;
  } else if (store === 'ruvector' || store === 'ruview') {
    // legacy fallback: the prototype's "Cognitum Seed" product disambiguation.
    rules = [{ when: SEED_QUERY_RE, good: SEED_GOOD_RE, bad: SEED_BAD_RE, goodBoost: 0.25, badPenalty: 0.30 }];
  } else {
    rules = [];
  }

  // An optional pattern: a RegExp is used as-is, a string value/source is compiled, else null.
  const optionalRe = (value, source) => {
    if (value instanceof RegExp) return value;
    return (value || source) ? new RegExp(source || String(value), 'i') : null;
  };
  const numberOr = (value, fallback) => (typeof value === 'number' ? value : fallback);

  const compiled = rules.map((rule) => ({
    when: rule.when instanceof RegExp
      ? rule.when
      : new RegExp(rule.whenSource || String(rule.when || ''), 'i'),
    good: optionalRe(rule.good, rule.goodSource),
    bad: optionalRe(rule.bad, rule.badSource),
    goodBoost: numberOr(rule.goodBoost, 0.25),
    badPenalty: numberOr(rule.badPenalty, 0.30),
  }));

  _disambigCache.set(store, compiled);
  return compiled;
}
|
|
992
|
+
/**
 * Disambiguation adjustment for one document.
 *
 * For every rule from disambigFor(store) whose `when` matches the query: subtract `goodBoost` if the
 * rule's `good` pattern matches the path, and add `badPenalty` if its `bad` pattern matches.
 *
 * @param {string} query - the raw user query.
 * @param {string} path - the document path.
 * @param {string} store - store/target name.
 * @returns {number} signed adjustment: negative = boost, positive = penalty, 0 = no rule applied.
 */
function seedAdjust(query, path, store) {
  return disambigFor(store).reduce((adjustment, rule) => {
    if (!rule.when.test(query)) return adjustment;
    let next = adjustment;
    if (rule.good && rule.good.test(path)) next -= rule.goodBoost;
    if (rule.bad && rule.bad.test(path)) next += rule.badPenalty;
    return next;
  }, 0);
}
|
|
1001
|
+
|
|
1002
|
+
// disambigPrimerTargets — for the matched config disambiguation rules, extract any LITERAL PRIMER
|
|
1003
|
+
// section path named in the rule's goodSource (e.g. 'PRIMER#2-what-can-ruqu-do-for-you'). These are
|
|
1004
|
+
// force-injected into the candidate pool (like targetPrimerSlug) so the rule's goodBoost can rank a
|
|
1005
|
+
// synthesized section that bge ranked outside the raw vector window. Config-driven; no repo baked in.
|
|
1006
|
+
/**
 * Collect the literal PRIMER section paths named by the config disambiguation rules that match the
 * query.
 *
 * Walks the store config's raw `disambiguation` list; for each rule whose compiled `when` (taken
 * from disambigFor(store) at the same index) matches the query, every `PRIMER#<digit>…` token found
 * in that rule's `goodSource` string is collected.
 *
 * @param {string} query - the raw user query.
 * @param {string} store - store/target name.
 * @returns {string[]} de-duplicated PRIMER tokens in first-seen order (possibly empty).
 */
function disambigPrimerTargets(query, store) {
  const target = cfgFor(store);
  const rawRules = (target && Array.isArray(target.disambiguation)) ? target.disambiguation : [];

  const found = new Set(); // a Set keeps first-seen order and drops repeats
  for (const [i, rawRule] of rawRules.entries()) {
    const compiledRule = disambigFor(store)[i];
    if (!compiledRule || !compiledRule.when.test(query)) continue;
    const goodSource = String(rawRule.goodSource || '');
    for (const [primerToken] of goodSource.matchAll(/PRIMER#[0-9][A-Za-z0-9#-]*/g)) {
      found.add(primerToken);
    }
  }
  return [...found];
}
|
|
1018
|
+
|
|
1019
|
+
// The KB builder emits OVERLAPPING chunks (a sliding window repeats ~half of each neighbour).
|
|
1020
|
+
// Naively concatenating them duplicates paragraphs. stitch() drops the longest suffix of the
|
|
1021
|
+
// running text that is also a prefix of the next chunk, so the document reads cleanly as one.
|
|
1022
|
+
/**
 * De-overlap helper: return the part of `next` that is NOT already present at the end of `prevTail`.
 *
 * Looks for the longest overlap (at most 2000 characters, at least 24) where a suffix of `prevTail`
 * equals a prefix of `next`, and drops that prefix from `next`. With no such overlap `next` is
 * returned unchanged.
 *
 * @param {string} prevTail - the tail of the text assembled so far.
 * @param {string} next - the next chunk's text.
 * @returns {string} `next` with the duplicated leading overlap removed.
 */
function stitch(prevTail, next) {
  let overlap = Math.min(prevTail.length, next.length, 2000);
  while (overlap >= 24) {
    // longest candidate first, so the first hit is the maximal overlap
    if (prevTail.endsWith(next.slice(0, overlap))) return next.slice(overlap);
    overlap -= 1;
  }
  return next;
}
|
|
1031
|
+
|
|
1032
|
+
// Assemble the FULL document from its chunks (id-ordered), de-overlapping as we go so it reads
|
|
1033
|
+
// as one clean document. If the stitched text fits under MAX_DOC_CHARS, the whole document is
|
|
1034
|
+
// returned. If it exceeds the cap, the window is CENTERED on the matched chunk (the chunk that
|
|
1035
|
+
// actually scored the hit) and expanded outward — alternating following then preceding chunks —
|
|
1036
|
+
// so the answer-bearing region is ALWAYS included, even when the match is a late chunk in a long
|
|
1037
|
+
// document. (The old behavior counted from chunk 0 and could truncate the answer.) De-overlap
|
|
1038
|
+
// stitching is preserved across the contiguous kept window.
|
|
1039
|
+
/**
 * Assemble one document's text from its chunks, keeping it under MAX_DOC_CHARS.
 *
 * A contiguous window of chunks is grown outward from the matched chunk (or from chunk 0 when
 * `matchedId` is missing or not found). Growth alternates direction each round, starting forward;
 * a direction is abandoned for good the first time its next chunk would not fit. The matched chunk
 * itself is always included, whatever its size. The budget counts raw chunk lengths plus one
 * separator per added chunk, i.e. before de-overlapping. The kept chunks are then joined with
 * blank lines, using stitch() to drop text duplicated between neighbours. When chunks were left
 * out, a bracketed note saying how many earlier/later chunks were omitted is appended.
 *
 * @param {{id: *, text: string}[]} chunks - the document's chunks in document order.
 * @param {*} [matchedId] - id of the chunk that scored the hit.
 * @returns {{fullText: string, chunksJoined: number, truncated: boolean}}
 */
function assembleDocument(chunks, matchedId) {
  const SEP = '\n\n';
  const total = chunks.length;
  if (!total) return { fullText: '', chunksJoined: 0, truncated: false };

  // Centre on the matched chunk; fall back to the first chunk if it cannot be located.
  const foundAt = matchedId != null ? chunks.findIndex((c) => c.id === matchedId) : -1;
  const center = foundAt >= 0 ? foundAt : 0;

  let lo = center;
  let hi = center;
  let used = chunks[center].text.length;
  let forwardNext = center + 1;
  let backwardNext = center - 1;

  // Try to take one more chunk after the window; on a misfit, disable forward growth.
  const growForward = () => {
    if (forwardNext >= total) return false;
    const cost = SEP.length + chunks[forwardNext].text.length;
    if (used + cost <= MAX_DOC_CHARS) {
      used += cost;
      hi = forwardNext;
      forwardNext += 1;
      return true;
    }
    forwardNext = total;
    return false;
  };
  // Try to take one more chunk before the window; on a misfit, disable backward growth.
  const growBackward = () => {
    if (backwardNext < 0) return false;
    const cost = SEP.length + chunks[backwardNext].text.length;
    if (used + cost <= MAX_DOC_CHARS) {
      used += cost;
      lo = backwardNext;
      backwardNext -= 1;
      return true;
    }
    backwardNext = -1;
    return false;
  };

  let forwardFirst = true;
  while (backwardNext >= 0 || forwardNext < total) {
    // At most one chunk is added per round; the preferred direction flips every round.
    const grew = forwardFirst
      ? (growForward() || growBackward())
      : (growBackward() || growForward());
    forwardFirst = !forwardFirst;
    if (!grew) break;
  }

  // Join the kept window [lo..hi], de-overlapping each chunk against the running text's tail.
  let out = '';
  for (const chunk of chunks.slice(lo, hi + 1)) {
    if (!out) {
      out = chunk.text;
    } else {
      const piece = stitch(out.slice(-2000), chunk.text);
      if (piece) out += SEP + piece;
    }
  }
  const joined = hi - lo + 1;

  if (total - joined <= 0) return { fullText: out, chunksJoined: joined, truncated: false };

  const earlier = lo;
  const later = total - 1 - hi;
  const parts = [];
  if (earlier > 0) parts.push(`${earlier} earlier`);
  if (later > 0) parts.push(`${later} later`);
  const note = `${SEP}${SEP}[... ${parts.join(' + ')} chunk(s) omitted; window centered on the `
    + `matched section, capped at ${MAX_DOC_CHARS} chars ...]`;
  return { fullText: out + note, chunksJoined: joined, truncated: true };
}
|
|
1100
|
+
|
|
1101
|
+
// ---------- core search: returns whole-document results ----------
|
|
1102
|
+
// Each result: { path, title, fullText, bestDistance, effDistance, chunksJoined, truncated,
|
|
1103
|
+
// distance (alias of bestDistance), text (alias of fullText) }.
|
|
1104
|
+
export async function searchKb({ query, k = 6, store, n, variant }) {
|
|
1105
|
+
const conf = resolveConf(store, variant);
|
|
1106
|
+
if (!fs.existsSync(conf.rvf)) throw new Error(`rvf not found: ${conf.rvf} (variant=${conf.variant}; run \`npm i\` then build, or copy the bundle in)`);
|
|
1107
|
+
const topN = Math.max(1, n || 5);
|
|
1108
|
+
const [qv, { byId, byPath }] = await Promise.all([embed(query, conf.embedCfg), loadPassages(conf.passages)]);
|
|
1109
|
+
const byPathKind = loadKinds(conf.meta); // intent layer: per-path content kind
|
|
1110
|
+
const terms = queryTerms(query);
|
|
1111
|
+
|
|
1112
|
+
// ---- INTENT CLASSIFICATION (deterministic, computed once per query) ----
|
|
1113
|
+
// FIX 1 — specific-entity detection. If the query names a crate / ADR id / file / proper noun it
|
|
1114
|
+
// is NOT a generic product-orientation question: suppress the orientation force-route + generic
|
|
1115
|
+
// PRIMER lift and demote primer-orientation docs. The crate-INVENTORY archetype ("which crates…")
|
|
1116
|
+
// is exempt (carries no hyphen-crate token) so it still routes to the inventory PRIMER.
|
|
1117
|
+
const crateTokens = crateTokenSet(byPath, store);
|
|
1118
|
+
const entity = specificEntity(query, crateTokens);
|
|
1119
|
+
let archetype = classifyArchetype(query, store); // 'maturity'|'capabilities'|…|'whatis-concept'|null
|
|
1120
|
+
// Suppress force-routing archetypes when a specific entity is named — EXCEPT:
|
|
1121
|
+
// (a) the crate inventory archetype ('crates') — a legitimate enumeration even with a hyphen token.
|
|
1122
|
+
// (b) a product-overview 'whatis' query that names a product alias (e.g. "CLI name versus
|
|
1123
|
+
// create-agent-harness") — the alias IS the product, so the orientation PRIMER should still win.
|
|
1124
|
+
const isProductWhatis = archetype === 'whatis' && WHATIS_FORCE_RE.test(query);
|
|
1125
|
+
if (entity.named && archetype && archetype !== 'crates' && archetype !== 'whatis-concept' && !isProductWhatis) {
|
|
1126
|
+
archetype = null;
|
|
1127
|
+
}
|
|
1128
|
+
// 'whatis-concept' is a NON-routing archetype: no force-route to a PRIMER, instead a mild concept
|
|
1129
|
+
// boost (below) lets the vector+rerank pipeline surface the true DEFINING doc. Other archetypes
|
|
1130
|
+
// force-route to their PRIMER slug (null slug -> no force-route, e.g. ruvector hardware).
|
|
1131
|
+
const targetPrimerSlug = (archetype && archetype !== 'whatis-concept')
|
|
1132
|
+
? resolvePrimerSlug(archetype, store, byPath)
|
|
1133
|
+
: null;
|
|
1134
|
+
// FIX 2 — crate-overview / metric target: the crate whose README/BENCHMARK/docs should win.
|
|
1135
|
+
const crateOverviewTok = crateOverviewTarget(query, entity.crates);
|
|
1136
|
+
// Concept nouns drive the mild concept boost for concept what-is queries (FIX 1).
|
|
1137
|
+
const concepts = archetype === 'whatis-concept' ? conceptNouns(query, store) : [];
|
|
1138
|
+
// FIX 3 — DEFINING-DOC nouns: a concept/topic query that is NOT a product-overview, NOT a
|
|
1139
|
+
// force-routed orientation archetype, and NOT a specific-entity query should still surface the
|
|
1140
|
+
// DEFINING doc (e.g. "multistatic vs monostatic sensing" -> the ADR whose filename slug names
|
|
1141
|
+
// "multistatic"). These nouns drive the slug/title concept boost AND pull a slug-named defining
|
|
1142
|
+
// doc into the candidate pool, so a title/slug-exact doc beats an adjacent one. The whatis-concept
|
|
1143
|
+
// nouns are folded in so that path keeps its existing behavior.
|
|
1144
|
+
const definingNouns = (concepts.length)
|
|
1145
|
+
? concepts
|
|
1146
|
+
: (!entity.named && !targetPrimerSlug && archetype !== 'crates'
|
|
1147
|
+
? conceptNouns(query, store)
|
|
1148
|
+
: []);
|
|
1149
|
+
// For ruview, a concept what-is query may also softly boost the glossary section (a real defining
|
|
1150
|
+
// doc still wins when present, because the glossary is short/synthesized with a worse distance).
|
|
1151
|
+
const glossarySlug = archetype === 'whatis-concept' ? resolvePrimerSlug('glossary', store, byPath) : null;
|
|
1152
|
+
const adrNums = adrNumbers(query);
|
|
1153
|
+
const intent = codeDocIntent(query); // 'code' | 'design' | null
|
|
1154
|
+
// FIX A — implementation ("how is X coded") intent: demote wrong-file types (vendored deps,
|
|
1155
|
+
// bare entrypoints, manifests), promote the named crate's own src/**/*.rs.
|
|
1156
|
+
const implIntent = isImplIntent(query);
|
|
1157
|
+
const implCrateTok = (implIntent && entity.crates.length) ? entity.crates[0] : null;
|
|
1158
|
+
const implOpNouns = implIntent ? implOperationNouns(query, entity.crates) : [];
|
|
1159
|
+
// FIX D — named crate(s) WITHOUT impl intent: gently scope to the named crate's own source.
|
|
1160
|
+
// BUT NOT for a definitional "what is X" / overview query: there the user wants the concept doc,
|
|
1161
|
+
// and a crate-family token in the name (e.g. "wifi-densepose") must NOT boost crate src over the
|
|
1162
|
+
// plain-answer PRIMER. So suppress on the whatis-concept archetype + product-overview queries.
|
|
1163
|
+
const definitionalQ = archetype === 'whatis-concept' || isProductOverviewQuery(query, store);
|
|
1164
|
+
const namedCrateToks = (!implIntent && !definitionalQ && entity.crates.length) ? entity.crates : [];
|
|
1165
|
+
const namedOpNouns = namedCrateToks.length ? implOperationNouns(query, entity.crates) : [];
|
|
1166
|
+
// FIX C — crate-scoped maturity question: prefer the crate's OWN README/BENCHMARK over the
|
|
1167
|
+
// global capabilities primer / cross-crate benchmark doc.
|
|
1168
|
+
const crateMaturityTok = crateMaturityTarget(query, entity.crates);
|
|
1169
|
+
|
|
1170
|
+
const db = await RvfDatabase.openReadonly(conf.rvf);
|
|
1171
|
+
let hits;
|
|
1172
|
+
try {
|
|
1173
|
+
// Fetch plenty of raw chunk hits so we have material to group into documents and rerank.
|
|
1174
|
+
hits = await db.query(qv, Math.max(RAW_HITS, k * 4));
|
|
1175
|
+
} finally {
|
|
1176
|
+
await db.close();
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
// FIX 1 — collapse chunk hits into documents keyed by path; doc score = best (min) distance.
|
|
1180
|
+
const docs = new Map(); // path -> { path, title, bestDistance, matchedId }
|
|
1181
|
+
for (const h of hits) {
|
|
1182
|
+
const rec = byId.get(String(h.id));
|
|
1183
|
+
if (!rec) continue;
|
|
1184
|
+
const cur = docs.get(rec.path);
|
|
1185
|
+
if (!cur || h.distance < cur.bestDistance) {
|
|
1186
|
+
docs.set(rec.path, { path: rec.path, title: rec.title, bestDistance: h.distance, matchedId: rec.id });
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
// INTENT: ensure force-routed targets are IN the candidate pool even if MiniLM ranked them out
|
|
1191
|
+
// of the raw window. (a) the target PRIMER slug for an orientation archetype; (b) the real ADR
|
|
1192
|
+
// document for an exact ADR-NNN query. We synthesize a doc entry from byPath so it can be ranked
|
|
1193
|
+
// and then hard-boosted below. Without this, a force-route could point at a doc not in `docs`.
|
|
1194
|
+
const ensureDoc = (p) => {
|
|
1195
|
+
if (!p || docs.has(p)) return;
|
|
1196
|
+
const chunks = byPath.get(p);
|
|
1197
|
+
if (!chunks || !chunks.length) return;
|
|
1198
|
+
docs.set(p, { path: p, title: chunks[0].title, bestDistance: 1.0, matchedId: chunks[0].id });
|
|
1199
|
+
};
|
|
1200
|
+
if (targetPrimerSlug) ensureDoc(targetPrimerSlug);
|
|
1201
|
+
if (glossarySlug) ensureDoc(glossarySlug); // concept query: glossary may softly win for ruview
|
|
1202
|
+
// Config disambiguation: pull any PRIMER section named in a matched rule's goodSource into the
|
|
1203
|
+
// pool so its goodBoost can rank it even if bge ranked the synthesized section out of the window.
|
|
1204
|
+
// The goodSource names a PRIMER by PREFIX (e.g. "PRIMER#3-what-is-ruqu-made-of"); resolve it to
|
|
1205
|
+
// the actual full slug path(s) in byPath (e.g. "...-the-five-crates") so ensureDoc finds them.
|
|
1206
|
+
for (const pp of disambigPrimerTargets(query, store)) {
|
|
1207
|
+
if (byPath.has(pp)) { ensureDoc(pp); continue; }
|
|
1208
|
+
for (const realPath of byPath.keys()) {
|
|
1209
|
+
if (realPath.startsWith(pp) && PRIMER_PATH_RE.test(realPath)) ensureDoc(realPath);
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
1212
|
+
// For an exact ADR query, find the real ADR doc path(s) by scanning the passages index.
|
|
1213
|
+
const adrDocPaths = [];
|
|
1214
|
+
if (adrNums.length) {
|
|
1215
|
+
for (const num of adrNums) {
|
|
1216
|
+
for (const p of byPath.keys()) {
|
|
1217
|
+
if (pathIsAdrDoc(p, num) && !PRIMER_PATH_RE.test(p)) { adrDocPaths.push(p); ensureDoc(p); }
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
}
|
|
1221
|
+
// FIX 3 — a concept what-is query's DEFINING doc may sit OUTSIDE the raw vector window (the defining
|
|
1222
|
+
// ADR can be titled by its codename, not the concept). Pull any doc whose FILENAME SLUG names a
|
|
1223
|
+
// concept noun into the candidate pool so the strengthened concept boost can rank it; the boost
|
|
1224
|
+
// (not a hard route) decides whether it actually wins. Capped scan to stay cheap.
|
|
1225
|
+
if (definingNouns.length) {
|
|
1226
|
+
let added = 0;
|
|
1227
|
+
for (const p of byPath.keys()) {
|
|
1228
|
+
if (docs.has(p) || PRIMER_PATH_RE.test(p)) continue;
|
|
1229
|
+
const slug = (p.split('/').pop() || '').toLowerCase();
|
|
1230
|
+
if (definingNouns.some((t) => slug.includes(t))) { ensureDoc(p); if (++added >= 40) break; }
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
// FIX 2 — ensure the targeted crate's README/BENCHMARK/docs are in the pool even if MiniLM ranked
|
|
1234
|
+
// them out (the harness file is often the closer vector match).
|
|
1235
|
+
if (crateOverviewTok) {
|
|
1236
|
+
const esc = crateOverviewTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
1237
|
+
const cre = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/(?:readme|benchmark)\\.md$`), 'i');
|
|
1238
|
+
for (const p of byPath.keys()) { if (cre.test(p)) ensureDoc(p); }
|
|
1239
|
+
}
|
|
1240
|
+
// FIX A — for an implementation query naming a component, pull that component's own src/** source
|
|
1241
|
+
// into the pool so the real algorithm source can be promoted above a vendored copy / entrypoint.
|
|
1242
|
+
if (implCrateTok) {
|
|
1243
|
+
const esc = implCrateTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
1244
|
+
const srcRe = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/src/.+\\.(?:rs|ts|tsx|js|mjs|py)$`), 'i');
|
|
1245
|
+
let added = 0;
|
|
1246
|
+
for (const p of byPath.keys()) {
|
|
1247
|
+
if (docs.has(p) || /(?:^|\/)main\.rs$/i.test(p)) continue;
|
|
1248
|
+
if (srcRe.test(p)) { ensureDoc(p); if (++added >= 40) break; }
|
|
1249
|
+
}
|
|
1250
|
+
}
|
|
1251
|
+
// FIX D — for a (non-impl) query naming component(s), pull each named component's own src/** into
|
|
1252
|
+
// the pool so the named component's source can be promoted above a sibling / example / bridge.
|
|
1253
|
+
for (const c of namedCrateToks) {
|
|
1254
|
+
const esc = c.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
1255
|
+
const srcRe = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/src/.+\\.(?:rs|ts|tsx|js|mjs|py)$`), 'i');
|
|
1256
|
+
let added = 0;
|
|
1257
|
+
for (const p of byPath.keys()) {
|
|
1258
|
+
if (docs.has(p) || /(?:^|\/)main\.rs$/i.test(p)) continue;
|
|
1259
|
+
if (srcRe.test(p)) { ensureDoc(p); if (++added >= 40) break; }
|
|
1260
|
+
}
|
|
1261
|
+
}
|
|
1262
|
+
// FIX C — for a component-maturity query, ensure the component's own README/BENCHMARK are pooled.
|
|
1263
|
+
if (crateMaturityTok) {
|
|
1264
|
+
const esc = crateMaturityTok.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
1265
|
+
const cre = new RegExp(componentPrefixSrc(store, `${esc}(?:-[a-z0-9-]+)?/(?:readme|benchmark)\\.md$`), 'i');
|
|
1266
|
+
for (const p of byPath.keys()) { if (cre.test(p)) ensureDoc(p); }
|
|
1267
|
+
}
|
|
1268
|
+
|
|
1269
|
+
// FIXes 2/3/4 + INTENT — compute effective distance per document.
|
|
1270
|
+
// Hard routes use a large negative adjustment so the routed doc wins decisively; intent tilts
|
|
1271
|
+
// are gentle (break ties / nudge) so they don't override a clearly-better vector match.
|
|
1272
|
+
const HARD_WIN = 5.0; // dominates any plausible distance + penalty (force #1)
|
|
1273
|
+
const ranked = [...docs.values()].map((d) => {
|
|
1274
|
+
const dkind = byPathKind.get(d.path) || null;
|
|
1275
|
+
const pen = demotionPenalty(query, d.path);
|
|
1276
|
+
const boost = lexicalBoost(query, terms, d.path, d.title);
|
|
1277
|
+
const seed = seedAdjust(query, d.path, store);
|
|
1278
|
+
const sub = substanceBoost(byPath.get(d.path));
|
|
1279
|
+
// FIX 1 — when a specific entity is named, the generic orientation lift is suppressed AND
|
|
1280
|
+
// primer-orientation docs are demoted below source/adr/crate-src/doc for this query.
|
|
1281
|
+
const suppressOrient = entity.named || archetype === 'whatis-concept';
|
|
1282
|
+
const orient = orientationBoost(query, d.path, d.title, suppressOrient); // FIX 5 — top-down orientation layer
|
|
1283
|
+
const primerDemote = (entity.named && dkind === 'primer-orientation') ? PRIMER_DEMOTE_WHEN_SPECIFIC : 0;
|
|
1284
|
+
// FIX 2 — crate-overview / metric: boost the crate's README/BENCHMARK/docs, demote its harness.
|
|
1285
|
+
const crateAdj = crateOverviewAdjust(crateOverviewTok, d.path, store); // negative=boost, positive=demote
|
|
1286
|
+
// FIX A — implementation intent: demote vendored deps / entrypoints / manifest, promote the
|
|
1287
|
+
// named component's own src/**/* (extra for an operation-token-matching filename).
|
|
1288
|
+
const implAdj = implIntent ? implAdjust(d.path, implCrateTok, implOpNouns, store) : 0;
|
|
1289
|
+
// FIX D — named component (no impl verb): gently promote the named component's own source.
|
|
1290
|
+
const namedCrateAdj = namedCrateToks.length ? namedCrateAdjust(d.path, namedCrateToks, namedOpNouns, store) : 0;
|
|
1291
|
+
// FIX B — targeted off-topic-magnet down-weight (config disambiguation/offtopicMagnets).
|
|
1292
|
+
const magnetPen = offtopicMagnetPenalty(query, d.path, store);
|
|
1293
|
+
// FIX C — crate-maturity: boost the named component's OWN README/BENCHMARK.
|
|
1294
|
+
const matAdj = crateMaturityAdjust(crateMaturityTok, d.path, store); // negative=boost
|
|
1295
|
+
// FIX 1/3 — concept boost: nudge docs whose slug/title names the concept (defining doc beats
|
|
1296
|
+
// adjacent); extra nudge for the glossary section on a whatis-concept query.
|
|
1297
|
+
let concept = conceptBoost(definingNouns, d.path, d.title);
|
|
1298
|
+
if (glossarySlug && d.path === glossarySlug && concepts.length) concept += 0.06;
|
|
1299
|
+
let intentAdj = 0;
|
|
1300
|
+
|
|
1301
|
+
// INTENT (1) — orientation archetype force-route to the matching PRIMER slug.
|
|
1302
|
+
if (targetPrimerSlug && d.path === targetPrimerSlug) intentAdj += HARD_WIN;
|
|
1303
|
+
|
|
1304
|
+
// INTENT (2) — exact ADR-by-number hard route to the real ADR doc (must beat the index table).
|
|
1305
|
+
if (adrDocPaths.includes(d.path)) intentAdj += HARD_WIN;
|
|
1306
|
+
|
|
1307
|
+
// INTENT (3) — code-vs-doc tilt. Use the doc's content kind from the metadata sidecar.
|
|
1308
|
+
if (intent) {
|
|
1309
|
+
const kind = byPathKind.get(d.path);
|
|
1310
|
+
if (intent === 'code' && isSourceKind(kind)) intentAdj += 0.30; // prefer real code body
|
|
1311
|
+
if (intent === 'design' && kind === 'adr') intentAdj += 0.22; // prefer the ADR/doc
|
|
1312
|
+
}
|
|
1313
|
+
|
|
1314
|
+
// RANK SCALE — the additive offsets below were calibrated against MiniLM's distance scale
|
|
1315
|
+
// (relevant ~0.9–1.1). bge-base packs relevant docs much tighter (~0.4–0.8), so the same
|
|
1316
|
+
// raw offset over-corrects and inverts good raw rankings. Scale the WHOLE offset bundle by
|
|
1317
|
+
// the per-variant rankScale (small=1.0 → unchanged; big<1 → gentler, trusts bge's raw order).
|
|
1318
|
+
const RANK_SCALE = conf.embedCfg.rankScale ?? 1;
|
|
1319
|
+
const offsets = pen - boost + seed - sub - orient - concept - intentAdj
|
|
1320
|
+
+ primerDemote + crateAdj + implAdj + namedCrateAdj + magnetPen + matAdj;
|
|
1321
|
+
const effDistance = d.bestDistance + RANK_SCALE * offsets;
|
|
1322
|
+
return { ...d, effDistance, kind: dkind };
|
|
1323
|
+
}).sort((a, b) => a.effDistance - b.effDistance);
|
|
1324
|
+
|
|
1325
|
+
// INTENT (4) — ADR-vs-code pairing for completeness. If #1 is an ADR carrying a Status: header
|
|
1326
|
+
// (a proposal/decision = intent, not built reality) and NO source doc is in the top-N, pull the
|
|
1327
|
+
// best-matching source doc into the returned set so the reader sees proposal vs built code. We
|
|
1328
|
+
// only ADD a result (and tag it); we never displace the routed #1 or break whole-doc return.
|
|
1329
|
+
let pairedSource = null;
|
|
1330
|
+
if (ranked.length) {
|
|
1331
|
+
const top = ranked[0];
|
|
1332
|
+
const topKind = top.kind || byPathKind.get(top.path);
|
|
1333
|
+
if (topKind === 'adr' && adrHasStatus(byPath.get(top.path))) {
|
|
1334
|
+
const inTop = ranked.slice(0, topN).some((d) => isSourceKind(d.kind));
|
|
1335
|
+
if (!inTop) {
|
|
1336
|
+
pairedSource = ranked.find((d) => isSourceKind(d.kind)) || null;
|
|
1337
|
+
}
|
|
1338
|
+
}
|
|
1339
|
+
}
|
|
1340
|
+
|
|
1341
|
+
// FIX 1 — assemble the FULL document for the top-N distinct documents.
|
|
1342
|
+
const assemble = (d, label) => {
|
|
1343
|
+
const chunks = byPath.get(d.path) || [];
|
|
1344
|
+
const { fullText, chunksJoined, truncated } = chunks.length
|
|
1345
|
+
? assembleDocument(chunks, d.matchedId)
|
|
1346
|
+
: { fullText: '(NO PASSAGE TEXT — path not found in sidecar)', chunksJoined: 0, truncated: false };
|
|
1347
|
+
// FIX 4 — parse + surface the ADR Status (Proposed/Accepted/Implemented/…) for ADR docs (or any
|
|
1348
|
+
// doc whose head carries a Status block). `adrStatus` rides the result object so the MCP path
|
|
1349
|
+
// carries it too; `statusLabel` is the human-visible header tag.
|
|
1350
|
+
const kind = d.kind || byPathKind.get(d.path) || null;
|
|
1351
|
+
const adrStatus = (kind === 'adr' || adrHasStatus(chunks)) ? parseAdrStatus(chunks) : null;
|
|
1352
|
+
const statusLabel = adrStatus
|
|
1353
|
+
? `ADR STATUS: ${adrStatus}${statusIsProposed(adrStatus)
|
|
1354
|
+
? ' — design intent, NOT confirmed shipped'
|
|
1355
|
+
: ' — accepted/implemented'}`
|
|
1356
|
+
: null;
|
|
1357
|
+
return {
|
|
1358
|
+
path: d.path,
|
|
1359
|
+
title: d.title,
|
|
1360
|
+
fullText,
|
|
1361
|
+
bestDistance: d.bestDistance,
|
|
1362
|
+
effDistance: d.effDistance,
|
|
1363
|
+
kind,
|
|
1364
|
+
adrStatus, // FIX 4 — parsed ADR status (null if none) — carried to MCP
|
|
1365
|
+
statusLabel, // FIX 4 — human-visible "[ADR STATUS: …]" tag
|
|
1366
|
+
label: label || null, // intent label (e.g. 'paired-source') for callers/UI
|
|
1367
|
+
chunksJoined,
|
|
1368
|
+
truncated,
|
|
1369
|
+
// back-compat aliases for callers that still read .text / .distance
|
|
1370
|
+
text: fullText,
|
|
1371
|
+
distance: d.bestDistance,
|
|
1372
|
+
};
|
|
1373
|
+
};
|
|
1374
|
+
|
|
1375
|
+
const out = ranked.slice(0, topN).map((d) => assemble(d));
|
|
1376
|
+
|
|
1377
|
+
// INTENT (4) — append the paired implementing source so proposal-vs-built-reality is visible.
|
|
1378
|
+
// Appended (not inserted) so the routed/ranked order — including the whole-doc #1 ADR — is intact.
|
|
1379
|
+
if (pairedSource && !out.some((r) => r.path === pairedSource.path)) {
|
|
1380
|
+
out.push(assemble(pairedSource, 'paired-source (implements the ADR above)'));
|
|
1381
|
+
}
|
|
1382
|
+
|
|
1383
|
+
// FIX 4 — proposal-as-reality guard. If the #1 result is a Proposed/not-yet-Implemented ADR (or a
|
|
1384
|
+
// design/DDD doc with no parseable status) AND no kind:'source' implementing file is in the set,
|
|
1385
|
+
// attach a clear design-intent warning so the reader never treats a proposal as shipped reality.
|
|
1386
|
+
if (out.length) {
|
|
1387
|
+
const top = out[0];
|
|
1388
|
+
const isDesignTop = top.kind === 'adr'
|
|
1389
|
+
|| top.kind === 'doc' || top.kind === 'doc-deep' || top.kind === 'ddd';
|
|
1390
|
+
const proposed = top.adrStatus ? statusIsProposed(top.adrStatus) : (top.kind !== 'source' && top.kind !== 'crate-src');
|
|
1391
|
+
const hasSource = out.some((r) => isSourceKind(r.kind));
|
|
1392
|
+
if (isDesignTop && proposed && !hasSource) {
|
|
1393
|
+
const st = top.adrStatus || 'unstated (design/DDD doc)';
|
|
1394
|
+
top.designIntentWarning =
|
|
1395
|
+
`⚠ This is design intent (ADR status: ${st}); no implementing source was retrieved — `
|
|
1396
|
+
+ `treat as proposed, not confirmed built.`;
|
|
1397
|
+
}
|
|
1398
|
+
}
|
|
1399
|
+
|
|
1400
|
+
return out;
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1403
|
+
// ---------- structured lookups (exact, not semantic) ----------
|
|
1404
|
+
// The drop-in for-ai/ ships <store>-symbols.json / -dep-graph.json / -entrypoints.json. These give
|
|
1405
|
+
// an AI EXACT answers (a function signature, who-depends-on-what, the build/test/run commands)
|
|
1406
|
+
// without a vector search. Exported so the MCP server can surface them too.
|
|
1407
|
+
/**
 * Load the three structured sidecar files that sit in a store's directory.
 *
 * Each sidecar is read from `<storeDir(store)>/<store>-<suffix>.json` for the suffixes
 * `symbols`, `dep-graph` and `entrypoints`. Any failure while locating, reading or parsing a
 * sidecar is swallowed and reported as `null` for that slot, so callers must null-check.
 *
 * @param {string} store - Store name; selects the directory and prefixes each filename.
 * @returns {{symbols: *, depGraph: *, entrypoints: *}} Parsed JSON per sidecar, `null` where unavailable.
 */
export function loadStructured(store) {
  const baseDir = storeDir(store);

  // Best-effort reader: a missing file, unreadable file or invalid JSON all collapse to null.
  function readSidecar(suffix) {
    try {
      const file = path.join(baseDir, `${store}-${suffix}.json`);
      const raw = fs.readFileSync(file, 'utf8');
      return JSON.parse(raw);
    } catch {
      return null;
    }
  }

  const symbols = readSidecar('symbols');
  const depGraph = readSidecar('dep-graph');
  const entrypoints = readSidecar('entrypoints');
  return { symbols, depGraph, entrypoints };
}
|
|
1412
|
+
|
|
1413
|
+
/**
 * Exact (non-semantic) symbol lookup against the store's `<store>-symbols.json` sidecar.
 *
 * A symbol matches when `name` is empty, or when `name` is a case-insensitive substring of the
 * symbol's `name` or `module`. An optional `kind` narrows the matches to that exact kind.
 *
 * @param {string} store - Store whose symbol sidecar is consulted.
 * @param {string} name - Substring to search for; empty/falsy matches every symbol.
 * @param {{kind?: string, limit?: number}} [options] - `kind` filters by exact symbol kind;
 *   `limit` caps the number of returned matches (default 25).
 * @returns {{available: boolean, count: number, method?: *, matches: Array}} `available` is false
 *   when the sidecar could not be loaded. `count` is the total number of matches before `limit`
 *   is applied; `matches` holds at most `limit` entries.
 */
export function lookupSymbol(store, name, { kind, limit = 25 } = {}) {
  const { symbols } = loadStructured(store);
  // Keep the result shape uniform (count is always present) so callers can read it blindly.
  if (!symbols) return { available: false, count: 0, matches: [] };

  // A sidecar that parsed but lacks the expected `symbols` array is treated as empty, not fatal.
  const entries = Array.isArray(symbols.symbols) ? symbols.symbols : [];
  const needle = String(name || '').toLowerCase();

  const matches = entries.filter((s) => {
    if (!s) return false; // tolerate null/undefined entries in a hand-edited or partial sidecar
    if (kind && s.kind !== kind) return false;
    if (!needle) return true;
    // String(... ?? '') guards entries that have no name/module instead of throwing on them.
    const symName = String(s.name ?? '').toLowerCase();
    const symModule = String(s.module ?? '').toLowerCase();
    return symName.includes(needle) || symModule.includes(needle);
  });

  return { available: true, count: matches.length, method: symbols.method, matches: matches.slice(0, limit) };
}
|
|
1423
|
+
|
|
1424
|
+
// Exact entrypoints + dep-graph passthroughs.
|
|
1425
|
+
/**
 * Passthrough to the store's entrypoints sidecar.
 *
 * @param {string} store - Store name.
 * @returns {*} The `entrypoints` slot produced by `loadStructured(store)`.
 */
export function getEntrypoints(store) {
  const { entrypoints } = loadStructured(store);
  return entrypoints;
}
|
|
1426
|
+
/**
 * Passthrough to the store's dependency-graph sidecar.
 *
 * @param {string} store - Store name.
 * @returns {*} The `depGraph` slot produced by `loadStructured(store)`.
 */
export function getDepGraph(store) {
  const { depGraph } = loadStructured(store);
  return depGraph;
}
|
|
1427
|
+
|
|
1428
|
+
// ---------- CLI ----------
|
|
1429
|
+
/**
 * CLI entry point. Supports two modes:
 *
 *  1. Structured lookups (exact, no vector search), selected by the first of
 *     `--symbol <name>`, `--entrypoints` or `--deps` found on the command line. The store is the
 *     first non-flag argument, falling back to CONFIG_DEFAULT when none is given.
 *  2. Semantic query: `<store> "question" [k] [big|small]`, printing each returned document.
 *
 * Exit codes used here: 2 for a usage error or unknown store, 1 when a requested sidecar is absent.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const argv = process.argv.slice(2);

  // Structured-lookup subcommands (exact, no vector search): --symbol NAME | --entrypoints | --deps.
  const sIdx = argv.findIndex((a) => a === '--symbol' || a === '--entrypoints' || a === '--deps');
  if (sIdx !== -1) {
    const flag = argv[sIdx];
    // The token right after --symbol is the symbol name, never the store. Skipping it lets
    // `--symbol Foo` (no explicit store) fall back to CONFIG_DEFAULT instead of being rejected
    // as "unknown store: Foo".
    const nameIdx = flag === '--symbol' ? sIdx + 1 : -1;
    const store = argv.find((a, i) => i !== nameIdx && !a.startsWith('--')) || CONFIG_DEFAULT;
    if (!knownStore(store)) {
      console.error(`unknown store: ${store}`);
      process.exit(2);
    }

    if (flag === '--symbol') {
      const name = argv[sIdx + 1] || '';
      const r = lookupSymbol(store, name, { limit: 40 });
      if (!r.available) {
        console.error(`no ${store}-symbols.json present (run extract-symbols.mjs)`);
        process.exit(1);
      }
      console.log(`\n=== ${store} symbols matching "${name}" (${r.count} via ${r.method}) ===\n`);
      for (const s of r.matches) {
        // String(... ?? '') keeps the listing going if an entry carries no kind.
        console.log(`${String(s.kind ?? '').padEnd(9)} ${s.signature}\n @ ${s.file}:${s.line}${s.doc ? `\n ${s.doc}` : ''}\n`);
      }
    } else if (flag === '--entrypoints') {
      const e = getEntrypoints(store);
      if (!e) {
        console.error(`no ${store}-entrypoints.json present`);
        process.exit(1);
      }
      console.log(JSON.stringify({ workspace: e.workspace, install: e.install, quickstart: e.quickstart, binaries: e.binaries, commands: e.commands }, null, 2));
    } else {
      const g = getDepGraph(store);
      if (!g) {
        console.error(`no ${store}-dep-graph.json present`);
        process.exit(1);
      }
      console.log(JSON.stringify({ nodes: g.nodes.map((n) => ({ name: n.name, ecosystem: n.ecosystem, description: n.description })), internalEdges: g.internalEdges, externalDepNames: g.externalDepNames }, null, 2));
    }
    return;
  }

  // Optional [big|small] variant selector: the first such token is removed from argv so the
  // remaining positionals are store, query, k. When absent, `variant` stays undefined.
  let variant;
  const vIdx = argv.findIndex((a) => a === 'big' || a === 'small');
  if (vIdx !== -1) variant = argv.splice(vIdx, 1)[0];

  const [store, query, kArg] = argv;
  if (!store || !query) {
    console.error(`Usage: node kb/ask-kb.mjs <${[...KNOWN_STORES].join('|')}> "question" [k] [big|small]`);
    console.error(` or: node kb/ask-kb.mjs <store> --symbol <name> | --entrypoints | --deps`);
    process.exit(2);
  }

  // k defaults to 6 when missing, non-numeric or 0, and is clamped to at least 1.
  const k = Math.max(1, parseInt(kArg || '6', 10) || 6);
  const conf = resolveConf(store, variant);
  const results = await searchKb({ query, k, store, variant });

  console.log(`\n=== ${store} KB (${conf.variant} · ${conf.embedCfg.model}) — "${query}" — top ${results.length} documents ===\n`);
  results.forEach((r, i) => {
    // Header line: raw and effective distance, then optional kind / intent label / ADR status tags.
    console.log(`#${i + 1} distance=${r.bestDistance.toFixed(4)} (eff=${r.effDistance.toFixed(4)})`
      + `${r.kind ? ` kind=${r.kind}` : ''}${r.label ? ` [${r.label}]` : ''}`
      + `${r.statusLabel ? ` [${r.statusLabel}]` : ''}`);
    console.log(`path : ${r.path}`);
    console.log(`title: ${r.title}`);
    if (r.designIntentWarning) console.log(r.designIntentWarning); // proposal-as-reality guard
    console.log(`chars: ${r.fullText.length} | chunks: ${r.chunksJoined}${r.truncated ? ' (truncated)' : ''}`);
    console.log('----- full document -----');
    console.log(r.fullText);
    console.log('===================================================================\n');
  });
}
|
|
1483
|
+
|
|
1484
|
+
// Run as CLI when invoked directly (compare decoded real paths; handles spaces in path).
|
|
1485
|
+
// Start the CLI only when the script path Node was launched with resolves to this module's file;
// when the module is imported by other code, nothing runs here.
if (process.argv[1]) {
  const invokedPath = path.resolve(process.argv[1]);
  const thisFile = path.resolve(__filename);
  if (invokedPath === thisFile) {
    main().catch((err) => {
      // Report the failure message on stderr and exit with status 1.
      console.error('ERROR:', err.message);
      process.exit(1);
    });
  }
}
|