explainmyrepo 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,353 @@
1
+ #!/usr/bin/env node
2
+ // build-kb.mjs — GENERIC, config-driven RVF corpus builder.
3
+ //
4
+ // Replaces the per-repo prototypes (build-ruview-kb.mjs + .build-ruvector-kb/build.mjs): repo
5
+ // shape is now DATA in kb.config.mjs (repoDir, scopeExclude, extension classes, componentRoots,
6
+ // include[] rules). This script:
7
+ // 1. reads the target entry from kb.config.mjs (--target <slug>, default = defaultTarget),
8
+ // 2. force-walks the target's OWN tree only (scopeExclude + .gitmodules submodule paths skipped),
9
+ // 3. dispatches config.include[] through corpus-rules.mjs (the rule-type registry),
10
+ // 4. embeds every chunk with local MiniLM-384 (Xenova/all-MiniLM-L6-v2, quantized ONNX, offline),
11
+ // 5. writes the SMALL store into kb/stores/<slug>/:
12
+ // <slug>-kb.small.rvf (+ .idmap.json written by RVF on close)
13
+ // <slug>-kb.passages.jsonl (full untruncated chunk text, shared with the big variant)
14
+ // <slug>-kb.ids.json (per-id kind/preview index; guard-check + ask-kb read this).
15
+ //
16
+ // Vectors: @ruvector/rvf (RvfDatabase, HNSW, cosine). Embeddings: local @xenova/transformers.
17
+ // Deps resolved PORTABLY via resolve-deps.mjs. NO cloud APIs. close() is the only persist path.
18
+ //
19
+ // Usage: node kb/build-kb.mjs --target agent-harness-generator
20
+ // node kb/build-kb.mjs (uses defaultTarget)
21
+
22
+ import fs from 'node:fs';
23
+ import path from 'node:path';
24
+ import { fileURLToPath } from 'node:url';
25
+ import { loadRvf, loadTransformers, configureModel, chooseModelCache } from './resolve-deps.mjs';
26
+ import { getTarget, defaultTarget } from './kb.config.mjs';
27
+ import { RULE_IMPLS } from './corpus-rules.mjs';
28
+
29
const __dirname = path.dirname(fileURLToPath(import.meta.url)); // kb/

// Embedder defaults (legacy / Seed-compatible): MiniLM-384, written to <slug>-kb.small.rvf.
// A target can replace them with an `embed` block in kb.config.mjs (ADR-0001 v1.3.1, single
// 384-dim desktop variant — e.g. ruvn uses bge-small-en-v1.5, mean-pooled, asymmetric query
// prefix, written to the canonical un-suffixed <slug>-kb.rvf with an .embed.json sidecar that
// ask-kb + index-primer read so the query is embedded with the SAME model).
const DEFAULT_MODEL = 'Xenova/all-MiniLM-L6-v2';
const DEFAULT_DIM = 384;

// Chunk sizing, in characters, budgeted at roughly 4 characters per token.
const CHUNK_CHARS = 1000 * 4; // ~1000 tokens (hard ceiling for the window fallback)
const OVERLAP_CHARS = 400; // characters shared by consecutive window-fallback chunks

// Structure-aware target (ADR-0001 v1.3.0 D5): split at code/document STRUCTURE boundaries,
// keep a doc-comment attached to the symbol it documents, target ≤512 tokens (~2048 chars).
const STRUCT_TARGET_CHARS = 512 * 4; // ≤512 tokens
const STRUCT_MIN_CHARS = 160; // don't emit micro-fragments; coalesce upward
44
+
45
+ // ---------- arg parsing ----------
46
// parseArgs — pull the options this script understands out of the CLI argument list.
//   argv    : array of argument strings (the caller passes process.argv.slice(2)).
//   returns : { target } — the slug given via `--target <slug>` or `--target=<slug>`; when the
//             flag is absent, the imported `defaultTarget`. If the flag is repeated the last
//             occurrence wins. Arguments other than --target are ignored.
//   throws  : Error when --target is given without a value (`--target` as the final argument,
//             or `--target=`). Previously that silently replaced the default with
//             undefined / '' and the failure surfaced later, far from its cause.
function parseArgs(argv) {
  const parsed = { target: defaultTarget };
  const requireValue = (value) => {
    if (value === undefined || value === '') {
      throw new Error('build-kb: --target requires a value (usage: --target <slug>)');
    }
    return value;
  };
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '--target') {
      // Space-separated form: the value is the NEXT argument, so consume it here.
      parsed.target = requireValue(argv[++i]);
    } else if (arg.startsWith('--target=')) {
      parsed.target = requireValue(arg.slice('--target='.length));
    }
  }
  return parsed;
}
54
+
55
+ // ---------- source_type tagging (ADR-0001 v1.3.2: src|test|example|doc|config) ----------
56
// classifySourceType — tag a passage with a coarse source_type so the AI knows what a hit IS
// (production source vs a test vs an example vs prose vs config) and can ask for
// tests/examples as usage docs.
//   relPath : repo-relative path of the source file (may be empty/undefined).
//   kind    : the corpus `kind` the passage was ingested under.
//   returns : one of 'src' | 'test' | 'example' | 'doc' | 'config'.
// Precedence, first match wins:
//   1. path directories / test-style file names (tests/, examples/, benches/ dirs win),
//   2. the `template`, `crate` and `npm` kinds,
//   3. config files: named manifests, .toml/.yaml/.yml/.json extensions, or a `.config.` infix,
//   4. doc kinds or prose extensions, then source kinds or code extensions,
//   5. anything unrecognised is treated as 'doc'.
function classifySourceType(relPath, kind) {
  // Lower-case for case-insensitive matching, and fold Windows separators to '/' because
  // every directory pattern below is written with forward slashes.
  const p = (relPath || '').toLowerCase().replace(/\\/g, '/');
  if (/(^|\/)(tests?|__tests__|spec|__mocks__)\//.test(p) || /[._-]test\.|[._-]spec\.|\.test$/.test(p)) return 'test';
  if (/(^|\/)(examples?|demos?)\//.test(p)) return 'example';
  if (/(^|\/)benches?\//.test(p)) return 'test'; // benchmarks are exercised-usage; group with test
  if (kind === 'template') return 'example'; // scaffolding templates are usage exemplars
  if (kind === 'crate' || kind === 'npm') return 'config';
  // The extension alternatives used to sit inside the `(^|\/)` group, so they only matched
  // names that START with ".toml" / ".json" / ...; ordinary files such as settings.yaml or
  // vite.config.js were never classified as config. Extensions are now matched at the end of
  // the path and `.config.` anywhere in it.
  if (/(^|\/)(cargo\.toml|package\.json|tsconfig\.json)/.test(p)
    || /\.(toml|ya?ml|json)$/.test(p)
    || /\.config\./.test(p)) return 'config';
  if (kind === 'doc' || kind === 'adr' || kind === 'ddd' || kind === 'tutorial' || /\.(md|mdx|txt)$/.test(p)) return 'doc';
  if (kind === 'source' || kind === 'crate-src' || /\.(rs|ts|tsx|js|mjs|cjs|py|go)$/.test(p)) return 'src';
  return 'doc';
}
71
+
72
+ // ---------- shared text helpers ----------
73
// windowChunk — fixed-window fallback splitter, used for a single STRUCTURAL segment that is
// still longer than the hard ceiling (e.g. one enormous function or one giant prose block).
//   text    : the string to split.
//   returns : [text] unchanged when it fits within CHUNK_CHARS; otherwise a list of windows of
//             at most CHUNK_CHARS characters. A window that is not the last one is cut back to
//             the latest "\n\n" at or before its end, provided that break lies past the
//             window's midpoint. Each following window starts OVERLAP_CHARS before the
//             previous window's end, so neighbours share that much text.
function windowChunk(text) {
  const total = text.length;
  if (total <= CHUNK_CHARS) return [text];

  const pieces = [];
  for (let start = 0; start < total;) {
    const hardStop = Math.min(start + CHUNK_CHARS, total);
    let stop = hardStop;
    if (hardStop < total) {
      // Prefer ending on a paragraph break, but never shrink the window below half size.
      const paraBreak = text.lastIndexOf('\n\n', hardStop);
      if (paraBreak > start + CHUNK_CHARS / 2) stop = paraBreak;
    }
    pieces.push(text.slice(start, stop));
    if (stop >= total) break;
    start = stop - OVERLAP_CHARS; // step back so adjacent windows overlap
  }
  return pieces;
}
91
+
92
// structureBoundaries — return the char offsets into `text` where a NEW structural unit
// STARTS, so that a split at any of them keeps documentation attached to the symbol that
// immediately follows it.
//   text    : the document or source text to scan.
//   returns : ascending, de-duplicated array of offsets; offset 0 is always present.
// Boundaries are produced for lines after the first one:
//   • Markdown/prose: heading lines (#, ##, …).
//   • Rust / TS / JS source: symbol lines (pub fn / fn / impl / struct / enum / trait / mod /
//     union / macro_rules! / function / class / export const|let|var|interface|type|enum).
//     For a symbol, the boundary is moved UP over everything attached directly above it:
//       - line doc-comments (//!, ///, "// ", "# ") and one-line /** … */ comments,
//       - multi-line block comments whose inner lines start with `*` (JSDoc style),
//       - single-line Rust attributes (#[...]).
//     Previously only line comments were recognised, so the ` * …` / ` */` lines of a JSDoc
//     block (and any #[...] line) stopped the walk and the doc-comment was split away from
//     the symbol it documents.
function structureBoundaries(text) {
  const lines = text.split('\n');
  let pos = 0;
  // lineStart[k] = char offset of line k; the +1 accounts for the '\n' removed by split().
  const lineStart = lines.map((l) => { const s = pos; pos += l.length + 1; return s; });

  const isHeading = (l) => /^#{1,6}\s/.test(l);
  const isSymbol = (l) => /^\s*(pub\s+)?(async\s+)?(unsafe\s+)?(fn|impl|struct|enum|trait|mod|union|macro_rules!)\b/.test(l)
    || /^\s*(export\s+)?(default\s+)?(async\s+)?(function|class)\b/.test(l)
    || /^\s*export\s+(const|let|var|interface|type|enum)\b/.test(l);
  const isDocComment = (l) => /^\s*(\/\/[!/]|\/\*\*|#\s)/.test(l) || /^\s*\/\/\s/.test(l);
  const isAttribute = (l) => /^\s*#\[/.test(l);
  // A block-comment closer that sits on a comment line of its own (" */", " * text */").
  // Requiring the leading `*` keeps trailing comments on code lines ("x(); /* n */") out.
  const isBlockCommentEnd = (l) => /^\s*\*(.*\*)?\/\s*$/.test(l);
  const isBlockCommentBody = (l) => /^\s*\*/.test(l);
  const isBlockCommentStart = (l) => /^\s*\/\*/.test(l);

  // Index of the top-most line of the documentation run attached directly above a symbol
  // line, or the symbol line itself when nothing is attached.
  const attachedRunTop = (symbolLine) => {
    let top = symbolLine;
    let j = symbolLine - 1;
    while (j >= 0) {
      const l = lines[j];
      if (isDocComment(l) || isAttribute(l)) { top = j; j--; continue; }
      if (isBlockCommentEnd(l)) {
        // Climb the `*`-prefixed body; accept it only if it is headed by a `/*` opener.
        let k = j;
        while (k >= 0 && isBlockCommentBody(lines[k])) k--;
        if (k < 0 || !isBlockCommentStart(lines[k])) break;
        top = k;
        j = k - 1;
        continue;
      }
      break; // blank line or code: the attached run ends here
    }
    return top;
  };

  const bset = new Set([0]);
  for (let i = 1; i < lines.length; i++) {
    if (isHeading(lines[i])) { bset.add(lineStart[i]); continue; }
    if (isSymbol(lines[i])) bset.add(lineStart[attachedRunTop(i)]);
  }
  return [...bset].sort((a, z) => a - z);
}
125
+
126
// chunk — STRUCTURE-AWARE chunker (ADR-0001 v1.3.0 D5).
//   text    : full text of one source document.
//   returns : array of chunk strings.
// Steps:
//   1. Text no longer than STRUCT_TARGET_CHARS is returned as a single chunk.
//   2. Otherwise it is cut at the offsets reported by structureBoundaries(); when that finds
//      nothing beyond offset 0 the whole text goes to the windowChunk() fallback.
//   3. Adjacent units are coalesced: the pending buffer is emitted before appending a unit
//      that would push it past STRUCT_TARGET_CHARS, but only once the buffer has reached
//      STRUCT_MIN_CHARS (so micro-fragments merge upward instead of standing alone).
//   4. A single unit longer than CHUNK_CHARS is window-split on its own.
// A pending buffer that is whitespace-only is discarded rather than emitted; if nothing at
// all was emitted, the original text is returned as the only chunk.
function chunk(text) {
  if (text.length <= STRUCT_TARGET_CHARS) return [text];

  const starts = structureBoundaries(text);
  if (starts.length <= 1) return windowChunk(text); // no structure found → window fallback

  // One raw unit per boundary, running up to the next boundary (or the end of the text).
  const units = starts.map((from, idx) => {
    const to = idx + 1 < starts.length ? starts[idx + 1] : text.length;
    return text.slice(from, to);
  });

  const chunks = [];
  let pending = '';
  const emitPending = () => {
    if (pending.trim()) chunks.push(pending);
    pending = '';
  };

  for (const unit of units) {
    if (unit.length > CHUNK_CHARS) {
      // A single huge unit: emit what is buffered, then window-split the unit alone.
      emitPending();
      chunks.push(...windowChunk(unit));
      continue;
    }
    const wouldOverflow = pending.length + unit.length > STRUCT_TARGET_CHARS;
    if (wouldOverflow && pending.length >= STRUCT_MIN_CHARS) emitPending();
    pending += unit;
  }
  emitPending();

  return chunks.length > 0 ? chunks : [text];
}
157
+
158
+ // ---------- build context factory ----------
159
// makeContext — build the `ctx` object the corpus rules consume for one target.
//   target  : a target entry; this function reads target.repoDir (resolved against this
//             script's directory), target.slug (error message only) and target.scopeExclude
//             (directory NAMES to skip at any depth).
//   returns : { repoDir, walk, rel, addDoc, alreadyIngested, isFullBody, markFullBody,
//               entries, sourceCounts }.
//   throws  : Error when the resolved repoDir does not exist.
// The walk honors scopeExclude and the `path = …` entries of <repoDir>/.gitmodules. It is a
// force-walk: a repo with no readable .gitmodules walks its whole tree, which is the
// intended AHG case (Constraint A).
function makeContext(target) {
  const repoDir = path.resolve(__dirname, target.repoDir);
  if (!fs.existsSync(repoDir)) {
    const override = process.env.KB_REPO_DIR;
    const hint = override
      ? `(KB_REPO_DIR override: ${override} — check that the clone path exists)`
      : '(clone into .targets/ for direct use, or pass via tools/build-kb.mjs which sets KB_REPO_DIR)';
    throw new Error(`build-kb: repoDir not found for target "${target.slug}": ${repoDir} ${hint}`);
  }

  const excludedDirNames = new Set(target.scopeExclude || []);

  // Absolute paths of submodule checkouts, so nested submodules (external upstream repos)
  // are never indexed. An unreadable/absent .gitmodules leaves this set empty.
  const submoduleRoots = new Set();
  let gitmodulesText = null;
  try {
    gitmodulesText = fs.readFileSync(path.join(repoDir, '.gitmodules'), 'utf8');
  } catch {
    gitmodulesText = null;
  }
  if (gitmodulesText) {
    for (const match of gitmodulesText.matchAll(/^\s*path\s*=\s*(.+?)\s*$/gm)) {
      submoduleRoots.add(path.resolve(repoDir, match[1].trim()));
    }
  }
  // True when `candidate` is a submodule root or lies underneath one.
  const inSubmodule = (candidate) =>
    [...submoduleRoots].some((root) => candidate === root || candidate.startsWith(root + path.sep));

  // Depth-first generator over regular files under `dir`, children visited in
  // name.localeCompare order. Unreadable directories are skipped silently; entries that are
  // neither a directory nor a regular file are ignored.
  function* walk(dir) {
    let children;
    try {
      children = fs.readdirSync(dir, { withFileTypes: true }).sort((a, b) => a.name.localeCompare(b.name));
    } catch {
      return;
    }
    for (const child of children) {
      const full = path.join(dir, child.name);
      if (child.isDirectory()) {
        const skipped = excludedDirNames.has(child.name) || inSubmodule(full);
        if (!skipped) yield* walk(full);
        continue;
      }
      if (child.isFile()) yield full;
    }
  }

  // Path of `p` relative to the repo root.
  const rel = (p) => path.relative(repoDir, p);

  // Corpus accumulation state.
  const entries = []; // { path, kind, source_type, title, chunkIdx, chunkTotal, text }
  const sourceCounts = {}; // kind -> source-file count
  const ingestedPaths = new Set(); // absolute paths already ingested (md-sweep / literal de-dup)
  const fullBodyPaths = new Set(); // absolute paths ingested as full source bodies

  // Chunk one source document and append an entry per chunk; bump the per-kind source count
  // and, when an absolute path is supplied, remember it as ingested.
  function addDoc(relPath, kind, title, text, absPath) {
    const source_type = classifySourceType(relPath, kind);
    const pieces = chunk(text);
    const chunkTotal = pieces.length;
    for (let chunkIdx = 0; chunkIdx < chunkTotal; chunkIdx++) {
      entries.push({
        path: relPath, kind, source_type, title, chunkIdx, chunkTotal, text: pieces[chunkIdx],
      });
    }
    sourceCounts[kind] = (sourceCounts[kind] || 0) + 1;
    if (absPath) ingestedPaths.add(absPath);
  }

  return {
    repoDir,
    walk,
    rel,
    addDoc,
    alreadyIngested: (absPath) => ingestedPaths.has(absPath),
    isFullBody: (absPath) => fullBodyPaths.has(absPath),
    markFullBody: (absPath) => { if (absPath) fullBodyPaths.add(absPath); },
    entries,
    sourceCounts,
  };
}
230
+
231
+ // ---------- main ----------
232
// main — build the vector store for one target, end to end:
//   1. resolve the target (CLI --target or the default) and its embedder settings,
//   2. run the target's include[] rules to accumulate the chunk corpus,
//   3. embed the chunks in batches and ingest them into a fresh RVF store,
//   4. write the passages JSONL, the ids index and (for an overridden embedder) the
//      .embed.json sidecar,
//   5. reconcile chunk / vector / passage / id counts.
// Exits the process with status 1 when the corpus is empty or the reconcile fails; any thrown
// error propagates to the caller at the bottom of the file, which logs it and exits 1.
async function main() {
  const { target: slug } = parseArgs(process.argv.slice(2));
  const target = getTarget(slug);

  // Resolve the embedder for this target (per-target override beats the MiniLM default).
  const emb = target.embed || {};
  const MODEL = emb.model || DEFAULT_MODEL;
  const DIM = emb.dim || DEFAULT_DIM;
  const POOLING = emb.pooling || 'mean';
  const QUERY_PREFIX = emb.queryPrefix || ''; // passages are embedded WITHOUT the prefix
  const RANK_SCALE = typeof emb.rankScale === 'number' ? emb.rankScale : 1.0;
  // Output naming: an overridden embedder writes the canonical un-suffixed <slug>-kb.rvf
  // (single 384-dim desktop variant, recipe v1.3.1). The MiniLM default keeps .small.rvf for
  // backward-compat with the already-shipped reference repo.
  const RVF_SUFFIX = emb.rvfSuffix || (target.embed ? '.rvf' : '.small.rvf');
  console.log(`[build-kb] target=${slug} metaName=${target.metaName} | model=${MODEL} dim=${DIM} pooling=${POOLING} out=${slug}-kb${RVF_SUFFIX}`);

  const ctx = makeContext(target);
  console.log(`[build-kb] repoDir=${ctx.repoDir}`);

  // ---- run the configured include rules in order ----
  for (const rule of target.include || []) {
    const impl = RULE_IMPLS[rule.rule];
    if (!impl) { console.error(`[build-kb] unknown include rule "${rule.rule}" — skipped`); continue; }
    const n = impl(ctx, rule);
    console.log(`[build-kb] rule ${rule.rule.padEnd(18)} -> ${n} source(s)`);
  }

  const entries = ctx.entries;
  console.log('=== CORPUS (source files per kind) ===');
  console.log(JSON.stringify(ctx.sourceCounts, null, 2));
  const kindTotals = {};
  for (const e of entries) kindTotals[e.kind] = (kindTotals[e.kind] || 0) + 1;
  console.log('Chunks per kind:', JSON.stringify(kindTotals));
  console.log('Total chunks to embed:', entries.length);
  console.log('Distinct source paths:', new Set(entries.map((e) => e.path)).size);
  if (!entries.length) { console.error('[build-kb] corpus is EMPTY — nothing to build'); process.exit(1); }

  // ---- output layout: kb/stores/<slug>/ ----
  const storeDir = path.join(__dirname, 'stores', slug);
  fs.mkdirSync(storeDir, { recursive: true });
  const base = path.join(storeDir, `${slug}-kb`);
  const OUT_RVF = `${base}${RVF_SUFFIX}`;
  const OUT_PASSAGES = `${base}.passages.jsonl`;
  const OUT_IDS = `${base}.ids.json`;
  const OUT_EMBEDCFG = `${OUT_RVF}.embed.json`;

  // ---- embedder (offline-first; remote download only if model not cached) ----
  const { mod: rvfMod, via: rvfVia } = loadRvf();
  const { RvfDatabase } = rvfMod;
  console.log('[build-kb] @ruvector/rvf via:', rvfVia);
  const { T, via: tVia } = await loadTransformers();
  const cache = chooseModelCache(MODEL);
  const { haveLocalModel } = configureModel(T, cache, MODEL);
  console.log(`[build-kb] transformers via ${tVia} | model ${haveLocalModel ? 'local' : 'remote'} (${cache})`);
  const fe = await T.pipeline('feature-extraction', MODEL, { quantized: true });

  // ---- fresh store + sidecars ----
  fs.rmSync(OUT_RVF, { force: true });
  fs.rmSync(OUT_RVF + '.idmap.json', { force: true });
  const db = await RvfDatabase.create(OUT_RVF, { dimensions: DIM, metric: 'cosine' });

  const idsIndex = {}; // id -> { path, kind, source_type, title, chunk, of, preview }
  fs.rmSync(OUT_PASSAGES, { force: true });
  const passagesFd = fs.openSync(OUT_PASSAGES, 'w');
  let passageLines = 0;
  const BATCH = 32;
  let ingested = 0;
  // try/finally: the passages descriptor is released even when embedding or ingest throws.
  try {
    for (let i = 0; i < entries.length; i += BATCH) {
      const batch = entries.slice(i, i + BATCH);
      // PASSAGES are embedded WITHOUT the query prefix (bge asymmetric retrieval).
      const out = await fe(batch.map((e) => e.text), { pooling: POOLING, normalize: true });
      const dim = out.dims[1];
      if (dim !== DIM) throw new Error(`embed dim ${dim} != ${DIM}`);
      const ingest = batch.map((e, j) => {
        const id = String(i + j + 1); // ids are 1-based positions in `entries`
        idsIndex[id] = {
          path: e.path, kind: e.kind, source_type: e.source_type, title: e.title,
          chunk: e.chunkIdx + 1, of: e.chunkTotal,
          preview: e.text.slice(0, 240).replace(/\s+/g, ' '),
        };
        fs.writeSync(passagesFd, JSON.stringify({ id, text: e.text, path: e.path, title: e.title, source_type: e.source_type, kind: e.kind }) + '\n');
        passageLines++;
        return {
          id,
          // Row j of the flat batch output: `dim` consecutive values.
          vector: Float32Array.from(out.data.slice(j * dim, (j + 1) * dim)),
          metadata: { path: e.path, kind: e.kind, source_type: e.source_type, title: e.title, chunk: e.chunkIdx },
        };
      });
      const r = await db.ingestBatch(ingest);
      ingested += r.accepted;
      if (r.rejected) console.error('REJECTED', r.rejected, 'in batch at', i);
      // Progress line every 20th batch.
      if ((i / BATCH) % 20 === 0) process.stdout.write(`\r${i + batch.length}/${entries.length}`);
    }
  } finally {
    fs.closeSync(passagesFd);
  }
  console.log(`\n[build-kb] ingested ${ingested} vectors | passages lines ${passageLines}`);

  const status = await db.status();
  await db.close(); // ONLY persist path

  fs.writeFileSync(OUT_IDS, JSON.stringify({
    model: MODEL, dimensions: DIM, metric: 'cosine', entries: idsIndex,
  }, null, 0));

  // Embedder sidecar (read by ask-kb + index-primer so the QUERY/primer use the SAME model).
  // Only written for an overridden embedder; the MiniLM default needs none (ask-kb's default).
  if (target.embed) {
    fs.writeFileSync(OUT_EMBEDCFG, JSON.stringify({
      model: MODEL, pooling: POOLING, normalize: true, queryPrefix: QUERY_PREFIX, rankScale: RANK_SCALE,
    }, null, 2));
  }

  const idCount = Object.keys(idsIndex).length;
  const ok = status.totalVectors === entries.length && passageLines === entries.length && idCount === entries.length;
  console.log('=== POST-INGEST ===');
  console.log('RVF status:', JSON.stringify(status));
  console.log(`Reconcile: chunks=${entries.length} vectors=${status.totalVectors} passages=${passageLines} ids=${idCount} match=${ok}`);
  if (!ok) { console.error('[build-kb] RECONCILE FAILED'); process.exit(1); }
  console.log(`[build-kb] OK -> ${path.relative(__dirname, OUT_RVF)} (+passages,ids) | size ${fs.statSync(OUT_RVF).size} bytes`);
}

// Entry point: any error escaping main() is logged and turned into exit status 1.
main().catch((e) => { console.error('[build-kb] ERROR:', e); process.exit(1); });