moflo 4.8.4 → 4.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,892 +1,899 @@
1
- #!/usr/bin/env node
2
- /**
3
- * Index guidance files into claude-flow memory with full RAG linked segments
4
- *
5
- * Strategy:
6
- * - Full documents stored as `doc-{name}` for complete retrieval
7
- * - Semantic chunks stored as `chunk-{name}-{n}` for precise search
8
- * - FULL RAG LINKING:
9
- * - parentDoc: link to full document
10
- * - prevChunk/nextChunk: forward/backward navigation
11
- * - siblings: all chunk keys from same document
12
- * - children: sub-chunks for hierarchical headers (h2 -> h3)
13
- * - contextBefore/contextAfter: overlapping text for context continuity
14
- * - Chunking based on markdown headers (## and ###) for natural boundaries
15
- * - After indexing, generates embeddings for semantic search (HNSW)
16
- *
17
- * Usage:
18
- * node node_modules/moflo/bin/index-guidance.mjs # Index all + generate embeddings
19
- * npx flo-index --force # Force reindex all
20
- * npx flo-index --file X # Index specific file
21
- * npx flo-index --no-embeddings # Skip embedding generation
22
- * npx flo-index --overlap 20 # Set context overlap % (default: 15)
23
- */
24
-
25
- import { existsSync, readdirSync, readFileSync, statSync, mkdirSync, writeFileSync } from 'fs';
26
- import { resolve, dirname, basename, extname } from 'path';
27
- import { fileURLToPath } from 'url';
28
- import { mofloResolveURL } from './lib/moflo-resolve.mjs';
29
- const initSqlJs = (await import(mofloResolveURL('sql.js'))).default;
30
-
31
-
32
- const __dirname = dirname(fileURLToPath(import.meta.url));
33
-
34
- function findProjectRoot() {
35
- let dir = process.cwd();
36
- const root = resolve(dir, '/');
37
- while (dir !== root) {
38
- if (existsSync(resolve(dir, 'package.json'))) return dir;
39
- dir = dirname(dir);
40
- }
41
- return process.cwd();
42
- }
43
-
44
- const projectRoot = findProjectRoot();
45
-
46
- // Locate the moflo package root (for bundled guidance that ships with moflo)
47
- const mofloRoot = resolve(__dirname, '..');
48
-
49
- const NAMESPACE = 'guidance';
50
- const DB_PATH = resolve(projectRoot, '.swarm/memory.db');
51
-
52
- // ============================================================================
53
- // Load guidance directories from moflo.yaml, falling back to defaults
54
- // ============================================================================
55
-
56
- function loadGuidanceDirs() {
57
- const dirs = [];
58
-
59
- // 1. Read moflo.yaml / moflo.config.json for user-configured directories
60
- let configDirs = null;
61
- const yamlPath = resolve(projectRoot, 'moflo.yaml');
62
- const jsonPath = resolve(projectRoot, 'moflo.config.json');
63
-
64
- if (existsSync(yamlPath)) {
65
- try {
66
- const content = readFileSync(yamlPath, 'utf-8');
67
- // Simple YAML array extraction — avoids needing js-yaml at runtime
68
- // Matches: guidance:\n directories:\n - .claude/guidance\n - docs/guides
69
- const guidanceBlock = content.match(/guidance:\s*\n\s+directories:\s*\n((?:\s+-\s+.+\n?)+)/);
70
- if (guidanceBlock) {
71
- const items = guidanceBlock[1].match(/-\s+(.+)/g);
72
- if (items && items.length > 0) {
73
- configDirs = items.map(item => item.replace(/^-\s+/, '').trim());
74
- }
75
- }
76
- } catch { /* ignore parse errors, fall through to defaults */ }
77
- } else if (existsSync(jsonPath)) {
78
- try {
79
- const raw = JSON.parse(readFileSync(jsonPath, 'utf-8'));
80
- if (raw.guidance?.directories && Array.isArray(raw.guidance.directories)) {
81
- configDirs = raw.guidance.directories;
82
- }
83
- } catch { /* ignore parse errors */ }
84
- }
85
-
86
- // Use config dirs or fall back to defaults
87
- const userDirs = configDirs || ['.claude/guidance', 'docs/guides'];
88
- for (const d of userDirs) {
89
- dirs.push({ path: d, prefix: 'guidance' });
90
- }
91
-
92
- // 2. Include moflo's own bundled guidance (ships with the package)
93
- // Only when running inside a consumer project (not moflo itself)
94
- const bundledGuidanceDir = resolve(mofloRoot, '.claude/guidance');
95
- const projectGuidanceDir = resolve(projectRoot, '.claude/guidance');
96
- if (
97
- existsSync(bundledGuidanceDir) &&
98
- resolve(bundledGuidanceDir) !== resolve(projectGuidanceDir)
99
- ) {
100
- dirs.push({ path: bundledGuidanceDir, prefix: 'moflo-bundled', absolute: true });
101
- }
102
-
103
- // 3. Include CLAUDE.md from project root if it exists
104
- // This is the primary project instruction file for Claude-enabled projects
105
- const claudeMdPath = resolve(projectRoot, 'CLAUDE.md');
106
- if (existsSync(claudeMdPath)) {
107
- dirs.push({ path: projectRoot, prefix: 'project-root', absolute: true, fileFilter: ['CLAUDE.md'] });
108
- }
109
-
110
- return dirs;
111
- }
112
-
113
- const GUIDANCE_DIRS = loadGuidanceDirs();
114
-
115
- // Chunking config - optimized for Claude's retrieval
116
- const MIN_CHUNK_SIZE = 50; // Lower minimum to avoid mega-chunks
117
- const MAX_CHUNK_SIZE = 4000; // Larger chunks for code-heavy docs (fits context better)
118
- const FORCE_CHUNK_THRESHOLD = 6000; // Force paragraph-split if file > this and < 3 chunks
119
- const DEFAULT_OVERLAP_PERCENT = 20; // Increased context overlap for better continuity
120
-
121
- // Parse args
122
- const args = process.argv.slice(2);
123
- const force = args.includes('--force');
124
- const specificFile = args.includes('--file') ? args[args.indexOf('--file') + 1] : null;
125
- const verbose = args.includes('--verbose') || args.includes('-v');
126
- const skipEmbeddings = args.includes('--no-embeddings');
127
- const overlapPercent = args.includes('--overlap')
128
- ? parseInt(args[args.indexOf('--overlap') + 1], 10) || DEFAULT_OVERLAP_PERCENT
129
- : DEFAULT_OVERLAP_PERCENT;
130
-
131
- function log(msg) {
132
- console.log(`[index-guidance] ${msg}`);
133
- }
134
-
135
- function debug(msg) {
136
- if (verbose) console.log(`[index-guidance] ${msg}`);
137
- }
138
-
139
- function ensureDbDir() {
140
- const dir = dirname(DB_PATH);
141
- if (!existsSync(dir)) {
142
- mkdirSync(dir, { recursive: true });
143
- }
144
- }
145
-
146
- async function getDb() {
147
- ensureDbDir();
148
- const SQL = await initSqlJs();
149
- let db;
150
- if (existsSync(DB_PATH)) {
151
- const buffer = readFileSync(DB_PATH);
152
- db = new SQL.Database(buffer);
153
- } else {
154
- db = new SQL.Database();
155
- }
156
-
157
- // Ensure table exists with unique constraint
158
- db.run(`
159
- CREATE TABLE IF NOT EXISTS memory_entries (
160
- id TEXT PRIMARY KEY,
161
- key TEXT NOT NULL,
162
- namespace TEXT DEFAULT 'default',
163
- content TEXT NOT NULL,
164
- type TEXT DEFAULT 'semantic',
165
- embedding TEXT,
166
- embedding_model TEXT DEFAULT 'local',
167
- embedding_dimensions INTEGER,
168
- tags TEXT,
169
- metadata TEXT,
170
- owner_id TEXT,
171
- created_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now') * 1000),
172
- updated_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now') * 1000),
173
- expires_at INTEGER,
174
- last_accessed_at INTEGER,
175
- access_count INTEGER DEFAULT 0,
176
- status TEXT DEFAULT 'active',
177
- UNIQUE(namespace, key)
178
- )
179
- `);
180
-
181
- db.run(`CREATE INDEX IF NOT EXISTS idx_memory_key_ns ON memory_entries(key, namespace)`);
182
- db.run(`CREATE INDEX IF NOT EXISTS idx_memory_namespace ON memory_entries(namespace)`);
183
-
184
- return db;
185
- }
186
-
187
- function saveDb(db) {
188
- const data = db.export();
189
- writeFileSync(DB_PATH, Buffer.from(data));
190
- }
191
-
192
- function generateId() {
193
- return `mem_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
194
- }
195
-
196
- function hashContent(content) {
197
- let hash = 0;
198
- for (let i = 0; i < content.length; i++) {
199
- const char = content.charCodeAt(i);
200
- hash = ((hash << 5) - hash) + char;
201
- hash = hash & hash;
202
- }
203
- return hash.toString(16);
204
- }
205
-
206
- function storeEntry(db, key, content, metadata = {}, tags = []) {
207
- const now = Date.now();
208
- const id = generateId();
209
- const metaJson = JSON.stringify(metadata);
210
- const tagsJson = JSON.stringify(tags);
211
-
212
- db.run(`
213
- INSERT OR REPLACE INTO memory_entries
214
- (id, key, namespace, content, metadata, tags, created_at, updated_at, status)
215
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active')
216
- `, [id, key, NAMESPACE, content, metaJson, tagsJson, now, now]);
217
-
218
- return true;
219
- }
220
-
221
- function deleteByPrefix(db, prefix) {
222
- db.run(`DELETE FROM memory_entries WHERE namespace = ? AND key LIKE ?`, [NAMESPACE, `${prefix}%`]);
223
- }
224
-
225
- function getEntryHash(db, key) {
226
- const stmt = db.prepare('SELECT metadata FROM memory_entries WHERE key = ? AND namespace = ?');
227
- stmt.bind([key, NAMESPACE]);
228
- const entry = stmt.step() ? stmt.getAsObject() : null;
229
- stmt.free();
230
- if (entry?.metadata) {
231
- try {
232
- const meta = JSON.parse(entry.metadata);
233
- return meta.contentHash;
234
- } catch { /* ignore */ }
235
- }
236
- return null;
237
- }
238
-
239
- /**
240
- * Extract overlapping context from adjacent text
241
- * @param {string} text - The text to extract from
242
- * @param {number} percent - Percentage of text to extract
243
- * @param {string} position - 'start' or 'end'
244
- * @returns {string} - The extracted context
245
- */
246
- function extractOverlapContext(text, percent, position) {
247
- if (!text || percent <= 0) return '';
248
-
249
- const targetLength = Math.floor(text.length * (percent / 100));
250
- if (targetLength < 20) return ''; // Too short to be useful
251
-
252
- if (position === 'start') {
253
- // Get first N% of text, try to break at sentence/paragraph
254
- let end = targetLength;
255
- const nextPara = text.indexOf('\n\n', targetLength - 50);
256
- const nextSentence = text.indexOf('. ', targetLength - 30);
257
-
258
- if (nextPara > 0 && nextPara < targetLength + 100) {
259
- end = nextPara;
260
- } else if (nextSentence > 0 && nextSentence < targetLength + 50) {
261
- end = nextSentence + 1;
262
- }
263
-
264
- return text.substring(0, end).trim();
265
- } else {
266
- // Get last N% of text, try to break at sentence/paragraph
267
- let start = text.length - targetLength;
268
- const prevPara = text.lastIndexOf('\n\n', start + 50);
269
- const prevSentence = text.lastIndexOf('. ', start + 30);
270
-
271
- if (prevPara > 0 && prevPara > start - 100) {
272
- start = prevPara + 2;
273
- } else if (prevSentence > 0 && prevSentence > start - 50) {
274
- start = prevSentence + 2;
275
- }
276
-
277
- return text.substring(start).trim();
278
- }
279
- }
280
-
281
- /**
282
- * Split markdown content into semantic chunks based on headers
283
- * Returns array of { title, content, level, headerLine }
284
- */
285
- function chunkMarkdown(content, fileName) {
286
- const lines = content.split('\n');
287
- const chunks = [];
288
- let currentChunk = { title: fileName, content: [], level: 0, headerLine: 0 };
289
-
290
- for (let lineNum = 0; lineNum < lines.length; lineNum++) {
291
- // Strip CRLF carriage returns for Windows compatibility
292
- const line = lines[lineNum].replace(/\r$/, '');
293
-
294
- // Check for headers (## and ###)
295
- const h2Match = line.match(/^## (.+)$/);
296
- const h3Match = line.match(/^### (.+)$/);
297
-
298
- if (h2Match || h3Match) {
299
- // Save current chunk if it has content
300
- if (currentChunk.content.length > 0) {
301
- const chunkContent = currentChunk.content.join('\n').trim();
302
- if (chunkContent.length >= MIN_CHUNK_SIZE) {
303
- chunks.push({
304
- title: currentChunk.title,
305
- content: chunkContent,
306
- level: currentChunk.level,
307
- headerLine: currentChunk.headerLine
308
- });
309
- }
310
- }
311
-
312
- // Start new chunk
313
- currentChunk = {
314
- title: h2Match ? h2Match[1] : h3Match[1],
315
- content: [line],
316
- level: h2Match ? 2 : 3,
317
- headerLine: lineNum
318
- };
319
- } else {
320
- currentChunk.content.push(line);
321
- }
322
- }
323
-
324
- // Don't forget the last chunk
325
- if (currentChunk.content.length > 0) {
326
- const chunkContent = currentChunk.content.join('\n').trim();
327
- if (chunkContent.length >= MIN_CHUNK_SIZE) {
328
- chunks.push({
329
- title: currentChunk.title,
330
- content: chunkContent,
331
- level: currentChunk.level,
332
- headerLine: currentChunk.headerLine
333
- });
334
- }
335
- }
336
-
337
- // Handle chunks that are too large - split by paragraphs
338
- const finalChunks = [];
339
- for (const chunk of chunks) {
340
- if (chunk.content.length > MAX_CHUNK_SIZE) {
341
- const paragraphs = chunk.content.split(/\n\n+/);
342
- let currentPart = [];
343
- let currentLength = 0;
344
- let partNum = 1;
345
-
346
- for (const para of paragraphs) {
347
- if (currentLength + para.length > MAX_CHUNK_SIZE && currentPart.length > 0) {
348
- finalChunks.push({
349
- title: `${chunk.title} (part ${partNum})`,
350
- content: currentPart.join('\n\n'),
351
- level: chunk.level,
352
- headerLine: chunk.headerLine,
353
- isPart: true,
354
- partNum
355
- });
356
- currentPart = [para];
357
- currentLength = para.length;
358
- partNum++;
359
- } else {
360
- currentPart.push(para);
361
- currentLength += para.length;
362
- }
363
- }
364
-
365
- if (currentPart.length > 0) {
366
- finalChunks.push({
367
- title: partNum > 1 ? `${chunk.title} (part ${partNum})` : chunk.title,
368
- content: currentPart.join('\n\n'),
369
- level: chunk.level,
370
- headerLine: chunk.headerLine,
371
- isPart: partNum > 1,
372
- partNum: partNum > 1 ? partNum : undefined
373
- });
374
- }
375
- } else {
376
- finalChunks.push(chunk);
377
- }
378
- }
379
-
380
- // FORCE CHUNKING: If file is large but resulted in few chunks, split by sections
381
- const totalContent = finalChunks.reduce((acc, c) => acc + c.content.length, 0);
382
- if (totalContent > FORCE_CHUNK_THRESHOLD && finalChunks.length < 3) {
383
- debug(` Force-chunking: ${totalContent} bytes in ${finalChunks.length} chunks - splitting by sections`);
384
- const allContent = finalChunks.map(c => c.content).join('\n\n');
385
-
386
- // Split on --- horizontal rules first, then on ## headers, then on paragraphs
387
- const TARGET_CHUNK_SIZE = 2500;
388
- const rawSections = allContent.split(/\n---+\n/);
389
- let sections = [];
390
-
391
- for (const raw of rawSections) {
392
- // Further split on ## headers if section is too large
393
- if (raw.length > TARGET_CHUNK_SIZE) {
394
- const headerSplit = raw.split(/\n(?=## )/);
395
- for (const hSect of headerSplit) {
396
- if (hSect.length > TARGET_CHUNK_SIZE) {
397
- // Split very long sections on single newlines as last resort
398
- const lines = hSect.split('\n');
399
- let chunk = '';
400
- for (const line of lines) {
401
- if (chunk.length + line.length > TARGET_CHUNK_SIZE && chunk.length > 100) {
402
- sections.push(chunk.trim());
403
- chunk = line;
404
- } else {
405
- chunk += (chunk ? '\n' : '') + line;
406
- }
407
- }
408
- if (chunk.trim().length > 30) sections.push(chunk.trim());
409
- } else if (hSect.trim().length > 30) {
410
- sections.push(hSect.trim());
411
- }
412
- }
413
- } else if (raw.trim().length > 30) {
414
- sections.push(raw.trim());
415
- }
416
- }
417
-
418
- // Now group sections into chunks
419
- const forcedChunks = [];
420
- let currentGroup = [];
421
- let currentLength = 0;
422
- let groupNum = 1;
423
-
424
- const flushGroup = () => {
425
- if (currentGroup.length === 0) return;
426
- const firstLine = currentGroup[0].split('\n')[0].trim();
427
- const title = firstLine.startsWith('#')
428
- ? firstLine.replace(/^#+\s*/, '').slice(0, 60)
429
- : `${fileName} Section ${groupNum}`;
430
-
431
- forcedChunks.push({
432
- title,
433
- content: currentGroup.join('\n\n'),
434
- level: 2,
435
- headerLine: 0,
436
- isForced: true,
437
- forceNum: groupNum
438
- });
439
- groupNum++;
440
- currentGroup = [];
441
- currentLength = 0;
442
- };
443
-
444
- for (const section of sections) {
445
- if (currentLength + section.length > TARGET_CHUNK_SIZE && currentGroup.length > 0) {
446
- flushGroup();
447
- }
448
- currentGroup.push(section);
449
- currentLength += section.length;
450
- }
451
- flushGroup();
452
-
453
- // Always use force-chunked results if we got multiple chunks
454
- if (forcedChunks.length >= 2) {
455
- debug(` Force-chunking produced ${forcedChunks.length} chunks (was ${finalChunks.length})`);
456
- return forcedChunks;
457
- }
458
- }
459
-
460
- return finalChunks;
461
- }
462
-
463
- /**
464
- * Build hierarchical relationships between chunks
465
- * H2 chunks are parents of subsequent H3 chunks
466
- */
467
- function buildHierarchy(chunks, chunkPrefix) {
468
- const hierarchy = {};
469
- let currentH2Index = null;
470
-
471
- for (let i = 0; i < chunks.length; i++) {
472
- const chunk = chunks[i];
473
- const chunkKey = `${chunkPrefix}-${i}`;
474
-
475
- hierarchy[chunkKey] = {
476
- parent: null,
477
- children: []
478
- };
479
-
480
- if (chunk.level === 2) {
481
- currentH2Index = i;
482
- } else if (chunk.level === 3 && currentH2Index !== null) {
483
- const parentKey = `${chunkPrefix}-${currentH2Index}`;
484
- hierarchy[chunkKey].parent = parentKey;
485
- hierarchy[parentKey].children.push(chunkKey);
486
- }
487
- }
488
-
489
- return hierarchy;
490
- }
491
-
492
- function indexFile(db, filePath, keyPrefix) {
493
- const fileName = basename(filePath, extname(filePath));
494
- const docKey = `doc-${keyPrefix}-${fileName}`;
495
- const chunkPrefix = `chunk-${keyPrefix}-${fileName}`;
496
-
497
- try {
498
- const content = readFileSync(filePath, 'utf-8');
499
- const contentHash = hashContent(content);
500
-
501
- // Check if content changed (skip if same hash unless --force)
502
- if (!force) {
503
- const existingHash = getEntryHash(db, docKey);
504
- if (existingHash === contentHash) {
505
- return { docKey, status: 'unchanged', chunks: 0 };
506
- }
507
- }
508
-
509
- const stats = statSync(filePath);
510
- const relativePath = filePath.replace(projectRoot, '').replace(/\\/g, '/');
511
-
512
- // Delete old chunks for this file before re-indexing
513
- deleteByPrefix(db, chunkPrefix);
514
-
515
- // 1. Store full document
516
- const docMetadata = {
517
- type: 'document',
518
- filePath: relativePath,
519
- fileSize: stats.size,
520
- lastModified: stats.mtime.toISOString(),
521
- contentHash,
522
- indexedAt: new Date().toISOString(),
523
- ragVersion: '2.0', // Mark as full RAG indexed
524
- };
525
-
526
- storeEntry(db, docKey, content, docMetadata, [keyPrefix, 'document']);
527
- debug(`Stored document: ${docKey}`);
528
-
529
- // 2. Chunk and store semantic pieces with full RAG linking
530
- const chunks = chunkMarkdown(content, fileName);
531
-
532
- if (chunks.length === 0) {
533
- return { docKey, status: 'indexed', chunks: 0 };
534
- }
535
-
536
- // Build hierarchy and sibling list
537
- const hierarchy = buildHierarchy(chunks, chunkPrefix);
538
- const siblings = chunks.map((_, i) => `${chunkPrefix}-${i}`);
539
-
540
- // Update document with children references
541
- const docChildrenMeta = {
542
- ...docMetadata,
543
- children: siblings,
544
- chunkCount: chunks.length,
545
- };
546
- storeEntry(db, docKey, content, docChildrenMeta, [keyPrefix, 'document']);
547
-
548
- for (let i = 0; i < chunks.length; i++) {
549
- const chunk = chunks[i];
550
- const chunkKey = `${chunkPrefix}-${i}`;
551
-
552
- // Build prev/next links
553
- const prevChunk = i > 0 ? `${chunkPrefix}-${i - 1}` : null;
554
- const nextChunk = i < chunks.length - 1 ? `${chunkPrefix}-${i + 1}` : null;
555
-
556
- // Extract overlapping context from adjacent chunks
557
- const contextBefore = i > 0
558
- ? extractOverlapContext(chunks[i - 1].content, overlapPercent, 'end')
559
- : null;
560
- const contextAfter = i < chunks.length - 1
561
- ? extractOverlapContext(chunks[i + 1].content, overlapPercent, 'start')
562
- : null;
563
-
564
- // Get hierarchical relationships
565
- const hierInfo = hierarchy[chunkKey];
566
-
567
- const chunkMetadata = {
568
- type: 'chunk',
569
- ragVersion: '2.0',
570
-
571
- // Document relationship
572
- parentDoc: docKey,
573
- parentPath: relativePath,
574
-
575
- // Sequential navigation (forward/backward links)
576
- chunkIndex: i,
577
- totalChunks: chunks.length,
578
- prevChunk,
579
- nextChunk,
580
-
581
- // Sibling awareness
582
- siblings,
583
-
584
- // Hierarchical relationships (h2 -> h3)
585
- hierarchicalParent: hierInfo.parent,
586
- hierarchicalChildren: hierInfo.children.length > 0 ? hierInfo.children : null,
587
-
588
- // Chunk info
589
- chunkTitle: chunk.title,
590
- headerLevel: chunk.level,
591
- headerLine: chunk.headerLine,
592
- isPart: chunk.isPart || false,
593
- partNum: chunk.partNum || null,
594
-
595
- // Overlapping context for continuity
596
- contextOverlapPercent: overlapPercent,
597
- hasContextBefore: !!contextBefore,
598
- hasContextAfter: !!contextAfter,
599
-
600
- // Content metadata
601
- contentLength: chunk.content.length,
602
- contentHash: hashContent(chunk.content),
603
- indexedAt: new Date().toISOString(),
604
- };
605
-
606
- // Build searchable content with title context
607
- // Include overlap context for better retrieval
608
- let searchableContent = `# ${chunk.title}\n\n`;
609
-
610
- if (contextBefore) {
611
- searchableContent += `[Context from previous section:]\n${contextBefore}\n\n---\n\n`;
612
- }
613
-
614
- searchableContent += chunk.content;
615
-
616
- if (contextAfter) {
617
- searchableContent += `\n\n---\n\n[Context from next section:]\n${contextAfter}`;
618
- }
619
-
620
- // Store chunk with full metadata
621
- storeEntry(
622
- db,
623
- chunkKey,
624
- searchableContent,
625
- chunkMetadata,
626
- [keyPrefix, 'chunk', `level-${chunk.level}`, chunk.title.toLowerCase().replace(/[^a-z0-9]+/g, '-')]
627
- );
628
-
629
- debug(` Stored chunk ${i}: ${chunk.title} (${chunk.content.length} chars, prev=${!!prevChunk}, next=${!!nextChunk})`);
630
- }
631
-
632
- return { docKey, status: 'indexed', chunks: chunks.length };
633
- } catch (err) {
634
- return { docKey, status: 'error', error: err.message, chunks: 0 };
635
- }
636
- }
637
-
638
- /**
639
- * Recursively collect all .md files under a directory.
640
- * Skips node_modules, .git, and other non-content directories.
641
- */
642
- function walkMdFiles(dir) {
643
- const SKIP_DIRS = new Set(['node_modules', '.git', 'dist', 'build', 'coverage', '.next']);
644
- const files = [];
645
-
646
- function walk(current) {
647
- if (!existsSync(current)) return;
648
- for (const entry of readdirSync(current, { withFileTypes: true })) {
649
- if (entry.isDirectory()) {
650
- if (!SKIP_DIRS.has(entry.name)) walk(resolve(current, entry.name));
651
- } else if (entry.isFile() && entry.name.endsWith('.md')) {
652
- files.push(resolve(current, entry.name));
653
- }
654
- }
655
- }
656
-
657
- walk(dir);
658
- return files;
659
- }
660
-
661
- function indexDirectory(db, dirConfig) {
662
- const dirPath = dirConfig.absolute ? dirConfig.path : resolve(projectRoot, dirConfig.path);
663
- const results = [];
664
-
665
- if (!existsSync(dirPath)) {
666
- if (verbose) debug(`Directory not found: ${dirConfig.path}`);
667
- return results;
668
- }
669
-
670
- const allMdFiles = walkMdFiles(dirPath);
671
- const filtered = dirConfig.fileFilter
672
- ? allMdFiles.filter(f => dirConfig.fileFilter.includes(basename(f)))
673
- : allMdFiles;
674
-
675
- for (const filePath of filtered) {
676
- const result = indexFile(db, filePath, dirConfig.prefix);
677
- results.push(result);
678
- }
679
-
680
- return results;
681
- }
682
-
683
- /**
684
- * Remove stale entries for files that no longer exist on disk.
685
- * Runs after indexing to keep the memory DB clean.
686
- */
687
- function cleanStaleEntries(db) {
688
- const docsStmt = db.prepare(
689
- `SELECT DISTINCT key FROM memory_entries WHERE namespace = ? AND key LIKE 'doc-%'`
690
- );
691
- docsStmt.bind([NAMESPACE]);
692
- const docs = [];
693
- while (docsStmt.step()) docs.push(docsStmt.getAsObject());
694
- docsStmt.free();
695
-
696
- let staleCount = 0;
697
-
698
- // Build a lookup of all indexed directory configs for stale detection
699
- const prefixToDirMap = {};
700
- for (const dirConfig of GUIDANCE_DIRS) {
701
- const dirPath = dirConfig.absolute ? dirConfig.path : resolve(projectRoot, dirConfig.path);
702
- prefixToDirMap[dirConfig.prefix] = dirPath;
703
- }
704
-
705
- for (const { key } of docs) {
706
- // Convert key back to file path by matching doc-{prefix}-{filename}
707
- let filePath;
708
- for (const [prefix, dirPath] of Object.entries(prefixToDirMap)) {
709
- const docPrefix = `doc-${prefix}-`;
710
- if (key.startsWith(docPrefix)) {
711
- filePath = resolve(dirPath, key.replace(docPrefix, '') + '.md');
712
- break;
713
- }
714
- }
715
- if (!filePath) continue; // Unknown prefix, skip
716
-
717
- if (!existsSync(filePath)) {
718
- const chunkPrefix = key.replace('doc-', 'chunk-');
719
- const countBefore = db.exec(`SELECT COUNT(*) as cnt FROM memory_entries WHERE namespace = '${NAMESPACE}'`)[0]?.values[0][0] || 0;
720
- db.run(`DELETE FROM memory_entries WHERE namespace = ? AND key LIKE ?`, [NAMESPACE, `${chunkPrefix}%`]);
721
- db.run(`DELETE FROM memory_entries WHERE namespace = ? AND key = ?`, [NAMESPACE, key]);
722
- const countAfter = db.exec(`SELECT COUNT(*) as cnt FROM memory_entries WHERE namespace = '${NAMESPACE}'`)[0]?.values[0][0] || 0;
723
- const removed = countBefore - countAfter;
724
- if (removed > 0) {
725
- log(` Removed ${removed} stale entries for deleted file: ${key}`);
726
- staleCount += removed;
727
- }
728
- }
729
- }
730
-
731
- // Also clean any orphaned entries not matching doc-/chunk- patterns
732
- const orphanStmt = db.prepare(
733
- `SELECT key FROM memory_entries WHERE namespace = ? AND key NOT LIKE 'doc-%' AND key NOT LIKE 'chunk-%'`
734
- );
735
- orphanStmt.bind([NAMESPACE]);
736
- const orphans = [];
737
- while (orphanStmt.step()) orphans.push(orphanStmt.getAsObject());
738
- orphanStmt.free();
739
- for (const { key } of orphans) {
740
- db.run(`DELETE FROM memory_entries WHERE namespace = ? AND key = ?`, [NAMESPACE, key]);
741
- staleCount++;
742
- log(` Removed orphan entry: ${key}`);
743
- }
744
-
745
- return staleCount;
746
- }
747
-
748
- // Main
749
- console.log('');
750
- log('Indexing guidance files with FULL RAG linked segments...');
751
- log(` Context overlap: ${overlapPercent}%`);
752
- log(` Directories (${GUIDANCE_DIRS.length}):`);
753
- for (const d of GUIDANCE_DIRS) {
754
- const dirPath = d.absolute ? d.path : resolve(projectRoot, d.path);
755
- const exists = existsSync(dirPath);
756
- log(` ${exists ? '' : '✗'} ${d.absolute ? dirPath : d.path} [${d.prefix}]`);
757
- }
758
- console.log('');
759
-
760
- const db = await getDb();
761
- let docsIndexed = 0;
762
- let chunksIndexed = 0;
763
- let unchanged = 0;
764
- let errors = 0;
765
-
766
- if (specificFile) {
767
- // Index single file
768
- const filePath = resolve(projectRoot, specificFile);
769
- if (!existsSync(filePath)) {
770
- log(`File not found: ${specificFile}`);
771
- process.exit(1);
772
- }
773
-
774
- let prefix = 'docs';
775
- if (specificFile.includes('.claude/guidance/')) {
776
- prefix = 'guidance';
777
- }
778
-
779
- const result = indexFile(db, filePath, prefix);
780
- log(`${result.docKey}: ${result.status} (${result.chunks} chunks)`);
781
-
782
- if (result.status === 'indexed') {
783
- docsIndexed++;
784
- chunksIndexed += result.chunks;
785
- } else if (result.status === 'unchanged') {
786
- unchanged++;
787
- } else {
788
- errors++;
789
- }
790
- } else {
791
- // Index all directories
792
- for (const dir of GUIDANCE_DIRS) {
793
- log(`Scanning ${dir.path}/...`);
794
- const results = indexDirectory(db, dir);
795
-
796
- for (const result of results) {
797
- if (result.status === 'indexed') {
798
- log(` ${result.docKey} (${result.chunks} chunks)`);
799
- docsIndexed++;
800
- chunksIndexed += result.chunks;
801
- } else if (result.status === 'unchanged') {
802
- unchanged++;
803
- } else {
804
- log(` ❌ ${result.docKey}: ${result.error}`);
805
- errors++;
806
- }
807
- }
808
- }
809
- }
810
-
811
- // Clean stale entries for deleted files (unless indexing a specific file)
812
- let staleRemoved = 0;
813
- if (!specificFile) {
814
- log('Cleaning stale entries for deleted files...');
815
- staleRemoved = cleanStaleEntries(db);
816
- if (staleRemoved === 0) {
817
- log(' No stale entries found');
818
- }
819
- }
820
-
821
- // Write changes back to disk and close
822
- if (docsIndexed > 0 || chunksIndexed > 0 || staleRemoved > 0) saveDb(db);
823
- db.close();
824
-
825
- console.log('');
826
- log('═══════════════════════════════════════════════════════════');
827
- log(' FULL RAG INDEXING COMPLETE');
828
- log('═══════════════════════════════════════════════════════════');
829
- log(` Documents indexed: ${docsIndexed}`);
830
- log(` Chunks created: ${chunksIndexed}`);
831
- log(` Unchanged: ${unchanged}`);
832
- log(` Stale removed: ${staleRemoved}`);
833
- log(` Errors: ${errors}`);
834
- log('');
835
- log(' RAG Features Enabled:');
836
- log(` • Forward/backward links (prevChunk/nextChunk)`);
837
- log(` Sibling awareness (all chunks from same doc)`);
838
- log(` • Hierarchical links (h2 -> h3 parent/children)`);
839
- log(` Context overlap: ${overlapPercent}% (contextBefore/contextAfter)`);
840
- log('═══════════════════════════════════════════════════════════');
841
-
842
- // Generate embeddings for new entries (unless skipped or nothing changed)
843
- // Runs in BACKGROUND to avoid blocking startup
844
- if (!skipEmbeddings && (docsIndexed > 0 || chunksIndexed > 0)) {
845
- console.log('');
846
- log('Spawning embedding generation in background...');
847
-
848
- const { spawn } = await import('child_process');
849
-
850
- // Look for build-embeddings script in multiple locations:
851
- // 1. Shipped with moflo (node_modules/moflo/bin/)
852
- // 2. Project-local (.claude/scripts/)
853
- const mofloScript = resolve(__dirname, 'build-embeddings.mjs');
854
- const projectLocalScript = resolve(projectRoot, '.claude/scripts/build-embeddings.mjs');
855
- const embeddingScript = existsSync(mofloScript) ? mofloScript : projectLocalScript;
856
-
857
- if (existsSync(embeddingScript)) {
858
- const embeddingArgs = ['--namespace', NAMESPACE];
859
-
860
- // Create log file for background process output
861
- const logDir = resolve(projectRoot, '.swarm/logs');
862
- if (!existsSync(logDir)) {
863
- mkdirSync(logDir, { recursive: true });
864
- }
865
- const logFile = resolve(logDir, 'embeddings.log');
866
- const { openSync } = await import('fs');
867
- const out = openSync(logFile, 'a');
868
- const err = openSync(logFile, 'a');
869
-
870
- // Spawn in background - don't wait for completion
871
- const proc = spawn('node', [embeddingScript, ...embeddingArgs], {
872
- stdio: ['ignore', out, err],
873
- cwd: projectRoot,
874
- detached: true,
875
- windowsHide: true // Suppress command windows on Windows
876
- });
877
- proc.unref(); // Allow parent to exit independently
878
-
879
- log(`Background embedding started (PID: ${proc.pid})`);
880
- log(`Log file: .swarm/logs/embeddings.log`);
881
- } else {
882
- log('⚠️ Embedding script not found, skipping embedding generation');
883
- }
884
- } else if (skipEmbeddings) {
885
- log('Skipping embedding generation (--no-embeddings)');
886
- } else {
887
- log('No new content indexed, skipping embedding generation');
888
- }
889
-
890
- if (errors > 0) {
891
- process.exit(1);
892
- }
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Index guidance files into claude-flow memory with full RAG linked segments
4
+ *
5
+ * Strategy:
6
+ * - Full documents stored as `doc-{name}` for complete retrieval
7
+ * - Semantic chunks stored as `chunk-{name}-{n}` for precise search
8
+ * - FULL RAG LINKING:
9
+ * - parentDoc: link to full document
10
+ * - prevChunk/nextChunk: forward/backward navigation
11
+ * - siblings: all chunk keys from same document
12
+ * - children: sub-chunks for hierarchical headers (h2 -> h3)
13
+ * - contextBefore/contextAfter: overlapping text for context continuity
14
+ * - Chunking based on markdown headers (## and ###) for natural boundaries
15
+ * - After indexing, generates embeddings for semantic search (HNSW)
16
+ *
17
+ * Usage:
18
+ * node node_modules/moflo/bin/index-guidance.mjs # Index all + generate embeddings
19
+ * npx flo-index --force # Force reindex all
20
+ * npx flo-index --file X # Index specific file
21
+ * npx flo-index --no-embeddings # Skip embedding generation
22
+ * npx flo-index --overlap 20 # Set context overlap % (default: 20)
23
+ */
24
+
25
+ import { existsSync, readdirSync, readFileSync, statSync, mkdirSync, writeFileSync } from 'fs';
26
+ import { resolve, dirname, basename, extname } from 'path';
27
+ import { fileURLToPath } from 'url';
28
+ import { mofloResolveURL } from './lib/moflo-resolve.mjs';
29
+ const initSqlJs = (await import(mofloResolveURL('sql.js'))).default;
30
+
31
+
32
+ const __dirname = dirname(fileURLToPath(import.meta.url));
33
+
34
/**
 * Locate the project root by walking upward from the current working directory.
 *
 * The first directory (starting with the cwd itself) that contains a
 * `package.json` wins. The filesystem root itself is never probed; when no
 * ancestor matches, the current working directory is returned.
 *
 * @returns {string} Absolute path of the detected project root.
 */
function findProjectRoot() {
  const start = process.cwd();
  const fsRoot = resolve(start, '/');

  for (let candidate = start; candidate !== fsRoot; candidate = dirname(candidate)) {
    const manifest = resolve(candidate, 'package.json');
    if (existsSync(manifest)) {
      return candidate;
    }
  }

  return process.cwd();
}
43
+
44
+ const projectRoot = findProjectRoot();
45
+
46
+ // Locate the moflo package root (for bundled guidance that ships with moflo)
47
+ const mofloRoot = resolve(__dirname, '..');
48
+
49
+ const NAMESPACE = 'guidance';
50
+ const DB_PATH = resolve(projectRoot, '.swarm/memory.db');
51
+
52
+ // ============================================================================
53
+ // Load guidance directories from moflo.yaml, falling back to defaults
54
+ // ============================================================================
55
+
56
/**
 * Build the list of guidance directories to index.
 *
 * Sources, in order:
 *   1. `guidance.directories` from `moflo.yaml` (preferred) or `moflo.config.json`
 *      in the project root; when neither yields a list, the defaults
 *      `.claude/guidance` and `docs/guides` are used.
 *   2. The guidance directory bundled next to this script's package, unless it
 *      is the very same directory as the project's own `.claude/guidance`.
 *
 * Each entry gets a key prefix derived from its path so that files with the
 * same name in different directories do not collide.
 *
 * @returns {Array<{path: string, prefix: string, absolute?: boolean}>}
 */
function loadGuidanceDirs() {
  // Turn one raw YAML list value into a plain path: strips matching
  // surrounding quotes and any trailing ` # comment`.
  const parseYamlListItem = (raw) => {
    const text = raw.trim();
    const quoted = text.match(/^(["'])(.*?)\1\s*(?:#.*)?$/);
    if (quoted) return quoted[2].trim();
    return text.replace(/\s+#.*$/, '').trim();
  };

  const dirs = [];

  // 1. Read moflo.yaml / moflo.config.json for user-configured directories
  let configDirs = null;
  const yamlPath = resolve(projectRoot, 'moflo.yaml');
  const jsonPath = resolve(projectRoot, 'moflo.config.json');

  if (existsSync(yamlPath)) {
    try {
      const content = readFileSync(yamlPath, 'utf-8');
      // Simple YAML array extraction — avoids needing js-yaml at runtime
      // Matches: guidance:\n  directories:\n    - .claude/guidance\n    - docs/guides
      const guidanceBlock = content.match(/guidance:\s*\n\s+directories:\s*\n((?:\s+-\s+.+\n?)+)/);
      if (guidanceBlock) {
        const items = guidanceBlock[1].match(/-\s+(.+)/g) ?? [];
        const parsed = items
          .map((item) => parseYamlListItem(item.replace(/^-\s+/, '')))
          .filter((entry) => entry.length > 0);
        if (parsed.length > 0) {
          configDirs = parsed;
        }
      }
    } catch { /* ignore parse errors, fall through to defaults */ }
  } else if (existsSync(jsonPath)) {
    try {
      const raw = JSON.parse(readFileSync(jsonPath, 'utf-8'));
      if (Array.isArray(raw.guidance?.directories)) {
        // Drop non-string / blank entries: they would crash the prefix
        // derivation below or resolve to the project root itself.
        configDirs = raw.guidance.directories
          .filter((entry) => typeof entry === 'string' && entry.trim() !== '');
      }
    } catch { /* ignore parse errors */ }
  }

  // Use config dirs or fall back to defaults.
  // Each directory gets a unique prefix derived from its path to avoid key
  // collisions when multiple directories contain files with the same name.
  const userDirs = configDirs || ['.claude/guidance', 'docs/guides'];
  for (const d of userDirs) {
    const prefix = d.replace(/\\/g, '/')
      .replace(/^\.claude\//, '')
      .replace(/^back-office\/api\/\.claude\//, 'bo-api-')
      .replace(/^back-office\/ui\/\.claude\//, 'bo-ui-')
      .replace(/[^a-zA-Z0-9-]/g, '-')
      .replace(/-+/g, '-')
      .replace(/^-|-$/g, '') || 'guidance';
    dirs.push({ path: d, prefix });
  }

  // 2. Include moflo's own bundled guidance (ships with the package).
  //    Only when running inside a consumer project (not moflo itself).
  const bundledGuidanceDir = resolve(mofloRoot, '.claude/guidance');
  const projectGuidanceDir = resolve(projectRoot, '.claude/guidance');
  if (
    existsSync(bundledGuidanceDir) &&
    resolve(bundledGuidanceDir) !== resolve(projectGuidanceDir)
  ) {
    dirs.push({ path: bundledGuidanceDir, prefix: 'moflo-bundled', absolute: true });
  }

  // 3. CLAUDE.md files are NOT indexed — Claude loads them into context automatically.
  //    Indexing them wastes vectors and creates duplicate keys across subprojects.

  return dirs;
}
117
+
118
+ const GUIDANCE_DIRS = loadGuidanceDirs();
119
+
120
// Chunking config - optimized for Claude's retrieval
const MIN_CHUNK_SIZE = 50; // Lower minimum to avoid mega-chunks
const MAX_CHUNK_SIZE = 4000; // Larger chunks for code-heavy docs (fits context better)
const FORCE_CHUNK_THRESHOLD = 6000; // Force paragraph-split if file > this and < 3 chunks
const DEFAULT_OVERLAP_PERCENT = 20; // Increased context overlap for better continuity

/**
 * Read the value following `--overlap` from an argument list.
 *
 * Any parseable integer is accepted — including 0, which disables overlap.
 * A missing flag or a non-numeric value yields `fallback`.
 *
 * @param {string[]} argv - Command-line arguments (without node/script).
 * @param {number} fallback - Value used when the flag is absent or invalid.
 * @returns {number} The overlap percentage to use.
 */
function parseOverlapPercent(argv, fallback) {
  const flagIndex = argv.indexOf('--overlap');
  if (flagIndex === -1) return fallback;
  const requested = Number.parseInt(argv[flagIndex + 1], 10);
  return Number.isNaN(requested) ? fallback : requested;
}

// Parse args
const args = process.argv.slice(2);
const force = args.includes('--force');
const specificFile = args.includes('--file') ? args[args.indexOf('--file') + 1] : null;
const verbose = args.includes('--verbose') || args.includes('-v');
const skipEmbeddings = args.includes('--no-embeddings');
const overlapPercent = parseOverlapPercent(args, DEFAULT_OVERLAP_PERCENT);
135
+
136
/**
 * Print a message to stdout, tagged with the script name.
 *
 * @param {string} msg - Text to print.
 */
function log(msg) {
  const tagged = `[index-guidance] ${msg}`;
  console.log(tagged);
}
139
+
140
/**
 * Print a tagged message to stdout, but only when verbose mode is on.
 *
 * @param {string} msg - Text to print.
 */
function debug(msg) {
  if (!verbose) {
    return;
  }
  console.log(`[index-guidance] ${msg}`);
}
143
+
144
/**
 * Make sure the directory that holds the database file exists,
 * creating any missing parent directories.
 */
function ensureDbDir() {
  const parentDir = dirname(DB_PATH);
  if (existsSync(parentDir)) {
    return;
  }
  mkdirSync(parentDir, { recursive: true });
}
150
+
151
/**
 * Open the memory database, loading the existing file when there is one and
 * starting from an empty in-memory database otherwise, then make sure the
 * `memory_entries` table and its indexes exist.
 *
 * @returns {Promise<object>} The opened sql.js database handle.
 */
async function getDb() {
  ensureDbDir();
  const SQL = await initSqlJs();

  const db = existsSync(DB_PATH)
    ? new SQL.Database(readFileSync(DB_PATH))
    : new SQL.Database();

  // Schema bootstrap: every statement is idempotent (IF NOT EXISTS).
  const schemaStatements = [
    // Table with a uniqueness constraint on (namespace, key)
    `
    CREATE TABLE IF NOT EXISTS memory_entries (
      id TEXT PRIMARY KEY,
      key TEXT NOT NULL,
      namespace TEXT DEFAULT 'default',
      content TEXT NOT NULL,
      type TEXT DEFAULT 'semantic',
      embedding TEXT,
      embedding_model TEXT DEFAULT 'local',
      embedding_dimensions INTEGER,
      tags TEXT,
      metadata TEXT,
      owner_id TEXT,
      created_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now') * 1000),
      updated_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now') * 1000),
      expires_at INTEGER,
      last_accessed_at INTEGER,
      access_count INTEGER DEFAULT 0,
      status TEXT DEFAULT 'active',
      UNIQUE(namespace, key)
    )
  `,
    `CREATE INDEX IF NOT EXISTS idx_memory_key_ns ON memory_entries(key, namespace)`,
    `CREATE INDEX IF NOT EXISTS idx_memory_namespace ON memory_entries(namespace)`,
  ];
  for (const statement of schemaStatements) {
    db.run(statement);
  }

  return db;
}
191
+
192
/**
 * Serialize the database and write it to the database file on disk.
 *
 * @param {object} db - Database handle exposing `export()`.
 */
function saveDb(db) {
  const bytes = Buffer.from(db.export());
  writeFileSync(DB_PATH, bytes);
}
196
+
197
/**
 * Create a row id of the form `mem_<epoch-ms>_<random base36 suffix>`.
 *
 * The suffix is taken from `Math.random()` and is at most 9 characters long;
 * it is not cryptographically secure.
 *
 * @returns {string} The generated id.
 */
function generateId() {
  // slice(2, 11) skips the leading "0." and keeps up to 9 characters
  // (same result as the deprecated substr(2, 9)).
  const randomSuffix = Math.random().toString(36).slice(2, 11);
  return `mem_${Date.now()}_${randomSuffix}`;
}
200
+
201
/**
 * Compute a 32-bit rolling hash (h = h * 31 + charCode, wrapped to a signed
 * 32-bit integer) over the UTF-16 code units of a string.
 *
 * @param {string} content - Text to hash.
 * @returns {string} The hash in base 16; negative values carry a leading "-".
 */
function hashContent(content) {
  let acc = 0;
  for (let idx = 0; idx < content.length; idx += 1) {
    // `| 0` wraps the running value back into signed 32-bit range.
    acc = (acc * 31 + content.charCodeAt(idx)) | 0;
  }
  return acc.toString(16);
}
210
+
211
/**
 * Insert (or replace) one entry in the guidance namespace.
 *
 * The row is written with a freshly generated id, the current time as both
 * created and updated timestamps, and status `active`.
 *
 * @param {object} db - Database handle exposing `run(sql, params)`.
 * @param {string} key - Entry key.
 * @param {string} content - Entry content.
 * @param {object} [metadata={}] - Stored as JSON text.
 * @param {string[]} [tags=[]] - Stored as JSON text.
 * @returns {boolean} Always `true`.
 */
function storeEntry(db, key, content, metadata = {}, tags = []) {
  const timestamp = Date.now();
  const params = [
    generateId(),
    key,
    NAMESPACE,
    content,
    JSON.stringify(metadata),
    JSON.stringify(tags),
    timestamp,
    timestamp,
  ];

  db.run(`
    INSERT OR REPLACE INTO memory_entries
    (id, key, namespace, content, metadata, tags, created_at, updated_at, status)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, 'active')
  `, params);

  return true;
}
225
+
226
/**
 * Delete every entry in the guidance namespace whose key starts with `prefix`.
 *
 * The prefix is matched literally: `%`, `_` and `\` inside it are escaped so
 * they are not interpreted as LIKE wildcards (otherwise a prefix such as
 * `chunk-x-my_file` would also match `chunk-x-my-file...`).
 *
 * NOTE: this is still a plain prefix match — `chunk-x-foo` also covers keys
 * that belong to `chunk-x-foo-bar`.
 *
 * @param {object} db - Database handle exposing `run(sql, params)`.
 * @param {string} prefix - Literal key prefix.
 */
function deleteByPrefix(db, prefix) {
  const literalPrefix = String(prefix).replace(/[\\%_]/g, '\\$&');
  db.run(
    `DELETE FROM memory_entries WHERE namespace = ? AND key LIKE ? ESCAPE '\\'`,
    [NAMESPACE, `${literalPrefix}%`]
  );
}
229
+
230
/**
 * Look up the `contentHash` recorded in the metadata of an existing entry.
 *
 * @param {object} db - Database handle exposing `prepare(sql)`.
 * @param {string} key - Entry key within the guidance namespace.
 * @returns {string|null|undefined} The stored hash; `null` when the entry is
 *   missing, has no metadata, or its metadata is not valid JSON; `undefined`
 *   when the metadata parses but carries no `contentHash`.
 */
function getEntryHash(db, key) {
  const stmt = db.prepare('SELECT metadata FROM memory_entries WHERE key = ? AND namespace = ?');
  let entry = null;
  try {
    stmt.bind([key, NAMESPACE]);
    if (stmt.step()) {
      entry = stmt.getAsObject();
    }
  } finally {
    // Always release the prepared statement, even if bind/step throws.
    stmt.free();
  }

  if (!entry?.metadata) {
    return null;
  }
  try {
    return JSON.parse(entry.metadata).contentHash;
  } catch {
    // Unreadable metadata is treated the same as "no hash recorded".
    return null;
  }
}
243
+
244
/**
 * Take a slice from the beginning or the end of a neighbouring chunk so that
 * it can be attached to the current chunk as overlap context.
 *
 * The slice is roughly `percent`% of the text; the cut point is nudged to a
 * nearby paragraph break ("\n\n") or, failing that, a sentence break (". ").
 *
 * @param {string} text - The text to extract from
 * @param {number} percent - Percentage of text to extract
 * @param {string} position - 'start' for the head of the text; any other value takes the tail
 * @returns {string} - The extracted context ('' when there is nothing useful to take)
 */
function extractOverlapContext(text, percent, position) {
  if (!text || percent <= 0) return '';

  const span = Math.floor(text.length * (percent / 100));
  if (span < 20) return ''; // Too short to be useful

  if (position !== 'start') {
    // Tail of the text: prefer to begin right after a paragraph/sentence break.
    const rawStart = text.length - span;
    const paragraphBreak = text.lastIndexOf('\n\n', rawStart + 50);
    const sentenceBreak = text.lastIndexOf('. ', rawStart + 30);

    let from = rawStart;
    if (paragraphBreak > 0 && paragraphBreak > rawStart - 100) {
      from = paragraphBreak + 2;
    } else if (sentenceBreak > 0 && sentenceBreak > rawStart - 50) {
      from = sentenceBreak + 2;
    }
    return text.substring(from).trim();
  }

  // Head of the text: prefer to stop at a paragraph/sentence break.
  const paragraphBreak = text.indexOf('\n\n', span - 50);
  const sentenceBreak = text.indexOf('. ', span - 30);

  let upTo = span;
  if (paragraphBreak > 0 && paragraphBreak < span + 100) {
    upTo = paragraphBreak;
  } else if (sentenceBreak > 0 && sentenceBreak < span + 50) {
    upTo = sentenceBreak + 1; // keep the period
  }
  return text.substring(0, upTo).trim();
}
285
+
286
/**
 * Split markdown content into semantic chunks based on headers.
 *
 * Pipeline:
 *   1. Start a new chunk at every `## ` / `### ` header line. Header-looking
 *      lines inside fenced code blocks (``` or ~~~) are NOT treated as
 *      headers, so a code fence is never cut in half. Chunks shorter than
 *      MIN_CHUNK_SIZE are dropped.
 *   2. Chunks longer than MAX_CHUNK_SIZE are split on blank lines into
 *      "(part N)" pieces.
 *   3. If the result is large (> FORCE_CHUNK_THRESHOLD characters) but has
 *      fewer than 3 chunks, re-split on `---` rules, `## ` headers and finally
 *      single lines, and regroup into ~2500 character sections.
 *
 * @param {string} content - Full markdown text.
 * @param {string} fileName - Used as the title of content before the first header.
 * @returns {Array<object>} Chunks of shape { title, content, level, headerLine },
 *   plus `isPart`/`partNum` on split pieces and `isForced`/`forceNum` on
 *   force-chunked sections.
 */
function chunkMarkdown(content, fileName) {
  const lines = content.split('\n');
  const chunks = [];
  let currentChunk = { title: fileName, content: [], level: 0, headerLine: 0 };
  // Marker (e.g. "```") of the code fence we are currently inside, or null.
  let openFence = null;

  // Emit the chunk being accumulated, unless it is empty or too small.
  const flushCurrentChunk = () => {
    if (currentChunk.content.length === 0) return;
    const chunkContent = currentChunk.content.join('\n').trim();
    if (chunkContent.length < MIN_CHUNK_SIZE) return;
    chunks.push({
      title: currentChunk.title,
      content: chunkContent,
      level: currentChunk.level,
      headerLine: currentChunk.headerLine
    });
  };

  for (let lineNum = 0; lineNum < lines.length; lineNum++) {
    // Strip CRLF carriage returns for Windows compatibility
    const line = lines[lineNum].replace(/\r$/, '');

    // Track fenced code blocks: a fence closes only on the same character
    // with at least as many repetitions as the opening marker.
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
    if (fence) {
      const marker = fence[1];
      if (openFence === null) {
        openFence = marker;
      } else if (marker[0] === openFence[0] && marker.length >= openFence.length) {
        openFence = null;
      }
    }

    // Check for headers (## and ###), but never inside a code fence
    const header = fence || openFence !== null ? null : line.match(/^(#{2,3}) (.+)$/);

    if (header) {
      flushCurrentChunk();
      currentChunk = {
        title: header[2],
        content: [line],
        level: header[1].length,
        headerLine: lineNum
      };
    } else {
      currentChunk.content.push(line);
    }
  }

  // Don't forget the last chunk
  flushCurrentChunk();

  // Handle chunks that are too large - split by paragraphs
  const finalChunks = [];
  for (const chunk of chunks) {
    if (chunk.content.length <= MAX_CHUNK_SIZE) {
      finalChunks.push(chunk);
      continue;
    }

    const paragraphs = chunk.content.split(/\n\n+/);
    let currentPart = [];
    let currentLength = 0;
    let partNum = 1;

    for (const para of paragraphs) {
      if (currentLength + para.length > MAX_CHUNK_SIZE && currentPart.length > 0) {
        finalChunks.push({
          title: `${chunk.title} (part ${partNum})`,
          content: currentPart.join('\n\n'),
          level: chunk.level,
          headerLine: chunk.headerLine,
          isPart: true,
          partNum
        });
        currentPart = [para];
        currentLength = para.length;
        partNum++;
      } else {
        currentPart.push(para);
        currentLength += para.length;
      }
    }

    if (currentPart.length > 0) {
      finalChunks.push({
        title: partNum > 1 ? `${chunk.title} (part ${partNum})` : chunk.title,
        content: currentPart.join('\n\n'),
        level: chunk.level,
        headerLine: chunk.headerLine,
        isPart: partNum > 1,
        partNum: partNum > 1 ? partNum : undefined
      });
    }
  }

  // FORCE CHUNKING: If file is large but resulted in few chunks, split by sections
  const totalContent = finalChunks.reduce((acc, c) => acc + c.content.length, 0);
  if (totalContent > FORCE_CHUNK_THRESHOLD && finalChunks.length < 3) {
    debug(` Force-chunking: ${totalContent} bytes in ${finalChunks.length} chunks - splitting by sections`);
    const allContent = finalChunks.map(c => c.content).join('\n\n');

    // Split on --- horizontal rules first, then on ## headers, then on lines
    const TARGET_CHUNK_SIZE = 2500;
    const rawSections = allContent.split(/\n---+\n/);
    const sections = [];

    for (const raw of rawSections) {
      if (raw.length <= TARGET_CHUNK_SIZE) {
        if (raw.trim().length > 30) sections.push(raw.trim());
        continue;
      }

      // Further split on ## headers if section is too large
      for (const hSect of raw.split(/\n(?=## )/)) {
        if (hSect.length <= TARGET_CHUNK_SIZE) {
          if (hSect.trim().length > 30) sections.push(hSect.trim());
          continue;
        }

        // Split very long sections on single newlines as last resort
        let buffer = '';
        for (const sectionLine of hSect.split('\n')) {
          if (buffer.length + sectionLine.length > TARGET_CHUNK_SIZE && buffer.length > 100) {
            sections.push(buffer.trim());
            buffer = sectionLine;
          } else {
            buffer += (buffer ? '\n' : '') + sectionLine;
          }
        }
        if (buffer.trim().length > 30) sections.push(buffer.trim());
      }
    }

    // Now group sections into chunks
    const forcedChunks = [];
    let currentGroup = [];
    let currentLength = 0;
    let groupNum = 1;

    const flushGroup = () => {
      if (currentGroup.length === 0) return;
      // Title comes from the group's first line when that line is a header.
      const firstLine = currentGroup[0].split('\n')[0].trim();
      const title = firstLine.startsWith('#')
        ? firstLine.replace(/^#+\s*/, '').slice(0, 60)
        : `${fileName} Section ${groupNum}`;

      forcedChunks.push({
        title,
        content: currentGroup.join('\n\n'),
        level: 2,
        headerLine: 0,
        isForced: true,
        forceNum: groupNum
      });
      groupNum++;
      currentGroup = [];
      currentLength = 0;
    };

    for (const section of sections) {
      if (currentLength + section.length > TARGET_CHUNK_SIZE && currentGroup.length > 0) {
        flushGroup();
      }
      currentGroup.push(section);
      currentLength += section.length;
    }
    flushGroup();

    // Always use force-chunked results if we got multiple chunks
    if (forcedChunks.length >= 2) {
      debug(` Force-chunking produced ${forcedChunks.length} chunks (was ${finalChunks.length})`);
      return forcedChunks;
    }
  }

  return finalChunks;
}
467
+
468
/**
 * Derive parent/child links between chunks of one document.
 *
 * Every level-3 chunk is attached to the most recent level-2 chunk that
 * precedes it; chunks of any other level get no parent.
 *
 * @param {Array<{level: number}>} chunks - Chunks in document order.
 * @param {string} chunkPrefix - Key prefix; chunk `i` has key `${chunkPrefix}-${i}`.
 * @returns {Object<string, {parent: string|null, children: string[]}>}
 *   Map from chunk key to its hierarchy links.
 */
function buildHierarchy(chunks, chunkPrefix) {
  const hierarchy = {};
  let lastH2Key = null;

  for (const [index, chunk] of chunks.entries()) {
    const key = `${chunkPrefix}-${index}`;
    const node = { parent: null, children: [] };
    hierarchy[key] = node;

    if (chunk.level === 2) {
      lastH2Key = key;
      continue;
    }

    if (chunk.level === 3 && lastH2Key !== null) {
      node.parent = lastH2Key;
      hierarchy[lastH2Key].children.push(key);
    }
  }

  return hierarchy;
}
496
+
497
/**
 * Delete the chunk rows that belong to exactly one document.
 *
 * Only keys of the form `${chunkPrefix}-<digits>` are removed, so chunks of a
 * sibling file whose name merely starts with the same text (e.g. `foo-bar`
 * next to `foo`) are left alone.
 *
 * @param {object} db - Database handle exposing `prepare` and `run`.
 * @param {string} chunkPrefix - `chunk-{keyPrefix}-{fileName}` of the document.
 */
function deleteChunkEntries(db, chunkPrefix) {
  // LIKE narrows the candidates (wildcards in the prefix escaped); the exact
  // "<prefix>-<digits>" check happens in JS.
  const likePattern = `${chunkPrefix.replace(/[\\%_]/g, '\\$&')}-%`;
  const stmt = db.prepare(
    `SELECT key FROM memory_entries WHERE namespace = ? AND key LIKE ? ESCAPE '\\'`
  );
  const ownKeys = [];
  try {
    stmt.bind([NAMESPACE, likePattern]);
    while (stmt.step()) {
      const { key } = stmt.getAsObject();
      if (key.startsWith(`${chunkPrefix}-`) && /^\d+$/.test(key.slice(chunkPrefix.length + 1))) {
        ownKeys.push(key);
      }
    }
  } finally {
    stmt.free();
  }

  for (const key of ownKeys) {
    db.run(`DELETE FROM memory_entries WHERE namespace = ? AND key = ?`, [NAMESPACE, key]);
  }
}

/**
 * Index one markdown file: store the full document under
 * `doc-{keyPrefix}-{fileName}` and each semantic chunk under
 * `chunk-{keyPrefix}-{fileName}-{n}` with navigation metadata
 * (prev/next, siblings, h2->h3 hierarchy, overlap context flags).
 *
 * The file is skipped when its content hash matches the stored one, unless
 * `--force` was given. Errors are caught and reported in the result.
 *
 * @param {object} db - Open database handle.
 * @param {string} filePath - Absolute path of the markdown file.
 * @param {string} keyPrefix - Prefix identifying the source directory.
 * @returns {{docKey: string, status: 'indexed'|'unchanged'|'error', chunks: number, error?: string}}
 */
function indexFile(db, filePath, keyPrefix) {
  const fileName = basename(filePath, extname(filePath));
  const docKey = `doc-${keyPrefix}-${fileName}`;
  const chunkPrefix = `chunk-${keyPrefix}-${fileName}`;

  try {
    const content = readFileSync(filePath, 'utf-8');
    const contentHash = hashContent(content);

    // Check if content changed (skip if same hash unless --force)
    if (!force && getEntryHash(db, docKey) === contentHash) {
      return { docKey, status: 'unchanged', chunks: 0 };
    }

    const stats = statSync(filePath);
    const relativePath = filePath.replace(projectRoot, '').replace(/\\/g, '/');

    // Chunk first, so nothing is written if chunking itself fails.
    const chunks = chunkMarkdown(content, fileName);
    const siblings = chunks.map((_, i) => `${chunkPrefix}-${i}`);

    // Delete old chunks for this file (and only this file) before re-indexing
    deleteChunkEntries(db, chunkPrefix);

    // 1. Store full document (once), with child references when it has chunks
    const docMetadata = {
      type: 'document',
      filePath: relativePath,
      fileSize: stats.size,
      lastModified: stats.mtime.toISOString(),
      contentHash,
      indexedAt: new Date().toISOString(),
      ragVersion: '2.0', // Mark as full RAG indexed
    };
    if (chunks.length > 0) {
      docMetadata.children = siblings;
      docMetadata.chunkCount = chunks.length;
    }

    storeEntry(db, docKey, content, docMetadata, [keyPrefix, 'document']);
    debug(`Stored document: ${docKey}`);

    if (chunks.length === 0) {
      return { docKey, status: 'indexed', chunks: 0 };
    }

    // 2. Store semantic pieces with full RAG linking
    const hierarchy = buildHierarchy(chunks, chunkPrefix);

    for (let i = 0; i < chunks.length; i++) {
      const chunk = chunks[i];
      const chunkKey = `${chunkPrefix}-${i}`;
      const isFirst = i === 0;
      const isLast = i === chunks.length - 1;

      // Build prev/next links
      const prevChunk = isFirst ? null : `${chunkPrefix}-${i - 1}`;
      const nextChunk = isLast ? null : `${chunkPrefix}-${i + 1}`;

      // Extract overlapping context from adjacent chunks
      const contextBefore = isFirst
        ? null
        : extractOverlapContext(chunks[i - 1].content, overlapPercent, 'end');
      const contextAfter = isLast
        ? null
        : extractOverlapContext(chunks[i + 1].content, overlapPercent, 'start');

      // Get hierarchical relationships
      const hierInfo = hierarchy[chunkKey];

      const chunkMetadata = {
        type: 'chunk',
        ragVersion: '2.0',

        // Document relationship
        parentDoc: docKey,
        parentPath: relativePath,

        // Sequential navigation (forward/backward links)
        chunkIndex: i,
        totalChunks: chunks.length,
        prevChunk,
        nextChunk,

        // Sibling awareness
        siblings,

        // Hierarchical relationships (h2 -> h3)
        hierarchicalParent: hierInfo.parent,
        hierarchicalChildren: hierInfo.children.length > 0 ? hierInfo.children : null,

        // Chunk info
        chunkTitle: chunk.title,
        headerLevel: chunk.level,
        headerLine: chunk.headerLine,
        isPart: chunk.isPart || false,
        partNum: chunk.partNum || null,

        // Overlapping context for continuity
        contextOverlapPercent: overlapPercent,
        hasContextBefore: !!contextBefore,
        hasContextAfter: !!contextAfter,

        // Content metadata
        contentLength: chunk.content.length,
        contentHash: hashContent(chunk.content),
        indexedAt: new Date().toISOString(),
      };

      // Build searchable content with title context.
      // Include overlap context for better retrieval.
      let searchableContent = `# ${chunk.title}\n\n`;

      if (contextBefore) {
        searchableContent += `[Context from previous section:]\n${contextBefore}\n\n---\n\n`;
      }

      searchableContent += chunk.content;

      if (contextAfter) {
        searchableContent += `\n\n---\n\n[Context from next section:]\n${contextAfter}`;
      }

      // Store chunk with full metadata
      storeEntry(
        db,
        chunkKey,
        searchableContent,
        chunkMetadata,
        [keyPrefix, 'chunk', `level-${chunk.level}`, chunk.title.toLowerCase().replace(/[^a-z0-9]+/g, '-')]
      );

      debug(` Stored chunk ${i}: ${chunk.title} (${chunk.content.length} chars, prev=${!!prevChunk}, next=${!!nextChunk})`);
    }

    return { docKey, status: 'indexed', chunks: chunks.length };
  } catch (err) {
    return { docKey, status: 'error', error: err.message, chunks: 0 };
  }
}
642
+
643
/**
 * Collect every `.md` file below a directory, descending into subdirectories.
 *
 * Directories named node_modules, .git, dist, build, coverage, .next or
 * .reports are not entered, and files named CLAUDE.md are left out.
 * A directory that does not exist yields no files.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Absolute paths of the markdown files found.
 */
function walkMdFiles(dir) {
  const ignoredDirs = new Set(['node_modules', '.git', 'dist', 'build', 'coverage', '.next', '.reports']);
  // CLAUDE.md is loaded into context by Claude automatically — skip to avoid duplicate vectors
  const ignoredFiles = new Set(['CLAUDE.md']);

  const collect = (folder) => {
    if (!existsSync(folder)) return [];

    return readdirSync(folder, { withFileTypes: true }).flatMap((entry) => {
      const fullPath = resolve(folder, entry.name);
      if (entry.isDirectory()) {
        return ignoredDirs.has(entry.name) ? [] : collect(fullPath);
      }
      const wanted = entry.isFile() && entry.name.endsWith('.md') && !ignoredFiles.has(entry.name);
      return wanted ? [fullPath] : [];
    });
  };

  return collect(dir);
}
667
+
668
/**
 * Index every markdown file found under one configured guidance directory.
 *
 * @param {object} db - Open database handle, passed through to `indexFile`.
 * @param {{path: string, prefix: string, absolute?: boolean, fileFilter?: string[]}} dirConfig
 *   Directory entry; `path` is resolved against the project root unless
 *   `absolute` is set. When `fileFilter` is present, only files whose base
 *   name is listed in it are indexed.
 * @returns {Array<object>} One `indexFile` result per processed file
 *   (empty when the directory does not exist).
 */
function indexDirectory(db, dirConfig) {
  const dirPath = dirConfig.absolute
    ? dirConfig.path
    : resolve(projectRoot, dirConfig.path);

  if (!existsSync(dirPath)) {
    if (verbose) debug(`Directory not found: ${dirConfig.path}`);
    return [];
  }

  let candidates = walkMdFiles(dirPath);
  if (dirConfig.fileFilter) {
    candidates = candidates.filter((file) => dirConfig.fileFilter.includes(basename(file)));
  }

  return candidates.map((file) => indexFile(db, file, dirConfig.prefix));
}
689
+
690
/**
 * Remove stale entries for files that no longer exist on disk.
 * Runs after indexing to keep the memory DB clean.
 *
 * A document entry is stale when its key carries the prefix of a configured
 * guidance directory but no markdown file currently found under any
 * configured directory (using the same recursive walk and file filter as
 * indexing) maps to that key. Its chunks — keys of the exact form
 * `chunk-...-<digits>` — are removed with it. Document keys with an unknown
 * prefix are left untouched.
 *
 * Entries in the namespace whose key starts with neither `doc-` nor `chunk-`
 * are removed as orphans.
 *
 * @param {object} db - Open database handle.
 * @returns {number} Number of entries removed.
 */
function cleanStaleEntries(db) {
  // Snapshot every key in the namespace once; all decisions are made in JS.
  const keyStmt = db.prepare(`SELECT key FROM memory_entries WHERE namespace = ?`);
  const allKeys = [];
  try {
    keyStmt.bind([NAMESPACE]);
    while (keyStmt.step()) allKeys.push(keyStmt.getAsObject().key);
  } finally {
    keyStmt.free();
  }

  const removeKey = (key) => {
    db.run(`DELETE FROM memory_entries WHERE namespace = ? AND key = ?`, [NAMESPACE, key]);
  };

  // Work out which document keys SHOULD exist, mirroring how indexing derives
  // them (recursive walk, optional file filter, key from the file base name).
  const knownDocPrefixes = [];
  const liveDocKeys = new Set();
  for (const dirConfig of GUIDANCE_DIRS) {
    const dirPath = dirConfig.absolute ? dirConfig.path : resolve(projectRoot, dirConfig.path);
    knownDocPrefixes.push(`doc-${dirConfig.prefix}-`);

    const files = walkMdFiles(dirPath);
    const indexed = dirConfig.fileFilter
      ? files.filter((f) => dirConfig.fileFilter.includes(basename(f)))
      : files;
    for (const filePath of indexed) {
      liveDocKeys.add(`doc-${dirConfig.prefix}-${basename(filePath, extname(filePath))}`);
    }
  }

  let staleCount = 0;

  for (const key of allKeys) {
    if (!/^doc-/i.test(key)) continue;
    if (liveDocKeys.has(key)) continue;
    if (!knownDocPrefixes.some((docPrefix) => key.startsWith(docPrefix))) continue; // Unknown prefix, skip

    // Only chunks of exactly this document: "<chunkPrefix>-<digits>".
    const chunkPrefix = key.replace('doc-', 'chunk-');
    const doomed = allKeys.filter(
      (candidate) =>
        candidate.startsWith(`${chunkPrefix}-`) &&
        /^\d+$/.test(candidate.slice(chunkPrefix.length + 1))
    );
    doomed.push(key);

    for (const staleKey of doomed) removeKey(staleKey);

    const removed = doomed.length;
    log(` Removed ${removed} stale entries for deleted file: ${key}`);
    staleCount += removed;
  }

  // Also clean any orphaned entries not matching doc-/chunk- patterns
  for (const key of allKeys) {
    if (/^(doc|chunk)-/i.test(key)) continue;
    removeKey(key);
    staleCount++;
    log(` Removed orphan entry: ${key}`);
  }

  return staleCount;
}
754
+
755
// Main
//
// Flow: print the plan, index either one file (--file) or every configured
// directory, purge stale entries (full runs only), persist the database when
// something changed, print a summary, then start embedding generation as a
// detached background process. Exits with status 1 when any file failed.
console.log('');
log('Indexing guidance files with FULL RAG linked segments...');
log(` Context overlap: ${overlapPercent}%`);
log(` Directories (${GUIDANCE_DIRS.length}):`);
for (const d of GUIDANCE_DIRS) {
  const dirPath = d.absolute ? d.path : resolve(projectRoot, d.path);
  const exists = existsSync(dirPath);
  log(` ${exists ? '✓' : '✗'} ${d.absolute ? dirPath : d.path} [${d.prefix}]`);
}
console.log('');

const db = await getDb();
let docsIndexed = 0;
let chunksIndexed = 0;
let unchanged = 0;
let errors = 0;
let staleRemoved = 0;

try {
  if (specificFile) {
    // Index single file
    const filePath = resolve(projectRoot, specificFile);
    if (!existsSync(filePath)) {
      log(`File not found: ${specificFile}`);
      process.exit(1);
    }

    // Use the prefix of the configured directory that contains the file, so
    // the keys match what a full index run produces. The most specific
    // (longest) directory wins. Backslashes are normalized for Windows paths.
    const normalizedFile = filePath.replace(/\\/g, '/');
    const owningDir = GUIDANCE_DIRS
      .map((d) => ({
        prefix: d.prefix,
        dirPath: (d.absolute ? d.path : resolve(projectRoot, d.path)).replace(/\\/g, '/'),
      }))
      .filter((d) => normalizedFile.startsWith(`${d.dirPath}/`))
      .sort((a, b) => b.dirPath.length - a.dirPath.length)[0];

    // Fallback for files outside every configured directory.
    const legacyPrefix = normalizedFile.includes('.claude/guidance/') ? 'guidance' : 'docs';
    const prefix = owningDir?.prefix ?? legacyPrefix;

    const result = indexFile(db, filePath, prefix);
    log(`${result.docKey}: ${result.status} (${result.chunks} chunks)`);

    if (result.status === 'indexed') {
      docsIndexed++;
      chunksIndexed += result.chunks;
    } else if (result.status === 'unchanged') {
      unchanged++;
    } else {
      errors++;
    }
  } else {
    // Index all directories
    for (const dir of GUIDANCE_DIRS) {
      log(`Scanning ${dir.path}/...`);
      const results = indexDirectory(db, dir);

      for (const result of results) {
        if (result.status === 'indexed') {
          log(` ✅ ${result.docKey} (${result.chunks} chunks)`);
          docsIndexed++;
          chunksIndexed += result.chunks;
        } else if (result.status === 'unchanged') {
          unchanged++;
        } else {
          log(` ❌ ${result.docKey}: ${result.error}`);
          errors++;
        }
      }
    }
  }

  // Clean stale entries for deleted files (unless indexing a specific file)
  if (!specificFile) {
    log('Cleaning stale entries for deleted files...');
    staleRemoved = cleanStaleEntries(db);
    if (staleRemoved === 0) {
      log(' No stale entries found');
    }
  }

  // Write changes back to disk
  if (docsIndexed > 0 || chunksIndexed > 0 || staleRemoved > 0) saveDb(db);
} finally {
  // Release the in-memory database even when indexing throws.
  db.close();
}

console.log('');
log('═══════════════════════════════════════════════════════════');
log(' FULL RAG INDEXING COMPLETE');
log('═══════════════════════════════════════════════════════════');
log(` Documents indexed: ${docsIndexed}`);
log(` Chunks created: ${chunksIndexed}`);
log(` Unchanged: ${unchanged}`);
log(` Stale removed: ${staleRemoved}`);
log(` Errors: ${errors}`);
log('');
log(' RAG Features Enabled:');
log(` • Forward/backward links (prevChunk/nextChunk)`);
log(` • Sibling awareness (all chunks from same doc)`);
log(` • Hierarchical links (h2 -> h3 parent/children)`);
log(` • Context overlap: ${overlapPercent}% (contextBefore/contextAfter)`);
log('═══════════════════════════════════════════════════════════');

// Generate embeddings for new entries (unless skipped or nothing changed)
// Runs in BACKGROUND to avoid blocking startup
if (!skipEmbeddings && (docsIndexed > 0 || chunksIndexed > 0)) {
  console.log('');
  log('Spawning embedding generation in background...');

  const { spawn } = await import('child_process');

  // Look for build-embeddings script in multiple locations:
  // 1. Shipped with moflo (node_modules/moflo/bin/)
  // 2. Project-local (.claude/scripts/)
  const mofloScript = resolve(__dirname, 'build-embeddings.mjs');
  const projectLocalScript = resolve(projectRoot, '.claude/scripts/build-embeddings.mjs');
  const embeddingScript = existsSync(mofloScript) ? mofloScript : projectLocalScript;

  if (existsSync(embeddingScript)) {
    const embeddingArgs = ['--namespace', NAMESPACE];

    // Create log file for background process output
    const logDir = resolve(projectRoot, '.swarm/logs');
    if (!existsSync(logDir)) {
      mkdirSync(logDir, { recursive: true });
    }
    const logFile = resolve(logDir, 'embeddings.log');
    const { openSync, closeSync } = await import('fs');
    // One append-mode descriptor serves both stdout and stderr of the child.
    const logFd = openSync(logFile, 'a');

    try {
      // Spawn in background - don't wait for completion
      const proc = spawn('node', [embeddingScript, ...embeddingArgs], {
        stdio: ['ignore', logFd, logFd],
        cwd: projectRoot,
        detached: true,
        windowsHide: true // Suppress command windows on Windows
      });
      // Without a listener, a failed spawn would surface as an uncaught 'error' event.
      proc.on('error', (spawnError) => {
        log(`⚠️ Failed to start embedding generation: ${spawnError.message}`);
      });
      proc.unref(); // Allow parent to exit independently

      if (proc.pid !== undefined) {
        log(`Background embedding started (PID: ${proc.pid})`);
        log(`Log file: .swarm/logs/embeddings.log`);
      }
    } finally {
      // The child holds its own copy of the descriptor; release ours.
      closeSync(logFd);
    }
  } else {
    log('⚠️ Embedding script not found, skipping embedding generation');
  }
} else if (skipEmbeddings) {
  log('Skipping embedding generation (--no-embeddings)');
} else {
  log('No new content indexed, skipping embedding generation');
}

if (errors > 0) {
  process.exit(1);
}