ted-mosby 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +175 -20
- package/dist/cli.js +433 -16
- package/dist/cli.js.map +1 -1
- package/dist/prompts/wiki-system.d.ts +1 -1
- package/dist/prompts/wiki-system.d.ts.map +1 -1
- package/dist/prompts/wiki-system.js +52 -0
- package/dist/prompts/wiki-system.js.map +1 -1
- package/dist/rag/index.d.ts +68 -9
- package/dist/rag/index.d.ts.map +1 -1
- package/dist/rag/index.js +384 -79
- package/dist/rag/index.js.map +1 -1
- package/dist/site/scripts.d.ts +22 -0
- package/dist/site/scripts.d.ts.map +1 -0
- package/dist/site/scripts.js +855 -0
- package/dist/site/scripts.js.map +1 -0
- package/dist/site/styles.d.ts +11 -0
- package/dist/site/styles.d.ts.map +1 -0
- package/dist/site/styles.js +1572 -0
- package/dist/site/styles.js.map +1 -0
- package/dist/site/templates.d.ts +40 -0
- package/dist/site/templates.d.ts.map +1 -0
- package/dist/site/templates.js +336 -0
- package/dist/site/templates.js.map +1 -0
- package/dist/site-generator.d.ts +197 -0
- package/dist/site-generator.d.ts.map +1 -0
- package/dist/site-generator.js +694 -0
- package/dist/site-generator.js.map +1 -0
- package/dist/wiki-agent.d.ts +73 -0
- package/dist/wiki-agent.d.ts.map +1 -1
- package/dist/wiki-agent.js +1133 -7
- package/dist/wiki-agent.js.map +1 -1
- package/package.json +6 -2
package/dist/rag/index.js
CHANGED
|
@@ -7,15 +7,22 @@
|
|
|
7
7
|
import * as fs from 'fs';
|
|
8
8
|
import * as path from 'path';
|
|
9
9
|
import { glob } from 'glob';
|
|
10
|
-
import
|
|
11
|
-
|
|
10
|
+
import { simpleGit } from 'simple-git';
|
|
11
|
+
import { createRequire } from 'module';
|
|
12
|
+
import { pipeline, env } from '@huggingface/transformers';
|
|
13
|
+
// Configure transformers.js to use local cache
|
|
14
|
+
env.cacheDir = './.ted-mosby-models';
|
|
15
|
+
// FAISS types (faiss-node) - use createRequire for CommonJS module in ESM context
|
|
12
16
|
// Optional native dependency: stays undefined when faiss-node cannot be loaded,
// in which case a warning is printed and the fallback search is used.
let faiss;
try {
    // faiss-node is loaded through a require() bound to this module's URL.
    const loadCommonJs = createRequire(import.meta.url);
    faiss = loadCommonJs('faiss-node');
}
catch {
    console.warn('Warning: faiss-node not available, using fallback similarity search');
}
|
|
24
|
+
// Embedding model - will be initialized lazily
|
|
25
|
+
let embeddingPipeline = null;
|
|
19
26
|
// File extensions to index
|
|
20
27
|
const INDEXABLE_EXTENSIONS = [
|
|
21
28
|
'.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
|
|
@@ -52,37 +59,90 @@ const EXCLUDE_PATTERNS = [
|
|
|
52
59
|
];
|
|
53
60
|
export class RAGSystem {
|
|
54
61
|
config;
|
|
55
|
-
anthropic;
|
|
56
62
|
index = null; // FAISS index
|
|
57
63
|
metadata = new Map();
|
|
58
|
-
embeddingDimension =
|
|
64
|
+
embeddingDimension = 384; // all-MiniLM-L6-v2 dimension
|
|
59
65
|
documentCount = 0;
|
|
66
|
+
indexState = null;
|
|
60
67
|
constructor(config) {
|
|
61
68
|
this.config = {
|
|
62
69
|
chunkSize: 1500,
|
|
63
70
|
chunkOverlap: 200,
|
|
64
71
|
...config
|
|
65
72
|
};
|
|
66
|
-
this.anthropic = new Anthropic();
|
|
67
73
|
// Ensure cache directory exists
|
|
68
74
|
if (!fs.existsSync(this.config.storePath)) {
|
|
69
75
|
fs.mkdirSync(this.config.storePath, { recursive: true });
|
|
70
76
|
}
|
|
71
77
|
}
|
|
78
|
+
/**
|
|
79
|
+
* Initialize the embedding model (lazy loading)
|
|
80
|
+
*/
|
|
81
|
+
async getEmbeddingPipeline() {
|
|
82
|
+
if (!embeddingPipeline) {
|
|
83
|
+
console.log(' Loading embedding model (first run only)...');
|
|
84
|
+
// Use all-MiniLM-L6-v2 - small, fast, good quality for code search
|
|
85
|
+
embeddingPipeline = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2');
|
|
86
|
+
}
|
|
87
|
+
return embeddingPipeline;
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Get the current git commit hash for the repository
|
|
91
|
+
*/
|
|
92
|
+
async getCurrentCommitHash() {
|
|
93
|
+
try {
|
|
94
|
+
const git = simpleGit(this.config.repoPath);
|
|
95
|
+
const log = await git.log({ maxCount: 1 });
|
|
96
|
+
return log.latest?.hash || 'unknown';
|
|
97
|
+
}
|
|
98
|
+
catch {
|
|
99
|
+
return 'unknown';
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
/**
|
|
103
|
+
* Get the index state (last indexed commit)
|
|
104
|
+
*/
|
|
105
|
+
getIndexState() {
|
|
106
|
+
return this.indexState;
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Get files changed since a specific commit
|
|
110
|
+
*/
|
|
111
|
+
async getChangedFilesSince(commitHash) {
|
|
112
|
+
try {
|
|
113
|
+
const git = simpleGit(this.config.repoPath);
|
|
114
|
+
const diff = await git.diff(['--name-only', commitHash, 'HEAD']);
|
|
115
|
+
return diff.split('\n').filter(f => f.trim().length > 0);
|
|
116
|
+
}
|
|
117
|
+
catch {
|
|
118
|
+
return [];
|
|
119
|
+
}
|
|
120
|
+
}
|
|
72
121
|
/**
|
|
73
122
|
* Index the repository for semantic search
|
|
74
123
|
*/
|
|
75
124
|
async indexRepository() {
|
|
76
125
|
const cachedIndexPath = path.join(this.config.storePath, 'index.faiss');
|
|
77
126
|
const cachedMetaPath = path.join(this.config.storePath, 'metadata.json');
|
|
127
|
+
const indexStatePath = path.join(this.config.storePath, 'index-state.json');
|
|
128
|
+
// Get current commit hash
|
|
129
|
+
const currentCommit = await this.getCurrentCommitHash();
|
|
78
130
|
// Try to load cached index
|
|
79
131
|
if (fs.existsSync(cachedIndexPath) && fs.existsSync(cachedMetaPath) && faiss) {
|
|
80
132
|
try {
|
|
81
|
-
|
|
133
|
+
// faiss-node API: IndexFlatIP.read(path) to load index
|
|
134
|
+
this.index = faiss.IndexFlatIP.read(cachedIndexPath);
|
|
82
135
|
const metaData = JSON.parse(fs.readFileSync(cachedMetaPath, 'utf-8'));
|
|
83
136
|
this.metadata = new Map(Object.entries(metaData).map(([k, v]) => [parseInt(k), v]));
|
|
84
137
|
this.documentCount = this.metadata.size;
|
|
85
|
-
|
|
138
|
+
// Load index state if available
|
|
139
|
+
if (fs.existsSync(indexStatePath)) {
|
|
140
|
+
this.indexState = JSON.parse(fs.readFileSync(indexStatePath, 'utf-8'));
|
|
141
|
+
console.log(`Loaded cached index with ${this.documentCount} chunks (indexed at commit ${this.indexState?.commitHash?.slice(0, 7) || 'unknown'})`);
|
|
142
|
+
}
|
|
143
|
+
else {
|
|
144
|
+
console.log(`Loaded cached index with ${this.documentCount} chunks`);
|
|
145
|
+
}
|
|
86
146
|
return;
|
|
87
147
|
}
|
|
88
148
|
catch (e) {
|
|
@@ -121,36 +181,74 @@ export class RAGSystem {
|
|
|
121
181
|
console.warn('No code chunks to index');
|
|
122
182
|
return;
|
|
123
183
|
}
|
|
184
|
+
// Apply maxChunks limit if configured (for large codebases)
|
|
185
|
+
let chunksToIndex = chunks;
|
|
186
|
+
if (this.config.maxChunks && chunks.length > this.config.maxChunks) {
|
|
187
|
+
console.log(` ⚠️ Limiting to ${this.config.maxChunks} chunks (was ${chunks.length}) to manage memory`);
|
|
188
|
+
// Prioritize chunks from smaller files and main source directories
|
|
189
|
+
chunksToIndex = this.prioritizeChunks(chunks, this.config.maxChunks);
|
|
190
|
+
}
|
|
124
191
|
// Generate embeddings
|
|
125
|
-
console.log(` Generating embeddings for ${
|
|
126
|
-
const embeddings = await this.generateEmbeddings(
|
|
192
|
+
console.log(` Generating embeddings for ${chunksToIndex.length} chunks...`);
|
|
193
|
+
const embeddings = await this.generateEmbeddings(chunksToIndex);
|
|
127
194
|
// Build FAISS index
|
|
128
195
|
console.log(` Building search index...`);
|
|
129
196
|
if (faiss && embeddings.length > 0) {
|
|
197
|
+
// Get actual dimension from first embedding
|
|
198
|
+
const actualDimension = embeddings[0].length;
|
|
199
|
+
if (actualDimension !== this.embeddingDimension) {
|
|
200
|
+
console.log(` Adjusting dimension: expected ${this.embeddingDimension}, got ${actualDimension}`);
|
|
201
|
+
this.embeddingDimension = actualDimension;
|
|
202
|
+
}
|
|
130
203
|
this.index = new faiss.IndexFlatIP(this.embeddingDimension); // Inner product for cosine similarity
|
|
131
|
-
//
|
|
204
|
+
// Normalize all embeddings and prepare for batch add
|
|
205
|
+
const normalizedEmbeddings = [];
|
|
132
206
|
for (let i = 0; i < embeddings.length; i++) {
|
|
133
|
-
// Normalize for cosine similarity
|
|
134
207
|
const normalized = this.normalizeVector(embeddings[i]);
|
|
135
|
-
|
|
208
|
+
normalizedEmbeddings.push(normalized);
|
|
136
209
|
this.metadata.set(i, {
|
|
137
|
-
id:
|
|
138
|
-
filePath:
|
|
139
|
-
startLine:
|
|
140
|
-
endLine:
|
|
141
|
-
content:
|
|
142
|
-
language:
|
|
210
|
+
id: chunksToIndex[i].id,
|
|
211
|
+
filePath: chunksToIndex[i].filePath,
|
|
212
|
+
startLine: chunksToIndex[i].startLine,
|
|
213
|
+
endLine: chunksToIndex[i].endLine,
|
|
214
|
+
content: chunksToIndex[i].content,
|
|
215
|
+
language: chunksToIndex[i].language
|
|
143
216
|
});
|
|
144
217
|
}
|
|
145
|
-
//
|
|
146
|
-
faiss
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
218
|
+
// Add all vectors in one batch to avoid threading issues
|
|
219
|
+
// IMPORTANT: faiss-node expects a flat array, not array of arrays
|
|
220
|
+
// e.g., [v1_d1, v1_d2, ..., v2_d1, v2_d2, ...] not [[v1], [v2], ...]
|
|
221
|
+
try {
|
|
222
|
+
const flatEmbeddings = normalizedEmbeddings.flat();
|
|
223
|
+
this.index.add(flatEmbeddings);
|
|
224
|
+
}
|
|
225
|
+
catch (faissError) {
|
|
226
|
+
console.warn(` FAISS batch add failed, falling back to keyword search: ${faissError}`);
|
|
227
|
+
// Fall through to keyword search fallback
|
|
228
|
+
this.index = null;
|
|
229
|
+
}
|
|
230
|
+
if (this.index) {
|
|
231
|
+
// Save index and metadata
|
|
232
|
+
// faiss-node API: index.write(path) to save index
|
|
233
|
+
this.index.write(cachedIndexPath);
|
|
234
|
+
fs.writeFileSync(cachedMetaPath, JSON.stringify(Object.fromEntries(this.metadata)), 'utf-8');
|
|
235
|
+
// Save index state with commit hash
|
|
236
|
+
this.indexState = {
|
|
237
|
+
commitHash: currentCommit,
|
|
238
|
+
indexedAt: new Date().toISOString(),
|
|
239
|
+
fileCount: files.length,
|
|
240
|
+
chunkCount: chunksToIndex.length
|
|
241
|
+
};
|
|
242
|
+
fs.writeFileSync(indexStatePath, JSON.stringify(this.indexState, null, 2), 'utf-8');
|
|
243
|
+
this.documentCount = chunksToIndex.length;
|
|
244
|
+
console.log(` ✓ Indexed ${this.documentCount} chunks with FAISS (commit ${currentCommit.slice(0, 7)})`);
|
|
245
|
+
return;
|
|
246
|
+
}
|
|
150
247
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
248
|
+
// Fallback: keyword search mode (when FAISS not available or failed)
|
|
249
|
+
// Metadata may already be populated from the FAISS attempt, but ensure it's complete
|
|
250
|
+
if (this.metadata.size === 0) {
|
|
251
|
+
chunksToIndex.forEach((chunk, i) => {
|
|
154
252
|
this.metadata.set(i, {
|
|
155
253
|
id: chunk.id,
|
|
156
254
|
filePath: chunk.filePath,
|
|
@@ -160,9 +258,19 @@ export class RAGSystem {
|
|
|
160
258
|
language: chunk.language
|
|
161
259
|
});
|
|
162
260
|
});
|
|
163
|
-
this.documentCount = chunks.length;
|
|
164
|
-
console.log(` ✓ Indexed ${this.documentCount} chunks (keyword search mode)`);
|
|
165
261
|
}
|
|
262
|
+
// Save metadata for fallback search
|
|
263
|
+
fs.writeFileSync(cachedMetaPath, JSON.stringify(Object.fromEntries(this.metadata)), 'utf-8');
|
|
264
|
+
// Save index state with commit hash (even in fallback mode)
|
|
265
|
+
this.indexState = {
|
|
266
|
+
commitHash: currentCommit,
|
|
267
|
+
indexedAt: new Date().toISOString(),
|
|
268
|
+
fileCount: files.length,
|
|
269
|
+
chunkCount: chunksToIndex.length
|
|
270
|
+
};
|
|
271
|
+
fs.writeFileSync(indexStatePath, JSON.stringify(this.indexState, null, 2), 'utf-8');
|
|
272
|
+
this.documentCount = chunksToIndex.length;
|
|
273
|
+
console.log(` ✓ Indexed ${this.documentCount} chunks (keyword search mode, commit ${currentCommit.slice(0, 7)})`);
|
|
166
274
|
}
|
|
167
275
|
/**
|
|
168
276
|
* Discover all indexable files in the repository
|
|
@@ -246,20 +354,25 @@ export class RAGSystem {
|
|
|
246
354
|
return chunks;
|
|
247
355
|
}
|
|
248
356
|
/**
|
|
249
|
-
* Generate embeddings for code chunks using
|
|
357
|
+
* Generate embeddings for code chunks using local Transformers.js model
|
|
358
|
+
* Uses all-MiniLM-L6-v2 - a fast, high-quality embedding model
|
|
250
359
|
*/
|
|
251
360
|
async generateEmbeddings(chunks) {
|
|
252
361
|
const embeddings = [];
|
|
253
|
-
|
|
254
|
-
|
|
362
|
+
const extractor = await this.getEmbeddingPipeline();
|
|
363
|
+
// Process in batches for memory efficiency
|
|
364
|
+
const batchSize = 32;
|
|
255
365
|
for (let i = 0; i < chunks.length; i += batchSize) {
|
|
256
366
|
const batch = chunks.slice(i, i + batchSize);
|
|
257
367
|
try {
|
|
258
|
-
//
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
368
|
+
// Generate embeddings for the batch
|
|
369
|
+
for (const chunk of batch) {
|
|
370
|
+
// Truncate content to avoid memory issues (model max is 512 tokens)
|
|
371
|
+
const text = chunk.content.slice(0, 2000);
|
|
372
|
+
const output = await extractor(text, { pooling: 'mean', normalize: true });
|
|
373
|
+
// Convert to array
|
|
374
|
+
embeddings.push(Array.from(output.data));
|
|
375
|
+
}
|
|
263
376
|
// Progress update every batch
|
|
264
377
|
const processed = Math.min(i + batchSize, chunks.length);
|
|
265
378
|
const percent = Math.floor(processed / chunks.length * 100);
|
|
@@ -277,36 +390,13 @@ export class RAGSystem {
|
|
|
277
390
|
return embeddings;
|
|
278
391
|
}
|
|
279
392
|
/**
|
|
280
|
-
*
|
|
281
|
-
* This is a simplified implementation - production should use proper embeddings
|
|
282
|
-
*/
|
|
283
|
-
async generateSimpleEmbeddings(chunks) {
|
|
284
|
-
return chunks.map(chunk => {
|
|
285
|
-
// Create a simple bag-of-words vector
|
|
286
|
-
const words = chunk.content.toLowerCase()
|
|
287
|
-
.replace(/[^a-z0-9_]/g, ' ')
|
|
288
|
-
.split(/\s+/)
|
|
289
|
-
.filter(w => w.length > 2);
|
|
290
|
-
// Simple hash-based embedding
|
|
291
|
-
const vector = new Array(this.embeddingDimension).fill(0);
|
|
292
|
-
for (const word of words) {
|
|
293
|
-
const hash = this.simpleHash(word) % this.embeddingDimension;
|
|
294
|
-
vector[hash] += 1;
|
|
295
|
-
}
|
|
296
|
-
return this.normalizeVector(vector);
|
|
297
|
-
});
|
|
298
|
-
}
|
|
299
|
-
/**
|
|
300
|
-
* Simple string hash function
|
|
393
|
+
* Generate embedding for a single text (used for queries)
|
|
301
394
|
*/
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
hash = hash & hash;
|
|
308
|
-
}
|
|
309
|
-
return Math.abs(hash);
|
|
395
|
+
async generateSingleEmbedding(text) {
|
|
396
|
+
const extractor = await this.getEmbeddingPipeline();
|
|
397
|
+
const truncated = text.slice(0, 2000);
|
|
398
|
+
const output = await extractor(truncated, { pooling: 'mean', normalize: true });
|
|
399
|
+
return Array.from(output.data);
|
|
310
400
|
}
|
|
311
401
|
/**
|
|
312
402
|
* Normalize a vector for cosine similarity
|
|
@@ -325,22 +415,16 @@ export class RAGSystem {
|
|
|
325
415
|
if (this.metadata.size === 0) {
|
|
326
416
|
return [];
|
|
327
417
|
}
|
|
328
|
-
// Generate query embedding
|
|
329
|
-
const
|
|
330
|
-
id: 'query',
|
|
331
|
-
filePath: '',
|
|
332
|
-
startLine: 0,
|
|
333
|
-
endLine: 0,
|
|
334
|
-
content: query,
|
|
335
|
-
language: ''
|
|
336
|
-
}]);
|
|
418
|
+
// Generate query embedding using local model
|
|
419
|
+
const queryEmbedding = await this.generateSingleEmbedding(query);
|
|
337
420
|
let results = [];
|
|
338
421
|
if (this.index && faiss) {
|
|
339
|
-
// FAISS search
|
|
422
|
+
// FAISS search - pass flat array (faiss-node expects flat, not nested)
|
|
423
|
+
// Results are also flat arrays: { distances: [d1, d2, ...], labels: [l1, l2, ...] }
|
|
340
424
|
const normalized = this.normalizeVector(queryEmbedding);
|
|
341
|
-
const { distances, labels } = this.index.search(
|
|
342
|
-
for (let i = 0; i < labels
|
|
343
|
-
const label = labels[
|
|
425
|
+
const { distances, labels } = this.index.search(normalized, maxResults * 2);
|
|
426
|
+
for (let i = 0; i < labels.length; i++) {
|
|
427
|
+
const label = labels[i];
|
|
344
428
|
if (label === -1)
|
|
345
429
|
continue;
|
|
346
430
|
const meta = this.metadata.get(label);
|
|
@@ -353,7 +437,7 @@ export class RAGSystem {
|
|
|
353
437
|
continue;
|
|
354
438
|
results.push({
|
|
355
439
|
...meta,
|
|
356
|
-
score: distances[
|
|
440
|
+
score: distances[i]
|
|
357
441
|
});
|
|
358
442
|
if (results.length >= maxResults)
|
|
359
443
|
break;
|
|
@@ -442,5 +526,226 @@ export class RAGSystem {
|
|
|
442
526
|
getDocumentCount() {
|
|
443
527
|
return this.documentCount;
|
|
444
528
|
}
|
|
529
|
+
/**
|
|
530
|
+
* Discover total chunk count without indexing (for batch planning)
|
|
531
|
+
*/
|
|
532
|
+
async discoverChunkCount() {
|
|
533
|
+
const files = await this.discoverFiles();
|
|
534
|
+
let totalChunks = 0;
|
|
535
|
+
for (const file of files) {
|
|
536
|
+
try {
|
|
537
|
+
const fileChunks = await this.chunkFile(file);
|
|
538
|
+
totalChunks += fileChunks.length;
|
|
539
|
+
}
|
|
540
|
+
catch {
|
|
541
|
+
// Skip files that fail
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
return { files: files.length, chunks: totalChunks };
|
|
545
|
+
}
|
|
546
|
+
/**
|
|
547
|
+
* Index a specific batch of chunks (for chunked generation mode).
|
|
548
|
+
* Returns batch info for progress tracking.
|
|
549
|
+
*/
|
|
550
|
+
async indexBatch(batchNumber, batchSize) {
|
|
551
|
+
const batchStatePath = path.join(this.config.storePath, `batch-${batchNumber}-state.json`);
|
|
552
|
+
// Discover all files and chunks
|
|
553
|
+
console.log(` [Batch ${batchNumber}] Discovering files...`);
|
|
554
|
+
const files = await this.discoverFiles();
|
|
555
|
+
// Chunk all files
|
|
556
|
+
const allChunks = [];
|
|
557
|
+
for (const file of files) {
|
|
558
|
+
try {
|
|
559
|
+
const fileChunks = await this.chunkFile(file);
|
|
560
|
+
allChunks.push(...fileChunks);
|
|
561
|
+
}
|
|
562
|
+
catch {
|
|
563
|
+
// Skip files that fail
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
const totalChunks = allChunks.length;
|
|
567
|
+
const totalBatches = Math.ceil(totalChunks / batchSize);
|
|
568
|
+
const batchStart = batchNumber * batchSize;
|
|
569
|
+
const batchEnd = Math.min(batchStart + batchSize, totalChunks);
|
|
570
|
+
if (batchStart >= totalChunks) {
|
|
571
|
+
return {
|
|
572
|
+
totalChunks,
|
|
573
|
+
totalBatches,
|
|
574
|
+
currentBatch: batchNumber,
|
|
575
|
+
batchStart,
|
|
576
|
+
batchEnd: batchStart,
|
|
577
|
+
chunksInBatch: 0
|
|
578
|
+
};
|
|
579
|
+
}
|
|
580
|
+
// Get chunks for this batch
|
|
581
|
+
const batchChunks = allChunks.slice(batchStart, batchEnd);
|
|
582
|
+
console.log(` [Batch ${batchNumber}] Processing chunks ${batchStart + 1}-${batchEnd} of ${totalChunks}`);
|
|
583
|
+
// Generate embeddings for batch (validates chunks are processable)
|
|
584
|
+
console.log(` [Batch ${batchNumber}] Generating embeddings for ${batchChunks.length} chunks...`);
|
|
585
|
+
await this.generateEmbeddings(batchChunks);
|
|
586
|
+
// Store metadata for this batch (append to main metadata)
|
|
587
|
+
const mainMetaPath = path.join(this.config.storePath, 'metadata.json');
|
|
588
|
+
let existingMeta = {};
|
|
589
|
+
if (fs.existsSync(mainMetaPath)) {
|
|
590
|
+
existingMeta = JSON.parse(fs.readFileSync(mainMetaPath, 'utf-8'));
|
|
591
|
+
}
|
|
592
|
+
// Add batch chunks to metadata with global indices
|
|
593
|
+
batchChunks.forEach((chunk, i) => {
|
|
594
|
+
const globalIndex = batchStart + i;
|
|
595
|
+
existingMeta[globalIndex.toString()] = {
|
|
596
|
+
id: chunk.id,
|
|
597
|
+
filePath: chunk.filePath,
|
|
598
|
+
startLine: chunk.startLine,
|
|
599
|
+
endLine: chunk.endLine,
|
|
600
|
+
content: chunk.content,
|
|
601
|
+
language: chunk.language
|
|
602
|
+
};
|
|
603
|
+
});
|
|
604
|
+
// Save updated metadata
|
|
605
|
+
fs.writeFileSync(mainMetaPath, JSON.stringify(existingMeta), 'utf-8');
|
|
606
|
+
// Save batch state
|
|
607
|
+
const batchState = {
|
|
608
|
+
batchNumber,
|
|
609
|
+
batchSize,
|
|
610
|
+
batchStart,
|
|
611
|
+
batchEnd,
|
|
612
|
+
chunksProcessed: batchChunks.length,
|
|
613
|
+
completedAt: new Date().toISOString()
|
|
614
|
+
};
|
|
615
|
+
fs.writeFileSync(batchStatePath, JSON.stringify(batchState, null, 2), 'utf-8');
|
|
616
|
+
// Update in-memory metadata
|
|
617
|
+
this.metadata = new Map(Object.entries(existingMeta).map(([k, v]) => [parseInt(k), v]));
|
|
618
|
+
this.documentCount = this.metadata.size;
|
|
619
|
+
console.log(` [Batch ${batchNumber}] ✓ Indexed ${batchChunks.length} chunks (total: ${this.documentCount})`);
|
|
620
|
+
return {
|
|
621
|
+
totalChunks,
|
|
622
|
+
totalBatches,
|
|
623
|
+
currentBatch: batchNumber,
|
|
624
|
+
batchStart,
|
|
625
|
+
batchEnd,
|
|
626
|
+
chunksInBatch: batchChunks.length
|
|
627
|
+
};
|
|
628
|
+
}
|
|
629
|
+
/**
|
|
630
|
+
* Load metadata only (for batched mode - metadata was already saved during batches)
|
|
631
|
+
* This avoids regenerating embeddings which is expensive and was causing the issue.
|
|
632
|
+
*/
|
|
633
|
+
async loadMetadataOnly() {
|
|
634
|
+
const mainMetaPath = path.join(this.config.storePath, 'metadata.json');
|
|
635
|
+
const cachedIndexPath = path.join(this.config.storePath, 'index.faiss');
|
|
636
|
+
if (!fs.existsSync(mainMetaPath)) {
|
|
637
|
+
console.warn('No metadata found to load');
|
|
638
|
+
return;
|
|
639
|
+
}
|
|
640
|
+
// Load metadata
|
|
641
|
+
const metaData = JSON.parse(fs.readFileSync(mainMetaPath, 'utf-8'));
|
|
642
|
+
this.metadata = new Map(Object.entries(metaData).map(([k, v]) => [parseInt(k), v]));
|
|
643
|
+
this.documentCount = this.metadata.size;
|
|
644
|
+
// Try to load FAISS index if it exists
|
|
645
|
+
if (fs.existsSync(cachedIndexPath) && faiss) {
|
|
646
|
+
try {
|
|
647
|
+
this.index = faiss.IndexFlatIP.read(cachedIndexPath);
|
|
648
|
+
console.log(` Loaded FAISS index with ${this.documentCount} chunks`);
|
|
649
|
+
}
|
|
650
|
+
catch (e) {
|
|
651
|
+
console.log(` FAISS index not available, using keyword search (${this.documentCount} chunks)`);
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
else {
|
|
655
|
+
console.log(` Loaded ${this.documentCount} chunks (keyword search mode)`);
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
/**
|
|
659
|
+
* Build FAISS index from all accumulated metadata (call after all batches complete)
|
|
660
|
+
*/
|
|
661
|
+
async finalizeIndex() {
|
|
662
|
+
const mainMetaPath = path.join(this.config.storePath, 'metadata.json');
|
|
663
|
+
const cachedIndexPath = path.join(this.config.storePath, 'index.faiss');
|
|
664
|
+
const indexStatePath = path.join(this.config.storePath, 'index-state.json');
|
|
665
|
+
if (!fs.existsSync(mainMetaPath)) {
|
|
666
|
+
console.warn('No metadata found to finalize');
|
|
667
|
+
return;
|
|
668
|
+
}
|
|
669
|
+
const metaData = JSON.parse(fs.readFileSync(mainMetaPath, 'utf-8'));
|
|
670
|
+
this.metadata = new Map(Object.entries(metaData).map(([k, v]) => [parseInt(k), v]));
|
|
671
|
+
this.documentCount = this.metadata.size;
|
|
672
|
+
console.log(` Finalizing index with ${this.documentCount} chunks...`);
|
|
673
|
+
if (!faiss || this.documentCount === 0) {
|
|
674
|
+
console.log(' Using keyword search mode (FAISS not available or no chunks)');
|
|
675
|
+
return;
|
|
676
|
+
}
|
|
677
|
+
// Regenerate embeddings for FAISS index
|
|
678
|
+
const chunks = Array.from(this.metadata.values());
|
|
679
|
+
console.log(` Generating embeddings for final index...`);
|
|
680
|
+
const embeddings = await this.generateEmbeddings(chunks);
|
|
681
|
+
// Build FAISS index
|
|
682
|
+
this.index = new faiss.IndexFlatIP(this.embeddingDimension);
|
|
683
|
+
const normalizedEmbeddings = [];
|
|
684
|
+
for (let i = 0; i < embeddings.length; i++) {
|
|
685
|
+
normalizedEmbeddings.push(this.normalizeVector(embeddings[i]));
|
|
686
|
+
}
|
|
687
|
+
try {
|
|
688
|
+
const flatEmbeddings = normalizedEmbeddings.flat();
|
|
689
|
+
this.index.add(flatEmbeddings);
|
|
690
|
+
this.index.write(cachedIndexPath);
|
|
691
|
+
const currentCommit = await this.getCurrentCommitHash();
|
|
692
|
+
this.indexState = {
|
|
693
|
+
commitHash: currentCommit,
|
|
694
|
+
indexedAt: new Date().toISOString(),
|
|
695
|
+
fileCount: new Set(chunks.map(c => c.filePath)).size,
|
|
696
|
+
chunkCount: this.documentCount
|
|
697
|
+
};
|
|
698
|
+
fs.writeFileSync(indexStatePath, JSON.stringify(this.indexState, null, 2), 'utf-8');
|
|
699
|
+
console.log(` ✓ Finalized FAISS index with ${this.documentCount} chunks`);
|
|
700
|
+
}
|
|
701
|
+
catch (err) {
|
|
702
|
+
console.warn(` FAISS indexing failed, using keyword search: ${err}`);
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
/**
|
|
706
|
+
* Prioritize chunks for indexing when maxChunks limit is set.
|
|
707
|
+
* Prioritizes:
|
|
708
|
+
* 1. Core source directories (src/, lib/, app/)
|
|
709
|
+
* 2. Entry points and config files
|
|
710
|
+
* 3. Non-test files over test files
|
|
711
|
+
* 4. Smaller chunks (more complete code units)
|
|
712
|
+
*/
|
|
713
|
+
prioritizeChunks(chunks, maxChunks) {
|
|
714
|
+
// Score each chunk by priority
|
|
715
|
+
const scored = chunks.map(chunk => {
|
|
716
|
+
let score = 0;
|
|
717
|
+
const fp = chunk.filePath.toLowerCase();
|
|
718
|
+
// Prioritize core source directories
|
|
719
|
+
if (fp.startsWith('src/') || fp.startsWith('lib/') || fp.startsWith('app/')) {
|
|
720
|
+
score += 100;
|
|
721
|
+
}
|
|
722
|
+
// Entry points and important files
|
|
723
|
+
if (fp.includes('index.') || fp.includes('main.') || fp.includes('app.')) {
|
|
724
|
+
score += 50;
|
|
725
|
+
}
|
|
726
|
+
// Config files are important for understanding architecture
|
|
727
|
+
if (fp.includes('config') || fp.endsWith('.json') || fp.endsWith('.yaml')) {
|
|
728
|
+
score += 30;
|
|
729
|
+
}
|
|
730
|
+
// Deprioritize test files
|
|
731
|
+
if (this.isTestFile(chunk.filePath)) {
|
|
732
|
+
score -= 50;
|
|
733
|
+
}
|
|
734
|
+
// Deprioritize vendor/generated
|
|
735
|
+
if (fp.includes('vendor/') || fp.includes('generated/') || fp.includes('.min.')) {
|
|
736
|
+
score -= 100;
|
|
737
|
+
}
|
|
738
|
+
// Prefer smaller chunks (more likely to be complete logical units)
|
|
739
|
+
const chunkSize = chunk.content.length;
|
|
740
|
+
if (chunkSize < 1000)
|
|
741
|
+
score += 20;
|
|
742
|
+
else if (chunkSize > 3000)
|
|
743
|
+
score -= 10;
|
|
744
|
+
return { chunk, score };
|
|
745
|
+
});
|
|
746
|
+
// Sort by score descending and take top maxChunks
|
|
747
|
+
scored.sort((a, b) => b.score - a.score);
|
|
748
|
+
return scored.slice(0, maxChunks).map(s => s.chunk);
|
|
749
|
+
}
|
|
445
750
|
}
|
|
446
751
|
//# sourceMappingURL=index.js.map
|