viberag 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli/commands/mcp-setup.d.ts +1 -1
- package/dist/cli/commands/mcp-setup.js +23 -3
- package/dist/cli/components/CleanWizard.js +16 -1
- package/dist/cli/components/InitWizard.js +37 -15
- package/dist/common/types.d.ts +2 -2
- package/dist/mcp/index.js +5 -1
- package/dist/mcp/warmup.d.ts +5 -0
- package/dist/mcp/warmup.js +7 -0
- package/dist/rag/config/index.d.ts +4 -0
- package/dist/rag/config/index.js +37 -13
- package/dist/rag/embeddings/gemini.js +34 -7
- package/dist/rag/embeddings/index.d.ts +1 -0
- package/dist/rag/embeddings/index.js +1 -0
- package/dist/rag/embeddings/mistral.d.ts +2 -2
- package/dist/rag/embeddings/mistral.js +18 -5
- package/dist/rag/embeddings/openai.js +22 -3
- package/dist/rag/embeddings/validate.d.ts +22 -0
- package/dist/rag/embeddings/validate.js +148 -0
- package/dist/rag/index.d.ts +1 -1
- package/dist/rag/index.js +1 -1
- package/dist/rag/indexer/chunker.js +31 -19
- package/dist/rag/indexer/indexer.d.ts +10 -0
- package/dist/rag/indexer/indexer.js +88 -53
- package/dist/rag/search/index.d.ts +6 -0
- package/dist/rag/search/index.js +35 -9
- package/dist/rag/storage/index.d.ts +15 -1
- package/dist/rag/storage/index.js +108 -21
- package/package.json +33 -4
- package/dist/cli/__tests__/mcp-setup-comprehensive.test.d.ts +0 -10
- package/dist/cli/__tests__/mcp-setup-comprehensive.test.js +0 -515
- package/dist/cli/__tests__/mcp-setup-global.test.d.ts +0 -7
- package/dist/cli/__tests__/mcp-setup-global.test.js +0 -577
- package/dist/cli/__tests__/mcp-setup.test.d.ts +0 -6
- package/dist/cli/__tests__/mcp-setup.test.js +0 -704
- package/dist/rag/__tests__/grammar-smoke.test.d.ts +0 -9
- package/dist/rag/__tests__/grammar-smoke.test.js +0 -161
- package/dist/rag/__tests__/helpers.d.ts +0 -30
- package/dist/rag/__tests__/helpers.js +0 -67
- package/dist/rag/__tests__/merkle.test.d.ts +0 -5
- package/dist/rag/__tests__/merkle.test.js +0 -161
- package/dist/rag/__tests__/metadata-extraction.test.d.ts +0 -10
- package/dist/rag/__tests__/metadata-extraction.test.js +0 -202
- package/dist/rag/__tests__/multi-language.test.d.ts +0 -13
- package/dist/rag/__tests__/multi-language.test.js +0 -535
- package/dist/rag/__tests__/rag.test.d.ts +0 -10
- package/dist/rag/__tests__/rag.test.js +0 -311
- package/dist/rag/__tests__/search-exhaustive.test.d.ts +0 -9
- package/dist/rag/__tests__/search-exhaustive.test.js +0 -87
- package/dist/rag/__tests__/search-filters.test.d.ts +0 -10
- package/dist/rag/__tests__/search-filters.test.js +0 -250
- package/dist/rag/__tests__/search-modes.test.d.ts +0 -8
- package/dist/rag/__tests__/search-modes.test.js +0 -133
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* API key validation for cloud embedding providers.
|
|
3
|
+
*
|
|
4
|
+
* Makes a minimal test embedding call to verify the API key is valid
|
|
5
|
+
* before proceeding with indexing.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Embedding endpoints used for API key validation, keyed by provider name.
 *
 * Each validator in this module POSTs a single one-item test request to the
 * matching URL.
 */
const ENDPOINTS = {
    // Google Generative Language API; the model name is part of the URL.
    gemini: 'https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-001:embedContent',
    // Mistral embeddings API; the model is chosen in the request body.
    mistral: 'https://api.mistral.ai/v1/embeddings',
    // OpenAI embeddings API; the model is chosen in the request body.
    openai: 'https://api.openai.com/v1/embeddings',
};
|
|
15
|
+
/**
|
|
16
|
+
* Safely parse JSON response, logging parse failures instead of swallowing them.
|
|
17
|
+
*/
|
|
18
|
+
async function safeParseJson(response, provider) {
|
|
19
|
+
try {
|
|
20
|
+
return (await response.json());
|
|
21
|
+
}
|
|
22
|
+
catch (error) {
|
|
23
|
+
console.warn(`[${provider}] Failed to parse error response:`, error instanceof Error ? error.message : String(error));
|
|
24
|
+
return {};
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Validate an API key by making a minimal test embedding call.
|
|
29
|
+
*
|
|
30
|
+
* @param provider - The embedding provider type
|
|
31
|
+
* @param apiKey - The API key to validate
|
|
32
|
+
* @returns Validation result with error message if invalid
|
|
33
|
+
*/
|
|
34
|
+
export async function validateApiKey(provider, apiKey) {
|
|
35
|
+
// Local providers don't need API key validation
|
|
36
|
+
if (provider === 'local' || provider === 'local-4b') {
|
|
37
|
+
return { valid: true };
|
|
38
|
+
}
|
|
39
|
+
if (!apiKey || apiKey.trim() === '') {
|
|
40
|
+
return { valid: false, error: 'API key is required' };
|
|
41
|
+
}
|
|
42
|
+
try {
|
|
43
|
+
switch (provider) {
|
|
44
|
+
case 'gemini':
|
|
45
|
+
return await validateGeminiKey(apiKey);
|
|
46
|
+
case 'mistral':
|
|
47
|
+
return await validateMistralKey(apiKey);
|
|
48
|
+
case 'openai':
|
|
49
|
+
return await validateOpenAIKey(apiKey);
|
|
50
|
+
default:
|
|
51
|
+
return { valid: false, error: `Unknown provider: ${provider}` };
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
catch (err) {
|
|
55
|
+
return {
|
|
56
|
+
valid: false,
|
|
57
|
+
error: err instanceof Error ? err.message : String(err),
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Validate Gemini API key.
|
|
63
|
+
*/
|
|
64
|
+
async function validateGeminiKey(apiKey) {
|
|
65
|
+
const response = await fetch(ENDPOINTS.gemini, {
|
|
66
|
+
method: 'POST',
|
|
67
|
+
headers: {
|
|
68
|
+
'Content-Type': 'application/json',
|
|
69
|
+
'x-goog-api-key': apiKey,
|
|
70
|
+
},
|
|
71
|
+
body: JSON.stringify({
|
|
72
|
+
content: { parts: [{ text: 'test' }] },
|
|
73
|
+
}),
|
|
74
|
+
});
|
|
75
|
+
if (response.ok) {
|
|
76
|
+
return { valid: true };
|
|
77
|
+
}
|
|
78
|
+
const data = await safeParseJson(response, 'gemini');
|
|
79
|
+
const message = data?.error?.message ||
|
|
80
|
+
`HTTP ${response.status}`;
|
|
81
|
+
if (response.status === 400 && message.includes('API key')) {
|
|
82
|
+
return { valid: false, error: 'Invalid API key' };
|
|
83
|
+
}
|
|
84
|
+
if (response.status === 403) {
|
|
85
|
+
return { valid: false, error: 'API key not authorized for this API' };
|
|
86
|
+
}
|
|
87
|
+
return { valid: false, error: message };
|
|
88
|
+
}
|
|
89
|
+
/**
|
|
90
|
+
* Validate Mistral API key.
|
|
91
|
+
*/
|
|
92
|
+
async function validateMistralKey(apiKey) {
|
|
93
|
+
const response = await fetch(ENDPOINTS.mistral, {
|
|
94
|
+
method: 'POST',
|
|
95
|
+
headers: {
|
|
96
|
+
'Content-Type': 'application/json',
|
|
97
|
+
Authorization: `Bearer ${apiKey}`,
|
|
98
|
+
},
|
|
99
|
+
body: JSON.stringify({
|
|
100
|
+
model: 'codestral-embed',
|
|
101
|
+
input: ['test'],
|
|
102
|
+
}),
|
|
103
|
+
});
|
|
104
|
+
if (response.ok) {
|
|
105
|
+
return { valid: true };
|
|
106
|
+
}
|
|
107
|
+
const data = await safeParseJson(response, 'mistral');
|
|
108
|
+
const message = data?.message ||
|
|
109
|
+
data?.detail ||
|
|
110
|
+
`HTTP ${response.status}`;
|
|
111
|
+
if (response.status === 401) {
|
|
112
|
+
return { valid: false, error: 'Invalid API key' };
|
|
113
|
+
}
|
|
114
|
+
return { valid: false, error: message };
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Validate OpenAI API key.
|
|
118
|
+
*/
|
|
119
|
+
async function validateOpenAIKey(apiKey) {
|
|
120
|
+
const response = await fetch(ENDPOINTS.openai, {
|
|
121
|
+
method: 'POST',
|
|
122
|
+
headers: {
|
|
123
|
+
'Content-Type': 'application/json',
|
|
124
|
+
Authorization: `Bearer ${apiKey}`,
|
|
125
|
+
},
|
|
126
|
+
body: JSON.stringify({
|
|
127
|
+
model: 'text-embedding-3-small',
|
|
128
|
+
input: ['test'],
|
|
129
|
+
}),
|
|
130
|
+
});
|
|
131
|
+
if (response.ok) {
|
|
132
|
+
return { valid: true };
|
|
133
|
+
}
|
|
134
|
+
const data = await safeParseJson(response, 'openai');
|
|
135
|
+
const error = data?.error;
|
|
136
|
+
if (response.status === 401) {
|
|
137
|
+
return { valid: false, error: 'Invalid API key' };
|
|
138
|
+
}
|
|
139
|
+
if (error?.message) {
|
|
140
|
+
// Truncate long error messages
|
|
141
|
+
const msg = error.message;
|
|
142
|
+
return {
|
|
143
|
+
valid: false,
|
|
144
|
+
error: msg.length > 100 ? msg.slice(0, 100) + '...' : msg,
|
|
145
|
+
};
|
|
146
|
+
}
|
|
147
|
+
return { valid: false, error: `HTTP ${response.status}` };
|
|
148
|
+
}
|
package/dist/rag/index.d.ts
CHANGED
|
@@ -10,6 +10,6 @@ export { loadManifest, saveManifest, manifestExists, createEmptyManifest, update
|
|
|
10
10
|
export { Storage, SCHEMA_VERSION, createCodeChunksSchema, createEmbeddingCacheSchema, chunkToRow, rowToChunk, embeddingToRow, rowToEmbedding, type CodeChunk, type CodeChunkRow, type CachedEmbedding, type CachedEmbeddingRow, type ChunkType, } from './storage/index.js';
|
|
11
11
|
export { MerkleTree, compareTrees, createEmptyDiff, computeFileHash, computeStringHash, computeDirectoryHash, isBinaryFile, shouldExclude, hasValidExtension, serializeNode, deserializeNode, createFileNode, createDirectoryNode, type MerkleNode, type NodeType, type SerializedNode, type TreeDiff, type BuildStats, } from './merkle/index.js';
|
|
12
12
|
export { Chunker, Indexer, createEmptyIndexStats, type Chunk, type IndexOptions, type IndexStats, type ProgressCallback, type SupportedLanguage, } from './indexer/index.js';
|
|
13
|
-
export { GeminiEmbeddingProvider, MistralEmbeddingProvider, OpenAIEmbeddingProvider, type EmbeddingProvider, } from './embeddings/index.js';
|
|
13
|
+
export { GeminiEmbeddingProvider, MistralEmbeddingProvider, OpenAIEmbeddingProvider, validateApiKey, type EmbeddingProvider, type ValidationResult, } from './embeddings/index.js';
|
|
14
14
|
export { SearchEngine, vectorSearch, ftsSearch, ensureFtsIndex, hybridRerank, type SearchFilters, type SearchMode, type SearchOptions, type SearchResult, type SearchResults, } from './search/index.js';
|
|
15
15
|
export { loadGitignore, shouldIgnore, createGitignoreFilter, clearGitignoreCache, clearAllGitignoreCache, } from './gitignore/index.js';
|
package/dist/rag/index.js
CHANGED
|
@@ -18,7 +18,7 @@ export { MerkleTree, compareTrees, createEmptyDiff, computeFileHash, computeStri
|
|
|
18
18
|
// Indexer (Chunking & Orchestration)
|
|
19
19
|
export { Chunker, Indexer, createEmptyIndexStats, } from './indexer/index.js';
|
|
20
20
|
// Embeddings
|
|
21
|
-
export { GeminiEmbeddingProvider, MistralEmbeddingProvider, OpenAIEmbeddingProvider, } from './embeddings/index.js';
|
|
21
|
+
export { GeminiEmbeddingProvider, MistralEmbeddingProvider, OpenAIEmbeddingProvider, validateApiKey, } from './embeddings/index.js';
|
|
22
22
|
// Search
|
|
23
23
|
export { SearchEngine, vectorSearch, ftsSearch, ensureFtsIndex, hybridRerank, } from './search/index.js';
|
|
24
24
|
// Gitignore
|
|
@@ -188,28 +188,40 @@ export class Chunker {
|
|
|
188
188
|
await Parser.init();
|
|
189
189
|
// Create parser instance after init
|
|
190
190
|
this.parser = new Parser();
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
191
|
+
try {
|
|
192
|
+
// Resolve the path to tree-sitter-wasms/out/
|
|
193
|
+
const wasmPackagePath = require.resolve('tree-sitter-wasms/package.json');
|
|
194
|
+
this.wasmBasePath = path.join(path.dirname(wasmPackagePath), 'out');
|
|
195
|
+
// Load all language grammars sequentially (skip null entries like Dart)
|
|
196
|
+
// IMPORTANT: Must be sequential - web-tree-sitter has global state that
|
|
197
|
+
// gets corrupted when loading multiple WASM modules in parallel.
|
|
198
|
+
for (const [lang, wasmFile] of Object.entries(LANGUAGE_WASM_FILES)) {
|
|
199
|
+
if (!wasmFile) {
|
|
200
|
+
// Language temporarily disabled (e.g., Dart due to version mismatch)
|
|
201
|
+
continue;
|
|
202
|
+
}
|
|
203
|
+
try {
|
|
204
|
+
const wasmPath = path.join(this.wasmBasePath, wasmFile);
|
|
205
|
+
const language = await Parser.Language.load(wasmPath);
|
|
206
|
+
this.languages.set(lang, language);
|
|
207
|
+
}
|
|
208
|
+
catch (error) {
|
|
209
|
+
// Log but don't fail - we can still work with other languages
|
|
210
|
+
console.error(`Failed to load ${lang} grammar:`, error);
|
|
211
|
+
}
|
|
206
212
|
}
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
213
|
+
this.initialized = true;
|
|
214
|
+
}
|
|
215
|
+
catch (error) {
|
|
216
|
+
// Cleanup parser on failure to prevent resource leak
|
|
217
|
+
if (this.parser) {
|
|
218
|
+
this.parser.delete();
|
|
219
|
+
this.parser = null;
|
|
210
220
|
}
|
|
221
|
+
this.wasmBasePath = null;
|
|
222
|
+
this.languages.clear();
|
|
223
|
+
throw error;
|
|
211
224
|
}
|
|
212
|
-
this.initialized = true;
|
|
213
225
|
}
|
|
214
226
|
/**
|
|
215
227
|
* Get the language for a file extension.
|
|
@@ -33,11 +33,17 @@ export declare class Indexer {
|
|
|
33
33
|
private chunker;
|
|
34
34
|
private embeddings;
|
|
35
35
|
private logger;
|
|
36
|
+
private indexPromise;
|
|
36
37
|
constructor(projectRoot: string, logger?: Logger);
|
|
37
38
|
/**
|
|
38
39
|
* Run the indexing pipeline.
|
|
40
|
+
* Uses mutex to prevent concurrent index operations.
|
|
39
41
|
*/
|
|
40
42
|
index(options?: IndexOptions): Promise<IndexStats>;
|
|
43
|
+
/**
|
|
44
|
+
* Perform the actual indexing operation.
|
|
45
|
+
*/
|
|
46
|
+
private doIndex;
|
|
41
47
|
/**
|
|
42
48
|
* Create a diff that treats all files as new (for force reindex).
|
|
43
49
|
*/
|
|
@@ -48,6 +54,10 @@ export declare class Indexer {
|
|
|
48
54
|
private collectAllFilesFromSerialized;
|
|
49
55
|
/**
|
|
50
56
|
* Process a batch of files: read, chunk, embed, and prepare CodeChunks.
|
|
57
|
+
*
|
|
58
|
+
* Error handling strategy:
|
|
59
|
+
* - File read/parse errors: Log and continue (file-specific, recoverable)
|
|
60
|
+
* - Embedding/storage errors: Let propagate (fatal, affects all files)
|
|
51
61
|
*/
|
|
52
62
|
private processFileBatch;
|
|
53
63
|
/**
|
|
@@ -62,13 +62,37 @@ export class Indexer {
|
|
|
62
62
|
writable: true,
|
|
63
63
|
value: null
|
|
64
64
|
});
|
|
65
|
+
Object.defineProperty(this, "indexPromise", {
|
|
66
|
+
enumerable: true,
|
|
67
|
+
configurable: true,
|
|
68
|
+
writable: true,
|
|
69
|
+
value: null
|
|
70
|
+
});
|
|
65
71
|
this.projectRoot = projectRoot;
|
|
66
72
|
this.logger = logger ?? null;
|
|
67
73
|
}
|
|
68
74
|
/**
|
|
69
75
|
* Run the indexing pipeline.
|
|
76
|
+
* Uses mutex to prevent concurrent index operations.
|
|
70
77
|
*/
|
|
71
78
|
async index(options = {}) {
|
|
79
|
+
// If indexing is already in progress, wait for it
|
|
80
|
+
if (this.indexPromise) {
|
|
81
|
+
this.log('warn', 'Index already in progress, waiting for completion');
|
|
82
|
+
return this.indexPromise;
|
|
83
|
+
}
|
|
84
|
+
this.indexPromise = this.doIndex(options);
|
|
85
|
+
try {
|
|
86
|
+
return await this.indexPromise;
|
|
87
|
+
}
|
|
88
|
+
finally {
|
|
89
|
+
this.indexPromise = null;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Perform the actual indexing operation.
|
|
94
|
+
*/
|
|
95
|
+
async doIndex(options = {}) {
|
|
72
96
|
const stats = createEmptyIndexStats();
|
|
73
97
|
const { force = false, progressCallback } = options;
|
|
74
98
|
try {
|
|
@@ -193,68 +217,79 @@ export class Indexer {
|
|
|
193
217
|
}
|
|
194
218
|
/**
|
|
195
219
|
* Process a batch of files: read, chunk, embed, and prepare CodeChunks.
|
|
220
|
+
*
|
|
221
|
+
* Error handling strategy:
|
|
222
|
+
* - File read/parse errors: Log and continue (file-specific, recoverable)
|
|
223
|
+
* - Embedding/storage errors: Let propagate (fatal, affects all files)
|
|
196
224
|
*/
|
|
197
225
|
async processFileBatch(filepaths, chunker, embeddings, storage, stats) {
|
|
198
226
|
const allChunks = [];
|
|
199
227
|
for (const filepath of filepaths) {
|
|
228
|
+
// Phase 1: File reading and chunking (recoverable errors)
|
|
229
|
+
let content;
|
|
230
|
+
let fileHash;
|
|
231
|
+
let chunks;
|
|
200
232
|
try {
|
|
201
233
|
const absolutePath = path.join(this.projectRoot, filepath);
|
|
202
|
-
|
|
203
|
-
|
|
234
|
+
content = await fs.readFile(absolutePath, 'utf-8');
|
|
235
|
+
fileHash = (await import('../merkle/hash.js')).computeStringHash(content);
|
|
204
236
|
// Chunk the file (with size limits from config)
|
|
205
|
-
|
|
206
|
-
// Check embedding cache for each chunk
|
|
207
|
-
const contentHashes = chunks.map(c => c.contentHash);
|
|
208
|
-
const cachedEmbeddings = await storage.getCachedEmbeddings(contentHashes);
|
|
209
|
-
// Compute embeddings for cache misses
|
|
210
|
-
const missingChunks = chunks.filter(c => !cachedEmbeddings.has(c.contentHash));
|
|
211
|
-
if (missingChunks.length > 0) {
|
|
212
|
-
// Embed contextHeader + text for semantic relevance
|
|
213
|
-
const texts = missingChunks.map(c => c.contextHeader ? `${c.contextHeader}\n${c.text}` : c.text);
|
|
214
|
-
const newEmbeddings = await embeddings.embed(texts);
|
|
215
|
-
stats.embeddingsComputed += missingChunks.length;
|
|
216
|
-
// Cache the new embeddings
|
|
217
|
-
const cacheEntries = missingChunks.map((chunk, i) => ({
|
|
218
|
-
contentHash: chunk.contentHash,
|
|
219
|
-
vector: newEmbeddings[i],
|
|
220
|
-
createdAt: new Date().toISOString(),
|
|
221
|
-
}));
|
|
222
|
-
await storage.cacheEmbeddings(cacheEntries);
|
|
223
|
-
// Add to cachedEmbeddings map
|
|
224
|
-
missingChunks.forEach((chunk, i) => {
|
|
225
|
-
cachedEmbeddings.set(chunk.contentHash, newEmbeddings[i]);
|
|
226
|
-
});
|
|
227
|
-
}
|
|
228
|
-
stats.embeddingsCached += chunks.length - missingChunks.length;
|
|
229
|
-
// Build CodeChunk objects
|
|
230
|
-
const filename = path.basename(filepath);
|
|
231
|
-
const extension = path.extname(filepath);
|
|
232
|
-
for (const chunk of chunks) {
|
|
233
|
-
const vector = cachedEmbeddings.get(chunk.contentHash);
|
|
234
|
-
allChunks.push({
|
|
235
|
-
id: `${filepath}:${chunk.startLine}`,
|
|
236
|
-
vector,
|
|
237
|
-
text: chunk.text,
|
|
238
|
-
contentHash: chunk.contentHash,
|
|
239
|
-
filepath,
|
|
240
|
-
filename,
|
|
241
|
-
extension,
|
|
242
|
-
type: chunk.type,
|
|
243
|
-
name: chunk.name,
|
|
244
|
-
startLine: chunk.startLine,
|
|
245
|
-
endLine: chunk.endLine,
|
|
246
|
-
fileHash,
|
|
247
|
-
// New metadata fields from schema v2
|
|
248
|
-
signature: chunk.signature,
|
|
249
|
-
docstring: chunk.docstring,
|
|
250
|
-
isExported: chunk.isExported,
|
|
251
|
-
decoratorNames: chunk.decoratorNames,
|
|
252
|
-
});
|
|
253
|
-
}
|
|
237
|
+
chunks = await chunker.chunkFile(filepath, content, this.config.chunkMaxSize);
|
|
254
238
|
}
|
|
255
239
|
catch (error) {
|
|
256
|
-
|
|
257
|
-
|
|
240
|
+
// File-specific error (read/parse) - log and continue with other files
|
|
241
|
+
this.log('warn', `Failed to read/parse file: ${filepath}`, error);
|
|
242
|
+
continue;
|
|
243
|
+
}
|
|
244
|
+
// Phase 2: Embedding and storage (fatal errors - let propagate)
|
|
245
|
+
// NO try-catch here - API/storage errors should stop indexing
|
|
246
|
+
// Check embedding cache for each chunk
|
|
247
|
+
const contentHashes = chunks.map(c => c.contentHash);
|
|
248
|
+
const cachedEmbeddings = await storage.getCachedEmbeddings(contentHashes);
|
|
249
|
+
// Compute embeddings for cache misses
|
|
250
|
+
const missingChunks = chunks.filter(c => !cachedEmbeddings.has(c.contentHash));
|
|
251
|
+
if (missingChunks.length > 0) {
|
|
252
|
+
// Embed contextHeader + text for semantic relevance
|
|
253
|
+
const texts = missingChunks.map(c => c.contextHeader ? `${c.contextHeader}\n${c.text}` : c.text);
|
|
254
|
+
const newEmbeddings = await embeddings.embed(texts);
|
|
255
|
+
stats.embeddingsComputed += missingChunks.length;
|
|
256
|
+
// Cache the new embeddings
|
|
257
|
+
const cacheEntries = missingChunks.map((chunk, i) => ({
|
|
258
|
+
contentHash: chunk.contentHash,
|
|
259
|
+
vector: newEmbeddings[i],
|
|
260
|
+
createdAt: new Date().toISOString(),
|
|
261
|
+
}));
|
|
262
|
+
await storage.cacheEmbeddings(cacheEntries);
|
|
263
|
+
// Add to cachedEmbeddings map
|
|
264
|
+
missingChunks.forEach((chunk, i) => {
|
|
265
|
+
cachedEmbeddings.set(chunk.contentHash, newEmbeddings[i]);
|
|
266
|
+
});
|
|
267
|
+
}
|
|
268
|
+
stats.embeddingsCached += chunks.length - missingChunks.length;
|
|
269
|
+
// Build CodeChunk objects
|
|
270
|
+
const filename = path.basename(filepath);
|
|
271
|
+
const extension = path.extname(filepath);
|
|
272
|
+
for (const chunk of chunks) {
|
|
273
|
+
const vector = cachedEmbeddings.get(chunk.contentHash);
|
|
274
|
+
allChunks.push({
|
|
275
|
+
id: `${filepath}:${chunk.startLine}`,
|
|
276
|
+
vector,
|
|
277
|
+
text: chunk.text,
|
|
278
|
+
contentHash: chunk.contentHash,
|
|
279
|
+
filepath,
|
|
280
|
+
filename,
|
|
281
|
+
extension,
|
|
282
|
+
type: chunk.type,
|
|
283
|
+
name: chunk.name,
|
|
284
|
+
startLine: chunk.startLine,
|
|
285
|
+
endLine: chunk.endLine,
|
|
286
|
+
fileHash,
|
|
287
|
+
// New metadata fields from schema v2
|
|
288
|
+
signature: chunk.signature,
|
|
289
|
+
docstring: chunk.docstring,
|
|
290
|
+
isExported: chunk.isExported,
|
|
291
|
+
decoratorNames: chunk.decoratorNames,
|
|
292
|
+
});
|
|
258
293
|
}
|
|
259
294
|
}
|
|
260
295
|
return allChunks;
|
|
@@ -24,6 +24,7 @@ export declare class SearchEngine {
|
|
|
24
24
|
private embeddings;
|
|
25
25
|
private logger;
|
|
26
26
|
private initialized;
|
|
27
|
+
private initPromise;
|
|
27
28
|
constructor(projectRoot: string, logger?: Logger);
|
|
28
29
|
/**
|
|
29
30
|
* Primary search method. Dispatches to appropriate search mode.
|
|
@@ -74,8 +75,13 @@ export declare class SearchEngine {
|
|
|
74
75
|
warmup(): Promise<void>;
|
|
75
76
|
/**
|
|
76
77
|
* Initialize the search engine.
|
|
78
|
+
* Uses idempotent promise pattern to prevent race conditions.
|
|
77
79
|
*/
|
|
78
80
|
private ensureInitialized;
|
|
81
|
+
/**
|
|
82
|
+
* Perform actual initialization.
|
|
83
|
+
*/
|
|
84
|
+
private doInitialize;
|
|
79
85
|
/**
|
|
80
86
|
* Create the appropriate embedding provider based on config.
|
|
81
87
|
*/
|
package/dist/rag/search/index.js
CHANGED
|
@@ -64,6 +64,12 @@ export class SearchEngine {
|
|
|
64
64
|
writable: true,
|
|
65
65
|
value: false
|
|
66
66
|
});
|
|
67
|
+
Object.defineProperty(this, "initPromise", {
|
|
68
|
+
enumerable: true,
|
|
69
|
+
configurable: true,
|
|
70
|
+
writable: true,
|
|
71
|
+
value: null
|
|
72
|
+
});
|
|
67
73
|
this.projectRoot = projectRoot;
|
|
68
74
|
this.logger = logger ?? null;
|
|
69
75
|
}
|
|
@@ -315,19 +321,39 @@ export class SearchEngine {
|
|
|
315
321
|
}
|
|
316
322
|
/**
|
|
317
323
|
* Initialize the search engine.
|
|
324
|
+
* Uses idempotent promise pattern to prevent race conditions.
|
|
318
325
|
*/
|
|
319
326
|
async ensureInitialized() {
|
|
327
|
+
// Fast path: already initialized
|
|
320
328
|
if (this.initialized)
|
|
321
329
|
return;
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
330
|
+
// Idempotent: return existing promise if initialization in progress
|
|
331
|
+
if (this.initPromise)
|
|
332
|
+
return this.initPromise;
|
|
333
|
+
// Start initialization and store promise
|
|
334
|
+
this.initPromise = this.doInitialize();
|
|
335
|
+
return this.initPromise;
|
|
336
|
+
}
|
|
337
|
+
/**
|
|
338
|
+
* Perform actual initialization.
|
|
339
|
+
*/
|
|
340
|
+
async doInitialize() {
|
|
341
|
+
try {
|
|
342
|
+
const config = await loadConfig(this.projectRoot);
|
|
343
|
+
// Initialize storage
|
|
344
|
+
this.storage = new Storage(this.projectRoot, config.embeddingDimensions);
|
|
345
|
+
await this.storage.connect();
|
|
346
|
+
// Initialize embeddings with config (includes apiKey for cloud providers)
|
|
347
|
+
this.embeddings = this.createEmbeddingProvider(config);
|
|
348
|
+
await this.embeddings.initialize();
|
|
349
|
+
this.initialized = true;
|
|
350
|
+
this.log('info', 'SearchEngine initialized');
|
|
351
|
+
}
|
|
352
|
+
catch (error) {
|
|
353
|
+
// Reset promise on failure to allow retry
|
|
354
|
+
this.initPromise = null;
|
|
355
|
+
throw error;
|
|
356
|
+
}
|
|
331
357
|
}
|
|
332
358
|
/**
|
|
333
359
|
* Create the appropriate embedding provider based on config.
|
|
@@ -15,8 +15,14 @@ export declare class Storage {
|
|
|
15
15
|
/**
|
|
16
16
|
* Connect to the LanceDB database.
|
|
17
17
|
* Creates tables if they don't exist.
|
|
18
|
+
* Validates that existing tables have matching dimensions.
|
|
18
19
|
*/
|
|
19
20
|
connect(): Promise<void>;
|
|
21
|
+
/**
|
|
22
|
+
* Get the vector column dimensions from a table schema.
|
|
23
|
+
* Returns null if vector column not found.
|
|
24
|
+
*/
|
|
25
|
+
private getTableVectorDimensions;
|
|
20
26
|
/**
|
|
21
27
|
* Close the database connection.
|
|
22
28
|
*/
|
|
@@ -26,6 +32,14 @@ export declare class Storage {
|
|
|
26
32
|
* Note: chunksTable may be null after resetChunksTable().
|
|
27
33
|
*/
|
|
28
34
|
private ensureConnected;
|
|
35
|
+
/**
|
|
36
|
+
* Get the database connection with a clear error if not connected.
|
|
37
|
+
*/
|
|
38
|
+
private getDb;
|
|
39
|
+
/**
|
|
40
|
+
* Get the cache table with a clear error if not connected.
|
|
41
|
+
*/
|
|
42
|
+
private getCacheTable;
|
|
29
43
|
/**
|
|
30
44
|
* Upsert chunks into the database.
|
|
31
45
|
* Uses merge insert to update existing chunks or add new ones.
|
|
@@ -34,7 +48,6 @@ export declare class Storage {
|
|
|
34
48
|
/**
|
|
35
49
|
* Add chunks to the database (no merge, just insert).
|
|
36
50
|
* Use this after resetChunksTable() to avoid schema mismatch issues.
|
|
37
|
-
* Creates the table from data if it doesn't exist.
|
|
38
51
|
*/
|
|
39
52
|
addChunks(chunks: CodeChunk[]): Promise<void>;
|
|
40
53
|
/**
|
|
@@ -87,6 +100,7 @@ export declare class Storage {
|
|
|
87
100
|
clearCache(): Promise<void>;
|
|
88
101
|
/**
|
|
89
102
|
* Get the chunks table for direct querying (e.g., search).
|
|
103
|
+
* @throws Error if table doesn't exist (not indexed yet)
|
|
90
104
|
*/
|
|
91
105
|
getChunksTable(): Table;
|
|
92
106
|
}
|