viberag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -0
- package/README.md +219 -0
- package/dist/cli/__tests__/mcp-setup.test.d.ts +6 -0
- package/dist/cli/__tests__/mcp-setup.test.js +597 -0
- package/dist/cli/app.d.ts +2 -0
- package/dist/cli/app.js +238 -0
- package/dist/cli/commands/handlers.d.ts +57 -0
- package/dist/cli/commands/handlers.js +231 -0
- package/dist/cli/commands/index.d.ts +2 -0
- package/dist/cli/commands/index.js +2 -0
- package/dist/cli/commands/mcp-setup.d.ts +107 -0
- package/dist/cli/commands/mcp-setup.js +509 -0
- package/dist/cli/commands/useRagCommands.d.ts +23 -0
- package/dist/cli/commands/useRagCommands.js +180 -0
- package/dist/cli/components/CleanWizard.d.ts +17 -0
- package/dist/cli/components/CleanWizard.js +169 -0
- package/dist/cli/components/InitWizard.d.ts +20 -0
- package/dist/cli/components/InitWizard.js +370 -0
- package/dist/cli/components/McpSetupWizard.d.ts +37 -0
- package/dist/cli/components/McpSetupWizard.js +387 -0
- package/dist/cli/components/SearchResultsDisplay.d.ts +13 -0
- package/dist/cli/components/SearchResultsDisplay.js +130 -0
- package/dist/cli/components/WelcomeBanner.d.ts +10 -0
- package/dist/cli/components/WelcomeBanner.js +26 -0
- package/dist/cli/components/index.d.ts +1 -0
- package/dist/cli/components/index.js +1 -0
- package/dist/cli/data/mcp-editors.d.ts +80 -0
- package/dist/cli/data/mcp-editors.js +270 -0
- package/dist/cli/index.d.ts +2 -0
- package/dist/cli/index.js +26 -0
- package/dist/cli-bundle.cjs +5269 -0
- package/dist/common/commands/terminalSetup.d.ts +2 -0
- package/dist/common/commands/terminalSetup.js +144 -0
- package/dist/common/components/CommandSuggestions.d.ts +9 -0
- package/dist/common/components/CommandSuggestions.js +20 -0
- package/dist/common/components/StaticWithResize.d.ts +23 -0
- package/dist/common/components/StaticWithResize.js +62 -0
- package/dist/common/components/StatusBar.d.ts +8 -0
- package/dist/common/components/StatusBar.js +64 -0
- package/dist/common/components/TextInput.d.ts +12 -0
- package/dist/common/components/TextInput.js +239 -0
- package/dist/common/components/index.d.ts +3 -0
- package/dist/common/components/index.js +3 -0
- package/dist/common/hooks/index.d.ts +4 -0
- package/dist/common/hooks/index.js +4 -0
- package/dist/common/hooks/useCommandHistory.d.ts +7 -0
- package/dist/common/hooks/useCommandHistory.js +51 -0
- package/dist/common/hooks/useCtrlC.d.ts +9 -0
- package/dist/common/hooks/useCtrlC.js +40 -0
- package/dist/common/hooks/useKittyKeyboard.d.ts +10 -0
- package/dist/common/hooks/useKittyKeyboard.js +26 -0
- package/dist/common/hooks/useStaticOutputBuffer.d.ts +31 -0
- package/dist/common/hooks/useStaticOutputBuffer.js +58 -0
- package/dist/common/hooks/useTerminalResize.d.ts +28 -0
- package/dist/common/hooks/useTerminalResize.js +51 -0
- package/dist/common/hooks/useTextBuffer.d.ts +13 -0
- package/dist/common/hooks/useTextBuffer.js +165 -0
- package/dist/common/index.d.ts +13 -0
- package/dist/common/index.js +17 -0
- package/dist/common/types.d.ts +162 -0
- package/dist/common/types.js +1 -0
- package/dist/mcp/index.d.ts +12 -0
- package/dist/mcp/index.js +66 -0
- package/dist/mcp/server.d.ts +25 -0
- package/dist/mcp/server.js +837 -0
- package/dist/mcp/watcher.d.ts +86 -0
- package/dist/mcp/watcher.js +334 -0
- package/dist/rag/__tests__/grammar-smoke.test.d.ts +9 -0
- package/dist/rag/__tests__/grammar-smoke.test.js +161 -0
- package/dist/rag/__tests__/helpers.d.ts +30 -0
- package/dist/rag/__tests__/helpers.js +67 -0
- package/dist/rag/__tests__/merkle.test.d.ts +5 -0
- package/dist/rag/__tests__/merkle.test.js +161 -0
- package/dist/rag/__tests__/metadata-extraction.test.d.ts +10 -0
- package/dist/rag/__tests__/metadata-extraction.test.js +202 -0
- package/dist/rag/__tests__/multi-language.test.d.ts +13 -0
- package/dist/rag/__tests__/multi-language.test.js +535 -0
- package/dist/rag/__tests__/rag.test.d.ts +10 -0
- package/dist/rag/__tests__/rag.test.js +311 -0
- package/dist/rag/__tests__/search-exhaustive.test.d.ts +9 -0
- package/dist/rag/__tests__/search-exhaustive.test.js +87 -0
- package/dist/rag/__tests__/search-filters.test.d.ts +10 -0
- package/dist/rag/__tests__/search-filters.test.js +250 -0
- package/dist/rag/__tests__/search-modes.test.d.ts +8 -0
- package/dist/rag/__tests__/search-modes.test.js +133 -0
- package/dist/rag/config/index.d.ts +61 -0
- package/dist/rag/config/index.js +111 -0
- package/dist/rag/constants.d.ts +41 -0
- package/dist/rag/constants.js +57 -0
- package/dist/rag/embeddings/fastembed.d.ts +62 -0
- package/dist/rag/embeddings/fastembed.js +124 -0
- package/dist/rag/embeddings/gemini.d.ts +26 -0
- package/dist/rag/embeddings/gemini.js +116 -0
- package/dist/rag/embeddings/index.d.ts +10 -0
- package/dist/rag/embeddings/index.js +9 -0
- package/dist/rag/embeddings/local-4b.d.ts +28 -0
- package/dist/rag/embeddings/local-4b.js +51 -0
- package/dist/rag/embeddings/local.d.ts +29 -0
- package/dist/rag/embeddings/local.js +119 -0
- package/dist/rag/embeddings/mistral.d.ts +22 -0
- package/dist/rag/embeddings/mistral.js +85 -0
- package/dist/rag/embeddings/openai.d.ts +22 -0
- package/dist/rag/embeddings/openai.js +85 -0
- package/dist/rag/embeddings/types.d.ts +37 -0
- package/dist/rag/embeddings/types.js +1 -0
- package/dist/rag/gitignore/index.d.ts +57 -0
- package/dist/rag/gitignore/index.js +178 -0
- package/dist/rag/index.d.ts +15 -0
- package/dist/rag/index.js +25 -0
- package/dist/rag/indexer/chunker.d.ts +129 -0
- package/dist/rag/indexer/chunker.js +1352 -0
- package/dist/rag/indexer/index.d.ts +6 -0
- package/dist/rag/indexer/index.js +6 -0
- package/dist/rag/indexer/indexer.d.ts +73 -0
- package/dist/rag/indexer/indexer.js +356 -0
- package/dist/rag/indexer/types.d.ts +68 -0
- package/dist/rag/indexer/types.js +47 -0
- package/dist/rag/logger/index.d.ts +20 -0
- package/dist/rag/logger/index.js +75 -0
- package/dist/rag/manifest/index.d.ts +50 -0
- package/dist/rag/manifest/index.js +97 -0
- package/dist/rag/merkle/diff.d.ts +26 -0
- package/dist/rag/merkle/diff.js +95 -0
- package/dist/rag/merkle/hash.d.ts +34 -0
- package/dist/rag/merkle/hash.js +165 -0
- package/dist/rag/merkle/index.d.ts +68 -0
- package/dist/rag/merkle/index.js +298 -0
- package/dist/rag/merkle/node.d.ts +51 -0
- package/dist/rag/merkle/node.js +69 -0
- package/dist/rag/search/filters.d.ts +21 -0
- package/dist/rag/search/filters.js +100 -0
- package/dist/rag/search/fts.d.ts +32 -0
- package/dist/rag/search/fts.js +61 -0
- package/dist/rag/search/hybrid.d.ts +17 -0
- package/dist/rag/search/hybrid.js +58 -0
- package/dist/rag/search/index.d.ts +89 -0
- package/dist/rag/search/index.js +367 -0
- package/dist/rag/search/types.d.ts +130 -0
- package/dist/rag/search/types.js +4 -0
- package/dist/rag/search/vector.d.ts +25 -0
- package/dist/rag/search/vector.js +44 -0
- package/dist/rag/storage/index.d.ts +92 -0
- package/dist/rag/storage/index.js +287 -0
- package/dist/rag/storage/lancedb-native.d.ts +7 -0
- package/dist/rag/storage/lancedb-native.js +10 -0
- package/dist/rag/storage/schema.d.ts +23 -0
- package/dist/rag/storage/schema.js +50 -0
- package/dist/rag/storage/types.d.ts +100 -0
- package/dist/rag/storage/types.js +68 -0
- package/package.json +67 -0
- package/scripts/check-node-version.js +37 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector similarity search using LanceDB.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Perform vector similarity search.
 *
 * @param table - LanceDB table to search
 * @param queryVector - Query embedding vector
 * @param options - Search options object (`limit`, optional `filterClause`,
 *   optional `minScore`), or a bare number for the legacy
 *   `vectorSearch(table, vector, limit)` signature. May be omitted, in which
 *   case no explicit limit, filter, or score threshold is applied.
 * @returns Array of search results with vector scores
 */
export async function vectorSearch(table, queryVector, options) {
    // Support legacy signature: vectorSearch(table, vector, limit).
    // A missing options argument becomes an empty object so property reads
    // below cannot throw a TypeError.
    const opts = typeof options === 'number' ? { limit: options } : (options ?? {});
    let query = table.search(queryVector);
    // Only constrain the result count when the caller actually supplied one
    if (opts.limit != null) {
        query = query.limit(opts.limit);
    }
    // Apply filter if provided
    if (opts.filterClause) {
        query = query.where(opts.filterClause);
    }
    const rows = await query.toArray();
    const scored = rows.map(row => {
        // LanceDB returns _distance (lower is better for L2/cosine);
        // convert to a similarity score where higher is better.
        const distance = row._distance ?? 0;
        const vectorScore = 1 / (1 + distance);
        return {
            id: row.id,
            text: row.text,
            filepath: row.filepath,
            filename: row.filename,
            name: row.name,
            type: row.type,
            startLine: row.start_line,
            endLine: row.end_line,
            score: vectorScore,
            vectorScore,
            signature: row.signature,
            isExported: row.is_exported,
        };
    });
    // A falsy minScore (undefined, 0) disables the threshold
    return scored.filter(result => !opts.minScore || result.score >= opts.minScore);
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import type { Table } from '@lancedb/lancedb';
|
|
2
|
+
import { type CachedEmbedding, type CodeChunk } from './types.js';
|
|
3
|
+
export * from './types.js';
|
|
4
|
+
export * from './schema.js';
|
|
5
|
+
/**
 * Persistence layer on top of LanceDB holding two tables: the indexed code
 * chunks and a content-addressed embedding cache.
 */
export declare class Storage {
    private readonly projectRoot;
    private readonly dimensions;
    private db;
    private chunksTable;
    private cacheTable;
    constructor(projectRoot: string, dimensions?: number);
    /**
     * Open the LanceDB database, creating any table that does not exist yet.
     */
    connect(): Promise<void>;
    /**
     * Release the database connection.
     */
    close(): void;
    /**
     * Guard that the storage has been connected.
     * Note: chunksTable may still be null after resetChunksTable().
     */
    private ensureConnected;
    /**
     * Insert-or-update chunks via a merge insert, so existing chunks are
     * updated and new ones are added.
     */
    upsertChunks(chunks: CodeChunk[]): Promise<void>;
    /**
     * Plain insert of chunks (no merge).
     * Intended for use right after resetChunksTable() to sidestep schema
     * mismatch issues; builds the table from the data when it is missing.
     */
    addChunks(chunks: CodeChunk[]): Promise<void>;
    /**
     * Remove every chunk belonging to one file.
     * @returns Number of chunks deleted
     */
    deleteChunksByFilepath(filepath: string): Promise<number>;
    /**
     * Remove every chunk belonging to any of the given files.
     * @returns Number of chunks deleted
     */
    deleteChunksByFilepaths(filepaths: string[]): Promise<number>;
    /**
     * Fetch all chunks stored for one file.
     */
    getChunksByFilepath(filepath: string): Promise<CodeChunk[]>;
    /**
     * Collect the distinct filepaths present in the database.
     */
    getAllFilepaths(): Promise<Set<string>>;
    /**
     * Total number of stored chunks.
     */
    countChunks(): Promise<number>;
    /**
     * Look up cached embeddings by content hash.
     * @returns Map from content hash to vector
     */
    getCachedEmbeddings(hashes: string[]): Promise<Map<string, number[]>>;
    /**
     * Store embeddings in the cache for later reuse.
     */
    cacheEmbeddings(entries: CachedEmbedding[]): Promise<void>;
    /**
     * Total number of cached embeddings.
     */
    countCachedEmbeddings(): Promise<number>;
    /**
     * Remove all chunks while leaving the embedding cache intact.
     */
    clearAll(): Promise<void>;
    /**
     * Drop and recreate the chunks table.
     * Meant for a forced reindex, to avoid schema mismatch issues.
     */
    resetChunksTable(): Promise<void>;
    /**
     * Empty the embedding cache.
     */
    clearCache(): Promise<void>;
    /**
     * Expose the chunks table for direct querying (e.g., search).
     */
    getChunksTable(): Table;
}
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
import * as lancedb from '@lancedb/lancedb';
|
|
2
|
+
import { makeArrowTable } from '@lancedb/lancedb';
|
|
3
|
+
import { getLanceDbPath, TABLE_NAMES } from '../constants.js';
|
|
4
|
+
import { createCodeChunksSchema, createEmbeddingCacheSchema } from './schema.js';
|
|
5
|
+
import { chunkToRow, embeddingToRow, rowToChunk, } from './types.js';
|
|
6
|
+
export * from './types.js';
|
|
7
|
+
export * from './schema.js';
|
|
8
|
+
/**
 * Storage layer wrapping LanceDB for code chunks and embedding cache.
 *
 * `chunksTable` is null between resetChunksTable() and the next insert.
 * Operations that read, count, or delete chunks treat that state as an empty
 * table instead of dereferencing null.
 */
export class Storage {
    /**
     * @param projectRoot - Project directory, passed to getLanceDbPath() to locate the database
     * @param dimensions - Embedding vector length used when building table schemas (default 768)
     */
    constructor(projectRoot, dimensions = 768) {
        // Define every field up front with define-semantics, in a fixed order.
        const fieldDefaults = [
            ['projectRoot', void 0],
            ['dimensions', void 0],
            ['db', null],
            ['chunksTable', null],
            ['cacheTable', null],
        ];
        for (const [key, value] of fieldDefaults) {
            Object.defineProperty(this, key, {
                enumerable: true,
                configurable: true,
                writable: true,
                value,
            });
        }
        this.projectRoot = projectRoot;
        this.dimensions = dimensions;
    }
    /**
     * Connect to the LanceDB database.
     * Creates tables if they don't exist.
     */
    async connect() {
        const dbPath = getLanceDbPath(this.projectRoot);
        this.db = await lancedb.connect(dbPath);
        // Get existing table names
        const tableNames = await this.db.tableNames();
        // Open or create code_chunks table
        if (tableNames.includes(TABLE_NAMES.CODE_CHUNKS)) {
            this.chunksTable = await this.db.openTable(TABLE_NAMES.CODE_CHUNKS);
        }
        else {
            const schema = createCodeChunksSchema(this.dimensions);
            this.chunksTable = await this.db.createEmptyTable(TABLE_NAMES.CODE_CHUNKS, schema);
        }
        // Open or create embedding_cache table
        if (tableNames.includes(TABLE_NAMES.EMBEDDING_CACHE)) {
            this.cacheTable = await this.db.openTable(TABLE_NAMES.EMBEDDING_CACHE);
        }
        else {
            const schema = createEmbeddingCacheSchema(this.dimensions);
            this.cacheTable = await this.db.createEmptyTable(TABLE_NAMES.EMBEDDING_CACHE, schema);
        }
    }
    /**
     * Close the database connection.
     */
    close() {
        // LanceDB connections don't need explicit closing in the JS SDK
        this.db = null;
        this.chunksTable = null;
        this.cacheTable = null;
    }
    /**
     * Ensure we're connected.
     * Note: chunksTable may be null after resetChunksTable(), so it is
     * deliberately not part of this check.
     *
     * @throws Error when connect() has not been called (or close() was)
     */
    ensureConnected() {
        if (!this.db || !this.cacheTable) {
            throw new Error('Storage not connected. Call connect() first.');
        }
    }
    // ============================================================
    // Chunk Operations
    // ============================================================
    /**
     * Upsert chunks into the database.
     * Uses merge insert to update existing chunks or add new ones. When the
     * chunks table is currently absent there is nothing to merge against, so
     * the chunks are inserted through addChunks(), which creates the table.
     */
    async upsertChunks(chunks) {
        this.ensureConnected();
        if (chunks.length === 0)
            return;
        if (!this.chunksTable) {
            await this.addChunks(chunks);
            return;
        }
        const rows = chunks.map(chunkToRow);
        // Use merge insert for upsert behavior
        await this.chunksTable.mergeInsert('id')
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute(rows);
    }
    /**
     * Add chunks to the database (no merge, just insert).
     * Use this after resetChunksTable() to avoid schema mismatch issues.
     * Creates the table from data if it doesn't exist.
     */
    async addChunks(chunks) {
        this.ensureConnected();
        if (chunks.length === 0)
            return;
        const rows = chunks.map(chunkToRow);
        const schema = createCodeChunksSchema(this.dimensions);
        // Use makeArrowTable to properly convert data with schema
        const arrowTable = makeArrowTable(rows, { schema });
        // If table was reset (null), create from Arrow table
        if (!this.chunksTable) {
            this.chunksTable = await this.db.createTable(TABLE_NAMES.CODE_CHUNKS, arrowTable);
        }
        else {
            await this.chunksTable.add(arrowTable);
        }
    }
    /**
     * Delete all chunks for a specific file.
     * @returns Number of chunks deleted (0 when the chunks table is absent)
     */
    async deleteChunksByFilepath(filepath) {
        this.ensureConnected();
        if (!this.chunksTable)
            return 0;
        // The deleted count is derived from the row count before and after
        const countBefore = await this.chunksTable.countRows();
        await this.chunksTable.delete(`filepath = '${escapeString(filepath)}'`);
        const countAfter = await this.chunksTable.countRows();
        return countBefore - countAfter;
    }
    /**
     * Delete all chunks for multiple files.
     * @returns Number of chunks deleted (0 for an empty list or absent table)
     */
    async deleteChunksByFilepaths(filepaths) {
        this.ensureConnected();
        if (filepaths.length === 0 || !this.chunksTable)
            return 0;
        const countBefore = await this.chunksTable.countRows();
        // Build IN clause with escaped strings
        const escaped = filepaths.map(fp => `'${escapeString(fp)}'`).join(', ');
        await this.chunksTable.delete(`filepath IN (${escaped})`);
        const countAfter = await this.chunksTable.countRows();
        return countBefore - countAfter;
    }
    /**
     * Get all chunks for a specific file.
     * @returns Matching chunks (empty when the chunks table is absent)
     */
    async getChunksByFilepath(filepath) {
        this.ensureConnected();
        if (!this.chunksTable)
            return [];
        const results = await this.chunksTable.query()
            .where(`filepath = '${escapeString(filepath)}'`)
            .toArray();
        return results.map(row => rowToChunk(row));
    }
    /**
     * Get all unique filepaths in the database.
     * @returns Set of filepaths (empty when the chunks table is absent)
     */
    async getAllFilepaths() {
        this.ensureConnected();
        const filepaths = new Set();
        if (!this.chunksTable)
            return filepaths;
        // Query all rows but only need filepath column
        const results = await this.chunksTable.query()
            .select(['filepath'])
            .toArray();
        for (const row of results) {
            filepaths.add(row.filepath);
        }
        return filepaths;
    }
    /**
     * Count total number of chunks.
     * @returns Row count of the chunks table, or 0 when it is absent
     */
    async countChunks() {
        this.ensureConnected();
        if (!this.chunksTable)
            return 0;
        return this.chunksTable.countRows();
    }
    // ============================================================
    // Cache Operations
    // ============================================================
    /**
     * Get cached embeddings for a list of content hashes.
     * @returns Map from content hash to vector
     */
    async getCachedEmbeddings(hashes) {
        this.ensureConnected();
        if (hashes.length === 0)
            return new Map();
        // Build IN clause
        const escaped = hashes.map(h => `'${escapeString(h)}'`).join(', ');
        const results = await this.cacheTable.query()
            .where(`content_hash IN (${escaped})`)
            .toArray();
        const cache = new Map();
        for (const row of results) {
            // Ensure vector is a plain array (LanceDB may return typed arrays)
            const vector = Array.isArray(row.vector)
                ? row.vector
                : Array.from(row.vector);
            cache.set(row.content_hash, vector);
        }
        return cache;
    }
    /**
     * Cache embeddings for future use.
     */
    async cacheEmbeddings(entries) {
        this.ensureConnected();
        if (entries.length === 0)
            return;
        const rows = entries.map(embeddingToRow);
        // Use merge insert for upsert behavior
        await this.cacheTable.mergeInsert('content_hash')
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute(rows);
    }
    /**
     * Count total number of cached embeddings.
     */
    async countCachedEmbeddings() {
        this.ensureConnected();
        return this.cacheTable.countRows();
    }
    // ============================================================
    // Maintenance Operations
    // ============================================================
    /**
     * Clear all chunks but keep the embedding cache.
     * A no-op when the chunks table is absent or already empty.
     */
    async clearAll() {
        this.ensureConnected();
        if (!this.chunksTable)
            return;
        // LanceDB doesn't have a truncate, so we delete all rows
        const count = await this.chunksTable.countRows();
        if (count > 0) {
            // Delete with a condition that matches all rows
            await this.chunksTable.delete('id IS NOT NULL');
        }
    }
    /**
     * Drop the chunks table so the next insert recreates it.
     * Use this for force reindex to avoid schema mismatch issues.
     * Safe to call when the table has already been dropped.
     */
    async resetChunksTable() {
        this.ensureConnected();
        // Only drop when the table exists, so a repeated reset does not fail
        const tableNames = await this.db.tableNames();
        if (tableNames.includes(TABLE_NAMES.CODE_CHUNKS)) {
            await this.db.dropTable(TABLE_NAMES.CODE_CHUNKS);
        }
        // Don't pre-create - let it be created from first data insert
        // This avoids Arrow schema mismatch issues
        this.chunksTable = null;
    }
    /**
     * Clear the embedding cache.
     */
    async clearCache() {
        this.ensureConnected();
        const count = await this.cacheTable.countRows();
        if (count > 0) {
            await this.cacheTable.delete('content_hash IS NOT NULL');
        }
    }
    /**
     * Get the chunks table for direct querying (e.g., search).
     * @returns The table handle; null between resetChunksTable() and the
     *   next insert
     */
    getChunksTable() {
        this.ensureConnected();
        return this.chunksTable;
    }
}
|
|
281
|
+
/**
 * Prepare a string for embedding inside a single-quoted literal of a
 * SQL-like LanceDB filter expression: every single quote is doubled.
 */
function escapeString(s) {
    const pieces = s.split("'");
    return pieces.join("''");
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Force bundler to include LanceDB native module.
|
|
3
|
+
* This file exists to make Bun's bundler statically analyze and include
|
|
4
|
+
* the LanceDB native binding, which is normally loaded dynamically.
|
|
5
|
+
*/
|
|
6
|
+
import '@lancedb/lancedb-darwin-arm64';
|
|
7
|
+
export * from '@lancedb/lancedb';
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Force bundler to include LanceDB native module.
|
|
3
|
+
* This file exists to make Bun's bundler statically analyze and include
|
|
4
|
+
* the LanceDB native binding, which is normally loaded dynamically.
|
|
5
|
+
*/
|
|
6
|
+
// Force bundler to include the darwin-arm64 native module
|
|
7
|
+
// @ts-ignore - native module
|
|
8
|
+
import '@lancedb/lancedb-darwin-arm64';
|
|
9
|
+
// Re-export everything from lancedb
|
|
10
|
+
export * from '@lancedb/lancedb';
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { Schema } from 'apache-arrow';
|
|
2
|
+
/**
 * Current schema version. Increment it whenever a schema change requires
 * existing data to be reindexed.
 */
|
|
5
|
+
export declare const SCHEMA_VERSION = 2;
|
|
6
|
+
/**
 * Arrow schema for the code_chunks table.
 *
 * Stores indexed code chunks with their embeddings.
 *
 * Schema v2 adds:
 * - signature: Function/method signature line
 * - docstring: Extracted documentation
 * - is_exported: Whether symbol is exported
 * - decorator_names: Comma-separated decorator names
 *
 * @param dimensions - Optional length of the embedding vector column
 * @returns The Arrow schema for the table
 */
|
|
17
|
+
export declare function createCodeChunksSchema(dimensions?: number): Schema;
|
|
18
|
+
/**
 * Arrow schema for the embedding_cache table.
 *
 * Content-addressed cache for embeddings to avoid recomputation.
 *
 * @param dimensions - Optional length of the embedding vector column
 * @returns The Arrow schema for the table
 */
|
|
23
|
+
export declare function createEmbeddingCacheSchema(dimensions?: number): Schema;
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { Field, FixedSizeList, Float32, Int32, Schema, Utf8, Bool, } from 'apache-arrow';
|
|
2
|
+
import { DEFAULT_EMBEDDING_DIMENSIONS } from '../constants.js';
|
|
3
|
+
/**
 * Current schema version. Increment it whenever a schema change requires
 * existing data to be reindexed.
 */
|
|
6
|
+
export const SCHEMA_VERSION = 2;
|
|
7
|
+
/**
 * Build the Arrow schema of the code_chunks table, which holds indexed code
 * chunks together with their embeddings.
 *
 * Columns introduced with schema v2:
 * - signature: Function/method signature line
 * - docstring: Extracted documentation
 * - is_exported: Whether symbol is exported
 * - decorator_names: Comma-separated decorator names
 *
 * @param dimensions - Length of the fixed-size `vector` column
 * @returns A new Schema instance
 */
export function createCodeChunksSchema(dimensions = DEFAULT_EMBEDDING_DIMENSIONS) {
    // Small factories so each column reads as name + nullability
    const utf8 = (name, nullable = false) => new Field(name, new Utf8(), nullable);
    const int32 = (name) => new Field(name, new Int32(), false);
    const vectorType = new FixedSizeList(dimensions, new Field('item', new Float32(), false));
    const columns = [
        utf8('id'), // "{filepath}:{startLine}"
        new Field('vector', vectorType, false),
        utf8('text'),
        utf8('content_hash'),
        utf8('filepath'),
        utf8('filename'),
        utf8('extension'),
        utf8('type'), // function/class/method/module
        utf8('name'),
        int32('start_line'),
        int32('end_line'),
        utf8('file_hash'),
        // New in schema v2: deterministic AST-derived metadata
        utf8('signature', true), // Function/class signature line (nullable)
        utf8('docstring', true), // Extracted documentation (nullable)
        new Field('is_exported', new Bool(), false), // Has export modifier
        utf8('decorator_names', true), // Comma-separated decorators (nullable)
    ];
    return new Schema(columns);
}
|
|
39
|
+
/**
 * Build the Arrow schema of the embedding_cache table, a content-addressed
 * cache that lets embeddings be reused instead of recomputed.
 *
 * @param dimensions - Length of the fixed-size `vector` column
 * @returns A new Schema instance
 */
export function createEmbeddingCacheSchema(dimensions = DEFAULT_EMBEDDING_DIMENSIONS) {
    const contentHash = new Field('content_hash', new Utf8(), false); // Primary key
    const vectorItem = new Field('item', new Float32(), false);
    const vector = new Field('vector', new FixedSizeList(dimensions, vectorItem), false);
    const createdAt = new Field('created_at', new Utf8(), false); // ISO timestamp
    return new Schema([contentHash, vector, createdAt]);
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
 * Kinds of code chunk extracted by tree-sitter.
 */
|
|
4
|
+
export type ChunkType = 'function' | 'class' | 'method' | 'module';
|
|
5
|
+
/**
 * One code chunk as stored in LanceDB: a semantic unit of source code
 * (function, class, method, or module).
 */
export interface CodeChunk {
    /** Unique ID in the form "{filepath}:{startLine}" */
    id: string;
    /** Embedding vector (768 dimensions for Jina code embeddings) */
    vector: number[];
    /** The chunk's source code */
    text: string;
    /** SHA256 hash of `text` */
    contentHash: string;
    /** File path relative to the project root */
    filepath: string;
    /** Filename only (e.g., "utils.py") */
    filename: string;
    /** File extension (e.g., ".py") */
    extension: string;
    /** Kind of chunk: function, class, method, or module */
    type: ChunkType;
    /** Symbol name (empty for module chunks) */
    name: string;
    /** First line of the chunk (1-indexed) */
    startLine: number;
    /** Last line of the chunk (1-indexed) */
    endLine: number;
    /** SHA256 hash of the whole source file */
    fileHash: string;
    /** Function/method signature line (null for module/class without params) */
    signature: string | null;
    /** Extracted documentation (JSDoc, docstring, etc.) */
    docstring: string | null;
    /** Whether symbol has export modifier (JS/TS) or is in __all__ (Python) */
    isExported: boolean;
    /** Comma-separated decorator/annotation names (null if none) */
    decoratorNames: string | null;
}
|
|
43
|
+
/**
 * Shape of a row in the LanceDB code_chunks table.
 * Field names are snake_case to follow Arrow/LanceDB conventions.
 */
export interface CodeChunkRow {
    id: string;
    vector: number[];
    text: string;
    content_hash: string;
    filepath: string;
    filename: string;
    extension: string;
    type: string;
    name: string;
    start_line: number;
    end_line: number;
    file_hash: string;
    signature: string | null;
    docstring: string | null;
    is_exported: boolean;
    decorator_names: string | null;
}
|
|
65
|
+
/**
 * An embedding kept in the LanceDB cache, addressed by the hash of the text
 * it was computed from.
 */
export interface CachedEmbedding {
    /** SHA256 hash of the text content (primary key) */
    contentHash: string;
    /** The embedding vector */
    vector: number[];
    /** ISO timestamp of when the entry was cached */
    createdAt: string;
}
|
|
77
|
+
/**
 * Shape of a row in the LanceDB embedding_cache table.
 */
export interface CachedEmbeddingRow {
    content_hash: string;
    vector: number[];
    created_at: string;
}
|
|
85
|
+
/**
 * Convert a CodeChunk to the LanceDB row format (CodeChunkRow).
 */
|
|
88
|
+
export declare function chunkToRow(chunk: CodeChunk): CodeChunkRow;
|
|
89
|
+
/**
 * Convert a LanceDB row (CodeChunkRow) back to a CodeChunk.
 */
|
|
92
|
+
export declare function rowToChunk(row: CodeChunkRow): CodeChunk;
|
|
93
|
+
/**
 * Convert a CachedEmbedding to the LanceDB row format (CachedEmbeddingRow).
 */
|
|
96
|
+
export declare function embeddingToRow(embedding: CachedEmbedding): CachedEmbeddingRow;
|
|
97
|
+
/**
 * Convert a LanceDB row (CachedEmbeddingRow) back to a CachedEmbedding.
 */
|
|
100
|
+
export declare function rowToEmbedding(row: CachedEmbeddingRow): CachedEmbedding;
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
 * Map a CodeChunk (camelCase) onto the snake_case row object written to
 * LanceDB. Values are passed through unchanged; only the keys differ.
 *
 * @param chunk - Chunk to convert
 * @returns Row object with the same values under snake_case keys
 */
export function chunkToRow(chunk) {
    const {
        id, vector, text, contentHash, filepath, filename, extension, type, name,
        startLine, endLine, fileHash,
        // New in schema v2
        signature, docstring, isExported, decoratorNames,
    } = chunk;
    return {
        id,
        vector,
        text,
        content_hash: contentHash,
        filepath,
        filename,
        extension,
        type,
        name,
        start_line: startLine,
        end_line: endLine,
        file_hash: fileHash,
        signature,
        docstring,
        is_exported: isExported,
        decorator_names: decoratorNames,
    };
}
|
|
25
|
+
/**
 * Convert a LanceDB row to a CodeChunk.
 *
 * Keys are mapped from snake_case to camelCase. A `vector` that is not
 * already a plain array (for example a typed array) is copied into one with
 * Array.from; a missing vector is passed through unchanged.
 *
 * @param row - Row object read from the code_chunks table
 * @returns The equivalent CodeChunk
 */
export function rowToChunk(row) {
    const { vector } = row;
    // Keep plain arrays (and null/undefined) as-is; copy anything else
    const plainVector = vector == null || Array.isArray(vector)
        ? vector
        : Array.from(vector);
    return {
        id: row.id,
        vector: plainVector,
        text: row.text,
        contentHash: row.content_hash,
        filepath: row.filepath,
        filename: row.filename,
        extension: row.extension,
        type: row.type,
        name: row.name,
        startLine: row.start_line,
        endLine: row.end_line,
        fileHash: row.file_hash,
        // New in schema v2
        signature: row.signature,
        docstring: row.docstring,
        isExported: row.is_exported,
        decoratorNames: row.decorator_names,
    };
}
|
|
49
|
+
/**
 * Map a CachedEmbedding (camelCase) onto the snake_case row object written
 * to the embedding cache table. Values are passed through unchanged.
 *
 * @param embedding - Cache entry to convert
 * @returns Row object with `content_hash`, `vector`, and `created_at`
 */
export function embeddingToRow(embedding) {
    const { contentHash, vector, createdAt } = embedding;
    return {
        content_hash: contentHash,
        vector,
        created_at: createdAt,
    };
}
|
|
59
|
+
/**
 * Convert a LanceDB row to a CachedEmbedding.
 *
 * Keys are mapped from snake_case to camelCase. A `vector` that is not
 * already a plain array (for example a typed array) is copied into one with
 * Array.from; a missing vector is passed through unchanged.
 *
 * @param row - Row object read from the embedding_cache table
 * @returns The equivalent CachedEmbedding
 */
export function rowToEmbedding(row) {
    const { vector } = row;
    // Keep plain arrays (and null/undefined) as-is; copy anything else
    const plainVector = vector == null || Array.isArray(vector)
        ? vector
        : Array.from(vector);
    return {
        contentHash: row.content_hash,
        vector: plainVector,
        createdAt: row.created_at,
    };
}
|